Back to home page

OSCL-LXR

 
 

    


0001 // SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
0002 
0003 /*
0004  * AF_XDP user-space access library.
0005  *
0006  * Copyright(c) 2018 - 2019 Intel Corporation.
0007  *
0008  * Author(s): Magnus Karlsson <magnus.karlsson@intel.com>
0009  */
0010 
0011 #include <errno.h>
0012 #include <stdlib.h>
0013 #include <string.h>
0014 #include <unistd.h>
0015 #include <arpa/inet.h>
0016 #include <asm/barrier.h>
0017 #include <linux/compiler.h>
0018 #include <linux/ethtool.h>
0019 #include <linux/filter.h>
0020 #include <linux/if_ether.h>
0021 #include <linux/if_packet.h>
0022 #include <linux/if_xdp.h>
0023 #include <linux/kernel.h>
0024 #include <linux/list.h>
0025 #include <linux/sockios.h>
0026 #include <net/if.h>
0027 #include <sys/ioctl.h>
0028 #include <sys/mman.h>
0029 #include <sys/socket.h>
0030 #include <sys/types.h>
0031 #include <linux/if_link.h>
0032 
0033 #include <bpf/bpf.h>
0034 #include <bpf/libbpf.h>
0035 #include "xsk.h"
0036 
0037 #ifndef SOL_XDP
0038  #define SOL_XDP 283
0039 #endif
0040 
0041 #ifndef AF_XDP
0042  #define AF_XDP 44
0043 #endif
0044 
0045 #ifndef PF_XDP
0046  #define PF_XDP AF_XDP
0047 #endif
0048 
0049 #define pr_warn(fmt, ...) fprintf(stderr, fmt, ##__VA_ARGS__)
0050 
/* Which flavor of kernel XDP-program support was detected:
 * XSK_PROG_FALLBACK       - multi-instruction program needed (older kernels),
 * XSK_PROG_REDIRECT_FLAGS - bpf_redirect_map() honors a default-action flag,
 *                           so a single call suffices (see get_xsk_prog()).
 */
enum xsk_prog {
    XSK_PROG_FALLBACK,
    XSK_PROG_REDIRECT_FLAGS,
};
0055 
/* A registered UMEM packet-buffer area and its bookkeeping. One UMEM (and
 * the socket fd that registered it) can back several AF_XDP sockets;
 * refcount tracks how many are sharing it.
 */
struct xsk_umem {
    struct xsk_ring_prod *fill_save;  /* fill ring given at create time */
    struct xsk_ring_cons *comp_save;  /* completion ring given at create time */
    char *umem_area;                  /* caller-owned packet buffer memory */
    struct xsk_umem_config config;
    int fd;                           /* AF_XDP socket the UMEM is registered on */
    int refcount;                     /* number of sockets sharing this UMEM */
    struct list_head ctx_list;        /* xsk_ctx entries, one per (ifindex, queue) */
    bool rx_ring_setup_done;          /* XDP_RX_RING already configured on fd */
    bool tx_ring_setup_done;          /* XDP_TX_RING already configured on fd */
};
0067 
/* Per-(interface, queue) state shared by all sockets bound to that pair.
 * Lives on the owning UMEM's ctx_list and is reference counted via
 * xsk_get_ctx()/xsk_put_ctx().
 */
struct xsk_ctx {
    struct xsk_ring_prod *fill;  /* fill ring for this queue */
    struct xsk_ring_cons *comp;  /* completion ring for this queue */
    __u32 queue_id;
    struct xsk_umem *umem;       /* back-pointer to the owning UMEM */
    int refcount;
    int ifindex;
    struct list_head list;       /* linkage into umem->ctx_list */
    int prog_fd;                 /* attached XDP program */
    int link_fd;                 /* bpf_link fd, valid when has_bpf_link */
    int xsks_map_fd;             /* XSKMAP used for redirection */
    char ifname[IFNAMSIZ];
    bool has_bpf_link;           /* kernel supports BPF_XDP bpf_link attach */
};
0082 
/* One AF_XDP socket: its RX/TX rings, configuration and the shared
 * per-(ifindex, queue) context it is bound to.
 */
struct xsk_socket {
    struct xsk_ring_cons *rx;  /* NULL when the socket has no RX ring */
    struct xsk_ring_prod *tx;  /* NULL when the socket has no TX ring */
    __u64 outstanding_tx;      /* TX descriptors submitted but not completed */
    struct xsk_ctx *ctx;
    struct xsk_socket_config config;
    int fd;
};
0091 
/* Netlink query state for XDP program attachment on an interface.
 * NOTE(review): not referenced anywhere in this chunk - possibly only used
 * elsewhere in the file, or dead; confirm before removing.
 */
struct xsk_nl_info {
    bool xdp_prog_attached;
    int ifindex;
    int fd;
};
0097 
/* Up until and including Linux 5.3 */
/* Per-ring mmap offsets as reported by old kernels: no "flags" member,
 * which was added to struct xdp_ring_offset in 5.4.
 */
struct xdp_ring_offset_v1 {
    __u64 producer;
    __u64 consumer;
    __u64 desc;
};
0104 
/* Up until and including Linux 5.3 */
/* Old layout of the XDP_MMAP_OFFSETS getsockopt result; detected by its
 * smaller optlen in xsk_get_mmap_offsets() and converted in
 * xsk_mmap_offsets_v1().
 */
struct xdp_mmap_offsets_v1 {
    struct xdp_ring_offset_v1 rx;
    struct xdp_ring_offset_v1 tx;
    struct xdp_ring_offset_v1 fr;  /* fill ring */
    struct xdp_ring_offset_v1 cr;  /* completion ring */
};
0112 
0113 int xsk_umem__fd(const struct xsk_umem *umem)
0114 {
0115     return umem ? umem->fd : -EINVAL;
0116 }
0117 
0118 int xsk_socket__fd(const struct xsk_socket *xsk)
0119 {
0120     return xsk ? xsk->fd : -EINVAL;
0121 }
0122 
/* True iff @buffer starts on a page boundary. */
static bool xsk_page_aligned(void *buffer)
{
    /* Page size is a power of two, so the modulo compiles down to the
     * same mask test as the original bitwise form.
     */
    return ((unsigned long)buffer % getpagesize()) == 0;
}
0129 
0130 static void xsk_set_umem_config(struct xsk_umem_config *cfg,
0131                 const struct xsk_umem_config *usr_cfg)
0132 {
0133     if (!usr_cfg) {
0134         cfg->fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS;
0135         cfg->comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS;
0136         cfg->frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE;
0137         cfg->frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM;
0138         cfg->flags = XSK_UMEM__DEFAULT_FLAGS;
0139         return;
0140     }
0141 
0142     cfg->fill_size = usr_cfg->fill_size;
0143     cfg->comp_size = usr_cfg->comp_size;
0144     cfg->frame_size = usr_cfg->frame_size;
0145     cfg->frame_headroom = usr_cfg->frame_headroom;
0146     cfg->flags = usr_cfg->flags;
0147 }
0148 
0149 static int xsk_set_xdp_socket_config(struct xsk_socket_config *cfg,
0150                      const struct xsk_socket_config *usr_cfg)
0151 {
0152     if (!usr_cfg) {
0153         cfg->rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS;
0154         cfg->tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS;
0155         cfg->libbpf_flags = 0;
0156         cfg->xdp_flags = 0;
0157         cfg->bind_flags = 0;
0158         return 0;
0159     }
0160 
0161     if (usr_cfg->libbpf_flags & ~XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD)
0162         return -EINVAL;
0163 
0164     cfg->rx_size = usr_cfg->rx_size;
0165     cfg->tx_size = usr_cfg->tx_size;
0166     cfg->libbpf_flags = usr_cfg->libbpf_flags;
0167     cfg->xdp_flags = usr_cfg->xdp_flags;
0168     cfg->bind_flags = usr_cfg->bind_flags;
0169 
0170     return 0;
0171 }
0172 
/* Convert ring offsets returned by a <= 5.3 kernel, which used struct
 * xdp_mmap_offsets_v1, in place into the current layout that carries a
 * per-ring "flags" offset.
 */
static void xsk_mmap_offsets_v1(struct xdp_mmap_offsets *off)
{
    struct xdp_mmap_offsets_v1 off_v1;

    /* getsockopt on a kernel <= 5.3 has no flags fields.
     * Copy over the offsets to the correct places in the >=5.4 format
     * and put the flags where they would have been on that kernel.
     */
    memcpy(&off_v1, off, sizeof(off_v1));

    off->rx.producer = off_v1.rx.producer;
    off->rx.consumer = off_v1.rx.consumer;
    off->rx.desc = off_v1.rx.desc;
    /* Flags word sits one __u32 past the consumer index on old kernels. */
    off->rx.flags = off_v1.rx.consumer + sizeof(__u32);

    off->tx.producer = off_v1.tx.producer;
    off->tx.consumer = off_v1.tx.consumer;
    off->tx.desc = off_v1.tx.desc;
    off->tx.flags = off_v1.tx.consumer + sizeof(__u32);

    off->fr.producer = off_v1.fr.producer;
    off->fr.consumer = off_v1.fr.consumer;
    off->fr.desc = off_v1.fr.desc;
    off->fr.flags = off_v1.fr.consumer + sizeof(__u32);

    off->cr.producer = off_v1.cr.producer;
    off->cr.consumer = off_v1.cr.consumer;
    off->cr.desc = off_v1.cr.desc;
    off->cr.flags = off_v1.cr.consumer + sizeof(__u32);
}
0203 
0204 static int xsk_get_mmap_offsets(int fd, struct xdp_mmap_offsets *off)
0205 {
0206     socklen_t optlen;
0207     int err;
0208 
0209     optlen = sizeof(*off);
0210     err = getsockopt(fd, SOL_XDP, XDP_MMAP_OFFSETS, off, &optlen);
0211     if (err)
0212         return err;
0213 
0214     if (optlen == sizeof(*off))
0215         return 0;
0216 
0217     if (optlen == sizeof(struct xdp_mmap_offsets_v1)) {
0218         xsk_mmap_offsets_v1(off);
0219         return 0;
0220     }
0221 
0222     return -EINVAL;
0223 }
0224 
/* Size the fill and completion rings on socket @fd, mmap them, and fill in
 * the user-space ring wrappers @fill and @comp. Returns 0 on success or a
 * negative errno; on failure nothing remains mapped.
 */
static int xsk_create_umem_rings(struct xsk_umem *umem, int fd,
                 struct xsk_ring_prod *fill,
                 struct xsk_ring_cons *comp)
{
    struct xdp_mmap_offsets off;
    void *map;
    int err;

    err = setsockopt(fd, SOL_XDP, XDP_UMEM_FILL_RING,
             &umem->config.fill_size,
             sizeof(umem->config.fill_size));
    if (err)
        return -errno;

    err = setsockopt(fd, SOL_XDP, XDP_UMEM_COMPLETION_RING,
             &umem->config.comp_size,
             sizeof(umem->config.comp_size));
    if (err)
        return -errno;

    /* NOTE(review): xsk_get_mmap_offsets() can fail with -EINVAL without
     * setting errno, in which case -errno below may be stale - confirm.
     */
    err = xsk_get_mmap_offsets(fd, &off);
    if (err)
        return -errno;

    /* Map the fill ring: descriptors are __u64 frame addresses. */
    map = mmap(NULL, off.fr.desc + umem->config.fill_size * sizeof(__u64),
           PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd,
           XDP_UMEM_PGOFF_FILL_RING);
    if (map == MAP_FAILED)
        return -errno;

    fill->mask = umem->config.fill_size - 1;  /* ring sizes are powers of two */
    fill->size = umem->config.fill_size;
    fill->producer = map + off.fr.producer;
    fill->consumer = map + off.fr.consumer;
    fill->flags = map + off.fr.flags;
    fill->ring = map + off.fr.desc;
    /* Treat the whole (empty) fill ring as available for production. */
    fill->cached_cons = umem->config.fill_size;

    map = mmap(NULL, off.cr.desc + umem->config.comp_size * sizeof(__u64),
           PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd,
           XDP_UMEM_PGOFF_COMPLETION_RING);
    if (map == MAP_FAILED) {
        err = -errno;
        goto out_mmap;
    }

    comp->mask = umem->config.comp_size - 1;
    comp->size = umem->config.comp_size;
    comp->producer = map + off.cr.producer;
    comp->consumer = map + off.cr.consumer;
    comp->flags = map + off.cr.flags;
    comp->ring = map + off.cr.desc;

    return 0;

out_mmap:
    /* Undo the fill-ring mapping; fill->ring - off.fr.desc is its base. */
    munmap(map, off.fr.desc + umem->config.fill_size * sizeof(__u64));
    return err;
}
0284 
/* Create an AF_XDP socket, register @umem_area (@size bytes) as its UMEM,
 * and set up the fill and completion rings. On success *umem_ptr owns the
 * new handle. Returns 0 or a negative errno; the caller keeps ownership of
 * @umem_area either way.
 */
int xsk_umem__create(struct xsk_umem **umem_ptr, void *umem_area,
             __u64 size, struct xsk_ring_prod *fill,
             struct xsk_ring_cons *comp,
             const struct xsk_umem_config *usr_config)
{
    struct xdp_umem_reg mr;
    struct xsk_umem *umem;
    int err;

    if (!umem_area || !umem_ptr || !fill || !comp)
        return -EFAULT;
    /* NOTE(review): this only rejects an unaligned area when size == 0;
     * a non-zero size with an unaligned area falls through to the
     * kernel's own XDP_UMEM_REG validation - confirm this is intended
     * (an '||' would reject both).
     */
    if (!size && !xsk_page_aligned(umem_area))
        return -EINVAL;

    umem = calloc(1, sizeof(*umem));
    if (!umem)
        return -ENOMEM;

    umem->fd = socket(AF_XDP, SOCK_RAW | SOCK_CLOEXEC, 0);
    if (umem->fd < 0) {
        err = -errno;
        goto out_umem_alloc;
    }

    umem->umem_area = umem_area;
    INIT_LIST_HEAD(&umem->ctx_list);
    xsk_set_umem_config(&umem->config, usr_config);

    /* Describe the memory region to the kernel. */
    memset(&mr, 0, sizeof(mr));
    mr.addr = (uintptr_t)umem_area;
    mr.len = size;
    mr.chunk_size = umem->config.frame_size;
    mr.headroom = umem->config.frame_headroom;
    mr.flags = umem->config.flags;

    err = setsockopt(umem->fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr));
    if (err) {
        err = -errno;
        goto out_socket;
    }

    err = xsk_create_umem_rings(umem, umem->fd, fill, comp);
    if (err)
        goto out_socket;

    /* Remember the caller's ring structs so later sockets can reuse them. */
    umem->fill_save = fill;
    umem->comp_save = comp;
    *umem_ptr = umem;
    return 0;

out_socket:
    close(umem->fd);
out_umem_alloc:
    free(umem);
    return err;
}
0341 
/* Legacy UMEM configuration layout: identical to struct xsk_umem_config
 * minus the trailing flags field (used for ABI compatibility).
 */
struct xsk_umem_config_v1 {
    __u32 fill_size;
    __u32 comp_size;
    __u32 frame_size;
    __u32 frame_headroom;
};
0348 
/* Probe which XDP program flavor the running kernel supports. Loads a
 * minimal program that calls bpf_redirect_map(xskmap, 0, XDP_PASS) on an
 * empty XSKMAP and test-runs it: if the run returns XDP_PASS the kernel
 * honored the default-action flags argument (presumably >= 5.3 per the
 * comments below), so the single-call program can be used. Any failure
 * along the way conservatively reports XSK_PROG_FALLBACK.
 */
static enum xsk_prog get_xsk_prog(void)
{
    enum xsk_prog detected = XSK_PROG_FALLBACK;
    char data_in = 0, data_out;
    struct bpf_insn insns[] = {
        BPF_LD_MAP_FD(BPF_REG_1, 0),        /* map fd patched in below */
        BPF_MOV64_IMM(BPF_REG_2, 0),        /* key 0 */
        BPF_MOV64_IMM(BPF_REG_3, XDP_PASS), /* default action via flags */
        BPF_EMIT_CALL(BPF_FUNC_redirect_map),
        BPF_EXIT_INSN(),
    };
    LIBBPF_OPTS(bpf_test_run_opts, opts,
        .data_in = &data_in,
        .data_size_in = 1,
        .data_out = &data_out,
    );

    int prog_fd, map_fd, ret, insn_cnt = ARRAY_SIZE(insns);

    map_fd = bpf_map_create(BPF_MAP_TYPE_XSKMAP, NULL, sizeof(int), sizeof(int), 1, NULL);
    if (map_fd < 0)
        return detected;

    /* Patch the real map fd into the BPF_LD_MAP_FD instruction. */
    insns[0].imm = map_fd;

    prog_fd = bpf_prog_load(BPF_PROG_TYPE_XDP, NULL, "GPL", insns, insn_cnt, NULL);
    if (prog_fd < 0) {
        close(map_fd);
        return detected;
    }

    /* Empty map: lookup fails, so a flags-aware kernel returns XDP_PASS. */
    ret = bpf_prog_test_run_opts(prog_fd, &opts);
    if (!ret && opts.retval == XDP_PASS)
        detected = XSK_PROG_REDIRECT_FLAGS;
    close(prog_fd);
    close(map_fd);
    return detected;
}
0387 
/* Build and load the default XDP redirection program for @xsk, choosing
 * between the single-call variant and the pre-5.3 fallback according to
 * get_xsk_prog(). On success stores the program fd in ctx->prog_fd and
 * returns 0; on failure prints the verifier log and returns the negative
 * error from bpf_prog_load().
 */
static int xsk_load_xdp_prog(struct xsk_socket *xsk)
{
    static const int log_buf_size = 16 * 1024;
    struct xsk_ctx *ctx = xsk->ctx;
    char log_buf[log_buf_size];
    int prog_fd;

    /* This is the fallback C-program:
     * SEC("xdp_sock") int xdp_sock_prog(struct xdp_md *ctx)
     * {
     *     int ret, index = ctx->rx_queue_index;
     *
     *     // A set entry here means that the correspnding queue_id
     *     // has an active AF_XDP socket bound to it.
     *     ret = bpf_redirect_map(&xsks_map, index, XDP_PASS);
     *     if (ret > 0)
     *         return ret;
     *
     *     // Fallback for pre-5.3 kernels, not supporting default
     *     // action in the flags parameter.
     *     if (bpf_map_lookup_elem(&xsks_map, &index))
     *         return bpf_redirect_map(&xsks_map, index, 0);
     *     return XDP_PASS;
     * }
     */
    struct bpf_insn prog[] = {
        /* r2 = *(u32 *)(r1 + 16) */
        BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 16),
        /* *(u32 *)(r10 - 4) = r2 */
        BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_2, -4),
        /* r1 = xskmap[] */
        BPF_LD_MAP_FD(BPF_REG_1, ctx->xsks_map_fd),
        /* r3 = XDP_PASS */
        BPF_MOV64_IMM(BPF_REG_3, 2),
        /* call bpf_redirect_map */
        BPF_EMIT_CALL(BPF_FUNC_redirect_map),
        /* if w0 != 0 goto pc+13 */
        BPF_JMP32_IMM(BPF_JSGT, BPF_REG_0, 0, 13),
        /* r2 = r10 */
        BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
        /* r2 += -4 */
        BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4),
        /* r1 = xskmap[] */
        BPF_LD_MAP_FD(BPF_REG_1, ctx->xsks_map_fd),
        /* call bpf_map_lookup_elem */
        BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
        /* r1 = r0 */
        BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
        /* r0 = XDP_PASS */
        BPF_MOV64_IMM(BPF_REG_0, 2),
        /* if r1 == 0 goto pc+5 */
        BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 5),
        /* r2 = *(u32 *)(r10 - 4) */
        BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_10, -4),
        /* r1 = xskmap[] */
        BPF_LD_MAP_FD(BPF_REG_1, ctx->xsks_map_fd),
        /* r3 = 0 */
        BPF_MOV64_IMM(BPF_REG_3, 0),
        /* call bpf_redirect_map */
        BPF_EMIT_CALL(BPF_FUNC_redirect_map),
        /* The jumps are to this instruction */
        BPF_EXIT_INSN(),
    };

    /* This is the post-5.3 kernel C-program:
     * SEC("xdp_sock") int xdp_sock_prog(struct xdp_md *ctx)
     * {
     *     return bpf_redirect_map(&xsks_map, ctx->rx_queue_index, XDP_PASS);
     * }
     */
    struct bpf_insn prog_redirect_flags[] = {
        /* r2 = *(u32 *)(r1 + 16) */
        BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 16),
        /* r1 = xskmap[] */
        BPF_LD_MAP_FD(BPF_REG_1, ctx->xsks_map_fd),
        /* r3 = XDP_PASS */
        BPF_MOV64_IMM(BPF_REG_3, 2),
        /* call bpf_redirect_map */
        BPF_EMIT_CALL(BPF_FUNC_redirect_map),
        BPF_EXIT_INSN(),
    };
    size_t insns_cnt[] = {ARRAY_SIZE(prog),
                  ARRAY_SIZE(prog_redirect_flags),
    };
    struct bpf_insn *progs[] = {prog, prog_redirect_flags};
    /* enum xsk_prog values double as indices into progs[]/insns_cnt[]. */
    enum xsk_prog option = get_xsk_prog();
    LIBBPF_OPTS(bpf_prog_load_opts, opts,
        .log_buf = log_buf,
        .log_size = log_buf_size,
    );

    prog_fd = bpf_prog_load(BPF_PROG_TYPE_XDP, NULL, "LGPL-2.1 or BSD-2-Clause",
                progs[option], insns_cnt[option], &opts);
    if (prog_fd < 0) {
        pr_warn("BPF log buffer:\n%s", log_buf);
        return prog_fd;
    }

    ctx->prog_fd = prog_fd;
    return 0;
}
0489 
/* Attach ctx->prog_fd to the interface through a BPF_XDP bpf_link. Refuses
 * to proceed if a netlink-attached XDP program is already present (the two
 * attachment modes cannot be mixed). On success stores the link fd in
 * ctx->link_fd and returns 0; otherwise returns a negative error.
 */
static int xsk_create_bpf_link(struct xsk_socket *xsk)
{
    DECLARE_LIBBPF_OPTS(bpf_link_create_opts, opts);
    struct xsk_ctx *ctx = xsk->ctx;
    __u32 prog_id = 0;
    int link_fd;
    int err;

    err = bpf_xdp_query_id(ctx->ifindex, xsk->config.xdp_flags, &prog_id);
    if (err) {
        pr_warn("getting XDP prog id failed\n");
        return err;
    }

    /* if there's a netlink-based XDP prog loaded on interface, bail out
     * and ask user to do the removal by himself
     */
    if (prog_id) {
        pr_warn("Netlink-based XDP prog detected, please unload it in order to launch AF_XDP prog\n");
        return -EINVAL;
    }

    /* Those two flags control netlink attach behavior and are rejected
     * for links, so mask them out.
     */
    opts.flags = xsk->config.xdp_flags & ~(XDP_FLAGS_UPDATE_IF_NOEXIST | XDP_FLAGS_REPLACE);

    link_fd = bpf_link_create(ctx->prog_fd, ctx->ifindex, BPF_XDP, &opts);
    if (link_fd < 0) {
        pr_warn("bpf_link_create failed: %s\n", strerror(errno));
        return link_fd;
    }

    ctx->link_fd = link_fd;
    return 0;
}
0523 
/* Copy up to sz - 1 bytes from zero-terminated src string and ensure that dst
 * is zero-terminated string no matter what (unless sz == 0, in which case
 * it's a no-op). It's conceptually close to FreeBSD's strlcpy(), but differs
 * in what is returned. Given this is internal helper, it's trivial to extend
 * this, when necessary. Use this instead of strncpy inside libbpf source code.
 */
static inline void libbpf_strlcpy(char *dst, const char *src, size_t sz)
{
    size_t n;

    if (!sz)
        return;

    /* Leave room for the terminator; stop early at src's NUL. */
    for (n = 0; n + 1 < sz && src[n] != '\0'; n++)
        dst[n] = src[n];
    dst[n] = '\0';
}
0542 
0543 static int xsk_get_max_queues(struct xsk_socket *xsk)
0544 {
0545     struct ethtool_channels channels = { .cmd = ETHTOOL_GCHANNELS };
0546     struct xsk_ctx *ctx = xsk->ctx;
0547     struct ifreq ifr = {};
0548     int fd, err, ret;
0549 
0550     fd = socket(AF_LOCAL, SOCK_DGRAM | SOCK_CLOEXEC, 0);
0551     if (fd < 0)
0552         return -errno;
0553 
0554     ifr.ifr_data = (void *)&channels;
0555     libbpf_strlcpy(ifr.ifr_name, ctx->ifname, IFNAMSIZ);
0556     err = ioctl(fd, SIOCETHTOOL, &ifr);
0557     if (err && errno != EOPNOTSUPP) {
0558         ret = -errno;
0559         goto out;
0560     }
0561 
0562     if (err) {
0563         /* If the device says it has no channels, then all traffic
0564          * is sent to a single stream, so max queues = 1.
0565          */
0566         ret = 1;
0567     } else {
0568         /* Take the max of rx, tx, combined. Drivers return
0569          * the number of channels in different ways.
0570          */
0571         ret = max(channels.max_rx, channels.max_tx);
0572         ret = max(ret, (int)channels.max_combined);
0573     }
0574 
0575 out:
0576     close(fd);
0577     return ret;
0578 }
0579 
0580 static int xsk_create_bpf_maps(struct xsk_socket *xsk)
0581 {
0582     struct xsk_ctx *ctx = xsk->ctx;
0583     int max_queues;
0584     int fd;
0585 
0586     max_queues = xsk_get_max_queues(xsk);
0587     if (max_queues < 0)
0588         return max_queues;
0589 
0590     fd = bpf_map_create(BPF_MAP_TYPE_XSKMAP, "xsks_map",
0591                 sizeof(int), sizeof(int), max_queues, NULL);
0592     if (fd < 0)
0593         return fd;
0594 
0595     ctx->xsks_map_fd = fd;
0596 
0597     return 0;
0598 }
0599 
/* Remove this socket's queue entry from the XSKMAP, then release the map
 * fd held in ctx->xsks_map_fd.
 */
static void xsk_delete_bpf_maps(struct xsk_socket *xsk)
{
    struct xsk_ctx *ctx = xsk->ctx;

    bpf_map_delete_elem(ctx->xsks_map_fd, &ctx->queue_id);
    close(ctx->xsks_map_fd);
}
0607 
/* Find the XSKMAP used by the already-attached program ctx->prog_fd: fetch
 * the program's map id list (two bpf_obj_get_info_by_fd calls - first to
 * learn the count, then to read the ids), then scan for a map named
 * "xsks_map". On success ctx->xsks_map_fd holds an open fd to it; returns
 * -ENOENT when no such map exists, or another negative error.
 */
static int xsk_lookup_bpf_maps(struct xsk_socket *xsk)
{
    __u32 i, *map_ids, num_maps, prog_len = sizeof(struct bpf_prog_info);
    __u32 map_len = sizeof(struct bpf_map_info);
    struct bpf_prog_info prog_info = {};
    struct xsk_ctx *ctx = xsk->ctx;
    struct bpf_map_info map_info;
    int fd, err;

    /* First call: learn how many maps the program references. */
    err = bpf_obj_get_info_by_fd(ctx->prog_fd, &prog_info, &prog_len);
    if (err)
        return err;

    num_maps = prog_info.nr_map_ids;

    map_ids = calloc(prog_info.nr_map_ids, sizeof(*map_ids));
    if (!map_ids)
        return -ENOMEM;

    /* Second call: retrieve the map ids into map_ids[]. */
    memset(&prog_info, 0, prog_len);
    prog_info.nr_map_ids = num_maps;
    prog_info.map_ids = (__u64)(unsigned long)map_ids;

    err = bpf_obj_get_info_by_fd(ctx->prog_fd, &prog_info, &prog_len);
    if (err)
        goto out_map_ids;

    ctx->xsks_map_fd = -1;

    for (i = 0; i < prog_info.nr_map_ids; i++) {
        fd = bpf_map_get_fd_by_id(map_ids[i]);
        if (fd < 0)
            continue;  /* map may have vanished; keep scanning */

        memset(&map_info, 0, map_len);
        err = bpf_obj_get_info_by_fd(fd, &map_info, &map_len);
        if (err) {
            close(fd);
            continue;
        }

        if (!strncmp(map_info.name, "xsks_map", sizeof(map_info.name))) {
            ctx->xsks_map_fd = fd;  /* keep this fd open for the caller */
            break;
        }

        close(fd);
    }

    if (ctx->xsks_map_fd == -1)
        err = -ENOENT;

out_map_ids:
    free(map_ids);
    return err;
}
0664 
/* Publish this socket in the XSKMAP: map ctx->queue_id to the socket fd so
 * the XDP program can redirect that queue's traffic here. Returns the
 * bpf_map_update_elem() result.
 */
static int xsk_set_bpf_maps(struct xsk_socket *xsk)
{
    struct xsk_ctx *ctx = xsk->ctx;

    return bpf_map_update_elem(ctx->xsks_map_fd, &ctx->queue_id,
                   &xsk->fd, 0);
}
0672 
0673 static int xsk_link_lookup(int ifindex, __u32 *prog_id, int *link_fd)
0674 {
0675     struct bpf_link_info link_info;
0676     __u32 link_len;
0677     __u32 id = 0;
0678     int err;
0679     int fd;
0680 
0681     while (true) {
0682         err = bpf_link_get_next_id(id, &id);
0683         if (err) {
0684             if (errno == ENOENT) {
0685                 err = 0;
0686                 break;
0687             }
0688             pr_warn("can't get next link: %s\n", strerror(errno));
0689             break;
0690         }
0691 
0692         fd = bpf_link_get_fd_by_id(id);
0693         if (fd < 0) {
0694             if (errno == ENOENT)
0695                 continue;
0696             pr_warn("can't get link by id (%u): %s\n", id, strerror(errno));
0697             err = -errno;
0698             break;
0699         }
0700 
0701         link_len = sizeof(struct bpf_link_info);
0702         memset(&link_info, 0, link_len);
0703         err = bpf_obj_get_info_by_fd(fd, &link_info, &link_len);
0704         if (err) {
0705             pr_warn("can't get link info: %s\n", strerror(errno));
0706             close(fd);
0707             break;
0708         }
0709         if (link_info.type == BPF_LINK_TYPE_XDP) {
0710             if (link_info.xdp.ifindex == ifindex) {
0711                 *link_fd = fd;
0712                 if (prog_id)
0713                     *prog_id = link_info.prog_id;
0714                 break;
0715             }
0716         }
0717         close(fd);
0718     }
0719 
0720     return err;
0721 }
0722 
/* Detect whether the kernel supports attaching XDP programs via bpf_link.
 * Returns true if an XDP link already exists on loopback, or if creating a
 * throwaway link with a trivial XDP_PASS program succeeds; false on any
 * failure (treated as "not supported").
 */
static bool xsk_probe_bpf_link(void)
{
    LIBBPF_OPTS(bpf_link_create_opts, opts, .flags = XDP_FLAGS_SKB_MODE);
    struct bpf_insn insns[2] = {
        BPF_MOV64_IMM(BPF_REG_0, XDP_PASS),
        BPF_EXIT_INSN()
    };
    int prog_fd, link_fd = -1, insn_cnt = ARRAY_SIZE(insns);
    int ifindex_lo = 1;  /* loopback is always present */
    bool ret = false;
    int err;

    /* An existing XDP link on lo already proves support. */
    err = xsk_link_lookup(ifindex_lo, NULL, &link_fd);
    if (err)
        return ret;

    if (link_fd >= 0)
        return true;

    prog_fd = bpf_prog_load(BPF_PROG_TYPE_XDP, NULL, "GPL", insns, insn_cnt, NULL);
    if (prog_fd < 0)
        return ret;

    link_fd = bpf_link_create(prog_fd, ifindex_lo, BPF_XDP, &opts);
    close(prog_fd);

    if (link_fd >= 0) {
        ret = true;
        close(link_fd);  /* probe only; detach immediately */
    }

    return ret;
}
0756 
0757 static int xsk_create_xsk_struct(int ifindex, struct xsk_socket *xsk)
0758 {
0759     char ifname[IFNAMSIZ];
0760     struct xsk_ctx *ctx;
0761     char *interface;
0762 
0763     ctx = calloc(1, sizeof(*ctx));
0764     if (!ctx)
0765         return -ENOMEM;
0766 
0767     interface = if_indextoname(ifindex, &ifname[0]);
0768     if (!interface) {
0769         free(ctx);
0770         return -errno;
0771     }
0772 
0773     ctx->ifindex = ifindex;
0774     libbpf_strlcpy(ctx->ifname, ifname, IFNAMSIZ);
0775 
0776     xsk->ctx = ctx;
0777     xsk->ctx->has_bpf_link = xsk_probe_bpf_link();
0778 
0779     return 0;
0780 }
0781 
/* Set up all XDP resources from scratch: create the XSKMAP, load the
 * redirection program, attach it (via bpf_link or netlink depending on
 * kernel support) and, for RX-capable sockets, publish the socket in the
 * map. Unwinds everything in reverse order on failure.
 * @xsks_map_fd is unused here; the caller publishes the map fd itself.
 */
static int xsk_init_xdp_res(struct xsk_socket *xsk,
                int *xsks_map_fd)
{
    struct xsk_ctx *ctx = xsk->ctx;
    int err;

    err = xsk_create_bpf_maps(xsk);
    if (err)
        return err;

    err = xsk_load_xdp_prog(xsk);
    if (err)
        goto err_load_xdp_prog;

    if (ctx->has_bpf_link)
        err = xsk_create_bpf_link(xsk);
    else
        err = bpf_xdp_attach(xsk->ctx->ifindex, ctx->prog_fd,
                     xsk->config.xdp_flags, NULL);

    if (err)
        goto err_attach_xdp_prog;

    /* TX-only sockets never receive redirected traffic, so they do not
     * need an XSKMAP entry.
     */
    if (!xsk->rx)
        return err;

    err = xsk_set_bpf_maps(xsk);
    if (err)
        goto err_set_bpf_maps;

    return err;

err_set_bpf_maps:
    /* Detach the program the same way it was attached. */
    if (ctx->has_bpf_link)
        close(ctx->link_fd);
    else
        bpf_xdp_detach(ctx->ifindex, 0, NULL);
err_attach_xdp_prog:
    close(ctx->prog_fd);
err_load_xdp_prog:
    xsk_delete_bpf_maps(xsk);
    return err;
}
0825 
/* Reuse XDP resources already attached to the interface: open the program
 * by @prog_id, locate its XSKMAP, and (for RX sockets) publish the socket
 * in the map. Closes the fds it opened on failure.
 */
static int xsk_lookup_xdp_res(struct xsk_socket *xsk, int *xsks_map_fd, int prog_id)
{
    struct xsk_ctx *ctx = xsk->ctx;
    int err;

    ctx->prog_fd = bpf_prog_get_fd_by_id(prog_id);
    if (ctx->prog_fd < 0) {
        err = -errno;
        goto err_prog_fd;
    }
    err = xsk_lookup_bpf_maps(xsk);
    if (err)
        goto err_lookup_maps;

    /* TX-only sockets need no XSKMAP entry. */
    if (!xsk->rx)
        return err;

    err = xsk_set_bpf_maps(xsk);
    if (err)
        goto err_set_maps;

    return err;

err_set_maps:
    close(ctx->xsks_map_fd);
err_lookup_maps:
    close(ctx->prog_fd);
err_prog_fd:
    /* NOTE(review): assumes ctx->link_fd was populated by the caller
     * (__xsk_setup_xdp_prog's xsk_link_lookup) when has_bpf_link - confirm.
     */
    if (ctx->has_bpf_link)
        close(ctx->link_fd);
    return err;
}
0858 
/* Attach or reuse the XDP redirection machinery for @xsk's interface:
 * query whether a program is already attached (via bpf_link or netlink),
 * then either initialize fresh resources or hook into the existing ones.
 * On success optionally reports the XSKMAP fd through @xsks_map_fd.
 */
static int __xsk_setup_xdp_prog(struct xsk_socket *_xdp, int *xsks_map_fd)
{
    struct xsk_socket *xsk = _xdp;
    struct xsk_ctx *ctx = xsk->ctx;
    __u32 prog_id = 0;
    int err;

    /* Side effect: when a link exists, ctx->link_fd is populated here. */
    if (ctx->has_bpf_link)
        err = xsk_link_lookup(ctx->ifindex, &prog_id, &ctx->link_fd);
    else
        err = bpf_xdp_query_id(ctx->ifindex, xsk->config.xdp_flags, &prog_id);

    if (err)
        return err;

    err = !prog_id ? xsk_init_xdp_res(xsk, xsks_map_fd) :
             xsk_lookup_xdp_res(xsk, xsks_map_fd, prog_id);

    if (!err && xsks_map_fd)
        *xsks_map_fd = ctx->xsks_map_fd;

    return err;
}
0882 
/* Public wrapper around __xsk_setup_xdp_prog() for an existing socket. */
int xsk_setup_xdp_prog_xsk(struct xsk_socket *xsk, int *xsks_map_fd)
{
    return __xsk_setup_xdp_prog(xsk, xsks_map_fd);
}
0887 
0888 static struct xsk_ctx *xsk_get_ctx(struct xsk_umem *umem, int ifindex,
0889                    __u32 queue_id)
0890 {
0891     struct xsk_ctx *ctx;
0892 
0893     if (list_empty(&umem->ctx_list))
0894         return NULL;
0895 
0896     list_for_each_entry(ctx, &umem->ctx_list, list) {
0897         if (ctx->ifindex == ifindex && ctx->queue_id == queue_id) {
0898             ctx->refcount++;
0899             return ctx;
0900         }
0901     }
0902 
0903     return NULL;
0904 }
0905 
/* Drop a reference on @ctx; on the last reference, optionally unmap its
 * fill/completion rings and free it. @unmap is false when the rings are
 * still owned elsewhere (e.g. saved on the umem).
 */
static void xsk_put_ctx(struct xsk_ctx *ctx, bool unmap)
{
    struct xsk_umem *umem = ctx->umem;
    struct xdp_mmap_offsets off;
    int err;

    if (--ctx->refcount)
        return;

    if (!unmap)
        goto out_free;

    /* If we cannot re-read the offsets, skip the unmap rather than pass
     * a bogus base address to munmap().
     */
    err = xsk_get_mmap_offsets(umem->fd, &off);
    if (err)
        goto out_free;

    /* ring - desc-offset recovers the original mmap base address. */
    munmap(ctx->fill->ring - off.fr.desc, off.fr.desc + umem->config.fill_size *
           sizeof(__u64));
    munmap(ctx->comp->ring - off.cr.desc, off.cr.desc + umem->config.comp_size *
           sizeof(__u64));

out_free:
    list_del(&ctx->list);
    free(ctx);
}
0931 
/* Allocate a new per-(ifindex, queue) context on @umem's list. For the
 * first socket on a umem the saved rings are consumed elsewhere, so a new
 * fill/comp ring pair is created on @xsk's own fd; otherwise the saved
 * ring state is copied into the caller's structs. Returns the context with
 * refcount 1, or NULL on allocation/ring-setup failure.
 */
static struct xsk_ctx *xsk_create_ctx(struct xsk_socket *xsk,
                      struct xsk_umem *umem, int ifindex,
                      const char *ifname, __u32 queue_id,
                      struct xsk_ring_prod *fill,
                      struct xsk_ring_cons *comp)
{
    struct xsk_ctx *ctx;
    int err;

    ctx = calloc(1, sizeof(*ctx));
    if (!ctx)
        return NULL;

    if (!umem->fill_save) {
        /* Saved rings already claimed: set up fresh rings on this fd. */
        err = xsk_create_umem_rings(umem, xsk->fd, fill, comp);
        if (err) {
            free(ctx);
            return NULL;
        }
    } else if (umem->fill_save != fill || umem->comp_save != comp) {
        /* Copy over rings to new structs. */
        memcpy(fill, umem->fill_save, sizeof(*fill));
        memcpy(comp, umem->comp_save, sizeof(*comp));
    }

    ctx->ifindex = ifindex;
    ctx->refcount = 1;
    ctx->umem = umem;
    ctx->queue_id = queue_id;
    libbpf_strlcpy(ctx->ifname, ifname, IFNAMSIZ);

    ctx->fill = fill;
    ctx->comp = comp;
    list_add(&ctx->list, &umem->ctx_list);
    ctx->has_bpf_link = xsk_probe_bpf_link();
    return ctx;
}
0969 
/* Free a socket created by xsk_create_xsk_struct() and its context. */
static void xsk_destroy_xsk_struct(struct xsk_socket *xsk)
{
    free(xsk->ctx);
    free(xsk);
}
0975 
/* Point @xsk at a caller-supplied XSKMAP @fd and insert the socket into it
 * at its queue id. Returns the bpf_map_update_elem() result.
 */
int xsk_socket__update_xskmap(struct xsk_socket *xsk, int fd)
{
    xsk->ctx->xsks_map_fd = fd;
    return xsk_set_bpf_maps(xsk);
}
0981 
0982 int xsk_setup_xdp_prog(int ifindex, int *xsks_map_fd)
0983 {
0984     struct xsk_socket *xsk;
0985     int res;
0986 
0987     xsk = calloc(1, sizeof(*xsk));
0988     if (!xsk)
0989         return -ENOMEM;
0990 
0991     res = xsk_create_xsk_struct(ifindex, xsk);
0992     if (res) {
0993         free(xsk);
0994         return -EINVAL;
0995     }
0996 
0997     res = __xsk_setup_xdp_prog(xsk, xsks_map_fd);
0998 
0999     xsk_destroy_xsk_struct(xsk);
1000 
1001     return res;
1002 }
1003 
/* Create an AF_XDP socket on (@ifname, @queue_id) that shares @umem with
 * any other sockets bound to the same queue.  At least one of @rx/@tx
 * must be supplied.  @fill/@comp are required only for the first socket
 * on a given (ifindex, queue_id) pair; later sockets reuse the rings
 * held by the existing context.  On success stores the new socket in
 * *@xsk_ptr and returns 0; on failure returns a negative errno-style
 * code and releases everything acquired along the way.
 */
int xsk_socket__create_shared(struct xsk_socket **xsk_ptr,
                  const char *ifname,
                  __u32 queue_id, struct xsk_umem *umem,
                  struct xsk_ring_cons *rx,
                  struct xsk_ring_prod *tx,
                  struct xsk_ring_prod *fill,
                  struct xsk_ring_cons *comp,
                  const struct xsk_socket_config *usr_config)
{
    bool unmap, rx_setup_done = false, tx_setup_done = false;
    void *rx_map = NULL, *tx_map = NULL;
    struct sockaddr_xdp sxdp = {};
    struct xdp_mmap_offsets off;
    struct xsk_socket *xsk;
    struct xsk_ctx *ctx;
    int err, ifindex;

    if (!umem || !xsk_ptr || !(rx || tx))
        return -EFAULT;

    /* True when the caller passed a fill ring other than the one saved
     * at umem creation; the flag is handed to xsk_put_ctx() on the
     * error path to decide whether the rings should be unmapped.
     */
    unmap = umem->fill_save != fill;

    xsk = calloc(1, sizeof(*xsk));
    if (!xsk)
        return -ENOMEM;

    err = xsk_set_xdp_socket_config(&xsk->config, usr_config);
    if (err)
        goto out_xsk_alloc;

    xsk->outstanding_tx = 0;
    ifindex = if_nametoindex(ifname);
    if (!ifindex) {
        err = -errno;
        goto out_xsk_alloc;
    }

    /* First socket reuses the fd created with the umem; every further
     * socket sharing the umem gets its own AF_XDP socket fd.
     */
    if (umem->refcount++ > 0) {
        xsk->fd = socket(AF_XDP, SOCK_RAW | SOCK_CLOEXEC, 0);
        if (xsk->fd < 0) {
            err = -errno;
            goto out_xsk_alloc;
        }
    } else {
        xsk->fd = umem->fd;
        rx_setup_done = umem->rx_ring_setup_done;
        tx_setup_done = umem->tx_ring_setup_done;
    }

    /* Find or create the per-(ifindex, queue_id) context holding the
     * fill/completion rings shared by all sockets on this queue.
     * Creating a fresh context requires both fill and comp rings.
     */
    ctx = xsk_get_ctx(umem, ifindex, queue_id);
    if (!ctx) {
        if (!fill || !comp) {
            err = -EFAULT;
            goto out_socket;
        }

        ctx = xsk_create_ctx(xsk, umem, ifindex, ifname, queue_id,
                     fill, comp);
        if (!ctx) {
            err = -ENOMEM;
            goto out_socket;
        }
    }
    xsk->ctx = ctx;

    /* Size the kernel-side RX/TX descriptor rings.  When the umem fd
     * doubles as the socket fd, remember the ring is already set up so
     * a later shared socket does not redo this step.
     */
    if (rx && !rx_setup_done) {
        err = setsockopt(xsk->fd, SOL_XDP, XDP_RX_RING,
                 &xsk->config.rx_size,
                 sizeof(xsk->config.rx_size));
        if (err) {
            err = -errno;
            goto out_put_ctx;
        }
        if (xsk->fd == umem->fd)
            umem->rx_ring_setup_done = true;
    }
    if (tx && !tx_setup_done) {
        err = setsockopt(xsk->fd, SOL_XDP, XDP_TX_RING,
                 &xsk->config.tx_size,
                 sizeof(xsk->config.tx_size));
        if (err) {
            err = -errno;
            goto out_put_ctx;
        }
        if (xsk->fd == umem->fd)
            umem->tx_ring_setup_done = true;
    }

    err = xsk_get_mmap_offsets(xsk->fd, &off);
    if (err) {
        err = -errno;
        goto out_put_ctx;
    }

    /* Map the RX descriptor ring and populate the user-space ring
     * bookkeeping: producer/consumer pointers live inside the shared
     * mapping, with cached copies kept in the ring struct.
     */
    if (rx) {
        rx_map = mmap(NULL, off.rx.desc +
                  xsk->config.rx_size * sizeof(struct xdp_desc),
                  PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
                  xsk->fd, XDP_PGOFF_RX_RING);
        if (rx_map == MAP_FAILED) {
            err = -errno;
            goto out_put_ctx;
        }

        rx->mask = xsk->config.rx_size - 1;
        rx->size = xsk->config.rx_size;
        rx->producer = rx_map + off.rx.producer;
        rx->consumer = rx_map + off.rx.consumer;
        rx->flags = rx_map + off.rx.flags;
        rx->ring = rx_map + off.rx.desc;
        rx->cached_prod = *rx->producer;
        rx->cached_cons = *rx->consumer;
    }
    xsk->rx = rx;

    /* Same mapping and bookkeeping for the TX descriptor ring. */
    if (tx) {
        tx_map = mmap(NULL, off.tx.desc +
                  xsk->config.tx_size * sizeof(struct xdp_desc),
                  PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
                  xsk->fd, XDP_PGOFF_TX_RING);
        if (tx_map == MAP_FAILED) {
            err = -errno;
            goto out_mmap_rx;
        }

        tx->mask = xsk->config.tx_size - 1;
        tx->size = xsk->config.tx_size;
        tx->producer = tx_map + off.tx.producer;
        tx->consumer = tx_map + off.tx.consumer;
        tx->flags = tx_map + off.tx.flags;
        tx->ring = tx_map + off.tx.desc;
        tx->cached_prod = *tx->producer;
        /* cached_cons is r->size bigger than the real consumer pointer
         * See xsk_prod_nb_free
         */
        tx->cached_cons = *tx->consumer + xsk->config.tx_size;
    }
    xsk->tx = tx;

    /* Bind to the device/queue.  Sockets after the first must pass
     * XDP_SHARED_UMEM plus the umem-owning fd; only the first bind
     * carries the caller's bind flags.
     */
    sxdp.sxdp_family = PF_XDP;
    sxdp.sxdp_ifindex = ctx->ifindex;
    sxdp.sxdp_queue_id = ctx->queue_id;
    if (umem->refcount > 1) {
        sxdp.sxdp_flags |= XDP_SHARED_UMEM;
        sxdp.sxdp_shared_umem_fd = umem->fd;
    } else {
        sxdp.sxdp_flags = xsk->config.bind_flags;
    }

    err = bind(xsk->fd, (struct sockaddr *)&sxdp, sizeof(sxdp));
    if (err) {
        err = -errno;
        goto out_mmap_tx;
    }

    /* Attach the default XDP program and register the socket in the
     * XSKMAP, unless the caller opted out via libbpf_flags.
     */
    if (!(xsk->config.libbpf_flags & XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD)) {
        err = __xsk_setup_xdp_prog(xsk, NULL);
        if (err)
            goto out_mmap_tx;
    }

    *xsk_ptr = xsk;
    /* The rings saved at umem creation are consumed by the first
     * successful socket; clear them so they are not reused.
     */
    umem->fill_save = NULL;
    umem->comp_save = NULL;
    return 0;

out_mmap_tx:
    if (tx)
        munmap(tx_map, off.tx.desc +
               xsk->config.tx_size * sizeof(struct xdp_desc));
out_mmap_rx:
    if (rx)
        munmap(rx_map, off.rx.desc +
               xsk->config.rx_size * sizeof(struct xdp_desc));
out_put_ctx:
    xsk_put_ctx(ctx, unmap);
out_socket:
    /* Undo the refcount bump; close the fd only if it is not the one
     * owned by the umem itself (refcount still nonzero after decrement).
     */
    if (--umem->refcount)
        close(xsk->fd);
out_xsk_alloc:
    free(xsk);
    return err;
}
1187 
1188 int xsk_socket__create(struct xsk_socket **xsk_ptr, const char *ifname,
1189                __u32 queue_id, struct xsk_umem *umem,
1190                struct xsk_ring_cons *rx, struct xsk_ring_prod *tx,
1191                const struct xsk_socket_config *usr_config)
1192 {
1193     if (!umem)
1194         return -EFAULT;
1195 
1196     return xsk_socket__create_shared(xsk_ptr, ifname, queue_id, umem,
1197                      rx, tx, umem->fill_save,
1198                      umem->comp_save, usr_config);
1199 }
1200 
1201 int xsk_umem__delete(struct xsk_umem *umem)
1202 {
1203     struct xdp_mmap_offsets off;
1204     int err;
1205 
1206     if (!umem)
1207         return 0;
1208 
1209     if (umem->refcount)
1210         return -EBUSY;
1211 
1212     err = xsk_get_mmap_offsets(umem->fd, &off);
1213     if (!err && umem->fill_save && umem->comp_save) {
1214         munmap(umem->fill_save->ring - off.fr.desc,
1215                off.fr.desc + umem->config.fill_size * sizeof(__u64));
1216         munmap(umem->comp_save->ring - off.cr.desc,
1217                off.cr.desc + umem->config.comp_size * sizeof(__u64));
1218     }
1219 
1220     close(umem->fd);
1221     free(umem);
1222 
1223     return 0;
1224 }
1225 
1226 void xsk_socket__delete(struct xsk_socket *xsk)
1227 {
1228     size_t desc_sz = sizeof(struct xdp_desc);
1229     struct xdp_mmap_offsets off;
1230     struct xsk_umem *umem;
1231     struct xsk_ctx *ctx;
1232     int err;
1233 
1234     if (!xsk)
1235         return;
1236 
1237     ctx = xsk->ctx;
1238     umem = ctx->umem;
1239 
1240     xsk_put_ctx(ctx, true);
1241 
1242     if (!ctx->refcount) {
1243         xsk_delete_bpf_maps(xsk);
1244         close(ctx->prog_fd);
1245         if (ctx->has_bpf_link)
1246             close(ctx->link_fd);
1247     }
1248 
1249     err = xsk_get_mmap_offsets(xsk->fd, &off);
1250     if (!err) {
1251         if (xsk->rx) {
1252             munmap(xsk->rx->ring - off.rx.desc,
1253                    off.rx.desc + xsk->config.rx_size * desc_sz);
1254         }
1255         if (xsk->tx) {
1256             munmap(xsk->tx->ring - off.tx.desc,
1257                    off.tx.desc + xsk->config.tx_size * desc_sz);
1258         }
1259     }
1260 
1261     umem->refcount--;
1262     /* Do not close an fd that also has an associated umem connected
1263      * to it.
1264      */
1265     if (xsk->fd != umem->fd)
1266         close(xsk->fd);
1267     free(xsk);
1268 }