0001 // SPDX-License-Identifier: GPL-2.0-only
0002 /* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io
0003  */
0004 
0005 /* A devmap's primary use is as a backend map for the XDP BPF helper call
0006  * bpf_redirect_map(). Because XDP is mostly concerned with performance, we
0007  * spent some effort to ensure the datapath with redirect maps does not use
0008  * any locking. This is a quick note on the details.
0009  *
0010  * We have three possible paths to get into the devmap control plane: bpf
0011  * syscalls, bpf programs, and driver side xmit/flush operations. A bpf syscall
0012  * will invoke an update, delete, or lookup operation. To ensure updates and
0013  * deletes appear atomic from the datapath side xchg() is used to modify the
0014  * netdev_map array. Then because the datapath does a lookup into the netdev_map
0015  * array (read-only) from an RCU critical section we use call_rcu() to wait for
0016  * an rcu grace period before free'ing the old data structures. This ensures the
0017  * datapath always has a valid copy. However, the datapath does a "flush"
0018  * operation that pushes any pending packets in the driver outside the RCU
0019  * critical section. Each bpf_dtab_netdev tracks these pending operations using
0020  * a per-cpu flush list. The bpf_dtab_netdev object will not be destroyed until
0021  * this list is empty, indicating outstanding flush operations have completed.
0022  *
0023  * BPF syscalls may race with BPF program calls on any of the update, delete
0024  * or lookup operations. As noted above, the xchg() operation also keeps the
0025  * netdev_map consistent in this case. From the devmap side, BPF programs
0026  * calling into these operations are the same as multiple user space threads
0027  * making system calls.
0028  *
0029  * Finally, any of the above may race with a netdev_unregister notifier. The
0030  * unregister notifier must search for net devices in the map structure that
0031  * contain a reference to the net device and remove them. This is a two-step
0032  * process: (a) dereference the bpf_dtab_netdev object in netdev_map and (b)
0033  * check to see if the ifindex is the same as the net_device being removed.
0034  * When removing the dev, a cmpxchg() is used to ensure the correct dev is
0035  * removed; in the case of a concurrent update or delete operation it is
0036  * possible that the initially referenced dev is no longer in the map. As the
0037  * notifier hook walks the map, we know that new dev references cannot be
0038  * added by the user because core infrastructure ensures dev_get_by_index()
0039  * calls will fail at this point.
0040  *
0041  * The devmap_hash type is a map type which interprets keys as ifindexes and
0042  * indexes these using a hashmap. This allows maps that use ifindex as key to be
0043  * densely packed instead of having holes in the lookup array for unused
0044  * ifindexes. The setup and packet enqueue/send code is shared between the two
0045  * types of devmap; only the lookup and insertion code differs.
0046  */
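
/* Illustration only (not part of the kernel source): a minimal sketch of the
 * "primary use" described above -- an XDP program that redirects frames via a
 * devmap using bpf_redirect_map(). The map name "tx_ports", its size, and the
 * program/section names are made up for this example; the map type, value
 * layout and helper call are the pieces implemented by this file.
 *
 *  #include <linux/bpf.h>
 *  #include <bpf/bpf_helpers.h>
 *
 *  struct {
 *      __uint(type, BPF_MAP_TYPE_DEVMAP);
 *      __uint(key_size, sizeof(__u32));
 *      __uint(value_size, sizeof(struct bpf_devmap_val));
 *      __uint(max_entries, 64);
 *  } tx_ports SEC(".maps");
 *
 *  SEC("xdp")
 *  int xdp_redirect_example(struct xdp_md *ctx)
 *  {
 *      // Redirect every frame to whatever device is stored at key 0.
 *      return bpf_redirect_map(&tx_ports, 0, 0);
 *  }
 *
 *  char _license[] SEC("license") = "GPL";
 */
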
0047 #include <linux/bpf.h>
0048 #include <net/xdp.h>
0049 #include <linux/filter.h>
0050 #include <trace/events/xdp.h>
0051 #include <linux/btf_ids.h>
0052 
0053 #define DEV_CREATE_FLAG_MASK \
0054     (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
0055 
0056 struct xdp_dev_bulk_queue {
0057     struct xdp_frame *q[DEV_MAP_BULK_SIZE];
0058     struct list_head flush_node;
0059     struct net_device *dev;
0060     struct net_device *dev_rx;
0061     struct bpf_prog *xdp_prog;
0062     unsigned int count;
0063 };
0064 
0065 struct bpf_dtab_netdev {
0066     struct net_device *dev; /* must be first member, due to tracepoint */
0067     struct hlist_node index_hlist;
0068     struct bpf_dtab *dtab;
0069     struct bpf_prog *xdp_prog;
0070     struct rcu_head rcu;
0071     unsigned int idx;
0072     struct bpf_devmap_val val;
0073 };
0074 
0075 struct bpf_dtab {
0076     struct bpf_map map;
0077     struct bpf_dtab_netdev __rcu **netdev_map; /* DEVMAP type only */
0078     struct list_head list;
0079 
0080     /* these are only used for DEVMAP_HASH type maps */
0081     struct hlist_head *dev_index_head;
0082     spinlock_t index_lock;
0083     unsigned int items;
0084     u32 n_buckets;
0085 };
0086 
0087 static DEFINE_PER_CPU(struct list_head, dev_flush_list);
0088 static DEFINE_SPINLOCK(dev_map_lock);
0089 static LIST_HEAD(dev_map_list);
0090 
0091 static struct hlist_head *dev_map_create_hash(unsigned int entries,
0092                           int numa_node)
0093 {
0094     int i;
0095     struct hlist_head *hash;
0096 
0097     hash = bpf_map_area_alloc((u64) entries * sizeof(*hash), numa_node);
0098     if (hash != NULL)
0099         for (i = 0; i < entries; i++)
0100             INIT_HLIST_HEAD(&hash[i]);
0101 
0102     return hash;
0103 }
0104 
0105 static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab,
0106                             int idx)
0107 {
0108     return &dtab->dev_index_head[idx & (dtab->n_buckets - 1)];
0109 }
0110 
0111 static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr)
0112 {
0113     u32 valsize = attr->value_size;
0114 
0115     /* Check the sanity of the attributes. Two value sizes are supported:
0116      * 4 bytes: ifindex
0117      * 8 bytes: ifindex + prog fd (a user-space sketch follows this function)
0118      */
0119     if (attr->max_entries == 0 || attr->key_size != 4 ||
0120         (valsize != offsetofend(struct bpf_devmap_val, ifindex) &&
0121          valsize != offsetofend(struct bpf_devmap_val, bpf_prog.fd)) ||
0122         attr->map_flags & ~DEV_CREATE_FLAG_MASK)
0123         return -EINVAL;
0124 
0125     /* Lookup returns a pointer straight to dev->ifindex, so make sure the
0126      * verifier prevents writes from the BPF side
0127      */
0128     attr->map_flags |= BPF_F_RDONLY_PROG;
0129 
0130 
0131     bpf_map_init_from_attr(&dtab->map, attr);
0132 
0133     if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
0134         dtab->n_buckets = roundup_pow_of_two(dtab->map.max_entries);
0135 
0136         if (!dtab->n_buckets) /* Overflow check */
0137             return -EINVAL;
0138     }
0139 
0140     if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
0141         dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets,
0142                                dtab->map.numa_node);
0143         if (!dtab->dev_index_head)
0144             return -ENOMEM;
0145 
0146         spin_lock_init(&dtab->index_lock);
0147     } else {
0148         dtab->netdev_map = bpf_map_area_alloc((u64) dtab->map.max_entries *
0149                               sizeof(struct bpf_dtab_netdev *),
0150                               dtab->map.numa_node);
0151         if (!dtab->netdev_map)
0152             return -ENOMEM;
0153     }
0154 
0155     return 0;
0156 }
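
/* Illustration only: a user-space sketch of the two value layouts accepted by
 * dev_map_init_map() above -- 4 bytes (ifindex only) or 8 bytes (ifindex plus
 * a prog fd). It assumes libbpf's bpf_map_create()/bpf_map_update_elem(); the
 * names setup_devmap, ifindex and devmap_prog_fd are hypothetical.
 *
 *  #include <linux/bpf.h>      // struct bpf_devmap_val, BPF_MAP_TYPE_DEVMAP
 *  #include <bpf/bpf.h>        // bpf_map_create(), bpf_map_update_elem()
 *
 *  int setup_devmap(int ifindex, int devmap_prog_fd)
 *  {
 *      // 8-byte layout: ifindex + fd of a BPF_XDP_DEVMAP program. Leaving
 *      // bpf_prog.fd at 0 (or using the 4-byte value) skips the second
 *      // program entirely.
 *      struct bpf_devmap_val val = {
 *          .ifindex = ifindex,
 *          .bpf_prog.fd = devmap_prog_fd,
 *      };
 *      __u32 key = 0;
 *      int map_fd;
 *
 *      map_fd = bpf_map_create(BPF_MAP_TYPE_DEVMAP, "tx_ports",
 *                              sizeof(__u32), sizeof(val), 64, NULL);
 *      if (map_fd < 0)
 *          return map_fd;
 *
 *      return bpf_map_update_elem(map_fd, &key, &val, BPF_ANY);
 *  }
 */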
0157 
0158 static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
0159 {
0160     struct bpf_dtab *dtab;
0161     int err;
0162 
0163     if (!capable(CAP_NET_ADMIN))
0164         return ERR_PTR(-EPERM);
0165 
0166     dtab = kzalloc(sizeof(*dtab), GFP_USER | __GFP_ACCOUNT);
0167     if (!dtab)
0168         return ERR_PTR(-ENOMEM);
0169 
0170     err = dev_map_init_map(dtab, attr);
0171     if (err) {
0172         kfree(dtab);
0173         return ERR_PTR(err);
0174     }
0175 
0176     spin_lock(&dev_map_lock);
0177     list_add_tail_rcu(&dtab->list, &dev_map_list);
0178     spin_unlock(&dev_map_lock);
0179 
0180     return &dtab->map;
0181 }
0182 
0183 static void dev_map_free(struct bpf_map *map)
0184 {
0185     struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
0186     int i;
0187 
0188     /* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
0189      * so the programs (there can be more than one that used this map) have
0190      * been disconnected from events. The following synchronize_rcu()
0191      * guarantees that RCU read critical sections have completed and waits
0192      * for preempt-disable regions (NAPI being the relevant context here), so
0193      * we are certain there will be no further reads against the netdev_map
0194      * and all flush operations are complete. Flush operations can only be
0195      * done from NAPI context for this reason.
0196      */
0197 
0198     spin_lock(&dev_map_lock);
0199     list_del_rcu(&dtab->list);
0200     spin_unlock(&dev_map_lock);
0201 
0202     bpf_clear_redirect_map(map);
0203     synchronize_rcu();
0204 
0205     /* Make sure prior __dev_map_entry_free() calls have completed. */
0206     rcu_barrier();
0207 
0208     if (dtab->map.map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
0209         for (i = 0; i < dtab->n_buckets; i++) {
0210             struct bpf_dtab_netdev *dev;
0211             struct hlist_head *head;
0212             struct hlist_node *next;
0213 
0214             head = dev_map_index_hash(dtab, i);
0215 
0216             hlist_for_each_entry_safe(dev, next, head, index_hlist) {
0217                 hlist_del_rcu(&dev->index_hlist);
0218                 if (dev->xdp_prog)
0219                     bpf_prog_put(dev->xdp_prog);
0220                 dev_put(dev->dev);
0221                 kfree(dev);
0222             }
0223         }
0224 
0225         bpf_map_area_free(dtab->dev_index_head);
0226     } else {
0227         for (i = 0; i < dtab->map.max_entries; i++) {
0228             struct bpf_dtab_netdev *dev;
0229 
0230             dev = rcu_dereference_raw(dtab->netdev_map[i]);
0231             if (!dev)
0232                 continue;
0233 
0234             if (dev->xdp_prog)
0235                 bpf_prog_put(dev->xdp_prog);
0236             dev_put(dev->dev);
0237             kfree(dev);
0238         }
0239 
0240         bpf_map_area_free(dtab->netdev_map);
0241     }
0242 
0243     kfree(dtab);
0244 }
0245 
0246 static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
0247 {
0248     struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
0249     u32 index = key ? *(u32 *)key : U32_MAX;
0250     u32 *next = next_key;
0251 
0252     if (index >= dtab->map.max_entries) {
0253         *next = 0;
0254         return 0;
0255     }
0256 
0257     if (index == dtab->map.max_entries - 1)
0258         return -ENOENT;
0259     *next = index + 1;
0260     return 0;
0261 }
0262 
0263 /* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or
0264  * by local_bh_disable() (from XDP calls inside NAPI). The
0265  * rcu_read_lock_bh_held() below makes lockdep accept both.
0266  */
0267 static void *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key)
0268 {
0269     struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
0270     struct hlist_head *head = dev_map_index_hash(dtab, key);
0271     struct bpf_dtab_netdev *dev;
0272 
0273     hlist_for_each_entry_rcu(dev, head, index_hlist,
0274                  lockdep_is_held(&dtab->index_lock))
0275         if (dev->idx == key)
0276             return dev;
0277 
0278     return NULL;
0279 }
0280 
0281 static int dev_map_hash_get_next_key(struct bpf_map *map, void *key,
0282                     void *next_key)
0283 {
0284     struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
0285     u32 idx, *next = next_key;
0286     struct bpf_dtab_netdev *dev, *next_dev;
0287     struct hlist_head *head;
0288     int i = 0;
0289 
0290     if (!key)
0291         goto find_first;
0292 
0293     idx = *(u32 *)key;
0294 
0295     dev = __dev_map_hash_lookup_elem(map, idx);
0296     if (!dev)
0297         goto find_first;
0298 
0299     next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&dev->index_hlist)),
0300                     struct bpf_dtab_netdev, index_hlist);
0301 
0302     if (next_dev) {
0303         *next = next_dev->idx;
0304         return 0;
0305     }
0306 
0307     i = idx & (dtab->n_buckets - 1);
0308     i++;
0309 
0310  find_first:
0311     for (; i < dtab->n_buckets; i++) {
0312         head = dev_map_index_hash(dtab, i);
0313 
0314         next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),
0315                         struct bpf_dtab_netdev,
0316                         index_hlist);
0317         if (next_dev) {
0318             *next = next_dev->idx;
0319             return 0;
0320         }
0321     }
0322 
0323     return -ENOENT;
0324 }
0325 
0326 static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog,
0327                 struct xdp_frame **frames, int n,
0328                 struct net_device *dev)
0329 {
0330     struct xdp_txq_info txq = { .dev = dev };
0331     struct xdp_buff xdp;
0332     int i, nframes = 0;
0333 
0334     for (i = 0; i < n; i++) {
0335         struct xdp_frame *xdpf = frames[i];
0336         u32 act;
0337         int err;
0338 
0339         xdp_convert_frame_to_buff(xdpf, &xdp);
0340         xdp.txq = &txq;
0341 
0342         act = bpf_prog_run_xdp(xdp_prog, &xdp);
0343         switch (act) {
0344         case XDP_PASS:
0345             err = xdp_update_frame_from_buff(&xdp, xdpf);
0346             if (unlikely(err < 0))
0347                 xdp_return_frame_rx_napi(xdpf);
0348             else
0349                 frames[nframes++] = xdpf;
0350             break;
0351         default:
0352             bpf_warn_invalid_xdp_action(NULL, xdp_prog, act);
0353             fallthrough;
0354         case XDP_ABORTED:
0355             trace_xdp_exception(dev, xdp_prog, act);
0356             fallthrough;
0357         case XDP_DROP:
0358             xdp_return_frame_rx_napi(xdpf);
0359             break;
0360         }
0361     }
0362     return nframes; /* sent frames count */
0363 }
0364 
0365 static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
0366 {
0367     struct net_device *dev = bq->dev;
0368     unsigned int cnt = bq->count;
0369     int sent = 0, err = 0;
0370     int to_send = cnt;
0371     int i;
0372 
0373     if (unlikely(!cnt))
0374         return;
0375 
0376     for (i = 0; i < cnt; i++) {
0377         struct xdp_frame *xdpf = bq->q[i];
0378 
0379         prefetch(xdpf);
0380     }
0381 
0382     if (bq->xdp_prog) {
0383         to_send = dev_map_bpf_prog_run(bq->xdp_prog, bq->q, cnt, dev);
0384         if (!to_send)
0385             goto out;
0386     }
0387 
0388     sent = dev->netdev_ops->ndo_xdp_xmit(dev, to_send, bq->q, flags);
0389     if (sent < 0) {
0390         /* If ndo_xdp_xmit fails with an errno, no frames have
0391          * been xmit'ed.
0392          */
0393         err = sent;
0394         sent = 0;
0395     }
0396 
0397     /* If not all frames have been transmitted, it is our
0398      * responsibility to free them
0399      */
0400     for (i = sent; unlikely(i < to_send); i++)
0401         xdp_return_frame_rx_napi(bq->q[i]);
0402 
0403 out:
0404     bq->count = 0;
0405     trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, cnt - sent, err);
0406 }
0407 
0408 /* __dev_flush is called from xdp_do_flush() which _must_ be signalled from the
0409  * driver before returning from its napi->poll() routine. See the comment above
0410  * xdp_do_flush() in filter.c.
0411  */
0412 void __dev_flush(void)
0413 {
0414     struct list_head *flush_list = this_cpu_ptr(&dev_flush_list);
0415     struct xdp_dev_bulk_queue *bq, *tmp;
0416 
0417     list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
0418         bq_xmit_all(bq, XDP_XMIT_FLUSH);
0419         bq->dev_rx = NULL;
0420         bq->xdp_prog = NULL;
0421         __list_del_clearprev(&bq->flush_node);
0422     }
0423 }
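
/* Illustration only: the driver-side contract described above, sketched for a
 * hypothetical driver ("mydrv"). Any NAPI poll routine whose RX path may call
 * xdp_do_redirect() must invoke xdp_do_flush() before returning, so that the
 * per-CPU flush list drained by __dev_flush() is emptied before leaving the
 * softirq.
 *
 *  static int mydrv_napi_poll(struct napi_struct *napi, int budget)
 *  {
 *      // RX processing; may enqueue frames via xdp_do_redirect().
 *      int work_done = mydrv_clean_rx_ring(napi, budget);
 *
 *      // Kick any frames bulk-queued into devmaps (and cpumaps/xskmaps).
 *      xdp_do_flush();
 *
 *      if (work_done < budget)
 *          napi_complete_done(napi, work_done);
 *      return work_done;
 *  }
 */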
0424 
0425 /* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or
0426  * by local_bh_disable() (from XDP calls inside NAPI). The
0427  * rcu_read_lock_bh_held() below makes lockdep accept both.
0428  */
0429 static void *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
0430 {
0431     struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
0432     struct bpf_dtab_netdev *obj;
0433 
0434     if (key >= map->max_entries)
0435         return NULL;
0436 
0437     obj = rcu_dereference_check(dtab->netdev_map[key],
0438                     rcu_read_lock_bh_held());
0439     return obj;
0440 }
0441 
0442 /* Runs in NAPI, i.e., in softirq context under local_bh_disable(). Thus,
0443  * percpu variable access is safe and map elements stick around. See the
0444  * comment above xdp_do_flush() in filter.c.
0445  */
0446 static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
0447                struct net_device *dev_rx, struct bpf_prog *xdp_prog)
0448 {
0449     struct list_head *flush_list = this_cpu_ptr(&dev_flush_list);
0450     struct xdp_dev_bulk_queue *bq = this_cpu_ptr(dev->xdp_bulkq);
0451 
0452     if (unlikely(bq->count == DEV_MAP_BULK_SIZE))
0453         bq_xmit_all(bq, 0);
0454 
0455     /* The ingress dev_rx will be the same for all xdp_frames in the
0456      * bulk queue, because the bq is stored per-CPU and must be flushed
0457      * at the end of the net_device driver's NAPI function.
0458      *
0459      * Do the same with xdp_prog and flush_list since these fields
0460      * are only ever modified together.
0461      */
0462     if (!bq->dev_rx) {
0463         bq->dev_rx = dev_rx;
0464         bq->xdp_prog = xdp_prog;
0465         list_add(&bq->flush_node, flush_list);
0466     }
0467 
0468     bq->q[bq->count++] = xdpf;
0469 }
0470 
0471 static inline int __xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
0472                 struct net_device *dev_rx,
0473                 struct bpf_prog *xdp_prog)
0474 {
0475     int err;
0476 
0477     if (!dev->netdev_ops->ndo_xdp_xmit)
0478         return -EOPNOTSUPP;
0479 
0480     err = xdp_ok_fwd_dev(dev, xdp_get_frame_len(xdpf));
0481     if (unlikely(err))
0482         return err;
0483 
0484     bq_enqueue(dev, xdpf, dev_rx, xdp_prog);
0485     return 0;
0486 }
0487 
0488 static u32 dev_map_bpf_prog_run_skb(struct sk_buff *skb, struct bpf_dtab_netdev *dst)
0489 {
0490     struct xdp_txq_info txq = { .dev = dst->dev };
0491     struct xdp_buff xdp;
0492     u32 act;
0493 
0494     if (!dst->xdp_prog)
0495         return XDP_PASS;
0496 
0497     __skb_pull(skb, skb->mac_len);
0498     xdp.txq = &txq;
0499 
0500     act = bpf_prog_run_generic_xdp(skb, &xdp, dst->xdp_prog);
0501     switch (act) {
0502     case XDP_PASS:
0503         __skb_push(skb, skb->mac_len);
0504         break;
0505     default:
0506         bpf_warn_invalid_xdp_action(NULL, dst->xdp_prog, act);
0507         fallthrough;
0508     case XDP_ABORTED:
0509         trace_xdp_exception(dst->dev, dst->xdp_prog, act);
0510         fallthrough;
0511     case XDP_DROP:
0512         kfree_skb(skb);
0513         break;
0514     }
0515 
0516     return act;
0517 }
0518 
0519 int dev_xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
0520             struct net_device *dev_rx)
0521 {
0522     return __xdp_enqueue(dev, xdpf, dev_rx, NULL);
0523 }
0524 
0525 int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_frame *xdpf,
0526             struct net_device *dev_rx)
0527 {
0528     struct net_device *dev = dst->dev;
0529 
0530     return __xdp_enqueue(dev, xdpf, dev_rx, dst->xdp_prog);
0531 }
0532 
0533 static bool is_valid_dst(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf)
0534 {
0535     if (!obj ||
0536         !obj->dev->netdev_ops->ndo_xdp_xmit)
0537         return false;
0538 
0539     if (xdp_ok_fwd_dev(obj->dev, xdp_get_frame_len(xdpf)))
0540         return false;
0541 
0542     return true;
0543 }
0544 
0545 static int dev_map_enqueue_clone(struct bpf_dtab_netdev *obj,
0546                  struct net_device *dev_rx,
0547                  struct xdp_frame *xdpf)
0548 {
0549     struct xdp_frame *nxdpf;
0550 
0551     nxdpf = xdpf_clone(xdpf);
0552     if (!nxdpf)
0553         return -ENOMEM;
0554 
0555     bq_enqueue(obj->dev, nxdpf, dev_rx, obj->xdp_prog);
0556 
0557     return 0;
0558 }
0559 
0560 static inline bool is_ifindex_excluded(int *excluded, int num_excluded, int ifindex)
0561 {
0562     while (num_excluded--) {
0563         if (ifindex == excluded[num_excluded])
0564             return true;
0565     }
0566     return false;
0567 }
0568 
0569 /* Get ifindex of each upper device. 'indexes' must be able to hold at
0570  * least MAX_NEST_DEV elements.
0571  * Returns the number of ifindexes added.
0572  */
0573 static int get_upper_ifindexes(struct net_device *dev, int *indexes)
0574 {
0575     struct net_device *upper;
0576     struct list_head *iter;
0577     int n = 0;
0578 
0579     netdev_for_each_upper_dev_rcu(dev, upper, iter) {
0580         indexes[n++] = upper->ifindex;
0581     }
0582     return n;
0583 }
0584 
0585 int dev_map_enqueue_multi(struct xdp_frame *xdpf, struct net_device *dev_rx,
0586               struct bpf_map *map, bool exclude_ingress)
0587 {
0588     struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
0589     struct bpf_dtab_netdev *dst, *last_dst = NULL;
0590     int excluded_devices[1+MAX_NEST_DEV];
0591     struct hlist_head *head;
0592     int num_excluded = 0;
0593     unsigned int i;
0594     int err;
0595 
0596     if (exclude_ingress) {
0597         num_excluded = get_upper_ifindexes(dev_rx, excluded_devices);
0598         excluded_devices[num_excluded++] = dev_rx->ifindex;
0599     }
0600 
0601     if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
0602         for (i = 0; i < map->max_entries; i++) {
0603             dst = rcu_dereference_check(dtab->netdev_map[i],
0604                             rcu_read_lock_bh_held());
0605             if (!is_valid_dst(dst, xdpf))
0606                 continue;
0607 
0608             if (is_ifindex_excluded(excluded_devices, num_excluded, dst->dev->ifindex))
0609                 continue;
0610 
0611             /* we only need n-1 clones; last_dst enqueued below */
0612             if (!last_dst) {
0613                 last_dst = dst;
0614                 continue;
0615             }
0616 
0617             err = dev_map_enqueue_clone(last_dst, dev_rx, xdpf);
0618             if (err)
0619                 return err;
0620 
0621             last_dst = dst;
0622         }
0623     } else { /* BPF_MAP_TYPE_DEVMAP_HASH */
0624         for (i = 0; i < dtab->n_buckets; i++) {
0625             head = dev_map_index_hash(dtab, i);
0626             hlist_for_each_entry_rcu(dst, head, index_hlist,
0627                          lockdep_is_held(&dtab->index_lock)) {
0628                 if (!is_valid_dst(dst, xdpf))
0629                     continue;
0630 
0631                 if (is_ifindex_excluded(excluded_devices, num_excluded,
0632                             dst->dev->ifindex))
0633                     continue;
0634 
0635                 /* we only need n-1 clones; last_dst enqueued below */
0636                 if (!last_dst) {
0637                     last_dst = dst;
0638                     continue;
0639                 }
0640 
0641                 err = dev_map_enqueue_clone(last_dst, dev_rx, xdpf);
0642                 if (err)
0643                     return err;
0644 
0645                 last_dst = dst;
0646             }
0647         }
0648     }
0649 
0650     /* consume the last copy of the frame */
0651     if (last_dst)
0652         bq_enqueue(last_dst->dev, xdpf, dev_rx, last_dst->xdp_prog);
0653     else
0654         xdp_return_frame_rx_napi(xdpf); /* dtab is empty */
0655 
0656     return 0;
0657 }
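
/* Illustration only: how the broadcast path above is reached from a BPF
 * program. With BPF_F_BROADCAST the key argument is ignored and the frame is
 * cloned to every device in the map; BPF_F_EXCLUDE_INGRESS additionally skips
 * the receiving device and its upper devices. "tx_ports" is the hypothetical
 * map from the earlier sketch.
 *
 *  SEC("xdp")
 *  int xdp_broadcast_example(struct xdp_md *ctx)
 *  {
 *      return bpf_redirect_map(&tx_ports, 0,
 *                              BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS);
 *  }
 */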
0658 
0659 int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
0660                  struct bpf_prog *xdp_prog)
0661 {
0662     int err;
0663 
0664     err = xdp_ok_fwd_dev(dst->dev, skb->len);
0665     if (unlikely(err))
0666         return err;
0667 
0668     /* The redirect has already succeeded semantically at this point, so
0669      * we just return 0 even if the packet is dropped. The helper below
0670      * takes care of freeing the skb.
0671      */
0672     if (dev_map_bpf_prog_run_skb(skb, dst) != XDP_PASS)
0673         return 0;
0674 
0675     skb->dev = dst->dev;
0676     generic_xdp_tx(skb, xdp_prog);
0677 
0678     return 0;
0679 }
0680 
0681 static int dev_map_redirect_clone(struct bpf_dtab_netdev *dst,
0682                   struct sk_buff *skb,
0683                   struct bpf_prog *xdp_prog)
0684 {
0685     struct sk_buff *nskb;
0686     int err;
0687 
0688     nskb = skb_clone(skb, GFP_ATOMIC);
0689     if (!nskb)
0690         return -ENOMEM;
0691 
0692     err = dev_map_generic_redirect(dst, nskb, xdp_prog);
0693     if (unlikely(err)) {
0694         consume_skb(nskb);
0695         return err;
0696     }
0697 
0698     return 0;
0699 }
0700 
0701 int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb,
0702                struct bpf_prog *xdp_prog, struct bpf_map *map,
0703                bool exclude_ingress)
0704 {
0705     struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
0706     struct bpf_dtab_netdev *dst, *last_dst = NULL;
0707     int excluded_devices[1+MAX_NEST_DEV];
0708     struct hlist_head *head;
0709     struct hlist_node *next;
0710     int num_excluded = 0;
0711     unsigned int i;
0712     int err;
0713 
0714     if (exclude_ingress) {
0715         num_excluded = get_upper_ifindexes(dev, excluded_devices);
0716         excluded_devices[num_excluded++] = dev->ifindex;
0717     }
0718 
0719     if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
0720         for (i = 0; i < map->max_entries; i++) {
0721             dst = rcu_dereference_check(dtab->netdev_map[i],
0722                             rcu_read_lock_bh_held());
0723             if (!dst)
0724                 continue;
0725 
0726             if (is_ifindex_excluded(excluded_devices, num_excluded, dst->dev->ifindex))
0727                 continue;
0728 
0729             /* we only need n-1 clones; last_dst enqueued below */
0730             if (!last_dst) {
0731                 last_dst = dst;
0732                 continue;
0733             }
0734 
0735             err = dev_map_redirect_clone(last_dst, skb, xdp_prog);
0736             if (err)
0737                 return err;
0738 
0739             last_dst = dst;
0740 
0741         }
0742     } else { /* BPF_MAP_TYPE_DEVMAP_HASH */
0743         for (i = 0; i < dtab->n_buckets; i++) {
0744             head = dev_map_index_hash(dtab, i);
0745             hlist_for_each_entry_safe(dst, next, head, index_hlist) {
0746                 if (!dst)
0747                     continue;
0748 
0749                 if (is_ifindex_excluded(excluded_devices, num_excluded,
0750                             dst->dev->ifindex))
0751                     continue;
0752 
0753                 /* we only need n-1 clones; last_dst enqueued below */
0754                 if (!last_dst) {
0755                     last_dst = dst;
0756                     continue;
0757                 }
0758 
0759                 err = dev_map_redirect_clone(last_dst, skb, xdp_prog);
0760                 if (err)
0761                     return err;
0762 
0763                 last_dst = dst;
0764             }
0765         }
0766     }
0767 
0768     /* consume the first skb and return */
0769     if (last_dst)
0770         return dev_map_generic_redirect(last_dst, skb, xdp_prog);
0771 
0772     /* dtab is empty */
0773     consume_skb(skb);
0774     return 0;
0775 }
0776 
0777 static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
0778 {
0779     struct bpf_dtab_netdev *obj = __dev_map_lookup_elem(map, *(u32 *)key);
0780 
0781     return obj ? &obj->val : NULL;
0782 }
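
/* Illustration only: from a BPF program, bpf_map_lookup_elem() on a devmap
 * returns a pointer to the read-only struct bpf_devmap_val exposed above
 * (writes are rejected because of the BPF_F_RDONLY_PROG flag forced in
 * dev_map_init_map()). "tx_ports" is the hypothetical map from the earlier
 * sketch.
 *
 *  SEC("xdp")
 *  int xdp_lookup_example(struct xdp_md *ctx)
 *  {
 *      __u32 key = 0;
 *      struct bpf_devmap_val *val;
 *
 *      // Only redirect if an entry actually exists at this key.
 *      val = bpf_map_lookup_elem(&tx_ports, &key);
 *      if (val && val->ifindex)
 *          return bpf_redirect_map(&tx_ports, key, 0);
 *      return XDP_PASS;
 *  }
 */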
0783 
0784 static void *dev_map_hash_lookup_elem(struct bpf_map *map, void *key)
0785 {
0786     struct bpf_dtab_netdev *obj = __dev_map_hash_lookup_elem(map,
0787                                 *(u32 *)key);
0788     return obj ? &obj->val : NULL;
0789 }
0790 
0791 static void __dev_map_entry_free(struct rcu_head *rcu)
0792 {
0793     struct bpf_dtab_netdev *dev;
0794 
0795     dev = container_of(rcu, struct bpf_dtab_netdev, rcu);
0796     if (dev->xdp_prog)
0797         bpf_prog_put(dev->xdp_prog);
0798     dev_put(dev->dev);
0799     kfree(dev);
0800 }
0801 
0802 static int dev_map_delete_elem(struct bpf_map *map, void *key)
0803 {
0804     struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
0805     struct bpf_dtab_netdev *old_dev;
0806     int k = *(u32 *)key;
0807 
0808     if (k >= map->max_entries)
0809         return -EINVAL;
0810 
0811     old_dev = unrcu_pointer(xchg(&dtab->netdev_map[k], NULL));
0812     if (old_dev)
0813         call_rcu(&old_dev->rcu, __dev_map_entry_free);
0814     return 0;
0815 }
0816 
0817 static int dev_map_hash_delete_elem(struct bpf_map *map, void *key)
0818 {
0819     struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
0820     struct bpf_dtab_netdev *old_dev;
0821     int k = *(u32 *)key;
0822     unsigned long flags;
0823     int ret = -ENOENT;
0824 
0825     spin_lock_irqsave(&dtab->index_lock, flags);
0826 
0827     old_dev = __dev_map_hash_lookup_elem(map, k);
0828     if (old_dev) {
0829         dtab->items--;
0830         hlist_del_init_rcu(&old_dev->index_hlist);
0831         call_rcu(&old_dev->rcu, __dev_map_entry_free);
0832         ret = 0;
0833     }
0834     spin_unlock_irqrestore(&dtab->index_lock, flags);
0835 
0836     return ret;
0837 }
0838 
0839 static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net,
0840                             struct bpf_dtab *dtab,
0841                             struct bpf_devmap_val *val,
0842                             unsigned int idx)
0843 {
0844     struct bpf_prog *prog = NULL;
0845     struct bpf_dtab_netdev *dev;
0846 
0847     dev = bpf_map_kmalloc_node(&dtab->map, sizeof(*dev),
0848                    GFP_NOWAIT | __GFP_NOWARN,
0849                    dtab->map.numa_node);
0850     if (!dev)
0851         return ERR_PTR(-ENOMEM);
0852 
0853     dev->dev = dev_get_by_index(net, val->ifindex);
0854     if (!dev->dev)
0855         goto err_out;
0856 
0857     if (val->bpf_prog.fd > 0) {
0858         prog = bpf_prog_get_type_dev(val->bpf_prog.fd,
0859                          BPF_PROG_TYPE_XDP, false);
0860         if (IS_ERR(prog))
0861             goto err_put_dev;
0862         if (prog->expected_attach_type != BPF_XDP_DEVMAP ||
0863             !bpf_prog_map_compatible(&dtab->map, prog))
0864             goto err_put_prog;
0865     }
0866 
0867     dev->idx = idx;
0868     dev->dtab = dtab;
0869     if (prog) {
0870         dev->xdp_prog = prog;
0871         dev->val.bpf_prog.id = prog->aux->id;
0872     } else {
0873         dev->xdp_prog = NULL;
0874         dev->val.bpf_prog.id = 0;
0875     }
0876     dev->val.ifindex = val->ifindex;
0877 
0878     return dev;
0879 err_put_prog:
0880     bpf_prog_put(prog);
0881 err_put_dev:
0882     dev_put(dev->dev);
0883 err_out:
0884     kfree(dev);
0885     return ERR_PTR(-EINVAL);
0886 }
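
/* Illustration only: the kind of program whose fd can be stored in
 * bpf_devmap_val.bpf_prog.fd and accepted by __dev_map_alloc_node() above; it
 * must be loaded with expected_attach_type == BPF_XDP_DEVMAP. The ELF section
 * name ("xdp/devmap" in recent libbpf, "xdp_devmap" in older versions) is a
 * libbpf convention, not something enforced by this file.
 *
 *  SEC("xdp/devmap")
 *  int xdp_devmap_filter(struct xdp_md *ctx)
 *  {
 *      // For this attach type, ctx->egress_ifindex identifies the target dev.
 *      if (ctx->egress_ifindex == 0)
 *          return XDP_DROP;
 *      return XDP_PASS;
 *  }
 */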
0887 
0888 static int __dev_map_update_elem(struct net *net, struct bpf_map *map,
0889                  void *key, void *value, u64 map_flags)
0890 {
0891     struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
0892     struct bpf_dtab_netdev *dev, *old_dev;
0893     struct bpf_devmap_val val = {};
0894     u32 i = *(u32 *)key;
0895 
0896     if (unlikely(map_flags > BPF_EXIST))
0897         return -EINVAL;
0898     if (unlikely(i >= dtab->map.max_entries))
0899         return -E2BIG;
0900     if (unlikely(map_flags == BPF_NOEXIST))
0901         return -EEXIST;
0902 
0903     /* already verified value_size <= sizeof val */
0904     memcpy(&val, value, map->value_size);
0905 
0906     if (!val.ifindex) {
0907         dev = NULL;
0908         /* cannot specify an fd if ifindex is 0 */
0909         if (val.bpf_prog.fd > 0)
0910             return -EINVAL;
0911     } else {
0912         dev = __dev_map_alloc_node(net, dtab, &val, i);
0913         if (IS_ERR(dev))
0914             return PTR_ERR(dev);
0915     }
0916 
0917     /* Use call_rcu() here to ensure that RCU critical sections have
0918      * completed, remembering that the driver-side flush operation will
0919      * happen before the net device is removed.
0920      */
0921     old_dev = unrcu_pointer(xchg(&dtab->netdev_map[i], RCU_INITIALIZER(dev)));
0922     if (old_dev)
0923         call_rcu(&old_dev->rcu, __dev_map_entry_free);
0924 
0925     return 0;
0926 }
0927 
0928 static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
0929                    u64 map_flags)
0930 {
0931     return __dev_map_update_elem(current->nsproxy->net_ns,
0932                      map, key, value, map_flags);
0933 }
0934 
0935 static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map,
0936                      void *key, void *value, u64 map_flags)
0937 {
0938     struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
0939     struct bpf_dtab_netdev *dev, *old_dev;
0940     struct bpf_devmap_val val = {};
0941     u32 idx = *(u32 *)key;
0942     unsigned long flags;
0943     int err = -EEXIST;
0944 
0945     /* already verified value_size <= sizeof val */
0946     memcpy(&val, value, map->value_size);
0947 
0948     if (unlikely(map_flags > BPF_EXIST || !val.ifindex))
0949         return -EINVAL;
0950 
0951     spin_lock_irqsave(&dtab->index_lock, flags);
0952 
0953     old_dev = __dev_map_hash_lookup_elem(map, idx);
0954     if (old_dev && (map_flags & BPF_NOEXIST))
0955         goto out_err;
0956 
0957     dev = __dev_map_alloc_node(net, dtab, &val, idx);
0958     if (IS_ERR(dev)) {
0959         err = PTR_ERR(dev);
0960         goto out_err;
0961     }
0962 
0963     if (old_dev) {
0964         hlist_del_rcu(&old_dev->index_hlist);
0965     } else {
0966         if (dtab->items >= dtab->map.max_entries) {
0967             spin_unlock_irqrestore(&dtab->index_lock, flags);
0968             call_rcu(&dev->rcu, __dev_map_entry_free);
0969             return -E2BIG;
0970         }
0971         dtab->items++;
0972     }
0973 
0974     hlist_add_head_rcu(&dev->index_hlist,
0975                dev_map_index_hash(dtab, idx));
0976     spin_unlock_irqrestore(&dtab->index_lock, flags);
0977 
0978     if (old_dev)
0979         call_rcu(&old_dev->rcu, __dev_map_entry_free);
0980 
0981     return 0;
0982 
0983 out_err:
0984     spin_unlock_irqrestore(&dtab->index_lock, flags);
0985     return err;
0986 }
0987 
0988 static int dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value,
0989                    u64 map_flags)
0990 {
0991     return __dev_map_hash_update_elem(current->nsproxy->net_ns,
0992                      map, key, value, map_flags);
0993 }
0994 
0995 static int dev_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags)
0996 {
0997     return __bpf_xdp_redirect_map(map, ifindex, flags,
0998                       BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS,
0999                       __dev_map_lookup_elem);
1000 }
1001 
1002 static int dev_hash_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags)
1003 {
1004     return __bpf_xdp_redirect_map(map, ifindex, flags,
1005                       BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS,
1006                       __dev_map_hash_lookup_elem);
1007 }
1008 
1009 BTF_ID_LIST_SINGLE(dev_map_btf_ids, struct, bpf_dtab)
1010 const struct bpf_map_ops dev_map_ops = {
1011     .map_meta_equal = bpf_map_meta_equal,
1012     .map_alloc = dev_map_alloc,
1013     .map_free = dev_map_free,
1014     .map_get_next_key = dev_map_get_next_key,
1015     .map_lookup_elem = dev_map_lookup_elem,
1016     .map_update_elem = dev_map_update_elem,
1017     .map_delete_elem = dev_map_delete_elem,
1018     .map_check_btf = map_check_no_btf,
1019     .map_btf_id = &dev_map_btf_ids[0],
1020     .map_redirect = dev_map_redirect,
1021 };
1022 
1023 const struct bpf_map_ops dev_map_hash_ops = {
1024     .map_meta_equal = bpf_map_meta_equal,
1025     .map_alloc = dev_map_alloc,
1026     .map_free = dev_map_free,
1027     .map_get_next_key = dev_map_hash_get_next_key,
1028     .map_lookup_elem = dev_map_hash_lookup_elem,
1029     .map_update_elem = dev_map_hash_update_elem,
1030     .map_delete_elem = dev_map_hash_delete_elem,
1031     .map_check_btf = map_check_no_btf,
1032     .map_btf_id = &dev_map_btf_ids[0],
1033     .map_redirect = dev_hash_map_redirect,
1034 };
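
/* Illustration only: a BPF-side definition of the hash-based variant handled
 * by dev_map_hash_ops. Keys are arbitrary u32 values -- conventionally
 * ifindexes, as the header comment notes -- so the map stays densely packed.
 * The map name "tx_ports_hash" is made up; includes are as in the first
 * sketch near the top of this file.
 *
 *  struct {
 *      __uint(type, BPF_MAP_TYPE_DEVMAP_HASH);
 *      __uint(key_size, sizeof(__u32));
 *      __uint(value_size, sizeof(struct bpf_devmap_val));
 *      __uint(max_entries, 64);
 *  } tx_ports_hash SEC(".maps");
 *
 *  SEC("xdp")
 *  int xdp_redirect_hash_example(struct xdp_md *ctx)
 *  {
 *      // Key by the ingress ifindex; this only redirects if an entry was
 *      // added for that ifindex from user space.
 *      return bpf_redirect_map(&tx_ports_hash, ctx->ingress_ifindex, 0);
 *  }
 */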
1035 
1036 static void dev_map_hash_remove_netdev(struct bpf_dtab *dtab,
1037                        struct net_device *netdev)
1038 {
1039     unsigned long flags;
1040     u32 i;
1041 
1042     spin_lock_irqsave(&dtab->index_lock, flags);
1043     for (i = 0; i < dtab->n_buckets; i++) {
1044         struct bpf_dtab_netdev *dev;
1045         struct hlist_head *head;
1046         struct hlist_node *next;
1047 
1048         head = dev_map_index_hash(dtab, i);
1049 
1050         hlist_for_each_entry_safe(dev, next, head, index_hlist) {
1051             if (netdev != dev->dev)
1052                 continue;
1053 
1054             dtab->items--;
1055             hlist_del_rcu(&dev->index_hlist);
1056             call_rcu(&dev->rcu, __dev_map_entry_free);
1057         }
1058     }
1059     spin_unlock_irqrestore(&dtab->index_lock, flags);
1060 }
1061 
1062 static int dev_map_notification(struct notifier_block *notifier,
1063                 ulong event, void *ptr)
1064 {
1065     struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
1066     struct bpf_dtab *dtab;
1067     int i, cpu;
1068 
1069     switch (event) {
1070     case NETDEV_REGISTER:
1071         if (!netdev->netdev_ops->ndo_xdp_xmit || netdev->xdp_bulkq)
1072             break;
1073 
1074         /* will be freed in free_netdev() */
1075         netdev->xdp_bulkq = alloc_percpu(struct xdp_dev_bulk_queue);
1076         if (!netdev->xdp_bulkq)
1077             return NOTIFY_BAD;
1078 
1079         for_each_possible_cpu(cpu)
1080             per_cpu_ptr(netdev->xdp_bulkq, cpu)->dev = netdev;
1081         break;
1082     case NETDEV_UNREGISTER:
1083         /* This rcu_read_lock/unlock pair is needed both because
1084          * dev_map_list is an RCU list and to ensure that a delete
1085          * operation does not free a netdev_map entry while we
1086          * are comparing it against the netdev being unregistered.
1087          */
1088         rcu_read_lock();
1089         list_for_each_entry_rcu(dtab, &dev_map_list, list) {
1090             if (dtab->map.map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
1091                 dev_map_hash_remove_netdev(dtab, netdev);
1092                 continue;
1093             }
1094 
1095             for (i = 0; i < dtab->map.max_entries; i++) {
1096                 struct bpf_dtab_netdev *dev, *odev;
1097 
1098                 dev = rcu_dereference(dtab->netdev_map[i]);
1099                 if (!dev || netdev != dev->dev)
1100                     continue;
1101                 odev = unrcu_pointer(cmpxchg(&dtab->netdev_map[i], RCU_INITIALIZER(dev), NULL));
1102                 if (dev == odev)
1103                     call_rcu(&dev->rcu,
1104                          __dev_map_entry_free);
1105             }
1106         }
1107         rcu_read_unlock();
1108         break;
1109     default:
1110         break;
1111     }
1112     return NOTIFY_OK;
1113 }
1114 
1115 static struct notifier_block dev_map_notifier = {
1116     .notifier_call = dev_map_notification,
1117 };
1118 
1119 static int __init dev_map_init(void)
1120 {
1121     int cpu;
1122 
1123     /* Ensure the tracepoint's shadow struct _bpf_dtab_netdev is in sync */
1124     BUILD_BUG_ON(offsetof(struct bpf_dtab_netdev, dev) !=
1125              offsetof(struct _bpf_dtab_netdev, dev));
1126     register_netdevice_notifier(&dev_map_notifier);
1127 
1128     for_each_possible_cpu(cpu)
1129         INIT_LIST_HEAD(&per_cpu(dev_flush_list, cpu));
1130     return 0;
1131 }
1132 
1133 subsys_initcall(dev_map_init);