0001 // SPDX-License-Identifier: GPL-2.0
0002 /* XDP sockets
0003  *
0004  * AF_XDP sockets allow a channel between XDP programs and userspace
0005  * applications.
0006  * Copyright(c) 2018 Intel Corporation.
0007  *
0008  * Author(s): Björn Töpel <bjorn.topel@intel.com>
0009  *        Magnus Karlsson <magnus.karlsson@intel.com>
0010  */
0011 
0012 #define pr_fmt(fmt) "AF_XDP: %s: " fmt, __func__
0013 
0014 #include <linux/if_xdp.h>
0015 #include <linux/init.h>
0016 #include <linux/sched/mm.h>
0017 #include <linux/sched/signal.h>
0018 #include <linux/sched/task.h>
0019 #include <linux/socket.h>
0020 #include <linux/file.h>
0021 #include <linux/uaccess.h>
0022 #include <linux/net.h>
0023 #include <linux/netdevice.h>
0024 #include <linux/rculist.h>
0025 #include <net/xdp_sock_drv.h>
0026 #include <net/busy_poll.h>
0027 #include <net/xdp.h>
0028 
0029 #include "xsk_queue.h"
0030 #include "xdp_umem.h"
0031 #include "xsk.h"
0032 
0033 #define TX_BATCH_SIZE 32
0034 
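/* Per-CPU list of sockets that received packets during the current XDP
 * processing cycle; __xsk_map_redirect() queues sockets here and
 * __xsk_map_flush() publishes their Rx descriptors in one go.
 */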
0035 static DEFINE_PER_CPU(struct list_head, xskmap_flush_list);
0036 
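/* The helpers below maintain the XDP_RING_NEED_WAKEUP flag that zero-copy
 * drivers expose to user space: while the flag is set on a ring, the
 * application has to kick the kernel (poll()/sendto()/recvmsg()) for
 * processing to continue. cached_need_wakeup avoids redundant ring writes.
 */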
0037 void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool)
0038 {
0039     if (pool->cached_need_wakeup & XDP_WAKEUP_RX)
0040         return;
0041 
0042     pool->fq->ring->flags |= XDP_RING_NEED_WAKEUP;
0043     pool->cached_need_wakeup |= XDP_WAKEUP_RX;
0044 }
0045 EXPORT_SYMBOL(xsk_set_rx_need_wakeup);
0046 
0047 void xsk_set_tx_need_wakeup(struct xsk_buff_pool *pool)
0048 {
0049     struct xdp_sock *xs;
0050 
0051     if (pool->cached_need_wakeup & XDP_WAKEUP_TX)
0052         return;
0053 
0054     rcu_read_lock();
0055     list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
0056         xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
0057     }
0058     rcu_read_unlock();
0059 
0060     pool->cached_need_wakeup |= XDP_WAKEUP_TX;
0061 }
0062 EXPORT_SYMBOL(xsk_set_tx_need_wakeup);
0063 
0064 void xsk_clear_rx_need_wakeup(struct xsk_buff_pool *pool)
0065 {
0066     if (!(pool->cached_need_wakeup & XDP_WAKEUP_RX))
0067         return;
0068 
0069     pool->fq->ring->flags &= ~XDP_RING_NEED_WAKEUP;
0070     pool->cached_need_wakeup &= ~XDP_WAKEUP_RX;
0071 }
0072 EXPORT_SYMBOL(xsk_clear_rx_need_wakeup);
0073 
0074 void xsk_clear_tx_need_wakeup(struct xsk_buff_pool *pool)
0075 {
0076     struct xdp_sock *xs;
0077 
0078     if (!(pool->cached_need_wakeup & XDP_WAKEUP_TX))
0079         return;
0080 
0081     rcu_read_lock();
0082     list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
0083         xs->tx->ring->flags &= ~XDP_RING_NEED_WAKEUP;
0084     }
0085     rcu_read_unlock();
0086 
0087     pool->cached_need_wakeup &= ~XDP_WAKEUP_TX;
0088 }
0089 EXPORT_SYMBOL(xsk_clear_tx_need_wakeup);
0090 
0091 bool xsk_uses_need_wakeup(struct xsk_buff_pool *pool)
0092 {
0093     return pool->uses_need_wakeup;
0094 }
0095 EXPORT_SYMBOL(xsk_uses_need_wakeup);
0096 
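/* Look up the buffer pool registered for a given queue id, if any. Used by
 * zero-copy capable drivers to find the pool bound to an Rx or Tx queue.
 */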
0097 struct xsk_buff_pool *xsk_get_pool_from_qid(struct net_device *dev,
0098                         u16 queue_id)
0099 {
0100     if (queue_id < dev->real_num_rx_queues)
0101         return dev->_rx[queue_id].pool;
0102     if (queue_id < dev->real_num_tx_queues)
0103         return dev->_tx[queue_id].pool;
0104 
0105     return NULL;
0106 }
0107 EXPORT_SYMBOL(xsk_get_pool_from_qid);
0108 
0109 void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id)
0110 {
0111     if (queue_id < dev->num_rx_queues)
0112         dev->_rx[queue_id].pool = NULL;
0113     if (queue_id < dev->num_tx_queues)
0114         dev->_tx[queue_id].pool = NULL;
0115 }
0116 
0117 /* The buffer pool is stored both in the _rx struct and the _tx struct as we do
0118  * not know if the device has more tx queues than rx, or the opposite.
0119  * This might also change during run time.
0120  */
0121 int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool,
0122             u16 queue_id)
0123 {
0124     if (queue_id >= max_t(unsigned int,
0125                   dev->real_num_rx_queues,
0126                   dev->real_num_tx_queues))
0127         return -EINVAL;
0128 
0129     if (queue_id < dev->real_num_rx_queues)
0130         dev->_rx[queue_id].pool = pool;
0131     if (queue_id < dev->real_num_tx_queues)
0132         dev->_tx[queue_id].pool = pool;
0133 
0134     return 0;
0135 }
0136 
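/* Zero-copy Rx: the frame already lives in an xsk buffer owned by the pool,
 * so only its (address, length) descriptor has to be posted to the Rx ring.
 */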
0137 static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
0138 {
0139     struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp);
0140     u64 addr;
0141     int err;
0142 
0143     addr = xp_get_handle(xskb);
0144     err = xskq_prod_reserve_desc(xs->rx, addr, len);
0145     if (err) {
0146         xs->rx_queue_full++;
0147         return err;
0148     }
0149 
0150     xp_release(xskb);
0151     return 0;
0152 }
0153 
0154 static void xsk_copy_xdp(struct xdp_buff *to, struct xdp_buff *from, u32 len)
0155 {
0156     void *from_buf, *to_buf;
0157     u32 metalen;
0158 
0159     if (unlikely(xdp_data_meta_unsupported(from))) {
0160         from_buf = from->data;
0161         to_buf = to->data;
0162         metalen = 0;
0163     } else {
0164         from_buf = from->data_meta;
0165         metalen = from->data - from->data_meta;
0166         to_buf = to->data - metalen;
0167     }
0168 
0169     memcpy(to_buf, from_buf, len + metalen);
0170 }
0171 
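/* Copy-mode Rx: allocate a buffer from the pool, copy the frame (including
 * any XDP metadata) into it and post it to the Rx ring.
 */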
0172 static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
0173 {
0174     struct xdp_buff *xsk_xdp;
0175     int err;
0176     u32 len;
0177 
0178     len = xdp->data_end - xdp->data;
0179     if (len > xsk_pool_get_rx_frame_size(xs->pool)) {
0180         xs->rx_dropped++;
0181         return -ENOSPC;
0182     }
0183 
0184     xsk_xdp = xsk_buff_alloc(xs->pool);
0185     if (!xsk_xdp) {
0186         xs->rx_dropped++;
0187         return -ENOMEM;
0188     }
0189 
0190     xsk_copy_xdp(xsk_xdp, xdp, len);
0191     err = __xsk_rcv_zc(xs, xsk_xdp, len);
0192     if (err) {
0193         xsk_buff_free(xsk_xdp);
0194         return err;
0195     }
0196     return 0;
0197 }
0198 
0199 static bool xsk_tx_writeable(struct xdp_sock *xs)
0200 {
0201     if (xskq_cons_present_entries(xs->tx) > xs->tx->nentries / 2)
0202         return false;
0203 
0204     return true;
0205 }
0206 
0207 static bool xsk_is_bound(struct xdp_sock *xs)
0208 {
0209     if (READ_ONCE(xs->state) == XSK_BOUND) {
0210         /* Matches smp_wmb() in bind(). */
0211         smp_rmb();
0212         return true;
0213     }
0214     return false;
0215 }
0216 
0217 static int xsk_rcv_check(struct xdp_sock *xs, struct xdp_buff *xdp)
0218 {
0219     if (!xsk_is_bound(xs))
0220         return -ENXIO;
0221 
0222     if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
0223         return -EINVAL;
0224 
0225     sk_mark_napi_id_once_xdp(&xs->sk, xdp);
0226     return 0;
0227 }
0228 
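/* Publish the reserved Rx descriptors to user space, release the consumed
 * fill ring entries and wake up readers of the socket.
 */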
0229 static void xsk_flush(struct xdp_sock *xs)
0230 {
0231     xskq_prod_submit(xs->rx);
0232     __xskq_cons_release(xs->pool->fq);
0233     sock_def_readable(&xs->sk);
0234 }
0235 
0236 int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
0237 {
0238     int err;
0239 
0240     spin_lock_bh(&xs->rx_lock);
0241     err = xsk_rcv_check(xs, xdp);
0242     if (!err) {
0243         err = __xsk_rcv(xs, xdp);
0244         xsk_flush(xs);
0245     }
0246     spin_unlock_bh(&xs->rx_lock);
0247     return err;
0248 }
0249 
0250 static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
0251 {
0252     int err;
0253     u32 len;
0254 
0255     err = xsk_rcv_check(xs, xdp);
0256     if (err)
0257         return err;
0258 
0259     if (xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) {
0260         len = xdp->data_end - xdp->data;
0261         return __xsk_rcv_zc(xs, xdp, len);
0262     }
0263 
0264     err = __xsk_rcv(xs, xdp);
0265     if (!err)
0266         xdp_return_buff(xdp);
0267     return err;
0268 }
0269 
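/* Called for XDP_REDIRECT into an XSKMAP. The socket is put on the per-CPU
 * flush list so that __xsk_map_flush(), run at the end of the XDP
 * processing cycle, can publish all queued Rx descriptors at once.
 */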
0270 int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp)
0271 {
0272     struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list);
0273     int err;
0274 
0275     err = xsk_rcv(xs, xdp);
0276     if (err)
0277         return err;
0278 
0279     if (!xs->flush_node.prev)
0280         list_add(&xs->flush_node, flush_list);
0281 
0282     return 0;
0283 }
0284 
0285 void __xsk_map_flush(void)
0286 {
0287     struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list);
0288     struct xdp_sock *xs, *tmp;
0289 
0290     list_for_each_entry_safe(xs, tmp, flush_list, flush_node) {
0291         xsk_flush(xs);
0292         __list_del_clearprev(&xs->flush_node);
0293     }
0294 }
0295 
0296 void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries)
0297 {
0298     xskq_prod_submit_n(pool->cq, nb_entries);
0299 }
0300 EXPORT_SYMBOL(xsk_tx_completed);
0301 
0302 void xsk_tx_release(struct xsk_buff_pool *pool)
0303 {
0304     struct xdp_sock *xs;
0305 
0306     rcu_read_lock();
0307     list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
0308         __xskq_cons_release(xs->tx);
0309         if (xsk_tx_writeable(xs))
0310             xs->sk.sk_write_space(&xs->sk);
0311     }
0312     rcu_read_unlock();
0313 }
0314 EXPORT_SYMBOL(xsk_tx_release);
0315 
0316 bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc)
0317 {
0318     struct xdp_sock *xs;
0319 
0320     rcu_read_lock();
0321     list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
0322         if (!xskq_cons_peek_desc(xs->tx, desc, pool)) {
0323             xs->tx->queue_empty_descs++;
0324             continue;
0325         }
0326 
0327         /* This is the backpressure mechanism for the Tx path.
0328          * Reserve space in the completion queue and only proceed
0329          * if there is space in it. This avoids having to implement
0330          * any buffering in the Tx path.
0331          */
0332         if (xskq_prod_reserve_addr(pool->cq, desc->addr))
0333             goto out;
0334 
0335         xskq_cons_release(xs->tx);
0336         rcu_read_unlock();
0337         return true;
0338     }
0339 
0340 out:
0341     rcu_read_unlock();
0342     return false;
0343 }
0344 EXPORT_SYMBOL(xsk_tx_peek_desc);
0345 
0346 static u32 xsk_tx_peek_release_fallback(struct xsk_buff_pool *pool, u32 max_entries)
0347 {
0348     struct xdp_desc *descs = pool->tx_descs;
0349     u32 nb_pkts = 0;
0350 
0351     while (nb_pkts < max_entries && xsk_tx_peek_desc(pool, &descs[nb_pkts]))
0352         nb_pkts++;
0353 
0354     xsk_tx_release(pool);
0355     return nb_pkts;
0356 }
0357 
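/* Batched Tx peek/release used by zero-copy drivers. The batched fast path
 * is only taken when a single socket uses the pool; otherwise it falls back
 * to the per-descriptor xsk_tx_peek_desc() loop above.
 */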
0358 u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, u32 max_entries)
0359 {
0360     struct xdp_sock *xs;
0361     u32 nb_pkts;
0362 
0363     rcu_read_lock();
0364     if (!list_is_singular(&pool->xsk_tx_list)) {
0365         /* Fallback to the non-batched version */
0366         rcu_read_unlock();
0367         return xsk_tx_peek_release_fallback(pool, max_entries);
0368     }
0369 
0370     xs = list_first_or_null_rcu(&pool->xsk_tx_list, struct xdp_sock, tx_list);
0371     if (!xs) {
0372         nb_pkts = 0;
0373         goto out;
0374     }
0375 
0376     max_entries = xskq_cons_nb_entries(xs->tx, max_entries);
0377     nb_pkts = xskq_cons_read_desc_batch(xs->tx, pool, max_entries);
0378     if (!nb_pkts) {
0379         xs->tx->queue_empty_descs++;
0380         goto out;
0381     }
0382 
0383     /* This is the backpressure mechanism for the Tx path. Try to
0384      * reserve space in the completion queue for all packets, but
0385      * if there are fewer slots available, just process that many
0386      * packets. This avoids having to implement any buffering in
0387      * the Tx path.
0388      */
0389     nb_pkts = xskq_prod_reserve_addr_batch(pool->cq, pool->tx_descs, nb_pkts);
0390     if (!nb_pkts)
0391         goto out;
0392 
0393     xskq_cons_release_n(xs->tx, max_entries);
0394     __xskq_cons_release(xs->tx);
0395     xs->sk.sk_write_space(&xs->sk);
0396 
0397 out:
0398     rcu_read_unlock();
0399     return nb_pkts;
0400 }
0401 EXPORT_SYMBOL(xsk_tx_peek_release_desc_batch);
0402 
0403 static int xsk_wakeup(struct xdp_sock *xs, u8 flags)
0404 {
0405     struct net_device *dev = xs->dev;
0406 
0407     return dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags);
0408 }
0409 
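/* skb destructor for the copy-mode Tx path: when the skb is freed, the
 * matching descriptor is completed on the completion queue so user space
 * can reuse the umem address.
 */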
0410 static void xsk_destruct_skb(struct sk_buff *skb)
0411 {
0412     u64 addr = (u64)(long)skb_shinfo(skb)->destructor_arg;
0413     struct xdp_sock *xs = xdp_sk(skb->sk);
0414     unsigned long flags;
0415 
0416     spin_lock_irqsave(&xs->pool->cq_lock, flags);
0417     xskq_prod_submit_addr(xs->pool->cq, addr);
0418     spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
0419 
0420     sock_wfree(skb);
0421 }
0422 
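/* For devices flagged IFF_TX_SKB_NO_LINEAR: build an skb whose fragments
 * point straight into the umem pages, avoiding the payload copy done by the
 * regular xsk_build_skb() path below.
 */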
0423 static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
0424                           struct xdp_desc *desc)
0425 {
0426     struct xsk_buff_pool *pool = xs->pool;
0427     u32 hr, len, ts, offset, copy, copied;
0428     struct sk_buff *skb;
0429     struct page *page;
0430     void *buffer;
0431     int err, i;
0432     u64 addr;
0433 
0434     hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(xs->dev->needed_headroom));
0435 
0436     skb = sock_alloc_send_skb(&xs->sk, hr, 1, &err);
0437     if (unlikely(!skb))
0438         return ERR_PTR(err);
0439 
0440     skb_reserve(skb, hr);
0441 
0442     addr = desc->addr;
0443     len = desc->len;
0444     ts = pool->unaligned ? len : pool->chunk_size;
0445 
0446     buffer = xsk_buff_raw_get_data(pool, addr);
0447     offset = offset_in_page(buffer);
0448     addr = buffer - pool->addrs;
0449 
0450     for (copied = 0, i = 0; copied < len; i++) {
0451         page = pool->umem->pgs[addr >> PAGE_SHIFT];
0452         get_page(page);
0453 
0454         copy = min_t(u32, PAGE_SIZE - offset, len - copied);
0455         skb_fill_page_desc(skb, i, page, offset, copy);
0456 
0457         copied += copy;
0458         addr += copy;
0459         offset = 0;
0460     }
0461 
0462     skb->len += len;
0463     skb->data_len += len;
0464     skb->truesize += ts;
0465 
0466     refcount_add(ts, &xs->sk.sk_wmem_alloc);
0467 
0468     return skb;
0469 }
0470 
0471 static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
0472                      struct xdp_desc *desc)
0473 {
0474     struct net_device *dev = xs->dev;
0475     struct sk_buff *skb;
0476 
0477     if (dev->priv_flags & IFF_TX_SKB_NO_LINEAR) {
0478         skb = xsk_build_skb_zerocopy(xs, desc);
0479         if (IS_ERR(skb))
0480             return skb;
0481     } else {
0482         u32 hr, tr, len;
0483         void *buffer;
0484         int err;
0485 
0486         hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(dev->needed_headroom));
0487         tr = dev->needed_tailroom;
0488         len = desc->len;
0489 
0490         skb = sock_alloc_send_skb(&xs->sk, hr + len + tr, 1, &err);
0491         if (unlikely(!skb))
0492             return ERR_PTR(err);
0493 
0494         skb_reserve(skb, hr);
0495         skb_put(skb, len);
0496 
0497         buffer = xsk_buff_raw_get_data(xs->pool, desc->addr);
0498         err = skb_store_bits(skb, 0, buffer, len);
0499         if (unlikely(err)) {
0500             kfree_skb(skb);
0501             return ERR_PTR(err);
0502         }
0503     }
0504 
0505     skb->dev = dev;
0506     skb->priority = xs->sk.sk_priority;
0507     skb->mark = xs->sk.sk_mark;
0508     skb_shinfo(skb)->destructor_arg = (void *)(long)desc->addr;
0509     skb->destructor = xsk_destruct_skb;
0510 
0511     return skb;
0512 }
0513 
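/* Copy-mode (SKB) Tx: convert up to TX_BATCH_SIZE descriptors from the Tx
 * ring into skbs and hand them to the driver via __dev_direct_xmit().
 * Completion queue space is reserved up front as backpressure.
 */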
0514 static int xsk_generic_xmit(struct sock *sk)
0515 {
0516     struct xdp_sock *xs = xdp_sk(sk);
0517     u32 max_batch = TX_BATCH_SIZE;
0518     bool sent_frame = false;
0519     struct xdp_desc desc;
0520     struct sk_buff *skb;
0521     unsigned long flags;
0522     int err = 0;
0523 
0524     mutex_lock(&xs->mutex);
0525 
0526     /* Since we dropped the RCU read lock, the socket state might have changed. */
0527     if (unlikely(!xsk_is_bound(xs))) {
0528         err = -ENXIO;
0529         goto out;
0530     }
0531 
0532     if (xs->queue_id >= xs->dev->real_num_tx_queues)
0533         goto out;
0534 
0535     while (xskq_cons_peek_desc(xs->tx, &desc, xs->pool)) {
0536         if (max_batch-- == 0) {
0537             err = -EAGAIN;
0538             goto out;
0539         }
0540 
0541         /* This is the backpressure mechanism for the Tx path.
0542          * Reserve space in the completion queue and only proceed
0543          * if there is space in it. This avoids having to implement
0544          * any buffering in the Tx path.
0545          */
0546         spin_lock_irqsave(&xs->pool->cq_lock, flags);
0547         if (xskq_prod_reserve(xs->pool->cq)) {
0548             spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
0549             goto out;
0550         }
0551         spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
0552 
0553         skb = xsk_build_skb(xs, &desc);
0554         if (IS_ERR(skb)) {
0555             err = PTR_ERR(skb);
0556             spin_lock_irqsave(&xs->pool->cq_lock, flags);
0557             xskq_prod_cancel(xs->pool->cq);
0558             spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
0559             goto out;
0560         }
0561 
0562         err = __dev_direct_xmit(skb, xs->queue_id);
0563         if (err == NETDEV_TX_BUSY) {
0564             /* Tell user-space to retry the send */
0565             skb->destructor = sock_wfree;
0566             spin_lock_irqsave(&xs->pool->cq_lock, flags);
0567             xskq_prod_cancel(xs->pool->cq);
0568             spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
0569             /* Free skb without triggering the perf drop trace */
0570             consume_skb(skb);
0571             err = -EAGAIN;
0572             goto out;
0573         }
0574 
0575         xskq_cons_release(xs->tx);
0576         /* Ignore NET_XMIT_CN as packet might have been sent */
0577         if (err == NET_XMIT_DROP) {
0578             /* SKB completed but not sent */
0579             err = -EBUSY;
0580             goto out;
0581         }
0582 
0583         sent_frame = true;
0584     }
0585 
0586     xs->tx->queue_empty_descs++;
0587 
0588 out:
0589     if (sent_frame)
0590         if (xsk_tx_writeable(xs))
0591             sk->sk_write_space(sk);
0592 
0593     mutex_unlock(&xs->mutex);
0594     return err;
0595 }
0596 
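/* Common Tx kick: zero-copy sockets wake the driver, copy-mode sockets run
 * the generic SKB path. Called with the RCU read lock held; it is dropped
 * around the (possibly sleeping) SKB path.
 */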
0597 static int xsk_xmit(struct sock *sk)
0598 {
0599     struct xdp_sock *xs = xdp_sk(sk);
0600     int ret;
0601 
0602     if (unlikely(!(xs->dev->flags & IFF_UP)))
0603         return -ENETDOWN;
0604     if (unlikely(!xs->tx))
0605         return -ENOBUFS;
0606 
0607     if (xs->zc)
0608         return xsk_wakeup(xs, XDP_WAKEUP_TX);
0609 
0610     /* Drop the RCU lock since the SKB path might sleep. */
0611     rcu_read_unlock();
0612     ret = xsk_generic_xmit(sk);
0613     /* Reacquire RCU lock before going into common code. */
0614     rcu_read_lock();
0615 
0616     return ret;
0617 }
0618 
0619 static bool xsk_no_wakeup(struct sock *sk)
0620 {
0621 #ifdef CONFIG_NET_RX_BUSY_POLL
0622     /* Prefer busy-polling, skip the wakeup. */
0623     return READ_ONCE(sk->sk_prefer_busy_poll) && READ_ONCE(sk->sk_ll_usec) &&
0624         READ_ONCE(sk->sk_napi_id) >= MIN_NAPI_ID;
0625 #else
0626     return false;
0627 #endif
0628 }
0629 
0630 static int __xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
0631 {
0632     bool need_wait = !(m->msg_flags & MSG_DONTWAIT);
0633     struct sock *sk = sock->sk;
0634     struct xdp_sock *xs = xdp_sk(sk);
0635     struct xsk_buff_pool *pool;
0636 
0637     if (unlikely(!xsk_is_bound(xs)))
0638         return -ENXIO;
0639     if (unlikely(need_wait))
0640         return -EOPNOTSUPP;
0641 
0642     if (sk_can_busy_loop(sk)) {
0643         if (xs->zc)
0644             __sk_mark_napi_id_once(sk, xsk_pool_get_napi_id(xs->pool));
0645         sk_busy_loop(sk, 1); /* only support non-blocking sockets */
0646     }
0647 
0648     if (xs->zc && xsk_no_wakeup(sk))
0649         return 0;
0650 
0651     pool = xs->pool;
0652     if (pool->cached_need_wakeup & XDP_WAKEUP_TX)
0653         return xsk_xmit(sk);
0654     return 0;
0655 }
0656 
0657 static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
0658 {
0659     int ret;
0660 
0661     rcu_read_lock();
0662     ret = __xsk_sendmsg(sock, m, total_len);
0663     rcu_read_unlock();
0664 
0665     return ret;
0666 }
0667 
0668 static int __xsk_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int flags)
0669 {
0670     bool need_wait = !(flags & MSG_DONTWAIT);
0671     struct sock *sk = sock->sk;
0672     struct xdp_sock *xs = xdp_sk(sk);
0673 
0674     if (unlikely(!xsk_is_bound(xs)))
0675         return -ENXIO;
0676     if (unlikely(!(xs->dev->flags & IFF_UP)))
0677         return -ENETDOWN;
0678     if (unlikely(!xs->rx))
0679         return -ENOBUFS;
0680     if (unlikely(need_wait))
0681         return -EOPNOTSUPP;
0682 
0683     if (sk_can_busy_loop(sk))
0684         sk_busy_loop(sk, 1); /* only support non-blocking sockets */
0685 
0686     if (xsk_no_wakeup(sk))
0687         return 0;
0688 
0689     if (xs->pool->cached_need_wakeup & XDP_WAKEUP_RX && xs->zc)
0690         return xsk_wakeup(xs, XDP_WAKEUP_RX);
0691     return 0;
0692 }
0693 
0694 static int xsk_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int flags)
0695 {
0696     int ret;
0697 
0698     rcu_read_lock();
0699     ret = __xsk_recvmsg(sock, m, len, flags);
0700     rcu_read_unlock();
0701 
0702     return ret;
0703 }
0704 
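/* poll() reports readiness of the Rx/Tx rings and, when the need_wakeup
 * mechanism is armed, also kicks the driver (or the copy-mode Tx path) so
 * that processing makes progress.
 */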
0705 static __poll_t xsk_poll(struct file *file, struct socket *sock,
0706                  struct poll_table_struct *wait)
0707 {
0708     __poll_t mask = 0;
0709     struct sock *sk = sock->sk;
0710     struct xdp_sock *xs = xdp_sk(sk);
0711     struct xsk_buff_pool *pool;
0712 
0713     sock_poll_wait(file, sock, wait);
0714 
0715     rcu_read_lock();
0716     if (unlikely(!xsk_is_bound(xs))) {
0717         rcu_read_unlock();
0718         return mask;
0719     }
0720 
0721     pool = xs->pool;
0722 
0723     if (pool->cached_need_wakeup) {
0724         if (xs->zc)
0725             xsk_wakeup(xs, pool->cached_need_wakeup);
0726         else
0727             /* Poll needs to drive Tx also in copy mode */
0728             xsk_xmit(sk);
0729     }
0730 
0731     if (xs->rx && !xskq_prod_is_empty(xs->rx))
0732         mask |= EPOLLIN | EPOLLRDNORM;
0733     if (xs->tx && xsk_tx_writeable(xs))
0734         mask |= EPOLLOUT | EPOLLWRNORM;
0735 
0736     rcu_read_unlock();
0737     return mask;
0738 }
0739 
0740 static int xsk_init_queue(u32 entries, struct xsk_queue **queue,
0741               bool umem_queue)
0742 {
0743     struct xsk_queue *q;
0744 
0745     if (entries == 0 || *queue || !is_power_of_2(entries))
0746         return -EINVAL;
0747 
0748     q = xskq_create(entries, umem_queue);
0749     if (!q)
0750         return -ENOMEM;
0751 
0752     /* Make sure queue is ready before it can be seen by others */
0753     smp_wmb();
0754     WRITE_ONCE(*queue, q);
0755     return 0;
0756 }
0757 
0758 static void xsk_unbind_dev(struct xdp_sock *xs)
0759 {
0760     struct net_device *dev = xs->dev;
0761 
0762     if (xs->state != XSK_BOUND)
0763         return;
0764     WRITE_ONCE(xs->state, XSK_UNBOUND);
0765 
0766     /* Wait for driver to stop using the xdp socket. */
0767     xp_del_xsk(xs->pool, xs);
0768     synchronize_net();
0769     dev_put(dev);
0770 }
0771 
0772 static struct xsk_map *xsk_get_map_list_entry(struct xdp_sock *xs,
0773                           struct xdp_sock __rcu ***map_entry)
0774 {
0775     struct xsk_map *map = NULL;
0776     struct xsk_map_node *node;
0777 
0778     *map_entry = NULL;
0779 
0780     spin_lock_bh(&xs->map_list_lock);
0781     node = list_first_entry_or_null(&xs->map_list, struct xsk_map_node,
0782                     node);
0783     if (node) {
0784         bpf_map_inc(&node->map->map);
0785         map = node->map;
0786         *map_entry = node->map_entry;
0787     }
0788     spin_unlock_bh(&xs->map_list_lock);
0789     return map;
0790 }
0791 
0792 static void xsk_delete_from_maps(struct xdp_sock *xs)
0793 {
0794     /* This function removes the current XDP socket from all the
0795      * maps it resides in. We need to take extra care here, due to
0796      * the two locks involved. Each map has a lock synchronizing
0797      * updates to the entries, and each socket has a lock that
0798      * synchronizes access to the list of maps (map_list). For
0799      * deadlock avoidance the locks need to be taken in the order
0800      * "map lock"->"socket map list lock". We start off by
0801      * accessing the socket map list, and take a reference to the
0802      * map to guarantee existence between the
0803      * xsk_get_map_list_entry() and xsk_map_try_sock_delete()
0804      * calls. Then we ask the map to remove the socket, which
0805      * tries to remove the socket from the map. Note that there
0806      * might be updates to the map between
0807      * xsk_get_map_list_entry() and xsk_map_try_sock_delete().
0808      */
0809     struct xdp_sock __rcu **map_entry = NULL;
0810     struct xsk_map *map;
0811 
0812     while ((map = xsk_get_map_list_entry(xs, &map_entry))) {
0813         xsk_map_try_sock_delete(map, xs, map_entry);
0814         bpf_map_put(&map->map);
0815     }
0816 }
0817 
0818 static int xsk_release(struct socket *sock)
0819 {
0820     struct sock *sk = sock->sk;
0821     struct xdp_sock *xs = xdp_sk(sk);
0822     struct net *net;
0823 
0824     if (!sk)
0825         return 0;
0826 
0827     net = sock_net(sk);
0828 
0829     mutex_lock(&net->xdp.lock);
0830     sk_del_node_init_rcu(sk);
0831     mutex_unlock(&net->xdp.lock);
0832 
0833     sock_prot_inuse_add(net, sk->sk_prot, -1);
0834 
0835     xsk_delete_from_maps(xs);
0836     mutex_lock(&xs->mutex);
0837     xsk_unbind_dev(xs);
0838     mutex_unlock(&xs->mutex);
0839 
0840     xskq_destroy(xs->rx);
0841     xskq_destroy(xs->tx);
0842     xskq_destroy(xs->fq_tmp);
0843     xskq_destroy(xs->cq_tmp);
0844 
0845     sock_orphan(sk);
0846     sock->sk = NULL;
0847 
0848     sk_refcnt_debug_release(sk);
0849     sock_put(sk);
0850 
0851     return 0;
0852 }
0853 
0854 static struct socket *xsk_lookup_xsk_from_fd(int fd)
0855 {
0856     struct socket *sock;
0857     int err;
0858 
0859     sock = sockfd_lookup(fd, &err);
0860     if (!sock)
0861         return ERR_PTR(-ENOTSOCK);
0862 
0863     if (sock->sk->sk_family != PF_XDP) {
0864         sockfd_put(sock);
0865         return ERR_PTR(-ENOPROTOOPT);
0866     }
0867 
0868     return sock;
0869 }
0870 
0871 static bool xsk_validate_queues(struct xdp_sock *xs)
0872 {
0873     return xs->fq_tmp && xs->cq_tmp;
0874 }
0875 
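/* bind() attaches the socket and its umem/buffer pool to a <netdev, queue>
 * pair. A rough sketch of the expected user-space sequence: create the
 * socket with socket(AF_XDP, SOCK_RAW, 0), register the umem and rings via
 * setsockopt(), mmap() the rings, then bind() to the interface and queue.
 * With XDP_SHARED_UMEM, the umem (or the whole buffer pool) of another
 * AF_XDP socket is reused instead of creating a new one.
 */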
0876 static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
0877 {
0878     struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
0879     struct sock *sk = sock->sk;
0880     struct xdp_sock *xs = xdp_sk(sk);
0881     struct net_device *dev;
0882     u32 flags, qid;
0883     int err = 0;
0884 
0885     if (addr_len < sizeof(struct sockaddr_xdp))
0886         return -EINVAL;
0887     if (sxdp->sxdp_family != AF_XDP)
0888         return -EINVAL;
0889 
0890     flags = sxdp->sxdp_flags;
0891     if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY |
0892               XDP_USE_NEED_WAKEUP))
0893         return -EINVAL;
0894 
0895     rtnl_lock();
0896     mutex_lock(&xs->mutex);
0897     if (xs->state != XSK_READY) {
0898         err = -EBUSY;
0899         goto out_release;
0900     }
0901 
0902     dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex);
0903     if (!dev) {
0904         err = -ENODEV;
0905         goto out_release;
0906     }
0907 
0908     if (!xs->rx && !xs->tx) {
0909         err = -EINVAL;
0910         goto out_unlock;
0911     }
0912 
0913     qid = sxdp->sxdp_queue_id;
0914 
0915     if (flags & XDP_SHARED_UMEM) {
0916         struct xdp_sock *umem_xs;
0917         struct socket *sock;
0918 
0919         if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY) ||
0920             (flags & XDP_USE_NEED_WAKEUP)) {
0921             /* Cannot specify flags for shared sockets. */
0922             err = -EINVAL;
0923             goto out_unlock;
0924         }
0925 
0926         if (xs->umem) {
0927             /* We already have our own umem. */
0928             err = -EINVAL;
0929             goto out_unlock;
0930         }
0931 
0932         sock = xsk_lookup_xsk_from_fd(sxdp->sxdp_shared_umem_fd);
0933         if (IS_ERR(sock)) {
0934             err = PTR_ERR(sock);
0935             goto out_unlock;
0936         }
0937 
0938         umem_xs = xdp_sk(sock->sk);
0939         if (!xsk_is_bound(umem_xs)) {
0940             err = -EBADF;
0941             sockfd_put(sock);
0942             goto out_unlock;
0943         }
0944 
0945         if (umem_xs->queue_id != qid || umem_xs->dev != dev) {
0946             /* Share the umem with another socket on another qid
0947              * and/or device.
0948              */
0949             xs->pool = xp_create_and_assign_umem(xs,
0950                                  umem_xs->umem);
0951             if (!xs->pool) {
0952                 err = -ENOMEM;
0953                 sockfd_put(sock);
0954                 goto out_unlock;
0955             }
0956 
0957             err = xp_assign_dev_shared(xs->pool, umem_xs, dev,
0958                            qid);
0959             if (err) {
0960                 xp_destroy(xs->pool);
0961                 xs->pool = NULL;
0962                 sockfd_put(sock);
0963                 goto out_unlock;
0964             }
0965         } else {
0966             /* Share the buffer pool with the other socket. */
0967             if (xs->fq_tmp || xs->cq_tmp) {
0968                 /* Do not allow setting your own fq or cq. */
0969                 err = -EINVAL;
0970                 sockfd_put(sock);
0971                 goto out_unlock;
0972             }
0973 
0974             xp_get_pool(umem_xs->pool);
0975             xs->pool = umem_xs->pool;
0976 
0977             /* If the underlying shared umem was created without a Tx
0978              * ring, allocate the Tx descriptor array that the Tx
0979              * batching API uses.
0980              */
0981             if (xs->tx && !xs->pool->tx_descs) {
0982                 err = xp_alloc_tx_descs(xs->pool, xs);
0983                 if (err) {
0984                     xp_put_pool(xs->pool);
0985                     sockfd_put(sock);
0986                     goto out_unlock;
0987                 }
0988             }
0989         }
0990 
0991         xdp_get_umem(umem_xs->umem);
0992         WRITE_ONCE(xs->umem, umem_xs->umem);
0993         sockfd_put(sock);
0994     } else if (!xs->umem || !xsk_validate_queues(xs)) {
0995         err = -EINVAL;
0996         goto out_unlock;
0997     } else {
0998         /* This xsk has its own umem. */
0999         xs->pool = xp_create_and_assign_umem(xs, xs->umem);
1000         if (!xs->pool) {
1001             err = -ENOMEM;
1002             goto out_unlock;
1003         }
1004 
1005         err = xp_assign_dev(xs->pool, dev, qid, flags);
1006         if (err) {
1007             xp_destroy(xs->pool);
1008             xs->pool = NULL;
1009             goto out_unlock;
1010         }
1011     }
1012 
1013     /* FQ and CQ are now owned by the buffer pool and cleaned up with it. */
1014     xs->fq_tmp = NULL;
1015     xs->cq_tmp = NULL;
1016 
1017     xs->dev = dev;
1018     xs->zc = xs->umem->zc;
1019     xs->queue_id = qid;
1020     xp_add_xsk(xs->pool, xs);
1021 
1022 out_unlock:
1023     if (err) {
1024         dev_put(dev);
1025     } else {
1026         /* Matches smp_rmb() in bind() for shared umem
1027          * sockets, and xsk_is_bound().
1028          */
1029         smp_wmb();
1030         WRITE_ONCE(xs->state, XSK_BOUND);
1031     }
1032 out_release:
1033     mutex_unlock(&xs->mutex);
1034     rtnl_unlock();
1035     return err;
1036 }
1037 
1038 struct xdp_umem_reg_v1 {
1039     __u64 addr; /* Start of packet data area */
1040     __u64 len; /* Length of packet data area */
1041     __u32 chunk_size;
1042     __u32 headroom;
1043 };
1044 
1045 static int xsk_setsockopt(struct socket *sock, int level, int optname,
1046               sockptr_t optval, unsigned int optlen)
1047 {
1048     struct sock *sk = sock->sk;
1049     struct xdp_sock *xs = xdp_sk(sk);
1050     int err;
1051 
1052     if (level != SOL_XDP)
1053         return -ENOPROTOOPT;
1054 
1055     switch (optname) {
1056     case XDP_RX_RING:
1057     case XDP_TX_RING:
1058     {
1059         struct xsk_queue **q;
1060         int entries;
1061 
1062         if (optlen < sizeof(entries))
1063             return -EINVAL;
1064         if (copy_from_sockptr(&entries, optval, sizeof(entries)))
1065             return -EFAULT;
1066 
1067         mutex_lock(&xs->mutex);
1068         if (xs->state != XSK_READY) {
1069             mutex_unlock(&xs->mutex);
1070             return -EBUSY;
1071         }
1072         q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx;
1073         err = xsk_init_queue(entries, q, false);
1074         if (!err && optname == XDP_TX_RING)
1075             /* Tx needs to be explicitly woken up the first time */
1076             xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
1077         mutex_unlock(&xs->mutex);
1078         return err;
1079     }
1080     case XDP_UMEM_REG:
1081     {
1082         size_t mr_size = sizeof(struct xdp_umem_reg);
1083         struct xdp_umem_reg mr = {};
1084         struct xdp_umem *umem;
1085 
1086         if (optlen < sizeof(struct xdp_umem_reg_v1))
1087             return -EINVAL;
1088         else if (optlen < sizeof(mr))
1089             mr_size = sizeof(struct xdp_umem_reg_v1);
1090 
1091         if (copy_from_sockptr(&mr, optval, mr_size))
1092             return -EFAULT;
1093 
1094         mutex_lock(&xs->mutex);
1095         if (xs->state != XSK_READY || xs->umem) {
1096             mutex_unlock(&xs->mutex);
1097             return -EBUSY;
1098         }
1099 
1100         umem = xdp_umem_create(&mr);
1101         if (IS_ERR(umem)) {
1102             mutex_unlock(&xs->mutex);
1103             return PTR_ERR(umem);
1104         }
1105 
1106         /* Make sure umem is ready before it can be seen by others */
1107         smp_wmb();
1108         WRITE_ONCE(xs->umem, umem);
1109         mutex_unlock(&xs->mutex);
1110         return 0;
1111     }
1112     case XDP_UMEM_FILL_RING:
1113     case XDP_UMEM_COMPLETION_RING:
1114     {
1115         struct xsk_queue **q;
1116         int entries;
1117 
1118         if (copy_from_sockptr(&entries, optval, sizeof(entries)))
1119             return -EFAULT;
1120 
1121         mutex_lock(&xs->mutex);
1122         if (xs->state != XSK_READY) {
1123             mutex_unlock(&xs->mutex);
1124             return -EBUSY;
1125         }
1126 
1127         q = (optname == XDP_UMEM_FILL_RING) ? &xs->fq_tmp :
1128             &xs->cq_tmp;
1129         err = xsk_init_queue(entries, q, true);
1130         mutex_unlock(&xs->mutex);
1131         return err;
1132     }
1133     default:
1134         break;
1135     }
1136 
1137     return -ENOPROTOOPT;
1138 }
1139 
1140 static void xsk_enter_rxtx_offsets(struct xdp_ring_offset_v1 *ring)
1141 {
1142     ring->producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
1143     ring->consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
1144     ring->desc = offsetof(struct xdp_rxtx_ring, desc);
1145 }
1146 
1147 static void xsk_enter_umem_offsets(struct xdp_ring_offset_v1 *ring)
1148 {
1149     ring->producer = offsetof(struct xdp_umem_ring, ptrs.producer);
1150     ring->consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
1151     ring->desc = offsetof(struct xdp_umem_ring, desc);
1152 }
1153 
1154 struct xdp_statistics_v1 {
1155     __u64 rx_dropped;
1156     __u64 rx_invalid_descs;
1157     __u64 tx_invalid_descs;
1158 };
1159 
1160 static int xsk_getsockopt(struct socket *sock, int level, int optname,
1161               char __user *optval, int __user *optlen)
1162 {
1163     struct sock *sk = sock->sk;
1164     struct xdp_sock *xs = xdp_sk(sk);
1165     int len;
1166 
1167     if (level != SOL_XDP)
1168         return -ENOPROTOOPT;
1169 
1170     if (get_user(len, optlen))
1171         return -EFAULT;
1172     if (len < 0)
1173         return -EINVAL;
1174 
1175     switch (optname) {
1176     case XDP_STATISTICS:
1177     {
1178         struct xdp_statistics stats = {};
1179         bool extra_stats = true;
1180         size_t stats_size;
1181 
1182         if (len < sizeof(struct xdp_statistics_v1)) {
1183             return -EINVAL;
1184         } else if (len < sizeof(stats)) {
1185             extra_stats = false;
1186             stats_size = sizeof(struct xdp_statistics_v1);
1187         } else {
1188             stats_size = sizeof(stats);
1189         }
1190 
1191         mutex_lock(&xs->mutex);
1192         stats.rx_dropped = xs->rx_dropped;
1193         if (extra_stats) {
1194             stats.rx_ring_full = xs->rx_queue_full;
1195             stats.rx_fill_ring_empty_descs =
1196                 xs->pool ? xskq_nb_queue_empty_descs(xs->pool->fq) : 0;
1197             stats.tx_ring_empty_descs = xskq_nb_queue_empty_descs(xs->tx);
1198         } else {
1199             stats.rx_dropped += xs->rx_queue_full;
1200         }
1201         stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx);
1202         stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx);
1203         mutex_unlock(&xs->mutex);
1204 
1205         if (copy_to_user(optval, &stats, stats_size))
1206             return -EFAULT;
1207         if (put_user(stats_size, optlen))
1208             return -EFAULT;
1209 
1210         return 0;
1211     }
1212     case XDP_MMAP_OFFSETS:
1213     {
1214         struct xdp_mmap_offsets off;
1215         struct xdp_mmap_offsets_v1 off_v1;
1216         bool flags_supported = true;
1217         void *to_copy;
1218 
1219         if (len < sizeof(off_v1))
1220             return -EINVAL;
1221         else if (len < sizeof(off))
1222             flags_supported = false;
1223 
1224         if (flags_supported) {
1225             /* xdp_ring_offset is identical to xdp_ring_offset_v1
1226              * except for the flags field added to the end.
1227              */
1228             xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
1229                            &off.rx);
1230             xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
1231                            &off.tx);
1232             xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
1233                            &off.fr);
1234             xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
1235                            &off.cr);
1236             off.rx.flags = offsetof(struct xdp_rxtx_ring,
1237                         ptrs.flags);
1238             off.tx.flags = offsetof(struct xdp_rxtx_ring,
1239                         ptrs.flags);
1240             off.fr.flags = offsetof(struct xdp_umem_ring,
1241                         ptrs.flags);
1242             off.cr.flags = offsetof(struct xdp_umem_ring,
1243                         ptrs.flags);
1244 
1245             len = sizeof(off);
1246             to_copy = &off;
1247         } else {
1248             xsk_enter_rxtx_offsets(&off_v1.rx);
1249             xsk_enter_rxtx_offsets(&off_v1.tx);
1250             xsk_enter_umem_offsets(&off_v1.fr);
1251             xsk_enter_umem_offsets(&off_v1.cr);
1252 
1253             len = sizeof(off_v1);
1254             to_copy = &off_v1;
1255         }
1256 
1257         if (copy_to_user(optval, to_copy, len))
1258             return -EFAULT;
1259         if (put_user(len, optlen))
1260             return -EFAULT;
1261 
1262         return 0;
1263     }
1264     case XDP_OPTIONS:
1265     {
1266         struct xdp_options opts = {};
1267 
1268         if (len < sizeof(opts))
1269             return -EINVAL;
1270 
1271         mutex_lock(&xs->mutex);
1272         if (xs->zc)
1273             opts.flags |= XDP_OPTIONS_ZEROCOPY;
1274         mutex_unlock(&xs->mutex);
1275 
1276         len = sizeof(opts);
1277         if (copy_to_user(optval, &opts, len))
1278             return -EFAULT;
1279         if (put_user(len, optlen))
1280             return -EFAULT;
1281 
1282         return 0;
1283     }
1284     default:
1285         break;
1286     }
1287 
1288     return -EOPNOTSUPP;
1289 }
1290 
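/* mmap() maps one of the four rings into user space; the ring is selected
 * by the page offset (XDP_PGOFF_RX_RING, XDP_PGOFF_TX_RING,
 * XDP_UMEM_PGOFF_FILL_RING or XDP_UMEM_PGOFF_COMPLETION_RING).
 */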
1291 static int xsk_mmap(struct file *file, struct socket *sock,
1292             struct vm_area_struct *vma)
1293 {
1294     loff_t offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
1295     unsigned long size = vma->vm_end - vma->vm_start;
1296     struct xdp_sock *xs = xdp_sk(sock->sk);
1297     struct xsk_queue *q = NULL;
1298     unsigned long pfn;
1299     struct page *qpg;
1300 
1301     if (READ_ONCE(xs->state) != XSK_READY)
1302         return -EBUSY;
1303 
1304     if (offset == XDP_PGOFF_RX_RING) {
1305         q = READ_ONCE(xs->rx);
1306     } else if (offset == XDP_PGOFF_TX_RING) {
1307         q = READ_ONCE(xs->tx);
1308     } else {
1309         /* Matches the smp_wmb() in XDP_UMEM_REG */
1310         smp_rmb();
1311         if (offset == XDP_UMEM_PGOFF_FILL_RING)
1312             q = READ_ONCE(xs->fq_tmp);
1313         else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING)
1314             q = READ_ONCE(xs->cq_tmp);
1315     }
1316 
1317     if (!q)
1318         return -EINVAL;
1319 
1320     /* Matches the smp_wmb() in xsk_init_queue */
1321     smp_rmb();
1322     qpg = virt_to_head_page(q->ring);
1323     if (size > page_size(qpg))
1324         return -EINVAL;
1325 
1326     pfn = virt_to_phys(q->ring) >> PAGE_SHIFT;
1327     return remap_pfn_range(vma, vma->vm_start, pfn,
1328                    size, vma->vm_page_prot);
1329 }
1330 
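/* Netdevice notifier: on NETDEV_UNREGISTER, report ENETDOWN, unbind every
 * socket bound to the disappearing device and drop the pool's device
 * references.
 */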
1331 static int xsk_notifier(struct notifier_block *this,
1332             unsigned long msg, void *ptr)
1333 {
1334     struct net_device *dev = netdev_notifier_info_to_dev(ptr);
1335     struct net *net = dev_net(dev);
1336     struct sock *sk;
1337 
1338     switch (msg) {
1339     case NETDEV_UNREGISTER:
1340         mutex_lock(&net->xdp.lock);
1341         sk_for_each(sk, &net->xdp.list) {
1342             struct xdp_sock *xs = xdp_sk(sk);
1343 
1344             mutex_lock(&xs->mutex);
1345             if (xs->dev == dev) {
1346                 sk->sk_err = ENETDOWN;
1347                 if (!sock_flag(sk, SOCK_DEAD))
1348                     sk_error_report(sk);
1349 
1350                 xsk_unbind_dev(xs);
1351 
1352                 /* Clear device references. */
1353                 xp_clear_dev(xs->pool);
1354             }
1355             mutex_unlock(&xs->mutex);
1356         }
1357         mutex_unlock(&net->xdp.lock);
1358         break;
1359     }
1360     return NOTIFY_DONE;
1361 }
1362 
1363 static struct proto xsk_proto = {
1364     .name =     "XDP",
1365     .owner =    THIS_MODULE,
1366     .obj_size = sizeof(struct xdp_sock),
1367 };
1368 
1369 static const struct proto_ops xsk_proto_ops = {
1370     .family     = PF_XDP,
1371     .owner      = THIS_MODULE,
1372     .release    = xsk_release,
1373     .bind       = xsk_bind,
1374     .connect    = sock_no_connect,
1375     .socketpair = sock_no_socketpair,
1376     .accept     = sock_no_accept,
1377     .getname    = sock_no_getname,
1378     .poll       = xsk_poll,
1379     .ioctl      = sock_no_ioctl,
1380     .listen     = sock_no_listen,
1381     .shutdown   = sock_no_shutdown,
1382     .setsockopt = xsk_setsockopt,
1383     .getsockopt = xsk_getsockopt,
1384     .sendmsg    = xsk_sendmsg,
1385     .recvmsg    = xsk_recvmsg,
1386     .mmap       = xsk_mmap,
1387     .sendpage   = sock_no_sendpage,
1388 };
1389 
1390 static void xsk_destruct(struct sock *sk)
1391 {
1392     struct xdp_sock *xs = xdp_sk(sk);
1393 
1394     if (!sock_flag(sk, SOCK_DEAD))
1395         return;
1396 
1397     if (!xp_put_pool(xs->pool))
1398         xdp_put_umem(xs->umem, !xs->pool);
1399 
1400     sk_refcnt_debug_dec(sk);
1401 }
1402 
1403 static int xsk_create(struct net *net, struct socket *sock, int protocol,
1404               int kern)
1405 {
1406     struct xdp_sock *xs;
1407     struct sock *sk;
1408 
1409     if (!ns_capable(net->user_ns, CAP_NET_RAW))
1410         return -EPERM;
1411     if (sock->type != SOCK_RAW)
1412         return -ESOCKTNOSUPPORT;
1413 
1414     if (protocol)
1415         return -EPROTONOSUPPORT;
1416 
1417     sock->state = SS_UNCONNECTED;
1418 
1419     sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern);
1420     if (!sk)
1421         return -ENOBUFS;
1422 
1423     sock->ops = &xsk_proto_ops;
1424 
1425     sock_init_data(sock, sk);
1426 
1427     sk->sk_family = PF_XDP;
1428 
1429     sk->sk_destruct = xsk_destruct;
1430     sk_refcnt_debug_inc(sk);
1431 
1432     sock_set_flag(sk, SOCK_RCU_FREE);
1433 
1434     xs = xdp_sk(sk);
1435     xs->state = XSK_READY;
1436     mutex_init(&xs->mutex);
1437     spin_lock_init(&xs->rx_lock);
1438 
1439     INIT_LIST_HEAD(&xs->map_list);
1440     spin_lock_init(&xs->map_list_lock);
1441 
1442     mutex_lock(&net->xdp.lock);
1443     sk_add_node_rcu(sk, &net->xdp.list);
1444     mutex_unlock(&net->xdp.lock);
1445 
1446     sock_prot_inuse_add(net, &xsk_proto, 1);
1447 
1448     return 0;
1449 }
1450 
1451 static const struct net_proto_family xsk_family_ops = {
1452     .family = PF_XDP,
1453     .create = xsk_create,
1454     .owner  = THIS_MODULE,
1455 };
1456 
1457 static struct notifier_block xsk_netdev_notifier = {
1458     .notifier_call  = xsk_notifier,
1459 };
1460 
1461 static int __net_init xsk_net_init(struct net *net)
1462 {
1463     mutex_init(&net->xdp.lock);
1464     INIT_HLIST_HEAD(&net->xdp.list);
1465     return 0;
1466 }
1467 
1468 static void __net_exit xsk_net_exit(struct net *net)
1469 {
1470     WARN_ON_ONCE(!hlist_empty(&net->xdp.list));
1471 }
1472 
1473 static struct pernet_operations xsk_net_ops = {
1474     .init = xsk_net_init,
1475     .exit = xsk_net_exit,
1476 };
1477 
1478 static int __init xsk_init(void)
1479 {
1480     int err, cpu;
1481 
1482     err = proto_register(&xsk_proto, 0 /* no slab */);
1483     if (err)
1484         goto out;
1485 
1486     err = sock_register(&xsk_family_ops);
1487     if (err)
1488         goto out_proto;
1489 
1490     err = register_pernet_subsys(&xsk_net_ops);
1491     if (err)
1492         goto out_sk;
1493 
1494     err = register_netdevice_notifier(&xsk_netdev_notifier);
1495     if (err)
1496         goto out_pernet;
1497 
1498     for_each_possible_cpu(cpu)
1499         INIT_LIST_HEAD(&per_cpu(xskmap_flush_list, cpu));
1500     return 0;
1501 
1502 out_pernet:
1503     unregister_pernet_subsys(&xsk_net_ops);
1504 out_sk:
1505     sock_unregister(PF_XDP);
1506 out_proto:
1507     proto_unregister(&xsk_proto);
1508 out:
1509     return err;
1510 }
1511 
1512 fs_initcall(xsk_init);