0001 // SPDX-License-Identifier: GPL-2.0-only
0002 /*
0003  * Kernel Connection Multiplexor
0004  *
0005  * Copyright (c) 2016 Tom Herbert <tom@herbertland.com>
0006  */
0007 
0008 #include <linux/bpf.h>
0009 #include <linux/errno.h>
0010 #include <linux/errqueue.h>
0011 #include <linux/file.h>
0012 #include <linux/filter.h>
0013 #include <linux/in.h>
0014 #include <linux/kernel.h>
0015 #include <linux/module.h>
0016 #include <linux/net.h>
0017 #include <linux/netdevice.h>
0018 #include <linux/poll.h>
0019 #include <linux/rculist.h>
0020 #include <linux/skbuff.h>
0021 #include <linux/socket.h>
0022 #include <linux/uaccess.h>
0023 #include <linux/workqueue.h>
0024 #include <linux/syscalls.h>
0025 #include <linux/sched/signal.h>
0026 
0027 #include <net/kcm.h>
0028 #include <net/netns/generic.h>
0029 #include <net/sock.h>
0030 #include <uapi/linux/kcm.h>
0031 
0032 unsigned int kcm_net_id;
0033 
0034 static struct kmem_cache *kcm_psockp __read_mostly;
0035 static struct kmem_cache *kcm_muxp __read_mostly;
0036 static struct workqueue_struct *kcm_wq;
0037 
0038 static inline struct kcm_sock *kcm_sk(const struct sock *sk)
0039 {
0040     return (struct kcm_sock *)sk;
0041 }
0042 
0043 static inline struct kcm_tx_msg *kcm_tx_msg(struct sk_buff *skb)
0044 {
0045     return (struct kcm_tx_msg *)skb->cb;
0046 }
0047 
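     /* Report an error on the lower (connection) socket. Note that the err
      * argument is currently unused: the socket error is always set to EPIPE.
      */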
0048 static void report_csk_error(struct sock *csk, int err)
0049 {
0050     csk->sk_err = EPIPE;
0051     sk_error_report(csk);
0052 }
0053 
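     /* Abort transmit on a psock after an unrecoverable send error: mark it
      * tx_stopped, remove it from the available list (or schedule the owning
      * KCM socket's tx_work), and report the error on the lower socket.
      */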
0054 static void kcm_abort_tx_psock(struct kcm_psock *psock, int err,
0055                    bool wakeup_kcm)
0056 {
0057     struct sock *csk = psock->sk;
0058     struct kcm_mux *mux = psock->mux;
0059 
0060     /* Unrecoverable error in transmit */
0061 
0062     spin_lock_bh(&mux->lock);
0063 
0064     if (psock->tx_stopped) {
0065         spin_unlock_bh(&mux->lock);
0066         return;
0067     }
0068 
0069     psock->tx_stopped = 1;
0070     KCM_STATS_INCR(psock->stats.tx_aborts);
0071 
0072     if (!psock->tx_kcm) {
0073         /* Take off psocks_avail list */
0074         list_del(&psock->psock_avail_list);
0075     } else if (wakeup_kcm) {
0076         /* In this case psock is being aborted while outside of
0077          * write_msgs and psock is reserved. Schedule tx_work
0078          * to handle the failure there. Need to commit tx_stopped
0079          * before queuing work.
0080          */
0081         smp_mb();
0082 
0083         queue_work(kcm_wq, &psock->tx_kcm->tx_work);
0084     }
0085 
0086     spin_unlock_bh(&mux->lock);
0087 
0088     /* Report error on lower socket */
0089     report_csk_error(csk, err);
0090 }
0091 
0092 /* RX mux lock held. */
0093 static void kcm_update_rx_mux_stats(struct kcm_mux *mux,
0094                     struct kcm_psock *psock)
0095 {
0096     STRP_STATS_ADD(mux->stats.rx_bytes,
0097                psock->strp.stats.bytes -
0098                psock->saved_rx_bytes);
0099     mux->stats.rx_msgs +=
0100         psock->strp.stats.msgs - psock->saved_rx_msgs;
0101     psock->saved_rx_msgs = psock->strp.stats.msgs;
0102     psock->saved_rx_bytes = psock->strp.stats.bytes;
0103 }
0104 
0105 static void kcm_update_tx_mux_stats(struct kcm_mux *mux,
0106                     struct kcm_psock *psock)
0107 {
0108     KCM_STATS_ADD(mux->stats.tx_bytes,
0109               psock->stats.tx_bytes - psock->saved_tx_bytes);
0110     mux->stats.tx_msgs +=
0111         psock->stats.tx_msgs - psock->saved_tx_msgs;
0112     psock->saved_tx_msgs = psock->stats.tx_msgs;
0113     psock->saved_tx_bytes = psock->stats.tx_bytes;
0114 }
0115 
0116 static int kcm_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
0117 
0118 /* KCM is ready to receive messages on its queue-- either the KCM is new or
0119  * has become unblocked after being blocked on full socket buffer. Queue any
0120  * pending ready messages on a psock. RX mux lock held.
0121  */
0122 static void kcm_rcv_ready(struct kcm_sock *kcm)
0123 {
0124     struct kcm_mux *mux = kcm->mux;
0125     struct kcm_psock *psock;
0126     struct sk_buff *skb;
0127 
0128     if (unlikely(kcm->rx_wait || kcm->rx_psock || kcm->rx_disabled))
0129         return;
0130 
0131     while (unlikely((skb = __skb_dequeue(&mux->rx_hold_queue)))) {
0132         if (kcm_queue_rcv_skb(&kcm->sk, skb)) {
0133             /* Assuming buffer limit has been reached */
0134             skb_queue_head(&mux->rx_hold_queue, skb);
0135             WARN_ON(!sk_rmem_alloc_get(&kcm->sk));
0136             return;
0137         }
0138     }
0139 
0140     while (!list_empty(&mux->psocks_ready)) {
0141         psock = list_first_entry(&mux->psocks_ready, struct kcm_psock,
0142                      psock_ready_list);
0143 
0144         if (kcm_queue_rcv_skb(&kcm->sk, psock->ready_rx_msg)) {
0145             /* Assuming buffer limit has been reached */
0146             WARN_ON(!sk_rmem_alloc_get(&kcm->sk));
0147             return;
0148         }
0149 
0150         /* Consumed the ready message on the psock. Schedule rx_work to
0151          * get more messages.
0152          */
0153         list_del(&psock->psock_ready_list);
0154         psock->ready_rx_msg = NULL;
0155         /* Commit clearing of ready_rx_msg for queuing work */
0156         smp_mb();
0157 
0158         strp_unpause(&psock->strp);
0159         strp_check_rcv(&psock->strp);
0160     }
0161 
0162     /* Buffer limit is okay now, add to ready list */
0163     list_add_tail(&kcm->wait_rx_list,
0164               &kcm->mux->kcm_rx_waiters);
0165     kcm->rx_wait = true;
0166 }
0167 
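     /* skb destructor for messages on a KCM socket's receive queue. Uncharges
      * receive memory and, if the socket is neither waiting for nor reserved
      * on a psock and has drained below sk_rcvlowat, marks it ready to
      * receive again.
      */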
0168 static void kcm_rfree(struct sk_buff *skb)
0169 {
0170     struct sock *sk = skb->sk;
0171     struct kcm_sock *kcm = kcm_sk(sk);
0172     struct kcm_mux *mux = kcm->mux;
0173     unsigned int len = skb->truesize;
0174 
0175     sk_mem_uncharge(sk, len);
0176     atomic_sub(len, &sk->sk_rmem_alloc);
0177 
0178     /* For reading rx_wait and rx_psock without holding lock */
0179     smp_mb__after_atomic();
0180 
0181     if (!kcm->rx_wait && !kcm->rx_psock &&
0182         sk_rmem_alloc_get(sk) < sk->sk_rcvlowat) {
0183         spin_lock_bh(&mux->rx_lock);
0184         kcm_rcv_ready(kcm);
0185         spin_unlock_bh(&mux->rx_lock);
0186     }
0187 }
0188 
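     /* Charge an skb against a KCM socket's receive memory and queue it on
      * the receive queue. Returns -ENOMEM or -ENOBUFS if receive buffer or
      * memory accounting limits would be exceeded.
      */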
0189 static int kcm_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
0190 {
0191     struct sk_buff_head *list = &sk->sk_receive_queue;
0192 
0193     if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
0194         return -ENOMEM;
0195 
0196     if (!sk_rmem_schedule(sk, skb, skb->truesize))
0197         return -ENOBUFS;
0198 
0199     skb->dev = NULL;
0200 
0201     skb_orphan(skb);
0202     skb->sk = sk;
0203     skb->destructor = kcm_rfree;
0204     atomic_add(skb->truesize, &sk->sk_rmem_alloc);
0205     sk_mem_charge(sk, skb->truesize);
0206 
0207     skb_queue_tail(list, skb);
0208 
0209     if (!sock_flag(sk, SOCK_DEAD))
0210         sk->sk_data_ready(sk);
0211 
0212     return 0;
0213 }
0214 
0215 /* Requeue received messages for a kcm socket to other kcm sockets. This is
0216  * called when a kcm socket is receive disabled.
0217  * RX mux lock held.
0218  */
0219 static void requeue_rx_msgs(struct kcm_mux *mux, struct sk_buff_head *head)
0220 {
0221     struct sk_buff *skb;
0222     struct kcm_sock *kcm;
0223 
0224     while ((skb = __skb_dequeue(head))) {
0225         /* Reset destructor to avoid calling kcm_rcv_ready */
0226         skb->destructor = sock_rfree;
0227         skb_orphan(skb);
0228 try_again:
0229         if (list_empty(&mux->kcm_rx_waiters)) {
0230             skb_queue_tail(&mux->rx_hold_queue, skb);
0231             continue;
0232         }
0233 
0234         kcm = list_first_entry(&mux->kcm_rx_waiters,
0235                        struct kcm_sock, wait_rx_list);
0236 
0237         if (kcm_queue_rcv_skb(&kcm->sk, skb)) {
0238             /* Should mean socket buffer full */
0239             list_del(&kcm->wait_rx_list);
0240             kcm->rx_wait = false;
0241 
0242             /* Commit rx_wait so it is seen by kcm_rfree */
0243             smp_wmb();
0244 
0245             goto try_again;
0246         }
0247     }
0248 }
0249 
0250 /* Lower sock lock held */
0251 static struct kcm_sock *reserve_rx_kcm(struct kcm_psock *psock,
0252                        struct sk_buff *head)
0253 {
0254     struct kcm_mux *mux = psock->mux;
0255     struct kcm_sock *kcm;
0256 
0257     WARN_ON(psock->ready_rx_msg);
0258 
0259     if (psock->rx_kcm)
0260         return psock->rx_kcm;
0261 
0262     spin_lock_bh(&mux->rx_lock);
0263 
0264     if (psock->rx_kcm) {
0265         spin_unlock_bh(&mux->rx_lock);
0266         return psock->rx_kcm;
0267     }
0268 
0269     kcm_update_rx_mux_stats(mux, psock);
0270 
0271     if (list_empty(&mux->kcm_rx_waiters)) {
0272         psock->ready_rx_msg = head;
0273         strp_pause(&psock->strp);
0274         list_add_tail(&psock->psock_ready_list,
0275                   &mux->psocks_ready);
0276         spin_unlock_bh(&mux->rx_lock);
0277         return NULL;
0278     }
0279 
0280     kcm = list_first_entry(&mux->kcm_rx_waiters,
0281                    struct kcm_sock, wait_rx_list);
0282     list_del(&kcm->wait_rx_list);
0283     kcm->rx_wait = false;
0284 
0285     psock->rx_kcm = kcm;
0286     kcm->rx_psock = psock;
0287 
0288     spin_unlock_bh(&mux->rx_lock);
0289 
0290     return kcm;
0291 }
0292 
0293 static void kcm_done(struct kcm_sock *kcm);
0294 
0295 static void kcm_done_work(struct work_struct *w)
0296 {
0297     kcm_done(container_of(w, struct kcm_sock, done_work));
0298 }
0299 
0300 /* Lower sock held */
0301 static void unreserve_rx_kcm(struct kcm_psock *psock,
0302                  bool rcv_ready)
0303 {
0304     struct kcm_sock *kcm = psock->rx_kcm;
0305     struct kcm_mux *mux = psock->mux;
0306 
0307     if (!kcm)
0308         return;
0309 
0310     spin_lock_bh(&mux->rx_lock);
0311 
0312     psock->rx_kcm = NULL;
0313     kcm->rx_psock = NULL;
0314 
0315     /* Commit kcm->rx_psock before sk_rmem_alloc_get to sync with
0316      * kcm_rfree
0317      */
0318     smp_mb();
0319 
0320     if (unlikely(kcm->done)) {
0321         spin_unlock_bh(&mux->rx_lock);
0322 
0323         /* Need to run kcm_done in a task since we need to acquire
0324          * callback locks which may already be held here.
0325          */
0326         INIT_WORK(&kcm->done_work, kcm_done_work);
0327         schedule_work(&kcm->done_work);
0328         return;
0329     }
0330 
0331     if (unlikely(kcm->rx_disabled)) {
0332         requeue_rx_msgs(mux, &kcm->sk.sk_receive_queue);
0333     } else if (rcv_ready || unlikely(!sk_rmem_alloc_get(&kcm->sk))) {
0334         /* Check for the degenerate race with rx_wait where all
0335          * data was dequeued (accounted for in kcm_rfree).
0336          */
0337         kcm_rcv_ready(kcm);
0338     }
0339     spin_unlock_bh(&mux->rx_lock);
0340 }
0341 
0342 /* Lower sock lock held */
0343 static void psock_data_ready(struct sock *sk)
0344 {
0345     struct kcm_psock *psock;
0346 
0347     read_lock_bh(&sk->sk_callback_lock);
0348 
0349     psock = (struct kcm_psock *)sk->sk_user_data;
0350     if (likely(psock))
0351         strp_data_ready(&psock->strp);
0352 
0353     read_unlock_bh(&sk->sk_callback_lock);
0354 }
0355 
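     /* strparser rcv_msg callback: deliver a complete message to a KCM
      * socket, or hold it on the psock (with the strparser paused) until a
      * KCM socket can take it.
      */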
0356 /* Called with lower sock held */
0357 static void kcm_rcv_strparser(struct strparser *strp, struct sk_buff *skb)
0358 {
0359     struct kcm_psock *psock = container_of(strp, struct kcm_psock, strp);
0360     struct kcm_sock *kcm;
0361 
0362 try_queue:
0363     kcm = reserve_rx_kcm(psock, skb);
0364     if (!kcm) {
0365          /* Unable to reserve a KCM, message is held in psock and strp
0366           * is paused.
0367           */
0368         return;
0369     }
0370 
0371     if (kcm_queue_rcv_skb(&kcm->sk, skb)) {
0372         /* Should mean socket buffer full */
0373         unreserve_rx_kcm(psock, false);
0374         goto try_queue;
0375     }
0376 }
0377 
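     /* strparser parse_msg callback: run the attached BPF program on the skb
      * to determine the length of the next message in the byte stream.
      */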
0378 static int kcm_parse_func_strparser(struct strparser *strp, struct sk_buff *skb)
0379 {
0380     struct kcm_psock *psock = container_of(strp, struct kcm_psock, strp);
0381     struct bpf_prog *prog = psock->bpf_prog;
0382     int res;
0383 
0384     res = bpf_prog_run_pin_on_cpu(prog, skb);
0385     return res;
0386 }
0387 
0388 static int kcm_read_sock_done(struct strparser *strp, int err)
0389 {
0390     struct kcm_psock *psock = container_of(strp, struct kcm_psock, strp);
0391 
0392     unreserve_rx_kcm(psock, true);
0393 
0394     return err;
0395 }
0396 
0397 static void psock_state_change(struct sock *sk)
0398 {
0399     /* TCP only does an EPOLLIN for a half close. Do an EPOLLHUP here
0400      * since the application will normally not poll with EPOLLIN
0401      * on the TCP sockets.
0402      */
0403 
0404     report_csk_error(sk, EPIPE);
0405 }
0406 
0407 static void psock_write_space(struct sock *sk)
0408 {
0409     struct kcm_psock *psock;
0410     struct kcm_mux *mux;
0411     struct kcm_sock *kcm;
0412 
0413     read_lock_bh(&sk->sk_callback_lock);
0414 
0415     psock = (struct kcm_psock *)sk->sk_user_data;
0416     if (unlikely(!psock))
0417         goto out;
0418     mux = psock->mux;
0419 
0420     spin_lock_bh(&mux->lock);
0421 
0422     /* Check if the psock is reserved; if so, a KCM socket is waiting to send. */
0423     kcm = psock->tx_kcm;
0424     if (kcm && !unlikely(kcm->tx_stopped))
0425         queue_work(kcm_wq, &kcm->tx_work);
0426 
0427     spin_unlock_bh(&mux->lock);
0428 out:
0429     read_unlock_bh(&sk->sk_callback_lock);
0430 }
0431 
0432 static void unreserve_psock(struct kcm_sock *kcm);
0433 
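     /* Reserve a transmit psock for a KCM socket. If none is available, the
      * KCM socket is placed on the mux's tx waiters list and NULL is returned.
      */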
0434 /* kcm sock is locked. */
0435 static struct kcm_psock *reserve_psock(struct kcm_sock *kcm)
0436 {
0437     struct kcm_mux *mux = kcm->mux;
0438     struct kcm_psock *psock;
0439 
0440     psock = kcm->tx_psock;
0441 
0442     smp_rmb(); /* Must read tx_psock before tx_wait */
0443 
0444     if (psock) {
0445         WARN_ON(kcm->tx_wait);
0446         if (unlikely(psock->tx_stopped))
0447             unreserve_psock(kcm);
0448         else
0449             return kcm->tx_psock;
0450     }
0451 
0452     spin_lock_bh(&mux->lock);
0453 
0454     /* Check again under lock to see if a psock was reserved for this
0455      * kcm socket while the lock was dropped (via psock_now_avail).
0456      */
0457     psock = kcm->tx_psock;
0458     if (unlikely(psock)) {
0459         WARN_ON(kcm->tx_wait);
0460         spin_unlock_bh(&mux->lock);
0461         return kcm->tx_psock;
0462     }
0463 
0464     if (!list_empty(&mux->psocks_avail)) {
0465         psock = list_first_entry(&mux->psocks_avail,
0466                      struct kcm_psock,
0467                      psock_avail_list);
0468         list_del(&psock->psock_avail_list);
0469         if (kcm->tx_wait) {
0470             list_del(&kcm->wait_psock_list);
0471             kcm->tx_wait = false;
0472         }
0473         kcm->tx_psock = psock;
0474         psock->tx_kcm = kcm;
0475         KCM_STATS_INCR(psock->stats.reserved);
0476     } else if (!kcm->tx_wait) {
0477         list_add_tail(&kcm->wait_psock_list,
0478                   &mux->kcm_tx_waiters);
0479         kcm->tx_wait = true;
0480     }
0481 
0482     spin_unlock_bh(&mux->lock);
0483 
0484     return psock;
0485 }
0486 
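     /* A psock has become available for transmit: hand it to the first
      * waiting KCM socket, otherwise put it on the mux's available list.
      */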
0487 /* mux lock held */
0488 static void psock_now_avail(struct kcm_psock *psock)
0489 {
0490     struct kcm_mux *mux = psock->mux;
0491     struct kcm_sock *kcm;
0492 
0493     if (list_empty(&mux->kcm_tx_waiters)) {
0494         list_add_tail(&psock->psock_avail_list,
0495                   &mux->psocks_avail);
0496     } else {
0497         kcm = list_first_entry(&mux->kcm_tx_waiters,
0498                        struct kcm_sock,
0499                        wait_psock_list);
0500         list_del(&kcm->wait_psock_list);
0501         kcm->tx_wait = false;
0502         psock->tx_kcm = kcm;
0503 
0504         /* Commit before changing tx_psock since that is read in
0505          * reserve_psock before queuing work.
0506          */
0507         smp_mb();
0508 
0509         kcm->tx_psock = psock;
0510         KCM_STATS_INCR(psock->stats.reserved);
0511         queue_work(kcm_wq, &kcm->tx_work);
0512     }
0513 }
0514 
0515 /* kcm sock is locked. */
0516 static void unreserve_psock(struct kcm_sock *kcm)
0517 {
0518     struct kcm_psock *psock;
0519     struct kcm_mux *mux = kcm->mux;
0520 
0521     spin_lock_bh(&mux->lock);
0522 
0523     psock = kcm->tx_psock;
0524 
0525     if (WARN_ON(!psock)) {
0526         spin_unlock_bh(&mux->lock);
0527         return;
0528     }
0529 
0530     smp_rmb(); /* Read tx_psock before tx_wait */
0531 
0532     kcm_update_tx_mux_stats(mux, psock);
0533 
0534     WARN_ON(kcm->tx_wait);
0535 
0536     kcm->tx_psock = NULL;
0537     psock->tx_kcm = NULL;
0538     KCM_STATS_INCR(psock->stats.unreserved);
0539 
0540     if (unlikely(psock->tx_stopped)) {
0541         if (psock->done) {
0542             /* Deferred free */
0543             list_del(&psock->psock_list);
0544             mux->psocks_cnt--;
0545             sock_put(psock->sk);
0546             fput(psock->sk->sk_socket->file);
0547             kmem_cache_free(kcm_psockp, psock);
0548         }
0549 
0550         /* Don't put back on available list */
0551 
0552         spin_unlock_bh(&mux->lock);
0553 
0554         return;
0555     }
0556 
0557     psock_now_avail(psock);
0558 
0559     spin_unlock_bh(&mux->lock);
0560 }
0561 
0562 static void kcm_report_tx_retry(struct kcm_sock *kcm)
0563 {
0564     struct kcm_mux *mux = kcm->mux;
0565 
0566     spin_lock_bh(&mux->lock);
0567     KCM_STATS_INCR(mux->stats.tx_retries);
0568     spin_unlock_bh(&mux->lock);
0569 }
0570 
0571 /* Write any messages ready on the kcm socket.  Called with kcm sock lock
0572  * held.  Return bytes actually sent or error.
0573  */
0574 static int kcm_write_msgs(struct kcm_sock *kcm)
0575 {
0576     struct sock *sk = &kcm->sk;
0577     struct kcm_psock *psock;
0578     struct sk_buff *skb, *head;
0579     struct kcm_tx_msg *txm;
0580     unsigned short fragidx, frag_offset;
0581     unsigned int sent, total_sent = 0;
0582     int ret = 0;
0583 
0584     kcm->tx_wait_more = false;
0585     psock = kcm->tx_psock;
0586     if (unlikely(psock && psock->tx_stopped)) {
0587         /* A reserved psock was aborted asynchronously. Unreserve
0588          * it and we'll retry the message.
0589          */
0590         unreserve_psock(kcm);
0591         kcm_report_tx_retry(kcm);
0592         if (skb_queue_empty(&sk->sk_write_queue))
0593             return 0;
0594 
0595         kcm_tx_msg(skb_peek(&sk->sk_write_queue))->sent = 0;
0596 
0597     } else if (skb_queue_empty(&sk->sk_write_queue)) {
0598         return 0;
0599     }
0600 
0601     head = skb_peek(&sk->sk_write_queue);
0602     txm = kcm_tx_msg(head);
0603 
0604     if (txm->sent) {
0605         /* Send of first skbuff in queue already in progress */
0606         if (WARN_ON(!psock)) {
0607             ret = -EINVAL;
0608             goto out;
0609         }
0610         sent = txm->sent;
0611         frag_offset = txm->frag_offset;
0612         fragidx = txm->fragidx;
0613         skb = txm->frag_skb;
0614 
0615         goto do_frag;
0616     }
0617 
0618 try_again:
0619     psock = reserve_psock(kcm);
0620     if (!psock)
0621         goto out;
0622 
0623     do {
0624         skb = head;
0625         txm = kcm_tx_msg(head);
0626         sent = 0;
0627 
0628 do_frag_list:
0629         if (WARN_ON(!skb_shinfo(skb)->nr_frags)) {
0630             ret = -EINVAL;
0631             goto out;
0632         }
0633 
0634         for (fragidx = 0; fragidx < skb_shinfo(skb)->nr_frags;
0635              fragidx++) {
0636             skb_frag_t *frag;
0637 
0638             frag_offset = 0;
0639 do_frag:
0640             frag = &skb_shinfo(skb)->frags[fragidx];
0641             if (WARN_ON(!skb_frag_size(frag))) {
0642                 ret = -EINVAL;
0643                 goto out;
0644             }
0645 
0646             ret = kernel_sendpage(psock->sk->sk_socket,
0647                           skb_frag_page(frag),
0648                           skb_frag_off(frag) + frag_offset,
0649                           skb_frag_size(frag) - frag_offset,
0650                           MSG_DONTWAIT);
0651             if (ret <= 0) {
0652                 if (ret == -EAGAIN) {
0653                     /* Save state to try again when there's
0654                      * write space on the socket
0655                      */
0656                     txm->sent = sent;
0657                     txm->frag_offset = frag_offset;
0658                     txm->fragidx = fragidx;
0659                     txm->frag_skb = skb;
0660 
0661                     ret = 0;
0662                     goto out;
0663                 }
0664 
0665                 /* Hard failure in sending message, abort this
0666                  * psock since it has lost framing
0667                  * synchronization and retry sending the
0668                  * message from the beginning.
0669                  */
0670                 kcm_abort_tx_psock(psock, ret ? -ret : EPIPE,
0671                            true);
0672                 unreserve_psock(kcm);
0673 
0674                 txm->sent = 0;
0675                 kcm_report_tx_retry(kcm);
0676                 ret = 0;
0677 
0678                 goto try_again;
0679             }
0680 
0681             sent += ret;
0682             frag_offset += ret;
0683             KCM_STATS_ADD(psock->stats.tx_bytes, ret);
0684             if (frag_offset < skb_frag_size(frag)) {
0685                 /* Not finished with this frag */
0686                 goto do_frag;
0687             }
0688         }
0689 
0690         if (skb == head) {
0691             if (skb_has_frag_list(skb)) {
0692                 skb = skb_shinfo(skb)->frag_list;
0693                 goto do_frag_list;
0694             }
0695         } else if (skb->next) {
0696             skb = skb->next;
0697             goto do_frag_list;
0698         }
0699 
0700         /* Successfully sent the whole packet, account for it. */
0701         skb_dequeue(&sk->sk_write_queue);
0702         kfree_skb(head);
0703         sk->sk_wmem_queued -= sent;
0704         total_sent += sent;
0705         KCM_STATS_INCR(psock->stats.tx_msgs);
0706     } while ((head = skb_peek(&sk->sk_write_queue)));
0707 out:
0708     if (!head) {
0709         /* Done with all queued messages. */
0710         WARN_ON(!skb_queue_empty(&sk->sk_write_queue));
0711         unreserve_psock(kcm);
0712     }
0713 
0714     /* Check if write space is available */
0715     sk->sk_write_space(sk);
0716 
0717     return total_sent ? : ret;
0718 }
0719 
0720 static void kcm_tx_work(struct work_struct *w)
0721 {
0722     struct kcm_sock *kcm = container_of(w, struct kcm_sock, tx_work);
0723     struct sock *sk = &kcm->sk;
0724     int err;
0725 
0726     lock_sock(sk);
0727 
0728     /* Primarily for SOCK_DGRAM sockets, also handle asynchronous tx
0729      * aborts
0730      */
0731     err = kcm_write_msgs(kcm);
0732     if (err < 0) {
0733         /* Hard failure in write, report error on KCM socket */
0734         pr_warn("KCM: Hard failure on kcm_write_msgs %d\n", err);
0735         report_csk_error(&kcm->sk, -err);
0736         goto out;
0737     }
0738 
0739     /* Primarily for SOCK_SEQPACKET sockets */
0740     if (likely(sk->sk_socket) &&
0741         test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
0742         clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
0743         sk->sk_write_space(sk);
0744     }
0745 
0746 out:
0747     release_sock(sk);
0748 }
0749 
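     /* Flush messages that were held back by MSG_BATCH (tx_wait_more). */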
0750 static void kcm_push(struct kcm_sock *kcm)
0751 {
0752     if (kcm->tx_wait_more)
0753         kcm_write_msgs(kcm);
0754 }
0755 
0756 static ssize_t kcm_sendpage(struct socket *sock, struct page *page,
0757                 int offset, size_t size, int flags)
0758 
0759 {
0760     struct sock *sk = sock->sk;
0761     struct kcm_sock *kcm = kcm_sk(sk);
0762     struct sk_buff *skb = NULL, *head = NULL;
0763     long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
0764     bool eor;
0765     int err = 0;
0766     int i;
0767 
0768     if (flags & MSG_SENDPAGE_NOTLAST)
0769         flags |= MSG_MORE;
0770 
0771     /* No MSG_EOR from splice, only look at MSG_MORE */
0772     eor = !(flags & MSG_MORE);
0773 
0774     lock_sock(sk);
0775 
0776     sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
0777 
0778     err = -EPIPE;
0779     if (sk->sk_err)
0780         goto out_error;
0781 
0782     if (kcm->seq_skb) {
0783         /* Previously opened message */
0784         head = kcm->seq_skb;
0785         skb = kcm_tx_msg(head)->last_skb;
0786         i = skb_shinfo(skb)->nr_frags;
0787 
0788         if (skb_can_coalesce(skb, i, page, offset)) {
0789             skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], size);
0790             skb_shinfo(skb)->flags |= SKBFL_SHARED_FRAG;
0791             goto coalesced;
0792         }
0793 
0794         if (i >= MAX_SKB_FRAGS) {
0795             struct sk_buff *tskb;
0796 
0797             tskb = alloc_skb(0, sk->sk_allocation);
0798             while (!tskb) {
0799                 kcm_push(kcm);
0800                 err = sk_stream_wait_memory(sk, &timeo);
0801                 if (err)
0802                     goto out_error;
0803             }
0804 
0805             if (head == skb)
0806                 skb_shinfo(head)->frag_list = tskb;
0807             else
0808                 skb->next = tskb;
0809 
0810             skb = tskb;
0811             skb->ip_summed = CHECKSUM_UNNECESSARY;
0812             i = 0;
0813         }
0814     } else {
0815         /* Call the sk_stream functions to manage the sndbuf mem. */
0816         if (!sk_stream_memory_free(sk)) {
0817             kcm_push(kcm);
0818             set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
0819             err = sk_stream_wait_memory(sk, &timeo);
0820             if (err)
0821                 goto out_error;
0822         }
0823 
0824         head = alloc_skb(0, sk->sk_allocation);
0825         while (!head) {
0826             kcm_push(kcm);
0827             err = sk_stream_wait_memory(sk, &timeo);
0828             if (err)
0829                 goto out_error;
0830         }
0831 
0832         skb = head;
0833         i = 0;
0834     }
0835 
0836     get_page(page);
0837     skb_fill_page_desc(skb, i, page, offset, size);
0838     skb_shinfo(skb)->flags |= SKBFL_SHARED_FRAG;
0839 
0840 coalesced:
0841     skb->len += size;
0842     skb->data_len += size;
0843     skb->truesize += size;
0844     sk->sk_wmem_queued += size;
0845     sk_mem_charge(sk, size);
0846 
0847     if (head != skb) {
0848         head->len += size;
0849         head->data_len += size;
0850         head->truesize += size;
0851     }
0852 
0853     if (eor) {
0854         bool not_busy = skb_queue_empty(&sk->sk_write_queue);
0855 
0856         /* Message complete, queue it on send buffer */
0857         __skb_queue_tail(&sk->sk_write_queue, head);
0858         kcm->seq_skb = NULL;
0859         KCM_STATS_INCR(kcm->stats.tx_msgs);
0860 
0861         if (flags & MSG_BATCH) {
0862             kcm->tx_wait_more = true;
0863         } else if (kcm->tx_wait_more || not_busy) {
0864             err = kcm_write_msgs(kcm);
0865             if (err < 0) {
0866                 /* We got a hard error in write_msgs but have
0867                  * already queued this message. Report an error
0868                  * in the socket, but don't affect return value
0869                  * from sendmsg
0870                  */
0871                 pr_warn("KCM: Hard failure on kcm_write_msgs\n");
0872                 report_csk_error(&kcm->sk, -err);
0873             }
0874         }
0875     } else {
0876         /* Message not complete, save state */
0877         kcm->seq_skb = head;
0878         kcm_tx_msg(head)->last_skb = skb;
0879     }
0880 
0881     KCM_STATS_ADD(kcm->stats.tx_bytes, size);
0882 
0883     release_sock(sk);
0884     return size;
0885 
0886 out_error:
0887     kcm_push(kcm);
0888 
0889     err = sk_stream_error(sk, flags, err);
0890 
0891     /* make sure we wake any epoll edge trigger waiter */
0892     if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN))
0893         sk->sk_write_space(sk);
0894 
0895     release_sock(sk);
0896     return err;
0897 }
0898 
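     /* Message boundaries: for SOCK_DGRAM a message is complete unless
      * MSG_MORE is set; for SOCK_SEQPACKET completion is signalled by MSG_EOR.
      * A completed message is queued on the write queue; MSG_BATCH defers the
      * actual transmit until a later send or flush.
      */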
0899 static int kcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
0900 {
0901     struct sock *sk = sock->sk;
0902     struct kcm_sock *kcm = kcm_sk(sk);
0903     struct sk_buff *skb = NULL, *head = NULL;
0904     size_t copy, copied = 0;
0905     long timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
0906     int eor = (sock->type == SOCK_DGRAM) ?
0907           !(msg->msg_flags & MSG_MORE) : !!(msg->msg_flags & MSG_EOR);
0908     int err = -EPIPE;
0909 
0910     lock_sock(sk);
0911 
0912     /* Per tcp_sendmsg this should be in poll */
0913     sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
0914 
0915     if (sk->sk_err)
0916         goto out_error;
0917 
0918     if (kcm->seq_skb) {
0919         /* Previously opened message */
0920         head = kcm->seq_skb;
0921         skb = kcm_tx_msg(head)->last_skb;
0922         goto start;
0923     }
0924 
0925     /* Call the sk_stream functions to manage the sndbuf mem. */
0926     if (!sk_stream_memory_free(sk)) {
0927         kcm_push(kcm);
0928         set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
0929         err = sk_stream_wait_memory(sk, &timeo);
0930         if (err)
0931             goto out_error;
0932     }
0933 
0934     if (msg_data_left(msg)) {
0935         /* New message, alloc head skb */
0936         head = alloc_skb(0, sk->sk_allocation);
0937         while (!head) {
0938             kcm_push(kcm);
0939             err = sk_stream_wait_memory(sk, &timeo);
0940             if (err)
0941                 goto out_error;
0942 
0943             head = alloc_skb(0, sk->sk_allocation);
0944         }
0945 
0946         skb = head;
0947 
0948         /* Set ip_summed to CHECKSUM_UNNECESSARY to avoid calling
0949          * csum_and_copy_from_iter from skb_do_copy_data_nocache.
0950          */
0951         skb->ip_summed = CHECKSUM_UNNECESSARY;
0952     }
0953 
0954 start:
0955     while (msg_data_left(msg)) {
0956         bool merge = true;
0957         int i = skb_shinfo(skb)->nr_frags;
0958         struct page_frag *pfrag = sk_page_frag(sk);
0959 
0960         if (!sk_page_frag_refill(sk, pfrag))
0961             goto wait_for_memory;
0962 
0963         if (!skb_can_coalesce(skb, i, pfrag->page,
0964                       pfrag->offset)) {
0965             if (i == MAX_SKB_FRAGS) {
0966                 struct sk_buff *tskb;
0967 
0968                 tskb = alloc_skb(0, sk->sk_allocation);
0969                 if (!tskb)
0970                     goto wait_for_memory;
0971 
0972                 if (head == skb)
0973                     skb_shinfo(head)->frag_list = tskb;
0974                 else
0975                     skb->next = tskb;
0976 
0977                 skb = tskb;
0978                 skb->ip_summed = CHECKSUM_UNNECESSARY;
0979                 continue;
0980             }
0981             merge = false;
0982         }
0983 
0984         copy = min_t(int, msg_data_left(msg),
0985                  pfrag->size - pfrag->offset);
0986 
0987         if (!sk_wmem_schedule(sk, copy))
0988             goto wait_for_memory;
0989 
0990         err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb,
0991                            pfrag->page,
0992                            pfrag->offset,
0993                            copy);
0994         if (err)
0995             goto out_error;
0996 
0997         /* Update the skb. */
0998         if (merge) {
0999             skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1000         } else {
1001             skb_fill_page_desc(skb, i, pfrag->page,
1002                        pfrag->offset, copy);
1003             get_page(pfrag->page);
1004         }
1005 
1006         pfrag->offset += copy;
1007         copied += copy;
1008         if (head != skb) {
1009             head->len += copy;
1010             head->data_len += copy;
1011         }
1012 
1013         continue;
1014 
1015 wait_for_memory:
1016         kcm_push(kcm);
1017         err = sk_stream_wait_memory(sk, &timeo);
1018         if (err)
1019             goto out_error;
1020     }
1021 
1022     if (eor) {
1023         bool not_busy = skb_queue_empty(&sk->sk_write_queue);
1024 
1025         if (head) {
1026             /* Message complete, queue it on send buffer */
1027             __skb_queue_tail(&sk->sk_write_queue, head);
1028             kcm->seq_skb = NULL;
1029             KCM_STATS_INCR(kcm->stats.tx_msgs);
1030         }
1031 
1032         if (msg->msg_flags & MSG_BATCH) {
1033             kcm->tx_wait_more = true;
1034         } else if (kcm->tx_wait_more || not_busy) {
1035             err = kcm_write_msgs(kcm);
1036             if (err < 0) {
1037                 /* We got a hard error in write_msgs but have
1038                  * already queued this message. Report an error
1039                  * in the socket, but don't affect return value
1040                  * from sendmsg
1041                  */
1042                 pr_warn("KCM: Hard failure on kcm_write_msgs\n");
1043                 report_csk_error(&kcm->sk, -err);
1044             }
1045         }
1046     } else {
1047         /* Message not complete, save state */
1048 partial_message:
1049         if (head) {
1050             kcm->seq_skb = head;
1051             kcm_tx_msg(head)->last_skb = skb;
1052         }
1053     }
1054 
1055     KCM_STATS_ADD(kcm->stats.tx_bytes, copied);
1056 
1057     release_sock(sk);
1058     return copied;
1059 
1060 out_error:
1061     kcm_push(kcm);
1062 
1063     if (copied && sock->type == SOCK_SEQPACKET) {
1064         /* Wrote some bytes before encountering an
1065          * error, return partial success.
1066          */
1067         goto partial_message;
1068     }
1069 
1070     if (head != kcm->seq_skb)
1071         kfree_skb(head);
1072 
1073     err = sk_stream_error(sk, msg->msg_flags, err);
1074 
1075     /* make sure we wake any epoll edge trigger waiter */
1076     if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN))
1077         sk->sk_write_space(sk);
1078 
1079     release_sock(sk);
1080     return err;
1081 }
1082 
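     /* Wait for a message on the receive queue, honoring MSG_DONTWAIT, the
      * receive timeout and pending signals. Returns NULL and sets *err on
      * failure.
      */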
1083 static struct sk_buff *kcm_wait_data(struct sock *sk, int flags,
1084                      long timeo, int *err)
1085 {
1086     struct sk_buff *skb;
1087 
1088     while (!(skb = skb_peek(&sk->sk_receive_queue))) {
1089         if (sk->sk_err) {
1090             *err = sock_error(sk);
1091             return NULL;
1092         }
1093 
1094         if (sock_flag(sk, SOCK_DONE))
1095             return NULL;
1096 
1097         if ((flags & MSG_DONTWAIT) || !timeo) {
1098             *err = -EAGAIN;
1099             return NULL;
1100         }
1101 
1102         sk_wait_data(sk, &timeo, NULL);
1103 
1104         /* Handle signals */
1105         if (signal_pending(current)) {
1106             *err = sock_intr_errno(timeo);
1107             return NULL;
1108         }
1109     }
1110 
1111     return skb;
1112 }
1113 
1114 static int kcm_recvmsg(struct socket *sock, struct msghdr *msg,
1115                size_t len, int flags)
1116 {
1117     struct sock *sk = sock->sk;
1118     struct kcm_sock *kcm = kcm_sk(sk);
1119     int err = 0;
1120     long timeo;
1121     struct strp_msg *stm;
1122     int copied = 0;
1123     struct sk_buff *skb;
1124 
1125     timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
1126 
1127     lock_sock(sk);
1128 
1129     skb = kcm_wait_data(sk, flags, timeo, &err);
1130     if (!skb)
1131         goto out;
1132 
1133     /* Okay, have a message on the receive queue */
1134 
1135     stm = strp_msg(skb);
1136 
1137     if (len > stm->full_len)
1138         len = stm->full_len;
1139 
1140     err = skb_copy_datagram_msg(skb, stm->offset, msg, len);
1141     if (err < 0)
1142         goto out;
1143 
1144     copied = len;
1145     if (likely(!(flags & MSG_PEEK))) {
1146         KCM_STATS_ADD(kcm->stats.rx_bytes, copied);
1147         if (copied < stm->full_len) {
1148             if (sock->type == SOCK_DGRAM) {
1149                 /* Truncated message */
1150                 msg->msg_flags |= MSG_TRUNC;
1151                 goto msg_finished;
1152             }
1153             stm->offset += copied;
1154             stm->full_len -= copied;
1155         } else {
1156 msg_finished:
1157             /* Finished with message */
1158             msg->msg_flags |= MSG_EOR;
1159             KCM_STATS_INCR(kcm->stats.rx_msgs);
1160             skb_unlink(skb, &sk->sk_receive_queue);
1161             kfree_skb(skb);
1162         }
1163     }
1164 
1165 out:
1166     release_sock(sk);
1167 
1168     return copied ? : err;
1169 }
1170 
1171 static ssize_t kcm_splice_read(struct socket *sock, loff_t *ppos,
1172                    struct pipe_inode_info *pipe, size_t len,
1173                    unsigned int flags)
1174 {
1175     struct sock *sk = sock->sk;
1176     struct kcm_sock *kcm = kcm_sk(sk);
1177     long timeo;
1178     struct strp_msg *stm;
1179     int err = 0;
1180     ssize_t copied;
1181     struct sk_buff *skb;
1182 
1183     /* Only support splice for SOCK_SEQPACKET */
1184 
1185     timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
1186 
1187     lock_sock(sk);
1188 
1189     skb = kcm_wait_data(sk, flags, timeo, &err);
1190     if (!skb)
1191         goto err_out;
1192 
1193     /* Okay, have a message on the receive queue */
1194 
1195     stm = strp_msg(skb);
1196 
1197     if (len > stm->full_len)
1198         len = stm->full_len;
1199 
1200     copied = skb_splice_bits(skb, sk, stm->offset, pipe, len, flags);
1201     if (copied < 0) {
1202         err = copied;
1203         goto err_out;
1204     }
1205 
1206     KCM_STATS_ADD(kcm->stats.rx_bytes, copied);
1207 
1208     stm->offset += copied;
1209     stm->full_len -= copied;
1210 
1211     /* We have no way to return MSG_EOR. If all the bytes have been
1212      * read we still leave the message in the receive socket buffer.
1213      * A subsequent recvmsg needs to be done to return MSG_EOR and
1214      * finish reading the message.
1215      */
1216 
1217     release_sock(sk);
1218 
1219     return copied;
1220 
1221 err_out:
1222     release_sock(sk);
1223 
1224     return err;
1225 }
1226 
1227 /* kcm sock lock held */
1228 static void kcm_recv_disable(struct kcm_sock *kcm)
1229 {
1230     struct kcm_mux *mux = kcm->mux;
1231 
1232     if (kcm->rx_disabled)
1233         return;
1234 
1235     spin_lock_bh(&mux->rx_lock);
1236 
1237     kcm->rx_disabled = 1;
1238 
1239     /* If a psock is reserved we'll do cleanup in unreserve */
1240     if (!kcm->rx_psock) {
1241         if (kcm->rx_wait) {
1242             list_del(&kcm->wait_rx_list);
1243             kcm->rx_wait = false;
1244         }
1245 
1246         requeue_rx_msgs(mux, &kcm->sk.sk_receive_queue);
1247     }
1248 
1249     spin_unlock_bh(&mux->rx_lock);
1250 }
1251 
1252 /* kcm sock lock held */
1253 static void kcm_recv_enable(struct kcm_sock *kcm)
1254 {
1255     struct kcm_mux *mux = kcm->mux;
1256 
1257     if (!kcm->rx_disabled)
1258         return;
1259 
1260     spin_lock_bh(&mux->rx_lock);
1261 
1262     kcm->rx_disabled = 0;
1263     kcm_rcv_ready(kcm);
1264 
1265     spin_unlock_bh(&mux->rx_lock);
1266 }
1267 
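     /* Illustrative userspace usage (not part of this file), assuming kcm_fd
      * is a KCM socket descriptor:
      *
      *    int val = 1;
      *
      *    setsockopt(kcm_fd, SOL_KCM, KCM_RECV_DISABLE, &val, sizeof(val));
      */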
1268 static int kcm_setsockopt(struct socket *sock, int level, int optname,
1269               sockptr_t optval, unsigned int optlen)
1270 {
1271     struct kcm_sock *kcm = kcm_sk(sock->sk);
1272     int val, valbool;
1273     int err = 0;
1274 
1275     if (level != SOL_KCM)
1276         return -ENOPROTOOPT;
1277 
1278     if (optlen < sizeof(int))
1279         return -EINVAL;
1280 
1281     if (copy_from_sockptr(&val, optval, sizeof(int)))
1282         return -EFAULT;
1283 
1284     valbool = val ? 1 : 0;
1285 
1286     switch (optname) {
1287     case KCM_RECV_DISABLE:
1288         lock_sock(&kcm->sk);
1289         if (valbool)
1290             kcm_recv_disable(kcm);
1291         else
1292             kcm_recv_enable(kcm);
1293         release_sock(&kcm->sk);
1294         break;
1295     default:
1296         err = -ENOPROTOOPT;
1297     }
1298 
1299     return err;
1300 }
1301 
1302 static int kcm_getsockopt(struct socket *sock, int level, int optname,
1303               char __user *optval, int __user *optlen)
1304 {
1305     struct kcm_sock *kcm = kcm_sk(sock->sk);
1306     int val, len;
1307 
1308     if (level != SOL_KCM)
1309         return -ENOPROTOOPT;
1310 
1311     if (get_user(len, optlen))
1312         return -EFAULT;
1313 
1314     len = min_t(unsigned int, len, sizeof(int));
1315     if (len < 0)
1316         return -EINVAL;
1317 
1318     switch (optname) {
1319     case KCM_RECV_DISABLE:
1320         val = kcm->rx_disabled;
1321         break;
1322     default:
1323         return -ENOPROTOOPT;
1324     }
1325 
1326     if (put_user(len, optlen))
1327         return -EFAULT;
1328     if (copy_to_user(optval, &val, len))
1329         return -EFAULT;
1330     return 0;
1331 }
1332 
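     /* Attach a new KCM socket to a mux: assign the lowest unused index, add
      * it to the mux's socket list and mark it ready to receive.
      */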
1333 static void init_kcm_sock(struct kcm_sock *kcm, struct kcm_mux *mux)
1334 {
1335     struct kcm_sock *tkcm;
1336     struct list_head *head;
1337     int index = 0;
1338 
1339     /* For SOCK_SEQPACKET sock type, datagram_poll checks the sk_state, so
1340      * we set sk_state, otherwise epoll_wait always returns right away with
1341      * EPOLLHUP
1342      */
1343     kcm->sk.sk_state = TCP_ESTABLISHED;
1344 
1345     /* Add to mux's kcm sockets list */
1346     kcm->mux = mux;
1347     spin_lock_bh(&mux->lock);
1348 
1349     head = &mux->kcm_socks;
1350     list_for_each_entry(tkcm, &mux->kcm_socks, kcm_sock_list) {
1351         if (tkcm->index != index)
1352             break;
1353         head = &tkcm->kcm_sock_list;
1354         index++;
1355     }
1356 
1357     list_add(&kcm->kcm_sock_list, head);
1358     kcm->index = index;
1359 
1360     mux->kcm_socks_cnt++;
1361     spin_unlock_bh(&mux->lock);
1362 
1363     INIT_WORK(&kcm->tx_work, kcm_tx_work);
1364 
1365     spin_lock_bh(&mux->rx_lock);
1366     kcm_rcv_ready(kcm);
1367     spin_unlock_bh(&mux->rx_lock);
1368 }
1369 
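     /* Attach a connected TCP socket to the mux as a psock: allocate the
      * psock, initialize the strparser with the given BPF parser program, and
      * take over the TCP socket's data_ready, write_space and state_change
      * callbacks.
      */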
1370 static int kcm_attach(struct socket *sock, struct socket *csock,
1371               struct bpf_prog *prog)
1372 {
1373     struct kcm_sock *kcm = kcm_sk(sock->sk);
1374     struct kcm_mux *mux = kcm->mux;
1375     struct sock *csk;
1376     struct kcm_psock *psock = NULL, *tpsock;
1377     struct list_head *head;
1378     int index = 0;
1379     static const struct strp_callbacks cb = {
1380         .rcv_msg = kcm_rcv_strparser,
1381         .parse_msg = kcm_parse_func_strparser,
1382         .read_sock_done = kcm_read_sock_done,
1383     };
1384     int err = 0;
1385 
1386     csk = csock->sk;
1387     if (!csk)
1388         return -EINVAL;
1389 
1390     lock_sock(csk);
1391 
1392     /* Only allow TCP sockets to be attached for now */
1393     if ((csk->sk_family != AF_INET && csk->sk_family != AF_INET6) ||
1394         csk->sk_protocol != IPPROTO_TCP) {
1395         err = -EOPNOTSUPP;
1396         goto out;
1397     }
1398 
1399     /* Don't allow listeners or closed sockets */
1400     if (csk->sk_state == TCP_LISTEN || csk->sk_state == TCP_CLOSE) {
1401         err = -EOPNOTSUPP;
1402         goto out;
1403     }
1404 
1405     psock = kmem_cache_zalloc(kcm_psockp, GFP_KERNEL);
1406     if (!psock) {
1407         err = -ENOMEM;
1408         goto out;
1409     }
1410 
1411     psock->mux = mux;
1412     psock->sk = csk;
1413     psock->bpf_prog = prog;
1414 
1415     write_lock_bh(&csk->sk_callback_lock);
1416 
1417     /* Check if sk_user_data is already in use by KCM or someone else.
1418      * Must be done under lock to prevent race conditions.
1419      */
1420     if (csk->sk_user_data) {
1421         write_unlock_bh(&csk->sk_callback_lock);
1422         kmem_cache_free(kcm_psockp, psock);
1423         err = -EALREADY;
1424         goto out;
1425     }
1426 
1427     err = strp_init(&psock->strp, csk, &cb);
1428     if (err) {
1429         write_unlock_bh(&csk->sk_callback_lock);
1430         kmem_cache_free(kcm_psockp, psock);
1431         goto out;
1432     }
1433 
1434     psock->save_data_ready = csk->sk_data_ready;
1435     psock->save_write_space = csk->sk_write_space;
1436     psock->save_state_change = csk->sk_state_change;
1437     csk->sk_user_data = psock;
1438     csk->sk_data_ready = psock_data_ready;
1439     csk->sk_write_space = psock_write_space;
1440     csk->sk_state_change = psock_state_change;
1441 
1442     write_unlock_bh(&csk->sk_callback_lock);
1443 
1444     sock_hold(csk);
1445 
1446     /* Finished initialization, now add the psock to the MUX. */
1447     spin_lock_bh(&mux->lock);
1448     head = &mux->psocks;
1449     list_for_each_entry(tpsock, &mux->psocks, psock_list) {
1450         if (tpsock->index != index)
1451             break;
1452         head = &tpsock->psock_list;
1453         index++;
1454     }
1455 
1456     list_add(&psock->psock_list, head);
1457     psock->index = index;
1458 
1459     KCM_STATS_INCR(mux->stats.psock_attach);
1460     mux->psocks_cnt++;
1461     psock_now_avail(psock);
1462     spin_unlock_bh(&mux->lock);
1463 
1464     /* Schedule RX work in case there are already bytes queued */
1465     strp_check_rcv(&psock->strp);
1466 
1467 out:
1468     release_sock(csk);
1469 
1470     return err;
1471 }
1472 
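     /* Illustrative userspace usage (not part of this file), assuming kcm_fd,
      * tcp_fd and bpf_prog_fd are valid descriptors:
      *
      *    struct kcm_attach attach = {
      *            .fd = tcp_fd,          (a connected TCP socket)
      *            .bpf_fd = bpf_prog_fd, (a BPF_PROG_TYPE_SOCKET_FILTER program)
      *    };
      *
      *    ioctl(kcm_fd, SIOCKCMATTACH, &attach);
      */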
1473 static int kcm_attach_ioctl(struct socket *sock, struct kcm_attach *info)
1474 {
1475     struct socket *csock;
1476     struct bpf_prog *prog;
1477     int err;
1478 
1479     csock = sockfd_lookup(info->fd, &err);
1480     if (!csock)
1481         return -ENOENT;
1482 
1483     prog = bpf_prog_get_type(info->bpf_fd, BPF_PROG_TYPE_SOCKET_FILTER);
1484     if (IS_ERR(prog)) {
1485         err = PTR_ERR(prog);
1486         goto out;
1487     }
1488 
1489     err = kcm_attach(sock, csock, prog);
1490     if (err) {
1491         bpf_prog_put(prog);
1492         goto out;
1493     }
1494 
1495     /* Keep the file reference taken by sockfd_lookup; it is dropped at unattach */
1496 
1497     return 0;
1498 out:
1499     sockfd_put(csock);
1500     return err;
1501 }
1502 
1503 static void kcm_unattach(struct kcm_psock *psock)
1504 {
1505     struct sock *csk = psock->sk;
1506     struct kcm_mux *mux = psock->mux;
1507 
1508     lock_sock(csk);
1509 
1510     /* Stop getting callbacks from TCP socket. After this there should
1511      * be no way to reserve a kcm for this psock.
1512      */
1513     write_lock_bh(&csk->sk_callback_lock);
1514     csk->sk_user_data = NULL;
1515     csk->sk_data_ready = psock->save_data_ready;
1516     csk->sk_write_space = psock->save_write_space;
1517     csk->sk_state_change = psock->save_state_change;
1518     strp_stop(&psock->strp);
1519 
1520     if (WARN_ON(psock->rx_kcm)) {
1521         write_unlock_bh(&csk->sk_callback_lock);
1522         release_sock(csk);
1523         return;
1524     }
1525 
1526     spin_lock_bh(&mux->rx_lock);
1527 
1528     /* Stop receiver activities. After this point psock should not be
1529      * able to get onto ready list either through callbacks or work.
1530      */
1531     if (psock->ready_rx_msg) {
1532         list_del(&psock->psock_ready_list);
1533         kfree_skb(psock->ready_rx_msg);
1534         psock->ready_rx_msg = NULL;
1535         KCM_STATS_INCR(mux->stats.rx_ready_drops);
1536     }
1537 
1538     spin_unlock_bh(&mux->rx_lock);
1539 
1540     write_unlock_bh(&csk->sk_callback_lock);
1541 
1542     /* Call strp_done without sock lock */
1543     release_sock(csk);
1544     strp_done(&psock->strp);
1545     lock_sock(csk);
1546 
1547     bpf_prog_put(psock->bpf_prog);
1548 
1549     spin_lock_bh(&mux->lock);
1550 
1551     aggregate_psock_stats(&psock->stats, &mux->aggregate_psock_stats);
1552     save_strp_stats(&psock->strp, &mux->aggregate_strp_stats);
1553 
1554     KCM_STATS_INCR(mux->stats.psock_unattach);
1555 
1556     if (psock->tx_kcm) {
1557         /* psock was reserved.  Just mark it finished and we will clean
1558          * up in the kcm paths; we need the kcm lock, which cannot be
1559          * acquired here.
1560          */
1561         KCM_STATS_INCR(mux->stats.psock_unattach_rsvd);
1562         spin_unlock_bh(&mux->lock);
1563 
1564         /* We are unattaching a socket that is reserved. Abort the
1565          * socket since we may be out of sync in sending on it. We need
1566          * to do this without the mux lock.
1567          */
1568         kcm_abort_tx_psock(psock, EPIPE, false);
1569 
1570         spin_lock_bh(&mux->lock);
1571         if (!psock->tx_kcm) {
1572             /* psock was unreserved in the window where the mux was unlocked */
1573             goto no_reserved;
1574         }
1575         psock->done = 1;
1576 
1577         /* Commit done before queuing work to process it */
1578         smp_mb();
1579 
1580         /* Queue tx work to make sure psock->done is handled */
1581         queue_work(kcm_wq, &psock->tx_kcm->tx_work);
1582         spin_unlock_bh(&mux->lock);
1583     } else {
1584 no_reserved:
1585         if (!psock->tx_stopped)
1586             list_del(&psock->psock_avail_list);
1587         list_del(&psock->psock_list);
1588         mux->psocks_cnt--;
1589         spin_unlock_bh(&mux->lock);
1590 
1591         sock_put(csk);
1592         fput(csk->sk_socket->file);
1593         kmem_cache_free(kcm_psockp, psock);
1594     }
1595 
1596     release_sock(csk);
1597 }
1598 
1599 static int kcm_unattach_ioctl(struct socket *sock, struct kcm_unattach *info)
1600 {
1601     struct kcm_sock *kcm = kcm_sk(sock->sk);
1602     struct kcm_mux *mux = kcm->mux;
1603     struct kcm_psock *psock;
1604     struct socket *csock;
1605     struct sock *csk;
1606     int err;
1607 
1608     csock = sockfd_lookup(info->fd, &err);
1609     if (!csock)
1610         return -ENOENT;
1611 
1612     csk = csock->sk;
1613     if (!csk) {
1614         err = -EINVAL;
1615         goto out;
1616     }
1617 
1618     err = -ENOENT;
1619 
1620     spin_lock_bh(&mux->lock);
1621 
1622     list_for_each_entry(psock, &mux->psocks, psock_list) {
1623         if (psock->sk != csk)
1624             continue;
1625 
1626         /* Found the matching psock */
1627 
1628         if (psock->unattaching || WARN_ON(psock->done)) {
1629             err = -EALREADY;
1630             break;
1631         }
1632 
1633         psock->unattaching = 1;
1634 
1635         spin_unlock_bh(&mux->lock);
1636 
1637         /* Lower socket lock should already be held */
1638         kcm_unattach(psock);
1639 
1640         err = 0;
1641         goto out;
1642     }
1643 
1644     spin_unlock_bh(&mux->lock);
1645 
1646 out:
1647     sockfd_put(csock);
1648     return err;
1649 }
1650 
1651 static struct proto kcm_proto = {
1652     .name   = "KCM",
1653     .owner  = THIS_MODULE,
1654     .obj_size = sizeof(struct kcm_sock),
1655 };
1656 
1657 /* Clone a kcm socket. */
1658 static struct file *kcm_clone(struct socket *osock)
1659 {
1660     struct socket *newsock;
1661     struct sock *newsk;
1662 
1663     newsock = sock_alloc();
1664     if (!newsock)
1665         return ERR_PTR(-ENFILE);
1666 
1667     newsock->type = osock->type;
1668     newsock->ops = osock->ops;
1669 
1670     __module_get(newsock->ops->owner);
1671 
1672     newsk = sk_alloc(sock_net(osock->sk), PF_KCM, GFP_KERNEL,
1673              &kcm_proto, false);
1674     if (!newsk) {
1675         sock_release(newsock);
1676         return ERR_PTR(-ENOMEM);
1677     }
1678     sock_init_data(newsock, newsk);
1679     init_kcm_sock(kcm_sk(newsk), kcm_sk(osock->sk)->mux);
1680 
1681     return sock_alloc_file(newsock, 0, osock->sk->sk_prot_creator->name);
1682 }
1683 
1684 static int kcm_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1685 {
1686     int err;
1687 
1688     switch (cmd) {
1689     case SIOCKCMATTACH: {
1690         struct kcm_attach info;
1691 
1692         if (copy_from_user(&info, (void __user *)arg, sizeof(info)))
1693             return -EFAULT;
1694 
1695         err = kcm_attach_ioctl(sock, &info);
1696 
1697         break;
1698     }
1699     case SIOCKCMUNATTACH: {
1700         struct kcm_unattach info;
1701 
1702         if (copy_from_user(&info, (void __user *)arg, sizeof(info)))
1703             return -EFAULT;
1704 
1705         err = kcm_unattach_ioctl(sock, &info);
1706 
1707         break;
1708     }
1709     case SIOCKCMCLONE: {
1710         struct kcm_clone info;
1711         struct file *file;
1712 
1713         info.fd = get_unused_fd_flags(0);
1714         if (unlikely(info.fd < 0))
1715             return info.fd;
1716 
1717         file = kcm_clone(sock);
1718         if (IS_ERR(file)) {
1719             put_unused_fd(info.fd);
1720             return PTR_ERR(file);
1721         }
1722         if (copy_to_user((void __user *)arg, &info,
1723                  sizeof(info))) {
1724             put_unused_fd(info.fd);
1725             fput(file);
1726             return -EFAULT;
1727         }
1728         fd_install(info.fd, file);
1729         err = 0;
1730         break;
1731     }
1732     default:
1733         err = -ENOIOCTLCMD;
1734         break;
1735     }
1736 
1737     return err;
1738 }
1739 
1740 static void free_mux(struct rcu_head *rcu)
1741 {
1742     struct kcm_mux *mux = container_of(rcu,
1743         struct kcm_mux, rcu);
1744 
1745     kmem_cache_free(kcm_muxp, mux);
1746 }
1747 
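     /* Tear down a mux once its last KCM socket is gone: unattach all psocks,
      * purge the receive hold queue, fold the mux's stats into the
      * per-namespace aggregates and free the mux after an RCU grace period.
      */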
1748 static void release_mux(struct kcm_mux *mux)
1749 {
1750     struct kcm_net *knet = mux->knet;
1751     struct kcm_psock *psock, *tmp_psock;
1752 
1753     /* Release psocks */
1754     list_for_each_entry_safe(psock, tmp_psock,
1755                  &mux->psocks, psock_list) {
1756         if (!WARN_ON(psock->unattaching))
1757             kcm_unattach(psock);
1758     }
1759 
1760     if (WARN_ON(mux->psocks_cnt))
1761         return;
1762 
1763     __skb_queue_purge(&mux->rx_hold_queue);
1764 
1765     mutex_lock(&knet->mutex);
1766     aggregate_mux_stats(&mux->stats, &knet->aggregate_mux_stats);
1767     aggregate_psock_stats(&mux->aggregate_psock_stats,
1768                   &knet->aggregate_psock_stats);
1769     aggregate_strp_stats(&mux->aggregate_strp_stats,
1770                  &knet->aggregate_strp_stats);
1771     list_del_rcu(&mux->kcm_mux_list);
1772     knet->count--;
1773     mutex_unlock(&knet->mutex);
1774 
1775     call_rcu(&mux->rcu, free_mux);
1776 }
1777 
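     /* Final teardown of a KCM socket: requeue pending receive messages to
      * other KCM sockets, detach from the mux, and release the mux if this
      * was its last socket. If a psock is still reserved for receive, the
      * teardown is deferred to unreserve_rx_kcm.
      */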
1778 static void kcm_done(struct kcm_sock *kcm)
1779 {
1780     struct kcm_mux *mux = kcm->mux;
1781     struct sock *sk = &kcm->sk;
1782     int socks_cnt;
1783 
1784     spin_lock_bh(&mux->rx_lock);
1785     if (kcm->rx_psock) {
1786         /* Cleanup in unreserve_rx_kcm */
1787         WARN_ON(kcm->done);
1788         kcm->rx_disabled = 1;
1789         kcm->done = 1;
1790         spin_unlock_bh(&mux->rx_lock);
1791         return;
1792     }
1793 
1794     if (kcm->rx_wait) {
1795         list_del(&kcm->wait_rx_list);
1796         kcm->rx_wait = false;
1797     }
1798     /* Move any pending receive messages to other kcm sockets */
1799     requeue_rx_msgs(mux, &sk->sk_receive_queue);
1800 
1801     spin_unlock_bh(&mux->rx_lock);
1802 
1803     if (WARN_ON(sk_rmem_alloc_get(sk)))
1804         return;
1805 
1806     /* Detach from MUX */
1807     spin_lock_bh(&mux->lock);
1808 
1809     list_del(&kcm->kcm_sock_list);
1810     mux->kcm_socks_cnt--;
1811     socks_cnt = mux->kcm_socks_cnt;
1812 
1813     spin_unlock_bh(&mux->lock);
1814 
1815     if (!socks_cnt) {
1816         /* We are done with the mux now. */
1817         release_mux(mux);
1818     }
1819 
1820     WARN_ON(kcm->rx_wait);
1821 
1822     sock_put(&kcm->sk);
1823 }
1824 
1825 /* Called by kcm_release to close a KCM socket.
1826  * If this is the last KCM socket on the MUX, destroy the MUX.
1827  */
1828 static int kcm_release(struct socket *sock)
1829 {
1830     struct sock *sk = sock->sk;
1831     struct kcm_sock *kcm;
1832     struct kcm_mux *mux;
1833     struct kcm_psock *psock;
1834 
1835     if (!sk)
1836         return 0;
1837 
1838     kcm = kcm_sk(sk);
1839     mux = kcm->mux;
1840 
1841     sock_orphan(sk);
1842     kfree_skb(kcm->seq_skb);
1843 
1844     lock_sock(sk);
1845     /* Purge queue under lock to avoid race condition with tx_work trying
1846      * to act when queue is nonempty. If tx_work runs after this point
1847      * it will just return.
1848      */
1849     __skb_queue_purge(&sk->sk_write_queue);
1850 
1851     /* Set tx_stopped. This is checked when psock is bound to a kcm and we
1852      * get a writespace callback. This prevents further work being queued
1853      * from the callback (unbinding the psock occurs after canceling work).
1854      */
1855     kcm->tx_stopped = 1;
1856 
1857     release_sock(sk);
1858 
1859     spin_lock_bh(&mux->lock);
1860     if (kcm->tx_wait) {
1861         /* Take off the tx_wait list; after this point there should be no way
1862          * that a psock will be assigned to this kcm.
1863          */
1864         list_del(&kcm->wait_psock_list);
1865         kcm->tx_wait = false;
1866     }
1867     spin_unlock_bh(&mux->lock);
1868 
1869     /* Cancel work. After this point there should be no outside references
1870      * to the kcm socket.
1871      */
1872     cancel_work_sync(&kcm->tx_work);
1873 
1874     lock_sock(sk);
1875     psock = kcm->tx_psock;
1876     if (psock) {
1877         /* A psock was reserved, so we need to kill it since it
1878          * may already have some bytes queued from a message. We
1879          * need to do this after removing the kcm from the tx_wait list.
1880          */
1881         kcm_abort_tx_psock(psock, EPIPE, false);
1882         unreserve_psock(kcm);
1883     }
1884     release_sock(sk);
1885 
1886     WARN_ON(kcm->tx_wait);
1887     WARN_ON(kcm->tx_psock);
1888 
1889     sock->sk = NULL;
1890 
1891     kcm_done(kcm);
1892 
1893     return 0;
1894 }
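/* Hedged userspace sketch of the teardown this function implements
 * (illustrative, not part of this file). Closing the last KCM socket on a
 * mux drives kcm_release() -> kcm_done() -> release_mux(). AF_KCM,
 * KCMPROTO_CONNECTED, SIOCKCMATTACH and struct kcm_attach come from the
 * uapi headers; tcp_fd (an already connected TCP socket) and bpf_fd (a
 * loaded message-parsing BPF program) are assumed to exist:
 *
 *	int kcmfd = socket(AF_KCM, SOCK_DGRAM, KCMPROTO_CONNECTED);
 *	struct kcm_attach attach = { .fd = tcp_fd, .bpf_fd = bpf_fd };
 *	ioctl(kcmfd, SIOCKCMATTACH, &attach);
 *	...
 *	close(kcmfd);	// last KCM socket on the mux: psocks unattach and
 *			// the mux is released via kcm_done()/release_mux()
 */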
1895 
1896 static const struct proto_ops kcm_dgram_ops = {
1897     .family =   PF_KCM,
1898     .owner =    THIS_MODULE,
1899     .release =  kcm_release,
1900     .bind =     sock_no_bind,
1901     .connect =  sock_no_connect,
1902     .socketpair =   sock_no_socketpair,
1903     .accept =   sock_no_accept,
1904     .getname =  sock_no_getname,
1905     .poll =     datagram_poll,
1906     .ioctl =    kcm_ioctl,
1907     .listen =   sock_no_listen,
1908     .shutdown = sock_no_shutdown,
1909     .setsockopt =   kcm_setsockopt,
1910     .getsockopt =   kcm_getsockopt,
1911     .sendmsg =  kcm_sendmsg,
1912     .recvmsg =  kcm_recvmsg,
1913     .mmap =     sock_no_mmap,
1914     .sendpage = kcm_sendpage,
1915 };
1916 
1917 static const struct proto_ops kcm_seqpacket_ops = {
1918     .family =   PF_KCM,
1919     .owner =    THIS_MODULE,
1920     .release =  kcm_release,
1921     .bind =     sock_no_bind,
1922     .connect =  sock_no_connect,
1923     .socketpair =   sock_no_socketpair,
1924     .accept =   sock_no_accept,
1925     .getname =  sock_no_getname,
1926     .poll =     datagram_poll,
1927     .ioctl =    kcm_ioctl,
1928     .listen =   sock_no_listen,
1929     .shutdown = sock_no_shutdown,
1930     .setsockopt =   kcm_setsockopt,
1931     .getsockopt =   kcm_getsockopt,
1932     .sendmsg =  kcm_sendmsg,
1933     .recvmsg =  kcm_recvmsg,
1934     .mmap =     sock_no_mmap,
1935     .sendpage = kcm_sendpage,
1936     .splice_read =  kcm_splice_read,
1937 };
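/* The seqpacket ops differ from the dgram ops above only in providing
 * splice_read. A hedged userspace sketch of splicing one parsed KCM
 * message into a pipe (illustrative; kcmfd is an assumed SOCK_SEQPACKET
 * KCM socket and pipefd an assumed pipe created with pipe()):
 *
 *	ssize_t n = splice(kcmfd, NULL, pipefd[1], NULL, 65536, 0);
 */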
1938 
1939 /* Create a KCM socket and select its proto_ops based on the socket type */
1940 static int kcm_create(struct net *net, struct socket *sock,
1941               int protocol, int kern)
1942 {
1943     struct kcm_net *knet = net_generic(net, kcm_net_id);
1944     struct sock *sk;
1945     struct kcm_mux *mux;
1946 
1947     switch (sock->type) {
1948     case SOCK_DGRAM:
1949         sock->ops = &kcm_dgram_ops;
1950         break;
1951     case SOCK_SEQPACKET:
1952         sock->ops = &kcm_seqpacket_ops;
1953         break;
1954     default:
1955         return -ESOCKTNOSUPPORT;
1956     }
1957 
1958     if (protocol != KCMPROTO_CONNECTED)
1959         return -EPROTONOSUPPORT;
1960 
1961     sk = sk_alloc(net, PF_KCM, GFP_KERNEL, &kcm_proto, kern);
1962     if (!sk)
1963         return -ENOMEM;
1964 
1965     /* Allocate a kcm mux, shared between KCM sockets */
1966     mux = kmem_cache_zalloc(kcm_muxp, GFP_KERNEL);
1967     if (!mux) {
1968         sk_free(sk);
1969         return -ENOMEM;
1970     }
1971 
1972     spin_lock_init(&mux->lock);
1973     spin_lock_init(&mux->rx_lock);
1974     INIT_LIST_HEAD(&mux->kcm_socks);
1975     INIT_LIST_HEAD(&mux->kcm_rx_waiters);
1976     INIT_LIST_HEAD(&mux->kcm_tx_waiters);
1977 
1978     INIT_LIST_HEAD(&mux->psocks);
1979     INIT_LIST_HEAD(&mux->psocks_ready);
1980     INIT_LIST_HEAD(&mux->psocks_avail);
1981 
1982     mux->knet = knet;
1983 
1984     /* Add new MUX to list */
1985     mutex_lock(&knet->mutex);
1986     list_add_rcu(&mux->kcm_mux_list, &knet->mux_list);
1987     knet->count++;
1988     mutex_unlock(&knet->mutex);
1989 
1990     skb_queue_head_init(&mux->rx_hold_queue);
1991 
1992     /* Init KCM socket */
1993     sock_init_data(sock, sk);
1994     init_kcm_sock(kcm_sk(sk), mux);
1995 
1996     return 0;
1997 }
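/* Hedged userspace view of the checks above (illustrative, not part of
 * this file). Only SOCK_DGRAM and SOCK_SEQPACKET are accepted, and the
 * protocol must be KCMPROTO_CONNECTED:
 *
 *	socket(AF_KCM, SOCK_DGRAM, KCMPROTO_CONNECTED);     // ok, dgram ops
 *	socket(AF_KCM, SOCK_SEQPACKET, KCMPROTO_CONNECTED); // ok, seqpacket ops
 *	socket(AF_KCM, SOCK_STREAM, KCMPROTO_CONNECTED);    // fails, ESOCKTNOSUPPORT
 *	socket(AF_KCM, SOCK_DGRAM, 1);                      // fails, EPROTONOSUPPORT
 */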
1998 
1999 static const struct net_proto_family kcm_family_ops = {
2000     .family = PF_KCM,
2001     .create = kcm_create,
2002     .owner  = THIS_MODULE,
2003 };
2004 
2005 static __net_init int kcm_init_net(struct net *net)
2006 {
2007     struct kcm_net *knet = net_generic(net, kcm_net_id);
2008 
2009     INIT_LIST_HEAD_RCU(&knet->mux_list);
2010     mutex_init(&knet->mutex);
2011 
2012     return 0;
2013 }
2014 
2015 static __net_exit void kcm_exit_net(struct net *net)
2016 {
2017     struct kcm_net *knet = net_generic(net, kcm_net_id);
2018 
2019     /* All KCM sockets should be closed at this point, which should mean
2020      * that all multiplexors and psocks have been destroyed.
2021      */
2022     WARN_ON(!list_empty(&knet->mux_list));
2023 }
2024 
2025 static struct pernet_operations kcm_net_ops = {
2026     .init = kcm_init_net,
2027     .exit = kcm_exit_net,
2028     .id   = &kcm_net_id,
2029     .size = sizeof(struct kcm_net),
2030 };
2031 
2032 static int __init kcm_init(void)
2033 {
2034     int err = -ENOMEM;
2035 
2036     kcm_muxp = kmem_cache_create("kcm_mux_cache",
2037                      sizeof(struct kcm_mux), 0,
2038                      SLAB_HWCACHE_ALIGN, NULL);
2039     if (!kcm_muxp)
2040         goto fail;
2041 
2042     kcm_psockp = kmem_cache_create("kcm_psock_cache",
2043                        sizeof(struct kcm_psock), 0,
2044                     SLAB_HWCACHE_ALIGN, NULL);
2045     if (!kcm_psockp)
2046         goto fail;
2047 
2048     kcm_wq = create_singlethread_workqueue("kkcmd");
2049     if (!kcm_wq)
2050         goto fail;
2051 
2052     err = proto_register(&kcm_proto, 1);
2053     if (err)
2054         goto fail;
2055 
2056     err = register_pernet_device(&kcm_net_ops);
2057     if (err)
2058         goto net_ops_fail;
2059 
2060     err = sock_register(&kcm_family_ops);
2061     if (err)
2062         goto sock_register_fail;
2063 
2064     err = kcm_proc_init();
2065     if (err)
2066         goto proc_init_fail;
2067 
2068     return 0;
2069 
2070 proc_init_fail:
2071     sock_unregister(PF_KCM);
2072 
2073 sock_register_fail:
2074     unregister_pernet_device(&kcm_net_ops);
2075 
2076 net_ops_fail:
2077     proto_unregister(&kcm_proto);
2078 
2079 fail:
2080     kmem_cache_destroy(kcm_muxp);
2081     kmem_cache_destroy(kcm_psockp);
2082 
2083     if (kcm_wq)
2084         destroy_workqueue(kcm_wq);
2085 
2086     return err;
2087 }
2088 
2089 static void __exit kcm_exit(void)
2090 {
2091     kcm_proc_exit();
2092     sock_unregister(PF_KCM);
2093     unregister_pernet_device(&kcm_net_ops);
2094     proto_unregister(&kcm_proto);
2095     destroy_workqueue(kcm_wq);
2096 
2097     kmem_cache_destroy(kcm_muxp);
2098     kmem_cache_destroy(kcm_psockp);
2099 }
2100 
2101 module_init(kcm_init);
2102 module_exit(kcm_exit);
2103 
2104 MODULE_LICENSE("GPL");
2105 MODULE_ALIAS_NETPROTO(PF_KCM);