// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * NET4:    Implementation of BSD Unix domain sockets.
 *
 * Authors: Alan Cox, <alan@lxorguk.ukuu.org.uk>
 *
 * Fixes:
 *      Linus Torvalds  :   Assorted bug cures.
 *      Niibe Yutaka    :   async I/O support.
 *      Carsten Paeth   :   PF_UNIX check, address fixes.
 *      Alan Cox    :   Limit size of allocated blocks.
 *      Alan Cox    :   Fixed the stupid socketpair bug.
 *      Alan Cox    :   BSD compatibility fine tuning.
 *      Alan Cox    :   Fixed a bug in connect when interrupted.
 *      Alan Cox    :   Sorted out a proper draft version of
 *                  file descriptor passing hacked up from
 *                  Mike Shaver's work.
 *      Marty Leisner   :   Fixes to fd passing
 *      Nick Nevin  :   recvmsg bugfix.
 *      Alan Cox    :   Started proper garbage collector
 *      Heiko Eißfeldt  :   Missing verify_area check
 *      Alan Cox    :   Started POSIXisms
 *      Andreas Schwab  :   Replace inode by dentry for proper
 *                  reference counting
 *      Kirk Petersen   :   Made this a module
 *      Christoph Rohland   :   Elegant non-blocking accept/connect algorithm.
 *                  Lots of bug fixes.
 *       Alexey Kuznetsov   :   Repaired (I hope) bugs introduced
 *                  by the above two patches.
 *       Andrea Arcangeli   :   If possible we block in connect(2)
 *                  if the max backlog of the listen socket
 *                  has been reached. This won't break
 *                  old apps and it avoids hashing a huge
 *                  number of socks (for unix_gc()
 *                  performance reasons).
 *                  Security fix that limits the max
 *                  number of socks to 2*max_files and
 *                  the number of skbs queueable in the
 *                  dgram receiver.
 *      Artur Skawina   :   Hash function optimizations
 *       Alexey Kuznetsov   :   Full scale SMP. Lots of bugs are introduced 8)
 *        Malcolm Beattie   :   Set peercred for socketpair
 *       Michal Ostrowski   :       Module initialization cleanup.
 *       Arnaldo C. Melo    :   Remove MOD_{INC,DEC}_USE_COUNT,
 *                      the core infrastructure is doing that
 *                      for all net proto families now (2.5.69+)
 *
 * Known differences from the reference BSD that was tested:
 *
 *  [TO FIX]
 *  ECONNREFUSED is not returned from one end of a connected() socket to the
 *      other the moment one end closes.
 *  fstat() doesn't return st_dev=0, and gives the blksize as high water mark
 *      and a fake inode identifier (nor the BSD first socket fstat twice bug).
 *  [NOT TO FIX]
 *  accept() returns a path name even if the connecting socket has closed
 *      in the meantime (BSD loses the path and gives up).
 *  accept() returns 0 length path for an unbound connector. BSD returns 16
 *      and a null first byte in the path (but not for gethost/peername - BSD bug ??)
 *  socketpair(...SOCK_RAW..) doesn't panic the kernel.
 *  BSD af_unix apparently has connect forgetting to block properly.
 *      (need to check this with the POSIX spec in detail)
 *
 * Differences from 2.0.0-11-... (ANK)
 *  Bug fixes and improvements.
 *      - client shutdown killed server socket.
 *      - removed all useless cli/sti pairs.
 *
 *  Semantic changes/extensions.
 *      - generic control message passing.
 *      - SCM_CREDENTIALS control message.
 *      - "Abstract" (not FS based) socket bindings.
 *        Abstract names are sequences of bytes (not zero terminated)
 *        starting with 0, so that this name space does not intersect
 *        with BSD names.
 */
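
/* Illustrative userspace sketch (not part of this file): binding an
 * abstract address as described above.  The name starts with a NUL byte
 * and is *not* NUL-terminated, so addrlen must count exactly the bytes
 * used.  Error handling is elided for brevity.
 *
 *  struct sockaddr_un sun = { .sun_family = AF_UNIX };
 *  int fd = socket(AF_UNIX, SOCK_STREAM, 0);
 *
 *  sun.sun_path[0] = '\0';                // abstract namespace marker
 *  memcpy(sun.sun_path + 1, "demo", 4);   // name is "\0demo", no trailing NUL
 *  bind(fd, (struct sockaddr *)&sun,
 *       offsetof(struct sockaddr_un, sun_path) + 1 + 4);
 */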

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/signal.h>
#include <linux/sched/signal.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/stat.h>
#include <linux/dcache.h>
#include <linux/namei.h>
#include <linux/socket.h>
#include <linux/un.h>
#include <linux/fcntl.h>
#include <linux/filter.h>
#include <linux/termios.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/in.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/tcp_states.h>
#include <net/af_unix.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <net/scm.h>
#include <linux/init.h>
#include <linux/poll.h>
#include <linux/rtnetlink.h>
#include <linux/mount.h>
#include <net/checksum.h>
#include <linux/security.h>
#include <linux/freezer.h>
#include <linux/file.h>
#include <linux/btf_ids.h>

#include "scm.h"

static atomic_long_t unix_nr_socks;
static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];

/* SMP locking strategy:
 *    the hash table is protected with a spinlock.
 *    each socket state is protected by a separate spinlock.
 */

static unsigned int unix_unbound_hash(struct sock *sk)
{
    unsigned long hash = (unsigned long)sk;

    hash ^= hash >> 16;
    hash ^= hash >> 8;
    hash ^= sk->sk_type;

    return hash & UNIX_HASH_MOD;
}

static unsigned int unix_bsd_hash(struct inode *i)
{
    return i->i_ino & UNIX_HASH_MOD;
}

static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
                       int addr_len, int type)
{
    __wsum csum = csum_partial(sunaddr, addr_len, 0);
    unsigned int hash;

    hash = (__force unsigned int)csum_fold(csum);
    hash ^= hash >> 8;
    hash ^= type;

    return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
}
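
/* How the three hash functions above partition the hash space (derived
 * from their return values; the concrete constants live in
 * <net/af_unix.h>):
 *
 *   unbound sockets:   [0, UNIX_HASH_MOD]               keyed on sk + type
 *   BSD (FS) sockets:  [0, UNIX_HASH_MOD]               keyed on inode number
 *   abstract sockets:  [UNIX_HASH_MOD + 1, 2 * UNIX_HASH_MOD + 1]
 *                                                       keyed on name csum + type
 *
 * BSD sockets are additionally chained into bsd_socket_buckets so that
 * unix_find_socket_byinode() can look them up by inode alone.
 */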

static void unix_table_double_lock(struct net *net,
                   unsigned int hash1, unsigned int hash2)
{
    if (hash1 == hash2) {
        spin_lock(&net->unx.table.locks[hash1]);
        return;
    }

    if (hash1 > hash2)
        swap(hash1, hash2);

    spin_lock(&net->unx.table.locks[hash1]);
    spin_lock_nested(&net->unx.table.locks[hash2], SINGLE_DEPTH_NESTING);
}

static void unix_table_double_unlock(struct net *net,
                     unsigned int hash1, unsigned int hash2)
{
    if (hash1 == hash2) {
        spin_unlock(&net->unx.table.locks[hash1]);
        return;
    }

    spin_unlock(&net->unx.table.locks[hash1]);
    spin_unlock(&net->unx.table.locks[hash2]);
}

#ifdef CONFIG_SECURITY_NETWORK
static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
    UNIXCB(skb).secid = scm->secid;
}

static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
    scm->secid = UNIXCB(skb).secid;
}

static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
{
    return (scm->secid == UNIXCB(skb).secid);
}
#else
static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{ }

static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{ }

static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
{
    return true;
}
#endif /* CONFIG_SECURITY_NETWORK */

#define unix_peer(sk) (unix_sk(sk)->peer)

static inline int unix_our_peer(struct sock *sk, struct sock *osk)
{
    return unix_peer(osk) == sk;
}

static inline int unix_may_send(struct sock *sk, struct sock *osk)
{
    return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
}

static inline int unix_recvq_full(const struct sock *sk)
{
    return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
}

static inline int unix_recvq_full_lockless(const struct sock *sk)
{
    return skb_queue_len_lockless(&sk->sk_receive_queue) >
        READ_ONCE(sk->sk_max_ack_backlog);
}

struct sock *unix_peer_get(struct sock *s)
{
    struct sock *peer;

    unix_state_lock(s);
    peer = unix_peer(s);
    if (peer)
        sock_hold(peer);
    unix_state_unlock(s);
    return peer;
}
EXPORT_SYMBOL_GPL(unix_peer_get);

static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
                         int addr_len)
{
    struct unix_address *addr;

    addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
    if (!addr)
        return NULL;

    refcount_set(&addr->refcnt, 1);
    addr->len = addr_len;
    memcpy(addr->name, sunaddr, addr_len);

    return addr;
}

static inline void unix_release_addr(struct unix_address *addr)
{
    if (refcount_dec_and_test(&addr->refcnt))
        kfree(addr);
}

/*
 *  Check unix socket name:
 *      - it should not be zero length.
 *      - if it does not start with a zero byte, it should be
 *        NUL-terminated (FS object)
 *      - if it starts with a zero byte, it is an abstract name.
 */

static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
{
    if (addr_len <= offsetof(struct sockaddr_un, sun_path) ||
        addr_len > sizeof(*sunaddr))
        return -EINVAL;

    if (sunaddr->sun_family != AF_UNIX)
        return -EINVAL;

    return 0;
}
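
/* Worked example for unix_validate_addr(): offsetof(struct sockaddr_un,
 * sun_path) is 2 (just the sa_family_t) and sizeof(struct sockaddr_un)
 * is 110, so addr_len must satisfy 2 < addr_len <= 110.  An addr_len of
 * exactly 2 never reaches this check on bind(): unix_bind() below treats
 * it as a request to autobind instead.
 */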

static void unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
{
    /* This may look like an off-by-one error but it is a bit more
     * subtle.  108 is the longest valid AF_UNIX path for a binding.
     * sun_path[108] doesn't as such exist.  However in kernel space
     * we are guaranteed that it is a valid memory location in our
     * kernel address buffer because syscall functions always pass
     * a pointer to a struct sockaddr_storage, which has a bigger
     * buffer than 108.
     */
    ((char *)sunaddr)[addr_len] = 0;
}

static void __unix_remove_socket(struct sock *sk)
{
    sk_del_node_init(sk);
}

static void __unix_insert_socket(struct net *net, struct sock *sk)
{
    DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
    sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
}

static void __unix_set_addr_hash(struct net *net, struct sock *sk,
                 struct unix_address *addr, unsigned int hash)
{
    __unix_remove_socket(sk);
    smp_store_release(&unix_sk(sk)->addr, addr);

    sk->sk_hash = hash;
    __unix_insert_socket(net, sk);
}

static void unix_remove_socket(struct net *net, struct sock *sk)
{
    spin_lock(&net->unx.table.locks[sk->sk_hash]);
    __unix_remove_socket(sk);
    spin_unlock(&net->unx.table.locks[sk->sk_hash]);
}

static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
{
    spin_lock(&net->unx.table.locks[sk->sk_hash]);
    __unix_insert_socket(net, sk);
    spin_unlock(&net->unx.table.locks[sk->sk_hash]);
}

static void unix_insert_bsd_socket(struct sock *sk)
{
    spin_lock(&bsd_socket_locks[sk->sk_hash]);
    sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
    spin_unlock(&bsd_socket_locks[sk->sk_hash]);
}

static void unix_remove_bsd_socket(struct sock *sk)
{
    if (!hlist_unhashed(&sk->sk_bind_node)) {
        spin_lock(&bsd_socket_locks[sk->sk_hash]);
        __sk_del_bind_node(sk);
        spin_unlock(&bsd_socket_locks[sk->sk_hash]);

        sk_node_init(&sk->sk_bind_node);
    }
}

static struct sock *__unix_find_socket_byname(struct net *net,
                          struct sockaddr_un *sunname,
                          int len, unsigned int hash)
{
    struct sock *s;

    sk_for_each(s, &net->unx.table.buckets[hash]) {
        struct unix_sock *u = unix_sk(s);

        if (u->addr->len == len &&
            !memcmp(u->addr->name, sunname, len))
            return s;
    }
    return NULL;
}

static inline struct sock *unix_find_socket_byname(struct net *net,
                           struct sockaddr_un *sunname,
                           int len, unsigned int hash)
{
    struct sock *s;

    spin_lock(&net->unx.table.locks[hash]);
    s = __unix_find_socket_byname(net, sunname, len, hash);
    if (s)
        sock_hold(s);
    spin_unlock(&net->unx.table.locks[hash]);
    return s;
}

static struct sock *unix_find_socket_byinode(struct inode *i)
{
    unsigned int hash = unix_bsd_hash(i);
    struct sock *s;

    spin_lock(&bsd_socket_locks[hash]);
    sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
        struct dentry *dentry = unix_sk(s)->path.dentry;

        if (dentry && d_backing_inode(dentry) == i) {
            sock_hold(s);
            spin_unlock(&bsd_socket_locks[hash]);
            return s;
        }
    }
    spin_unlock(&bsd_socket_locks[hash]);
    return NULL;
}

/* Support code for asymmetrically connected dgram sockets
 *
 * If a datagram socket is connected to a socket not itself connected
 * to the first socket (e.g., /dev/log), clients may only enqueue more
 * messages if the present receive queue of the server socket is not
 * "too large". This means there's a second writeability condition
 * poll and sendmsg need to test. The dgram recv code will do a wake
 * up on the peer_wait wait queue of a socket upon reception of a
 * datagram which needs to be propagated to sleeping would-be writers
 * since these might not have sent anything so far. This can't be
 * accomplished via poll_wait because the lifetime of the server
 * socket might be less than that of its clients if these break their
 * association with it or if the server socket is closed while clients
 * are still connected to it and there's no way to inform "a polling
 * implementation" that it should let go of a certain wait queue.
 *
 * In order to propagate a wake up, a wait_queue_entry_t of the client
 * socket is enqueued on the peer_wait queue of the server socket
 * whose wake function does a wake_up on the ordinary client socket
 * wait queue. This connection is established whenever a write (or
 * poll for write) hits the flow control condition and is broken when
 * the association to the server socket is dissolved or after a wake
 * up was relayed.
 */

static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
                      void *key)
{
    struct unix_sock *u;
    wait_queue_head_t *u_sleep;

    u = container_of(q, struct unix_sock, peer_wake);

    __remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
                q);
    u->peer_wake.private = NULL;

    /* relaying can only happen while the wq still exists */
    u_sleep = sk_sleep(&u->sk);
    if (u_sleep)
        wake_up_interruptible_poll(u_sleep, key_to_poll(key));

    return 0;
}

static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
{
    struct unix_sock *u, *u_other;
    int rc;

    u = unix_sk(sk);
    u_other = unix_sk(other);
    rc = 0;
    spin_lock(&u_other->peer_wait.lock);

    if (!u->peer_wake.private) {
        u->peer_wake.private = other;
        __add_wait_queue(&u_other->peer_wait, &u->peer_wake);

        rc = 1;
    }

    spin_unlock(&u_other->peer_wait.lock);
    return rc;
}

static void unix_dgram_peer_wake_disconnect(struct sock *sk,
                        struct sock *other)
{
    struct unix_sock *u, *u_other;

    u = unix_sk(sk);
    u_other = unix_sk(other);
    spin_lock(&u_other->peer_wait.lock);

    if (u->peer_wake.private == other) {
        __remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
        u->peer_wake.private = NULL;
    }

    spin_unlock(&u_other->peer_wait.lock);
}

static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
                           struct sock *other)
{
    unix_dgram_peer_wake_disconnect(sk, other);
    wake_up_interruptible_poll(sk_sleep(sk),
                   EPOLLOUT |
                   EPOLLWRNORM |
                   EPOLLWRBAND);
}

/* preconditions:
 *  - unix_peer(sk) == other
 *  - association is stable
 */
static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
{
    int connected;

    connected = unix_dgram_peer_wake_connect(sk, other);

    /* If other is SOCK_DEAD, we want to make sure we signal
     * POLLOUT, such that a subsequent write() can get a
     * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
     * to other and its queue is full, we will hang waiting for
     * POLLOUT.
     */
    if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
        return 1;

    if (connected)
        unix_dgram_peer_wake_disconnect(sk, other);

    return 0;
}

static int unix_writable(const struct sock *sk)
{
    return sk->sk_state != TCP_LISTEN &&
           (refcount_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
}
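
/* Worked example for unix_writable(): the left shift by 2 means the
 * socket counts as writable while wmem_alloc <= sk_sndbuf / 4.  E.g.,
 * with sk_sndbuf at 64 KiB, the socket stays writable until 16 KiB of
 * send buffer is in flight.
 */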

static void unix_write_space(struct sock *sk)
{
    struct socket_wq *wq;

    rcu_read_lock();
    if (unix_writable(sk)) {
        wq = rcu_dereference(sk->sk_wq);
        if (skwq_has_sleeper(wq))
            wake_up_interruptible_sync_poll(&wq->wait,
                EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
        sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
    }
    rcu_read_unlock();
}

/* When a dgram socket disconnects (or changes its peer), we clear its
 * receive queue of packets that arrived from the previous peer. First,
 * this allows flow control based only on wmem_alloc; second, an sk
 * connected to a peer may receive messages only from that peer. */
static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
{
    if (!skb_queue_empty(&sk->sk_receive_queue)) {
        skb_queue_purge(&sk->sk_receive_queue);
        wake_up_interruptible_all(&unix_sk(sk)->peer_wait);

        /* If one link of a bidirectional dgram pipe is disconnected,
         * we signal the error. Messages are lost. Do not do this
         * when the peer was not connected to us.
         */
        if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
            other->sk_err = ECONNRESET;
            sk_error_report(other);
        }
    }
    other->sk_state = TCP_CLOSE;
}

static void unix_sock_destructor(struct sock *sk)
{
    struct unix_sock *u = unix_sk(sk);

    skb_queue_purge(&sk->sk_receive_queue);

#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
    if (u->oob_skb) {
        kfree_skb(u->oob_skb);
        u->oob_skb = NULL;
    }
#endif
    DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
    DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
    DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
    if (!sock_flag(sk, SOCK_DEAD)) {
        pr_info("Attempt to release alive unix socket: %p\n", sk);
        return;
    }

    if (u->addr)
        unix_release_addr(u->addr);

    atomic_long_dec(&unix_nr_socks);
    sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
#ifdef UNIX_REFCNT_DEBUG
    pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
        atomic_long_read(&unix_nr_socks));
#endif
}

static void unix_release_sock(struct sock *sk, int embrion)
{
    struct unix_sock *u = unix_sk(sk);
    struct sock *skpair;
    struct sk_buff *skb;
    struct path path;
    int state;

    unix_remove_socket(sock_net(sk), sk);
    unix_remove_bsd_socket(sk);

    /* Clear state */
    unix_state_lock(sk);
    sock_orphan(sk);
    sk->sk_shutdown = SHUTDOWN_MASK;
    path         = u->path;
    u->path.dentry = NULL;
    u->path.mnt = NULL;
    state = sk->sk_state;
    sk->sk_state = TCP_CLOSE;

    skpair = unix_peer(sk);
    unix_peer(sk) = NULL;

    unix_state_unlock(sk);

    wake_up_interruptible_all(&u->peer_wait);

    if (skpair != NULL) {
        if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
            unix_state_lock(skpair);
            /* No more writes */
            skpair->sk_shutdown = SHUTDOWN_MASK;
            if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
                skpair->sk_err = ECONNRESET;
            unix_state_unlock(skpair);
            skpair->sk_state_change(skpair);
            sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
        }

        unix_dgram_peer_wake_disconnect(sk, skpair);
        sock_put(skpair); /* It may now die */
    }

    /* Try to flush out this socket. Throw out buffers at least */

    while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
        if (state == TCP_LISTEN)
            unix_release_sock(skb->sk, 1);
        /* passed fds are erased in the kfree_skb hook        */
        UNIXCB(skb).consumed = skb->len;
        kfree_skb(skb);
    }

    if (path.dentry)
        path_put(&path);

    sock_put(sk);

    /* ---- Socket is dead now and most probably destroyed ---- */

    /*
     * Fixme: BSD difference: In BSD all sockets connected to us get
     *    ECONNRESET and we die on the spot. In Linux we behave
     *    like files and pipes do and wait for the last
     *    dereference.
     *
     * Can't we simply set sock->err?
     *
     *    What does the above comment talk about? --ANK(980817)
     */

    if (unix_tot_inflight)
        unix_gc();      /* Garbage collect fds */
}

static void init_peercred(struct sock *sk)
{
    const struct cred *old_cred;
    struct pid *old_pid;

    spin_lock(&sk->sk_peer_lock);
    old_pid = sk->sk_peer_pid;
    old_cred = sk->sk_peer_cred;
    sk->sk_peer_pid  = get_pid(task_tgid(current));
    sk->sk_peer_cred = get_current_cred();
    spin_unlock(&sk->sk_peer_lock);

    put_pid(old_pid);
    put_cred(old_cred);
}

static void copy_peercred(struct sock *sk, struct sock *peersk)
{
    const struct cred *old_cred;
    struct pid *old_pid;

    if (sk < peersk) {
        spin_lock(&sk->sk_peer_lock);
        spin_lock_nested(&peersk->sk_peer_lock, SINGLE_DEPTH_NESTING);
    } else {
        spin_lock(&peersk->sk_peer_lock);
        spin_lock_nested(&sk->sk_peer_lock, SINGLE_DEPTH_NESTING);
    }
    old_pid = sk->sk_peer_pid;
    old_cred = sk->sk_peer_cred;
    sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
    sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);

    spin_unlock(&sk->sk_peer_lock);
    spin_unlock(&peersk->sk_peer_lock);

    put_pid(old_pid);
    put_cred(old_cred);
}

static int unix_listen(struct socket *sock, int backlog)
{
    int err;
    struct sock *sk = sock->sk;
    struct unix_sock *u = unix_sk(sk);

    err = -EOPNOTSUPP;
    if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
        goto out;   /* Only stream/seqpacket sockets accept */
    err = -EINVAL;
    if (!u->addr)
        goto out;   /* No listens on an unbound socket */
    unix_state_lock(sk);
    if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
        goto out_unlock;
    if (backlog > sk->sk_max_ack_backlog)
        wake_up_interruptible_all(&u->peer_wait);
    sk->sk_max_ack_backlog  = backlog;
    sk->sk_state        = TCP_LISTEN;
    /* set credentials so connect can copy them */
    init_peercred(sk);
    err = 0;

out_unlock:
    unix_state_unlock(sk);
out:
    return err;
}
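
/* Minimal userspace sketch of the listen path handled above (assumes
 * "/tmp/demo.sock" does not already exist; error checks elided):
 *
 *  struct sockaddr_un sun = { .sun_family = AF_UNIX };
 *  int fd = socket(AF_UNIX, SOCK_STREAM, 0);
 *
 *  strcpy(sun.sun_path, "/tmp/demo.sock");
 *  bind(fd, (struct sockaddr *)&sun, sizeof(sun));
 *  listen(fd, 16);                  // sets sk_max_ack_backlog to 16
 *  int cfd = accept(fd, NULL, NULL);
 *
 * Calling listen() on an unbound socket fails with EINVAL, and on a
 * SOCK_DGRAM socket with EOPNOTSUPP, matching the checks above.
 */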

static int unix_release(struct socket *);
static int unix_bind(struct socket *, struct sockaddr *, int);
static int unix_stream_connect(struct socket *, struct sockaddr *,
                   int addr_len, int flags);
static int unix_socketpair(struct socket *, struct socket *);
static int unix_accept(struct socket *, struct socket *, int, bool);
static int unix_getname(struct socket *, struct sockaddr *, int);
static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
static __poll_t unix_dgram_poll(struct file *, struct socket *,
                    poll_table *);
static int unix_ioctl(struct socket *, unsigned int, unsigned long);
#ifdef CONFIG_COMPAT
static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
#endif
static int unix_shutdown(struct socket *, int);
static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
static ssize_t unix_stream_sendpage(struct socket *, struct page *, int offset,
                    size_t size, int flags);
static ssize_t unix_stream_splice_read(struct socket *, loff_t *ppos,
                       struct pipe_inode_info *, size_t size,
                       unsigned int flags);
static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
static int unix_dgram_connect(struct socket *, struct sockaddr *,
                  int, int);
static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
                  int);

static int unix_set_peek_off(struct sock *sk, int val)
{
    struct unix_sock *u = unix_sk(sk);

    if (mutex_lock_interruptible(&u->iolock))
        return -EINTR;

    sk->sk_peek_off = val;
    mutex_unlock(&u->iolock);

    return 0;
}

#ifdef CONFIG_PROC_FS
static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
{
    struct sock *sk = sock->sk;
    struct unix_sock *u;

    if (sk) {
        u = unix_sk(sock->sk);
        seq_printf(m, "scm_fds: %u\n",
               atomic_read(&u->scm_stat.nr_fds));
    }
}
#else
#define unix_show_fdinfo NULL
#endif

static const struct proto_ops unix_stream_ops = {
    .family =   PF_UNIX,
    .owner =    THIS_MODULE,
    .release =  unix_release,
    .bind =     unix_bind,
    .connect =  unix_stream_connect,
    .socketpair =   unix_socketpair,
    .accept =   unix_accept,
    .getname =  unix_getname,
    .poll =     unix_poll,
    .ioctl =    unix_ioctl,
#ifdef CONFIG_COMPAT
    .compat_ioctl = unix_compat_ioctl,
#endif
    .listen =   unix_listen,
    .shutdown = unix_shutdown,
    .sendmsg =  unix_stream_sendmsg,
    .recvmsg =  unix_stream_recvmsg,
    .read_skb = unix_stream_read_skb,
    .mmap =     sock_no_mmap,
    .sendpage = unix_stream_sendpage,
    .splice_read =  unix_stream_splice_read,
    .set_peek_off = unix_set_peek_off,
    .show_fdinfo =  unix_show_fdinfo,
};

static const struct proto_ops unix_dgram_ops = {
    .family =   PF_UNIX,
    .owner =    THIS_MODULE,
    .release =  unix_release,
    .bind =     unix_bind,
    .connect =  unix_dgram_connect,
    .socketpair =   unix_socketpair,
    .accept =   sock_no_accept,
    .getname =  unix_getname,
    .poll =     unix_dgram_poll,
    .ioctl =    unix_ioctl,
#ifdef CONFIG_COMPAT
    .compat_ioctl = unix_compat_ioctl,
#endif
    .listen =   sock_no_listen,
    .shutdown = unix_shutdown,
    .sendmsg =  unix_dgram_sendmsg,
    .read_skb = unix_read_skb,
    .recvmsg =  unix_dgram_recvmsg,
    .mmap =     sock_no_mmap,
    .sendpage = sock_no_sendpage,
    .set_peek_off = unix_set_peek_off,
    .show_fdinfo =  unix_show_fdinfo,
};

static const struct proto_ops unix_seqpacket_ops = {
    .family =   PF_UNIX,
    .owner =    THIS_MODULE,
    .release =  unix_release,
    .bind =     unix_bind,
    .connect =  unix_stream_connect,
    .socketpair =   unix_socketpair,
    .accept =   unix_accept,
    .getname =  unix_getname,
    .poll =     unix_dgram_poll,
    .ioctl =    unix_ioctl,
#ifdef CONFIG_COMPAT
    .compat_ioctl = unix_compat_ioctl,
#endif
    .listen =   unix_listen,
    .shutdown = unix_shutdown,
    .sendmsg =  unix_seqpacket_sendmsg,
    .recvmsg =  unix_seqpacket_recvmsg,
    .mmap =     sock_no_mmap,
    .sendpage = sock_no_sendpage,
    .set_peek_off = unix_set_peek_off,
    .show_fdinfo =  unix_show_fdinfo,
};

static void unix_close(struct sock *sk, long timeout)
{
    /* Nothing to do here, unix socket does not need a ->close().
     * This is merely for sockmap.
     */
}

static void unix_unhash(struct sock *sk)
{
    /* Nothing to do here, unix socket does not need a ->unhash().
     * This is merely for sockmap.
     */
}

struct proto unix_dgram_proto = {
    .name           = "UNIX",
    .owner          = THIS_MODULE,
    .obj_size       = sizeof(struct unix_sock),
    .close          = unix_close,
#ifdef CONFIG_BPF_SYSCALL
    .psock_update_sk_prot   = unix_dgram_bpf_update_proto,
#endif
};

struct proto unix_stream_proto = {
    .name           = "UNIX-STREAM",
    .owner          = THIS_MODULE,
    .obj_size       = sizeof(struct unix_sock),
    .close          = unix_close,
    .unhash         = unix_unhash,
#ifdef CONFIG_BPF_SYSCALL
    .psock_update_sk_prot   = unix_stream_bpf_update_proto,
#endif
};

static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
{
    struct unix_sock *u;
    struct sock *sk;
    int err;

    atomic_long_inc(&unix_nr_socks);
    if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
        err = -ENFILE;
        goto err;
    }

    if (type == SOCK_STREAM)
        sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
    else /* dgram and seqpacket */
        sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);

    if (!sk) {
        err = -ENOMEM;
        goto err;
    }

    sock_init_data(sock, sk);

    sk->sk_hash     = unix_unbound_hash(sk);
    sk->sk_allocation   = GFP_KERNEL_ACCOUNT;
    sk->sk_write_space  = unix_write_space;
    sk->sk_max_ack_backlog  = net->unx.sysctl_max_dgram_qlen;
    sk->sk_destruct     = unix_sock_destructor;
    u     = unix_sk(sk);
    u->path.dentry = NULL;
    u->path.mnt = NULL;
    spin_lock_init(&u->lock);
    atomic_long_set(&u->inflight, 0);
    INIT_LIST_HEAD(&u->link);
    mutex_init(&u->iolock); /* single task reading lock */
    mutex_init(&u->bindlock); /* single task binding lock */
    init_waitqueue_head(&u->peer_wait);
    init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
    memset(&u->scm_stat, 0, sizeof(struct scm_stat));
    unix_insert_unbound_socket(net, sk);

    sock_prot_inuse_add(net, sk->sk_prot, 1);

    return sk;

err:
    atomic_long_dec(&unix_nr_socks);
    return ERR_PTR(err);
}

static int unix_create(struct net *net, struct socket *sock, int protocol,
               int kern)
{
    struct sock *sk;

    if (protocol && protocol != PF_UNIX)
        return -EPROTONOSUPPORT;

    sock->state = SS_UNCONNECTED;

    switch (sock->type) {
    case SOCK_STREAM:
        sock->ops = &unix_stream_ops;
        break;
        /*
         *  Believe it or not BSD has AF_UNIX, SOCK_RAW though
         *  nothing uses it.
         */
    case SOCK_RAW:
        sock->type = SOCK_DGRAM;
        fallthrough;
    case SOCK_DGRAM:
        sock->ops = &unix_dgram_ops;
        break;
    case SOCK_SEQPACKET:
        sock->ops = &unix_seqpacket_ops;
        break;
    default:
        return -ESOCKTNOSUPPORT;
    }

    sk = unix_create1(net, sock, kern, sock->type);
    if (IS_ERR(sk))
        return PTR_ERR(sk);

    return 0;
}
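
/* Type/protocol handling in unix_create(), illustrated from userspace:
 *
 *  socket(AF_UNIX, SOCK_STREAM, 0)            -> unix_stream_ops
 *  socket(AF_UNIX, SOCK_RAW, 0)               -> silently becomes SOCK_DGRAM
 *  socket(AF_UNIX, SOCK_SEQPACKET, 0)         -> unix_seqpacket_ops
 *  socket(AF_UNIX, SOCK_STREAM, IPPROTO_TCP)  -> -EPROTONOSUPPORT
 *                                                (only 0 or PF_UNIX allowed)
 *  socket(AF_UNIX, SOCK_RDM, 0)               -> -ESOCKTNOSUPPORT
 */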

static int unix_release(struct socket *sock)
{
    struct sock *sk = sock->sk;

    if (!sk)
        return 0;

    sk->sk_prot->close(sk, 0);
    unix_release_sock(sk, 0);
    sock->sk = NULL;

    return 0;
}

static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
                  int type)
{
    struct inode *inode;
    struct path path;
    struct sock *sk;
    int err;

    unix_mkname_bsd(sunaddr, addr_len);
    err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
    if (err)
        goto fail;

    err = path_permission(&path, MAY_WRITE);
    if (err)
        goto path_put;

    err = -ECONNREFUSED;
    inode = d_backing_inode(path.dentry);
    if (!S_ISSOCK(inode->i_mode))
        goto path_put;

    sk = unix_find_socket_byinode(inode);
    if (!sk)
        goto path_put;

    err = -EPROTOTYPE;
    if (sk->sk_type == type)
        touch_atime(&path);
    else
        goto sock_put;

    path_put(&path);

    return sk;

sock_put:
    sock_put(sk);
path_put:
    path_put(&path);
fail:
    return ERR_PTR(err);
}

static struct sock *unix_find_abstract(struct net *net,
                       struct sockaddr_un *sunaddr,
                       int addr_len, int type)
{
    unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type);
    struct dentry *dentry;
    struct sock *sk;

    sk = unix_find_socket_byname(net, sunaddr, addr_len, hash);
    if (!sk)
        return ERR_PTR(-ECONNREFUSED);

    dentry = unix_sk(sk)->path.dentry;
    if (dentry)
        touch_atime(&unix_sk(sk)->path);

    return sk;
}

static struct sock *unix_find_other(struct net *net,
                    struct sockaddr_un *sunaddr,
                    int addr_len, int type)
{
    struct sock *sk;

    if (sunaddr->sun_path[0])
        sk = unix_find_bsd(sunaddr, addr_len, type);
    else
        sk = unix_find_abstract(net, sunaddr, addr_len, type);

    return sk;
}

static int unix_autobind(struct sock *sk)
{
    unsigned int new_hash, old_hash = sk->sk_hash;
    struct unix_sock *u = unix_sk(sk);
    struct net *net = sock_net(sk);
    struct unix_address *addr;
    u32 lastnum, ordernum;
    int err;

    err = mutex_lock_interruptible(&u->bindlock);
    if (err)
        return err;

    if (u->addr)
        goto out;

    err = -ENOMEM;
    addr = kzalloc(sizeof(*addr) +
               offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
    if (!addr)
        goto out;

    addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
    addr->name->sun_family = AF_UNIX;
    refcount_set(&addr->refcnt, 1);

    ordernum = prandom_u32();
    lastnum = ordernum & 0xFFFFF;
retry:
    ordernum = (ordernum + 1) & 0xFFFFF;
    sprintf(addr->name->sun_path + 1, "%05x", ordernum);

    new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
    unix_table_double_lock(net, old_hash, new_hash);

    if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) {
        unix_table_double_unlock(net, old_hash, new_hash);

        /* __unix_find_socket_byname() may take a long time if many
         * names are already in use.
         */
        cond_resched();

        if (ordernum == lastnum) {
            /* Give up if all names seem to be in use. */
            err = -ENOSPC;
            unix_release_addr(addr);
            goto out;
        }

        goto retry;
    }

    __unix_set_addr_hash(net, sk, addr, new_hash);
    unix_table_double_unlock(net, old_hash, new_hash);
    err = 0;

out:    mutex_unlock(&u->bindlock);
    return err;
}
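
/* Autobind sketch: binding with only the family triggers autobind, as
 * does connecting or sending on an unbound socket with SOCK_PASSCRED
 * set (see unix_dgram_connect() below).  The assigned name is abstract,
 * "\0" followed by five lowercase hex digits (addr->len ends up as
 * offsetof(struct sockaddr_un, sun_path) + 6).  From userspace:
 *
 *  struct sockaddr_un sun = { .sun_family = AF_UNIX };
 *  int fd = socket(AF_UNIX, SOCK_DGRAM, 0);
 *
 *  bind(fd, (struct sockaddr *)&sun, sizeof(sa_family_t));  // autobind
 */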

static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
             int addr_len)
{
    umode_t mode = S_IFSOCK |
           (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
    unsigned int new_hash, old_hash = sk->sk_hash;
    struct unix_sock *u = unix_sk(sk);
    struct net *net = sock_net(sk);
    struct user_namespace *ns; // barf...
    struct unix_address *addr;
    struct dentry *dentry;
    struct path parent;
    int err;

    unix_mkname_bsd(sunaddr, addr_len);
    addr_len = strlen(sunaddr->sun_path) +
        offsetof(struct sockaddr_un, sun_path) + 1;

    addr = unix_create_addr(sunaddr, addr_len);
    if (!addr)
        return -ENOMEM;

    /*
     * Get the parent directory, calculate the hash for the last
     * component.
     */
    dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
    if (IS_ERR(dentry)) {
        err = PTR_ERR(dentry);
        goto out;
    }

    /*
     * All right, let's create it.
     */
    ns = mnt_user_ns(parent.mnt);
    err = security_path_mknod(&parent, dentry, mode, 0);
    if (!err)
        err = vfs_mknod(ns, d_inode(parent.dentry), dentry, mode, 0);
    if (err)
        goto out_path;
    err = mutex_lock_interruptible(&u->bindlock);
    if (err)
        goto out_unlink;
    if (u->addr)
        goto out_unlock;

    new_hash = unix_bsd_hash(d_backing_inode(dentry));
    unix_table_double_lock(net, old_hash, new_hash);
    u->path.mnt = mntget(parent.mnt);
    u->path.dentry = dget(dentry);
    __unix_set_addr_hash(net, sk, addr, new_hash);
    unix_table_double_unlock(net, old_hash, new_hash);
    unix_insert_bsd_socket(sk);
    mutex_unlock(&u->bindlock);
    done_path_create(&parent, dentry);
    return 0;

out_unlock:
    mutex_unlock(&u->bindlock);
    err = -EINVAL;
out_unlink:
    /* failed after successful mknod?  unlink what we'd created... */
    vfs_unlink(ns, d_inode(parent.dentry), dentry, NULL);
out_path:
    done_path_create(&parent, dentry);
out:
    unix_release_addr(addr);
    return err == -EEXIST ? -EADDRINUSE : err;
}
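
/* A filesystem bind creates the socket inode with mknod semantics, so
 * the name persists after close().  Rebinding the same path therefore
 * fails:
 *
 *  bind(fd, ...);      // creates /tmp/demo.sock
 *  close(fd);
 *  bind(fd2, ...);     // -EADDRINUSE (mknod sees EEXIST, remapped above)
 *
 * Userspace must unlink() the stale path first; the kernel never
 * removes it.
 */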

static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
                  int addr_len)
{
    unsigned int new_hash, old_hash = sk->sk_hash;
    struct unix_sock *u = unix_sk(sk);
    struct net *net = sock_net(sk);
    struct unix_address *addr;
    int err;

    addr = unix_create_addr(sunaddr, addr_len);
    if (!addr)
        return -ENOMEM;

    err = mutex_lock_interruptible(&u->bindlock);
    if (err)
        goto out;

    if (u->addr) {
        err = -EINVAL;
        goto out_mutex;
    }

    new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
    unix_table_double_lock(net, old_hash, new_hash);

    if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
        goto out_spin;

    __unix_set_addr_hash(net, sk, addr, new_hash);
    unix_table_double_unlock(net, old_hash, new_hash);
    mutex_unlock(&u->bindlock);
    return 0;

out_spin:
    unix_table_double_unlock(net, old_hash, new_hash);
    err = -EADDRINUSE;
out_mutex:
    mutex_unlock(&u->bindlock);
out:
    unix_release_addr(addr);
    return err;
}

static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
    struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
    struct sock *sk = sock->sk;
    int err;

    if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
        sunaddr->sun_family == AF_UNIX)
        return unix_autobind(sk);

    err = unix_validate_addr(sunaddr, addr_len);
    if (err)
        return err;

    if (sunaddr->sun_path[0])
        err = unix_bind_bsd(sk, sunaddr, addr_len);
    else
        err = unix_bind_abstract(sk, sunaddr, addr_len);

    return err;
}

static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
{
    if (unlikely(sk1 == sk2) || !sk2) {
        unix_state_lock(sk1);
        return;
    }
    if (sk1 < sk2) {
        unix_state_lock(sk1);
        unix_state_lock_nested(sk2);
    } else {
        unix_state_lock(sk2);
        unix_state_lock_nested(sk1);
    }
}

static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
{
    if (unlikely(sk1 == sk2) || !sk2) {
        unix_state_unlock(sk1);
        return;
    }
    unix_state_unlock(sk1);
    unix_state_unlock(sk2);
}

static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
                  int alen, int flags)
{
    struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
    struct sock *sk = sock->sk;
    struct sock *other;
    int err;

    err = -EINVAL;
    if (alen < offsetofend(struct sockaddr, sa_family))
        goto out;

    if (addr->sa_family != AF_UNSPEC) {
        err = unix_validate_addr(sunaddr, alen);
        if (err)
            goto out;

        if (test_bit(SOCK_PASSCRED, &sock->flags) &&
            !unix_sk(sk)->addr) {
            err = unix_autobind(sk);
            if (err)
                goto out;
        }

restart:
        other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type);
        if (IS_ERR(other)) {
            err = PTR_ERR(other);
            goto out;
        }

        unix_state_double_lock(sk, other);

        /* Apparently VFS overslept socket death. Retry. */
        if (sock_flag(other, SOCK_DEAD)) {
            unix_state_double_unlock(sk, other);
            sock_put(other);
            goto restart;
        }

        err = -EPERM;
        if (!unix_may_send(sk, other))
            goto out_unlock;

        err = security_unix_may_send(sk->sk_socket, other->sk_socket);
        if (err)
            goto out_unlock;

        sk->sk_state = other->sk_state = TCP_ESTABLISHED;
    } else {
        /*
         *  1003.1g breaking connected state with AF_UNSPEC
         */
        other = NULL;
        unix_state_double_lock(sk, other);
    }

    /*
     * If it was connected, reconnect.
     */
    if (unix_peer(sk)) {
        struct sock *old_peer = unix_peer(sk);

        unix_peer(sk) = other;
        if (!other)
            sk->sk_state = TCP_CLOSE;
        unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);

        unix_state_double_unlock(sk, other);

        if (other != old_peer)
            unix_dgram_disconnected(sk, old_peer);
        sock_put(old_peer);
    } else {
        unix_peer(sk) = other;
        unix_state_double_unlock(sk, other);
    }

    return 0;

out_unlock:
    unix_state_double_unlock(sk, other);
    sock_put(other);
out:
    return err;
}
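
/* Datagram connect/disconnect sketch.  Connecting filters the receive
 * queue to one peer; connecting with sa_family AF_UNSPEC (the 1003.1g
 * case handled above) dissolves the association:
 *
 *  struct sockaddr_un srv = { .sun_family = AF_UNIX };
 *  struct sockaddr    any = { .sa_family  = AF_UNSPEC };
 *  int fd = socket(AF_UNIX, SOCK_DGRAM, 0);
 *
 *  strcpy(srv.sun_path, "/run/demo.sock");             // hypothetical path
 *  connect(fd, (struct sockaddr *)&srv, sizeof(srv));  // peer set
 *  connect(fd, &any, sizeof(any));                     // peer cleared
 */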

static long unix_wait_for_peer(struct sock *other, long timeo)
    __releases(&unix_sk(other)->lock)
{
    struct unix_sock *u = unix_sk(other);
    int sched;
    DEFINE_WAIT(wait);

    prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);

    sched = !sock_flag(other, SOCK_DEAD) &&
        !(other->sk_shutdown & RCV_SHUTDOWN) &&
        unix_recvq_full(other);

    unix_state_unlock(other);

    if (sched)
        timeo = schedule_timeout(timeo);

    finish_wait(&u->peer_wait, &wait);
    return timeo;
}

static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
                   int addr_len, int flags)
{
    struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
    struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
    struct unix_sock *u = unix_sk(sk), *newu, *otheru;
    struct net *net = sock_net(sk);
    struct sk_buff *skb = NULL;
    long timeo;
    int err;
    int st;

    err = unix_validate_addr(sunaddr, addr_len);
    if (err)
        goto out;

    if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr) {
        err = unix_autobind(sk);
        if (err)
            goto out;
    }

    timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);

    /* First of all allocate resources.
       If we did it after the state was locked,
       we would have to recheck everything again in any case.
     */

    /* create new sock for complete connection */
    newsk = unix_create1(net, NULL, 0, sock->type);
    if (IS_ERR(newsk)) {
        err = PTR_ERR(newsk);
        newsk = NULL;
        goto out;
    }

    err = -ENOMEM;

    /* Allocate skb for sending to listening sock */
    skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
    if (skb == NULL)
        goto out;

restart:
    /*  Find listening sock. */
    other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
    if (IS_ERR(other)) {
        err = PTR_ERR(other);
        other = NULL;
        goto out;
    }

    /* Latch state of peer */
    unix_state_lock(other);

    /* Apparently VFS overslept socket death. Retry. */
    if (sock_flag(other, SOCK_DEAD)) {
        unix_state_unlock(other);
        sock_put(other);
        goto restart;
    }

    err = -ECONNREFUSED;
    if (other->sk_state != TCP_LISTEN)
        goto out_unlock;
    if (other->sk_shutdown & RCV_SHUTDOWN)
        goto out_unlock;

    if (unix_recvq_full(other)) {
        err = -EAGAIN;
        if (!timeo)
            goto out_unlock;

        timeo = unix_wait_for_peer(other, timeo);

        err = sock_intr_errno(timeo);
        if (signal_pending(current))
            goto out;
        sock_put(other);
        goto restart;
    }

    /* Latch our state.

       This is a tricky place. We need to grab our state lock but
       cannot drop the lock on the peer, which is dangerous because
       deadlock is possible. The connect-to-self case and simultaneous
       connect attempts are eliminated by checking socket
       state: other is TCP_LISTEN, and if sk were TCP_LISTEN we
       would have caught that before attempting to grab the lock.

       And we have to recheck the state after the socket is locked.
     */
    st = sk->sk_state;

    switch (st) {
    case TCP_CLOSE:
        /* This is ok... continue with connect */
        break;
    case TCP_ESTABLISHED:
        /* Socket is already connected */
        err = -EISCONN;
        goto out_unlock;
    default:
        err = -EINVAL;
        goto out_unlock;
    }

    unix_state_lock_nested(sk);

    if (sk->sk_state != st) {
        unix_state_unlock(sk);
        unix_state_unlock(other);
        sock_put(other);
        goto restart;
    }

    err = security_unix_stream_connect(sk, other, newsk);
    if (err) {
        unix_state_unlock(sk);
        goto out_unlock;
    }

    /* The way is open! Quickly set all the necessary fields... */

    sock_hold(sk);
    unix_peer(newsk)    = sk;
    newsk->sk_state     = TCP_ESTABLISHED;
    newsk->sk_type      = sk->sk_type;
    init_peercred(newsk);
    newu = unix_sk(newsk);
    RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
    otheru = unix_sk(other);

    /* copy address information from listening to new sock
     *
     * The contents of *(otheru->addr) and otheru->path
     * are seen fully set up here, since we have found
     * otheru in hash under its lock.  Insertion into the
     * hash chain we'd found it in had been done in an
     * earlier critical area protected by the chain's lock,
     * the same one where we'd set *(otheru->addr) contents,
     * as well as otheru->path and otheru->addr itself.
     *
     * Using smp_store_release() here to set newu->addr
     * is enough to make those stores, as well as stores
     * to newu->path visible to anyone who gets newu->addr
     * by smp_load_acquire().  IOW, the same guarantees
     * as for unix_sock instances bound in unix_bind() or
     * in unix_autobind().
     */
    if (otheru->path.dentry) {
        path_get(&otheru->path);
        newu->path = otheru->path;
    }
    refcount_inc(&otheru->addr->refcnt);
    smp_store_release(&newu->addr, otheru->addr);

    /* Set credentials */
    copy_peercred(sk, other);

    sock->state = SS_CONNECTED;
    sk->sk_state    = TCP_ESTABLISHED;
    sock_hold(newsk);

    smp_mb__after_atomic(); /* sock_hold() does an atomic_inc() */
    unix_peer(sk)   = newsk;

    unix_state_unlock(sk);

    /* take ten and send info to listening sock */
    spin_lock(&other->sk_receive_queue.lock);
    __skb_queue_tail(&other->sk_receive_queue, skb);
    spin_unlock(&other->sk_receive_queue.lock);
    unix_state_unlock(other);
    other->sk_data_ready(other);
    sock_put(other);
    return 0;

out_unlock:
    if (other)
        unix_state_unlock(other);

out:
    kfree_skb(skb);
    if (newsk)
        unix_release_sock(newsk, 0);
    if (other)
        sock_put(other);
    return err;
}

static int unix_socketpair(struct socket *socka, struct socket *sockb)
{
    struct sock *ska = socka->sk, *skb = sockb->sk;

    /* Join our sockets back to back */
    sock_hold(ska);
    sock_hold(skb);
    unix_peer(ska) = skb;
    unix_peer(skb) = ska;
    init_peercred(ska);
    init_peercred(skb);

    ska->sk_state = TCP_ESTABLISHED;
    skb->sk_state = TCP_ESTABLISHED;
    socka->state  = SS_CONNECTED;
    sockb->state  = SS_CONNECTED;
    return 0;
}
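
/* socketpair() joins two fresh sockets back to back with no name at
 * all, so neither end can be looked up by address.  Typical usage:
 *
 *  int sv[2];
 *
 *  socketpair(AF_UNIX, SOCK_SEQPACKET, 0, sv);
 *  // sv[0] and sv[1] are connected peers; SO_PEERCRED works on both
 *  // because init_peercred() ran for each side above.
 */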
1635 
1636 static void unix_sock_inherit_flags(const struct socket *old,
1637                     struct socket *new)
1638 {
1639     if (test_bit(SOCK_PASSCRED, &old->flags))
1640         set_bit(SOCK_PASSCRED, &new->flags);
1641     if (test_bit(SOCK_PASSSEC, &old->flags))
1642         set_bit(SOCK_PASSSEC, &new->flags);
1643 }
1644 
1645 static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
1646                bool kern)
1647 {
1648     struct sock *sk = sock->sk;
1649     struct sock *tsk;
1650     struct sk_buff *skb;
1651     int err;
1652 
1653     err = -EOPNOTSUPP;
1654     if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1655         goto out;
1656 
1657     err = -EINVAL;
1658     if (sk->sk_state != TCP_LISTEN)
1659         goto out;
1660 
1661     /* If socket state is TCP_LISTEN it cannot change (for now...),
1662      * so that no locks are necessary.
1663      */
1664 
1665     skb = skb_recv_datagram(sk, (flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
1666                 &err);
1667     if (!skb) {
1668         /* This means receive shutdown. */
1669         if (err == 0)
1670             err = -EINVAL;
1671         goto out;
1672     }
1673 
1674     tsk = skb->sk;
1675     skb_free_datagram(sk, skb);
1676     wake_up_interruptible(&unix_sk(sk)->peer_wait);
1677 
1678     /* attach accepted sock to socket */
1679     unix_state_lock(tsk);
1680     newsock->state = SS_CONNECTED;
1681     unix_sock_inherit_flags(sock, newsock);
1682     sock_graft(tsk, newsock);
1683     unix_state_unlock(tsk);
1684     return 0;
1685 
1686 out:
1687     return err;
1688 }
1689 
1690 
1691 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
1692 {
1693     struct sock *sk = sock->sk;
1694     struct unix_address *addr;
1695     DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1696     int err = 0;
1697 
1698     if (peer) {
1699         sk = unix_peer_get(sk);
1700 
1701         err = -ENOTCONN;
1702         if (!sk)
1703             goto out;
1704         err = 0;
1705     } else {
1706         sock_hold(sk);
1707     }
1708 
1709     addr = smp_load_acquire(&unix_sk(sk)->addr);
1710     if (!addr) {
1711         sunaddr->sun_family = AF_UNIX;
1712         sunaddr->sun_path[0] = 0;
1713         err = offsetof(struct sockaddr_un, sun_path);
1714     } else {
1715         err = addr->len;
1716         memcpy(sunaddr, addr->name, addr->len);
1717     }
1718     sock_put(sk);
1719 out:
1720     return err;
1721 }
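/*
 * Editor's note: a small userspace sketch (not part of this file) of
 * what unix_getname() reports.  For an unbound socket the returned
 * length is just offsetof(struct sockaddr_un, sun_path): the family
 * field with a zero-length path, matching the !addr branch above.
 */
#if 0 /* illustrative userspace example */
#include <sys/socket.h>
#include <sys/un.h>
#include <stddef.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	struct sockaddr_un sun;
	socklen_t len = sizeof(sun);
	int fd = socket(AF_UNIX, SOCK_DGRAM, 0);

	getsockname(fd, (struct sockaddr *)&sun, &len);
	/* Unbound socket: len == offsetof(struct sockaddr_un, sun_path). */
	printf("len=%u family=%d\n", (unsigned int)len, sun.sun_family);
	close(fd);
	return 0;
}
#endif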
1722 
1723 static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
1724 {
1725     scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1726 
1727     /*
1728      * Garbage collection of unix sockets starts by selecting a set of
1729      * candidate sockets which have reference only from being in flight
1730      * (total_refs == inflight_refs).  This condition is checked once during
1731      * the candidate collection phase, and candidates are marked as such, so
1732      * that non-candidates can later be ignored.  While inflight_refs is
1733      * protected by unix_gc_lock, total_refs (file count) is not, hence this
1734      * is an instantaneous decision.
1735      *
1736      * Once a candidate, however, the socket must not be reinstalled into a
1737      * file descriptor while the garbage collection is in progress.
1738      *
1739      * If the above conditions are met, then the directed graph of
1740      * candidates (*) does not change while unix_gc_lock is held.
1741      *
1742      * Any operation that changes the file count through file descriptors
1743      * (dup, close, sendmsg) does not change the graph since candidates are
1744      * not installed in fds.
1745      *
1746      * Dequeuing a candidate via recvmsg would install it into an fd, but
1747      * that takes unix_gc_lock to decrement the inflight count, so it's
1748      * serialized with garbage collection.
1749      *
1750      * MSG_PEEK is special in that it does not change the inflight count,
1751      * yet does install the socket into an fd.  The following lock/unlock
1752      * pair is to ensure serialization with garbage collection.  It must be
1753      * done between incrementing the file count and installing the file into
1754      * an fd.
1755      *
1756      * If garbage collection starts after the barrier provided by the
1757      * lock/unlock, then it will see the elevated refcount and not mark this
1758      * as a candidate.  If a garbage collection is already in progress
1759      * before the file count was incremented, then the lock/unlock pair will
1760      * ensure that garbage collection is finished before progressing to
1761      * installing the fd.
1762      *
1763      * (*) A -> B where B is on the queue of A or B is on the queue of C
1764      * which is on the queue of listening socket A.
1765      */
1766     spin_lock(&unix_gc_lock);
1767     spin_unlock(&unix_gc_lock);
1768 }
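/*
 * Editor's note: the MSG_PEEK case described above is reached when a
 * receiver peeks at a message carrying SCM_RIGHTS.  A hedged userspace
 * sketch (peek_with_fds is a hypothetical helper, not part of this
 * file): each peek duplicates the attached fds into the caller, and
 * the unix_peek_fds() barrier above serializes that against the GC.
 */
#if 0 /* illustrative userspace example */
#include <sys/socket.h>
#include <sys/uio.h>

static ssize_t peek_with_fds(int sock)
{
	char data[128], ctrl[CMSG_SPACE(sizeof(int) * 4)];
	struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
	struct msghdr msg = {
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = ctrl, .msg_controllen = sizeof(ctrl),
	};

	/* Message stays queued; any SCM_RIGHTS fds are still installed. */
	return recvmsg(sock, &msg, MSG_PEEK);
}
#endif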
1769 
1770 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1771 {
1772     int err = 0;
1773 
1774     UNIXCB(skb).pid  = get_pid(scm->pid);
1775     UNIXCB(skb).uid = scm->creds.uid;
1776     UNIXCB(skb).gid = scm->creds.gid;
1777     UNIXCB(skb).fp = NULL;
1778     unix_get_secdata(scm, skb);
1779     if (scm->fp && send_fds)
1780         err = unix_attach_fds(scm, skb);
1781 
1782     skb->destructor = unix_destruct_scm;
1783     return err;
1784 }
1785 
1786 static bool unix_passcred_enabled(const struct socket *sock,
1787                   const struct sock *other)
1788 {
1789     return test_bit(SOCK_PASSCRED, &sock->flags) ||
1790            !other->sk_socket ||
1791            test_bit(SOCK_PASSCRED, &other->sk_socket->flags);
1792 }
1793 
1794 /*
1795  * Some apps rely on write() giving SCM_CREDENTIALS.
1796  * We include credentials if the source or destination socket
1797  * asserted SOCK_PASSCRED.
1798  */
1799 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1800                 const struct sock *other)
1801 {
1802     if (UNIXCB(skb).pid)
1803         return;
1804     if (unix_passcred_enabled(sock, other)) {
1805         UNIXCB(skb).pid  = get_pid(task_tgid(current));
1806         current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1807     }
1808 }
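/*
 * Editor's note: maybe_add_creds() is what makes plain write()s carry
 * SCM_CREDENTIALS once either end sets SO_PASSCRED.  A minimal sketch
 * (enable_and_read_creds is a hypothetical helper, not part of this
 * file):
 */
#if 0 /* illustrative userspace example */
#define _GNU_SOURCE	/* for struct ucred */
#include <sys/socket.h>
#include <sys/uio.h>
#include <stdio.h>

static void enable_and_read_creds(int sock)
{
	int on = 1;
	char data[64], ctrl[CMSG_SPACE(sizeof(struct ucred))];
	struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
	struct msghdr msg = {
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = ctrl, .msg_controllen = sizeof(ctrl),
	};
	struct cmsghdr *cmsg;

	setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &on, sizeof(on));
	if (recvmsg(sock, &msg, 0) < 0)
		return;
	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg))
		if (cmsg->cmsg_level == SOL_SOCKET &&
		    cmsg->cmsg_type == SCM_CREDENTIALS) {
			struct ucred *uc = (struct ucred *)CMSG_DATA(cmsg);

			printf("pid=%d uid=%u gid=%u\n",
			       uc->pid, uc->uid, uc->gid);
		}
}
#endif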
1809 
1810 static int maybe_init_creds(struct scm_cookie *scm,
1811                 struct socket *socket,
1812                 const struct sock *other)
1813 {
1814     int err;
1815     struct msghdr msg = { .msg_controllen = 0 };
1816 
1817     err = scm_send(socket, &msg, scm, false);
1818     if (err)
1819         return err;
1820 
1821     if (unix_passcred_enabled(socket, other)) {
1822         scm->pid = get_pid(task_tgid(current));
1823         current_uid_gid(&scm->creds.uid, &scm->creds.gid);
1824     }
1825     return err;
1826 }
1827 
1828 static bool unix_skb_scm_eq(struct sk_buff *skb,
1829                 struct scm_cookie *scm)
1830 {
1831     return UNIXCB(skb).pid == scm->pid &&
1832            uid_eq(UNIXCB(skb).uid, scm->creds.uid) &&
1833            gid_eq(UNIXCB(skb).gid, scm->creds.gid) &&
1834            unix_secdata_eq(scm, skb);
1835 }
1836 
1837 static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
1838 {
1839     struct scm_fp_list *fp = UNIXCB(skb).fp;
1840     struct unix_sock *u = unix_sk(sk);
1841 
1842     if (unlikely(fp && fp->count))
1843         atomic_add(fp->count, &u->scm_stat.nr_fds);
1844 }
1845 
1846 static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
1847 {
1848     struct scm_fp_list *fp = UNIXCB(skb).fp;
1849     struct unix_sock *u = unix_sk(sk);
1850 
1851     if (unlikely(fp && fp->count))
1852         atomic_sub(fp->count, &u->scm_stat.nr_fds);
1853 }
1854 
1855 /*
1856  *  Send AF_UNIX data.
1857  */
1858 
1859 static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
1860                   size_t len)
1861 {
1862     DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
1863     struct sock *sk = sock->sk, *other = NULL;
1864     struct unix_sock *u = unix_sk(sk);
1865     struct scm_cookie scm;
1866     struct sk_buff *skb;
1867     int data_len = 0;
1868     int sk_locked;
1869     long timeo;
1870     int err;
1871 
1872     wait_for_unix_gc();
1873     err = scm_send(sock, msg, &scm, false);
1874     if (err < 0)
1875         return err;
1876 
1877     err = -EOPNOTSUPP;
1878     if (msg->msg_flags&MSG_OOB)
1879         goto out;
1880 
1881     if (msg->msg_namelen) {
1882         err = unix_validate_addr(sunaddr, msg->msg_namelen);
1883         if (err)
1884             goto out;
1885     } else {
1886         sunaddr = NULL;
1887         err = -ENOTCONN;
1888         other = unix_peer_get(sk);
1889         if (!other)
1890             goto out;
1891     }
1892 
1893     if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr) {
1894         err = unix_autobind(sk);
1895         if (err)
1896             goto out;
1897     }
1898 
1899     err = -EMSGSIZE;
1900     if (len > sk->sk_sndbuf - 32)
1901         goto out;
1902 
1903     if (len > SKB_MAX_ALLOC) {
1904         data_len = min_t(size_t,
1905                  len - SKB_MAX_ALLOC,
1906                  MAX_SKB_FRAGS * PAGE_SIZE);
1907         data_len = PAGE_ALIGN(data_len);
1908 
1909         BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
1910     }
1911 
1912     skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
1913                    msg->msg_flags & MSG_DONTWAIT, &err,
1914                    PAGE_ALLOC_COSTLY_ORDER);
1915     if (skb == NULL)
1916         goto out;
1917 
1918     err = unix_scm_to_skb(&scm, skb, true);
1919     if (err < 0)
1920         goto out_free;
1921 
1922     skb_put(skb, len - data_len);
1923     skb->data_len = data_len;
1924     skb->len = len;
1925     err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
1926     if (err)
1927         goto out_free;
1928 
1929     timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1930 
1931 restart:
1932     if (!other) {
1933         err = -ECONNRESET;
1934         if (sunaddr == NULL)
1935             goto out_free;
1936 
1937         other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen,
1938                     sk->sk_type);
1939         if (IS_ERR(other)) {
1940             err = PTR_ERR(other);
1941             other = NULL;
1942             goto out_free;
1943         }
1944     }
1945 
1946     if (sk_filter(other, skb) < 0) {
1947         /* Toss the packet but do not return any error to the sender */
1948         err = len;
1949         goto out_free;
1950     }
1951 
1952     sk_locked = 0;
1953     unix_state_lock(other);
1954 restart_locked:
1955     err = -EPERM;
1956     if (!unix_may_send(sk, other))
1957         goto out_unlock;
1958 
1959     if (unlikely(sock_flag(other, SOCK_DEAD))) {
1960         /*
1961          *  Check with 1003.1g - what should a
1962          *  datagram error return here?
1963          */
1964         unix_state_unlock(other);
1965         sock_put(other);
1966 
1967         if (!sk_locked)
1968             unix_state_lock(sk);
1969 
1970         err = 0;
1971         if (unix_peer(sk) == other) {
1972             unix_peer(sk) = NULL;
1973             unix_dgram_peer_wake_disconnect_wakeup(sk, other);
1974 
1975             unix_state_unlock(sk);
1976 
1977             sk->sk_state = TCP_CLOSE;
1978             unix_dgram_disconnected(sk, other);
1979             sock_put(other);
1980             err = -ECONNREFUSED;
1981         } else {
1982             unix_state_unlock(sk);
1983         }
1984 
1985         other = NULL;
1986         if (err)
1987             goto out_free;
1988         goto restart;
1989     }
1990 
1991     err = -EPIPE;
1992     if (other->sk_shutdown & RCV_SHUTDOWN)
1993         goto out_unlock;
1994 
1995     if (sk->sk_type != SOCK_SEQPACKET) {
1996         err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1997         if (err)
1998             goto out_unlock;
1999     }
2000 
2001     /* other == sk && unix_peer(other) != sk if
2002      * - unix_peer(sk) == NULL, destination address bound to sk
2003      * - unix_peer(sk) == sk by time of get but disconnected before lock
2004      */
2005     if (other != sk &&
2006         unlikely(unix_peer(other) != sk &&
2007         unix_recvq_full_lockless(other))) {
2008         if (timeo) {
2009             timeo = unix_wait_for_peer(other, timeo);
2010 
2011             err = sock_intr_errno(timeo);
2012             if (signal_pending(current))
2013                 goto out_free;
2014 
2015             goto restart;
2016         }
2017 
2018         if (!sk_locked) {
2019             unix_state_unlock(other);
2020             unix_state_double_lock(sk, other);
2021         }
2022 
2023         if (unix_peer(sk) != other ||
2024             unix_dgram_peer_wake_me(sk, other)) {
2025             err = -EAGAIN;
2026             sk_locked = 1;
2027             goto out_unlock;
2028         }
2029 
2030         if (!sk_locked) {
2031             sk_locked = 1;
2032             goto restart_locked;
2033         }
2034     }
2035 
2036     if (unlikely(sk_locked))
2037         unix_state_unlock(sk);
2038 
2039     if (sock_flag(other, SOCK_RCVTSTAMP))
2040         __net_timestamp(skb);
2041     maybe_add_creds(skb, sock, other);
2042     scm_stat_add(other, skb);
2043     skb_queue_tail(&other->sk_receive_queue, skb);
2044     unix_state_unlock(other);
2045     other->sk_data_ready(other);
2046     sock_put(other);
2047     scm_destroy(&scm);
2048     return len;
2049 
2050 out_unlock:
2051     if (sk_locked)
2052         unix_state_unlock(sk);
2053     unix_state_unlock(other);
2054 out_free:
2055     kfree_skb(skb);
2056 out:
2057     if (other)
2058         sock_put(other);
2059     scm_destroy(&scm);
2060     return err;
2061 }
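/*
 * Editor's note: a minimal sketch (dgram_send is a hypothetical
 * helper, not part of this file) of the datagram send path above.
 * Per the check above, a send larger than sk_sndbuf - 32 fails with
 * EMSGSIZE rather than being fragmented.
 */
#if 0 /* illustrative userspace example */
#include <sys/socket.h>
#include <sys/un.h>
#include <string.h>

static ssize_t dgram_send(int sock, const char *path,
			  const void *buf, size_t len)
{
	struct sockaddr_un sun = { .sun_family = AF_UNIX };

	strncpy(sun.sun_path, path, sizeof(sun.sun_path) - 1);
	return sendto(sock, buf, len, 0,
		      (struct sockaddr *)&sun, sizeof(sun));
}
#endif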
2062 
2063 /* We use paged skbs for stream sockets, and limit occupancy to 32768
2064  * bytes, with a minimum of a full page.
2065  */
2066 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
2067 
2068 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2069 static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other)
2070 {
2071     struct unix_sock *ousk = unix_sk(other);
2072     struct sk_buff *skb;
2073     int err = 0;
2074 
2075     skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err);
2076 
2077     if (!skb)
2078         return err;
2079 
2080     skb_put(skb, 1);
2081     err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1);
2082 
2083     if (err) {
2084         kfree_skb(skb);
2085         return err;
2086     }
2087 
2088     unix_state_lock(other);
2089 
2090     if (sock_flag(other, SOCK_DEAD) ||
2091         (other->sk_shutdown & RCV_SHUTDOWN)) {
2092         unix_state_unlock(other);
2093         kfree_skb(skb);
2094         return -EPIPE;
2095     }
2096 
2097     maybe_add_creds(skb, sock, other);
2098     skb_get(skb);
2099 
2100     if (ousk->oob_skb)
2101         consume_skb(ousk->oob_skb);
2102 
2103     WRITE_ONCE(ousk->oob_skb, skb);
2104 
2105     scm_stat_add(other, skb);
2106     skb_queue_tail(&other->sk_receive_queue, skb);
2107     sk_send_sigurg(other);
2108     unix_state_unlock(other);
2109     other->sk_data_ready(other);
2110 
2111     return err;
2112 }
2113 #endif
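/*
 * Editor's note: with CONFIG_AF_UNIX_OOB, the last byte of a MSG_OOB
 * send is diverted through queue_oob() above.  A hedged sketch
 * (oob_roundtrip is a hypothetical helper, not part of this file):
 */
#if 0 /* illustrative userspace example */
#include <sys/socket.h>

static void oob_roundtrip(int a, int b)
{
	char c;

	send(a, "x", 1, MSG_OOB);	/* stored in ousk->oob_skb */
	recv(b, &c, 1, MSG_OOB);	/* fetched via unix_stream_recv_urg() */
}
#endif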
2114 
2115 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
2116                    size_t len)
2117 {
2118     struct sock *sk = sock->sk;
2119     struct sock *other = NULL;
2120     int err, size;
2121     struct sk_buff *skb;
2122     int sent = 0;
2123     struct scm_cookie scm;
2124     bool fds_sent = false;
2125     int data_len;
2126 
2127     wait_for_unix_gc();
2128     err = scm_send(sock, msg, &scm, false);
2129     if (err < 0)
2130         return err;
2131 
2132     err = -EOPNOTSUPP;
2133     if (msg->msg_flags & MSG_OOB) {
2134 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2135         if (len)
2136             len--;
2137         else
2138 #endif
2139             goto out_err;
2140     }
2141 
2142     if (msg->msg_namelen) {
2143         err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
2144         goto out_err;
2145     } else {
2146         err = -ENOTCONN;
2147         other = unix_peer(sk);
2148         if (!other)
2149             goto out_err;
2150     }
2151 
2152     if (sk->sk_shutdown & SEND_SHUTDOWN)
2153         goto pipe_err;
2154 
2155     while (sent < len) {
2156         size = len - sent;
2157 
2158         /* Keep two messages in the pipe so it schedules better */
2159         size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);
2160 
2161         /* allow fallback to order-0 allocations */
2162         size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
2163 
2164         data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
2165 
2166         data_len = min_t(size_t, size, PAGE_ALIGN(data_len));
2167 
2168         skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
2169                        msg->msg_flags & MSG_DONTWAIT, &err,
2170                        get_order(UNIX_SKB_FRAGS_SZ));
2171         if (!skb)
2172             goto out_err;
2173 
2174         /* Only send the fds in the first buffer */
2175         err = unix_scm_to_skb(&scm, skb, !fds_sent);
2176         if (err < 0) {
2177             kfree_skb(skb);
2178             goto out_err;
2179         }
2180         fds_sent = true;
2181 
2182         skb_put(skb, size - data_len);
2183         skb->data_len = data_len;
2184         skb->len = size;
2185         err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
2186         if (err) {
2187             kfree_skb(skb);
2188             goto out_err;
2189         }
2190 
2191         unix_state_lock(other);
2192 
2193         if (sock_flag(other, SOCK_DEAD) ||
2194             (other->sk_shutdown & RCV_SHUTDOWN))
2195             goto pipe_err_free;
2196 
2197         maybe_add_creds(skb, sock, other);
2198         scm_stat_add(other, skb);
2199         skb_queue_tail(&other->sk_receive_queue, skb);
2200         unix_state_unlock(other);
2201         other->sk_data_ready(other);
2202         sent += size;
2203     }
2204 
2205 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2206     if (msg->msg_flags & MSG_OOB) {
2207         err = queue_oob(sock, msg, other);
2208         if (err)
2209             goto out_err;
2210         sent++;
2211     }
2212 #endif
2213 
2214     scm_destroy(&scm);
2215 
2216     return sent;
2217 
2218 pipe_err_free:
2219     unix_state_unlock(other);
2220     kfree_skb(skb);
2221 pipe_err:
2222     if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
2223         send_sig(SIGPIPE, current, 0);
2224     err = -EPIPE;
2225 out_err:
2226     scm_destroy(&scm);
2227     return sent ? : err;
2228 }
2229 
2230 static ssize_t unix_stream_sendpage(struct socket *socket, struct page *page,
2231                     int offset, size_t size, int flags)
2232 {
2233     int err;
2234     bool send_sigpipe = false;
2235     bool init_scm = true;
2236     struct scm_cookie scm;
2237     struct sock *other, *sk = socket->sk;
2238     struct sk_buff *skb, *newskb = NULL, *tail = NULL;
2239 
2240     if (flags & MSG_OOB)
2241         return -EOPNOTSUPP;
2242 
2243     other = unix_peer(sk);
2244     if (!other || sk->sk_state != TCP_ESTABLISHED)
2245         return -ENOTCONN;
2246 
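	/* Never true on entry; this block is reached only via "goto alloc_skb" below. */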
2247     if (false) {
2248 alloc_skb:
2249         unix_state_unlock(other);
2250         mutex_unlock(&unix_sk(other)->iolock);
2251         newskb = sock_alloc_send_pskb(sk, 0, 0, flags & MSG_DONTWAIT,
2252                           &err, 0);
2253         if (!newskb)
2254             goto err;
2255     }
2256 
2257     /* we must acquire iolock as we modify already present
2258      * skbs in the sk_receive_queue and mess with skb->len
2259      */
2260     err = mutex_lock_interruptible(&unix_sk(other)->iolock);
2261     if (err) {
2262         err = flags & MSG_DONTWAIT ? -EAGAIN : -ERESTARTSYS;
2263         goto err;
2264     }
2265 
2266     if (sk->sk_shutdown & SEND_SHUTDOWN) {
2267         err = -EPIPE;
2268         send_sigpipe = true;
2269         goto err_unlock;
2270     }
2271 
2272     unix_state_lock(other);
2273 
2274     if (sock_flag(other, SOCK_DEAD) ||
2275         other->sk_shutdown & RCV_SHUTDOWN) {
2276         err = -EPIPE;
2277         send_sigpipe = true;
2278         goto err_state_unlock;
2279     }
2280 
2281     if (init_scm) {
2282         err = maybe_init_creds(&scm, socket, other);
2283         if (err)
2284             goto err_state_unlock;
2285         init_scm = false;
2286     }
2287 
2288     skb = skb_peek_tail(&other->sk_receive_queue);
2289     if (tail && tail == skb) {
2290         skb = newskb;
2291     } else if (!skb || !unix_skb_scm_eq(skb, &scm)) {
2292         if (newskb) {
2293             skb = newskb;
2294         } else {
2295             tail = skb;
2296             goto alloc_skb;
2297         }
2298     } else if (newskb) {
2299         /* This is the fast path: the previously allocated newskb
2300          * is not needed, so drop it.  consume_skb() would do no
2301          * harm even if newskb were NULL.
2302          */
2303         consume_skb(newskb);
2304         newskb = NULL;
2305     }
2306 
2307     if (skb_append_pagefrags(skb, page, offset, size)) {
2308         tail = skb;
2309         goto alloc_skb;
2310     }
2311 
2312     skb->len += size;
2313     skb->data_len += size;
2314     skb->truesize += size;
2315     refcount_add(size, &sk->sk_wmem_alloc);
2316 
2317     if (newskb) {
2318         err = unix_scm_to_skb(&scm, skb, false);
2319         if (err)
2320             goto err_state_unlock;
2321         spin_lock(&other->sk_receive_queue.lock);
2322         __skb_queue_tail(&other->sk_receive_queue, newskb);
2323         spin_unlock(&other->sk_receive_queue.lock);
2324     }
2325 
2326     unix_state_unlock(other);
2327     mutex_unlock(&unix_sk(other)->iolock);
2328 
2329     other->sk_data_ready(other);
2330     scm_destroy(&scm);
2331     return size;
2332 
2333 err_state_unlock:
2334     unix_state_unlock(other);
2335 err_unlock:
2336     mutex_unlock(&unix_sk(other)->iolock);
2337 err:
2338     kfree_skb(newskb);
2339     if (send_sigpipe && !(flags & MSG_NOSIGNAL))
2340         send_sig(SIGPIPE, current, 0);
2341     if (!init_scm)
2342         scm_destroy(&scm);
2343     return err;
2344 }
2345 
2346 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
2347                   size_t len)
2348 {
2349     int err;
2350     struct sock *sk = sock->sk;
2351 
2352     err = sock_error(sk);
2353     if (err)
2354         return err;
2355 
2356     if (sk->sk_state != TCP_ESTABLISHED)
2357         return -ENOTCONN;
2358 
2359     if (msg->msg_namelen)
2360         msg->msg_namelen = 0;
2361 
2362     return unix_dgram_sendmsg(sock, msg, len);
2363 }
2364 
2365 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
2366                   size_t size, int flags)
2367 {
2368     struct sock *sk = sock->sk;
2369 
2370     if (sk->sk_state != TCP_ESTABLISHED)
2371         return -ENOTCONN;
2372 
2373     return unix_dgram_recvmsg(sock, msg, size, flags);
2374 }
2375 
2376 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
2377 {
2378     struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
2379 
2380     if (addr) {
2381         msg->msg_namelen = addr->len;
2382         memcpy(msg->msg_name, addr->name, addr->len);
2383     }
2384 }
2385 
2386 int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
2387              int flags)
2388 {
2389     struct scm_cookie scm;
2390     struct socket *sock = sk->sk_socket;
2391     struct unix_sock *u = unix_sk(sk);
2392     struct sk_buff *skb, *last;
2393     long timeo;
2394     int skip;
2395     int err;
2396 
2397     err = -EOPNOTSUPP;
2398     if (flags&MSG_OOB)
2399         goto out;
2400 
2401     timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
2402 
2403     do {
2404         mutex_lock(&u->iolock);
2405 
2406         skip = sk_peek_offset(sk, flags);
2407         skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
2408                           &skip, &err, &last);
2409         if (skb) {
2410             if (!(flags & MSG_PEEK))
2411                 scm_stat_del(sk, skb);
2412             break;
2413         }
2414 
2415         mutex_unlock(&u->iolock);
2416 
2417         if (err != -EAGAIN)
2418             break;
2419     } while (timeo &&
2420          !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
2421                           &err, &timeo, last));
2422 
2423     if (!skb) { /* implies iolock unlocked */
2424         unix_state_lock(sk);
2425         /* Signal EOF on disconnected non-blocking SEQPACKET socket. */
2426         if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
2427             (sk->sk_shutdown & RCV_SHUTDOWN))
2428             err = 0;
2429         unix_state_unlock(sk);
2430         goto out;
2431     }
2432 
2433     if (wq_has_sleeper(&u->peer_wait))
2434         wake_up_interruptible_sync_poll(&u->peer_wait,
2435                         EPOLLOUT | EPOLLWRNORM |
2436                         EPOLLWRBAND);
2437 
2438     if (msg->msg_name)
2439         unix_copy_addr(msg, skb->sk);
2440 
2441     if (size > skb->len - skip)
2442         size = skb->len - skip;
2443     else if (size < skb->len - skip)
2444         msg->msg_flags |= MSG_TRUNC;
2445 
2446     err = skb_copy_datagram_msg(skb, skip, msg, size);
2447     if (err)
2448         goto out_free;
2449 
2450     if (sock_flag(sk, SOCK_RCVTSTAMP))
2451         __sock_recv_timestamp(msg, sk, skb);
2452 
2453     memset(&scm, 0, sizeof(scm));
2454 
2455     scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2456     unix_set_secdata(&scm, skb);
2457 
2458     if (!(flags & MSG_PEEK)) {
2459         if (UNIXCB(skb).fp)
2460             unix_detach_fds(&scm, skb);
2461 
2462         sk_peek_offset_bwd(sk, skb->len);
2463     } else {
2464         /* It is questionable: on PEEK we could:
2465            - not return fds: good, but too simple 8)
2466            - return fds, and not return them on read (old strategy,
2467              apparently wrong)
2468            - clone fds (chosen for now, as the most universal
2469              solution)
2470 
2471            POSIX 1003.1g does not actually define this clearly
2472            at all.  POSIX 1003.1g doesn't define a lot of things
2473            clearly, however!
2474 
2475         */
2476 
2477         sk_peek_offset_fwd(sk, size);
2478 
2479         if (UNIXCB(skb).fp)
2480             unix_peek_fds(&scm, skb);
2481     }
2482     err = (flags & MSG_TRUNC) ? skb->len - skip : size;
2483 
2484     scm_recv(sock, msg, &scm, flags);
2485 
2486 out_free:
2487     skb_free_datagram(sk, skb);
2488     mutex_unlock(&u->iolock);
2489 out:
2490     return err;
2491 }
2492 
2493 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2494                   int flags)
2495 {
2496     struct sock *sk = sock->sk;
2497 
2498 #ifdef CONFIG_BPF_SYSCALL
2499     const struct proto *prot = READ_ONCE(sk->sk_prot);
2500 
2501     if (prot != &unix_dgram_proto)
2502         return prot->recvmsg(sk, msg, size, flags, NULL);
2503 #endif
2504     return __unix_dgram_recvmsg(sk, msg, size, flags);
2505 }
2506 
2507 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2508 {
2509     int copied = 0;
2510 
2511     while (1) {
2512         struct unix_sock *u = unix_sk(sk);
2513         struct sk_buff *skb;
2514         int used, err;
2515 
2516         mutex_lock(&u->iolock);
2517         skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2518         mutex_unlock(&u->iolock);
2519         if (!skb)
2520             return err;
2521 
2522         used = recv_actor(sk, skb);
2523         if (used <= 0) {
2524             if (!copied)
2525                 copied = used;
2526             kfree_skb(skb);
2527             break;
2528         } else if (used <= skb->len) {
2529             copied += used;
2530         }
2531 
2532         kfree_skb(skb);
2533         break;
2534     }
2535 
2536     return copied;
2537 }
2538 
2539 /*
2540  *  Sleep until more data has arrived, but check for races.
2541  */
2542 static long unix_stream_data_wait(struct sock *sk, long timeo,
2543                   struct sk_buff *last, unsigned int last_len,
2544                   bool freezable)
2545 {
2546     struct sk_buff *tail;
2547     DEFINE_WAIT(wait);
2548 
2549     unix_state_lock(sk);
2550 
2551     for (;;) {
2552         prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2553 
2554         tail = skb_peek_tail(&sk->sk_receive_queue);
2555         if (tail != last ||
2556             (tail && tail->len != last_len) ||
2557             sk->sk_err ||
2558             (sk->sk_shutdown & RCV_SHUTDOWN) ||
2559             signal_pending(current) ||
2560             !timeo)
2561             break;
2562 
2563         sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2564         unix_state_unlock(sk);
2565         if (freezable)
2566             timeo = freezable_schedule_timeout(timeo);
2567         else
2568             timeo = schedule_timeout(timeo);
2569         unix_state_lock(sk);
2570 
2571         if (sock_flag(sk, SOCK_DEAD))
2572             break;
2573 
2574         sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2575     }
2576 
2577     finish_wait(sk_sleep(sk), &wait);
2578     unix_state_unlock(sk);
2579     return timeo;
2580 }
2581 
2582 static unsigned int unix_skb_len(const struct sk_buff *skb)
2583 {
2584     return skb->len - UNIXCB(skb).consumed;
2585 }
2586 
2587 struct unix_stream_read_state {
2588     int (*recv_actor)(struct sk_buff *, int, int,
2589               struct unix_stream_read_state *);
2590     struct socket *socket;
2591     struct msghdr *msg;
2592     struct pipe_inode_info *pipe;
2593     size_t size;
2594     int flags;
2595     unsigned int splice_flags;
2596 };
2597 
2598 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2599 static int unix_stream_recv_urg(struct unix_stream_read_state *state)
2600 {
2601     struct socket *sock = state->socket;
2602     struct sock *sk = sock->sk;
2603     struct unix_sock *u = unix_sk(sk);
2604     int chunk = 1;
2605     struct sk_buff *oob_skb;
2606 
2607     mutex_lock(&u->iolock);
2608     unix_state_lock(sk);
2609 
2610     if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) {
2611         unix_state_unlock(sk);
2612         mutex_unlock(&u->iolock);
2613         return -EINVAL;
2614     }
2615 
2616     oob_skb = u->oob_skb;
2617 
2618     if (!(state->flags & MSG_PEEK))
2619         WRITE_ONCE(u->oob_skb, NULL);
2620 
2621     unix_state_unlock(sk);
2622 
2623     chunk = state->recv_actor(oob_skb, 0, chunk, state);
2624 
2625     if (!(state->flags & MSG_PEEK)) {
2626         UNIXCB(oob_skb).consumed += 1;
2627         kfree_skb(oob_skb);
2628     }
2629 
2630     mutex_unlock(&u->iolock);
2631 
2632     if (chunk < 0)
2633         return -EFAULT;
2634 
2635     state->msg->msg_flags |= MSG_OOB;
2636     return 1;
2637 }
2638 
2639 static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
2640                   int flags, int copied)
2641 {
2642     struct unix_sock *u = unix_sk(sk);
2643 
2644     if (!unix_skb_len(skb) && !(flags & MSG_PEEK)) {
2645         skb_unlink(skb, &sk->sk_receive_queue);
2646         consume_skb(skb);
2647         skb = NULL;
2648     } else {
2649         if (skb == u->oob_skb) {
2650             if (copied) {
2651                 skb = NULL;
2652             } else if (sock_flag(sk, SOCK_URGINLINE)) {
2653                 if (!(flags & MSG_PEEK)) {
2654                     WRITE_ONCE(u->oob_skb, NULL);
2655                     consume_skb(skb);
2656                 }
2657             } else if (!(flags & MSG_PEEK)) {
2658                 skb_unlink(skb, &sk->sk_receive_queue);
2659                 consume_skb(skb);
2660                 skb = skb_peek(&sk->sk_receive_queue);
2661             }
2662         }
2663     }
2664     return skb;
2665 }
2666 #endif
2667 
2668 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2669 {
2670     if (unlikely(sk->sk_state != TCP_ESTABLISHED))
2671         return -ENOTCONN;
2672 
2673     return unix_read_skb(sk, recv_actor);
2674 }
2675 
2676 static int unix_stream_read_generic(struct unix_stream_read_state *state,
2677                     bool freezable)
2678 {
2679     struct scm_cookie scm;
2680     struct socket *sock = state->socket;
2681     struct sock *sk = sock->sk;
2682     struct unix_sock *u = unix_sk(sk);
2683     int copied = 0;
2684     int flags = state->flags;
2685     int noblock = flags & MSG_DONTWAIT;
2686     bool check_creds = false;
2687     int target;
2688     int err = 0;
2689     long timeo;
2690     int skip;
2691     size_t size = state->size;
2692     unsigned int last_len;
2693 
2694     if (unlikely(sk->sk_state != TCP_ESTABLISHED)) {
2695         err = -EINVAL;
2696         goto out;
2697     }
2698 
2699     if (unlikely(flags & MSG_OOB)) {
2700         err = -EOPNOTSUPP;
2701 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2702         err = unix_stream_recv_urg(state);
2703 #endif
2704         goto out;
2705     }
2706 
2707     target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
2708     timeo = sock_rcvtimeo(sk, noblock);
2709 
2710     memset(&scm, 0, sizeof(scm));
2711 
2712     /* Lock the socket to prevent queue disordering
2713      * while we sleep in memcpy_to_msg()
2714      */
2715     mutex_lock(&u->iolock);
2716 
2717     skip = max(sk_peek_offset(sk, flags), 0);
2718 
2719     do {
2720         int chunk;
2721         bool drop_skb;
2722         struct sk_buff *skb, *last;
2723 
2724 redo:
2725         unix_state_lock(sk);
2726         if (sock_flag(sk, SOCK_DEAD)) {
2727             err = -ECONNRESET;
2728             goto unlock;
2729         }
2730         last = skb = skb_peek(&sk->sk_receive_queue);
2731         last_len = last ? last->len : 0;
2732 
2733 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2734         if (skb) {
2735             skb = manage_oob(skb, sk, flags, copied);
2736             if (!skb) {
2737                 unix_state_unlock(sk);
2738                 if (copied)
2739                     break;
2740                 goto redo;
2741             }
2742         }
2743 #endif
2744 again:
2745         if (skb == NULL) {
2746             if (copied >= target)
2747                 goto unlock;
2748 
2749             /*
2750              *  POSIX 1003.1g mandates this order.
2751              */
2752 
2753             err = sock_error(sk);
2754             if (err)
2755                 goto unlock;
2756             if (sk->sk_shutdown & RCV_SHUTDOWN)
2757                 goto unlock;
2758 
2759             unix_state_unlock(sk);
2760             if (!timeo) {
2761                 err = -EAGAIN;
2762                 break;
2763             }
2764 
2765             mutex_unlock(&u->iolock);
2766 
2767             timeo = unix_stream_data_wait(sk, timeo, last,
2768                               last_len, freezable);
2769 
2770             if (signal_pending(current)) {
2771                 err = sock_intr_errno(timeo);
2772                 scm_destroy(&scm);
2773                 goto out;
2774             }
2775 
2776             mutex_lock(&u->iolock);
2777             goto redo;
2778 unlock:
2779             unix_state_unlock(sk);
2780             break;
2781         }
2782 
2783         while (skip >= unix_skb_len(skb)) {
2784             skip -= unix_skb_len(skb);
2785             last = skb;
2786             last_len = skb->len;
2787             skb = skb_peek_next(skb, &sk->sk_receive_queue);
2788             if (!skb)
2789                 goto again;
2790         }
2791 
2792         unix_state_unlock(sk);
2793 
2794         if (check_creds) {
2795             /* Never glue messages from different writers */
2796             if (!unix_skb_scm_eq(skb, &scm))
2797                 break;
2798         } else if (test_bit(SOCK_PASSCRED, &sock->flags)) {
2799             /* Copy credentials */
2800             scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2801             unix_set_secdata(&scm, skb);
2802             check_creds = true;
2803         }
2804 
2805         /* Copy address just once */
2806         if (state->msg && state->msg->msg_name) {
2807             DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
2808                      state->msg->msg_name);
2809             unix_copy_addr(state->msg, skb->sk);
2810             sunaddr = NULL;
2811         }
2812 
2813         chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
2814         skb_get(skb);
2815         chunk = state->recv_actor(skb, skip, chunk, state);
2816         drop_skb = !unix_skb_len(skb);
2817         /* skb is only safe to use if !drop_skb */
2818         consume_skb(skb);
2819         if (chunk < 0) {
2820             if (copied == 0)
2821                 copied = -EFAULT;
2822             break;
2823         }
2824         copied += chunk;
2825         size -= chunk;
2826 
2827         if (drop_skb) {
2828             /* the skb was touched by a concurrent reader;
2829              * we should not expect anything from this skb
2830              * anymore and must assume it invalid - we can be
2831              * sure it was dropped from the socket queue
2832              *
2833              * let's report a short read
2834              */
2835             err = 0;
2836             break;
2837         }
2838 
2839         /* Mark read part of skb as used */
2840         if (!(flags & MSG_PEEK)) {
2841             UNIXCB(skb).consumed += chunk;
2842 
2843             sk_peek_offset_bwd(sk, chunk);
2844 
2845             if (UNIXCB(skb).fp) {
2846                 scm_stat_del(sk, skb);
2847                 unix_detach_fds(&scm, skb);
2848             }
2849 
2850             if (unix_skb_len(skb))
2851                 break;
2852 
2853             skb_unlink(skb, &sk->sk_receive_queue);
2854             consume_skb(skb);
2855 
2856             if (scm.fp)
2857                 break;
2858         } else {
2859             /* It is questionable, see note in unix_dgram_recvmsg.
2860              */
2861             if (UNIXCB(skb).fp)
2862                 unix_peek_fds(&scm, skb);
2863 
2864             sk_peek_offset_fwd(sk, chunk);
2865 
2866             if (UNIXCB(skb).fp)
2867                 break;
2868 
2869             skip = 0;
2870             last = skb;
2871             last_len = skb->len;
2872             unix_state_lock(sk);
2873             skb = skb_peek_next(skb, &sk->sk_receive_queue);
2874             if (skb)
2875                 goto again;
2876             unix_state_unlock(sk);
2877             break;
2878         }
2879     } while (size);
2880 
2881     mutex_unlock(&u->iolock);
2882     if (state->msg)
2883         scm_recv(sock, state->msg, &scm, flags);
2884     else
2885         scm_destroy(&scm);
2886 out:
2887     return copied ? : err;
2888 }
2889 
2890 static int unix_stream_read_actor(struct sk_buff *skb,
2891                   int skip, int chunk,
2892                   struct unix_stream_read_state *state)
2893 {
2894     int ret;
2895 
2896     ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2897                     state->msg, chunk);
2898     return ret ?: chunk;
2899 }
2900 
2901 int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg,
2902               size_t size, int flags)
2903 {
2904     struct unix_stream_read_state state = {
2905         .recv_actor = unix_stream_read_actor,
2906         .socket = sk->sk_socket,
2907         .msg = msg,
2908         .size = size,
2909         .flags = flags
2910     };
2911 
2912     return unix_stream_read_generic(&state, true);
2913 }
2914 
2915 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
2916                    size_t size, int flags)
2917 {
2918     struct unix_stream_read_state state = {
2919         .recv_actor = unix_stream_read_actor,
2920         .socket = sock,
2921         .msg = msg,
2922         .size = size,
2923         .flags = flags
2924     };
2925 
2926 #ifdef CONFIG_BPF_SYSCALL
2927     struct sock *sk = sock->sk;
2928     const struct proto *prot = READ_ONCE(sk->sk_prot);
2929 
2930     if (prot != &unix_stream_proto)
2931         return prot->recvmsg(sk, msg, size, flags, NULL);
2932 #endif
2933     return unix_stream_read_generic(&state, true);
2934 }
2935 
2936 static int unix_stream_splice_actor(struct sk_buff *skb,
2937                     int skip, int chunk,
2938                     struct unix_stream_read_state *state)
2939 {
2940     return skb_splice_bits(skb, state->socket->sk,
2941                    UNIXCB(skb).consumed + skip,
2942                    state->pipe, chunk, state->splice_flags);
2943 }
2944 
2945 static ssize_t unix_stream_splice_read(struct socket *sock,  loff_t *ppos,
2946                        struct pipe_inode_info *pipe,
2947                        size_t size, unsigned int flags)
2948 {
2949     struct unix_stream_read_state state = {
2950         .recv_actor = unix_stream_splice_actor,
2951         .socket = sock,
2952         .pipe = pipe,
2953         .size = size,
2954         .splice_flags = flags,
2955     };
2956 
2957     if (unlikely(*ppos))
2958         return -ESPIPE;
2959 
2960     if (sock->file->f_flags & O_NONBLOCK ||
2961         flags & SPLICE_F_NONBLOCK)
2962         state.flags = MSG_DONTWAIT;
2963 
2964     return unix_stream_read_generic(&state, false);
2965 }
2966 
2967 static int unix_shutdown(struct socket *sock, int mode)
2968 {
2969     struct sock *sk = sock->sk;
2970     struct sock *other;
2971 
2972     if (mode < SHUT_RD || mode > SHUT_RDWR)
2973         return -EINVAL;
2974     /* This maps:
2975      * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
2976      * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
2977      * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
2978      */
2979     ++mode;
2980 
2981     unix_state_lock(sk);
2982     sk->sk_shutdown |= mode;
2983     other = unix_peer(sk);
2984     if (other)
2985         sock_hold(other);
2986     unix_state_unlock(sk);
2987     sk->sk_state_change(sk);
2988 
2989     if (other &&
2990         (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
2991 
2992         int peer_mode = 0;
2993         const struct proto *prot = READ_ONCE(other->sk_prot);
2994 
2995         if (prot->unhash)
2996             prot->unhash(other);
2997         if (mode&RCV_SHUTDOWN)
2998             peer_mode |= SEND_SHUTDOWN;
2999         if (mode&SEND_SHUTDOWN)
3000             peer_mode |= RCV_SHUTDOWN;
3001         unix_state_lock(other);
3002         other->sk_shutdown |= peer_mode;
3003         unix_state_unlock(other);
3004         other->sk_state_change(other);
3005         if (peer_mode == SHUTDOWN_MASK)
3006             sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
3007         else if (peer_mode & RCV_SHUTDOWN)
3008             sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
3009     }
3010     if (other)
3011         sock_put(other);
3012 
3013     return 0;
3014 }
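/*
 * Editor's note: a minimal sketch (half_close is a hypothetical
 * helper, not part of this file) of the mode mapping above: a write
 * shutdown on one end is propagated as a read shutdown on the peer,
 * so the peer reads EOF once its queue drains.
 */
#if 0 /* illustrative userspace example */
#include <sys/socket.h>
#include <unistd.h>

static void half_close(int a, int b)
{
	char c;

	shutdown(a, SHUT_WR);	/* SEND_SHUTDOWN on a ... */
	read(b, &c, 1);		/* ... RCV_SHUTDOWN on b: returns 0 at EOF */
}
#endif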
3015 
3016 long unix_inq_len(struct sock *sk)
3017 {
3018     struct sk_buff *skb;
3019     long amount = 0;
3020 
3021     if (sk->sk_state == TCP_LISTEN)
3022         return -EINVAL;
3023 
3024     spin_lock(&sk->sk_receive_queue.lock);
3025     if (sk->sk_type == SOCK_STREAM ||
3026         sk->sk_type == SOCK_SEQPACKET) {
3027         skb_queue_walk(&sk->sk_receive_queue, skb)
3028             amount += unix_skb_len(skb);
3029     } else {
3030         skb = skb_peek(&sk->sk_receive_queue);
3031         if (skb)
3032             amount = skb->len;
3033     }
3034     spin_unlock(&sk->sk_receive_queue.lock);
3035 
3036     return amount;
3037 }
3038 EXPORT_SYMBOL_GPL(unix_inq_len);
3039 
3040 long unix_outq_len(struct sock *sk)
3041 {
3042     return sk_wmem_alloc_get(sk);
3043 }
3044 EXPORT_SYMBOL_GPL(unix_outq_len);
3045 
3046 static int unix_open_file(struct sock *sk)
3047 {
3048     struct path path;
3049     struct file *f;
3050     int fd;
3051 
3052     if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
3053         return -EPERM;
3054 
3055     if (!smp_load_acquire(&unix_sk(sk)->addr))
3056         return -ENOENT;
3057 
3058     path = unix_sk(sk)->path;
3059     if (!path.dentry)
3060         return -ENOENT;
3061 
3062     path_get(&path);
3063 
3064     fd = get_unused_fd_flags(O_CLOEXEC);
3065     if (fd < 0)
3066         goto out;
3067 
3068     f = dentry_open(&path, O_PATH, current_cred());
3069     if (IS_ERR(f)) {
3070         put_unused_fd(fd);
3071         fd = PTR_ERR(f);
3072         goto out;
3073     }
3074 
3075     fd_install(fd, f);
3076 out:
3077     path_put(&path);
3078 
3079     return fd;
3080 }
3081 
3082 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3083 {
3084     struct sock *sk = sock->sk;
3085     long amount = 0;
3086     int err;
3087 
3088     switch (cmd) {
3089     case SIOCOUTQ:
3090         amount = unix_outq_len(sk);
3091         err = put_user(amount, (int __user *)arg);
3092         break;
3093     case SIOCINQ:
3094         amount = unix_inq_len(sk);
3095         if (amount < 0)
3096             err = amount;
3097         else
3098             err = put_user(amount, (int __user *)arg);
3099         break;
3100     case SIOCUNIXFILE:
3101         err = unix_open_file(sk);
3102         break;
3103 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3104     case SIOCATMARK:
3105         {
3106             struct sk_buff *skb;
3107             int answ = 0;
3108 
3109             skb = skb_peek(&sk->sk_receive_queue);
3110             if (skb && skb == READ_ONCE(unix_sk(sk)->oob_skb))
3111                 answ = 1;
3112             err = put_user(answ, (int __user *)arg);
3113         }
3114         break;
3115 #endif
3116     default:
3117         err = -ENOIOCTLCMD;
3118         break;
3119     }
3120     return err;
3121 }
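/*
 * Editor's note: a hedged sketch (show_queue_sizes is a hypothetical
 * helper, not part of this file) exercising the ioctls handled above.
 * SIOCATMARK is only available with CONFIG_AF_UNIX_OOB.
 */
#if 0 /* illustrative userspace example */
#include <sys/ioctl.h>
#include <linux/sockios.h>
#include <stdio.h>

static void show_queue_sizes(int sock)
{
	int inq = 0, outq = 0, at_mark = 0;

	ioctl(sock, SIOCINQ, &inq);	/* bytes queued for reading */
	ioctl(sock, SIOCOUTQ, &outq);	/* bytes not yet read by the peer */
	ioctl(sock, SIOCATMARK, &at_mark);	/* is the next byte the OOB byte? */
	printf("inq=%d outq=%d atmark=%d\n", inq, outq, at_mark);
}
#endif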
3122 
3123 #ifdef CONFIG_COMPAT
3124 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3125 {
3126     return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg));
3127 }
3128 #endif
3129 
3130 static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
3131 {
3132     struct sock *sk = sock->sk;
3133     __poll_t mask;
3134 
3135     sock_poll_wait(file, sock, wait);
3136     mask = 0;
3137 
3138     /* exceptional events? */
3139     if (sk->sk_err)
3140         mask |= EPOLLERR;
3141     if (sk->sk_shutdown == SHUTDOWN_MASK)
3142         mask |= EPOLLHUP;
3143     if (sk->sk_shutdown & RCV_SHUTDOWN)
3144         mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3145 
3146     /* readable? */
3147     if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3148         mask |= EPOLLIN | EPOLLRDNORM;
3149     if (sk_is_readable(sk))
3150         mask |= EPOLLIN | EPOLLRDNORM;
3151 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3152     if (READ_ONCE(unix_sk(sk)->oob_skb))
3153         mask |= EPOLLPRI;
3154 #endif
3155 
3156     /* Connection-based sockets need to check for termination and startup */
3157     if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
3158         sk->sk_state == TCP_CLOSE)
3159         mask |= EPOLLHUP;
3160 
3161     /*
3162      * We set writable also when the other side has shut down the
3163      * connection.  This prevents stuck sockets.
3164      */
3165     if (unix_writable(sk))
3166         mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3167 
3168     return mask;
3169 }
3170 
3171 static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
3172                     poll_table *wait)
3173 {
3174     struct sock *sk = sock->sk, *other;
3175     unsigned int writable;
3176     __poll_t mask;
3177 
3178     sock_poll_wait(file, sock, wait);
3179     mask = 0;
3180 
3181     /* exceptional events? */
3182     if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue))
3183         mask |= EPOLLERR |
3184             (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
3185 
3186     if (sk->sk_shutdown & RCV_SHUTDOWN)
3187         mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3188     if (sk->sk_shutdown == SHUTDOWN_MASK)
3189         mask |= EPOLLHUP;
3190 
3191     /* readable? */
3192     if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3193         mask |= EPOLLIN | EPOLLRDNORM;
3194     if (sk_is_readable(sk))
3195         mask |= EPOLLIN | EPOLLRDNORM;
3196 
3197     /* Connection-based sockets need to check for termination and startup */
3198     if (sk->sk_type == SOCK_SEQPACKET) {
3199         if (sk->sk_state == TCP_CLOSE)
3200             mask |= EPOLLHUP;
3201         /* connection hasn't started yet? */
3202         if (sk->sk_state == TCP_SYN_SENT)
3203             return mask;
3204     }
3205 
3206     /* No write status requested, avoid expensive OUT tests. */
3207     if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
3208         return mask;
3209 
3210     writable = unix_writable(sk);
3211     if (writable) {
3212         unix_state_lock(sk);
3213 
3214         other = unix_peer(sk);
3215         if (other && unix_peer(other) != sk &&
3216             unix_recvq_full_lockless(other) &&
3217             unix_dgram_peer_wake_me(sk, other))
3218             writable = 0;
3219 
3220         unix_state_unlock(sk);
3221     }
3222 
3223     if (writable)
3224         mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3225     else
3226         sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
3227 
3228     return mask;
3229 }
3230 
3231 #ifdef CONFIG_PROC_FS
3232 
3233 #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)
3234 
3235 #define get_bucket(x) ((x) >> BUCKET_SPACE)
3236 #define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1))
3237 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
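/*
 * Editor's note: *pos packs (bucket, in-bucket offset) into a single
 * long: set_bucket_offset(b, o) == (b << BUCKET_SPACE) | o, and
 * get_bucket()/get_offset() recover b and o, e.g.
 * get_bucket(set_bucket_offset(3, 2)) == 3 and
 * get_offset(set_bucket_offset(3, 2)) == 2, provided o fits in
 * BUCKET_SPACE bits.
 */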
3238 
3239 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
3240 {
3241     unsigned long offset = get_offset(*pos);
3242     unsigned long bucket = get_bucket(*pos);
3243     unsigned long count = 0;
3244     struct sock *sk;
3245 
3246     for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]);
3247          sk; sk = sk_next(sk)) {
3248         if (++count == offset)
3249             break;
3250     }
3251 
3252     return sk;
3253 }
3254 
3255 static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
3256 {
3257     unsigned long bucket = get_bucket(*pos);
3258     struct net *net = seq_file_net(seq);
3259     struct sock *sk;
3260 
3261     while (bucket < UNIX_HASH_SIZE) {
3262         spin_lock(&net->unx.table.locks[bucket]);
3263 
3264         sk = unix_from_bucket(seq, pos);
3265         if (sk)
3266             return sk;
3267 
3268         spin_unlock(&net->unx.table.locks[bucket]);
3269 
3270         *pos = set_bucket_offset(++bucket, 1);
3271     }
3272 
3273     return NULL;
3274 }
3275 
3276 static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk,
3277                   loff_t *pos)
3278 {
3279     unsigned long bucket = get_bucket(*pos);
3280 
3281     sk = sk_next(sk);
3282     if (sk)
3283         return sk;
3284 
3285 
3286     spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]);
3287 
3288     *pos = set_bucket_offset(++bucket, 1);
3289 
3290     return unix_get_first(seq, pos);
3291 }
3292 
3293 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
3294 {
3295     if (!*pos)
3296         return SEQ_START_TOKEN;
3297 
3298     return unix_get_first(seq, pos);
3299 }
3300 
3301 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3302 {
3303     ++*pos;
3304 
3305     if (v == SEQ_START_TOKEN)
3306         return unix_get_first(seq, pos);
3307 
3308     return unix_get_next(seq, v, pos);
3309 }
3310 
3311 static void unix_seq_stop(struct seq_file *seq, void *v)
3312 {
3313     struct sock *sk = v;
3314 
3315     if (sk)
3316         spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]);
3317 }
3318 
3319 static int unix_seq_show(struct seq_file *seq, void *v)
3320 {
3321 
3322     if (v == SEQ_START_TOKEN)
3323         seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
3324              "Inode Path\n");
3325     else {
3326         struct sock *s = v;
3327         struct unix_sock *u = unix_sk(s);
3328         unix_state_lock(s);
3329 
3330         seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
3331             s,
3332             refcount_read(&s->sk_refcnt),
3333             0,
3334             s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
3335             s->sk_type,
3336             s->sk_socket ?
3337             (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
3338             (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
3339             sock_i_ino(s));
3340 
3341         if (u->addr) {  /* under a hash table lock here */
3342             int i, len;
3343             seq_putc(seq, ' ');
3344 
3345             i = 0;
3346             len = u->addr->len -
3347                 offsetof(struct sockaddr_un, sun_path);
3348             if (u->addr->name->sun_path[0]) {
3349                 len--;
3350             } else {
3351                 seq_putc(seq, '@');
3352                 i++;
3353             }
3354             for ( ; i < len; i++)
3355                 seq_putc(seq, u->addr->name->sun_path[i] ?:
3356                      '@');
3357         }
3358         unix_state_unlock(s);
3359         seq_putc(seq, '\n');
3360     }
3361 
3362     return 0;
3363 }
3364 
3365 static const struct seq_operations unix_seq_ops = {
3366     .start  = unix_seq_start,
3367     .next   = unix_seq_next,
3368     .stop   = unix_seq_stop,
3369     .show   = unix_seq_show,
3370 };
3371 
3372 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL)
3373 struct bpf_unix_iter_state {
3374     struct seq_net_private p;
3375     unsigned int cur_sk;
3376     unsigned int end_sk;
3377     unsigned int max_sk;
3378     struct sock **batch;
3379     bool st_bucket_done;
3380 };
3381 
3382 struct bpf_iter__unix {
3383     __bpf_md_ptr(struct bpf_iter_meta *, meta);
3384     __bpf_md_ptr(struct unix_sock *, unix_sk);
3385     uid_t uid __aligned(8);
3386 };
3387 
3388 static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3389                   struct unix_sock *unix_sk, uid_t uid)
3390 {
3391     struct bpf_iter__unix ctx;
3392 
3393     meta->seq_num--;  /* skip SEQ_START_TOKEN */
3394     ctx.meta = meta;
3395     ctx.unix_sk = unix_sk;
3396     ctx.uid = uid;
3397     return bpf_iter_run_prog(prog, &ctx);
3398 }
3399 
3400 static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
3401 {
3403     struct bpf_unix_iter_state *iter = seq->private;
3404     unsigned int expected = 1;
3405     struct sock *sk;
3406 
3407     sock_hold(start_sk);
3408     iter->batch[iter->end_sk++] = start_sk;
3409 
3410     for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) {
3411         if (iter->end_sk < iter->max_sk) {
3412             sock_hold(sk);
3413             iter->batch[iter->end_sk++] = sk;
3414         }
3415 
3416         expected++;
3417     }
3418 
3419     spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]);
3420 
3421     return expected;
3422 }
3423 
3424 static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter)
3425 {
3426     while (iter->cur_sk < iter->end_sk)
3427         sock_put(iter->batch[iter->cur_sk++]);
3428 }
3429 
3430 static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter,
3431                        unsigned int new_batch_sz)
3432 {
3433     struct sock **new_batch;
3434 
3435     new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3436                  GFP_USER | __GFP_NOWARN);
3437     if (!new_batch)
3438         return -ENOMEM;
3439 
3440     bpf_iter_unix_put_batch(iter);
3441     kvfree(iter->batch);
3442     iter->batch = new_batch;
3443     iter->max_sk = new_batch_sz;
3444 
3445     return 0;
3446 }
3447 
3448 static struct sock *bpf_iter_unix_batch(struct seq_file *seq,
3449                     loff_t *pos)
3450 {
3451     struct bpf_unix_iter_state *iter = seq->private;
3452     unsigned int expected;
3453     bool resized = false;
3454     struct sock *sk;
3455 
3456     if (iter->st_bucket_done)
3457         *pos = set_bucket_offset(get_bucket(*pos) + 1, 1);
3458 
3459 again:
3460     /* Get a new batch */
3461     iter->cur_sk = 0;
3462     iter->end_sk = 0;
3463 
3464     sk = unix_get_first(seq, pos);
3465     if (!sk)
3466         return NULL; /* Done */
3467 
3468     expected = bpf_iter_unix_hold_batch(seq, sk);
3469 
3470     if (iter->end_sk == expected) {
3471         iter->st_bucket_done = true;
3472         return sk;
3473     }
3474 
3475     if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) {
3476         resized = true;
3477         goto again;
3478     }
3479 
3480     return sk;
3481 }
3482 
3483 static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos)
3484 {
3485     if (!*pos)
3486         return SEQ_START_TOKEN;
3487 
3488     /* bpf iter does not support lseek, so it always
3489      * continues from where it was stop()-ped.
3490      */
3491     return bpf_iter_unix_batch(seq, pos);
3492 }
3493 
3494 static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3495 {
3496     struct bpf_unix_iter_state *iter = seq->private;
3497     struct sock *sk;
3498 
3499     /* Whenever seq_next() is called, the iter->cur_sk is
3500      * done with seq_show(), so advance to the next sk in
3501      * the batch.
3502      */
3503     if (iter->cur_sk < iter->end_sk)
3504         sock_put(iter->batch[iter->cur_sk++]);
3505 
3506     ++*pos;
3507 
3508     if (iter->cur_sk < iter->end_sk)
3509         sk = iter->batch[iter->cur_sk];
3510     else
3511         sk = bpf_iter_unix_batch(seq, pos);
3512 
3513     return sk;
3514 }
3515 
3516 static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
3517 {
3518     struct bpf_iter_meta meta;
3519     struct bpf_prog *prog;
3520     struct sock *sk = v;
3521     uid_t uid;
3522     bool slow;
3523     int ret;
3524 
3525     if (v == SEQ_START_TOKEN)
3526         return 0;
3527 
3528     slow = lock_sock_fast(sk);
3529 
3530     if (unlikely(sk_unhashed(sk))) {
3531         ret = SEQ_SKIP;
3532         goto unlock;
3533     }
3534 
3535     uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3536     meta.seq = seq;
3537     prog = bpf_iter_get_info(&meta, false);
3538     ret = unix_prog_seq_show(prog, &meta, v, uid);
3539 unlock:
3540     unlock_sock_fast(sk, slow);
3541     return ret;
3542 }
3543 
static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
{
    struct bpf_unix_iter_state *iter = seq->private;
    struct bpf_iter_meta meta;
    struct bpf_prog *prog;

    if (!v) {
        meta.seq = seq;
        prog = bpf_iter_get_info(&meta, true);
        if (prog)
            (void)unix_prog_seq_show(prog, &meta, v, 0);
    }

    if (iter->cur_sk < iter->end_sk)
        bpf_iter_unix_put_batch(iter);
}

static const struct seq_operations bpf_iter_unix_seq_ops = {
    .start  = bpf_iter_unix_seq_start,
    .next   = bpf_iter_unix_seq_next,
    .stop   = bpf_iter_unix_seq_stop,
    .show   = bpf_iter_unix_seq_show,
};
#endif
#endif

static const struct net_proto_family unix_family_ops = {
    .family = PF_UNIX,
    .create = unix_create,
    .owner  = THIS_MODULE,
};

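/* Per-netns setup: the max_dgram_qlen sysctl, the /proc/net/unix seq
 * file, and this namespace's hash table of buckets with their
 * per-bucket locks.
 */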
static int __net_init unix_net_init(struct net *net)
{
    int i;

    net->unx.sysctl_max_dgram_qlen = 10;
    if (unix_sysctl_register(net))
        goto out;

#ifdef CONFIG_PROC_FS
    if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
                 sizeof(struct seq_net_private)))
        goto err_sysctl;
#endif

    net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE,
                          sizeof(spinlock_t), GFP_KERNEL);
    if (!net->unx.table.locks)
        goto err_proc;

    net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE,
                        sizeof(struct hlist_head),
                        GFP_KERNEL);
    if (!net->unx.table.buckets)
        goto free_locks;

    for (i = 0; i < UNIX_HASH_SIZE; i++) {
        spin_lock_init(&net->unx.table.locks[i]);
        INIT_HLIST_HEAD(&net->unx.table.buckets[i]);
    }

    return 0;

free_locks:
    kvfree(net->unx.table.locks);
err_proc:
#ifdef CONFIG_PROC_FS
    remove_proc_entry("unix", net->proc_net);
err_sysctl:
#endif
    unix_sysctl_unregister(net);
out:
    return -ENOMEM;
}

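/* Tear down everything unix_net_init() set up for this namespace. */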
static void __net_exit unix_net_exit(struct net *net)
{
    kvfree(net->unx.table.buckets);
    kvfree(net->unx.table.locks);
    unix_sysctl_unregister(net);
    remove_proc_entry("unix", net->proc_net);
}

static struct pernet_operations unix_net_ops = {
    .init = unix_net_init,
    .exit = unix_net_exit,
};

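/* The "unix" BPF iterator is only available when AF_UNIX is built in
 * (with BPF and procfs enabled); registering the iterator target from
 * a module is not supported.
 */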
#if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta,
             struct unix_sock *unix_sk, uid_t uid)

#define INIT_BATCH_SZ 16

static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux)
{
    struct bpf_unix_iter_state *iter = priv_data;
    int err;

    err = bpf_iter_init_seq_net(priv_data, aux);
    if (err)
        return err;

    err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ);
    if (err) {
        bpf_iter_fini_seq_net(priv_data);
        return err;
    }

    return 0;
}

static void bpf_iter_fini_unix(void *priv_data)
{
    struct bpf_unix_iter_state *iter = priv_data;

    bpf_iter_fini_seq_net(priv_data);
    kvfree(iter->batch);
}

static const struct bpf_iter_seq_info unix_seq_info = {
    .seq_ops        = &bpf_iter_unix_seq_ops,
    .init_seq_private   = bpf_iter_init_unix,
    .fini_seq_private   = bpf_iter_fini_unix,
    .seq_priv_size      = sizeof(struct bpf_unix_iter_state),
};

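/* Allow bpf_setsockopt() and bpf_getsockopt() from a program attached
 * to this iterator, e.g. to tweak each socket as it is visited.
 */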
static const struct bpf_func_proto *
bpf_iter_unix_get_func_proto(enum bpf_func_id func_id,
                 const struct bpf_prog *prog)
{
    switch (func_id) {
    case BPF_FUNC_setsockopt:
        return &bpf_sk_setsockopt_proto;
    case BPF_FUNC_getsockopt:
        return &bpf_sk_getsockopt_proto;
    default:
        return NULL;
    }
}

static struct bpf_iter_reg unix_reg_info = {
    .target         = "unix",
    .ctx_arg_info_size  = 1,
    .ctx_arg_info       = {
        { offsetof(struct bpf_iter__unix, unix_sk),
          PTR_TO_BTF_ID_OR_NULL },
    },
    .get_func_proto     = bpf_iter_unix_get_func_proto,
    .seq_info       = &unix_seq_info,
};

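/* Register the "unix" iterator target with the BPF iterator
 * infrastructure.  A rough sketch of a consumer, not part of this
 * file ("dump_unix" and "prog.o" are placeholder names, and the
 * usual vmlinux.h + libbpf setup from the kernel selftests is
 * assumed); note that ctx->unix_sk is NULL on the final call made
 * from seq_stop():
 *
 *    SEC("iter/unix")
 *    int dump_unix(struct bpf_iter__unix *ctx)
 *    {
 *        struct unix_sock *unix_sk = ctx->unix_sk;
 *
 *        if (!unix_sk)
 *            return 0;
 *        BPF_SEQ_PRINTF(ctx->meta->seq, "state %u\n",
 *                   unix_sk->sk.sk_state);
 *        return 0;
 *    }
 *
 * Once loaded, the program can be pinned with
 * "bpftool iter pin prog.o /sys/fs/bpf/unix" and read with a plain cat.
 */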
static void __init bpf_iter_register(void)
{
    unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX];
    if (bpf_iter_reg_target(&unix_reg_info))
        pr_warn("Warning: could not register bpf iterator unix\n");
}
#endif

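/* Module init: check at build time that unix_skb_parms still fits in
 * skb->cb, initialise the pathname-socket hash, register the dgram
 * and stream proto slabs, the socket family, the pernet ops and the
 * BPF iterator.
 */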
static int __init af_unix_init(void)
{
    int i, rc = -1;

    BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));

    for (i = 0; i < UNIX_HASH_SIZE / 2; i++) {
        spin_lock_init(&bsd_socket_locks[i]);
        INIT_HLIST_HEAD(&bsd_socket_buckets[i]);
    }

    rc = proto_register(&unix_dgram_proto, 1);
    if (rc != 0) {
        pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
        goto out;
    }

    rc = proto_register(&unix_stream_proto, 1);
    if (rc != 0) {
        pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
        proto_unregister(&unix_dgram_proto);
        goto out;
    }

    sock_register(&unix_family_ops);
    register_pernet_subsys(&unix_net_ops);
    unix_bpf_build_proto();

#if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
    bpf_iter_register();
#endif

out:
    return rc;
}

static void __exit af_unix_exit(void)
{
    sock_unregister(PF_UNIX);
    proto_unregister(&unix_dgram_proto);
    proto_unregister(&unix_stream_proto);
    unregister_pernet_subsys(&unix_net_ops);
}

/* Earlier than device_initcall() so that other drivers invoking
 * request_module() don't end up in a loop when modprobe tries
 * to use a UNIX socket. But later than subsys_initcall() because
 * we depend on stuff initialised there.
 */
fs_initcall(af_unix_init);
module_exit(af_unix_exit);

MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_UNIX);