// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET     An implementation of the TCP/IP protocol suite for the LINUX
 *      operating system.  INET is implemented using the BSD Socket
 *      interface as the means of communication with the user level.
 *
 *      Generic socket support routines. Memory allocators, socket lock/release
 *      handler for protocols to use and generic option handler.
 *
 * Authors: Ross Biro
 *      Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *      Florian La Roche, <flla@stud.uni-sb.de>
 *      Alan Cox, <A.Cox@swansea.ac.uk>
 *
 * Fixes:
 *      Alan Cox    :   Numerous verify_area() problems
 *      Alan Cox    :   Connecting on a connecting socket
 *                  now returns an error for tcp.
 *      Alan Cox    :   sock->protocol is set correctly.
 *                  and is not sometimes left as 0.
 *      Alan Cox    :   connect handles icmp errors on a
 *                  connect properly. Unfortunately there
 *                  is a restart syscall nasty there. I
 *                  can't match BSD without hacking the C
 *                  library. Ideas urgently sought!
 *      Alan Cox    :   Disallow bind() to addresses that are
 *                  not ours - especially broadcast ones!!
 *      Alan Cox    :   Socket 1024 _IS_ ok for users. (fencepost)
 *      Alan Cox    :   sock_wfree/sock_rfree don't destroy sockets,
 *                  instead they leave that for the DESTROY timer.
 *      Alan Cox    :   Clean up error flag in accept
 *      Alan Cox    :   TCP ack handling is buggy, the DESTROY timer
 *                  was buggy. Put a remove_sock() in the handler
 *                  for memory when we hit 0. Also altered the timer
 *                  code. The ACK stuff can wait and needs major
 *                  TCP layer surgery.
 *      Alan Cox    :   Fixed TCP ack bug, removed remove sock
 *                  and fixed timer/inet_bh race.
 *      Alan Cox    :   Added zapped flag for TCP
 *      Alan Cox    :   Move kfree_skb into skbuff.c and tidied up surplus code
 *      Alan Cox    :   for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
 *      Alan Cox    :   kfree_s calls now are kfree_skbmem so we can track skb resources
 *      Alan Cox    :   Supports socket option broadcast now as does udp. Packet and raw need fixing.
 *      Alan Cox    :   Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
 *      Rick Sladkey    :   Relaxed UDP rules for matching packets.
 *      C.E.Hawkins :   IFF_PROMISC/SIOCGHWADDR support
 *  Pauline Middelink   :   identd support
 *      Alan Cox    :   Fixed connect() taking signals I think.
 *      Alan Cox    :   SO_LINGER supported
 *      Alan Cox    :   Error reporting fixes
 *      Anonymous   :   inet_create tidied up (sk->reuse setting)
 *      Alan Cox    :   inet sockets don't set sk->type!
 *      Alan Cox    :   Split socket option code
 *      Alan Cox    :   Callbacks
 *      Alan Cox    :   Nagle flag for Charles & Johannes stuff
 *      Alex        :   Removed restriction on inet fioctl
 *      Alan Cox    :   Splitting INET from NET core
 *      Alan Cox    :   Fixed bogus SO_TYPE handling in getsockopt()
 *      Adam Caldwell   :   Missing return in SO_DONTROUTE/SO_DEBUG code
 *      Alan Cox    :   Split IP from generic code
 *      Alan Cox    :   New kfree_skbmem()
 *      Alan Cox    :   Make SO_DEBUG superuser only.
 *      Alan Cox    :   Allow anyone to clear SO_DEBUG
 *                  (compatibility fix)
 *      Alan Cox    :   Added optimistic memory grabbing for AF_UNIX throughput.
 *      Alan Cox    :   Allocator for a socket is settable.
 *      Alan Cox    :   SO_ERROR includes soft errors.
 *      Alan Cox    :   Allow NULL arguments on some SO_ opts
 *      Alan Cox    :   Generic socket allocation to make hooks
 *                  easier (suggested by Craig Metz).
 *      Michael Pall    :   SO_ERROR returns positive errno again
 *              Steve Whitehouse:       Added default destructor to free
 *                                      protocol private data.
 *              Steve Whitehouse:       Added various other default routines
 *                                      common to several socket families.
 *              Chris Evans     :       Call suser() check last on F_SETOWN
 *      Jay Schulist    :   Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
 *      Andi Kleen  :   Add sock_kmalloc()/sock_kfree_s()
 *      Andi Kleen  :   Fix write_space callback
 *      Chris Evans :   Security fixes - signedness again
 *      Arnaldo C. Melo :       cleanups, use skb_queue_purge
 *
 * To Fix:
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <asm/unaligned.h>
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/errqueue.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/poll.h>
#include <linux/tcp.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/user_namespace.h>
#include <linux/static_key.h>
#include <linux/memcontrol.h>
#include <linux/prefetch.h>
#include <linux/compat.h>

#include <linux/uaccess.h>

#include <linux/netdevice.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/request_sock.h>
#include <net/sock.h>
#include <linux/net_tstamp.h>
#include <net/xfrm.h>
#include <linux/ipsec.h>
#include <net/cls_cgroup.h>
#include <net/netprio_cgroup.h>
#include <linux/sock_diag.h>

#include <linux/filter.h>
#include <net/sock_reuseport.h>
#include <net/bpf_sk_storage.h>

#include <trace/events/sock.h>

#include <net/tcp.h>
#include <net/busy_poll.h>

#include <linux/ethtool.h>

#include "dev.h"

static DEFINE_MUTEX(proto_list_mutex);
static LIST_HEAD(proto_list);

static void sock_def_write_space_wfree(struct sock *sk);
static void sock_def_write_space(struct sock *sk);

/**
 * sk_ns_capable - General socket capability test
 * @sk: Socket to use a capability on or through
 * @user_ns: The user namespace of the capability to use
 * @cap: The capability to use
 *
 * Test whether the opener of the socket had the capability @cap in the
 * user namespace @user_ns when the socket was created, and whether the
 * current process has it as well.
 */
bool sk_ns_capable(const struct sock *sk,
           struct user_namespace *user_ns, int cap)
{
    return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
        ns_capable(user_ns, cap);
}
EXPORT_SYMBOL(sk_ns_capable);

/**
 * sk_capable - Socket global capability test
 * @sk: Socket to use a capability on or through
 * @cap: The global capability to use
 *
 * Test whether the opener of the socket had the capability @cap in all
 * user namespaces when the socket was created, and whether the current
 * process has it as well.
 */
bool sk_capable(const struct sock *sk, int cap)
{
    return sk_ns_capable(sk, &init_user_ns, cap);
}
EXPORT_SYMBOL(sk_capable);

/**
 * sk_net_capable - Network namespace socket capability test
 * @sk: Socket to use a capability on or through
 * @cap: The capability to use
 *
 * Test whether the opener of the socket had the capability @cap over
 * the network namespace the socket is a member of when the socket was
 * created, and whether the current process has it as well.
 */
bool sk_net_capable(const struct sock *sk, int cap)
{
    return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
}
EXPORT_SYMBOL(sk_net_capable);
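
/*
 * Illustrative sketch, not part of sock.c: the three tests above nest,
 * so a protocol gating a privileged operation on the socket's own
 * network namespace would typically write something like the function
 * below. The function name is hypothetical.
 */
static inline int example_sock_priv_check(const struct sock *sk)
{
    if (!sk_net_capable(sk, CAP_NET_ADMIN))
        return -EPERM;
    return 0;
}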

/*
 * Each address family might have different locking rules, so we have
 * one slock key per address family and separate keys for internal and
 * userspace sockets.
 */
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_kern_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];
static struct lock_class_key af_family_kern_slock_keys[AF_MAX];

/*
 * Make lock validator output more readable. (we pre-construct these
 * strings build-time, so that runtime initialization of socket
 * locks is fast):
 */

#define _sock_locks(x)                        \
  x "AF_UNSPEC",    x "AF_UNIX"     ,   x "AF_INET"     , \
  x "AF_AX25"  ,    x "AF_IPX"      ,   x "AF_APPLETALK", \
  x "AF_NETROM",    x "AF_BRIDGE"   ,   x "AF_ATMPVC"   , \
  x "AF_X25"   ,    x "AF_INET6"    ,   x "AF_ROSE"     , \
  x "AF_DECnet",    x "AF_NETBEUI"  ,   x "AF_SECURITY" , \
  x "AF_KEY"   ,    x "AF_NETLINK"  ,   x "AF_PACKET"   , \
  x "AF_ASH"   ,    x "AF_ECONET"   ,   x "AF_ATMSVC"   , \
  x "AF_RDS"   ,    x "AF_SNA"      ,   x "AF_IRDA"     , \
  x "AF_PPPOX" ,    x "AF_WANPIPE"  ,   x "AF_LLC"      , \
  x "27"       ,    x "28"          ,   x "AF_CAN"      , \
  x "AF_TIPC"  ,    x "AF_BLUETOOTH",   x "IUCV"        , \
  x "AF_RXRPC" ,    x "AF_ISDN"     ,   x "AF_PHONET"   , \
  x "AF_IEEE802154",    x "AF_CAIF" ,   x "AF_ALG"      , \
  x "AF_NFC"   ,    x "AF_VSOCK"    ,   x "AF_KCM"      , \
  x "AF_QIPCRTR",   x "AF_SMC"  ,   x "AF_XDP"  , \
  x "AF_MCTP"  , \
  x "AF_MAX"

static const char *const af_family_key_strings[AF_MAX+1] = {
    _sock_locks("sk_lock-")
};
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
    _sock_locks("slock-")
};
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
    _sock_locks("clock-")
};

static const char *const af_family_kern_key_strings[AF_MAX+1] = {
    _sock_locks("k-sk_lock-")
};
static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
    _sock_locks("k-slock-")
};
static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
    _sock_locks("k-clock-")
};
static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
    _sock_locks("rlock-")
};
static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
    _sock_locks("wlock-")
};
static const char *const af_family_elock_key_strings[AF_MAX+1] = {
    _sock_locks("elock-")
};

/*
 * sk_callback_lock and sk queues locking rules are per-address-family,
 * so split the lock classes by using a per-AF key:
 */
static struct lock_class_key af_callback_keys[AF_MAX];
static struct lock_class_key af_rlock_keys[AF_MAX];
static struct lock_class_key af_wlock_keys[AF_MAX];
static struct lock_class_key af_elock_keys[AF_MAX];
static struct lock_class_key af_kern_callback_keys[AF_MAX];

/* Run time adjustable parameters. */
__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
EXPORT_SYMBOL(sysctl_wmem_max);
__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
EXPORT_SYMBOL(sysctl_rmem_max);
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;

/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
EXPORT_SYMBOL(sysctl_optmem_max);

int sysctl_tstamp_allow_data __read_mostly = 1;

DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
EXPORT_SYMBOL_GPL(memalloc_socks_key);

/**
 * sk_set_memalloc - sets %SOCK_MEMALLOC
 * @sk: socket to set it on
 *
 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 * It is the admin's responsibility to adjust min_free_kbytes
 * to meet the requirements.
 */
void sk_set_memalloc(struct sock *sk)
{
    sock_set_flag(sk, SOCK_MEMALLOC);
    sk->sk_allocation |= __GFP_MEMALLOC;
    static_branch_inc(&memalloc_socks_key);
}
EXPORT_SYMBOL_GPL(sk_set_memalloc);

void sk_clear_memalloc(struct sock *sk)
{
    sock_reset_flag(sk, SOCK_MEMALLOC);
    sk->sk_allocation &= ~__GFP_MEMALLOC;
    static_branch_dec(&memalloc_socks_key);

    /*
     * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
     * progress of swapping. SOCK_MEMALLOC may be cleared while
     * it has rmem allocations due to the last swapfile being deactivated,
     * but there is a risk that the socket is unusable due to exceeding
     * the rmem limits. Reclaim the reserves and obey rmem limits again.
     */
    sk_mem_reclaim(sk);
}
EXPORT_SYMBOL_GPL(sk_clear_memalloc);
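
/*
 * Illustrative sketch, not part of sock.c: a transport that backs swap
 * (e.g. the swap-over-NFS path) flags its socket so allocations may dip
 * into the emergency reserves, and undoes it when the swapfile goes
 * away. The function names are hypothetical.
 */
static inline void example_swap_transport_online(struct sock *sk)
{
    sk_set_memalloc(sk);    /* SOCK_MEMALLOC + __GFP_MEMALLOC */
}

static inline void example_swap_transport_offline(struct sock *sk)
{
    sk_clear_memalloc(sk);  /* also reclaims via sk_mem_reclaim() */
}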

int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
{
    int ret;
    unsigned int noreclaim_flag;

    /* these should have been dropped before queueing */
    BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));

    noreclaim_flag = memalloc_noreclaim_save();
    ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv,
                 tcp_v6_do_rcv,
                 tcp_v4_do_rcv,
                 sk, skb);
    memalloc_noreclaim_restore(noreclaim_flag);

    return ret;
}
EXPORT_SYMBOL(__sk_backlog_rcv);

void sk_error_report(struct sock *sk)
{
    sk->sk_error_report(sk);

    switch (sk->sk_family) {
    case AF_INET:
        fallthrough;
    case AF_INET6:
        trace_inet_sk_error_report(sk);
        break;
    default:
        break;
    }
}
EXPORT_SYMBOL(sk_error_report);

int sock_get_timeout(long timeo, void *optval, bool old_timeval)
{
    struct __kernel_sock_timeval tv;

    if (timeo == MAX_SCHEDULE_TIMEOUT) {
        tv.tv_sec = 0;
        tv.tv_usec = 0;
    } else {
        tv.tv_sec = timeo / HZ;
        tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
    }

    if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
        struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
        *(struct old_timeval32 *)optval = tv32;
        return sizeof(tv32);
    }

    if (old_timeval) {
        struct __kernel_old_timeval old_tv;
        old_tv.tv_sec = tv.tv_sec;
        old_tv.tv_usec = tv.tv_usec;
        *(struct __kernel_old_timeval *)optval = old_tv;
        return sizeof(old_tv);
    }

    *(struct __kernel_sock_timeval *)optval = tv;
    return sizeof(tv);
}
EXPORT_SYMBOL(sock_get_timeout);
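
/*
 * Worked example for the conversion above (illustrative, assuming
 * HZ == 1000): timeo = 2500 jiffies gives tv_sec = 2500 / 1000 = 2 and
 * tv_usec = (2500 % 1000) * USEC_PER_SEC / 1000 = 500000, i.e. 2.5
 * seconds. A timeo of MAX_SCHEDULE_TIMEOUT is reported as {0, 0},
 * meaning "no timeout".
 */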

int sock_copy_user_timeval(struct __kernel_sock_timeval *tv,
               sockptr_t optval, int optlen, bool old_timeval)
{
    if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
        struct old_timeval32 tv32;

        if (optlen < sizeof(tv32))
            return -EINVAL;

        if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
            return -EFAULT;
        tv->tv_sec = tv32.tv_sec;
        tv->tv_usec = tv32.tv_usec;
    } else if (old_timeval) {
        struct __kernel_old_timeval old_tv;

        if (optlen < sizeof(old_tv))
            return -EINVAL;
        if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
            return -EFAULT;
        tv->tv_sec = old_tv.tv_sec;
        tv->tv_usec = old_tv.tv_usec;
    } else {
        if (optlen < sizeof(*tv))
            return -EINVAL;
        if (copy_from_sockptr(tv, optval, sizeof(*tv)))
            return -EFAULT;
    }

    return 0;
}
EXPORT_SYMBOL(sock_copy_user_timeval);

static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
                bool old_timeval)
{
    struct __kernel_sock_timeval tv;
    int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval);

    if (err)
        return err;

    if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
        return -EDOM;

    if (tv.tv_sec < 0) {
        static int warned __read_mostly;

        *timeo_p = 0;
        if (warned < 10 && net_ratelimit()) {
            warned++;
            pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
                __func__, current->comm, task_pid_nr(current));
        }
        return 0;
    }
    *timeo_p = MAX_SCHEDULE_TIMEOUT;
    if (tv.tv_sec == 0 && tv.tv_usec == 0)
        return 0;
    if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))
        *timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ);
    return 0;
}

static bool sock_needs_netstamp(const struct sock *sk)
{
    switch (sk->sk_family) {
    case AF_UNSPEC:
    case AF_UNIX:
        return false;
    default:
        return true;
    }
}

static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
{
    if (sk->sk_flags & flags) {
        sk->sk_flags &= ~flags;
        if (sock_needs_netstamp(sk) &&
            !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
            net_disable_timestamp();
    }
}


int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
    unsigned long flags;
    struct sk_buff_head *list = &sk->sk_receive_queue;

    if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
        atomic_inc(&sk->sk_drops);
        trace_sock_rcvqueue_full(sk, skb);
        return -ENOMEM;
    }

    if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
        atomic_inc(&sk->sk_drops);
        return -ENOBUFS;
    }

    skb->dev = NULL;
    skb_set_owner_r(skb, sk);

    /* We escape from the rcu protected region, make sure we don't
     * leak a non-refcounted dst.
     */
    skb_dst_force(skb);

    spin_lock_irqsave(&list->lock, flags);
    sock_skb_set_dropcount(sk, skb);
    __skb_queue_tail(list, skb);
    spin_unlock_irqrestore(&list->lock, flags);

    if (!sock_flag(sk, SOCK_DEAD))
        sk->sk_data_ready(sk);
    return 0;
}
EXPORT_SYMBOL(__sock_queue_rcv_skb);

int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb,
                  enum skb_drop_reason *reason)
{
    enum skb_drop_reason drop_reason;
    int err;

    err = sk_filter(sk, skb);
    if (err) {
        drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
        goto out;
    }
    err = __sock_queue_rcv_skb(sk, skb);
    switch (err) {
    case -ENOMEM:
        drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
        break;
    case -ENOBUFS:
        drop_reason = SKB_DROP_REASON_PROTO_MEM;
        break;
    default:
        drop_reason = SKB_NOT_DROPPED_YET;
        break;
    }
out:
    if (reason)
        *reason = drop_reason;
    return err;
}
EXPORT_SYMBOL(sock_queue_rcv_skb_reason);
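
/*
 * Illustrative sketch, not part of sock.c: a caller that wants
 * drop-reason accounting passes a reason pointer and frees a rejected
 * skb with it, mirroring what e.g. the UDP receive path does. The
 * function name is hypothetical.
 */
static inline void example_deliver_or_drop(struct sock *sk,
                       struct sk_buff *skb)
{
    enum skb_drop_reason reason;

    if (sock_queue_rcv_skb_reason(sk, skb, &reason) < 0)
        kfree_skb_reason(skb, reason);
}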

int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
             const int nested, unsigned int trim_cap, bool refcounted)
{
    int rc = NET_RX_SUCCESS;

    if (sk_filter_trim_cap(sk, skb, trim_cap))
        goto discard_and_relse;

    skb->dev = NULL;

    if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
        atomic_inc(&sk->sk_drops);
        goto discard_and_relse;
    }
    if (nested)
        bh_lock_sock_nested(sk);
    else
        bh_lock_sock(sk);
    if (!sock_owned_by_user(sk)) {
        /*
         * trylock + unlock semantics:
         */
        mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);

        rc = sk_backlog_rcv(sk, skb);

        mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
    } else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
        bh_unlock_sock(sk);
        atomic_inc(&sk->sk_drops);
        goto discard_and_relse;
    }

    bh_unlock_sock(sk);
out:
    if (refcounted)
        sock_put(sk);
    return rc;
discard_and_relse:
    kfree_skb(skb);
    goto out;
}
EXPORT_SYMBOL(__sk_receive_skb);

INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
                              u32));
INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
                               u32));
struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
{
    struct dst_entry *dst = __sk_dst_get(sk);

    if (dst && dst->obsolete &&
        INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
                   dst, cookie) == NULL) {
        sk_tx_queue_clear(sk);
        sk->sk_dst_pending_confirm = 0;
        RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
        dst_release(dst);
        return NULL;
    }

    return dst;
}
EXPORT_SYMBOL(__sk_dst_check);

struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
{
    struct dst_entry *dst = sk_dst_get(sk);

    if (dst && dst->obsolete &&
        INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
                   dst, cookie) == NULL) {
        sk_dst_reset(sk);
        dst_release(dst);
        return NULL;
    }

    return dst;
}
EXPORT_SYMBOL(sk_dst_check);

static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
{
    int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
    struct net *net = sock_net(sk);

    /* Sorry... */
    ret = -EPERM;
    if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
        goto out;

    ret = -EINVAL;
    if (ifindex < 0)
        goto out;

    /* Paired with all READ_ONCE() done locklessly. */
    WRITE_ONCE(sk->sk_bound_dev_if, ifindex);

    if (sk->sk_prot->rehash)
        sk->sk_prot->rehash(sk);
    sk_dst_reset(sk);

    ret = 0;

out:
#endif

    return ret;
}

int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
{
    int ret;

    if (lock_sk)
        lock_sock(sk);
    ret = sock_bindtoindex_locked(sk, ifindex);
    if (lock_sk)
        release_sock(sk);

    return ret;
}
EXPORT_SYMBOL(sock_bindtoindex);
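
/*
 * Illustrative sketch, not part of sock.c: in-kernel users bind a
 * socket to a device by ifindex rather than by name; passing
 * lock_sk == true lets the helper take and release the socket lock
 * itself. The function name is hypothetical.
 */
static inline int example_bind_to_dev(struct sock *sk, int ifindex)
{
    return sock_bindtoindex(sk, ifindex, true);
}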

static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
{
    int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
    struct net *net = sock_net(sk);
    char devname[IFNAMSIZ];
    int index;

    ret = -EINVAL;
    if (optlen < 0)
        goto out;

    /* Bind this socket to a particular device like "eth0",
     * as specified in the passed interface name. If the
     * name is "" or the option length is zero the socket
     * is not bound.
     */
    if (optlen > IFNAMSIZ - 1)
        optlen = IFNAMSIZ - 1;
    memset(devname, 0, sizeof(devname));

    ret = -EFAULT;
    if (copy_from_sockptr(devname, optval, optlen))
        goto out;

    index = 0;
    if (devname[0] != '\0') {
        struct net_device *dev;

        rcu_read_lock();
        dev = dev_get_by_name_rcu(net, devname);
        if (dev)
            index = dev->ifindex;
        rcu_read_unlock();
        ret = -ENODEV;
        if (!dev)
            goto out;
    }

    return sock_bindtoindex(sk, index, true);
out:
#endif

    return ret;
}

static int sock_getbindtodevice(struct sock *sk, char __user *optval,
                int __user *optlen, int len)
{
    int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
    int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
    struct net *net = sock_net(sk);
    char devname[IFNAMSIZ];

    if (bound_dev_if == 0) {
        len = 0;
        goto zero;
    }

    ret = -EINVAL;
    if (len < IFNAMSIZ)
        goto out;

    ret = netdev_get_name(net, devname, bound_dev_if);
    if (ret)
        goto out;

    len = strlen(devname) + 1;

    ret = -EFAULT;
    if (copy_to_user(optval, devname, len))
        goto out;

zero:
    ret = -EFAULT;
    if (put_user(len, optlen))
        goto out;

    ret = 0;

out:
#endif

    return ret;
}

bool sk_mc_loop(struct sock *sk)
{
    if (dev_recursion_level())
        return false;
    if (!sk)
        return true;
    switch (sk->sk_family) {
    case AF_INET:
        return inet_sk(sk)->mc_loop;
#if IS_ENABLED(CONFIG_IPV6)
    case AF_INET6:
        return inet6_sk(sk)->mc_loop;
#endif
    }
    WARN_ON_ONCE(1);
    return true;
}
EXPORT_SYMBOL(sk_mc_loop);

void sock_set_reuseaddr(struct sock *sk)
{
    lock_sock(sk);
    sk->sk_reuse = SK_CAN_REUSE;
    release_sock(sk);
}
EXPORT_SYMBOL(sock_set_reuseaddr);

void sock_set_reuseport(struct sock *sk)
{
    lock_sock(sk);
    sk->sk_reuseport = true;
    release_sock(sk);
}
EXPORT_SYMBOL(sock_set_reuseport);

void sock_no_linger(struct sock *sk)
{
    lock_sock(sk);
    sk->sk_lingertime = 0;
    sock_set_flag(sk, SOCK_LINGER);
    release_sock(sk);
}
EXPORT_SYMBOL(sock_no_linger);

void sock_set_priority(struct sock *sk, u32 priority)
{
    lock_sock(sk);
    sk->sk_priority = priority;
    release_sock(sk);
}
EXPORT_SYMBOL(sock_set_priority);

void sock_set_sndtimeo(struct sock *sk, s64 secs)
{
    lock_sock(sk);
    if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
        sk->sk_sndtimeo = secs * HZ;
    else
        sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
    release_sock(sk);
}
EXPORT_SYMBOL(sock_set_sndtimeo);
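
/*
 * Illustrative sketch, not part of sock.c: kernel socket users such as
 * sunrpc chain these helpers when configuring a freshly created
 * transport socket, instead of going through setsockopt(). The function
 * name is hypothetical.
 */
static inline void example_setup_transport(struct sock *sk)
{
    sock_set_reuseaddr(sk);
    sock_no_linger(sk);
    sock_set_keepalive(sk);
    sock_set_sndtimeo(sk, 30);  /* 30 second send timeout */
}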

static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
{
    if (val) {
        sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
        sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
        sock_set_flag(sk, SOCK_RCVTSTAMP);
        sock_enable_timestamp(sk, SOCK_TIMESTAMP);
    } else {
        sock_reset_flag(sk, SOCK_RCVTSTAMP);
        sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
    }
}

void sock_enable_timestamps(struct sock *sk)
{
    lock_sock(sk);
    __sock_set_timestamps(sk, true, false, true);
    release_sock(sk);
}
EXPORT_SYMBOL(sock_enable_timestamps);

void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
{
    switch (optname) {
    case SO_TIMESTAMP_OLD:
        __sock_set_timestamps(sk, valbool, false, false);
        break;
    case SO_TIMESTAMP_NEW:
        __sock_set_timestamps(sk, valbool, true, false);
        break;
    case SO_TIMESTAMPNS_OLD:
        __sock_set_timestamps(sk, valbool, false, true);
        break;
    case SO_TIMESTAMPNS_NEW:
        __sock_set_timestamps(sk, valbool, true, true);
        break;
    }
}

static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
{
    struct net *net = sock_net(sk);
    struct net_device *dev = NULL;
    bool match = false;
    int *vclock_index;
    int i, num;

    if (sk->sk_bound_dev_if)
        dev = dev_get_by_index(net, sk->sk_bound_dev_if);

    if (!dev) {
        pr_err("%s: sock not bound to a device\n", __func__);
        return -EOPNOTSUPP;
    }

    num = ethtool_get_phc_vclocks(dev, &vclock_index);
    dev_put(dev);

    for (i = 0; i < num; i++) {
        if (*(vclock_index + i) == phc_index) {
            match = true;
            break;
        }
    }

    if (num > 0)
        kfree(vclock_index);

    if (!match)
        return -EINVAL;

    sk->sk_bind_phc = phc_index;

    return 0;
}

int sock_set_timestamping(struct sock *sk, int optname,
              struct so_timestamping timestamping)
{
    int val = timestamping.flags;
    int ret;

    if (val & ~SOF_TIMESTAMPING_MASK)
        return -EINVAL;

    if (val & SOF_TIMESTAMPING_OPT_ID &&
        !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
        if (sk_is_tcp(sk)) {
            if ((1 << sk->sk_state) &
                (TCPF_CLOSE | TCPF_LISTEN))
                return -EINVAL;
            atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una);
        } else {
            atomic_set(&sk->sk_tskey, 0);
        }
    }

    if (val & SOF_TIMESTAMPING_OPT_STATS &&
        !(val & SOF_TIMESTAMPING_OPT_TSONLY))
        return -EINVAL;

    if (val & SOF_TIMESTAMPING_BIND_PHC) {
        ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
        if (ret)
            return ret;
    }

    sk->sk_tsflags = val;
    sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);

    if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
        sock_enable_timestamp(sk,
                      SOCK_TIMESTAMPING_RX_SOFTWARE);
    else
        sock_disable_timestamp(sk,
                       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
    return 0;
}
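
/*
 * For reference, a userspace sketch that exercises the path above
 * (illustrative only, error handling omitted):
 *
 *  struct so_timestamping ts = {
 *      .flags = SOF_TIMESTAMPING_RX_SOFTWARE |
 *           SOF_TIMESTAMPING_SOFTWARE,
 *  };
 *  setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &ts, sizeof(ts));
 *
 * Passing a plain int holding the flags is also accepted; the optlen
 * check in sock_setsockopt() below falls back to that layout.
 */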

void sock_set_keepalive(struct sock *sk)
{
    lock_sock(sk);
    if (sk->sk_prot->keepalive)
        sk->sk_prot->keepalive(sk, true);
    sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
    release_sock(sk);
}
EXPORT_SYMBOL(sock_set_keepalive);

static void __sock_set_rcvbuf(struct sock *sk, int val)
{
    /* Ensure val * 2 fits into an int, to prevent max_t() from treating it
     * as a negative value.
     */
    val = min_t(int, val, INT_MAX / 2);
    sk->sk_userlocks |= SOCK_RCVBUF_LOCK;

    /* We double it on the way in to account for "struct sk_buff" etc.
     * overhead. Applications assume that the SO_RCVBUF setting they make
     * will allow that much actual data to be received on that socket.
     *
     * Applications are unaware that "struct sk_buff" and other overheads
     * allocate from the receive buffer during socket buffer allocation.
     *
     * And after considering the possible alternatives, returning the value
     * we actually used in getsockopt is the most desirable behavior.
     */
    WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
}

void sock_set_rcvbuf(struct sock *sk, int val)
{
    lock_sock(sk);
    __sock_set_rcvbuf(sk, val);
    release_sock(sk);
}
EXPORT_SYMBOL(sock_set_rcvbuf);
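
/*
 * Worked example of the doubling above (illustrative): an application
 * that sets SO_RCVBUF to 65536 gets sk->sk_rcvbuf = 131072, and a
 * later getsockopt(SO_RCVBUF) reports that doubled value back, which
 * is exactly the behavior the comment argues for.
 */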

static void __sock_set_mark(struct sock *sk, u32 val)
{
    if (val != sk->sk_mark) {
        sk->sk_mark = val;
        sk_dst_reset(sk);
    }
}

void sock_set_mark(struct sock *sk, u32 val)
{
    lock_sock(sk);
    __sock_set_mark(sk, val);
    release_sock(sk);
}
EXPORT_SYMBOL(sock_set_mark);

static void sock_release_reserved_memory(struct sock *sk, int bytes)
{
    /* Round down bytes to multiple of pages */
    bytes = round_down(bytes, PAGE_SIZE);

    WARN_ON(bytes > sk->sk_reserved_mem);
    sk->sk_reserved_mem -= bytes;
    sk_mem_reclaim(sk);
}

static int sock_reserve_memory(struct sock *sk, int bytes)
{
    long allocated;
    bool charged;
    int pages;

    if (!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk))
        return -EOPNOTSUPP;

    if (!bytes)
        return 0;

    pages = sk_mem_pages(bytes);

    /* pre-charge to memcg */
    charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages,
                      GFP_KERNEL | __GFP_RETRY_MAYFAIL);
    if (!charged)
        return -ENOMEM;

    /* pre-charge to forward_alloc */
    sk_memory_allocated_add(sk, pages);
    allocated = sk_memory_allocated(sk);
    /* If the system goes into memory pressure with this
     * precharge, give up and return error.
     */
    if (allocated > sk_prot_mem_limits(sk, 1)) {
        sk_memory_allocated_sub(sk, pages);
        mem_cgroup_uncharge_skmem(sk->sk_memcg, pages);
        return -ENOMEM;
    }
    sk->sk_forward_alloc += pages << PAGE_SHIFT;

    sk->sk_reserved_mem += pages << PAGE_SHIFT;

    return 0;
}
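
/*
 * For reference, a userspace sketch of the SO_RESERVE_MEM option that
 * ends up in sock_reserve_memory() above (illustrative only):
 *
 *  int bytes = 1 << 20;    (reserve ~1 MiB, rounded down to pages)
 *  setsockopt(fd, SOL_SOCKET, SO_RESERVE_MEM, &bytes, sizeof(bytes));
 *
 * The reservation is pre-charged to the socket's memcg and to
 * sk_forward_alloc, so later allocations up to that amount should not
 * fail the memcg charge.
 */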

/*
 *  This is meant for all protocols to use and covers goings on
 *  at the socket level. Everything here is generic.
 */

int sock_setsockopt(struct socket *sock, int level, int optname,
            sockptr_t optval, unsigned int optlen)
{
    struct so_timestamping timestamping;
    struct sock_txtime sk_txtime;
    struct sock *sk = sock->sk;
    int val;
    int valbool;
    struct linger ling;
    int ret = 0;

    /*
     *  Options without arguments
     */

    if (optname == SO_BINDTODEVICE)
        return sock_setbindtodevice(sk, optval, optlen);

    if (optlen < sizeof(int))
        return -EINVAL;

    if (copy_from_sockptr(&val, optval, sizeof(val)))
        return -EFAULT;

    valbool = val ? 1 : 0;

    lock_sock(sk);

    switch (optname) {
    case SO_DEBUG:
        if (val && !capable(CAP_NET_ADMIN))
            ret = -EACCES;
        else
            sock_valbool_flag(sk, SOCK_DBG, valbool);
        break;
    case SO_REUSEADDR:
        sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
        break;
    case SO_REUSEPORT:
        sk->sk_reuseport = valbool;
        break;
    case SO_TYPE:
    case SO_PROTOCOL:
    case SO_DOMAIN:
    case SO_ERROR:
        ret = -ENOPROTOOPT;
        break;
    case SO_DONTROUTE:
        sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
        sk_dst_reset(sk);
        break;
    case SO_BROADCAST:
        sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
        break;
    case SO_SNDBUF:
        /* Don't return an error on this one; BSD doesn't, and
         * if you think about it this is right. Otherwise apps
         * would have to play 'guess the biggest size' games.
         * RCVBUF/SNDBUF are treated in BSD as hints.
         */
        val = min_t(u32, val, READ_ONCE(sysctl_wmem_max));
set_sndbuf:
        /* Ensure val * 2 fits into an int, to prevent max_t()
         * from treating it as a negative value.
         */
        val = min_t(int, val, INT_MAX / 2);
        sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
        WRITE_ONCE(sk->sk_sndbuf,
               max_t(int, val * 2, SOCK_MIN_SNDBUF));
        /* Wake up sending tasks if we upped the value. */
        sk->sk_write_space(sk);
        break;

    case SO_SNDBUFFORCE:
        if (!capable(CAP_NET_ADMIN)) {
            ret = -EPERM;
            break;
        }

        /* No negative values (to prevent underflow, as val will be
         * multiplied by 2).
         */
        if (val < 0)
            val = 0;
        goto set_sndbuf;

    case SO_RCVBUF:
        /* Don't return an error on this one; BSD doesn't, and
         * if you think about it this is right. Otherwise apps
         * would have to play 'guess the biggest size' games.
         * RCVBUF/SNDBUF are treated in BSD as hints.
         */
        __sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max)));
        break;

    case SO_RCVBUFFORCE:
        if (!capable(CAP_NET_ADMIN)) {
            ret = -EPERM;
            break;
        }

        /* No negative values (to prevent underflow, as val will be
         * multiplied by 2).
         */
        __sock_set_rcvbuf(sk, max(val, 0));
        break;

    case SO_KEEPALIVE:
        if (sk->sk_prot->keepalive)
            sk->sk_prot->keepalive(sk, valbool);
        sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
        break;

    case SO_OOBINLINE:
        sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
        break;

    case SO_NO_CHECK:
        sk->sk_no_check_tx = valbool;
        break;

    case SO_PRIORITY:
        if ((val >= 0 && val <= 6) ||
            ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
            ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
            sk->sk_priority = val;
        else
            ret = -EPERM;
        break;

    case SO_LINGER:
        if (optlen < sizeof(ling)) {
            ret = -EINVAL;  /* 1003.1g */
            break;
        }
        if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
            ret = -EFAULT;
            break;
        }
        if (!ling.l_onoff)
            sock_reset_flag(sk, SOCK_LINGER);
        else {
#if (BITS_PER_LONG == 32)
            if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
                sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
            else
#endif
                sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
            sock_set_flag(sk, SOCK_LINGER);
        }
        break;

    case SO_BSDCOMPAT:
        break;

    case SO_PASSCRED:
        if (valbool)
            set_bit(SOCK_PASSCRED, &sock->flags);
        else
            clear_bit(SOCK_PASSCRED, &sock->flags);
        break;

    case SO_TIMESTAMP_OLD:
    case SO_TIMESTAMP_NEW:
    case SO_TIMESTAMPNS_OLD:
    case SO_TIMESTAMPNS_NEW:
        sock_set_timestamp(sk, optname, valbool);
        break;

    case SO_TIMESTAMPING_NEW:
    case SO_TIMESTAMPING_OLD:
        if (optlen == sizeof(timestamping)) {
            if (copy_from_sockptr(&timestamping, optval,
                          sizeof(timestamping))) {
                ret = -EFAULT;
                break;
            }
        } else {
            memset(&timestamping, 0, sizeof(timestamping));
            timestamping.flags = val;
        }
        ret = sock_set_timestamping(sk, optname, timestamping);
        break;

    case SO_RCVLOWAT:
        if (val < 0)
            val = INT_MAX;
        if (sock->ops->set_rcvlowat)
            ret = sock->ops->set_rcvlowat(sk, val);
        else
            WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
        break;

    case SO_RCVTIMEO_OLD:
    case SO_RCVTIMEO_NEW:
        ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
                       optlen, optname == SO_RCVTIMEO_OLD);
        break;

    case SO_SNDTIMEO_OLD:
    case SO_SNDTIMEO_NEW:
        ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
                       optlen, optname == SO_SNDTIMEO_OLD);
        break;

    case SO_ATTACH_FILTER: {
        struct sock_fprog fprog;

        ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
        if (!ret)
            ret = sk_attach_filter(&fprog, sk);
        break;
    }
    case SO_ATTACH_BPF:
        ret = -EINVAL;
        if (optlen == sizeof(u32)) {
            u32 ufd;

            ret = -EFAULT;
            if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
                break;

            ret = sk_attach_bpf(ufd, sk);
        }
        break;

    case SO_ATTACH_REUSEPORT_CBPF: {
        struct sock_fprog fprog;

        ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
        if (!ret)
            ret = sk_reuseport_attach_filter(&fprog, sk);
        break;
    }
    case SO_ATTACH_REUSEPORT_EBPF:
        ret = -EINVAL;
        if (optlen == sizeof(u32)) {
            u32 ufd;

            ret = -EFAULT;
            if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
                break;

            ret = sk_reuseport_attach_bpf(ufd, sk);
        }
        break;

    case SO_DETACH_REUSEPORT_BPF:
        ret = reuseport_detach_prog(sk);
        break;

    case SO_DETACH_FILTER:
        ret = sk_detach_filter(sk);
        break;

    case SO_LOCK_FILTER:
        if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
            ret = -EPERM;
        else
            sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
        break;

    case SO_PASSSEC:
        if (valbool)
            set_bit(SOCK_PASSSEC, &sock->flags);
        else
            clear_bit(SOCK_PASSSEC, &sock->flags);
        break;
    case SO_MARK:
        if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
            !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
            ret = -EPERM;
            break;
        }

        __sock_set_mark(sk, val);
        break;
    case SO_RCVMARK:
        if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
            !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
            ret = -EPERM;
            break;
        }

        sock_valbool_flag(sk, SOCK_RCVMARK, valbool);
        break;

    case SO_RXQ_OVFL:
        sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
        break;

    case SO_WIFI_STATUS:
        sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
        break;

    case SO_PEEK_OFF:
        if (sock->ops->set_peek_off)
            ret = sock->ops->set_peek_off(sk, val);
        else
            ret = -EOPNOTSUPP;
        break;

    case SO_NOFCS:
        sock_valbool_flag(sk, SOCK_NOFCS, valbool);
        break;

    case SO_SELECT_ERR_QUEUE:
        sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
        break;

#ifdef CONFIG_NET_RX_BUSY_POLL
    case SO_BUSY_POLL:
        /* allow unprivileged users to decrease the value */
        if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
            ret = -EPERM;
        else {
            if (val < 0)
                ret = -EINVAL;
            else
                WRITE_ONCE(sk->sk_ll_usec, val);
        }
        break;
    case SO_PREFER_BUSY_POLL:
        if (valbool && !capable(CAP_NET_ADMIN))
            ret = -EPERM;
        else
            WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
        break;
    case SO_BUSY_POLL_BUDGET:
        if (val > READ_ONCE(sk->sk_busy_poll_budget) && !capable(CAP_NET_ADMIN)) {
            ret = -EPERM;
        } else {
            if (val < 0 || val > U16_MAX)
                ret = -EINVAL;
            else
                WRITE_ONCE(sk->sk_busy_poll_budget, val);
        }
        break;
#endif

    case SO_MAX_PACING_RATE:
        {
        unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;

        if (sizeof(ulval) != sizeof(val) &&
            optlen >= sizeof(ulval) &&
            copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
            ret = -EFAULT;
            break;
        }
        if (ulval != ~0UL)
            cmpxchg(&sk->sk_pacing_status,
                SK_PACING_NONE,
                SK_PACING_NEEDED);
        sk->sk_max_pacing_rate = ulval;
        sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval);
        break;
        }
    case SO_INCOMING_CPU:
        WRITE_ONCE(sk->sk_incoming_cpu, val);
        break;

    case SO_CNX_ADVICE:
        if (val == 1)
            dst_negative_advice(sk);
        break;

    case SO_ZEROCOPY:
        if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
            if (!(sk_is_tcp(sk) ||
                  (sk->sk_type == SOCK_DGRAM &&
                   sk->sk_protocol == IPPROTO_UDP)))
                ret = -EOPNOTSUPP;
        } else if (sk->sk_family != PF_RDS) {
            ret = -EOPNOTSUPP;
        }
        if (!ret) {
            if (val < 0 || val > 1)
                ret = -EINVAL;
            else
                sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
        }
        break;

    case SO_TXTIME:
        if (optlen != sizeof(struct sock_txtime)) {
            ret = -EINVAL;
            break;
        } else if (copy_from_sockptr(&sk_txtime, optval,
               sizeof(struct sock_txtime))) {
            ret = -EFAULT;
            break;
        } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
            ret = -EINVAL;
            break;
        }
        /* CLOCK_MONOTONIC is only used by sch_fq, and this packet
         * scheduler has enough safeguards.
         */
        if (sk_txtime.clockid != CLOCK_MONOTONIC &&
            !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
            ret = -EPERM;
            break;
        }
        sock_valbool_flag(sk, SOCK_TXTIME, true);
        sk->sk_clockid = sk_txtime.clockid;
        sk->sk_txtime_deadline_mode =
            !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
        sk->sk_txtime_report_errors =
            !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
        break;

    case SO_BINDTOIFINDEX:
        ret = sock_bindtoindex_locked(sk, val);
        break;

    case SO_BUF_LOCK:
        if (val & ~SOCK_BUF_LOCK_MASK) {
            ret = -EINVAL;
            break;
        }
        sk->sk_userlocks = val | (sk->sk_userlocks &
                      ~SOCK_BUF_LOCK_MASK);
        break;

    case SO_RESERVE_MEM:
    {
        int delta;

        if (val < 0) {
            ret = -EINVAL;
            break;
        }

        delta = val - sk->sk_reserved_mem;
        if (delta < 0)
            sock_release_reserved_memory(sk, -delta);
        else
            ret = sock_reserve_memory(sk, delta);
        break;
    }

    case SO_TXREHASH:
        if (val < -1 || val > 1) {
            ret = -EINVAL;
            break;
        }
        /* Paired with READ_ONCE() in tcp_rtx_synack() */
        WRITE_ONCE(sk->sk_txrehash, (u8)val);
        break;

    default:
        ret = -ENOPROTOOPT;
        break;
    }
    release_sock(sk);
    return ret;
}
EXPORT_SYMBOL(sock_setsockopt);
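
/*
 * For reference, a userspace sketch of one of the options handled
 * above, SO_LINGER (illustrative only):
 *
 *  struct linger ling = { .l_onoff = 1, .l_linger = 5 };
 *  setsockopt(fd, SOL_SOCKET, SO_LINGER, &ling, sizeof(ling));
 *
 * With l_onoff set, sk->sk_lingertime becomes 5 * HZ and SOCK_LINGER
 * is set, so close() may block up to five seconds while queued data
 * drains.
 */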

static const struct cred *sk_get_peer_cred(struct sock *sk)
{
    const struct cred *cred;

    spin_lock(&sk->sk_peer_lock);
    cred = get_cred(sk->sk_peer_cred);
    spin_unlock(&sk->sk_peer_lock);

    return cred;
}

static void cred_to_ucred(struct pid *pid, const struct cred *cred,
              struct ucred *ucred)
{
    ucred->pid = pid_vnr(pid);
    ucred->uid = ucred->gid = -1;
    if (cred) {
        struct user_namespace *current_ns = current_user_ns();

        ucred->uid = from_kuid_munged(current_ns, cred->euid);
        ucred->gid = from_kgid_munged(current_ns, cred->egid);
    }
}

static int groups_to_user(gid_t __user *dst, const struct group_info *src)
{
    struct user_namespace *user_ns = current_user_ns();
    int i;

    for (i = 0; i < src->ngroups; i++)
        if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
            return -EFAULT;

    return 0;
}

int sock_getsockopt(struct socket *sock, int level, int optname,
            char __user *optval, int __user *optlen)
{
    struct sock *sk = sock->sk;

    union {
        int val;
        u64 val64;
        unsigned long ulval;
        struct linger ling;
        struct old_timeval32 tm32;
        struct __kernel_old_timeval tm;
        struct __kernel_sock_timeval stm;
        struct sock_txtime txtime;
        struct so_timestamping timestamping;
    } v;

    int lv = sizeof(int);
    int len;

    if (get_user(len, optlen))
        return -EFAULT;
    if (len < 0)
        return -EINVAL;

    memset(&v, 0, sizeof(v));

    switch (optname) {
    case SO_DEBUG:
        v.val = sock_flag(sk, SOCK_DBG);
        break;

    case SO_DONTROUTE:
        v.val = sock_flag(sk, SOCK_LOCALROUTE);
        break;

    case SO_BROADCAST:
        v.val = sock_flag(sk, SOCK_BROADCAST);
        break;

    case SO_SNDBUF:
        v.val = sk->sk_sndbuf;
        break;

    case SO_RCVBUF:
        v.val = sk->sk_rcvbuf;
        break;

    case SO_REUSEADDR:
        v.val = sk->sk_reuse;
        break;

    case SO_REUSEPORT:
        v.val = sk->sk_reuseport;
        break;

    case SO_KEEPALIVE:
        v.val = sock_flag(sk, SOCK_KEEPOPEN);
        break;

    case SO_TYPE:
        v.val = sk->sk_type;
        break;

    case SO_PROTOCOL:
        v.val = sk->sk_protocol;
        break;

    case SO_DOMAIN:
        v.val = sk->sk_family;
        break;

    case SO_ERROR:
        v.val = -sock_error(sk);
        if (v.val == 0)
            v.val = xchg(&sk->sk_err_soft, 0);
        break;

    case SO_OOBINLINE:
        v.val = sock_flag(sk, SOCK_URGINLINE);
        break;

    case SO_NO_CHECK:
        v.val = sk->sk_no_check_tx;
        break;

    case SO_PRIORITY:
        v.val = sk->sk_priority;
        break;

    case SO_LINGER:
        lv      = sizeof(v.ling);
        v.ling.l_onoff  = sock_flag(sk, SOCK_LINGER);
        v.ling.l_linger = sk->sk_lingertime / HZ;
        break;

    case SO_BSDCOMPAT:
        break;

    case SO_TIMESTAMP_OLD:
        v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
                !sock_flag(sk, SOCK_TSTAMP_NEW) &&
                !sock_flag(sk, SOCK_RCVTSTAMPNS);
        break;

    case SO_TIMESTAMPNS_OLD:
        v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
        break;

    case SO_TIMESTAMP_NEW:
        v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
        break;

    case SO_TIMESTAMPNS_NEW:
        v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
        break;

    case SO_TIMESTAMPING_OLD:
        lv = sizeof(v.timestamping);
        v.timestamping.flags = sk->sk_tsflags;
        v.timestamping.bind_phc = sk->sk_bind_phc;
        break;

    case SO_RCVTIMEO_OLD:
    case SO_RCVTIMEO_NEW:
        lv = sock_get_timeout(sk->sk_rcvtimeo, &v, SO_RCVTIMEO_OLD == optname);
        break;

    case SO_SNDTIMEO_OLD:
    case SO_SNDTIMEO_NEW:
        lv = sock_get_timeout(sk->sk_sndtimeo, &v, SO_SNDTIMEO_OLD == optname);
        break;

    case SO_RCVLOWAT:
        v.val = sk->sk_rcvlowat;
        break;

    case SO_SNDLOWAT:
        v.val = 1;
        break;

    case SO_PASSCRED:
        v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
        break;

    case SO_PEERCRED:
    {
        struct ucred peercred;
        if (len > sizeof(peercred))
            len = sizeof(peercred);

        spin_lock(&sk->sk_peer_lock);
        cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
        spin_unlock(&sk->sk_peer_lock);

        if (copy_to_user(optval, &peercred, len))
            return -EFAULT;
        goto lenout;
    }

    case SO_PEERGROUPS:
    {
        const struct cred *cred;
        int ret, n;

        cred = sk_get_peer_cred(sk);
        if (!cred)
            return -ENODATA;

        n = cred->group_info->ngroups;
        if (len < n * sizeof(gid_t)) {
            len = n * sizeof(gid_t);
            put_cred(cred);
            return put_user(len, optlen) ? -EFAULT : -ERANGE;
        }
        len = n * sizeof(gid_t);

        ret = groups_to_user((gid_t __user *)optval, cred->group_info);
        put_cred(cred);
        if (ret)
            return ret;
        goto lenout;
    }

    case SO_PEERNAME:
    {
        char address[128];

        lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
        if (lv < 0)
            return -ENOTCONN;
        if (lv < len)
            return -EINVAL;
        if (copy_to_user(optval, address, len))
            return -EFAULT;
        goto lenout;
    }

    /* Dubious BSD thing... Probably nobody even uses it, but
     * the UNIX standard wants it for whatever reason... -DaveM
     */
    case SO_ACCEPTCONN:
        v.val = sk->sk_state == TCP_LISTEN;
        break;

    case SO_PASSSEC:
        v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
        break;

    case SO_PEERSEC:
        return security_socket_getpeersec_stream(sock, optval, optlen, len);

    case SO_MARK:
        v.val = sk->sk_mark;
        break;

    case SO_RCVMARK:
        v.val = sock_flag(sk, SOCK_RCVMARK);
        break;

    case SO_RXQ_OVFL:
        v.val = sock_flag(sk, SOCK_RXQ_OVFL);
        break;

    case SO_WIFI_STATUS:
        v.val = sock_flag(sk, SOCK_WIFI_STATUS);
        break;

    case SO_PEEK_OFF:
        if (!sock->ops->set_peek_off)
            return -EOPNOTSUPP;

        v.val = sk->sk_peek_off;
        break;
    case SO_NOFCS:
        v.val = sock_flag(sk, SOCK_NOFCS);
        break;

    case SO_BINDTODEVICE:
        return sock_getbindtodevice(sk, optval, optlen, len);

    case SO_GET_FILTER:
        len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
        if (len < 0)
            return len;

        goto lenout;

    case SO_LOCK_FILTER:
        v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
        break;

    case SO_BPF_EXTENSIONS:
        v.val = bpf_tell_extensions();
        break;

    case SO_SELECT_ERR_QUEUE:
        v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
        break;

#ifdef CONFIG_NET_RX_BUSY_POLL
    case SO_BUSY_POLL:
        v.val = sk->sk_ll_usec;
        break;
    case SO_PREFER_BUSY_POLL:
        v.val = READ_ONCE(sk->sk_prefer_busy_poll);
        break;
#endif

    case SO_MAX_PACING_RATE:
        if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
            lv = sizeof(v.ulval);
            v.ulval = sk->sk_max_pacing_rate;
        } else {
            /* 32bit version */
            v.val = min_t(unsigned long, sk->sk_max_pacing_rate, ~0U);
        }
        break;

    case SO_INCOMING_CPU:
        v.val = READ_ONCE(sk->sk_incoming_cpu);
        break;

    case SO_MEMINFO:
    {
        u32 meminfo[SK_MEMINFO_VARS];

        sk_get_meminfo(sk, meminfo);

        len = min_t(unsigned int, len, sizeof(meminfo));
        if (copy_to_user(optval, &meminfo, len))
            return -EFAULT;

        goto lenout;
    }

#ifdef CONFIG_NET_RX_BUSY_POLL
    case SO_INCOMING_NAPI_ID:
        v.val = READ_ONCE(sk->sk_napi_id);

        /* aggregate non-NAPI IDs down to 0 */
        if (v.val < MIN_NAPI_ID)
            v.val = 0;

        break;
#endif

    case SO_COOKIE:
        lv = sizeof(u64);
        if (len < lv)
            return -EINVAL;
        v.val64 = sock_gen_cookie(sk);
        break;

    case SO_ZEROCOPY:
        v.val = sock_flag(sk, SOCK_ZEROCOPY);
        break;

    case SO_TXTIME:
        lv = sizeof(v.txtime);
        v.txtime.clockid = sk->sk_clockid;
        v.txtime.flags |= sk->sk_txtime_deadline_mode ?
1862                   SOF_TXTIME_DEADLINE_MODE : 0;
1863         v.txtime.flags |= sk->sk_txtime_report_errors ?
1864                   SOF_TXTIME_REPORT_ERRORS : 0;
1865         break;
1866 
1867     case SO_BINDTOIFINDEX:
1868         v.val = READ_ONCE(sk->sk_bound_dev_if);
1869         break;
1870 
1871     case SO_NETNS_COOKIE:
1872         lv = sizeof(u64);
1873         if (len != lv)
1874             return -EINVAL;
1875         v.val64 = sock_net(sk)->net_cookie;
1876         break;
1877 
1878     case SO_BUF_LOCK:
1879         v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
1880         break;
1881 
1882     case SO_RESERVE_MEM:
1883         v.val = sk->sk_reserved_mem;
1884         break;
1885 
1886     case SO_TXREHASH:
1887         v.val = sk->sk_txrehash;
1888         break;
1889 
1890     default:
1891         /* We implement the SO_SNDLOWAT etc to not be settable
1892          * (1003.1g 7).
1893          */
1894         return -ENOPROTOOPT;
1895     }
1896 
1897     if (len > lv)
1898         len = lv;
1899     if (copy_to_user(optval, &v, len))
1900         return -EFAULT;
1901 lenout:
1902     if (put_user(len, optlen))
1903         return -EFAULT;
1904     return 0;
1905 }
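
/*
 * A minimal userspace sketch of the optlen contract implemented by the
 * switch above: the kernel clamps the copy to min(len, lv) and always
 * writes the actual length back through optlen. This is a standalone
 * program, not part of this file, and assumes a Linux libc exposing
 * struct ucred and SO_PEERCRED.
 */
#if 0 /* illustrative only */
#define _GNU_SOURCE
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	int sv[2];
	struct ucred cred;
	socklen_t len = sizeof(cred);

	if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv))
		return 1;
	/* Hits the SO_PEERCRED case: len is clamped to sizeof(struct ucred)
	 * and the clamped value is written back, so truncation is visible. */
	if (getsockopt(sv[0], SOL_SOCKET, SO_PEERCRED, &cred, &len) == 0)
		printf("peer pid=%d uid=%u gid=%u (len=%u)\n",
		       cred.pid, cred.uid, cred.gid, (unsigned)len);
	close(sv[0]);
	close(sv[1]);
	return 0;
}
#endif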
1906 
1907 /*
1908  * Initialize an sk_lock.
1909  *
1910  * (We also register the sk_lock with the lock validator.)
1911  */
1912 static inline void sock_lock_init(struct sock *sk)
1913 {
1914     if (sk->sk_kern_sock)
1915         sock_lock_init_class_and_name(
1916             sk,
1917             af_family_kern_slock_key_strings[sk->sk_family],
1918             af_family_kern_slock_keys + sk->sk_family,
1919             af_family_kern_key_strings[sk->sk_family],
1920             af_family_kern_keys + sk->sk_family);
1921     else
1922         sock_lock_init_class_and_name(
1923             sk,
1924             af_family_slock_key_strings[sk->sk_family],
1925             af_family_slock_keys + sk->sk_family,
1926             af_family_key_strings[sk->sk_family],
1927             af_family_keys + sk->sk_family);
1928 }
1929 
1930 /*
1931  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1932  * even temporarily, because of RCU lookups. sk_node should also be left as is.
1933  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1934  */
1935 static void sock_copy(struct sock *nsk, const struct sock *osk)
1936 {
1937     const struct proto *prot = READ_ONCE(osk->sk_prot);
1938 #ifdef CONFIG_SECURITY_NETWORK
1939     void *sptr = nsk->sk_security;
1940 #endif
1941 
1942     /* If we move sk_tx_queue_mapping out of the private section,
1943      * we must check if sk_tx_queue_clear() is called after
1944      * sock_copy() in sk_clone_lock().
1945      */
1946     BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
1947              offsetof(struct sock, sk_dontcopy_begin) ||
1948              offsetof(struct sock, sk_tx_queue_mapping) >=
1949              offsetof(struct sock, sk_dontcopy_end));
1950 
1951     memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1952 
1953     memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1954            prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1955 
1956 #ifdef CONFIG_SECURITY_NETWORK
1957     nsk->sk_security = sptr;
1958     security_sk_clone(osk, nsk);
1959 #endif
1960 }
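
/*
 * The copy-window technique above, sketched on a toy struct. The
 * zero-length marker arrays mirror how the sk_dontcopy_begin and
 * sk_dontcopy_end markers are declared; "toy" and its fields are
 * illustrative names only.
 */
#if 0 /* illustrative only */
struct toy {
	int a;			/* copied */
	int dontcopy_begin[0];
	spinlock_t lock;	/* preserved in the clone */
	int dontcopy_end[0];
	int b;			/* copied */
};

static void toy_copy(struct toy *dst, const struct toy *src, size_t obj_size)
{
	/* everything before the window ... */
	memcpy(dst, src, offsetof(struct toy, dontcopy_begin));
	/* ... and everything from the end of the window onwards */
	memcpy(&dst->dontcopy_end, &src->dontcopy_end,
	       obj_size - offsetof(struct toy, dontcopy_end));
}
#endif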
1961 
1962 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1963         int family)
1964 {
1965     struct sock *sk;
1966     struct kmem_cache *slab;
1967 
1968     slab = prot->slab;
1969     if (slab != NULL) {
1970         sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1971         if (!sk)
1972             return sk;
1973         if (want_init_on_alloc(priority))
1974             sk_prot_clear_nulls(sk, prot->obj_size);
1975     } else
1976         sk = kmalloc(prot->obj_size, priority);
1977 
1978     if (sk != NULL) {
1979         if (security_sk_alloc(sk, family, priority))
1980             goto out_free;
1981 
1982         if (!try_module_get(prot->owner))
1983             goto out_free_sec;
1984     }
1985 
1986     return sk;
1987 
1988 out_free_sec:
1989     security_sk_free(sk);
1990 out_free:
1991     if (slab != NULL)
1992         kmem_cache_free(slab, sk);
1993     else
1994         kfree(sk);
1995     return NULL;
1996 }
1997 
1998 static void sk_prot_free(struct proto *prot, struct sock *sk)
1999 {
2000     struct kmem_cache *slab;
2001     struct module *owner;
2002 
2003     owner = prot->owner;
2004     slab = prot->slab;
2005 
2006     cgroup_sk_free(&sk->sk_cgrp_data);
2007     mem_cgroup_sk_free(sk);
2008     security_sk_free(sk);
2009     if (slab != NULL)
2010         kmem_cache_free(slab, sk);
2011     else
2012         kfree(sk);
2013     module_put(owner);
2014 }
2015 
2016 /**
2017  *  sk_alloc - All socket objects are allocated here
2018  *  @net: the applicable net namespace
2019  *  @family: protocol family
2020  *  @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2021  *  @prot: struct proto associated with this new sock instance
2022  *  @kern: is this to be a kernel socket?
2023  */
2024 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
2025               struct proto *prot, int kern)
2026 {
2027     struct sock *sk;
2028 
2029     sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
2030     if (sk) {
2031         sk->sk_family = family;
2032         /*
2033          * See comment in struct sock definition to understand
2034          * why we need sk_prot_creator -acme
2035          */
2036         sk->sk_prot = sk->sk_prot_creator = prot;
2037         sk->sk_kern_sock = kern;
2038         sock_lock_init(sk);
2039         sk->sk_net_refcnt = kern ? 0 : 1;
2040         if (likely(sk->sk_net_refcnt)) {
2041             get_net_track(net, &sk->ns_tracker, priority);
2042             sock_inuse_add(net, 1);
2043         }
2044 
2045         sock_net_set(sk, net);
2046         refcount_set(&sk->sk_wmem_alloc, 1);
2047 
2048         mem_cgroup_sk_alloc(sk);
2049         cgroup_sk_alloc(&sk->sk_cgrp_data);
2050         sock_update_classid(&sk->sk_cgrp_data);
2051         sock_update_netprioidx(&sk->sk_cgrp_data);
2052         sk_tx_queue_clear(sk);
2053     }
2054 
2055     return sk;
2056 }
2057 EXPORT_SYMBOL(sk_alloc);
2058 
2059 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
2060  * grace period. This is the case for UDP sockets and TCP listeners.
2061  */
2062 static void __sk_destruct(struct rcu_head *head)
2063 {
2064     struct sock *sk = container_of(head, struct sock, sk_rcu);
2065     struct sk_filter *filter;
2066 
2067     if (sk->sk_destruct)
2068         sk->sk_destruct(sk);
2069 
2070     filter = rcu_dereference_check(sk->sk_filter,
2071                        refcount_read(&sk->sk_wmem_alloc) == 0);
2072     if (filter) {
2073         sk_filter_uncharge(sk, filter);
2074         RCU_INIT_POINTER(sk->sk_filter, NULL);
2075     }
2076 
2077     sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
2078 
2079 #ifdef CONFIG_BPF_SYSCALL
2080     bpf_sk_storage_free(sk);
2081 #endif
2082 
2083     if (atomic_read(&sk->sk_omem_alloc))
2084         pr_debug("%s: optmem leakage (%d bytes) detected\n",
2085              __func__, atomic_read(&sk->sk_omem_alloc));
2086 
2087     if (sk->sk_frag.page) {
2088         put_page(sk->sk_frag.page);
2089         sk->sk_frag.page = NULL;
2090     }
2091 
2092     /* We do not need to acquire sk->sk_peer_lock, we are the last user. */
2093     put_cred(sk->sk_peer_cred);
2094     put_pid(sk->sk_peer_pid);
2095 
2096     if (likely(sk->sk_net_refcnt))
2097         put_net_track(sock_net(sk), &sk->ns_tracker);
2098     sk_prot_free(sk->sk_prot_creator, sk);
2099 }
2100 
2101 void sk_destruct(struct sock *sk)
2102 {
2103     bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
2104 
2105     if (rcu_access_pointer(sk->sk_reuseport_cb)) {
2106         reuseport_detach_sock(sk);
2107         use_call_rcu = true;
2108     }
2109 
2110     if (use_call_rcu)
2111         call_rcu(&sk->sk_rcu, __sk_destruct);
2112     else
2113         __sk_destruct(&sk->sk_rcu);
2114 }
2115 
2116 static void __sk_free(struct sock *sk)
2117 {
2118     if (likely(sk->sk_net_refcnt))
2119         sock_inuse_add(sock_net(sk), -1);
2120 
2121     if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
2122         sock_diag_broadcast_destroy(sk);
2123     else
2124         sk_destruct(sk);
2125 }
2126 
2127 void sk_free(struct sock *sk)
2128 {
2129     /*
2130      * We subtract one from sk_wmem_alloc; if the result is not
2131      * zero, some packets are still in a tx queue and
2132      * sock_wfree() will call __sk_free(sk) later.
2133      */
2134     if (refcount_dec_and_test(&sk->sk_wmem_alloc))
2135         __sk_free(sk);
2136 }
2137 EXPORT_SYMBOL(sk_free);
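
/*
 * Lifetime arithmetic behind sk_free()/sock_wfree(), with an
 * illustrative truesize value:
 *
 *   sk_alloc()                  sk_wmem_alloc = 1
 *   skb_set_owner_w(skb, sk)    sk_wmem_alloc = 1 + 768
 *   sk_free(sk)                 sk_wmem_alloc = 768   (no free yet)
 *   kfree_skb() -> sock_wfree() sk_wmem_alloc = 0     -> __sk_free(sk)
 *
 * The initial 1 acts as the socket's own reference, so the socket
 * always outlives its last in-flight packet.
 */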
2138 
2139 static void sk_init_common(struct sock *sk)
2140 {
2141     skb_queue_head_init(&sk->sk_receive_queue);
2142     skb_queue_head_init(&sk->sk_write_queue);
2143     skb_queue_head_init(&sk->sk_error_queue);
2144 
2145     rwlock_init(&sk->sk_callback_lock);
2146     lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
2147             af_rlock_keys + sk->sk_family,
2148             af_family_rlock_key_strings[sk->sk_family]);
2149     lockdep_set_class_and_name(&sk->sk_write_queue.lock,
2150             af_wlock_keys + sk->sk_family,
2151             af_family_wlock_key_strings[sk->sk_family]);
2152     lockdep_set_class_and_name(&sk->sk_error_queue.lock,
2153             af_elock_keys + sk->sk_family,
2154             af_family_elock_key_strings[sk->sk_family]);
2155     lockdep_set_class_and_name(&sk->sk_callback_lock,
2156             af_callback_keys + sk->sk_family,
2157             af_family_clock_key_strings[sk->sk_family]);
2158 }
2159 
2160 /**
2161  *  sk_clone_lock - clone a socket, and lock its clone
2162  *  @sk: the socket to clone
2163  *  @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2164  *
2165  *  Caller must unlock socket even in error path (bh_unlock_sock(newsk))
2166  */
2167 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
2168 {
2169     struct proto *prot = READ_ONCE(sk->sk_prot);
2170     struct sk_filter *filter;
2171     bool is_charged = true;
2172     struct sock *newsk;
2173 
2174     newsk = sk_prot_alloc(prot, priority, sk->sk_family);
2175     if (!newsk)
2176         goto out;
2177 
2178     sock_copy(newsk, sk);
2179 
2180     newsk->sk_prot_creator = prot;
2181 
2182     /* SANITY */
2183     if (likely(newsk->sk_net_refcnt)) {
2184         get_net_track(sock_net(newsk), &newsk->ns_tracker, priority);
2185         sock_inuse_add(sock_net(newsk), 1);
2186     }
2187     sk_node_init(&newsk->sk_node);
2188     sock_lock_init(newsk);
2189     bh_lock_sock(newsk);
2190     newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
2191     newsk->sk_backlog.len = 0;
2192 
2193     atomic_set(&newsk->sk_rmem_alloc, 0);
2194 
2195     /* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
2196     refcount_set(&newsk->sk_wmem_alloc, 1);
2197 
2198     atomic_set(&newsk->sk_omem_alloc, 0);
2199     sk_init_common(newsk);
2200 
2201     newsk->sk_dst_cache = NULL;
2202     newsk->sk_dst_pending_confirm = 0;
2203     newsk->sk_wmem_queued   = 0;
2204     newsk->sk_forward_alloc = 0;
2205     newsk->sk_reserved_mem  = 0;
2206     atomic_set(&newsk->sk_drops, 0);
2207     newsk->sk_send_head = NULL;
2208     newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
2209     atomic_set(&newsk->sk_zckey, 0);
2210 
2211     sock_reset_flag(newsk, SOCK_DONE);
2212 
2213     /* sk->sk_memcg will be populated at accept() time */
2214     newsk->sk_memcg = NULL;
2215 
2216     cgroup_sk_clone(&newsk->sk_cgrp_data);
2217 
2218     rcu_read_lock();
2219     filter = rcu_dereference(sk->sk_filter);
2220     if (filter != NULL)
2221         /* though it's an empty new sock, the charging may fail
2222          * if sysctl_optmem_max was changed between the creation
2223          * of the original socket and this clone
2224          */
2225         is_charged = sk_filter_charge(newsk, filter);
2226     RCU_INIT_POINTER(newsk->sk_filter, filter);
2227     rcu_read_unlock();
2228 
2229     if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
2230         /* We need to make sure that we don't uncharge the new
2231          * socket if we couldn't charge it in the first place
2232          * as otherwise we uncharge the parent's filter.
2233          */
2234         if (!is_charged)
2235             RCU_INIT_POINTER(newsk->sk_filter, NULL);
2236         sk_free_unlock_clone(newsk);
2237         newsk = NULL;
2238         goto out;
2239     }
2240     RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
2241 
2242     if (bpf_sk_storage_clone(sk, newsk)) {
2243         sk_free_unlock_clone(newsk);
2244         newsk = NULL;
2245         goto out;
2246     }
2247 
2248     /* Clear sk_user_data if parent had the pointer tagged
2249      * as not suitable for copying when cloning.
2250      */
2251     if (sk_user_data_is_nocopy(newsk))
2252         newsk->sk_user_data = NULL;
2253 
2254     newsk->sk_err      = 0;
2255     newsk->sk_err_soft = 0;
2256     newsk->sk_priority = 0;
2257     newsk->sk_incoming_cpu = raw_smp_processor_id();
2258 
2259     /* Before updating sk_refcnt, we must commit prior changes to memory
2260      * (Documentation/RCU/rculist_nulls.rst for details)
2261      */
2262     smp_wmb();
2263     refcount_set(&newsk->sk_refcnt, 2);
2264 
2265     /* Increment the counter in the same struct proto as the master
2266      * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
2267      * is the same as sk->sk_prot->socks, as this field was copied
2268      * with memcpy).
2269      *
2270      * This _changes_ the previous behaviour, where
2271      * tcp_create_openreq_child always incremented the
2272      * equivalent of tcp_prot->socks (inet_sock_nr), so this has
2273      * to be taken into account in all callers. -acme
2274      */
2275     sk_refcnt_debug_inc(newsk);
2276     sk_set_socket(newsk, NULL);
2277     sk_tx_queue_clear(newsk);
2278     RCU_INIT_POINTER(newsk->sk_wq, NULL);
2279 
2280     if (newsk->sk_prot->sockets_allocated)
2281         sk_sockets_allocated_inc(newsk);
2282 
2283     if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
2284         net_enable_timestamp();
2285 out:
2286     return newsk;
2287 }
2288 EXPORT_SYMBOL_GPL(sk_clone_lock);
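
/*
 * A minimal sketch of the caller contract, loosely modelled on
 * accept-time cloning; "example_clone_child" is an illustrative name,
 * not a real helper:
 */
#if 0 /* illustrative only */
static struct sock *example_clone_child(const struct sock *parent)
{
	struct sock *child = sk_clone_lock(parent, GFP_ATOMIC);

	if (!child)
		return NULL;
	/* ... protocol-specific child setup runs here, with the clone
	 * bh-locked and sk_refcnt already set to 2 ... */
	bh_unlock_sock(child);	/* mandatory, even on caller error paths */
	return child;
}
#endif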
2289 
2290 void sk_free_unlock_clone(struct sock *sk)
2291 {
2292     /* It is still a raw copy of the parent, so invalidate the
2293      * destructor and do a plain sk_free() */
2294     sk->sk_destruct = NULL;
2295     bh_unlock_sock(sk);
2296     sk_free(sk);
2297 }
2298 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
2299 
2300 static void sk_trim_gso_size(struct sock *sk)
2301 {
2302     if (sk->sk_gso_max_size <= GSO_LEGACY_MAX_SIZE)
2303         return;
2304 #if IS_ENABLED(CONFIG_IPV6)
2305     if (sk->sk_family == AF_INET6 &&
2306         sk_is_tcp(sk) &&
2307         !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr))
2308         return;
2309 #endif
2310     sk->sk_gso_max_size = GSO_LEGACY_MAX_SIZE;
2311 }
2312 
2313 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
2314 {
2315     u32 max_segs = 1;
2316 
2317     sk_dst_set(sk, dst);
2318     sk->sk_route_caps = dst->dev->features;
2319     if (sk_is_tcp(sk))
2320         sk->sk_route_caps |= NETIF_F_GSO;
2321     if (sk->sk_route_caps & NETIF_F_GSO)
2322         sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
2323     if (unlikely(sk->sk_gso_disabled))
2324         sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2325     if (sk_can_gso(sk)) {
2326         if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
2327             sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2328         } else {
2329             sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
2330             /* pairs with the WRITE_ONCE() in netif_set_gso_max_size() */
2331             sk->sk_gso_max_size = READ_ONCE(dst->dev->gso_max_size);
2332             sk_trim_gso_size(sk);
2333             sk->sk_gso_max_size -= (MAX_TCP_HEADER + 1);
2334             /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
2335             max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1);
2336         }
2337     }
2338     sk->sk_gso_max_segs = max_segs;
2339 }
2340 EXPORT_SYMBOL_GPL(sk_setup_caps);
2341 
2342 /*
2343  *  Simple resource managers for sockets.
2344  */
2345 
2346 
2347 /*
2348  * Write buffer destructor automatically called from kfree_skb.
2349  */
2350 void sock_wfree(struct sk_buff *skb)
2351 {
2352     struct sock *sk = skb->sk;
2353     unsigned int len = skb->truesize;
2354     bool free;
2355 
2356     if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
2357         if (sock_flag(sk, SOCK_RCU_FREE) &&
2358             sk->sk_write_space == sock_def_write_space) {
2359             rcu_read_lock();
2360             free = refcount_sub_and_test(len, &sk->sk_wmem_alloc);
2361             sock_def_write_space_wfree(sk);
2362             rcu_read_unlock();
2363             if (unlikely(free))
2364                 __sk_free(sk);
2365             return;
2366         }
2367 
2368         /*
2369          * Keep a reference on sk_wmem_alloc; it will be released
2370          * after the sk_write_space() call
2371          */
2372         WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
2373         sk->sk_write_space(sk);
2374         len = 1;
2375     }
2376     /*
2377      * if sk_wmem_alloc reaches 0, we must finish what sk_free()
2378      * could not do because of in-flight packets
2379      */
2380     if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2381         __sk_free(sk);
2382 }
2383 EXPORT_SYMBOL(sock_wfree);
2384 
2385 /* This variant of sock_wfree() is used by TCP,
2386  * since it sets SOCK_USE_WRITE_QUEUE.
2387  */
2388 void __sock_wfree(struct sk_buff *skb)
2389 {
2390     struct sock *sk = skb->sk;
2391 
2392     if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2393         __sk_free(sk);
2394 }
2395 
2396 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
2397 {
2398     skb_orphan(skb);
2399     skb->sk = sk;
2400 #ifdef CONFIG_INET
2401     if (unlikely(!sk_fullsock(sk))) {
2402         skb->destructor = sock_edemux;
2403         sock_hold(sk);
2404         return;
2405     }
2406 #endif
2407     skb->destructor = sock_wfree;
2408     skb_set_hash_from_sk(skb, sk);
2409     /*
2410      * We used to take a refcount on sk, but the following operation
2411      * is enough to guarantee sk_free() won't free this sock until
2412      * all in-flight packets are completed
2413      */
2414     refcount_add(skb->truesize, &sk->sk_wmem_alloc);
2415 }
2416 EXPORT_SYMBOL(skb_set_owner_w);
2417 
2418 static bool can_skb_orphan_partial(const struct sk_buff *skb)
2419 {
2420 #ifdef CONFIG_TLS_DEVICE
2421     /* Drivers depend on in-order delivery for crypto offload,
2422      * partial orphan breaks out-of-order-OK logic.
2423      */
2424     if (skb->decrypted)
2425         return false;
2426 #endif
2427     return (skb->destructor == sock_wfree ||
2428         (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2429 }
2430 
2431 /* This helper is used by netem, as it can hold packets in its
2432  * delay queue. We want to allow the owner socket to send more
2433  * packets, as if they were already TX completed by a typical driver.
2434  * But we also want to keep skb->sk set because some packet schedulers
2435  * rely on it (sch_fq for example).
2436  */
2437 void skb_orphan_partial(struct sk_buff *skb)
2438 {
2439     if (skb_is_tcp_pure_ack(skb))
2440         return;
2441 
2442     if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
2443         return;
2444 
2445     skb_orphan(skb);
2446 }
2447 EXPORT_SYMBOL(skb_orphan_partial);
2448 
2449 /*
2450  * Read buffer destructor automatically called from kfree_skb.
2451  */
2452 void sock_rfree(struct sk_buff *skb)
2453 {
2454     struct sock *sk = skb->sk;
2455     unsigned int len = skb->truesize;
2456 
2457     atomic_sub(len, &sk->sk_rmem_alloc);
2458     sk_mem_uncharge(sk, len);
2459 }
2460 EXPORT_SYMBOL(sock_rfree);
2461 
2462 /*
2463  * Buffer destructor for skbs that are not used directly in read or write
2464  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2465  */
2466 void sock_efree(struct sk_buff *skb)
2467 {
2468     sock_put(skb->sk);
2469 }
2470 EXPORT_SYMBOL(sock_efree);
2471 
2472 /* Buffer destructor for prefetch/receive path where reference count may
2473  * not be held, e.g. for listen sockets.
2474  */
2475 #ifdef CONFIG_INET
2476 void sock_pfree(struct sk_buff *skb)
2477 {
2478     if (sk_is_refcounted(skb->sk))
2479         sock_gen_put(skb->sk);
2480 }
2481 EXPORT_SYMBOL(sock_pfree);
2482 #endif /* CONFIG_INET */
2483 
2484 kuid_t sock_i_uid(struct sock *sk)
2485 {
2486     kuid_t uid;
2487 
2488     read_lock_bh(&sk->sk_callback_lock);
2489     uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2490     read_unlock_bh(&sk->sk_callback_lock);
2491     return uid;
2492 }
2493 EXPORT_SYMBOL(sock_i_uid);
2494 
2495 unsigned long sock_i_ino(struct sock *sk)
2496 {
2497     unsigned long ino;
2498 
2499     read_lock_bh(&sk->sk_callback_lock);
2500     ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2501     read_unlock_bh(&sk->sk_callback_lock);
2502     return ino;
2503 }
2504 EXPORT_SYMBOL(sock_i_ino);
2505 
2506 /*
2507  * Allocate a skb from the socket's send buffer.
2508  */
2509 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2510                  gfp_t priority)
2511 {
2512     if (force ||
2513         refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2514         struct sk_buff *skb = alloc_skb(size, priority);
2515 
2516         if (skb) {
2517             skb_set_owner_w(skb, sk);
2518             return skb;
2519         }
2520     }
2521     return NULL;
2522 }
2523 EXPORT_SYMBOL(sock_wmalloc);
2524 
2525 static void sock_ofree(struct sk_buff *skb)
2526 {
2527     struct sock *sk = skb->sk;
2528 
2529     atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2530 }
2531 
2532 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2533                  gfp_t priority)
2534 {
2535     struct sk_buff *skb;
2536 
2537     /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2538     if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2539         READ_ONCE(sysctl_optmem_max))
2540         return NULL;
2541 
2542     skb = alloc_skb(size, priority);
2543     if (!skb)
2544         return NULL;
2545 
2546     atomic_add(skb->truesize, &sk->sk_omem_alloc);
2547     skb->sk = sk;
2548     skb->destructor = sock_ofree;
2549     return skb;
2550 }
2551 
2552 /*
2553  * Allocate a memory block from the socket's option memory buffer.
2554  */
2555 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2556 {
2557     int optmem_max = READ_ONCE(sysctl_optmem_max);
2558 
2559     if ((unsigned int)size <= optmem_max &&
2560         atomic_read(&sk->sk_omem_alloc) + size < optmem_max) {
2561         void *mem;
2562         /* First do the add, to avoid the race if kmalloc
2563          * might sleep.
2564          */
2565         atomic_add(size, &sk->sk_omem_alloc);
2566         mem = kmalloc(size, priority);
2567         if (mem)
2568             return mem;
2569         atomic_sub(size, &sk->sk_omem_alloc);
2570     }
2571     return NULL;
2572 }
2573 EXPORT_SYMBOL(sock_kmalloc);
2574 
2575 /* Free an option memory block. Note, we actually want the inline
2576  * here as this allows gcc to detect the nullify and fold away the
2577  * condition entirely.
2578  */
2579 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2580                   const bool nullify)
2581 {
2582     if (WARN_ON_ONCE(!mem))
2583         return;
2584     if (nullify)
2585         kfree_sensitive(mem);
2586     else
2587         kfree(mem);
2588     atomic_sub(size, &sk->sk_omem_alloc);
2589 }
2590 
2591 void sock_kfree_s(struct sock *sk, void *mem, int size)
2592 {
2593     __sock_kfree_s(sk, mem, size, false);
2594 }
2595 EXPORT_SYMBOL(sock_kfree_s);
2596 
2597 void sock_kzfree_s(struct sock *sk, void *mem, int size)
2598 {
2599     __sock_kfree_s(sk, mem, size, true);
2600 }
2601 EXPORT_SYMBOL(sock_kzfree_s);
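
/*
 * A minimal sketch of the intended pairing: sock_kfree_s() or
 * sock_kzfree_s() must be passed the same size that sock_kmalloc()
 * charged, since sk_omem_alloc is a plain byte counter. The function
 * and the "example_opt" blob are illustrative, not a real helper.
 */
#if 0 /* illustrative only */
static int example_set_opt(struct sock *sk, sockptr_t src, int size)
{
	void *example_opt = sock_kmalloc(sk, size, GFP_KERNEL);

	if (!example_opt)
		return -ENOBUFS;
	if (copy_from_sockptr(example_opt, src, size)) {
		sock_kfree_s(sk, example_opt, size);	/* uncharges size bytes */
		return -EFAULT;
	}
	/* ... publish example_opt; free it later with the same size ... */
	return 0;
}
#endif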
2602 
2603 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2604    I think these locks should be removed for datagram sockets.
2605  */
2606 static long sock_wait_for_wmem(struct sock *sk, long timeo)
2607 {
2608     DEFINE_WAIT(wait);
2609 
2610     sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2611     for (;;) {
2612         if (!timeo)
2613             break;
2614         if (signal_pending(current))
2615             break;
2616         set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2617         prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2618         if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
2619             break;
2620         if (sk->sk_shutdown & SEND_SHUTDOWN)
2621             break;
2622         if (sk->sk_err)
2623             break;
2624         timeo = schedule_timeout(timeo);
2625     }
2626     finish_wait(sk_sleep(sk), &wait);
2627     return timeo;
2628 }
2629 
2630 
2631 /*
2632  *  Generic send/receive buffer handlers
2633  */
2634 
2635 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2636                      unsigned long data_len, int noblock,
2637                      int *errcode, int max_page_order)
2638 {
2639     struct sk_buff *skb;
2640     long timeo;
2641     int err;
2642 
2643     timeo = sock_sndtimeo(sk, noblock);
2644     for (;;) {
2645         err = sock_error(sk);
2646         if (err != 0)
2647             goto failure;
2648 
2649         err = -EPIPE;
2650         if (sk->sk_shutdown & SEND_SHUTDOWN)
2651             goto failure;
2652 
2653         if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
2654             break;
2655 
2656         sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2657         set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2658         err = -EAGAIN;
2659         if (!timeo)
2660             goto failure;
2661         if (signal_pending(current))
2662             goto interrupted;
2663         timeo = sock_wait_for_wmem(sk, timeo);
2664     }
2665     skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2666                    errcode, sk->sk_allocation);
2667     if (skb)
2668         skb_set_owner_w(skb, sk);
2669     return skb;
2670 
2671 interrupted:
2672     err = sock_intr_errno(timeo);
2673 failure:
2674     *errcode = err;
2675     return NULL;
2676 }
2677 EXPORT_SYMBOL(sock_alloc_send_pskb);
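
/*
 * A minimal datagram-side sketch: reserve a small linear area for
 * headers and let the payload land in page frags. The 128-byte header
 * reservation and the function name are illustrative, not taken from
 * a real protocol.
 */
#if 0 /* illustrative only */
static struct sk_buff *example_alloc_dgram(struct sock *sk, size_t len,
					    int noblock, int *err)
{
	/* Blocks up to sk_sndtimeo unless noblock; on success the skb is
	 * already charged to the socket via skb_set_owner_w(). */
	return sock_alloc_send_pskb(sk, 128, len, noblock, err,
				    PAGE_ALLOC_COSTLY_ORDER);
}
#endif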
2678 
2679 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2680              struct sockcm_cookie *sockc)
2681 {
2682     u32 tsflags;
2683 
2684     switch (cmsg->cmsg_type) {
2685     case SO_MARK:
2686         if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
2687             !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2688             return -EPERM;
2689         if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2690             return -EINVAL;
2691         sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2692         break;
2693     case SO_TIMESTAMPING_OLD:
2694         if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2695             return -EINVAL;
2696 
2697         tsflags = *(u32 *)CMSG_DATA(cmsg);
2698         if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2699             return -EINVAL;
2700 
2701         sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2702         sockc->tsflags |= tsflags;
2703         break;
2704     case SCM_TXTIME:
2705         if (!sock_flag(sk, SOCK_TXTIME))
2706             return -EINVAL;
2707         if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2708             return -EINVAL;
2709         sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2710         break;
2711     /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2712     case SCM_RIGHTS:
2713     case SCM_CREDENTIALS:
2714         break;
2715     default:
2716         return -EINVAL;
2717     }
2718     return 0;
2719 }
2720 EXPORT_SYMBOL(__sock_cmsg_send);
2721 
2722 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2723            struct sockcm_cookie *sockc)
2724 {
2725     struct cmsghdr *cmsg;
2726     int ret;
2727 
2728     for_each_cmsghdr(cmsg, msg) {
2729         if (!CMSG_OK(msg, cmsg))
2730             return -EINVAL;
2731         if (cmsg->cmsg_level != SOL_SOCKET)
2732             continue;
2733         ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2734         if (ret)
2735             return ret;
2736     }
2737     return 0;
2738 }
2739 EXPORT_SYMBOL(sock_cmsg_send);
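
/*
 * A standalone userspace sketch of feeding the SO_MARK branch of
 * __sock_cmsg_send() above: attach an SO_MARK ancillary u32 to a single
 * sendmsg() call. Needs CAP_NET_ADMIN or CAP_NET_RAW, as enforced
 * above. Illustrative only, not part of this file.
 */
#if 0 /* illustrative only */
#include <stdint.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>

static ssize_t send_with_mark(int fd, const void *buf, size_t len,
			      uint32_t mark)
{
	char control[CMSG_SPACE(sizeof(mark))] = { 0 };
	struct iovec iov = { .iov_base = (void *)buf, .iov_len = len };
	struct msghdr msg = {
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = control, .msg_controllen = sizeof(control),
	};
	struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);

	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type = SO_MARK;
	/* must be exactly CMSG_LEN(sizeof(u32)), see the check above */
	cmsg->cmsg_len = CMSG_LEN(sizeof(mark));
	memcpy(CMSG_DATA(cmsg), &mark, sizeof(mark));
	return sendmsg(fd, &msg, 0);
}
#endif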
2740 
2741 static void sk_enter_memory_pressure(struct sock *sk)
2742 {
2743     if (!sk->sk_prot->enter_memory_pressure)
2744         return;
2745 
2746     sk->sk_prot->enter_memory_pressure(sk);
2747 }
2748 
2749 static void sk_leave_memory_pressure(struct sock *sk)
2750 {
2751     if (sk->sk_prot->leave_memory_pressure) {
2752         sk->sk_prot->leave_memory_pressure(sk);
2753     } else {
2754         unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2755 
2756         if (memory_pressure && READ_ONCE(*memory_pressure))
2757             WRITE_ONCE(*memory_pressure, 0);
2758     }
2759 }
2760 
2761 DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
2762 
2763 /**
2764  * skb_page_frag_refill - check that a page_frag contains enough room
2765  * @sz: minimum size of the fragment we want to get
2766  * @pfrag: pointer to page_frag
2767  * @gfp: priority for memory allocation
2768  *
2769  * Note: While this allocator tries to use high order pages, there is
2770  * no guarantee that allocations succeed. Therefore, @sz MUST be
2771  * less than or equal to PAGE_SIZE.
2772  */
2773 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2774 {
2775     if (pfrag->page) {
2776         if (page_ref_count(pfrag->page) == 1) {
2777             pfrag->offset = 0;
2778             return true;
2779         }
2780         if (pfrag->offset + sz <= pfrag->size)
2781             return true;
2782         put_page(pfrag->page);
2783     }
2784 
2785     pfrag->offset = 0;
2786     if (SKB_FRAG_PAGE_ORDER &&
2787         !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
2788         /* Avoid direct reclaim but allow kswapd to wake */
2789         pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2790                       __GFP_COMP | __GFP_NOWARN |
2791                       __GFP_NORETRY,
2792                       SKB_FRAG_PAGE_ORDER);
2793         if (likely(pfrag->page)) {
2794             pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2795             return true;
2796         }
2797     }
2798     pfrag->page = alloc_page(gfp);
2799     if (likely(pfrag->page)) {
2800         pfrag->size = PAGE_SIZE;
2801         return true;
2802     }
2803     return false;
2804 }
2805 EXPORT_SYMBOL(skb_page_frag_refill);
2806 
2807 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2808 {
2809     if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2810         return true;
2811 
2812     sk_enter_memory_pressure(sk);
2813     sk_stream_moderate_sndbuf(sk);
2814     return false;
2815 }
2816 EXPORT_SYMBOL(sk_page_frag_refill);
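
/*
 * A minimal sketch of the fill/copy/advance pattern used by callers,
 * assuming the caller holds the socket lock and uses the per-socket
 * frag returned by sk_page_frag(); "example_append" is an illustrative
 * name, not a real helper.
 */
#if 0 /* illustrative only */
static int example_append(struct sock *sk, const void *src, unsigned int len)
{
	struct page_frag *pfrag = sk_page_frag(sk);

	if (!sk_page_frag_refill(sk, pfrag))
		return -ENOMEM;	/* memory pressure was entered for us */
	len = min_t(unsigned int, len, pfrag->size - pfrag->offset);
	memcpy(page_address(pfrag->page) + pfrag->offset, src, len);
	pfrag->offset += len;	/* consume the bytes we used */
	return len;
}
#endif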
2817 
2818 void __lock_sock(struct sock *sk)
2819     __releases(&sk->sk_lock.slock)
2820     __acquires(&sk->sk_lock.slock)
2821 {
2822     DEFINE_WAIT(wait);
2823 
2824     for (;;) {
2825         prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2826                     TASK_UNINTERRUPTIBLE);
2827         spin_unlock_bh(&sk->sk_lock.slock);
2828         schedule();
2829         spin_lock_bh(&sk->sk_lock.slock);
2830         if (!sock_owned_by_user(sk))
2831             break;
2832     }
2833     finish_wait(&sk->sk_lock.wq, &wait);
2834 }
2835 
2836 void __release_sock(struct sock *sk)
2837     __releases(&sk->sk_lock.slock)
2838     __acquires(&sk->sk_lock.slock)
2839 {
2840     struct sk_buff *skb, *next;
2841 
2842     while ((skb = sk->sk_backlog.head) != NULL) {
2843         sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2844 
2845         spin_unlock_bh(&sk->sk_lock.slock);
2846 
2847         do {
2848             next = skb->next;
2849             prefetch(next);
2850             DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb));
2851             skb_mark_not_on_list(skb);
2852             sk_backlog_rcv(sk, skb);
2853 
2854             cond_resched();
2855 
2856             skb = next;
2857         } while (skb != NULL);
2858 
2859         spin_lock_bh(&sk->sk_lock.slock);
2860     }
2861 
2862     /*
2863      * Doing the zeroing here guarantees we cannot loop forever
2864      * while a wild producer attempts to flood us.
2865      */
2866     sk->sk_backlog.len = 0;
2867 }
2868 
2869 void __sk_flush_backlog(struct sock *sk)
2870 {
2871     spin_lock_bh(&sk->sk_lock.slock);
2872     __release_sock(sk);
2873     spin_unlock_bh(&sk->sk_lock.slock);
2874 }
2875 EXPORT_SYMBOL_GPL(__sk_flush_backlog);
2876 
2877 /**
2878  * sk_wait_data - wait for data to arrive at sk_receive_queue
2879  * @sk:    sock to wait on
2880  * @timeo: for how long
2881  * @skb:   last skb seen on sk_receive_queue
2882  *
2883  * Now socket state including sk->sk_err is changed only under lock,
2884  * hence we may omit checks after joining wait queue.
2885  * We check receive queue before schedule() only as optimization;
2886  * it is very likely that release_sock() added new data.
2887  */
2888 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2889 {
2890     DEFINE_WAIT_FUNC(wait, woken_wake_function);
2891     int rc;
2892 
2893     add_wait_queue(sk_sleep(sk), &wait);
2894     sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2895     rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2896     sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2897     remove_wait_queue(sk_sleep(sk), &wait);
2898     return rc;
2899 }
2900 EXPORT_SYMBOL(sk_wait_data);
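
/*
 * A minimal receive-side sketch: the caller owns the socket lock, so
 * the queue check cannot race with state changes, and sk_wait_event()
 * inside sk_wait_data() drops the lock only while sleeping.
 * "example_wait_for_data" is an illustrative name.
 */
#if 0 /* illustrative only */
static int example_wait_for_data(struct sock *sk, long *timeo)
{
	while (skb_queue_empty(&sk->sk_receive_queue)) {
		if (sk->sk_err)
			return sock_error(sk);
		if (!*timeo)
			return -EAGAIN;
		if (signal_pending(current))
			return sock_intr_errno(*timeo);
		/* releases the lock while sleeping, reacquires it after */
		sk_wait_data(sk, timeo, NULL);
	}
	return 0;
}
#endif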
2901 
2902 /**
2903  *  __sk_mem_raise_allocated - increase memory_allocated
2904  *  @sk: socket
2905  *  @size: memory size to allocate
2906  *  @amt: pages to allocate
2907  *  @kind: allocation type
2908  *
2909  *  Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2910  */
2911 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2912 {
2913     bool memcg_charge = mem_cgroup_sockets_enabled && sk->sk_memcg;
2914     struct proto *prot = sk->sk_prot;
2915     bool charged = true;
2916     long allocated;
2917 
2918     sk_memory_allocated_add(sk, amt);
2919     allocated = sk_memory_allocated(sk);
2920     if (memcg_charge &&
2921         !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt,
2922                         gfp_memcg_charge())))
2923         goto suppress_allocation;
2924 
2925     /* Under limit. */
2926     if (allocated <= sk_prot_mem_limits(sk, 0)) {
2927         sk_leave_memory_pressure(sk);
2928         return 1;
2929     }
2930 
2931     /* Under pressure. */
2932     if (allocated > sk_prot_mem_limits(sk, 1))
2933         sk_enter_memory_pressure(sk);
2934 
2935     /* Over hard limit. */
2936     if (allocated > sk_prot_mem_limits(sk, 2))
2937         goto suppress_allocation;
2938 
2939     /* guarantee minimum buffer size under pressure */
2940     if (kind == SK_MEM_RECV) {
2941         if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
2942             return 1;
2943 
2944     } else { /* SK_MEM_SEND */
2945         int wmem0 = sk_get_wmem0(sk, prot);
2946 
2947         if (sk->sk_type == SOCK_STREAM) {
2948             if (sk->sk_wmem_queued < wmem0)
2949                 return 1;
2950         } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
2951                 return 1;
2952         }
2953     }
2954 
2955     if (sk_has_memory_pressure(sk)) {
2956         u64 alloc;
2957 
2958         if (!sk_under_memory_pressure(sk))
2959             return 1;
2960         alloc = sk_sockets_allocated_read_positive(sk);
2961         if (sk_prot_mem_limits(sk, 2) > alloc *
2962             sk_mem_pages(sk->sk_wmem_queued +
2963                  atomic_read(&sk->sk_rmem_alloc) +
2964                  sk->sk_forward_alloc))
2965             return 1;
2966     }
2967 
2968 suppress_allocation:
2969 
2970     if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2971         sk_stream_moderate_sndbuf(sk);
2972 
2973         /* Fail only if socket is _under_ its sndbuf.
2974          * In this case we cannot block, so that we have to fail.
2975          */
2976         if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
2977             /* Force charge with __GFP_NOFAIL */
2978             if (memcg_charge && !charged) {
2979                 mem_cgroup_charge_skmem(sk->sk_memcg, amt,
2980                     gfp_memcg_charge() | __GFP_NOFAIL);
2981             }
2982             return 1;
2983         }
2984     }
2985 
2986     if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
2987         trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
2988 
2989     sk_memory_allocated_sub(sk, amt);
2990 
2991     if (memcg_charge && charged)
2992         mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2993 
2994     return 0;
2995 }
2996 
2997 /**
2998  *  __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2999  *  @sk: socket
3000  *  @size: memory size to allocate
3001  *  @kind: allocation type
3002  *
3003  *  If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
3004  *  rmem allocation. This function assumes that protocols which have
3005  *  memory_pressure use sk_wmem_queued as write buffer accounting.
3006  */
3007 int __sk_mem_schedule(struct sock *sk, int size, int kind)
3008 {
3009     int ret, amt = sk_mem_pages(size);
3010 
3011     sk->sk_forward_alloc += amt << PAGE_SHIFT;
3012     ret = __sk_mem_raise_allocated(sk, size, amt, kind);
3013     if (!ret)
3014         sk->sk_forward_alloc -= amt << PAGE_SHIFT;
3015     return ret;
3016 }
3017 EXPORT_SYMBOL(__sk_mem_schedule);
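
/*
 * Worked example of the accounting above, assuming PAGE_SIZE == 4096:
 * __sk_mem_schedule(sk, 6000, SK_MEM_RECV) computes
 * amt = sk_mem_pages(6000) = 2, optimistically adds
 * 2 << PAGE_SHIFT = 8192 bytes to sk_forward_alloc, and rolls the
 * addition back if __sk_mem_raise_allocated() refuses the two pages.
 */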
3018 
3019 /**
3020  *  __sk_mem_reduce_allocated - reclaim memory_allocated
3021  *  @sk: socket
3022  *  @amount: number of quanta
3023  *
3024  *  Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
3025  */
3026 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
3027 {
3028     sk_memory_allocated_sub(sk, amount);
3029 
3030     if (mem_cgroup_sockets_enabled && sk->sk_memcg)
3031         mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
3032 
3033     if (sk_under_memory_pressure(sk) &&
3034         (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
3035         sk_leave_memory_pressure(sk);
3036 }
3037 
3038 /**
3039  *  __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
3040  *  @sk: socket
3041  *  @amount: number of bytes (rounded down to a PAGE_SIZE multiple)
3042  */
3043 void __sk_mem_reclaim(struct sock *sk, int amount)
3044 {
3045     amount >>= PAGE_SHIFT;
3046     sk->sk_forward_alloc -= amount << PAGE_SHIFT;
3047     __sk_mem_reduce_allocated(sk, amount);
3048 }
3049 EXPORT_SYMBOL(__sk_mem_reclaim);
3050 
3051 int sk_set_peek_off(struct sock *sk, int val)
3052 {
3053     sk->sk_peek_off = val;
3054     return 0;
3055 }
3056 EXPORT_SYMBOL_GPL(sk_set_peek_off);
3057 
3058 /*
3059  * Set of default routines for initialising struct proto_ops when
3060  * the protocol does not support a particular function. In certain
3061  * cases where it makes no sense for a protocol to have a "do nothing"
3062  * function, some default processing is provided.
3063  */
3064 
3065 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
3066 {
3067     return -EOPNOTSUPP;
3068 }
3069 EXPORT_SYMBOL(sock_no_bind);
3070 
3071 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
3072             int len, int flags)
3073 {
3074     return -EOPNOTSUPP;
3075 }
3076 EXPORT_SYMBOL(sock_no_connect);
3077 
3078 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
3079 {
3080     return -EOPNOTSUPP;
3081 }
3082 EXPORT_SYMBOL(sock_no_socketpair);
3083 
3084 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
3085            bool kern)
3086 {
3087     return -EOPNOTSUPP;
3088 }
3089 EXPORT_SYMBOL(sock_no_accept);
3090 
3091 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
3092             int peer)
3093 {
3094     return -EOPNOTSUPP;
3095 }
3096 EXPORT_SYMBOL(sock_no_getname);
3097 
3098 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3099 {
3100     return -EOPNOTSUPP;
3101 }
3102 EXPORT_SYMBOL(sock_no_ioctl);
3103 
3104 int sock_no_listen(struct socket *sock, int backlog)
3105 {
3106     return -EOPNOTSUPP;
3107 }
3108 EXPORT_SYMBOL(sock_no_listen);
3109 
3110 int sock_no_shutdown(struct socket *sock, int how)
3111 {
3112     return -EOPNOTSUPP;
3113 }
3114 EXPORT_SYMBOL(sock_no_shutdown);
3115 
3116 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
3117 {
3118     return -EOPNOTSUPP;
3119 }
3120 EXPORT_SYMBOL(sock_no_sendmsg);
3121 
3122 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
3123 {
3124     return -EOPNOTSUPP;
3125 }
3126 EXPORT_SYMBOL(sock_no_sendmsg_locked);
3127 
3128 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
3129             int flags)
3130 {
3131     return -EOPNOTSUPP;
3132 }
3133 EXPORT_SYMBOL(sock_no_recvmsg);
3134 
3135 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
3136 {
3137     /* Mirror missing mmap method error code */
3138     return -ENODEV;
3139 }
3140 EXPORT_SYMBOL(sock_no_mmap);
3141 
3142 /*
3143  * When a file is received (via SCM_RIGHTS, etc), we must bump the
3144  * various sock-based usage counts.
3145  */
3146 void __receive_sock(struct file *file)
3147 {
3148     struct socket *sock;
3149 
3150     sock = sock_from_file(file);
3151     if (sock) {
3152         sock_update_netprioidx(&sock->sk->sk_cgrp_data);
3153         sock_update_classid(&sock->sk->sk_cgrp_data);
3154     }
3155 }
3156 
3157 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
3158 {
3159     ssize_t res;
3160     struct msghdr msg = {.msg_flags = flags};
3161     struct kvec iov;
3162     char *kaddr = kmap(page);
3163     iov.iov_base = kaddr + offset;
3164     iov.iov_len = size;
3165     res = kernel_sendmsg(sock, &msg, &iov, 1, size);
3166     kunmap(page);
3167     return res;
3168 }
3169 EXPORT_SYMBOL(sock_no_sendpage);
3170 
3171 ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
3172                 int offset, size_t size, int flags)
3173 {
3174     ssize_t res;
3175     struct msghdr msg = {.msg_flags = flags};
3176     struct kvec iov;
3177     char *kaddr = kmap(page);
3178 
3179     iov.iov_base = kaddr + offset;
3180     iov.iov_len = size;
3181     res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
3182     kunmap(page);
3183     return res;
3184 }
3185 EXPORT_SYMBOL(sock_no_sendpage_locked);
3186 
3187 /*
3188  *  Default Socket Callbacks
3189  */
3190 
3191 static void sock_def_wakeup(struct sock *sk)
3192 {
3193     struct socket_wq *wq;
3194 
3195     rcu_read_lock();
3196     wq = rcu_dereference(sk->sk_wq);
3197     if (skwq_has_sleeper(wq))
3198         wake_up_interruptible_all(&wq->wait);
3199     rcu_read_unlock();
3200 }
3201 
3202 static void sock_def_error_report(struct sock *sk)
3203 {
3204     struct socket_wq *wq;
3205 
3206     rcu_read_lock();
3207     wq = rcu_dereference(sk->sk_wq);
3208     if (skwq_has_sleeper(wq))
3209         wake_up_interruptible_poll(&wq->wait, EPOLLERR);
3210     sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
3211     rcu_read_unlock();
3212 }
3213 
3214 void sock_def_readable(struct sock *sk)
3215 {
3216     struct socket_wq *wq;
3217 
3218     rcu_read_lock();
3219     wq = rcu_dereference(sk->sk_wq);
3220     if (skwq_has_sleeper(wq))
3221         wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
3222                         EPOLLRDNORM | EPOLLRDBAND);
3223     sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
3224     rcu_read_unlock();
3225 }
3226 
3227 static void sock_def_write_space(struct sock *sk)
3228 {
3229     struct socket_wq *wq;
3230 
3231     rcu_read_lock();
3232 
3233     /* Do not wake up a writer until he can make "significant"
3234      * progress.  --DaveM
3235      */
3236     if (sock_writeable(sk)) {
3237         wq = rcu_dereference(sk->sk_wq);
3238         if (skwq_has_sleeper(wq))
3239             wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3240                         EPOLLWRNORM | EPOLLWRBAND);
3241 
3242         /* Should agree with poll, otherwise some programs break */
3243         sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
3244     }
3245 
3246     rcu_read_unlock();
3247 }
3248 
3249 /* An optimised version of sock_def_write_space(), should only be called
3250  * for SOCK_RCU_FREE sockets under RCU read section and after putting
3251  * ->sk_wmem_alloc.
3252  */
3253 static void sock_def_write_space_wfree(struct sock *sk)
3254 {
3255     /* Do not wake up a writer until he can make "significant"
3256      * progress.  --DaveM
3257      */
3258     if (sock_writeable(sk)) {
3259         struct socket_wq *wq = rcu_dereference(sk->sk_wq);
3260 
3261         /* rely on refcount_sub from sock_wfree() */
3262         smp_mb__after_atomic();
3263         if (wq && waitqueue_active(&wq->wait))
3264             wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3265                         EPOLLWRNORM | EPOLLWRBAND);
3266 
3267         /* Should agree with poll, otherwise some programs break */
3268         sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
3269     }
3270 }
3271 
3272 static void sock_def_destruct(struct sock *sk)
3273 {
3274 }
3275 
3276 void sk_send_sigurg(struct sock *sk)
3277 {
3278     if (sk->sk_socket && sk->sk_socket->file)
3279         if (send_sigurg(&sk->sk_socket->file->f_owner))
3280             sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
3281 }
3282 EXPORT_SYMBOL(sk_send_sigurg);
3283 
3284 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
3285             unsigned long expires)
3286 {
3287     if (!mod_timer(timer, expires))
3288         sock_hold(sk);
3289 }
3290 EXPORT_SYMBOL(sk_reset_timer);
3291 
3292 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
3293 {
3294     if (del_timer(timer))
3295         __sock_put(sk);
3296 }
3297 EXPORT_SYMBOL(sk_stop_timer);
3298 
3299 void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
3300 {
3301     if (del_timer_sync(timer))
3302         __sock_put(sk);
3303 }
3304 EXPORT_SYMBOL(sk_stop_timer_sync);
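
/*
 * A minimal sketch of the reference-counting contract above: an armed
 * timer holds exactly one reference on the socket, taken at most once.
 * The function names are illustrative.
 */
#if 0 /* illustrative only */
static void example_arm(struct sock *sk, unsigned long delay)
{
	/* sock_hold() happens only if sk_timer was not already pending */
	sk_reset_timer(sk, &sk->sk_timer, jiffies + delay);
}

static void example_disarm(struct sock *sk)
{
	/* drops the reference only if the timer had not fired yet */
	sk_stop_timer(sk, &sk->sk_timer);
}
#endif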
3305 
3306 void sock_init_data(struct socket *sock, struct sock *sk)
3307 {
3308     sk_init_common(sk);
3309     sk->sk_send_head    =   NULL;
3310 
3311     timer_setup(&sk->sk_timer, NULL, 0);
3312 
3313     sk->sk_allocation   =   GFP_KERNEL;
3314     sk->sk_rcvbuf       =   READ_ONCE(sysctl_rmem_default);
3315     sk->sk_sndbuf       =   READ_ONCE(sysctl_wmem_default);
3316     sk->sk_state        =   TCP_CLOSE;
3317     sk_set_socket(sk, sock);
3318 
3319     sock_set_flag(sk, SOCK_ZAPPED);
3320 
3321     if (sock) {
3322         sk->sk_type =   sock->type;
3323         RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
3324         sock->sk    =   sk;
3325         sk->sk_uid  =   SOCK_INODE(sock)->i_uid;
3326     } else {
3327         RCU_INIT_POINTER(sk->sk_wq, NULL);
3328         sk->sk_uid  =   make_kuid(sock_net(sk)->user_ns, 0);
3329     }
3330 
3331     rwlock_init(&sk->sk_callback_lock);
3332     if (sk->sk_kern_sock)
3333         lockdep_set_class_and_name(
3334             &sk->sk_callback_lock,
3335             af_kern_callback_keys + sk->sk_family,
3336             af_family_kern_clock_key_strings[sk->sk_family]);
3337     else
3338         lockdep_set_class_and_name(
3339             &sk->sk_callback_lock,
3340             af_callback_keys + sk->sk_family,
3341             af_family_clock_key_strings[sk->sk_family]);
3342 
3343     sk->sk_state_change =   sock_def_wakeup;
3344     sk->sk_data_ready   =   sock_def_readable;
3345     sk->sk_write_space  =   sock_def_write_space;
3346     sk->sk_error_report =   sock_def_error_report;
3347     sk->sk_destruct     =   sock_def_destruct;
3348 
3349     sk->sk_frag.page    =   NULL;
3350     sk->sk_frag.offset  =   0;
3351     sk->sk_peek_off     =   -1;
3352 
3353     sk->sk_peer_pid     =   NULL;
3354     sk->sk_peer_cred    =   NULL;
3355     spin_lock_init(&sk->sk_peer_lock);
3356 
3357     sk->sk_write_pending    =   0;
3358     sk->sk_rcvlowat     =   1;
3359     sk->sk_rcvtimeo     =   MAX_SCHEDULE_TIMEOUT;
3360     sk->sk_sndtimeo     =   MAX_SCHEDULE_TIMEOUT;
3361 
3362     sk->sk_stamp = SK_DEFAULT_STAMP;
3363 #if BITS_PER_LONG==32
3364     seqlock_init(&sk->sk_stamp_seq);
3365 #endif
3366     atomic_set(&sk->sk_zckey, 0);
3367 
3368 #ifdef CONFIG_NET_RX_BUSY_POLL
3369     sk->sk_napi_id      =   0;
3370     sk->sk_ll_usec      =   READ_ONCE(sysctl_net_busy_read);
3371 #endif
3372 
3373     sk->sk_max_pacing_rate = ~0UL;
3374     sk->sk_pacing_rate = ~0UL;
3375     WRITE_ONCE(sk->sk_pacing_shift, 10);
3376     sk->sk_incoming_cpu = -1;
3377     sk->sk_txrehash = SOCK_TXREHASH_DEFAULT;
3378 
3379     sk_rx_queue_clear(sk);
3380     /*
3381      * Before updating sk_refcnt, we must commit prior changes to memory
3382      * (Documentation/RCU/rculist_nulls.rst for details)
3383      */
3384     smp_wmb();
3385     refcount_set(&sk->sk_refcnt, 1);
3386     atomic_set(&sk->sk_drops, 0);
3387 }
3388 EXPORT_SYMBOL(sock_init_data);
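
/*
 * A minimal sketch of a protocol create() hook wiring a fresh sock into
 * a socket with the defaults set above. "example_proto", the AF_UNSPEC
 * family, and the function itself are illustrative; a real proto would
 * be registered via proto_register().
 */
#if 0 /* illustrative only */
static struct proto example_proto;

static int example_create(struct net *net, struct socket *sock,
			  int protocol, int kern)
{
	struct sock *sk = sk_alloc(net, AF_UNSPEC, GFP_KERNEL,
				   &example_proto, kern);

	if (!sk)
		return -ENOMEM;
	sock_init_data(sock, sk);	/* queues, callbacks, buffer defaults */
	sk->sk_protocol = protocol;	/* protocol-specific setup follows */
	return 0;
}
#endif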
3389 
3390 void lock_sock_nested(struct sock *sk, int subclass)
3391 {
3392     /* The sk_lock has mutex_lock() semantics here. */
3393     mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
3394 
3395     might_sleep();
3396     spin_lock_bh(&sk->sk_lock.slock);
3397     if (sock_owned_by_user_nocheck(sk))
3398         __lock_sock(sk);
3399     sk->sk_lock.owned = 1;
3400     spin_unlock_bh(&sk->sk_lock.slock);
3401 }
3402 EXPORT_SYMBOL(lock_sock_nested);
3403 
3404 void release_sock(struct sock *sk)
3405 {
3406     spin_lock_bh(&sk->sk_lock.slock);
3407     if (sk->sk_backlog.tail)
3408         __release_sock(sk);
3409 
3410     /* Warning: release_cb() might need to release sk ownership,
3411      * i.e. call sock_release_ownership(sk) before us.
3412      */
3413     if (sk->sk_prot->release_cb)
3414         sk->sk_prot->release_cb(sk);
3415 
3416     sock_release_ownership(sk);
3417     if (waitqueue_active(&sk->sk_lock.wq))
3418         wake_up(&sk->sk_lock.wq);
3419     spin_unlock_bh(&sk->sk_lock.slock);
3420 }
3421 EXPORT_SYMBOL(release_sock);
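
/*
 * A minimal sketch of the owner/backlog contract implemented above:
 * while a process owns the lock, softirq input is queued to the
 * backlog, and release_sock() drains it via __release_sock() before
 * waking lock waiters. The handler name is illustrative.
 */
#if 0 /* illustrative only */
static void example_update_state(struct sock *sk, u32 val)
{
	lock_sock(sk);		/* may sleep; packets now land in the backlog */
	WRITE_ONCE(sk->sk_priority, val);
	release_sock(sk);	/* processes the backlog, then wakes waiters */
}
#endif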
3422 
3423 bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
3424 {
3425     might_sleep();
3426     spin_lock_bh(&sk->sk_lock.slock);
3427 
3428     if (!sock_owned_by_user_nocheck(sk)) {
3429         /*
3430          * Fast path return with bottom halves disabled and
3431          * sock::sk_lock.slock held.
3432          *
3433          * The 'mutex' is not contended and holding
3434          * sock::sk_lock.slock prevents all other lockers from
3435          * proceeding, so the corresponding unlock_sock_fast() can
3436          * avoid the slow path of release_sock() completely and
3437          * just release slock.
3438          *
3439          * From a semantic POV this is equivalent to 'acquiring'
3440          * the 'mutex', hence the corresponding lockdep
3441          * mutex_release() has to happen in the fast path of
3442          * unlock_sock_fast().
3443          */
3444         return false;
3445     }
3446 
3447     __lock_sock(sk);
3448     sk->sk_lock.owned = 1;
3449     __acquire(&sk->sk_lock.slock);
3450     spin_unlock_bh(&sk->sk_lock.slock);
3451     return true;
3452 }
3453 EXPORT_SYMBOL(__lock_sock_fast);
3454 
3455 int sock_gettstamp(struct socket *sock, void __user *userstamp,
3456            bool timeval, bool time32)
3457 {
3458     struct sock *sk = sock->sk;
3459     struct timespec64 ts;
3460 
3461     sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3462     ts = ktime_to_timespec64(sock_read_timestamp(sk));
3463     if (ts.tv_sec == -1)
3464         return -ENOENT;
3465     if (ts.tv_sec == 0) {
3466         ktime_t kt = ktime_get_real();
3467         sock_write_timestamp(sk, kt);
3468         ts = ktime_to_timespec64(kt);
3469     }
3470 
3471     if (timeval)
3472         ts.tv_nsec /= 1000;
3473 
3474 #ifdef CONFIG_COMPAT_32BIT_TIME
3475     if (time32)
3476         return put_old_timespec32(&ts, userstamp);
3477 #endif
3478 #ifdef CONFIG_SPARC64
3479     /* beware of padding in sparc64 timeval */
3480     if (timeval && !in_compat_syscall()) {
3481         struct __kernel_old_timeval __user tv = {
3482             .tv_sec = ts.tv_sec,
3483             .tv_usec = ts.tv_nsec,
3484         };
3485         if (copy_to_user(userstamp, &tv, sizeof(tv)))
3486             return -EFAULT;
3487         return 0;
3488     }
3489 #endif
3490     return put_timespec64(&ts, userstamp);
3491 }
3492 EXPORT_SYMBOL(sock_gettstamp);
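
/*
 * Example (editor's sketch, userspace): the classic consumer of this
 * path is the SIOCGSTAMP ioctl issued after a receive, assuming the
 * definitions from <linux/sockios.h>; print_rx_stamp() is hypothetical.
 *
 *	#include <stdio.h>
 *	#include <sys/ioctl.h>
 *	#include <sys/socket.h>
 *	#include <sys/time.h>
 *	#include <linux/sockios.h>
 *
 *	static void print_rx_stamp(int fd)
 *	{
 *		struct timeval tv;
 *
 *		if (ioctl(fd, SIOCGSTAMP, &tv) == 0)
 *			printf("last packet at %ld.%06ld\n",
 *			       (long)tv.tv_sec, (long)tv.tv_usec);
 *	}
 */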
3493 
3494 void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
3495 {
3496     if (!sock_flag(sk, flag)) {
3497         unsigned long previous_flags = sk->sk_flags;
3498 
3499         sock_set_flag(sk, flag);
3500         /*
3501          * We just set one of the two flags that require net
3502          * time stamping, but time stamping might already have
3503          * been on because of the other one.
3504          */
3505         if (sock_needs_netstamp(sk) &&
3506             !(previous_flags & SK_FLAGS_TIMESTAMP))
3507             net_enable_timestamp();
3508     }
3509 }
3510 
3511 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3512                int level, int type)
3513 {
3514     struct sock_exterr_skb *serr;
3515     struct sk_buff *skb;
3516     int copied, err;
3517 
3518     err = -EAGAIN;
3519     skb = sock_dequeue_err_skb(sk);
3520     if (skb == NULL)
3521         goto out;
3522 
3523     copied = skb->len;
3524     if (copied > len) {
3525         msg->msg_flags |= MSG_TRUNC;
3526         copied = len;
3527     }
3528     err = skb_copy_datagram_msg(skb, 0, msg, copied);
3529     if (err)
3530         goto out_free_skb;
3531 
3532     sock_recv_timestamp(msg, sk, skb);
3533 
3534     serr = SKB_EXT_ERR(skb);
3535     put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3536 
3537     msg->msg_flags |= MSG_ERRQUEUE;
3538     err = copied;
3539 
3540 out_free_skb:
3541     kfree_skb(skb);
3542 out:
3543     return err;
3544 }
3545 EXPORT_SYMBOL(sock_recv_errqueue);
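
/*
 * Example (editor's sketch, userspace): the error queue drained here is
 * read with recvmsg(MSG_ERRQUEUE); the struct sock_extended_err arrives
 * as a control message (<linux/errqueue.h>). Buffer sizes and
 * drain_errqueue() itself are illustrative.
 *
 *	static ssize_t drain_errqueue(int fd)
 *	{
 *		char data[256], ctrl[512];
 *		struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
 *		struct msghdr msg = {
 *			.msg_iov = &iov, .msg_iovlen = 1,
 *			.msg_control = ctrl, .msg_controllen = sizeof(ctrl),
 *		};
 *		ssize_t n = recvmsg(fd, &msg, MSG_ERRQUEUE | MSG_DONTWAIT);
 *		struct cmsghdr *cm;
 *
 *		if (n < 0)
 *			return n;
 *		for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) {
 *			struct sock_extended_err *ee = (void *)CMSG_DATA(cm);
 *			// e.g. IPPROTO_IP/IP_RECVERR: inspect ee->ee_errno
 *		}
 *		return n;
 *	}
 */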
3546 
3547 /*
3548  *  Get a socket option on a socket.
3549  *
3550  *  FIX: POSIX 1003.1g is very ambiguous here. It states that
3551  *  asynchronous errors should be reported by getsockopt. We assume
3552  *  this means if you specify SO_ERROR (otherwise what's the point of it?).
3553  */
3554 int sock_common_getsockopt(struct socket *sock, int level, int optname,
3555                char __user *optval, int __user *optlen)
3556 {
3557     struct sock *sk = sock->sk;
3558 
3559     return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3560 }
3561 EXPORT_SYMBOL(sock_common_getsockopt);
3562 
3563 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3564             int flags)
3565 {
3566     struct sock *sk = sock->sk;
3567     int addr_len = 0;
3568     int err;
3569 
3570     err = sk->sk_prot->recvmsg(sk, msg, size, flags, &addr_len);
3571     if (err >= 0)
3572         msg->msg_namelen = addr_len;
3573     return err;
3574 }
3575 EXPORT_SYMBOL(sock_common_recvmsg);
3576 
3577 /*
3578  *  Set socket options on an inet socket.
3579  */
3580 int sock_common_setsockopt(struct socket *sock, int level, int optname,
3581                sockptr_t optval, unsigned int optlen)
3582 {
3583     struct sock *sk = sock->sk;
3584 
3585     return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3586 }
3587 EXPORT_SYMBOL(sock_common_setsockopt);
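
/*
 * Example (editor's sketch): the three sock_common_* helpers above are
 * usually wired straight into a protocol's proto_ops table. The field
 * subset shown is illustrative; a real table fills in every operation.
 */
static const struct proto_ops my_proto_ops = {
    .family     = PF_INET,
    .owner      = THIS_MODULE,
    .setsockopt = sock_common_setsockopt,
    .getsockopt = sock_common_getsockopt,
    .recvmsg    = sock_common_recvmsg,
    /* ... remaining mandatory ops elided ... */
};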
3588 
3589 void sk_common_release(struct sock *sk)
3590 {
3591     if (sk->sk_prot->destroy)
3592         sk->sk_prot->destroy(sk);
3593 
3594     /*
3595      * Observation: when sk_common_release is called, processes have
3596      * no access to the socket, but the network stack still does.
3597      * Step one, detach it from networking:
3598      *
3599      * A. Remove from hash tables.
3600      */
3601 
3602     sk->sk_prot->unhash(sk);
3603 
3604     /*
3605      * At this point the socket cannot receive new packets, but some may
3606      * still be in flight, because a CPU running the receiver may have done
3607      * its hash table lookup before we unhashed the socket. Those packets
3608      * will reach the receive queue and be purged by the socket destructor.
3609      *
3610      * Also, we may still have packets pending on the receive queue and,
3611      * probably, our own packets waiting in device queues. sock_destroy
3612      * drains the receive queue, but transmitted packets delay socket
3613      * destruction until the last reference is released.
3614      */
3615 
3616     sock_orphan(sk);
3617 
3618     xfrm_sk_free_policy(sk);
3619 
3620     sk_refcnt_debug_release(sk);
3621 
3622     sock_put(sk);
3623 }
3624 EXPORT_SYMBOL(sk_common_release);
3625 
3626 void sk_get_meminfo(const struct sock *sk, u32 *mem)
3627 {
3628     memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3629 
3630     mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3631     mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
3632     mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3633     mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
3634     mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3635     mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
3636     mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3637     mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
3638     mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3639 }
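
/*
 * Example (editor's sketch): diag code typically copies this array out
 * verbatim as a netlink attribute, assuming <net/netlink.h> and
 * <linux/inet_diag.h>; my_put_meminfo() is hypothetical.
 */
static int my_put_meminfo(struct sk_buff *nlskb, const struct sock *sk)
{
    u32 mem[SK_MEMINFO_VARS];

    sk_get_meminfo(sk, mem);
    return nla_put(nlskb, INET_DIAG_SKMEMINFO, sizeof(mem), mem);
}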
3640 
3641 #ifdef CONFIG_PROC_FS
3642 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3643 
3644 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3645 {
3646     int cpu, idx = prot->inuse_idx;
3647     int res = 0;
3648 
3649     for_each_possible_cpu(cpu)
3650         res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3651 
3652     return res >= 0 ? res : 0;
3653 }
3654 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3655 
3656 int sock_inuse_get(struct net *net)
3657 {
3658     int cpu, res = 0;
3659 
3660     for_each_possible_cpu(cpu)
3661         res += per_cpu_ptr(net->core.prot_inuse, cpu)->all;
3662 
3663     return res;
3664 }
3665 EXPORT_SYMBOL_GPL(sock_inuse_get);
3666 
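/*
 * Example (editor's sketch): the per-cpu counters summed above are
 * bumped by protocols via sock_prot_inuse_add(), typically +1 in their
 * hash() and -1 in their unhash() paths; my_proto_hash() is a
 * hypothetical illustration of the pattern, not an exact call site.
 */
static void my_proto_hash(struct sock *sk)
{
    /* ... insert sk into the protocol's lookup tables ... */
    sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
}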
3667 
3668 static int __net_init sock_inuse_init_net(struct net *net)
3669 {
3670     net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3671     if (net->core.prot_inuse == NULL)
3672         return -ENOMEM;
3673     return 0;
3674 }
3675 
3676 static void __net_exit sock_inuse_exit_net(struct net *net)
3677 {
3678     free_percpu(net->core.prot_inuse);
3679 }
3680 
3681 static struct pernet_operations net_inuse_ops = {
3682     .init = sock_inuse_init_net,
3683     .exit = sock_inuse_exit_net,
3684 };
3685 
3686 static __init int net_inuse_init(void)
3687 {
3688     if (register_pernet_subsys(&net_inuse_ops))
3689         panic("Cannot initialize net inuse counters");
3690 
3691     return 0;
3692 }
3693 
3694 core_initcall(net_inuse_init);
3695 
3696 static int assign_proto_idx(struct proto *prot)
3697 {
3698     prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3699 
3700     if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3701         pr_err("PROTO_INUSE_NR exhausted\n");
3702         return -ENOSPC;
3703     }
3704 
3705     set_bit(prot->inuse_idx, proto_inuse_idx);
3706     return 0;
3707 }
3708 
3709 static void release_proto_idx(struct proto *prot)
3710 {
3711     if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3712         clear_bit(prot->inuse_idx, proto_inuse_idx);
3713 }
3714 #else
3715 static inline int assign_proto_idx(struct proto *prot)
3716 {
3717     return 0;
3718 }
3719 
3720 static inline void release_proto_idx(struct proto *prot)
3721 {
3722 }
3723 
3724 #endif
3725 
3726 static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
3727 {
3728     if (!twsk_prot)
3729         return;
3730     kfree(twsk_prot->twsk_slab_name);
3731     twsk_prot->twsk_slab_name = NULL;
3732     kmem_cache_destroy(twsk_prot->twsk_slab);
3733     twsk_prot->twsk_slab = NULL;
3734 }
3735 
3736 static int tw_prot_init(const struct proto *prot)
3737 {
3738     struct timewait_sock_ops *twsk_prot = prot->twsk_prot;
3739 
3740     if (!twsk_prot)
3741         return 0;
3742 
3743     twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
3744                           prot->name);
3745     if (!twsk_prot->twsk_slab_name)
3746         return -ENOMEM;
3747 
3748     twsk_prot->twsk_slab =
3749         kmem_cache_create(twsk_prot->twsk_slab_name,
3750                   twsk_prot->twsk_obj_size, 0,
3751                   SLAB_ACCOUNT | prot->slab_flags,
3752                   NULL);
3753     if (!twsk_prot->twsk_slab) {
3754         pr_crit("%s: Can't create timewait sock SLAB cache!\n",
3755             prot->name);
3756         return -ENOMEM;
3757     }
3758 
3759     return 0;
3760 }
3761 
3762 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3763 {
3764     if (!rsk_prot)
3765         return;
3766     kfree(rsk_prot->slab_name);
3767     rsk_prot->slab_name = NULL;
3768     kmem_cache_destroy(rsk_prot->slab);
3769     rsk_prot->slab = NULL;
3770 }
3771 
3772 static int req_prot_init(const struct proto *prot)
3773 {
3774     struct request_sock_ops *rsk_prot = prot->rsk_prot;
3775 
3776     if (!rsk_prot)
3777         return 0;
3778 
3779     rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3780                     prot->name);
3781     if (!rsk_prot->slab_name)
3782         return -ENOMEM;
3783 
3784     rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3785                        rsk_prot->obj_size, 0,
3786                        SLAB_ACCOUNT | prot->slab_flags,
3787                        NULL);
3788 
3789     if (!rsk_prot->slab) {
3790         pr_crit("%s: Can't create request sock SLAB cache!\n",
3791             prot->name);
3792         return -ENOMEM;
3793     }
3794     return 0;
3795 }
3796 
3797 int proto_register(struct proto *prot, int alloc_slab)
3798 {
3799     int ret = -ENOBUFS;
3800 
3801     if (prot->memory_allocated && !prot->sysctl_mem) {
3802         pr_err("%s: missing sysctl_mem\n", prot->name);
3803         return -EINVAL;
3804     }
3805     if (prot->memory_allocated && !prot->per_cpu_fw_alloc) {
3806         pr_err("%s: missing per_cpu_fw_alloc\n", prot->name);
3807         return -EINVAL;
3808     }
3809     if (alloc_slab) {
3810         prot->slab = kmem_cache_create_usercopy(prot->name,
3811                     prot->obj_size, 0,
3812                     SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
3813                     prot->slab_flags,
3814                     prot->useroffset, prot->usersize,
3815                     NULL);
3816 
3817         if (prot->slab == NULL) {
3818             pr_crit("%s: Can't create sock SLAB cache!\n",
3819                 prot->name);
3820             goto out;
3821         }
3822 
3823         if (req_prot_init(prot))
3824             goto out_free_request_sock_slab;
3825 
3826         if (tw_prot_init(prot))
3827             goto out_free_timewait_sock_slab;
3828     }
3829 
3830     mutex_lock(&proto_list_mutex);
3831     ret = assign_proto_idx(prot);
3832     if (ret) {
3833         mutex_unlock(&proto_list_mutex);
3834         goto out_free_timewait_sock_slab;
3835     }
3836     list_add(&prot->node, &proto_list);
3837     mutex_unlock(&proto_list_mutex);
3838     return ret;
3839 
3840 out_free_timewait_sock_slab:
3841     if (alloc_slab)
3842         tw_prot_cleanup(prot->twsk_prot);
3843 out_free_request_sock_slab:
3844     if (alloc_slab) {
3845         req_prot_cleanup(prot->rsk_prot);
3846 
3847         kmem_cache_destroy(prot->slab);
3848         prot->slab = NULL;
3849     }
3850 out:
3851     return ret;
3852 }
3853 EXPORT_SYMBOL(proto_register);
3854 
3855 void proto_unregister(struct proto *prot)
3856 {
3857     mutex_lock(&proto_list_mutex);
3858     release_proto_idx(prot);
3859     list_del(&prot->node);
3860     mutex_unlock(&proto_list_mutex);
3861 
3862     kmem_cache_destroy(prot->slab);
3863     prot->slab = NULL;
3864 
3865     req_prot_cleanup(prot->rsk_prot);
3866     tw_prot_cleanup(prot->twsk_prot);
3867 }
3868 EXPORT_SYMBOL(proto_unregister);
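
/*
 * Example (editor's sketch, hypothetical "my_proto"): the usual pairing
 * of proto_register()/proto_unregister() in a protocol module's init
 * and exit paths.
 */
static struct proto my_proto = {
    .name     = "MYPROTO",
    .owner    = THIS_MODULE,
    .obj_size = sizeof(struct sock),    /* real protos embed struct sock */
};

static int __init my_proto_init(void)
{
    return proto_register(&my_proto, 1);    /* 1: create a slab cache */
}

static void __exit my_proto_exit(void)
{
    proto_unregister(&my_proto);
}

module_init(my_proto_init);
module_exit(my_proto_exit);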
3869 
3870 int sock_load_diag_module(int family, int protocol)
3871 {
3872     if (!protocol) {
3873         if (!sock_is_registered(family))
3874             return -ENOENT;
3875 
3876         return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
3877                       NETLINK_SOCK_DIAG, family);
3878     }
3879 
3880 #ifdef CONFIG_INET
3881     if (family == AF_INET &&
3882         protocol != IPPROTO_RAW &&
3883         protocol < MAX_INET_PROTOS &&
3884         !rcu_access_pointer(inet_protos[protocol]))
3885         return -ENOENT;
3886 #endif
3887 
3888     return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
3889                   NETLINK_SOCK_DIAG, family, protocol);
3890 }
3891 EXPORT_SYMBOL(sock_load_diag_module);
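
/*
 * Example (editor's sketch): a diag module makes itself loadable by the
 * request_module() strings above through a module alias. The expansion
 * below corresponds to PF_NETLINK (16), NETLINK_SOCK_DIAG (4) and
 * AF_INET (2), i.e. what the first request_module() call generates for
 * an AF_INET query with protocol == 0.
 */
MODULE_ALIAS("net-pf-16-proto-4-type-2");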
3892 
3893 #ifdef CONFIG_PROC_FS
3894 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
3895     __acquires(proto_list_mutex)
3896 {
3897     mutex_lock(&proto_list_mutex);
3898     return seq_list_start_head(&proto_list, *pos);
3899 }
3900 
3901 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3902 {
3903     return seq_list_next(v, &proto_list, pos);
3904 }
3905 
3906 static void proto_seq_stop(struct seq_file *seq, void *v)
3907     __releases(proto_list_mutex)
3908 {
3909     mutex_unlock(&proto_list_mutex);
3910 }
3911 
3912 static char proto_method_implemented(const void *method)
3913 {
3914     return method == NULL ? 'n' : 'y';
3915 }
3916 static long sock_prot_memory_allocated(struct proto *proto)
3917 {
3918     return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3919 }
3920 
3921 static const char *sock_prot_memory_pressure(struct proto *proto)
3922 {
3923     return proto->memory_pressure != NULL ?
3924            (proto_memory_pressure(proto) ? "yes" : "no") : "NI";
3925 }
3926 
3927 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3928 {
3929 
3930     seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
3931             "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3932            proto->name,
3933            proto->obj_size,
3934            sock_prot_inuse_get(seq_file_net(seq), proto),
3935            sock_prot_memory_allocated(proto),
3936            sock_prot_memory_pressure(proto),
3937            proto->max_header,
3938            proto->slab == NULL ? "no" : "yes",
3939            module_name(proto->owner),
3940            proto_method_implemented(proto->close),
3941            proto_method_implemented(proto->connect),
3942            proto_method_implemented(proto->disconnect),
3943            proto_method_implemented(proto->accept),
3944            proto_method_implemented(proto->ioctl),
3945            proto_method_implemented(proto->init),
3946            proto_method_implemented(proto->destroy),
3947            proto_method_implemented(proto->shutdown),
3948            proto_method_implemented(proto->setsockopt),
3949            proto_method_implemented(proto->getsockopt),
3950            proto_method_implemented(proto->sendmsg),
3951            proto_method_implemented(proto->recvmsg),
3952            proto_method_implemented(proto->sendpage),
3953            proto_method_implemented(proto->bind),
3954            proto_method_implemented(proto->backlog_rcv),
3955            proto_method_implemented(proto->hash),
3956            proto_method_implemented(proto->unhash),
3957            proto_method_implemented(proto->get_port),
3958            proto_method_implemented(proto->enter_memory_pressure));
3959 }
3960 
3961 static int proto_seq_show(struct seq_file *seq, void *v)
3962 {
3963     if (v == &proto_list)
3964         seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3965                "protocol",
3966                "size",
3967                "sockets",
3968                "memory",
3969                "press",
3970                "maxhdr",
3971                "slab",
3972                "module",
3973                "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3974     else
3975         proto_seq_printf(seq, list_entry(v, struct proto, node));
3976     return 0;
3977 }
3978 
3979 static const struct seq_operations proto_seq_ops = {
3980     .start  = proto_seq_start,
3981     .next   = proto_seq_next,
3982     .stop   = proto_seq_stop,
3983     .show   = proto_seq_show,
3984 };
3985 
3986 static __net_init int proto_init_net(struct net *net)
3987 {
3988     if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
3989             sizeof(struct seq_net_private)))
3990         return -ENOMEM;
3991 
3992     return 0;
3993 }
3994 
3995 static __net_exit void proto_exit_net(struct net *net)
3996 {
3997     remove_proc_entry("protocols", net->proc_net);
3998 }
3999 
4000 
4001 static __net_initdata struct pernet_operations proto_net_ops = {
4002     .init = proto_init_net,
4003     .exit = proto_exit_net,
4004 };
4005 
4006 static int __init proto_init(void)
4007 {
4008     return register_pernet_subsys(&proto_net_ops);
4009 }
4010 
4011 subsys_initcall(proto_init);
4012 
4013 #endif /* CONFIG_PROC_FS */
4014 
4015 #ifdef CONFIG_NET_RX_BUSY_POLL
4016 bool sk_busy_loop_end(void *p, unsigned long start_time)
4017 {
4018     struct sock *sk = p;
4019 
4020     return !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
4021            sk_busy_loop_timeout(sk, start_time);
4022 }
4023 EXPORT_SYMBOL(sk_busy_loop_end);
4024 #endif /* CONFIG_NET_RX_BUSY_POLL */
4025 
4026 int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
4027 {
4028     if (!sk->sk_prot->bind_add)
4029         return -EOPNOTSUPP;
4030     return sk->sk_prot->bind_add(sk, addr, addr_len);
4031 }
4032 EXPORT_SYMBOL(sock_bind_add);
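
/*
 * Example (editor's sketch): only protocols implementing ->bind_add()
 * (SCTP in-tree) get past the -EOPNOTSUPP check; a kernel-side caller
 * binding an extra local address might look like this hypothetical
 * helper.
 */
static int my_add_local_addr(struct sock *sk, struct sockaddr *sa, int len)
{
    int err = sock_bind_add(sk, sa, len);

    if (err == -EOPNOTSUPP)
        pr_debug("%s: protocol lacks bind_add()\n", __func__);
    return err;
}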