0001 // SPDX-License-Identifier: GPL-2.0-or-later
0002 /*
0003  * INET     An implementation of the TCP/IP protocol suite for the LINUX
0004  *      operating system.  INET is implemented using the  BSD Socket
0005  *      interface as the means of communication with the user level.
0006  *
0007  *      Implementation of the Transmission Control Protocol(TCP).
0008  *
0009  *      IPv4 specific functions
0010  *
0011  *      code split from:
0012  *      linux/ipv4/tcp.c
0013  *      linux/ipv4/tcp_input.c
0014  *      linux/ipv4/tcp_output.c
0015  *
0016  *      See tcp.c for author information
0017  */
0018 
0019 /*
0020  * Changes:
0021  *      David S. Miller :   New socket lookup architecture.
0022  *                  This code is dedicated to John Dyson.
0023  *      David S. Miller :   Change semantics of established hash,
0024  *                  half is devoted to TIME_WAIT sockets
0025  *                  and the rest go in the other half.
0026  *      Andi Kleen :        Add support for syncookies and fixed
0027  *                  some bugs: ip options weren't passed to
0028  *                  the TCP layer, missed a check for an
0029  *                  ACK bit.
0030  *      Andi Kleen :        Implemented fast path mtu discovery.
0031  *                      Fixed many serious bugs in the
0032  *                  request_sock handling and moved
0033  *                  most of it into the af independent code.
0034  *                  Added tail drop and some other bugfixes.
0035  *                  Added new listen semantics.
0036  *      Mike McLagan    :   Routing by source
0037  *  Juan Jose Ciarlante:        ip_dynaddr bits
0038  *      Andi Kleen:     various fixes.
0039  *  Vitaly E. Lavrov    :   Transparent proxy revived after year
0040  *                  coma.
0041  *  Andi Kleen      :   Fix new listen.
0042  *  Andi Kleen      :   Fix accept error reporting.
0043  *  YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
0044  *  Alexey Kuznetsov        allow both IPv4 and IPv6 sockets to bind
0045  *                  a single port at the same time.
0046  */
0047 
0048 #define pr_fmt(fmt) "TCP: " fmt
0049 
0050 #include <linux/bottom_half.h>
0051 #include <linux/types.h>
0052 #include <linux/fcntl.h>
0053 #include <linux/module.h>
0054 #include <linux/random.h>
0055 #include <linux/cache.h>
0056 #include <linux/jhash.h>
0057 #include <linux/init.h>
0058 #include <linux/times.h>
0059 #include <linux/slab.h>
0060 
0061 #include <net/net_namespace.h>
0062 #include <net/icmp.h>
0063 #include <net/inet_hashtables.h>
0064 #include <net/tcp.h>
0065 #include <net/transp_v6.h>
0066 #include <net/ipv6.h>
0067 #include <net/inet_common.h>
0068 #include <net/timewait_sock.h>
0069 #include <net/xfrm.h>
0070 #include <net/secure_seq.h>
0071 #include <net/busy_poll.h>
0072 
0073 #include <linux/inet.h>
0074 #include <linux/ipv6.h>
0075 #include <linux/stddef.h>
0076 #include <linux/proc_fs.h>
0077 #include <linux/seq_file.h>
0078 #include <linux/inetdevice.h>
0079 #include <linux/btf_ids.h>
0080 
0081 #include <crypto/hash.h>
0082 #include <linux/scatterlist.h>
0083 
0084 #include <trace/events/tcp.h>
0085 
0086 #ifdef CONFIG_TCP_MD5SIG
0087 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
0088                    __be32 daddr, __be32 saddr, const struct tcphdr *th);
0089 #endif
0090 
0091 struct inet_hashinfo tcp_hashinfo;
0092 EXPORT_SYMBOL(tcp_hashinfo);
0093 
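     /* Per-CPU control sockets: tcp_v4_send_reset() and tcp_v4_send_ack()
      * below borrow one of these to transmit replies (RSTs, and the ACKs
      * sent for TIME-WAIT / SYN-RECV) that have to be generated without a
      * full socket context.
      */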
0094 static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);
0095 
0096 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
0097 {
0098     return secure_tcp_seq(ip_hdr(skb)->daddr,
0099                   ip_hdr(skb)->saddr,
0100                   tcp_hdr(skb)->dest,
0101                   tcp_hdr(skb)->source);
0102 }
0103 
0104 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
0105 {
0106     return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
0107 }
0108 
0109 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
0110 {
0111     int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
0112     const struct inet_timewait_sock *tw = inet_twsk(sktw);
0113     const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
0114     struct tcp_sock *tp = tcp_sk(sk);
0115 
0116     if (reuse == 2) {
0117         /* Still does not detect *everything* that goes through
0118          * lo, since we require a loopback src or dst address
0119          * or direct binding to 'lo' interface.
0120          */
0121         bool loopback = false;
0122         if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
0123             loopback = true;
0124 #if IS_ENABLED(CONFIG_IPV6)
0125         if (tw->tw_family == AF_INET6) {
0126             if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
0127                 ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
0128                 ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
0129                 ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
0130                 loopback = true;
0131         } else
0132 #endif
0133         {
0134             if (ipv4_is_loopback(tw->tw_daddr) ||
0135                 ipv4_is_loopback(tw->tw_rcv_saddr))
0136                 loopback = true;
0137         }
0138         if (!loopback)
0139             reuse = 0;
0140     }
0141 
0142     /* With PAWS, it is safe from the viewpoint
0143        of data integrity. Even without PAWS it is safe provided sequence
0144        spaces do not overlap, i.e. at data rates <= 80 Mbit/sec.
0145 
0146        Actually, the idea is close to VJ's, only the timestamp cache is
0147        held not per host but per port pair, and the TW bucket is used as
0148        the state holder.
0149 
0150        If the TW bucket has already been destroyed, we fall back to VJ's
0151        scheme and use the initial timestamp retrieved from the peer table.
0152      */
0153     if (tcptw->tw_ts_recent_stamp &&
0154         (!twp || (reuse && time_after32(ktime_get_seconds(),
0155                         tcptw->tw_ts_recent_stamp)))) {
0156         /* In case of repair and re-using TIME-WAIT sockets we still
0157          * want to be sure that it is safe as above but honor the
0158          * sequence numbers and time stamps set as part of the repair
0159          * process.
0160          *
0161          * Without this check re-using a TIME-WAIT socket with TCP
0162          * repair would accumulate a -1 on the repair assigned
0163          * sequence number. The first time it is reused the sequence
0164          * is -1, the second time -2, etc. This fixes that issue
0165          * without appearing to create any others.
0166          */
0167         if (likely(!tp->repair)) {
0168             u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
0169 
0170             if (!seq)
0171                 seq = 1;
0172             WRITE_ONCE(tp->write_seq, seq);
0173             tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
0174             tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
0175         }
0176         sock_hold(sktw);
0177         return 1;
0178     }
0179 
0180     return 0;
0181 }
0182 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
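     /* tcp_twsk_unique() is the timewait ->twsk_unique hook: it is consulted
      * during connect-time port selection when a TIME-WAIT socket still
      * occupies the desired 4-tuple.  Returning 1 lets the new connection
      * take over the bucket (reuse), returning 0 forbids it.
      */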
0183 
0184 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
0185                   int addr_len)
0186 {
0187     /* This check is replicated from tcp_v4_connect() and is intended to
0188      * prevent the BPF program called below from accessing bytes that are
0189      * out of the bounds specified by the user in addr_len.
0190      */
0191     if (addr_len < sizeof(struct sockaddr_in))
0192         return -EINVAL;
0193 
0194     sock_owned_by_me(sk);
0195 
0196     return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
0197 }
0198 
0199 /* This will initiate an outgoing connection. */
0200 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
0201 {
0202     struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
0203     struct inet_sock *inet = inet_sk(sk);
0204     struct tcp_sock *tp = tcp_sk(sk);
0205     __be16 orig_sport, orig_dport;
0206     __be32 daddr, nexthop;
0207     struct flowi4 *fl4;
0208     struct rtable *rt;
0209     int err;
0210     struct ip_options_rcu *inet_opt;
0211     struct inet_timewait_death_row *tcp_death_row = sock_net(sk)->ipv4.tcp_death_row;
0212 
0213     if (addr_len < sizeof(struct sockaddr_in))
0214         return -EINVAL;
0215 
0216     if (usin->sin_family != AF_INET)
0217         return -EAFNOSUPPORT;
0218 
0219     nexthop = daddr = usin->sin_addr.s_addr;
0220     inet_opt = rcu_dereference_protected(inet->inet_opt,
0221                          lockdep_sock_is_held(sk));
0222     if (inet_opt && inet_opt->opt.srr) {
0223         if (!daddr)
0224             return -EINVAL;
0225         nexthop = inet_opt->opt.faddr;
0226     }
0227 
0228     orig_sport = inet->inet_sport;
0229     orig_dport = usin->sin_port;
0230     fl4 = &inet->cork.fl.u.ip4;
0231     rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
0232                   sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
0233                   orig_dport, sk);
0234     if (IS_ERR(rt)) {
0235         err = PTR_ERR(rt);
0236         if (err == -ENETUNREACH)
0237             IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
0238         return err;
0239     }
0240 
0241     if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
0242         ip_rt_put(rt);
0243         return -ENETUNREACH;
0244     }
0245 
0246     if (!inet_opt || !inet_opt->opt.srr)
0247         daddr = fl4->daddr;
0248 
0249     if (!inet->inet_saddr)
0250         inet->inet_saddr = fl4->saddr;
0251     sk_rcv_saddr_set(sk, inet->inet_saddr);
0252 
0253     if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
0254         /* Reset inherited state */
0255         tp->rx_opt.ts_recent       = 0;
0256         tp->rx_opt.ts_recent_stamp = 0;
0257         if (likely(!tp->repair))
0258             WRITE_ONCE(tp->write_seq, 0);
0259     }
0260 
0261     inet->inet_dport = usin->sin_port;
0262     sk_daddr_set(sk, daddr);
0263 
0264     inet_csk(sk)->icsk_ext_hdr_len = 0;
0265     if (inet_opt)
0266         inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
0267 
0268     tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
0269 
0270     /* Socket identity is still unknown (sport may be zero).
0271      * However, we set the state to SYN-SENT and, without releasing the
0272      * socket lock, select a source port, enter ourselves into the hash
0273      * tables and complete initialization after this.
0274      */
0275     tcp_set_state(sk, TCP_SYN_SENT);
0276     err = inet_hash_connect(tcp_death_row, sk);
0277     if (err)
0278         goto failure;
0279 
0280     sk_set_txhash(sk);
0281 
0282     rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
0283                    inet->inet_sport, inet->inet_dport, sk);
0284     if (IS_ERR(rt)) {
0285         err = PTR_ERR(rt);
0286         rt = NULL;
0287         goto failure;
0288     }
0289     /* OK, now commit destination to socket.  */
0290     sk->sk_gso_type = SKB_GSO_TCPV4;
0291     sk_setup_caps(sk, &rt->dst);
0292     rt = NULL;
0293 
0294     if (likely(!tp->repair)) {
0295         if (!tp->write_seq)
0296             WRITE_ONCE(tp->write_seq,
0297                    secure_tcp_seq(inet->inet_saddr,
0298                           inet->inet_daddr,
0299                           inet->inet_sport,
0300                           usin->sin_port));
0301         tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
0302                          inet->inet_saddr,
0303                          inet->inet_daddr);
0304     }
0305 
0306     inet->inet_id = prandom_u32();
0307 
0308     if (tcp_fastopen_defer_connect(sk, &err))
0309         return err;
0310     if (err)
0311         goto failure;
0312 
0313     err = tcp_connect(sk);
0314 
0315     if (err)
0316         goto failure;
0317 
0318     return 0;
0319 
0320 failure:
0321     /*
0322      * This unhashes the socket and releases the local port,
0323      * if necessary.
0324      */
0325     tcp_set_state(sk, TCP_CLOSE);
0326     ip_rt_put(rt);
0327     sk->sk_route_caps = 0;
0328     inet->inet_dport = 0;
0329     return err;
0330 }
0331 EXPORT_SYMBOL(tcp_v4_connect);
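     /* tcp_v4_connect() is reached from connect(2) via inet_stream_connect()
      * and sk->sk_prot->connect.  On successful return the socket is hashed,
      * the route is committed to the socket, and the initial SYN has been
      * sent by tcp_connect() (unless TCP repair mode or a deferred Fast Open
      * connect is in effect).
      */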
0332 
0333 /*
0334  * This routine reacts to ICMP_FRAG_NEEDED MTU indications as defined in RFC 1191.
0335  * It can be called through tcp_release_cb() if the socket was owned by the user
0336  * at the time tcp_v4_err() was called to handle the ICMP message.
0337  */
0338 void tcp_v4_mtu_reduced(struct sock *sk)
0339 {
0340     struct inet_sock *inet = inet_sk(sk);
0341     struct dst_entry *dst;
0342     u32 mtu;
0343 
0344     if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
0345         return;
0346     mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
0347     dst = inet_csk_update_pmtu(sk, mtu);
0348     if (!dst)
0349         return;
0350 
0351     /* Something is about to go wrong... Remember the soft error
0352      * in case this connection is not able to recover.
0353      */
0354     if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
0355         sk->sk_err_soft = EMSGSIZE;
0356 
0357     mtu = dst_mtu(dst);
0358 
0359     if (inet->pmtudisc != IP_PMTUDISC_DONT &&
0360         ip_sk_accept_pmtu(sk) &&
0361         inet_csk(sk)->icsk_pmtu_cookie > mtu) {
0362         tcp_sync_mss(sk, mtu);
0363 
0364         /* Resend the TCP packet because it's
0365          * clear that the old packet has been
0366          * dropped. This is the new "fast" path mtu
0367          * discovery.
0368          */
0369         tcp_simple_retransmit(sk);
0370     } /* else let the usual retransmit timer handle it */
0371 }
0372 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
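     /* tcp_v4_mtu_reduced() is invoked either directly from tcp_v4_err()
      * when the socket is not owned by the user, or deferred via the
      * TCP_MTU_REDUCED_DEFERRED flag and tcp_release_cb() once the owner
      * releases the socket (see the ICMP_FRAG_NEEDED handling below).
      */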
0373 
0374 static void do_redirect(struct sk_buff *skb, struct sock *sk)
0375 {
0376     struct dst_entry *dst = __sk_dst_check(sk, 0);
0377 
0378     if (dst)
0379         dst->ops->redirect(dst, sk, skb);
0380 }
0381 
0382 
0383 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
0384 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
0385 {
0386     struct request_sock *req = inet_reqsk(sk);
0387     struct net *net = sock_net(sk);
0388 
0389     /* ICMPs are not backlogged, hence we cannot get
0390      * an established socket here.
0391      */
0392     if (seq != tcp_rsk(req)->snt_isn) {
0393         __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
0394     } else if (abort) {
0395         /*
0396          * Still in SYN_RECV, just remove it silently.
0397          * There is no good way to pass the error to the newly
0398          * created socket, and POSIX does not want network
0399          * errors returned from accept().
0400          */
0401         inet_csk_reqsk_queue_drop(req->rsk_listener, req);
0402         tcp_listendrop(req->rsk_listener);
0403     }
0404     reqsk_put(req);
0405 }
0406 EXPORT_SYMBOL(tcp_req_err);
0407 
0408 /* TCP-LD (RFC 6069) logic */
0409 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
0410 {
0411     struct inet_connection_sock *icsk = inet_csk(sk);
0412     struct tcp_sock *tp = tcp_sk(sk);
0413     struct sk_buff *skb;
0414     s32 remaining;
0415     u32 delta_us;
0416 
0417     if (sock_owned_by_user(sk))
0418         return;
0419 
0420     if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
0421         !icsk->icsk_backoff)
0422         return;
0423 
0424     skb = tcp_rtx_queue_head(sk);
0425     if (WARN_ON_ONCE(!skb))
0426         return;
0427 
0428     icsk->icsk_backoff--;
0429     icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
0430     icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
0431 
0432     tcp_mstamp_refresh(tp);
0433     delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
0434     remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
0435 
0436     if (remaining > 0) {
0437         inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
0438                       remaining, TCP_RTO_MAX);
0439     } else {
0440         /* RTO revert clocked out retransmission.
0441          * Will retransmit now.
0442          */
0443         tcp_retransmit_timer(sk);
0444     }
0445 }
0446 EXPORT_SYMBOL(tcp_ld_RTO_revert);
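     /* tcp_ld_RTO_revert() implements RFC 6069 ("TCP-LD"): a net/host
      * unreachable ICMP that matches the first unacknowledged segment
      * suggests the last RTO was caused by a temporary routing failure
      * rather than loss, so one level of exponential backoff is undone and
      * the retransmit timer is re-armed (or fired immediately if it has
      * already run out).
      */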
0447 
0448 /*
0449  * This routine is called by the ICMP module when it gets some
0450  * sort of error condition.  If err < 0 then the socket should
0451  * be closed and the error returned to the user.  If err > 0
0452  * it's just the icmp type << 8 | icmp code.  After adjustment
0453  * header points to the first 8 bytes of the tcp header.  We need
0454  * to find the appropriate port.
0455  *
0456  * The locking strategy used here is very "optimistic". When
0457  * someone else accesses the socket the ICMP is just dropped
0458  * and for some paths there is no check at all.
0459  * A more general error queue to queue errors for later handling
0460  * is probably better.
0461  *
0462  */
0463 
0464 int tcp_v4_err(struct sk_buff *skb, u32 info)
0465 {
0466     const struct iphdr *iph = (const struct iphdr *)skb->data;
0467     struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
0468     struct tcp_sock *tp;
0469     struct inet_sock *inet;
0470     const int type = icmp_hdr(skb)->type;
0471     const int code = icmp_hdr(skb)->code;
0472     struct sock *sk;
0473     struct request_sock *fastopen;
0474     u32 seq, snd_una;
0475     int err;
0476     struct net *net = dev_net(skb->dev);
0477 
0478     sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
0479                        th->dest, iph->saddr, ntohs(th->source),
0480                        inet_iif(skb), 0);
0481     if (!sk) {
0482         __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
0483         return -ENOENT;
0484     }
0485     if (sk->sk_state == TCP_TIME_WAIT) {
0486         inet_twsk_put(inet_twsk(sk));
0487         return 0;
0488     }
0489     seq = ntohl(th->seq);
0490     if (sk->sk_state == TCP_NEW_SYN_RECV) {
0491         tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
0492                      type == ICMP_TIME_EXCEEDED ||
0493                      (type == ICMP_DEST_UNREACH &&
0494                       (code == ICMP_NET_UNREACH ||
0495                        code == ICMP_HOST_UNREACH)));
0496         return 0;
0497     }
0498 
0499     bh_lock_sock(sk);
0500     /* If too many ICMPs get dropped on busy
0501      * servers this needs to be solved differently.
0502      * We do take care of the PMTU discovery (RFC 1191) special case:
0503      * we can receive locally generated ICMP messages while the socket is held.
0504      */
0505     if (sock_owned_by_user(sk)) {
0506         if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
0507             __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
0508     }
0509     if (sk->sk_state == TCP_CLOSE)
0510         goto out;
0511 
0512     if (static_branch_unlikely(&ip4_min_ttl)) {
0513         /* min_ttl can be changed concurrently from do_ip_setsockopt() */
0514         if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
0515             __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
0516             goto out;
0517         }
0518     }
0519 
0520     tp = tcp_sk(sk);
0521     /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
0522     fastopen = rcu_dereference(tp->fastopen_rsk);
0523     snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
0524     if (sk->sk_state != TCP_LISTEN &&
0525         !between(seq, snd_una, tp->snd_nxt)) {
0526         __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
0527         goto out;
0528     }
0529 
0530     switch (type) {
0531     case ICMP_REDIRECT:
0532         if (!sock_owned_by_user(sk))
0533             do_redirect(skb, sk);
0534         goto out;
0535     case ICMP_SOURCE_QUENCH:
0536         /* Just silently ignore these. */
0537         goto out;
0538     case ICMP_PARAMETERPROB:
0539         err = EPROTO;
0540         break;
0541     case ICMP_DEST_UNREACH:
0542         if (code > NR_ICMP_UNREACH)
0543             goto out;
0544 
0545         if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
0546             /* We are not interested in TCP_LISTEN and open_requests
0547              * (SYN-ACKs sent out by Linux are always < 576 bytes, so
0548              * they should go through unfragmented).
0549              */
0550             if (sk->sk_state == TCP_LISTEN)
0551                 goto out;
0552 
0553             WRITE_ONCE(tp->mtu_info, info);
0554             if (!sock_owned_by_user(sk)) {
0555                 tcp_v4_mtu_reduced(sk);
0556             } else {
0557                 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
0558                     sock_hold(sk);
0559             }
0560             goto out;
0561         }
0562 
0563         err = icmp_err_convert[code].errno;
0564         /* check if this ICMP message allows revert of backoff.
0565          * (see RFC 6069)
0566          */
0567         if (!fastopen &&
0568             (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
0569             tcp_ld_RTO_revert(sk, seq);
0570         break;
0571     case ICMP_TIME_EXCEEDED:
0572         err = EHOSTUNREACH;
0573         break;
0574     default:
0575         goto out;
0576     }
0577 
0578     switch (sk->sk_state) {
0579     case TCP_SYN_SENT:
0580     case TCP_SYN_RECV:
0581         /* Only in fast or simultaneous open. If a fast open socket is
0582          * already accepted it is treated as a connected one below.
0583          */
0584         if (fastopen && !fastopen->sk)
0585             break;
0586 
0587         ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
0588 
0589         if (!sock_owned_by_user(sk)) {
0590             sk->sk_err = err;
0591 
0592             sk_error_report(sk);
0593 
0594             tcp_done(sk);
0595         } else {
0596             sk->sk_err_soft = err;
0597         }
0598         goto out;
0599     }
0600 
0601     /* If we've already connected we will keep trying
0602      * until we time out, or the user gives up.
0603      *
0604      * RFC 1122 4.2.3.9 allows us to consider only PROTO_UNREACH and
0605      * PORT_UNREACH as hard errors (well, FRAG_FAILED too, but it is
0606      * obsoleted by PMTU discovery).
0607      *
0608      * Note that in the modern internet, where routing is unreliable and
0609      * broken firewalls sit in every dark corner sending random errors
0610      * ordered by their masters, even these two messages have lost
0611      * their original sense (even Linux sends invalid PORT_UNREACHs).
0612      *
0613      * Now we are in compliance with RFCs.
0614      *                          --ANK (980905)
0615      */
0616 
0617     inet = inet_sk(sk);
0618     if (!sock_owned_by_user(sk) && inet->recverr) {
0619         sk->sk_err = err;
0620         sk_error_report(sk);
0621     } else  { /* Only an error on timeout */
0622         sk->sk_err_soft = err;
0623     }
0624 
0625 out:
0626     bh_unlock_sock(sk);
0627     sock_put(sk);
0628     return 0;
0629 }
0630 
0631 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
0632 {
0633     struct tcphdr *th = tcp_hdr(skb);
0634 
0635     th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
0636     skb->csum_start = skb_transport_header(skb) - skb->head;
0637     skb->csum_offset = offsetof(struct tcphdr, check);
0638 }
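     /* __tcp_v4_send_check() above seeds th->check with the pseudo-header
      * checksum and records csum_start/csum_offset, so the checksum over the
      * TCP header and payload can be completed later by the NIC or by the
      * software CHECKSUM_PARTIAL fallback.
      */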
0639 
0640 /* This routine computes an IPv4 TCP checksum. */
0641 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
0642 {
0643     const struct inet_sock *inet = inet_sk(sk);
0644 
0645     __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
0646 }
0647 EXPORT_SYMBOL(tcp_v4_send_check);
0648 
0649 /*
0650  *  This routine will send an RST to the other tcp.
0651  *
0652  *  Someone asks: why do we NEVER use socket parameters (TOS, TTL etc.)
0653  *            for the reset?
0654  *  Answer: if a packet caused an RST, it is not for a socket
0655  *      existing in our system; if it is matched to a socket,
0656  *      it is just a duplicate segment or a bug in the other side's TCP.
0657  *      So we build the reply based only on the parameters that
0658  *      arrived with the segment.
0659  *  Exception: precedence violation. We do not implement it in any case.
0660  */
0661 
0662 #ifdef CONFIG_TCP_MD5SIG
0663 #define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
0664 #else
0665 #define OPTION_BYTES sizeof(__be32)
0666 #endif
0667 
0668 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
0669 {
0670     const struct tcphdr *th = tcp_hdr(skb);
0671     struct {
0672         struct tcphdr th;
0673         __be32 opt[OPTION_BYTES / sizeof(__be32)];
0674     } rep;
0675     struct ip_reply_arg arg;
0676 #ifdef CONFIG_TCP_MD5SIG
0677     struct tcp_md5sig_key *key = NULL;
0678     const __u8 *hash_location = NULL;
0679     unsigned char newhash[16];
0680     int genhash;
0681     struct sock *sk1 = NULL;
0682 #endif
0683     u64 transmit_time = 0;
0684     struct sock *ctl_sk;
0685     struct net *net;
0686 
0687     /* Never send a reset in response to a reset. */
0688     if (th->rst)
0689         return;
0690 
0691     /* If sk is not NULL, it means we did a successful lookup and the
0692      * incoming route had to be correct. prequeue might have dropped our dst.
0693      */
0694     if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
0695         return;
0696 
0697     /* Swap the send and the receive. */
0698     memset(&rep, 0, sizeof(rep));
0699     rep.th.dest   = th->source;
0700     rep.th.source = th->dest;
0701     rep.th.doff   = sizeof(struct tcphdr) / 4;
0702     rep.th.rst    = 1;
0703 
0704     if (th->ack) {
0705         rep.th.seq = th->ack_seq;
0706     } else {
0707         rep.th.ack = 1;
0708         rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
0709                        skb->len - (th->doff << 2));
0710     }
0711 
0712     memset(&arg, 0, sizeof(arg));
0713     arg.iov[0].iov_base = (unsigned char *)&rep;
0714     arg.iov[0].iov_len  = sizeof(rep.th);
0715 
0716     net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
0717 #ifdef CONFIG_TCP_MD5SIG
0718     rcu_read_lock();
0719     hash_location = tcp_parse_md5sig_option(th);
0720     if (sk && sk_fullsock(sk)) {
0721         const union tcp_md5_addr *addr;
0722         int l3index;
0723 
0724         /* sdif set, means packet ingressed via a device
0725          * in an L3 domain and inet_iif is set to it.
0726          */
0727         l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
0728         addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
0729         key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
0730     } else if (hash_location) {
0731         const union tcp_md5_addr *addr;
0732         int sdif = tcp_v4_sdif(skb);
0733         int dif = inet_iif(skb);
0734         int l3index;
0735 
0736         /*
0737          * The active side is lost. Try to find the listening socket through
0738          * the source port, and then find the md5 key through the listening
0739          * socket. We do not loosen security here:
0740          * the incoming packet is checked against the md5 hash of the key we
0741          * find, and no RST is generated if the md5 hash doesn't match.
0742          */
0743         sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
0744                          ip_hdr(skb)->saddr,
0745                          th->source, ip_hdr(skb)->daddr,
0746                          ntohs(th->source), dif, sdif);
0747         /* don't send rst if it can't find key */
0748         if (!sk1)
0749             goto out;
0750 
0751         /* sdif set, means packet ingressed via a device
0752          * in an L3 domain and dif is set to it.
0753          */
0754         l3index = sdif ? dif : 0;
0755         addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
0756         key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
0757         if (!key)
0758             goto out;
0759 
0760 
0761         genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
0762         if (genhash || memcmp(hash_location, newhash, 16) != 0)
0763             goto out;
0764 
0765     }
0766 
0767     if (key) {
0768         rep.opt[0] = htonl((TCPOPT_NOP << 24) |
0769                    (TCPOPT_NOP << 16) |
0770                    (TCPOPT_MD5SIG << 8) |
0771                    TCPOLEN_MD5SIG);
0772         /* Update length and the length the header thinks exists */
0773         arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
0774         rep.th.doff = arg.iov[0].iov_len / 4;
0775 
0776         tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
0777                      key, ip_hdr(skb)->saddr,
0778                      ip_hdr(skb)->daddr, &rep.th);
0779     }
0780 #endif
0781     /* Can't co-exist with TCPMD5, hence check rep.opt[0] */
0782     if (rep.opt[0] == 0) {
0783         __be32 mrst = mptcp_reset_option(skb);
0784 
0785         if (mrst) {
0786             rep.opt[0] = mrst;
0787             arg.iov[0].iov_len += sizeof(mrst);
0788             rep.th.doff = arg.iov[0].iov_len / 4;
0789         }
0790     }
0791 
0792     arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
0793                       ip_hdr(skb)->saddr, /* XXX */
0794                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
0795     arg.csumoffset = offsetof(struct tcphdr, check) / 2;
0796     arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
0797 
0798     /* When the socket is gone, all binding information is lost, and
0799      * routing might fail in this case. No choice here: if we choose to force
0800      * the input interface, we will misroute in the case of an asymmetric route.
0801      */
0802     if (sk) {
0803         arg.bound_dev_if = sk->sk_bound_dev_if;
0804         if (sk_fullsock(sk))
0805             trace_tcp_send_reset(sk, skb);
0806     }
0807 
0808     BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
0809              offsetof(struct inet_timewait_sock, tw_bound_dev_if));
0810 
0811     arg.tos = ip_hdr(skb)->tos;
0812     arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
0813     local_bh_disable();
0814     ctl_sk = this_cpu_read(ipv4_tcp_sk);
0815     sock_net_set(ctl_sk, net);
0816     if (sk) {
0817         ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
0818                    inet_twsk(sk)->tw_mark : sk->sk_mark;
0819         ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
0820                    inet_twsk(sk)->tw_priority : sk->sk_priority;
0821         transmit_time = tcp_transmit_time(sk);
0822         xfrm_sk_clone_policy(ctl_sk, sk);
0823     }
0824     ip_send_unicast_reply(ctl_sk,
0825                   skb, &TCP_SKB_CB(skb)->header.h4.opt,
0826                   ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
0827                   &arg, arg.iov[0].iov_len,
0828                   transmit_time);
0829 
0830     ctl_sk->sk_mark = 0;
0831     xfrm_sk_free_policy(ctl_sk);
0832     sock_net_set(ctl_sk, &init_net);
0833     __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
0834     __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
0835     local_bh_enable();
0836 
0837 #ifdef CONFIG_TCP_MD5SIG
0838 out:
0839     rcu_read_unlock();
0840 #endif
0841 }
0842 
0843 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
0844    outside of socket context, is certainly ugly. What can I do?
0845  */
0846 
0847 static void tcp_v4_send_ack(const struct sock *sk,
0848                 struct sk_buff *skb, u32 seq, u32 ack,
0849                 u32 win, u32 tsval, u32 tsecr, int oif,
0850                 struct tcp_md5sig_key *key,
0851                 int reply_flags, u8 tos)
0852 {
0853     const struct tcphdr *th = tcp_hdr(skb);
0854     struct {
0855         struct tcphdr th;
0856         __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
0857 #ifdef CONFIG_TCP_MD5SIG
0858                + (TCPOLEN_MD5SIG_ALIGNED >> 2)
0859 #endif
0860             ];
0861     } rep;
0862     struct net *net = sock_net(sk);
0863     struct ip_reply_arg arg;
0864     struct sock *ctl_sk;
0865     u64 transmit_time;
0866 
0867     memset(&rep.th, 0, sizeof(struct tcphdr));
0868     memset(&arg, 0, sizeof(arg));
0869 
0870     arg.iov[0].iov_base = (unsigned char *)&rep;
0871     arg.iov[0].iov_len  = sizeof(rep.th);
0872     if (tsecr) {
0873         rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
0874                    (TCPOPT_TIMESTAMP << 8) |
0875                    TCPOLEN_TIMESTAMP);
0876         rep.opt[1] = htonl(tsval);
0877         rep.opt[2] = htonl(tsecr);
0878         arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
0879     }
0880 
0881     /* Swap the send and the receive. */
0882     rep.th.dest    = th->source;
0883     rep.th.source  = th->dest;
0884     rep.th.doff    = arg.iov[0].iov_len / 4;
0885     rep.th.seq     = htonl(seq);
0886     rep.th.ack_seq = htonl(ack);
0887     rep.th.ack     = 1;
0888     rep.th.window  = htons(win);
0889 
0890 #ifdef CONFIG_TCP_MD5SIG
0891     if (key) {
0892         int offset = (tsecr) ? 3 : 0;
0893 
0894         rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
0895                       (TCPOPT_NOP << 16) |
0896                       (TCPOPT_MD5SIG << 8) |
0897                       TCPOLEN_MD5SIG);
0898         arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
0899         rep.th.doff = arg.iov[0].iov_len/4;
0900 
0901         tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
0902                     key, ip_hdr(skb)->saddr,
0903                     ip_hdr(skb)->daddr, &rep.th);
0904     }
0905 #endif
0906     arg.flags = reply_flags;
0907     arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
0908                       ip_hdr(skb)->saddr, /* XXX */
0909                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
0910     arg.csumoffset = offsetof(struct tcphdr, check) / 2;
0911     if (oif)
0912         arg.bound_dev_if = oif;
0913     arg.tos = tos;
0914     arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
0915     local_bh_disable();
0916     ctl_sk = this_cpu_read(ipv4_tcp_sk);
0917     sock_net_set(ctl_sk, net);
0918     ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
0919                inet_twsk(sk)->tw_mark : sk->sk_mark;
0920     ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
0921                inet_twsk(sk)->tw_priority : sk->sk_priority;
0922     transmit_time = tcp_transmit_time(sk);
0923     ip_send_unicast_reply(ctl_sk,
0924                   skb, &TCP_SKB_CB(skb)->header.h4.opt,
0925                   ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
0926                   &arg, arg.iov[0].iov_len,
0927                   transmit_time);
0928 
0929     ctl_sk->sk_mark = 0;
0930     sock_net_set(ctl_sk, &init_net);
0931     __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
0932     local_bh_enable();
0933 }
0934 
0935 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
0936 {
0937     struct inet_timewait_sock *tw = inet_twsk(sk);
0938     struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
0939 
0940     tcp_v4_send_ack(sk, skb,
0941             tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
0942             tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
0943             tcp_time_stamp_raw() + tcptw->tw_ts_offset,
0944             tcptw->tw_ts_recent,
0945             tw->tw_bound_dev_if,
0946             tcp_twsk_md5_key(tcptw),
0947             tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
0948             tw->tw_tos
0949             );
0950 
0951     inet_twsk_put(tw);
0952 }
0953 
0954 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
0955                   struct request_sock *req)
0956 {
0957     const union tcp_md5_addr *addr;
0958     int l3index;
0959 
0960     /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
0961      * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
0962      */
0963     u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
0964                          tcp_sk(sk)->snd_nxt;
0965 
0966     /* RFC 7323 2.3
0967      * The window field (SEG.WND) of every outgoing segment, with the
0968      * exception of <SYN> segments, MUST be right-shifted by
0969      * Rcv.Wind.Shift bits:
0970      */
0971     addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
0972     l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
0973     tcp_v4_send_ack(sk, skb, seq,
0974             tcp_rsk(req)->rcv_nxt,
0975             req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
0976             tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
0977             req->ts_recent,
0978             0,
0979             tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
0980             inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
0981             ip_hdr(skb)->tos);
0982 }
0983 
0984 /*
0985  *  Send a SYN-ACK after having received a SYN.
0986  *  This still operates on a request_sock only, not on a big
0987  *  socket.
0988  */
0989 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
0990                   struct flowi *fl,
0991                   struct request_sock *req,
0992                   struct tcp_fastopen_cookie *foc,
0993                   enum tcp_synack_type synack_type,
0994                   struct sk_buff *syn_skb)
0995 {
0996     const struct inet_request_sock *ireq = inet_rsk(req);
0997     struct flowi4 fl4;
0998     int err = -1;
0999     struct sk_buff *skb;
1000     u8 tos;
1001 
1002     /* First, grab a route. */
1003     if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1004         return -1;
1005 
1006     skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1007 
1008     if (skb) {
1009         __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1010 
1011         tos = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ?
1012                 (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1013                 (inet_sk(sk)->tos & INET_ECN_MASK) :
1014                 inet_sk(sk)->tos;
1015 
1016         if (!INET_ECN_is_capable(tos) &&
1017             tcp_bpf_ca_needs_ecn((struct sock *)req))
1018             tos |= INET_ECN_ECT_0;
1019 
1020         rcu_read_lock();
1021         err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1022                         ireq->ir_rmt_addr,
1023                         rcu_dereference(ireq->ireq_opt),
1024                         tos);
1025         rcu_read_unlock();
1026         err = net_xmit_eval(err);
1027     }
1028 
1029     return err;
1030 }
1031 
1032 /*
1033  *  IPv4 request_sock destructor.
1034  */
1035 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1036 {
1037     kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1038 }
1039 
1040 #ifdef CONFIG_TCP_MD5SIG
1041 /*
1042  * RFC2385 MD5 checksumming requires a mapping of
1043  * IP address->MD5 Key.
1044  * We need to maintain these in the sk structure.
1045  */
1046 
1047 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
1048 EXPORT_SYMBOL(tcp_md5_needed);
1049 
1050 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1051 {
1052     if (!old)
1053         return true;
1054 
1055     /* l3index always overrides non-l3index */
1056     if (old->l3index && new->l3index == 0)
1057         return false;
1058     if (old->l3index == 0 && new->l3index)
1059         return true;
1060 
1061     return old->prefixlen < new->prefixlen;
1062 }
1063 
1064 /* Find the Key structure for an address.  */
1065 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1066                        const union tcp_md5_addr *addr,
1067                        int family)
1068 {
1069     const struct tcp_sock *tp = tcp_sk(sk);
1070     struct tcp_md5sig_key *key;
1071     const struct tcp_md5sig_info *md5sig;
1072     __be32 mask;
1073     struct tcp_md5sig_key *best_match = NULL;
1074     bool match;
1075 
1076     /* caller either holds rcu_read_lock() or socket lock */
1077     md5sig = rcu_dereference_check(tp->md5sig_info,
1078                        lockdep_sock_is_held(sk));
1079     if (!md5sig)
1080         return NULL;
1081 
1082     hlist_for_each_entry_rcu(key, &md5sig->head, node,
1083                  lockdep_sock_is_held(sk)) {
1084         if (key->family != family)
1085             continue;
1086         if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index)
1087             continue;
1088         if (family == AF_INET) {
1089             mask = inet_make_mask(key->prefixlen);
1090             match = (key->addr.a4.s_addr & mask) ==
1091                 (addr->a4.s_addr & mask);
1092 #if IS_ENABLED(CONFIG_IPV6)
1093         } else if (family == AF_INET6) {
1094             match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1095                           key->prefixlen);
1096 #endif
1097         } else {
1098             match = false;
1099         }
1100 
1101         if (match && better_md5_match(best_match, key))
1102             best_match = key;
1103     }
1104     return best_match;
1105 }
1106 EXPORT_SYMBOL(__tcp_md5_do_lookup);
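     /* Note the "best match" semantics above: keys bound to an L3 domain
      * (nonzero l3index) win over unbound ones, and among otherwise equal
      * candidates the longest prefix wins (better_md5_match()).  Exact
      * matching, used when adding or deleting keys, is done by
      * tcp_md5_do_lookup_exact() below.
      */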
1107 
1108 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1109                               const union tcp_md5_addr *addr,
1110                               int family, u8 prefixlen,
1111                               int l3index, u8 flags)
1112 {
1113     const struct tcp_sock *tp = tcp_sk(sk);
1114     struct tcp_md5sig_key *key;
1115     unsigned int size = sizeof(struct in_addr);
1116     const struct tcp_md5sig_info *md5sig;
1117 
1118     /* caller either holds rcu_read_lock() or socket lock */
1119     md5sig = rcu_dereference_check(tp->md5sig_info,
1120                        lockdep_sock_is_held(sk));
1121     if (!md5sig)
1122         return NULL;
1123 #if IS_ENABLED(CONFIG_IPV6)
1124     if (family == AF_INET6)
1125         size = sizeof(struct in6_addr);
1126 #endif
1127     hlist_for_each_entry_rcu(key, &md5sig->head, node,
1128                  lockdep_sock_is_held(sk)) {
1129         if (key->family != family)
1130             continue;
1131         if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1132             continue;
1133         if (key->l3index != l3index)
1134             continue;
1135         if (!memcmp(&key->addr, addr, size) &&
1136             key->prefixlen == prefixlen)
1137             return key;
1138     }
1139     return NULL;
1140 }
1141 
1142 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1143                      const struct sock *addr_sk)
1144 {
1145     const union tcp_md5_addr *addr;
1146     int l3index;
1147 
1148     l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1149                          addr_sk->sk_bound_dev_if);
1150     addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1151     return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1152 }
1153 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1154 
1155 /* This can be called on a newly created socket, from other files */
1156 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1157            int family, u8 prefixlen, int l3index, u8 flags,
1158            const u8 *newkey, u8 newkeylen, gfp_t gfp)
1159 {
1160     /* Add Key to the list */
1161     struct tcp_md5sig_key *key;
1162     struct tcp_sock *tp = tcp_sk(sk);
1163     struct tcp_md5sig_info *md5sig;
1164 
1165     key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1166     if (key) {
1167         /* Pre-existing entry - just update that one.
1168          * Note that the key might be used concurrently.
1169          * data_race() is telling kcsan that we do not care of
1170          * key mismatches, since changing MD5 key on live flows
1171          * can lead to packet drops.
1172          */
1173         data_race(memcpy(key->key, newkey, newkeylen));
1174 
1175         /* Pairs with READ_ONCE() in tcp_md5_hash_key().
1176          * Also note that a reader could catch new key->keylen value
1177          * but old key->key[], this is the reason we use __GFP_ZERO
1178          * at sock_kmalloc() time below these lines.
1179          */
1180         WRITE_ONCE(key->keylen, newkeylen);
1181 
1182         return 0;
1183     }
1184 
1185     md5sig = rcu_dereference_protected(tp->md5sig_info,
1186                        lockdep_sock_is_held(sk));
1187     if (!md5sig) {
1188         md5sig = kmalloc(sizeof(*md5sig), gfp);
1189         if (!md5sig)
1190             return -ENOMEM;
1191 
1192         sk_gso_disable(sk);
1193         INIT_HLIST_HEAD(&md5sig->head);
1194         rcu_assign_pointer(tp->md5sig_info, md5sig);
1195     }
1196 
1197     key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1198     if (!key)
1199         return -ENOMEM;
1200     if (!tcp_alloc_md5sig_pool()) {
1201         sock_kfree_s(sk, key, sizeof(*key));
1202         return -ENOMEM;
1203     }
1204 
1205     memcpy(key->key, newkey, newkeylen);
1206     key->keylen = newkeylen;
1207     key->family = family;
1208     key->prefixlen = prefixlen;
1209     key->l3index = l3index;
1210     key->flags = flags;
1211     memcpy(&key->addr, addr,
1212            (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
1213                                  sizeof(struct in_addr));
1214     hlist_add_head_rcu(&key->node, &md5sig->head);
1215     return 0;
1216 }
1217 EXPORT_SYMBOL(tcp_md5_do_add);
1218 
1219 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1220            u8 prefixlen, int l3index, u8 flags)
1221 {
1222     struct tcp_md5sig_key *key;
1223 
1224     key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1225     if (!key)
1226         return -ENOENT;
1227     hlist_del_rcu(&key->node);
1228     atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1229     kfree_rcu(key, rcu);
1230     return 0;
1231 }
1232 EXPORT_SYMBOL(tcp_md5_do_del);
1233 
1234 static void tcp_clear_md5_list(struct sock *sk)
1235 {
1236     struct tcp_sock *tp = tcp_sk(sk);
1237     struct tcp_md5sig_key *key;
1238     struct hlist_node *n;
1239     struct tcp_md5sig_info *md5sig;
1240 
1241     md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1242 
1243     hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1244         hlist_del_rcu(&key->node);
1245         atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1246         kfree_rcu(key, rcu);
1247     }
1248 }
1249 
1250 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1251                  sockptr_t optval, int optlen)
1252 {
1253     struct tcp_md5sig cmd;
1254     struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1255     const union tcp_md5_addr *addr;
1256     u8 prefixlen = 32;
1257     int l3index = 0;
1258     u8 flags;
1259 
1260     if (optlen < sizeof(cmd))
1261         return -EINVAL;
1262 
1263     if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1264         return -EFAULT;
1265 
1266     if (sin->sin_family != AF_INET)
1267         return -EINVAL;
1268 
1269     flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1270 
1271     if (optname == TCP_MD5SIG_EXT &&
1272         cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1273         prefixlen = cmd.tcpm_prefixlen;
1274         if (prefixlen > 32)
1275             return -EINVAL;
1276     }
1277 
1278     if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1279         cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1280         struct net_device *dev;
1281 
1282         rcu_read_lock();
1283         dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1284         if (dev && netif_is_l3_master(dev))
1285             l3index = dev->ifindex;
1286 
1287         rcu_read_unlock();
1288 
1289         /* ok to reference set/not set outside of rcu;
1290          * right now device MUST be an L3 master
1291          */
1292         if (!dev || !l3index)
1293             return -EINVAL;
1294     }
1295 
1296     addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1297 
1298     if (!cmd.tcpm_keylen)
1299         return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1300 
1301     if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1302         return -EINVAL;
1303 
1304     return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1305                   cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1306 }
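     /* tcp_v4_parse_md5_keys() above is the setsockopt() backend for
      * TCP_MD5SIG and TCP_MD5SIG_EXT.  As a rough illustration only (not
      * part of this file; error handling omitted, headers such as
      * <netinet/tcp.h> and <arpa/inet.h> assumed), userspace would install
      * a key for a peer on an already-created TCP socket fd like this:
      *
      *   struct tcp_md5sig md5;
      *   struct sockaddr_in *peer = (struct sockaddr_in *)&md5.tcpm_addr;
      *
      *   memset(&md5, 0, sizeof(md5));
      *   peer->sin_family = AF_INET;
      *   peer->sin_addr.s_addr = inet_addr("192.0.2.1");
      *   md5.tcpm_keylen = strlen("secret");
      *   memcpy(md5.tcpm_key, "secret", md5.tcpm_keylen);
      *   setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
      */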
1307 
1308 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1309                    __be32 daddr, __be32 saddr,
1310                    const struct tcphdr *th, int nbytes)
1311 {
1312     struct tcp4_pseudohdr *bp;
1313     struct scatterlist sg;
1314     struct tcphdr *_th;
1315 
1316     bp = hp->scratch;
1317     bp->saddr = saddr;
1318     bp->daddr = daddr;
1319     bp->pad = 0;
1320     bp->protocol = IPPROTO_TCP;
1321     bp->len = cpu_to_be16(nbytes);
1322 
1323     _th = (struct tcphdr *)(bp + 1);
1324     memcpy(_th, th, sizeof(*th));
1325     _th->check = 0;
1326 
1327     sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1328     ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1329                 sizeof(*bp) + sizeof(*th));
1330     return crypto_ahash_update(hp->md5_req);
1331 }
1332 
1333 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1334                    __be32 daddr, __be32 saddr, const struct tcphdr *th)
1335 {
1336     struct tcp_md5sig_pool *hp;
1337     struct ahash_request *req;
1338 
1339     hp = tcp_get_md5sig_pool();
1340     if (!hp)
1341         goto clear_hash_noput;
1342     req = hp->md5_req;
1343 
1344     if (crypto_ahash_init(req))
1345         goto clear_hash;
1346     if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1347         goto clear_hash;
1348     if (tcp_md5_hash_key(hp, key))
1349         goto clear_hash;
1350     ahash_request_set_crypt(req, NULL, md5_hash, 0);
1351     if (crypto_ahash_final(req))
1352         goto clear_hash;
1353 
1354     tcp_put_md5sig_pool();
1355     return 0;
1356 
1357 clear_hash:
1358     tcp_put_md5sig_pool();
1359 clear_hash_noput:
1360     memset(md5_hash, 0, 16);
1361     return 1;
1362 }
1363 
1364 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1365             const struct sock *sk,
1366             const struct sk_buff *skb)
1367 {
1368     struct tcp_md5sig_pool *hp;
1369     struct ahash_request *req;
1370     const struct tcphdr *th = tcp_hdr(skb);
1371     __be32 saddr, daddr;
1372 
1373     if (sk) { /* valid for establish/request sockets */
1374         saddr = sk->sk_rcv_saddr;
1375         daddr = sk->sk_daddr;
1376     } else {
1377         const struct iphdr *iph = ip_hdr(skb);
1378         saddr = iph->saddr;
1379         daddr = iph->daddr;
1380     }
1381 
1382     hp = tcp_get_md5sig_pool();
1383     if (!hp)
1384         goto clear_hash_noput;
1385     req = hp->md5_req;
1386 
1387     if (crypto_ahash_init(req))
1388         goto clear_hash;
1389 
1390     if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1391         goto clear_hash;
1392     if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1393         goto clear_hash;
1394     if (tcp_md5_hash_key(hp, key))
1395         goto clear_hash;
1396     ahash_request_set_crypt(req, NULL, md5_hash, 0);
1397     if (crypto_ahash_final(req))
1398         goto clear_hash;
1399 
1400     tcp_put_md5sig_pool();
1401     return 0;
1402 
1403 clear_hash:
1404     tcp_put_md5sig_pool();
1405 clear_hash_noput:
1406     memset(md5_hash, 0, 16);
1407     return 1;
1408 }
1409 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1410 
1411 #endif
1412 
1413 static void tcp_v4_init_req(struct request_sock *req,
1414                 const struct sock *sk_listener,
1415                 struct sk_buff *skb)
1416 {
1417     struct inet_request_sock *ireq = inet_rsk(req);
1418     struct net *net = sock_net(sk_listener);
1419 
1420     sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1421     sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1422     RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1423 }
1424 
1425 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1426                       struct sk_buff *skb,
1427                       struct flowi *fl,
1428                       struct request_sock *req)
1429 {
1430     tcp_v4_init_req(req, sk, skb);
1431 
1432     if (security_inet_conn_request(sk, skb, req))
1433         return NULL;
1434 
1435     return inet_csk_route_req(sk, &fl->u.ip4, req);
1436 }
1437 
1438 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1439     .family     =   PF_INET,
1440     .obj_size   =   sizeof(struct tcp_request_sock),
1441     .rtx_syn_ack    =   tcp_rtx_synack,
1442     .send_ack   =   tcp_v4_reqsk_send_ack,
1443     .destructor =   tcp_v4_reqsk_destructor,
1444     .send_reset =   tcp_v4_send_reset,
1445     .syn_ack_timeout =  tcp_syn_ack_timeout,
1446 };
1447 
1448 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1449     .mss_clamp  =   TCP_MSS_DEFAULT,
1450 #ifdef CONFIG_TCP_MD5SIG
1451     .req_md5_lookup =   tcp_v4_md5_lookup,
1452     .calc_md5_hash  =   tcp_v4_md5_hash_skb,
1453 #endif
1454 #ifdef CONFIG_SYN_COOKIES
1455     .cookie_init_seq =  cookie_v4_init_sequence,
1456 #endif
1457     .route_req  =   tcp_v4_route_req,
1458     .init_seq   =   tcp_v4_init_seq,
1459     .init_ts_off    =   tcp_v4_init_ts_off,
1460     .send_synack    =   tcp_v4_send_synack,
1461 };
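     /* Both ops tables above are handed to tcp_conn_request() (see
      * tcp_v4_conn_request() below): the af-independent core drives SYN
      * processing and calls back through .route_req, .init_seq,
      * .init_ts_off and .send_synack for the IPv4-specific parts, and
      * through .cookie_init_seq when syncookies are in use.
      */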
1462 
1463 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1464 {
1465     /* Never answer SYNs sent to broadcast or multicast addresses */
1466     if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1467         goto drop;
1468 
1469     return tcp_conn_request(&tcp_request_sock_ops,
1470                 &tcp_request_sock_ipv4_ops, sk, skb);
1471 
1472 drop:
1473     tcp_listendrop(sk);
1474     return 0;
1475 }
1476 EXPORT_SYMBOL(tcp_v4_conn_request);
1477 
1478 
1479 /*
1480  * The three way handshake has completed - we got a valid synack -
1481  * now create the new socket.
1482  */
1483 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1484                   struct request_sock *req,
1485                   struct dst_entry *dst,
1486                   struct request_sock *req_unhash,
1487                   bool *own_req)
1488 {
1489     struct inet_request_sock *ireq;
1490     bool found_dup_sk = false;
1491     struct inet_sock *newinet;
1492     struct tcp_sock *newtp;
1493     struct sock *newsk;
1494 #ifdef CONFIG_TCP_MD5SIG
1495     const union tcp_md5_addr *addr;
1496     struct tcp_md5sig_key *key;
1497     int l3index;
1498 #endif
1499     struct ip_options_rcu *inet_opt;
1500 
1501     if (sk_acceptq_is_full(sk))
1502         goto exit_overflow;
1503 
1504     newsk = tcp_create_openreq_child(sk, req, skb);
1505     if (!newsk)
1506         goto exit_nonewsk;
1507 
1508     newsk->sk_gso_type = SKB_GSO_TCPV4;
1509     inet_sk_rx_dst_set(newsk, skb);
1510 
1511     newtp             = tcp_sk(newsk);
1512     newinet           = inet_sk(newsk);
1513     ireq              = inet_rsk(req);
1514     sk_daddr_set(newsk, ireq->ir_rmt_addr);
1515     sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1516     newsk->sk_bound_dev_if = ireq->ir_iif;
1517     newinet->inet_saddr   = ireq->ir_loc_addr;
1518     inet_opt          = rcu_dereference(ireq->ireq_opt);
1519     RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1520     newinet->mc_index     = inet_iif(skb);
1521     newinet->mc_ttl       = ip_hdr(skb)->ttl;
1522     newinet->rcv_tos      = ip_hdr(skb)->tos;
1523     inet_csk(newsk)->icsk_ext_hdr_len = 0;
1524     if (inet_opt)
1525         inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1526     newinet->inet_id = prandom_u32();
1527 
1528     /* Set ToS of the new socket based upon the value of incoming SYN.
1529      * ECT bits are set later in tcp_init_transfer().
1530      */
1531     if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1532         newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1533 
1534     if (!dst) {
1535         dst = inet_csk_route_child_sock(sk, newsk, req);
1536         if (!dst)
1537             goto put_and_exit;
1538     } else {
1539         /* syncookie case : see end of cookie_v4_check() */
1540     }
1541     sk_setup_caps(newsk, dst);
1542 
1543     tcp_ca_openreq_child(newsk, dst);
1544 
1545     tcp_sync_mss(newsk, dst_mtu(dst));
1546     newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1547 
1548     tcp_initialize_rcv_mss(newsk);
1549 
1550 #ifdef CONFIG_TCP_MD5SIG
1551     l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1552     /* Copy over the MD5 key from the original socket */
1553     addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1554     key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1555     if (key) {
1556         /*
1557          * We're using one, so create a matching key
1558          * on the newsk structure. If we fail to get
1559          * memory, then we end up not copying the key
1560          * across. Shucks.
1561          */
1562         tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, key->flags,
1563                    key->key, key->keylen, GFP_ATOMIC);
1564         sk_gso_disable(newsk);
1565     }
1566 #endif
1567 
1568     if (__inet_inherit_port(sk, newsk) < 0)
1569         goto put_and_exit;
1570     *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1571                        &found_dup_sk);
1572     if (likely(*own_req)) {
1573         tcp_move_syn(newtp, req);
1574         ireq->ireq_opt = NULL;
1575     } else {
1576         newinet->inet_opt = NULL;
1577 
1578         if (!req_unhash && found_dup_sk) {
1579             /* This code path should be executed only in the
1580              * syncookie case.
1581              */
1582             bh_unlock_sock(newsk);
1583             sock_put(newsk);
1584             newsk = NULL;
1585         }
1586     }
1587     return newsk;
1588 
1589 exit_overflow:
1590     NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1591 exit_nonewsk:
1592     dst_release(dst);
1593 exit:
1594     tcp_listendrop(sk);
1595     return NULL;
1596 put_and_exit:
1597     newinet->inet_opt = NULL;
1598     inet_csk_prepare_forced_close(newsk);
1599     tcp_done(newsk);
1600     goto exit;
1601 }
1602 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1603 
1604 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1605 {
1606 #ifdef CONFIG_SYN_COOKIES
1607     const struct tcphdr *th = tcp_hdr(skb);
1608 
1609     if (!th->syn)
1610         sk = cookie_v4_check(sk, skb);
1611 #endif
1612     return sk;
1613 }
1614 
1615 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1616              struct tcphdr *th, u32 *cookie)
1617 {
1618     u16 mss = 0;
1619 #ifdef CONFIG_SYN_COOKIES
1620     mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1621                     &tcp_request_sock_ipv4_ops, sk, th);
1622     if (mss) {
1623         *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1624         tcp_synq_overflow(sk);
1625     }
1626 #endif
1627     return mss;
1628 }
1629 
1630 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1631                                u32));
1632 /* The socket must have its spinlock held when we get
1633  * here, unless it is a TCP_LISTEN socket.
1634  *
1635  * We have a potential double-lock case here, so even when
1636  * doing backlog processing we use the BH locking scheme.
1637  * This is because we cannot sleep with the original spinlock
1638  * held.
1639  */
1640 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1641 {
1642     enum skb_drop_reason reason;
1643     struct sock *rsk;
1644 
1645     if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1646         struct dst_entry *dst;
1647 
1648         dst = rcu_dereference_protected(sk->sk_rx_dst,
1649                         lockdep_sock_is_held(sk));
1650 
1651         sock_rps_save_rxhash(sk, skb);
1652         sk_mark_napi_id(sk, skb);
1653         if (dst) {
1654             if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1655                 !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1656                          dst, 0)) {
1657                 RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1658                 dst_release(dst);
1659             }
1660         }
1661         tcp_rcv_established(sk, skb);
1662         return 0;
1663     }
1664 
1665     reason = SKB_DROP_REASON_NOT_SPECIFIED;
1666     if (tcp_checksum_complete(skb))
1667         goto csum_err;
1668 
1669     if (sk->sk_state == TCP_LISTEN) {
1670         struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1671 
1672         if (!nsk)
1673             goto discard;
1674         if (nsk != sk) {
1675             if (tcp_child_process(sk, nsk, skb)) {
1676                 rsk = nsk;
1677                 goto reset;
1678             }
1679             return 0;
1680         }
1681     } else
1682         sock_rps_save_rxhash(sk, skb);
1683 
1684     if (tcp_rcv_state_process(sk, skb)) {
1685         rsk = sk;
1686         goto reset;
1687     }
1688     return 0;
1689 
1690 reset:
1691     tcp_v4_send_reset(rsk, skb);
1692 discard:
1693     kfree_skb_reason(skb, reason);
1694     /* Be careful here. If this function gets more complicated and
1695      * gcc suffers from register pressure on the x86, sk (in %ebx)
1696      * might be destroyed here. This current version compiles correctly,
1697      * but you have been warned.
1698      */
1699     return 0;
1700 
1701 csum_err:
1702     reason = SKB_DROP_REASON_TCP_CSUM;
1703     trace_tcp_bad_csum(skb);
1704     TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1705     TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1706     goto discard;
1707 }
1708 EXPORT_SYMBOL(tcp_v4_do_rcv);
1709 
1710 int tcp_v4_early_demux(struct sk_buff *skb)
1711 {
1712     const struct iphdr *iph;
1713     const struct tcphdr *th;
1714     struct sock *sk;
1715 
1716     if (skb->pkt_type != PACKET_HOST)
1717         return 0;
1718 
1719     if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1720         return 0;
1721 
1722     iph = ip_hdr(skb);
1723     th = tcp_hdr(skb);
1724 
1725     if (th->doff < sizeof(struct tcphdr) / 4)
1726         return 0;
1727 
1728     sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1729                        iph->saddr, th->source,
1730                        iph->daddr, ntohs(th->dest),
1731                        skb->skb_iif, inet_sdif(skb));
1732     if (sk) {
1733         skb->sk = sk;
1734         skb->destructor = sock_edemux;
1735         if (sk_fullsock(sk)) {
1736             struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1737 
1738             if (dst)
1739                 dst = dst_check(dst, 0);
1740             if (dst &&
1741                 sk->sk_rx_dst_ifindex == skb->skb_iif)
1742                 skb_dst_set_noref(skb, dst);
1743         }
1744     }
1745     return 0;
1746 }
1747 
1748 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
1749              enum skb_drop_reason *reason)
1750 {
1751     u32 limit, tail_gso_size, tail_gso_segs;
1752     struct skb_shared_info *shinfo;
1753     const struct tcphdr *th;
1754     struct tcphdr *thtail;
1755     struct sk_buff *tail;
1756     unsigned int hdrlen;
1757     bool fragstolen;
1758     u32 gso_segs;
1759     u32 gso_size;
1760     int delta;
1761 
1762     /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1763      * we can fix skb->truesize to its real value to avoid future drops.
1764      * This is valid because skb is not yet charged to the socket.
1765      * It has been noticed that pure SACK packets were sometimes dropped
1766      * (if cooked by drivers without the copybreak feature).
1767      */
1768     skb_condense(skb);
1769 
1770     skb_dst_drop(skb);
1771 
1772     if (unlikely(tcp_checksum_complete(skb))) {
1773         bh_unlock_sock(sk);
1774         trace_tcp_bad_csum(skb);
1775         *reason = SKB_DROP_REASON_TCP_CSUM;
1776         __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1777         __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1778         return true;
1779     }
1780 
1781     /* Attempt coalescing to last skb in backlog, even if we are
1782      * above the limits.
1783      * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1784      */
1785     th = (const struct tcphdr *)skb->data;
1786     hdrlen = th->doff * 4;
1787 
1788     tail = sk->sk_backlog.tail;
1789     if (!tail)
1790         goto no_coalesce;
1791     thtail = (struct tcphdr *)tail->data;
1792 
1793     if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1794         TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1795         ((TCP_SKB_CB(tail)->tcp_flags |
1796           TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1797         !((TCP_SKB_CB(tail)->tcp_flags &
1798           TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1799         ((TCP_SKB_CB(tail)->tcp_flags ^
1800           TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1801 #ifdef CONFIG_TLS_DEVICE
1802         tail->decrypted != skb->decrypted ||
1803 #endif
1804         thtail->doff != th->doff ||
1805         memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1806         goto no_coalesce;
1807 
1808     __skb_pull(skb, hdrlen);
1809 
1810     shinfo = skb_shinfo(skb);
1811     gso_size = shinfo->gso_size ?: skb->len;
1812     gso_segs = shinfo->gso_segs ?: 1;
1813 
1814     shinfo = skb_shinfo(tail);
1815     tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1816     tail_gso_segs = shinfo->gso_segs ?: 1;
1817 
1818     if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1819         TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1820 
1821         if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1822             TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1823             thtail->window = th->window;
1824         }
1825 
1826         /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1827          * thtail->fin, so that the fast path in tcp_rcv_established()
1828          * is not entered if we append a packet with a FIN.
1829          * SYN, RST, URG are not present.
1830          * ACK is set on both packets.
1831          * PSH : we do not really care in TCP stack,
1832          *       at least for 'GRO' packets.
1833          */
1834         thtail->fin |= th->fin;
1835         TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1836 
1837         if (TCP_SKB_CB(skb)->has_rxtstamp) {
1838             TCP_SKB_CB(tail)->has_rxtstamp = true;
1839             tail->tstamp = skb->tstamp;
1840             skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1841         }
1842 
1843         /* Not as strict as GRO. We only need to carry mss max value */
1844         shinfo->gso_size = max(gso_size, tail_gso_size);
1845         shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
1846 
1847         sk->sk_backlog.len += delta;
1848         __NET_INC_STATS(sock_net(sk),
1849                 LINUX_MIB_TCPBACKLOGCOALESCE);
1850         kfree_skb_partial(skb, fragstolen);
1851         return false;
1852     }
1853     __skb_push(skb, hdrlen);
1854 
1855 no_coalesce:
1856     /* Only socket owner can try to collapse/prune rx queues
1857      * to reduce memory overhead, so add a little headroom here.
1858      * Only a few socket backlogs are likely to be non-empty at any given time.
1859      */
1860     limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf) + 64*1024;
1861 
1862     if (unlikely(sk_add_backlog(sk, skb, limit))) {
1863         bh_unlock_sock(sk);
1864         *reason = SKB_DROP_REASON_SOCKET_BACKLOG;
1865         __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1866         return true;
1867     }
1868     return false;
1869 }
1870 EXPORT_SYMBOL(tcp_add_backlog);
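
/* The backlog limit above is intentionally generous: it is the sum of the
 * receive and send buffer sizes plus 64 KiB of headroom, since only the
 * socket owner can collapse/prune the receive queues.  A rough sketch of
 * the arithmetic (the buffer sizes below are illustrative, not values
 * taken from this file):
 *
 *	u32 rcvbuf = 128 * 1024;			// example sk_rcvbuf
 *	u32 sndbuf = 64 * 1024;				// example sk_sndbuf
 *	u32 limit  = rcvbuf + sndbuf + 64 * 1024;
 *	// limit == 256 KiB of backlog allowed before
 *	// sk_add_backlog() fails and the skb is dropped.
 */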
1871 
1872 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1873 {
1874     struct tcphdr *th = (struct tcphdr *)skb->data;
1875 
1876     return sk_filter_trim_cap(sk, skb, th->doff * 4);
1877 }
1878 EXPORT_SYMBOL(tcp_filter);
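
/* tcp_filter() asks sk_filter_trim_cap() not to trim the skb below
 * th->doff * 4 bytes, i.e. the full TCP header including options.
 * A small worked example (values are illustrative):
 *
 *	th->doff == 5  ->  cap = 20 bytes (bare header, no options)
 *	th->doff == 8  ->  cap = 32 bytes (20-byte header + 12 bytes of options)
 *
 * so a socket filter may drop or shorten the payload, but it can never
 * strip the TCP header itself.
 */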
1879 
1880 static void tcp_v4_restore_cb(struct sk_buff *skb)
1881 {
1882     memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1883         sizeof(struct inet_skb_parm));
1884 }
1885 
1886 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1887                const struct tcphdr *th)
1888 {
1889     /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB();
1890      * barrier() makes sure the compiler won't play fool^Waliasing games.
1891      */
1892     memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1893         sizeof(struct inet_skb_parm));
1894     barrier();
1895 
1896     TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1897     TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1898                     skb->len - th->doff * 4);
1899     TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1900     TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1901     TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1902     TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1903     TCP_SKB_CB(skb)->sacked  = 0;
1904     TCP_SKB_CB(skb)->has_rxtstamp =
1905             skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1906 }
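
/* end_seq above follows the usual TCP sequence-space rule: SYN and FIN
 * each consume one sequence number in addition to the payload bytes.
 * A worked example (illustrative numbers):
 *
 *	seq = 1000, doff = 5, skb->len = 120  ->  100 bytes of payload
 *	FIN set, SYN clear                    ->  end_seq = 1000 + 0 + 1 + 100 = 1101
 */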
1907 
1908 /*
1909  *  From tcp_input.c
1910  */
1911 
1912 int tcp_v4_rcv(struct sk_buff *skb)
1913 {
1914     struct net *net = dev_net(skb->dev);
1915     enum skb_drop_reason drop_reason;
1916     int sdif = inet_sdif(skb);
1917     int dif = inet_iif(skb);
1918     const struct iphdr *iph;
1919     const struct tcphdr *th;
1920     bool refcounted;
1921     struct sock *sk;
1922     int ret;
1923 
1924     drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
1925     if (skb->pkt_type != PACKET_HOST)
1926         goto discard_it;
1927 
1928     /* Count it even if it's bad */
1929     __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1930 
1931     if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1932         goto discard_it;
1933 
1934     th = (const struct tcphdr *)skb->data;
1935 
1936     if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
1937         drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
1938         goto bad_packet;
1939     }
1940     if (!pskb_may_pull(skb, th->doff * 4))
1941         goto discard_it;
1942 
1943     /* An explanation is required here, I think.
1944      * Packet length and doff are validated by header prediction,
1945      * provided the th->doff == 0 case has been eliminated.
1946      * So, we defer the checks. */
1947 
1948     if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1949         goto csum_error;
1950 
1951     th = (const struct tcphdr *)skb->data;
1952     iph = ip_hdr(skb);
1953 lookup:
1954     sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1955                    th->dest, sdif, &refcounted);
1956     if (!sk)
1957         goto no_tcp_socket;
1958 
1959 process:
1960     if (sk->sk_state == TCP_TIME_WAIT)
1961         goto do_time_wait;
1962 
1963     if (sk->sk_state == TCP_NEW_SYN_RECV) {
1964         struct request_sock *req = inet_reqsk(sk);
1965         bool req_stolen = false;
1966         struct sock *nsk;
1967 
1968         sk = req->rsk_listener;
1969         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1970             drop_reason = SKB_DROP_REASON_XFRM_POLICY;
1971         else
1972             drop_reason = tcp_inbound_md5_hash(sk, skb,
1973                            &iph->saddr, &iph->daddr,
1974                            AF_INET, dif, sdif);
1975         if (unlikely(drop_reason)) {
1976             sk_drops_add(sk, skb);
1977             reqsk_put(req);
1978             goto discard_it;
1979         }
1980         if (tcp_checksum_complete(skb)) {
1981             reqsk_put(req);
1982             goto csum_error;
1983         }
1984         if (unlikely(sk->sk_state != TCP_LISTEN)) {
1985             nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
1986             if (!nsk) {
1987                 inet_csk_reqsk_queue_drop_and_put(sk, req);
1988                 goto lookup;
1989             }
1990             sk = nsk;
1991             /* reuseport_migrate_sock() has already held one sk_refcnt
1992              * before returning.
1993              */
1994         } else {
1995             /* We own a reference on the listener, increase it again
1996              * as we might lose it too soon.
1997              */
1998             sock_hold(sk);
1999         }
2000         refcounted = true;
2001         nsk = NULL;
2002         if (!tcp_filter(sk, skb)) {
2003             th = (const struct tcphdr *)skb->data;
2004             iph = ip_hdr(skb);
2005             tcp_v4_fill_cb(skb, iph, th);
2006             nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2007         } else {
2008             drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2009         }
2010         if (!nsk) {
2011             reqsk_put(req);
2012             if (req_stolen) {
2013                 /* Another cpu got exclusive access to req
2014                  * and created a full blown socket.
2015                  * Try to feed this packet to this socket
2016                  * instead of discarding it.
2017                  */
2018                 tcp_v4_restore_cb(skb);
2019                 sock_put(sk);
2020                 goto lookup;
2021             }
2022             goto discard_and_relse;
2023         }
2024         nf_reset_ct(skb);
2025         if (nsk == sk) {
2026             reqsk_put(req);
2027             tcp_v4_restore_cb(skb);
2028         } else if (tcp_child_process(sk, nsk, skb)) {
2029             tcp_v4_send_reset(nsk, skb);
2030             goto discard_and_relse;
2031         } else {
2032             sock_put(sk);
2033             return 0;
2034         }
2035     }
2036 
2037     if (static_branch_unlikely(&ip4_min_ttl)) {
2038         /* min_ttl can be changed concurrently from do_ip_setsockopt() */
2039         if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
2040             __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2041             goto discard_and_relse;
2042         }
2043     }
2044 
2045     if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
2046         drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2047         goto discard_and_relse;
2048     }
2049 
2050     drop_reason = tcp_inbound_md5_hash(sk, skb, &iph->saddr,
2051                        &iph->daddr, AF_INET, dif, sdif);
2052     if (drop_reason)
2053         goto discard_and_relse;
2054 
2055     nf_reset_ct(skb);
2056 
2057     if (tcp_filter(sk, skb)) {
2058         drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2059         goto discard_and_relse;
2060     }
2061     th = (const struct tcphdr *)skb->data;
2062     iph = ip_hdr(skb);
2063     tcp_v4_fill_cb(skb, iph, th);
2064 
2065     skb->dev = NULL;
2066 
2067     if (sk->sk_state == TCP_LISTEN) {
2068         ret = tcp_v4_do_rcv(sk, skb);
2069         goto put_and_return;
2070     }
2071 
2072     sk_incoming_cpu_update(sk);
2073 
2074     bh_lock_sock_nested(sk);
2075     tcp_segs_in(tcp_sk(sk), skb);
2076     ret = 0;
2077     if (!sock_owned_by_user(sk)) {
2078         ret = tcp_v4_do_rcv(sk, skb);
2079     } else {
2080         if (tcp_add_backlog(sk, skb, &drop_reason))
2081             goto discard_and_relse;
2082     }
2083     bh_unlock_sock(sk);
2084 
2085 put_and_return:
2086     if (refcounted)
2087         sock_put(sk);
2088 
2089     return ret;
2090 
2091 no_tcp_socket:
2092     drop_reason = SKB_DROP_REASON_NO_SOCKET;
2093     if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2094         goto discard_it;
2095 
2096     tcp_v4_fill_cb(skb, iph, th);
2097 
2098     if (tcp_checksum_complete(skb)) {
2099 csum_error:
2100         drop_reason = SKB_DROP_REASON_TCP_CSUM;
2101         trace_tcp_bad_csum(skb);
2102         __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2103 bad_packet:
2104         __TCP_INC_STATS(net, TCP_MIB_INERRS);
2105     } else {
2106         tcp_v4_send_reset(NULL, skb);
2107     }
2108 
2109 discard_it:
2110     SKB_DR_OR(drop_reason, NOT_SPECIFIED);
2111     /* Discard frame. */
2112     kfree_skb_reason(skb, drop_reason);
2113     return 0;
2114 
2115 discard_and_relse:
2116     sk_drops_add(sk, skb);
2117     if (refcounted)
2118         sock_put(sk);
2119     goto discard_it;
2120 
2121 do_time_wait:
2122     if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2123         drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2124         inet_twsk_put(inet_twsk(sk));
2125         goto discard_it;
2126     }
2127 
2128     tcp_v4_fill_cb(skb, iph, th);
2129 
2130     if (tcp_checksum_complete(skb)) {
2131         inet_twsk_put(inet_twsk(sk));
2132         goto csum_error;
2133     }
2134     switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2135     case TCP_TW_SYN: {
2136         struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2137                             &tcp_hashinfo, skb,
2138                             __tcp_hdrlen(th),
2139                             iph->saddr, th->source,
2140                             iph->daddr, th->dest,
2141                             inet_iif(skb),
2142                             sdif);
2143         if (sk2) {
2144             inet_twsk_deschedule_put(inet_twsk(sk));
2145             sk = sk2;
2146             tcp_v4_restore_cb(skb);
2147             refcounted = false;
2148             goto process;
2149         }
2150     }
2151         /* to ACK */
2152         fallthrough;
2153     case TCP_TW_ACK:
2154         tcp_v4_timewait_ack(sk, skb);
2155         break;
2156     case TCP_TW_RST:
2157         tcp_v4_send_reset(sk, skb);
2158         inet_twsk_deschedule_put(inet_twsk(sk));
2159         goto discard_it;
2160     case TCP_TW_SUCCESS:;
2161     }
2162     goto discard_it;
2163 }
2164 
2165 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2166     .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
2167     .twsk_unique    = tcp_twsk_unique,
2168     .twsk_destructor= tcp_twsk_destructor,
2169 };
2170 
2171 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2172 {
2173     struct dst_entry *dst = skb_dst(skb);
2174 
2175     if (dst && dst_hold_safe(dst)) {
2176         rcu_assign_pointer(sk->sk_rx_dst, dst);
2177         sk->sk_rx_dst_ifindex = skb->skb_iif;
2178     }
2179 }
2180 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2181 
2182 const struct inet_connection_sock_af_ops ipv4_specific = {
2183     .queue_xmit    = ip_queue_xmit,
2184     .send_check    = tcp_v4_send_check,
2185     .rebuild_header    = inet_sk_rebuild_header,
2186     .sk_rx_dst_set     = inet_sk_rx_dst_set,
2187     .conn_request      = tcp_v4_conn_request,
2188     .syn_recv_sock     = tcp_v4_syn_recv_sock,
2189     .net_header_len    = sizeof(struct iphdr),
2190     .setsockopt    = ip_setsockopt,
2191     .getsockopt    = ip_getsockopt,
2192     .addr2sockaddr     = inet_csk_addr2sockaddr,
2193     .sockaddr_len      = sizeof(struct sockaddr_in),
2194     .mtu_reduced       = tcp_v4_mtu_reduced,
2195 };
2196 EXPORT_SYMBOL(ipv4_specific);
2197 
2198 #ifdef CONFIG_TCP_MD5SIG
2199 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2200     .md5_lookup     = tcp_v4_md5_lookup,
2201     .calc_md5_hash      = tcp_v4_md5_hash_skb,
2202     .md5_parse      = tcp_v4_parse_md5_keys,
2203 };
2204 #endif
2205 
2206 /* NOTE: A lot of things are set to zero explicitly by the call to
2207  *       sk_alloc(), so they need not be done here.
2208  */
2209 static int tcp_v4_init_sock(struct sock *sk)
2210 {
2211     struct inet_connection_sock *icsk = inet_csk(sk);
2212 
2213     tcp_init_sock(sk);
2214 
2215     icsk->icsk_af_ops = &ipv4_specific;
2216 
2217 #ifdef CONFIG_TCP_MD5SIG
2218     tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2219 #endif
2220 
2221     return 0;
2222 }
2223 
2224 void tcp_v4_destroy_sock(struct sock *sk)
2225 {
2226     struct tcp_sock *tp = tcp_sk(sk);
2227 
2228     trace_tcp_destroy_sock(sk);
2229 
2230     tcp_clear_xmit_timers(sk);
2231 
2232     tcp_cleanup_congestion_control(sk);
2233 
2234     tcp_cleanup_ulp(sk);
2235 
2236     /* Clean up the write buffer. */
2237     tcp_write_queue_purge(sk);
2238 
2239     /* Check if we want to disable active TFO */
2240     tcp_fastopen_active_disable_ofo_check(sk);
2241 
2242     /* Cleans up our, hopefully empty, out_of_order_queue. */
2243     skb_rbtree_purge(&tp->out_of_order_queue);
2244 
2245 #ifdef CONFIG_TCP_MD5SIG
2246     /* Clean up the MD5 key list, if any */
2247     if (tp->md5sig_info) {
2248         tcp_clear_md5_list(sk);
2249         kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2250         tp->md5sig_info = NULL;
2251     }
2252 #endif
2253 
2254     /* Clean up a referenced TCP bind bucket. */
2255     if (inet_csk(sk)->icsk_bind_hash)
2256         inet_put_port(sk);
2257 
2258     BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2259 
2260     /* If socket is aborted during connect operation */
2261     tcp_free_fastopen_req(tp);
2262     tcp_fastopen_destroy_cipher(sk);
2263     tcp_saved_syn_free(tp);
2264 
2265     sk_sockets_allocated_dec(sk);
2266 }
2267 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2268 
2269 #ifdef CONFIG_PROC_FS
2270 /* Proc filesystem TCP sock list dumping. */
2271 
2272 static unsigned short seq_file_family(const struct seq_file *seq);
2273 
2274 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2275 {
2276     unsigned short family = seq_file_family(seq);
2277 
2278     /* AF_UNSPEC is used as a match all */
2279     return ((family == AF_UNSPEC || family == sk->sk_family) &&
2280         net_eq(sock_net(sk), seq_file_net(seq)));
2281 }
2282 
2283 /* Find a non-empty bucket (starting from st->bucket)
2284  * and return the first sk from it.
2285  */
2286 static void *listening_get_first(struct seq_file *seq)
2287 {
2288     struct tcp_iter_state *st = seq->private;
2289 
2290     st->offset = 0;
2291     for (; st->bucket <= tcp_hashinfo.lhash2_mask; st->bucket++) {
2292         struct inet_listen_hashbucket *ilb2;
2293         struct hlist_nulls_node *node;
2294         struct sock *sk;
2295 
2296         ilb2 = &tcp_hashinfo.lhash2[st->bucket];
2297         if (hlist_nulls_empty(&ilb2->nulls_head))
2298             continue;
2299 
2300         spin_lock(&ilb2->lock);
2301         sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
2302             if (seq_sk_match(seq, sk))
2303                 return sk;
2304         }
2305         spin_unlock(&ilb2->lock);
2306     }
2307 
2308     return NULL;
2309 }
2310 
2311 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2312  * If "cur" is the last one in st->bucket,
2313  * call listening_get_first() to return the first sk of the next
2314  * non-empty bucket.
2315  */
2316 static void *listening_get_next(struct seq_file *seq, void *cur)
2317 {
2318     struct tcp_iter_state *st = seq->private;
2319     struct inet_listen_hashbucket *ilb2;
2320     struct hlist_nulls_node *node;
2321     struct sock *sk = cur;
2322 
2323     ++st->num;
2324     ++st->offset;
2325 
2326     sk = sk_nulls_next(sk);
2327     sk_nulls_for_each_from(sk, node) {
2328         if (seq_sk_match(seq, sk))
2329             return sk;
2330     }
2331 
2332     ilb2 = &tcp_hashinfo.lhash2[st->bucket];
2333     spin_unlock(&ilb2->lock);
2334     ++st->bucket;
2335     return listening_get_first(seq);
2336 }
2337 
2338 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2339 {
2340     struct tcp_iter_state *st = seq->private;
2341     void *rc;
2342 
2343     st->bucket = 0;
2344     st->offset = 0;
2345     rc = listening_get_first(seq);
2346 
2347     while (rc && *pos) {
2348         rc = listening_get_next(seq, rc);
2349         --*pos;
2350     }
2351     return rc;
2352 }
2353 
2354 static inline bool empty_bucket(const struct tcp_iter_state *st)
2355 {
2356     return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2357 }
2358 
2359 /*
2360  * Get first established socket starting from bucket given in st->bucket.
2361  * If st->bucket is zero, the very first socket in the hash is returned.
2362  */
2363 static void *established_get_first(struct seq_file *seq)
2364 {
2365     struct tcp_iter_state *st = seq->private;
2366 
2367     st->offset = 0;
2368     for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2369         struct sock *sk;
2370         struct hlist_nulls_node *node;
2371         spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2372 
2373         /* Lockless fast path for the common case of empty buckets */
2374         if (empty_bucket(st))
2375             continue;
2376 
2377         spin_lock_bh(lock);
2378         sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2379             if (seq_sk_match(seq, sk))
2380                 return sk;
2381         }
2382         spin_unlock_bh(lock);
2383     }
2384 
2385     return NULL;
2386 }
2387 
2388 static void *established_get_next(struct seq_file *seq, void *cur)
2389 {
2390     struct sock *sk = cur;
2391     struct hlist_nulls_node *node;
2392     struct tcp_iter_state *st = seq->private;
2393 
2394     ++st->num;
2395     ++st->offset;
2396 
2397     sk = sk_nulls_next(sk);
2398 
2399     sk_nulls_for_each_from(sk, node) {
2400         if (seq_sk_match(seq, sk))
2401             return sk;
2402     }
2403 
2404     spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2405     ++st->bucket;
2406     return established_get_first(seq);
2407 }
2408 
2409 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2410 {
2411     struct tcp_iter_state *st = seq->private;
2412     void *rc;
2413 
2414     st->bucket = 0;
2415     rc = established_get_first(seq);
2416 
2417     while (rc && pos) {
2418         rc = established_get_next(seq, rc);
2419         --pos;
2420     }
2421     return rc;
2422 }
2423 
2424 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2425 {
2426     void *rc;
2427     struct tcp_iter_state *st = seq->private;
2428 
2429     st->state = TCP_SEQ_STATE_LISTENING;
2430     rc    = listening_get_idx(seq, &pos);
2431 
2432     if (!rc) {
2433         st->state = TCP_SEQ_STATE_ESTABLISHED;
2434         rc    = established_get_idx(seq, pos);
2435     }
2436 
2437     return rc;
2438 }
2439 
2440 static void *tcp_seek_last_pos(struct seq_file *seq)
2441 {
2442     struct tcp_iter_state *st = seq->private;
2443     int bucket = st->bucket;
2444     int offset = st->offset;
2445     int orig_num = st->num;
2446     void *rc = NULL;
2447 
2448     switch (st->state) {
2449     case TCP_SEQ_STATE_LISTENING:
2450         if (st->bucket > tcp_hashinfo.lhash2_mask)
2451             break;
2452         st->state = TCP_SEQ_STATE_LISTENING;
2453         rc = listening_get_first(seq);
2454         while (offset-- && rc && bucket == st->bucket)
2455             rc = listening_get_next(seq, rc);
2456         if (rc)
2457             break;
2458         st->bucket = 0;
2459         st->state = TCP_SEQ_STATE_ESTABLISHED;
2460         fallthrough;
2461     case TCP_SEQ_STATE_ESTABLISHED:
2462         if (st->bucket > tcp_hashinfo.ehash_mask)
2463             break;
2464         rc = established_get_first(seq);
2465         while (offset-- && rc && bucket == st->bucket)
2466             rc = established_get_next(seq, rc);
2467     }
2468 
2469     st->num = orig_num;
2470 
2471     return rc;
2472 }
2473 
2474 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2475 {
2476     struct tcp_iter_state *st = seq->private;
2477     void *rc;
2478 
2479     if (*pos && *pos == st->last_pos) {
2480         rc = tcp_seek_last_pos(seq);
2481         if (rc)
2482             goto out;
2483     }
2484 
2485     st->state = TCP_SEQ_STATE_LISTENING;
2486     st->num = 0;
2487     st->bucket = 0;
2488     st->offset = 0;
2489     rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2490 
2491 out:
2492     st->last_pos = *pos;
2493     return rc;
2494 }
2495 EXPORT_SYMBOL(tcp_seq_start);
2496 
2497 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2498 {
2499     struct tcp_iter_state *st = seq->private;
2500     void *rc = NULL;
2501 
2502     if (v == SEQ_START_TOKEN) {
2503         rc = tcp_get_idx(seq, 0);
2504         goto out;
2505     }
2506 
2507     switch (st->state) {
2508     case TCP_SEQ_STATE_LISTENING:
2509         rc = listening_get_next(seq, v);
2510         if (!rc) {
2511             st->state = TCP_SEQ_STATE_ESTABLISHED;
2512             st->bucket = 0;
2513             st->offset = 0;
2514             rc    = established_get_first(seq);
2515         }
2516         break;
2517     case TCP_SEQ_STATE_ESTABLISHED:
2518         rc = established_get_next(seq, v);
2519         break;
2520     }
2521 out:
2522     ++*pos;
2523     st->last_pos = *pos;
2524     return rc;
2525 }
2526 EXPORT_SYMBOL(tcp_seq_next);
2527 
2528 void tcp_seq_stop(struct seq_file *seq, void *v)
2529 {
2530     struct tcp_iter_state *st = seq->private;
2531 
2532     switch (st->state) {
2533     case TCP_SEQ_STATE_LISTENING:
2534         if (v != SEQ_START_TOKEN)
2535             spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
2536         break;
2537     case TCP_SEQ_STATE_ESTABLISHED:
2538         if (v)
2539             spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2540         break;
2541     }
2542 }
2543 EXPORT_SYMBOL(tcp_seq_stop);
2544 
2545 static void get_openreq4(const struct request_sock *req,
2546              struct seq_file *f, int i)
2547 {
2548     const struct inet_request_sock *ireq = inet_rsk(req);
2549     long delta = req->rsk_timer.expires - jiffies;
2550 
2551     seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2552         " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2553         i,
2554         ireq->ir_loc_addr,
2555         ireq->ir_num,
2556         ireq->ir_rmt_addr,
2557         ntohs(ireq->ir_rmt_port),
2558         TCP_SYN_RECV,
2559         0, 0, /* could print option size, but that is af dependent. */
2560         1,    /* timers active (only the expire timer) */
2561         jiffies_delta_to_clock_t(delta),
2562         req->num_timeout,
2563         from_kuid_munged(seq_user_ns(f),
2564                  sock_i_uid(req->rsk_listener)),
2565         0,  /* non standard timer */
2566         0, /* open_requests have no inode */
2567         0,
2568         req);
2569 }
2570 
2571 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2572 {
2573     int timer_active;
2574     unsigned long timer_expires;
2575     const struct tcp_sock *tp = tcp_sk(sk);
2576     const struct inet_connection_sock *icsk = inet_csk(sk);
2577     const struct inet_sock *inet = inet_sk(sk);
2578     const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2579     __be32 dest = inet->inet_daddr;
2580     __be32 src = inet->inet_rcv_saddr;
2581     __u16 destp = ntohs(inet->inet_dport);
2582     __u16 srcp = ntohs(inet->inet_sport);
2583     int rx_queue;
2584     int state;
2585 
2586     if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2587         icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2588         icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2589         timer_active    = 1;
2590         timer_expires   = icsk->icsk_timeout;
2591     } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2592         timer_active    = 4;
2593         timer_expires   = icsk->icsk_timeout;
2594     } else if (timer_pending(&sk->sk_timer)) {
2595         timer_active    = 2;
2596         timer_expires   = sk->sk_timer.expires;
2597     } else {
2598         timer_active    = 0;
2599         timer_expires = jiffies;
2600     }
2601 
2602     state = inet_sk_state_load(sk);
2603     if (state == TCP_LISTEN)
2604         rx_queue = READ_ONCE(sk->sk_ack_backlog);
2605     else
2606         /* Because we don't lock the socket,
2607          * we might find a transient negative value.
2608          */
2609         rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2610                       READ_ONCE(tp->copied_seq), 0);
2611 
2612     seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2613             "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2614         i, src, srcp, dest, destp, state,
2615         READ_ONCE(tp->write_seq) - tp->snd_una,
2616         rx_queue,
2617         timer_active,
2618         jiffies_delta_to_clock_t(timer_expires - jiffies),
2619         icsk->icsk_retransmits,
2620         from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2621         icsk->icsk_probes_out,
2622         sock_i_ino(sk),
2623         refcount_read(&sk->sk_refcnt), sk,
2624         jiffies_to_clock_t(icsk->icsk_rto),
2625         jiffies_to_clock_t(icsk->icsk_ack.ato),
2626         (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2627         tcp_snd_cwnd(tp),
2628         state == TCP_LISTEN ?
2629             fastopenq->max_qlen :
2630             (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2631 }
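
/* The addresses above are __be32 values printed with %08X, so they appear
 * in raw (network) byte order when read through /proc/net/tcp.  On a
 * little-endian host a listener on 127.0.0.1:80 would show up roughly as
 * (illustrative, trailing fields truncated):
 *
 *	0: 0100007F:0050 00000000:0000 0A ...
 *
 * where 0A is TCP_LISTEN (10) from inet_sk_state_load() above.
 */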
2632 
2633 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2634                    struct seq_file *f, int i)
2635 {
2636     long delta = tw->tw_timer.expires - jiffies;
2637     __be32 dest, src;
2638     __u16 destp, srcp;
2639 
2640     dest  = tw->tw_daddr;
2641     src   = tw->tw_rcv_saddr;
2642     destp = ntohs(tw->tw_dport);
2643     srcp  = ntohs(tw->tw_sport);
2644 
2645     seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2646         " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2647         i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2648         3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2649         refcount_read(&tw->tw_refcnt), tw);
2650 }
2651 
2652 #define TMPSZ 150
2653 
2654 static int tcp4_seq_show(struct seq_file *seq, void *v)
2655 {
2656     struct tcp_iter_state *st;
2657     struct sock *sk = v;
2658 
2659     seq_setwidth(seq, TMPSZ - 1);
2660     if (v == SEQ_START_TOKEN) {
2661         seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2662                "rx_queue tr tm->when retrnsmt   uid  timeout "
2663                "inode");
2664         goto out;
2665     }
2666     st = seq->private;
2667 
2668     if (sk->sk_state == TCP_TIME_WAIT)
2669         get_timewait4_sock(v, seq, st->num);
2670     else if (sk->sk_state == TCP_NEW_SYN_RECV)
2671         get_openreq4(v, seq, st->num);
2672     else
2673         get_tcp4_sock(v, seq, st->num);
2674 out:
2675     seq_pad(seq, '\n');
2676     return 0;
2677 }
2678 
2679 #ifdef CONFIG_BPF_SYSCALL
2680 struct bpf_tcp_iter_state {
2681     struct tcp_iter_state state;
2682     unsigned int cur_sk;
2683     unsigned int end_sk;
2684     unsigned int max_sk;
2685     struct sock **batch;
2686     bool st_bucket_done;
2687 };
2688 
2689 struct bpf_iter__tcp {
2690     __bpf_md_ptr(struct bpf_iter_meta *, meta);
2691     __bpf_md_ptr(struct sock_common *, sk_common);
2692     uid_t uid __aligned(8);
2693 };
2694 
2695 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2696                  struct sock_common *sk_common, uid_t uid)
2697 {
2698     struct bpf_iter__tcp ctx;
2699 
2700     meta->seq_num--;  /* skip SEQ_START_TOKEN */
2701     ctx.meta = meta;
2702     ctx.sk_common = sk_common;
2703     ctx.uid = uid;
2704     return bpf_iter_run_prog(prog, &ctx);
2705 }
2706 
2707 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
2708 {
2709     while (iter->cur_sk < iter->end_sk)
2710         sock_put(iter->batch[iter->cur_sk++]);
2711 }
2712 
2713 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
2714                       unsigned int new_batch_sz)
2715 {
2716     struct sock **new_batch;
2717 
2718     new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
2719                  GFP_USER | __GFP_NOWARN);
2720     if (!new_batch)
2721         return -ENOMEM;
2722 
2723     bpf_iter_tcp_put_batch(iter);
2724     kvfree(iter->batch);
2725     iter->batch = new_batch;
2726     iter->max_sk = new_batch_sz;
2727 
2728     return 0;
2729 }
2730 
2731 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
2732                          struct sock *start_sk)
2733 {
2734     struct bpf_tcp_iter_state *iter = seq->private;
2735     struct tcp_iter_state *st = &iter->state;
2736     struct hlist_nulls_node *node;
2737     unsigned int expected = 1;
2738     struct sock *sk;
2739 
2740     sock_hold(start_sk);
2741     iter->batch[iter->end_sk++] = start_sk;
2742 
2743     sk = sk_nulls_next(start_sk);
2744     sk_nulls_for_each_from(sk, node) {
2745         if (seq_sk_match(seq, sk)) {
2746             if (iter->end_sk < iter->max_sk) {
2747                 sock_hold(sk);
2748                 iter->batch[iter->end_sk++] = sk;
2749             }
2750             expected++;
2751         }
2752     }
2753     spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
2754 
2755     return expected;
2756 }
2757 
2758 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
2759                            struct sock *start_sk)
2760 {
2761     struct bpf_tcp_iter_state *iter = seq->private;
2762     struct tcp_iter_state *st = &iter->state;
2763     struct hlist_nulls_node *node;
2764     unsigned int expected = 1;
2765     struct sock *sk;
2766 
2767     sock_hold(start_sk);
2768     iter->batch[iter->end_sk++] = start_sk;
2769 
2770     sk = sk_nulls_next(start_sk);
2771     sk_nulls_for_each_from(sk, node) {
2772         if (seq_sk_match(seq, sk)) {
2773             if (iter->end_sk < iter->max_sk) {
2774                 sock_hold(sk);
2775                 iter->batch[iter->end_sk++] = sk;
2776             }
2777             expected++;
2778         }
2779     }
2780     spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2781 
2782     return expected;
2783 }
2784 
2785 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
2786 {
2787     struct bpf_tcp_iter_state *iter = seq->private;
2788     struct tcp_iter_state *st = &iter->state;
2789     unsigned int expected;
2790     bool resized = false;
2791     struct sock *sk;
2792 
2793     /* The st->bucket is done.  Directly advance to the next
2794      * bucket instead of having tcp_seek_last_pos() skip entries
2795      * one by one in the current bucket, only to find out
2796      * it has to advance to the next bucket.
2797      */
2798     if (iter->st_bucket_done) {
2799         st->offset = 0;
2800         st->bucket++;
2801         if (st->state == TCP_SEQ_STATE_LISTENING &&
2802             st->bucket > tcp_hashinfo.lhash2_mask) {
2803             st->state = TCP_SEQ_STATE_ESTABLISHED;
2804             st->bucket = 0;
2805         }
2806     }
2807 
2808 again:
2809     /* Get a new batch */
2810     iter->cur_sk = 0;
2811     iter->end_sk = 0;
2812     iter->st_bucket_done = false;
2813 
2814     sk = tcp_seek_last_pos(seq);
2815     if (!sk)
2816         return NULL; /* Done */
2817 
2818     if (st->state == TCP_SEQ_STATE_LISTENING)
2819         expected = bpf_iter_tcp_listening_batch(seq, sk);
2820     else
2821         expected = bpf_iter_tcp_established_batch(seq, sk);
2822 
2823     if (iter->end_sk == expected) {
2824         iter->st_bucket_done = true;
2825         return sk;
2826     }
2827 
2828     if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
2829         resized = true;
2830         goto again;
2831     }
2832 
2833     return sk;
2834 }
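
/* Batch sizing sketch: the batch starts at INIT_BATCH_SZ sockets and, when
 * a bucket turns out to be larger, is grown once to 3/2 of the number of
 * sockets seen before the bucket is re-walked.  For example (illustrative):
 *
 *	max_sk = 16, bucket holds 20 sockets  ->  expected = 20
 *	bpf_iter_tcp_realloc_batch(iter, 20 * 3 / 2)  ->  max_sk = 30
 *	retry the same bucket; 20 <= 30, so st_bucket_done = true
 */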
2835 
2836 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
2837 {
2838     /* bpf iter does not support lseek, so it always
2839      * continues from where it was stop()-ped.
2840      */
2841     if (*pos)
2842         return bpf_iter_tcp_batch(seq);
2843 
2844     return SEQ_START_TOKEN;
2845 }
2846 
2847 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2848 {
2849     struct bpf_tcp_iter_state *iter = seq->private;
2850     struct tcp_iter_state *st = &iter->state;
2851     struct sock *sk;
2852 
2853     /* Whenever seq_next() is called, the sk at iter->cur_sk is
2854      * done with seq_show(), so advance to the next sk in
2855      * the batch.
2856      */
2857     if (iter->cur_sk < iter->end_sk) {
2858         /* Keeping st->num consistent in tcp_iter_state.
2859          * bpf_iter_tcp does not use st->num.
2860          * meta.seq_num is used instead.
2861          */
2862         st->num++;
2863         /* Move st->offset to the next sk in the bucket such that
2864          * the future start() will resume at st->offset in
2865          * st->bucket.  See tcp_seek_last_pos().
2866          */
2867         st->offset++;
2868         sock_put(iter->batch[iter->cur_sk++]);
2869     }
2870 
2871     if (iter->cur_sk < iter->end_sk)
2872         sk = iter->batch[iter->cur_sk];
2873     else
2874         sk = bpf_iter_tcp_batch(seq);
2875 
2876     ++*pos;
2877     /* Keeping st->last_pos consistent in tcp_iter_state.
2878      * bpf iter does not do lseek, so st->last_pos always equals *pos.
2879      */
2880     st->last_pos = *pos;
2881     return sk;
2882 }
2883 
2884 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2885 {
2886     struct bpf_iter_meta meta;
2887     struct bpf_prog *prog;
2888     struct sock *sk = v;
2889     bool slow;
2890     uid_t uid;
2891     int ret;
2892 
2893     if (v == SEQ_START_TOKEN)
2894         return 0;
2895 
2896     if (sk_fullsock(sk))
2897         slow = lock_sock_fast(sk);
2898 
2899     if (unlikely(sk_unhashed(sk))) {
2900         ret = SEQ_SKIP;
2901         goto unlock;
2902     }
2903 
2904     if (sk->sk_state == TCP_TIME_WAIT) {
2905         uid = 0;
2906     } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2907         const struct request_sock *req = v;
2908 
2909         uid = from_kuid_munged(seq_user_ns(seq),
2910                        sock_i_uid(req->rsk_listener));
2911     } else {
2912         uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2913     }
2914 
2915     meta.seq = seq;
2916     prog = bpf_iter_get_info(&meta, false);
2917     ret = tcp_prog_seq_show(prog, &meta, v, uid);
2918 
2919 unlock:
2920     if (sk_fullsock(sk))
2921         unlock_sock_fast(sk, slow);
2922     return ret;
2923 
2924 }
2925 
2926 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
2927 {
2928     struct bpf_tcp_iter_state *iter = seq->private;
2929     struct bpf_iter_meta meta;
2930     struct bpf_prog *prog;
2931 
2932     if (!v) {
2933         meta.seq = seq;
2934         prog = bpf_iter_get_info(&meta, true);
2935         if (prog)
2936             (void)tcp_prog_seq_show(prog, &meta, v, 0);
2937     }
2938 
2939     if (iter->cur_sk < iter->end_sk) {
2940         bpf_iter_tcp_put_batch(iter);
2941         iter->st_bucket_done = false;
2942     }
2943 }
2944 
2945 static const struct seq_operations bpf_iter_tcp_seq_ops = {
2946     .show       = bpf_iter_tcp_seq_show,
2947     .start      = bpf_iter_tcp_seq_start,
2948     .next       = bpf_iter_tcp_seq_next,
2949     .stop       = bpf_iter_tcp_seq_stop,
2950 };
2951 #endif
2952 static unsigned short seq_file_family(const struct seq_file *seq)
2953 {
2954     const struct tcp_seq_afinfo *afinfo;
2955 
2956 #ifdef CONFIG_BPF_SYSCALL
2957     /* Iterated from bpf_iter.  Let the bpf prog filter instead. */
2958     if (seq->op == &bpf_iter_tcp_seq_ops)
2959         return AF_UNSPEC;
2960 #endif
2961 
2962     /* Iterated from proc fs */
2963     afinfo = pde_data(file_inode(seq->file));
2964     return afinfo->family;
2965 }
2966 
2967 static const struct seq_operations tcp4_seq_ops = {
2968     .show       = tcp4_seq_show,
2969     .start      = tcp_seq_start,
2970     .next       = tcp_seq_next,
2971     .stop       = tcp_seq_stop,
2972 };
2973 
2974 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2975     .family     = AF_INET,
2976 };
2977 
2978 static int __net_init tcp4_proc_init_net(struct net *net)
2979 {
2980     if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2981             sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2982         return -ENOMEM;
2983     return 0;
2984 }
2985 
2986 static void __net_exit tcp4_proc_exit_net(struct net *net)
2987 {
2988     remove_proc_entry("tcp", net->proc_net);
2989 }
2990 
2991 static struct pernet_operations tcp4_net_ops = {
2992     .init = tcp4_proc_init_net,
2993     .exit = tcp4_proc_exit_net,
2994 };
2995 
2996 int __init tcp4_proc_init(void)
2997 {
2998     return register_pernet_subsys(&tcp4_net_ops);
2999 }
3000 
3001 void tcp4_proc_exit(void)
3002 {
3003     unregister_pernet_subsys(&tcp4_net_ops);
3004 }
3005 #endif /* CONFIG_PROC_FS */
3006 
3007 /* @wake is one when sk_stream_write_space() calls us.
3008  * This sends EPOLLOUT only if notsent_bytes is below half the limit.
3009  * This mimics the strategy used in sock_def_write_space().
3010  */
3011 bool tcp_stream_memory_free(const struct sock *sk, int wake)
3012 {
3013     const struct tcp_sock *tp = tcp_sk(sk);
3014     u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3015                 READ_ONCE(tp->snd_nxt);
3016 
3017     return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3018 }
3019 EXPORT_SYMBOL(tcp_stream_memory_free);
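
/* The "<< wake" trick halves the effective threshold when called from
 * sk_stream_write_space().  A worked example (illustrative values):
 *
 *	tcp_notsent_lowat(tp) == 128 KiB
 *	wake == 0: writable while notsent_bytes < 128 KiB
 *	wake == 1: EPOLLOUT only once notsent_bytes < 64 KiB
 */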
3020 
3021 struct proto tcp_prot = {
3022     .name           = "TCP",
3023     .owner          = THIS_MODULE,
3024     .close          = tcp_close,
3025     .pre_connect        = tcp_v4_pre_connect,
3026     .connect        = tcp_v4_connect,
3027     .disconnect     = tcp_disconnect,
3028     .accept         = inet_csk_accept,
3029     .ioctl          = tcp_ioctl,
3030     .init           = tcp_v4_init_sock,
3031     .destroy        = tcp_v4_destroy_sock,
3032     .shutdown       = tcp_shutdown,
3033     .setsockopt     = tcp_setsockopt,
3034     .getsockopt     = tcp_getsockopt,
3035     .bpf_bypass_getsockopt  = tcp_bpf_bypass_getsockopt,
3036     .keepalive      = tcp_set_keepalive,
3037     .recvmsg        = tcp_recvmsg,
3038     .sendmsg        = tcp_sendmsg,
3039     .sendpage       = tcp_sendpage,
3040     .backlog_rcv        = tcp_v4_do_rcv,
3041     .release_cb     = tcp_release_cb,
3042     .hash           = inet_hash,
3043     .unhash         = inet_unhash,
3044     .get_port       = inet_csk_get_port,
3045     .put_port       = inet_put_port,
3046 #ifdef CONFIG_BPF_SYSCALL
3047     .psock_update_sk_prot   = tcp_bpf_update_proto,
3048 #endif
3049     .enter_memory_pressure  = tcp_enter_memory_pressure,
3050     .leave_memory_pressure  = tcp_leave_memory_pressure,
3051     .stream_memory_free = tcp_stream_memory_free,
3052     .sockets_allocated  = &tcp_sockets_allocated,
3053     .orphan_count       = &tcp_orphan_count,
3054 
3055     .memory_allocated   = &tcp_memory_allocated,
3056     .per_cpu_fw_alloc   = &tcp_memory_per_cpu_fw_alloc,
3057 
3058     .memory_pressure    = &tcp_memory_pressure,
3059     .sysctl_mem     = sysctl_tcp_mem,
3060     .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
3061     .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
3062     .max_header     = MAX_TCP_HEADER,
3063     .obj_size       = sizeof(struct tcp_sock),
3064     .slab_flags     = SLAB_TYPESAFE_BY_RCU,
3065     .twsk_prot      = &tcp_timewait_sock_ops,
3066     .rsk_prot       = &tcp_request_sock_ops,
3067     .h.hashinfo     = &tcp_hashinfo,
3068     .no_autobind        = true,
3069     .diag_destroy       = tcp_abort,
3070 };
3071 EXPORT_SYMBOL(tcp_prot);
3072 
3073 static void __net_exit tcp_sk_exit(struct net *net)
3074 {
3075     struct inet_timewait_death_row *tcp_death_row = net->ipv4.tcp_death_row;
3076 
3077     if (net->ipv4.tcp_congestion_control)
3078         bpf_module_put(net->ipv4.tcp_congestion_control,
3079                    net->ipv4.tcp_congestion_control->owner);
3080     if (refcount_dec_and_test(&tcp_death_row->tw_refcount))
3081         kfree(tcp_death_row);
3082 }
3083 
3084 static int __net_init tcp_sk_init(struct net *net)
3085 {
3086     int cnt;
3087 
3088     net->ipv4.sysctl_tcp_ecn = 2;
3089     net->ipv4.sysctl_tcp_ecn_fallback = 1;
3090 
3091     net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3092     net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3093     net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3094     net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3095     net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3096 
3097     net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3098     net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3099     net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3100 
3101     net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3102     net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3103     net->ipv4.sysctl_tcp_syncookies = 1;
3104     net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3105     net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3106     net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3107     net->ipv4.sysctl_tcp_orphan_retries = 0;
3108     net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3109     net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3110     net->ipv4.sysctl_tcp_tw_reuse = 2;
3111     net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3112 
3113     net->ipv4.tcp_death_row = kzalloc(sizeof(struct inet_timewait_death_row), GFP_KERNEL);
3114     if (!net->ipv4.tcp_death_row)
3115         return -ENOMEM;
3116     refcount_set(&net->ipv4.tcp_death_row->tw_refcount, 1);
3117     cnt = tcp_hashinfo.ehash_mask + 1;
3118     net->ipv4.tcp_death_row->sysctl_max_tw_buckets = cnt / 2;
3119     net->ipv4.tcp_death_row->hashinfo = &tcp_hashinfo;
3120 
3121     net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
3122     net->ipv4.sysctl_tcp_sack = 1;
3123     net->ipv4.sysctl_tcp_window_scaling = 1;
3124     net->ipv4.sysctl_tcp_timestamps = 1;
3125     net->ipv4.sysctl_tcp_early_retrans = 3;
3126     net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3127     net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
3128     net->ipv4.sysctl_tcp_retrans_collapse = 1;
3129     net->ipv4.sysctl_tcp_max_reordering = 300;
3130     net->ipv4.sysctl_tcp_dsack = 1;
3131     net->ipv4.sysctl_tcp_app_win = 31;
3132     net->ipv4.sysctl_tcp_adv_win_scale = 1;
3133     net->ipv4.sysctl_tcp_frto = 2;
3134     net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3135     /* This limits the percentage of the congestion window which we
3136      * will allow a single TSO frame to consume.  Building TSO frames
3137      * which are too large can cause TCP streams to be bursty.
3138      */
3139     net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3140     /* Default TSQ limit of 16 TSO segments */
3141     net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
3142 
3143     /* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
3144     net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;
3145 
3146     net->ipv4.sysctl_tcp_min_tso_segs = 2;
3147     net->ipv4.sysctl_tcp_tso_rtt_log = 9;  /* 2^9 = 512 usec */
3148     net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3149     net->ipv4.sysctl_tcp_autocorking = 1;
3150     net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3151     net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3152     net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3153     if (net != &init_net) {
3154         memcpy(net->ipv4.sysctl_tcp_rmem,
3155                init_net.ipv4.sysctl_tcp_rmem,
3156                sizeof(init_net.ipv4.sysctl_tcp_rmem));
3157         memcpy(net->ipv4.sysctl_tcp_wmem,
3158                init_net.ipv4.sysctl_tcp_wmem,
3159                sizeof(init_net.ipv4.sysctl_tcp_wmem));
3160     }
3161     net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3162     net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3163     net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3164     net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3165     net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3166     atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3167 
3168     /* Reno is always built in */
3169     if (!net_eq(net, &init_net) &&
3170         bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3171                    init_net.ipv4.tcp_congestion_control->owner))
3172         net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3173     else
3174         net->ipv4.tcp_congestion_control = &tcp_reno;
3175 
3176     return 0;
3177 }
3178 
3179 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3180 {
3181     struct net *net;
3182 
3183     inet_twsk_purge(&tcp_hashinfo, AF_INET);
3184 
3185     list_for_each_entry(net, net_exit_list, exit_list)
3186         tcp_fastopen_ctx_destroy(net);
3187 }
3188 
3189 static struct pernet_operations __net_initdata tcp_sk_ops = {
3190        .init       = tcp_sk_init,
3191        .exit       = tcp_sk_exit,
3192        .exit_batch = tcp_sk_exit_batch,
3193 };
3194 
3195 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3196 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3197              struct sock_common *sk_common, uid_t uid)
3198 
3199 #define INIT_BATCH_SZ 16
3200 
3201 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3202 {
3203     struct bpf_tcp_iter_state *iter = priv_data;
3204     int err;
3205 
3206     err = bpf_iter_init_seq_net(priv_data, aux);
3207     if (err)
3208         return err;
3209 
3210     err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
3211     if (err) {
3212         bpf_iter_fini_seq_net(priv_data);
3213         return err;
3214     }
3215 
3216     return 0;
3217 }
3218 
3219 static void bpf_iter_fini_tcp(void *priv_data)
3220 {
3221     struct bpf_tcp_iter_state *iter = priv_data;
3222 
3223     bpf_iter_fini_seq_net(priv_data);
3224     kvfree(iter->batch);
3225 }
3226 
3227 static const struct bpf_iter_seq_info tcp_seq_info = {
3228     .seq_ops        = &bpf_iter_tcp_seq_ops,
3229     .init_seq_private   = bpf_iter_init_tcp,
3230     .fini_seq_private   = bpf_iter_fini_tcp,
3231     .seq_priv_size      = sizeof(struct bpf_tcp_iter_state),
3232 };
3233 
3234 static const struct bpf_func_proto *
3235 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3236                 const struct bpf_prog *prog)
3237 {
3238     switch (func_id) {
3239     case BPF_FUNC_setsockopt:
3240         return &bpf_sk_setsockopt_proto;
3241     case BPF_FUNC_getsockopt:
3242         return &bpf_sk_getsockopt_proto;
3243     default:
3244         return NULL;
3245     }
3246 }
3247 
3248 static struct bpf_iter_reg tcp_reg_info = {
3249     .target         = "tcp",
3250     .ctx_arg_info_size  = 1,
3251     .ctx_arg_info       = {
3252         { offsetof(struct bpf_iter__tcp, sk_common),
3253           PTR_TO_BTF_ID_OR_NULL },
3254     },
3255     .get_func_proto     = bpf_iter_tcp_get_func_proto,
3256     .seq_info       = &tcp_seq_info,
3257 };
3258 
3259 static void __init bpf_iter_register(void)
3260 {
3261     tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3262     if (bpf_iter_reg_target(&tcp_reg_info))
3263         pr_warn("Warning: could not register bpf iterator tcp\n");
3264 }
3265 
3266 #endif
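
/* For reference, the registration above exposes the bpf_iter__tcp context
 * to BPF programs attached with SEC("iter/tcp").  A minimal consumer
 * sketch, assuming the usual libbpf conventions (vmlinux.h, bpf_helpers.h,
 * and bpf_tracing.h for BPF_SEQ_PRINTF); this is an illustration, not code
 * from this file:
 *
 *	SEC("iter/tcp")
 *	int dump_tcp(struct bpf_iter__tcp *ctx)
 *	{
 *		struct sock_common *skc = ctx->sk_common;
 *		struct seq_file *seq = ctx->meta->seq;
 *
 *		if (!skc)
 *			return 0;
 *		BPF_SEQ_PRINTF(seq, "family=%d state=%d uid=%u\n",
 *			       skc->skc_family, skc->skc_state, ctx->uid);
 *		return 0;
 *	}
 *	char LICENSE[] SEC("license") = "GPL";
 */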
3267 
3268 void __init tcp_v4_init(void)
3269 {
3270     int cpu, res;
3271 
3272     for_each_possible_cpu(cpu) {
3273         struct sock *sk;
3274 
3275         res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3276                        IPPROTO_TCP, &init_net);
3277         if (res)
3278             panic("Failed to create the TCP control socket.\n");
3279         sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3280 
3281         /* Enforce IP_DF and IP ID == 0 for RSTs and
3282          * ACKs sent in SYN-RECV and TIME-WAIT state.
3283          */
3284         inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3285 
3286         per_cpu(ipv4_tcp_sk, cpu) = sk;
3287     }
3288     if (register_pernet_subsys(&tcp_sk_ops))
3289         panic("Failed to create the TCP control socket.\n");
3290 
3291 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3292     bpf_iter_register();
3293 #endif
3294 }