net/ipv4/tcp_input.c

0001 // SPDX-License-Identifier: GPL-2.0
0002 /*
0003  * INET     An implementation of the TCP/IP protocol suite for the LINUX
0004  *      operating system.  INET is implemented using the  BSD Socket
0005  *      interface as the means of communication with the user level.
0006  *
0007  *      Implementation of the Transmission Control Protocol(TCP).
0008  *
0009  * Authors: Ross Biro
0010  *      Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
0011  *      Mark Evans, <evansmp@uhura.aston.ac.uk>
0012  *      Corey Minyard <wf-rch!minyard@relay.EU.net>
0013  *      Florian La Roche, <flla@stud.uni-sb.de>
0014  *      Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
0015  *      Linus Torvalds, <torvalds@cs.helsinki.fi>
0016  *      Alan Cox, <gw4pts@gw4pts.ampr.org>
0017  *      Matthew Dillon, <dillon@apollo.west.oic.com>
0018  *      Arnt Gulbrandsen, <agulbra@nvg.unit.no>
0019  *      Jorge Cwik, <jorge@laser.satlink.net>
0020  */
0021
0022 /*
0023  * Changes:
0024  *      Pedro Roque :   Fast Retransmit/Recovery.
0025  *                  Two receive queues.
0026  *                  Retransmit queue handled by TCP.
0027  *                  Better retransmit timer handling.
0028  *                  New congestion avoidance.
0029  *                  Header prediction.
0030  *                  Variable renaming.
0031  *
0032  *      Eric        :   Fast Retransmit.
0033  *      Randy Scott :   MSS option defines.
0034  *      Eric Schenk :   Fixes to slow start algorithm.
0035  *      Eric Schenk :   Yet another double ACK bug.
0036  *      Eric Schenk :   Delayed ACK bug fixes.
0037  *      Eric Schenk :   Floyd style fast retrans war avoidance.
0038  *      David S. Miller :   Don't allow zero congestion window.
0039  *      Eric Schenk :   Fix retransmitter so that it sends
0040  *                  next packet on ack of previous packet.
0041  *      Andi Kleen  :   Moved open_request checking here
0042  *                  and process RSTs for open_requests.
0043  *      Andi Kleen  :   Better prune_queue, and other fixes.
0044  *      Andrey Savochkin:   Fix RTT measurements in the presence of
0045  *                  timestamps.
0046  *      Andrey Savochkin:   Check sequence numbers correctly when
0047  *                  removing SACKs due to in sequence incoming
0048  *                  data segments.
0049  *      Andi Kleen:     Make sure we never ack data there is not
0050  *                  enough room for. Also make this condition
0051  *                  a fatal error if it might still happen.
0052  *      Andi Kleen:     Add tcp_measure_rcv_mss to make
0053  *                  connections with MSS<min(MTU,ann. MSS)
0054  *                  work without delayed acks.
0055  *      Andi Kleen:     Process packets with PSH set in the
0056  *                  fast path.
0057  *      J Hadi Salim:       ECN support
0058  *      Andrei Gurtov,
0059  *      Pasi Sarolahti,
0060  *      Panu Kuhlberg:      Experimental audit of TCP (re)transmission
0061  *                  engine. Lots of bugs are found.
0062  *      Pasi Sarolahti:     F-RTO for dealing with spurious RTOs
0063  */
0064
0065 #define pr_fmt(fmt) "TCP: " fmt
0066
0067 #include <linux/mm.h>
0068 #include <linux/slab.h>
0069 #include <linux/module.h>
0070 #include <linux/sysctl.h>
0071 #include <linux/kernel.h>
0072 #include <linux/prefetch.h>
0073 #include <net/dst.h>
0074 #include <net/tcp.h>
0075 #include <net/inet_common.h>
0076 #include <linux/ipsec.h>
0077 #include <asm/unaligned.h>
0078 #include <linux/errqueue.h>
0079 #include <trace/events/tcp.h>
0080 #include <linux/jump_label_ratelimit.h>
0081 #include <net/busy_poll.h>
0082 #include <net/mptcp.h>
0083
0084 int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
0085
0086 #define FLAG_DATA       0x01 /* Incoming frame contained data.      */
0087 #define FLAG_WIN_UPDATE     0x02 /* Incoming ACK was a window update.   */
0088 #define FLAG_DATA_ACKED     0x04 /* This ACK acknowledged new data.     */
0089 #define FLAG_RETRANS_DATA_ACKED 0x08 /* This ACK acknowledged new data, some of which was retransmitted. */
0090 #define FLAG_SYN_ACKED      0x10 /* This ACK acknowledged SYN.      */
0091 #define FLAG_DATA_SACKED    0x20 /* New SACK.               */
0092 #define FLAG_ECE        0x40 /* ECE in this ACK             */
0093 #define FLAG_LOST_RETRANS   0x80 /* This ACK marks some retransmission lost */
0094 #define FLAG_SLOWPATH       0x100 /* Do not skip RFC checks for window update.*/
0095 #define FLAG_ORIG_SACK_ACKED    0x200 /* Never-retransmitted data was (s)acked  */
0096 #define FLAG_SND_UNA_ADVANCED   0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
0097 #define FLAG_DSACKING_ACK   0x800 /* SACK blocks contained D-SACK info */
0098 #define FLAG_SET_XMIT_TIMER 0x1000 /* Set TLP or RTO timer */
0099 #define FLAG_SACK_RENEGING  0x2000 /* snd_una advanced to a sacked seq */
0100 #define FLAG_UPDATE_TS_RECENT   0x4000 /* tcp_replace_ts_recent() */
0101 #define FLAG_NO_CHALLENGE_ACK   0x8000 /* do not call tcp_send_challenge_ack()  */
0102 #define FLAG_ACK_MAYBE_DELAYED  0x10000 /* Likely a delayed ACK */
0103 #define FLAG_DSACK_TLP      0x20000 /* DSACK for tail loss probe */
0104
0105 #define FLAG_ACKED      (FLAG_DATA_ACKED|FLAG_SYN_ACKED) /* New data or the SYN was acknowledged */
0106 #define FLAG_NOT_DUP        (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED) /* Carried data, a window update, or acknowledged something */
0107 #define FLAG_CA_ALERT       (FLAG_DATA_SACKED|FLAG_ECE|FLAG_DSACKING_ACK) /* New SACK, ECE or D-SACK seen */
0108 #define FLAG_FORWARD_PROGRESS   (FLAG_ACKED|FLAG_DATA_SACKED) /* Something was newly acked or sacked */
0109
0110 #define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH) /* FIN, URG, SYN and PSH header flag bits */
0111 #define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH)) /* Every bit except the reserved bits and PSH */
0112
0113 #define REXMIT_NONE 0 /* no loss recovery to do */
0114 #define REXMIT_LOST 1 /* retransmit packets marked lost */
0115 #define REXMIT_NEW  2 /* FRTO-style transmit of unsent/new packets */
0116
0117 #if IS_ENABLED(CONFIG_TLS_DEVICE)
0118 static DEFINE_STATIC_KEY_DEFERRED_FALSE(clean_acked_data_enabled, HZ);
0119
0120 void clean_acked_data_enable(struct inet_connection_sock *icsk,
0121                  void (*cad)(struct sock *sk, u32 ack_seq))
0122 {
0123     icsk->icsk_clean_acked = cad;
0124     static_branch_deferred_inc(&clean_acked_data_enabled);
0125 }
0126 EXPORT_SYMBOL_GPL(clean_acked_data_enable);
0127
0128 void clean_acked_data_disable(struct inet_connection_sock *icsk)
0129 {
0130     static_branch_slow_dec_deferred(&clean_acked_data_enabled);
0131     icsk->icsk_clean_acked = NULL;
0132 }
0133 EXPORT_SYMBOL_GPL(clean_acked_data_disable);
0134
0135 void clean_acked_data_flush(void)
0136 {
0137     static_key_deferred_flush(&clean_acked_data_enabled);
0138 }
0139 EXPORT_SYMBOL_GPL(clean_acked_data_flush);
0140 #endif
0141
0142 #ifdef CONFIG_CGROUP_BPF
0143 static void bpf_skops_parse_hdr(struct sock *sk, struct sk_buff *skb)
0144 {
0145     bool unknown_opt = tcp_sk(sk)->rx_opt.saw_unknown &&
0146         BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk),
0147                        BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG);
0148     bool parse_all_opt = BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk),
0149                             BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG);
0150     struct bpf_sock_ops_kern sock_ops;
0151
0152     if (likely(!unknown_opt && !parse_all_opt))
0153         return;
0154
0155     /* The skb will be handled in the
0156      * bpf_skops_established() or
0157      * bpf_skops_write_hdr_opt().
0158      */
0159     switch (sk->sk_state) {
0160     case TCP_SYN_RECV:
0161     case TCP_SYN_SENT:
0162     case TCP_LISTEN:
0163         return;
0164     }
0165
0166     sock_owned_by_me(sk);
0167
0168     memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
0169     sock_ops.op = BPF_SOCK_OPS_PARSE_HDR_OPT_CB;
0170     sock_ops.is_fullsock = 1;
0171     sock_ops.sk = sk;
0172     bpf_skops_init_skb(&sock_ops, skb, tcp_hdrlen(skb));
0173
0174     BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops);
0175 }
0176
0177 static void bpf_skops_established(struct sock *sk, int bpf_op,
0178                   struct sk_buff *skb)
0179 {
0180     struct bpf_sock_ops_kern sock_ops;
0181
0182     sock_owned_by_me(sk);
0183
0184     memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
0185     sock_ops.op = bpf_op;
0186     sock_ops.is_fullsock = 1;
0187     sock_ops.sk = sk;
0188     /* sk with TCP_REPAIR_ON does not have skb in tcp_finish_connect */
0189     if (skb)
0190         bpf_skops_init_skb(&sock_ops, skb, tcp_hdrlen(skb));
0191
0192     BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops);
0193 }
0194 #else
/* Stub used when CONFIG_CGROUP_BPF is not defined: does nothing. */
static void bpf_skops_parse_hdr(struct sock *sk, struct sk_buff *skb)
{
}
0198
/* Stub used when CONFIG_CGROUP_BPF is not defined: does nothing. */
static void bpf_skops_established(struct sock *sk, int bpf_op,
                                  struct sk_buff *skb)
{
}
0203 #endif
0204
0205 static void tcp_gro_dev_warn(struct sock *sk, const struct sk_buff *skb,
0206                  unsigned int len)
0207 {
0208     static bool __once __read_mostly;
0209
0210     if (!__once) {
0211         struct net_device *dev;
0212
0213         __once = true;
0214
0215         rcu_read_lock();
0216         dev = dev_get_by_index_rcu(sock_net(sk), skb->skb_iif);
0217         if (!dev || len >= dev->mtu)
0218             pr_warn("%s: Driver has suspect GRO implementation, TCP performance may be compromised.\n",
0219                 dev ? dev->name : "Unknown driver");
0220         rcu_read_unlock();
0221     }
0222 }
0223
0224 /* Adapt the MSS value used to make delayed ack decision to the
0225  * real world.
0226  */
0227 static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb)
0228 {
0229     struct inet_connection_sock *icsk = inet_csk(sk);
0230     const unsigned int lss = icsk->icsk_ack.last_seg_size;
0231     unsigned int len;
0232
0233     icsk->icsk_ack.last_seg_size = 0;
0234
0235     /* skb->len may jitter because of SACKs, even if peer
0236      * sends good full-sized frames.
0237      */
0238     len = skb_shinfo(skb)->gso_size ? : skb->len;
0239     if (len >= icsk->icsk_ack.rcv_mss) {
0240         icsk->icsk_ack.rcv_mss = min_t(unsigned int, len,
0241                            tcp_sk(sk)->advmss);
0242         /* Account for possibly-removed options */
0243         if (unlikely(len > icsk->icsk_ack.rcv_mss +
0244                    MAX_TCP_OPTION_SPACE))
0245             tcp_gro_dev_warn(sk, skb, len);
0246     } else {
0247         /* Otherwise, we make more careful check taking into account,
0248          * that SACKs block is variable.
0249          *
0250          * "len" is invariant segment length, including TCP header.
0251          */
0252         len += skb->data - skb_transport_header(skb);
0253         if (len >= TCP_MSS_DEFAULT + sizeof(struct tcphdr) ||
0254             /* If PSH is not set, packet should be
0255              * full sized, provided peer TCP is not badly broken.
0256              * This observation (if it is correct 8)) allows
0257              * to handle super-low mtu links fairly.
0258              */
0259             (len >= TCP_MIN_MSS + sizeof(struct tcphdr) &&
0260              !(tcp_flag_word(tcp_hdr(skb)) & TCP_REMNANT))) {
0261             /* Subtract also invariant (if peer is RFC compliant),
0262              * tcp header plus fixed timestamp option length.
0263              * Resulting "len" is MSS free of SACK jitter.
0264              */
0265             len -= tcp_sk(sk)->tcp_header_len;
0266             icsk->icsk_ack.last_seg_size = len;
0267             if (len == lss) {
0268                 icsk->icsk_ack.rcv_mss = len;
0269                 return;
0270             }
0271         }
0272         if (icsk->icsk_ack.pending & ICSK_ACK_PUSHED)
0273             icsk->icsk_ack.pending |= ICSK_ACK_PUSHED2;
0274         icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
0275     }
0276 }
0277
0278 static void tcp_incr_quickack(struct sock *sk, unsigned int max_quickacks)
0279 {
0280     struct inet_connection_sock *icsk = inet_csk(sk);
0281     unsigned int quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss);
0282
0283     if (quickacks == 0)
0284         quickacks = 2;
0285     quickacks = min(quickacks, max_quickacks);
0286     if (quickacks > icsk->icsk_ack.quick)
0287         icsk->icsk_ack.quick = quickacks;
0288 }
0289
0290 void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks)
0291 {
0292     struct inet_connection_sock *icsk = inet_csk(sk);
0293
0294     tcp_incr_quickack(sk, max_quickacks);
0295     inet_csk_exit_pingpong_mode(sk);
0296     icsk->icsk_ack.ato = TCP_ATO_MIN;
0297 }
0298 EXPORT_SYMBOL(tcp_enter_quickack_mode);
0299
0300 /* Send ACKs quickly, if "quick" count is not exhausted
0301  * and the session is not interactive.
0302  */
0303
0304 static bool tcp_in_quickack_mode(struct sock *sk)
0305 {
0306     const struct inet_connection_sock *icsk = inet_csk(sk);
0307     const struct dst_entry *dst = __sk_dst_get(sk);
0308
0309     return (dst && dst_metric(dst, RTAX_QUICKACK)) ||
0310         (icsk->icsk_ack.quick && !inet_csk_in_pingpong_mode(sk));
0311 }
0312
0313 static void tcp_ecn_queue_cwr(struct tcp_sock *tp)
0314 {
0315     if (tp->ecn_flags & TCP_ECN_OK)
0316         tp->ecn_flags |= TCP_ECN_QUEUE_CWR;
0317 }
0318
0319 static void tcp_ecn_accept_cwr(struct sock *sk, const struct sk_buff *skb)
0320 {
0321     if (tcp_hdr(skb)->cwr) {
0322         tcp_sk(sk)->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
0323
0324         /* If the sender is telling us it has entered CWR, then its
0325          * cwnd may be very low (even just 1 packet), so we should ACK
0326          * immediately.
0327          */
0328         if (TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq)
0329             inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
0330     }
0331 }
0332
0333 static void tcp_ecn_withdraw_cwr(struct tcp_sock *tp)
0334 {
0335     tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR;
0336 }
0337
0338 static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb)
0339 {
0340     struct tcp_sock *tp = tcp_sk(sk);
0341
0342     switch (TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK) {
0343     case INET_ECN_NOT_ECT:
0344         /* Funny extension: if ECT is not set on a segment,
0345          * and we already seen ECT on a previous segment,
0346          * it is probably a retransmit.
0347          */
0348         if (tp->ecn_flags & TCP_ECN_SEEN)
0349             tcp_enter_quickack_mode(sk, 2);
0350         break;
0351     case INET_ECN_CE:
0352         if (tcp_ca_needs_ecn(sk))
0353             tcp_ca_event(sk, CA_EVENT_ECN_IS_CE);
0354
0355         if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) {
0356             /* Better not delay acks, sender can have a very low cwnd */
0357             tcp_enter_quickack_mode(sk, 2);
0358             tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
0359         }
0360         tp->ecn_flags |= TCP_ECN_SEEN;
0361         break;
0362     default:
0363         if (tcp_ca_needs_ecn(sk))
0364             tcp_ca_event(sk, CA_EVENT_ECN_NO_CE);
0365         tp->ecn_flags |= TCP_ECN_SEEN;
0366         break;
0367     }
0368 }
0369
0370 static void tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb)
0371 {
0372     if (tcp_sk(sk)->ecn_flags & TCP_ECN_OK)
0373         __tcp_ecn_check_ce(sk, skb);
0374 }
0375
0376 static void tcp_ecn_rcv_synack(struct tcp_sock *tp, const struct tcphdr *th)
0377 {
0378     if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || th->cwr))
0379         tp->ecn_flags &= ~TCP_ECN_OK;
0380 }
0381
0382 static void tcp_ecn_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th)
0383 {
0384     if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || !th->cwr))
0385         tp->ecn_flags &= ~TCP_ECN_OK;
0386 }
0387
0388 static bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr *th)
0389 {
0390     if (th->ece && !th->syn && (tp->ecn_flags & TCP_ECN_OK))
0391         return true;
0392     return false;
0393 }
0394
0395 /* Buffer size and advertised window tuning.
0396  *
0397  * 1. Tuning sk->sk_sndbuf, when connection enters established state.
0398  */
0399
0400 static void tcp_sndbuf_expand(struct sock *sk)
0401 {
0402     const struct tcp_sock *tp = tcp_sk(sk);
0403     const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
0404     int sndmem, per_mss;
0405     u32 nr_segs;
0406
0407     /* Worst case is non GSO/TSO : each frame consumes one skb
0408      * and skb->head is kmalloced using power of two area of memory
0409      */
0410     per_mss = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) +
0411           MAX_TCP_HEADER +
0412           SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
0413
0414     per_mss = roundup_pow_of_two(per_mss) +
0415           SKB_DATA_ALIGN(sizeof(struct sk_buff));
0416
0417     nr_segs = max_t(u32, TCP_INIT_CWND, tcp_snd_cwnd(tp));
0418     nr_segs = max_t(u32, nr_segs, tp->reordering + 1);
0419
0420     /* Fast Recovery (RFC 5681 3.2) :
0421      * Cubic needs 1.7 factor, rounded to 2 to include
0422      * extra cushion (application might react slowly to EPOLLOUT)
0423      */
0424     sndmem = ca_ops->sndbuf_expand ? ca_ops->sndbuf_expand(sk) : 2;
0425     sndmem *= nr_segs * per_mss;
0426
0427     if (sk->sk_sndbuf < sndmem)
0428         WRITE_ONCE(sk->sk_sndbuf,
0429                min(sndmem, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[2])));
0430 }
0431
0432 /* 2. Tuning advertised window (window_clamp, rcv_ssthresh)
0433  *
0434  * All tcp_full_space() is split to two parts: "network" buffer, allocated
0435  * forward and advertised in receiver window (tp->rcv_wnd) and
0436  * "application buffer", required to isolate scheduling/application
0437  * latencies from network.
0438  * window_clamp is the maximal advertised window. It can be less than
0439  * tcp_full_space(); in that case tcp_full_space() - window_clamp is
0440  * reserved for the "application" buffer. The smaller window_clamp is,
0441  * the smoother our behaviour from the network's viewpoint, but the lower
0442  * the throughput and the higher the connection's sensitivity to losses. 8)
0443  *
0444  * rcv_ssthresh is a stricter window_clamp used during the "slow start"
0445  * phase to predict the further behaviour of this connection.
0446  * It is used for two goals:
0447  * - to enforce header prediction at sender, even when application
0448  *   requires some significant "application buffer". It is check #1.
0449  * - to prevent pruning of receive queue because of misprediction
0450  *   of receiver window. Check #2.
0451  *
0452  * The scheme does not work when sender sends good segments opening
0453  * window and then starts to feed us spaghetti. But it should work
0454  * in common situations. Otherwise, we have to rely on queue collapsing.
0455  */
0456
0457 /* Slow part of check#2. */
0458 static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb,
0459                  unsigned int skbtruesize)
0460 {
0461     struct tcp_sock *tp = tcp_sk(sk);
0462     /* Optimize this! */
0463     int truesize = tcp_win_from_space(sk, skbtruesize) >> 1;
0464     int window = tcp_win_from_space(sk, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])) >> 1;
0465
0466     while (tp->rcv_ssthresh <= window) {
0467         if (truesize <= skb->len)
0468             return 2 * inet_csk(sk)->icsk_ack.rcv_mss;
0469
0470         truesize >>= 1;
0471         window >>= 1;
0472     }
0473     return 0;
0474 }
0475
0476 /* Even if skb appears to have a bad len/truesize ratio, TCP coalescing
0477  * can play nice with us, as sk_buff and skb->head might be either
0478  * freed or shared with up to MAX_SKB_FRAGS segments.
0479  * Only give a boost to drivers using page frag(s) to hold the frame(s),
0480  * and if no payload was pulled in skb->head before reaching us.
0481  */
0482 static u32 truesize_adjust(bool adjust, const struct sk_buff *skb)
0483 {
0484     u32 truesize = skb->truesize;
0485
0486     if (adjust && !skb_headlen(skb)) {
0487         truesize -= SKB_TRUESIZE(skb_end_offset(skb));
0488         /* paranoid check, some drivers might be buggy */
0489         if (unlikely((int)truesize < (int)skb->len))
0490             truesize = skb->truesize;
0491     }
0492     return truesize;
0493 }
0494
0495 static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb,
0496                 bool adjust)
0497 {
0498     struct tcp_sock *tp = tcp_sk(sk);
0499     int room;
0500
0501     room = min_t(int, tp->window_clamp, tcp_space(sk)) - tp->rcv_ssthresh;
0502
0503     if (room <= 0)
0504         return;
0505
0506     /* Check #1 */
0507     if (!tcp_under_memory_pressure(sk)) {
0508         unsigned int truesize = truesize_adjust(adjust, skb);
0509         int incr;
0510
0511         /* Check #2. Increase window, if skb with such overhead
0512          * will fit to rcvbuf in future.
0513          */
0514         if (tcp_win_from_space(sk, truesize) <= skb->len)
0515             incr = 2 * tp->advmss;
0516         else
0517             incr = __tcp_grow_window(sk, skb, truesize);
0518
0519         if (incr) {
0520             incr = max_t(int, incr, 2 * skb->len);
0521             tp->rcv_ssthresh += min(room, incr);
0522             inet_csk(sk)->icsk_ack.quick |= 1;
0523         }
0524     } else {
0525         /* Under pressure:
0526          * Adjust rcv_ssthresh according to reserved mem
0527          */
0528         tcp_adjust_rcv_ssthresh(sk);
0529     }
0530 }
0531
0532 /* 3. Try to fixup all. It is made immediately after connection enters
0533  *    established state.
0534  */
0535 static void tcp_init_buffer_space(struct sock *sk)
0536 {
0537     int tcp_app_win = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_app_win);
0538     struct tcp_sock *tp = tcp_sk(sk);
0539     int maxwin;
0540
0541     if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK))
0542         tcp_sndbuf_expand(sk);
0543
0544     tcp_mstamp_refresh(tp);
0545     tp->rcvq_space.time = tp->tcp_mstamp;
0546     tp->rcvq_space.seq = tp->copied_seq;
0547
0548     maxwin = tcp_full_space(sk);
0549
0550     if (tp->window_clamp >= maxwin) {
0551         tp->window_clamp = maxwin;
0552
0553         if (tcp_app_win && maxwin > 4 * tp->advmss)
0554             tp->window_clamp = max(maxwin -
0555                            (maxwin >> tcp_app_win),
0556                            4 * tp->advmss);
0557     }
0558
0559     /* Force reservation of one segment. */
0560     if (tcp_app_win &&
0561         tp->window_clamp > 2 * tp->advmss &&
0562         tp->window_clamp + tp->advmss > maxwin)
0563         tp->window_clamp = max(2 * tp->advmss, maxwin - tp->advmss);
0564
0565     tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->window_clamp);
0566     tp->snd_cwnd_stamp = tcp_jiffies32;
0567     tp->rcvq_space.space = min3(tp->rcv_ssthresh, tp->rcv_wnd,
0568                     (u32)TCP_INIT_CWND * tp->advmss);
0569 }
0570
0571 /* 4. Recalculate window clamp after socket hit its memory bounds. */
0572 static void tcp_clamp_window(struct sock *sk)
0573 {
0574     struct tcp_sock *tp = tcp_sk(sk);
0575     struct inet_connection_sock *icsk = inet_csk(sk);
0576     struct net *net = sock_net(sk);
0577     int rmem2;
0578
0579     icsk->icsk_ack.quick = 0;
0580     rmem2 = READ_ONCE(net->ipv4.sysctl_tcp_rmem[2]);
0581
0582     if (sk->sk_rcvbuf < rmem2 &&
0583         !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
0584         !tcp_under_memory_pressure(sk) &&
0585         sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) {
0586         WRITE_ONCE(sk->sk_rcvbuf,
0587                min(atomic_read(&sk->sk_rmem_alloc), rmem2));
0588     }
0589     if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
0590         tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss);
0591 }
0592
0593 /* Initialize RCV_MSS value.
0594  * RCV_MSS is an our guess about MSS used by the peer.
0595  * We haven't any direct information about the MSS.
0596  * It's better to underestimate the RCV_MSS rather than overestimate.
0597  * Overestimations make us ACKing less frequently than needed.
0598  * Underestimations are more easy to detect and fix by tcp_measure_rcv_mss().
0599  */
0600 void tcp_initialize_rcv_mss(struct sock *sk)
0601 {
0602     const struct tcp_sock *tp = tcp_sk(sk);
0603     unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache);
0604
0605     hint = min(hint, tp->rcv_wnd / 2);
0606     hint = min(hint, TCP_MSS_DEFAULT);
0607     hint = max(hint, TCP_MIN_MSS);
0608
0609     inet_csk(sk)->icsk_ack.rcv_mss = hint;
0610 }
0611 EXPORT_SYMBOL(tcp_initialize_rcv_mss);
0612
0613 /* Receiver "autotuning" code.
0614  *
0615  * The algorithm for RTT estimation w/o timestamps is based on
0616  * Dynamic Right-Sizing (DRS) by Wu Feng and Mike Fisk of LANL.
0617  * <https://public.lanl.gov/radiant/pubs.html#DRS>
0618  *
0619  * More detail on this code can be found at
0620  * <http://staff.psc.edu/jheffner/>,
0621  * though this reference is out of date.  A new paper
0622  * is pending.
0623  */
0624 static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
0625 {
0626     u32 new_sample = tp->rcv_rtt_est.rtt_us;
0627     long m = sample;
0628
0629     if (new_sample != 0) {
0630         /* If we sample in larger samples in the non-timestamp
0631          * case, we could grossly overestimate the RTT especially
0632          * with chatty applications or bulk transfer apps which
0633          * are stalled on filesystem I/O.
0634          *
0635          * Also, since we are only going for a minimum in the
0636          * non-timestamp case, we do not smooth things out
0637          * else with timestamps disabled convergence takes too
0638          * long.
0639          */
0640         if (!win_dep) {
0641             m -= (new_sample >> 3);
0642             new_sample += m;
0643         } else {
0644             m <<= 3;
0645             if (m < new_sample)
0646                 new_sample = m;
0647         }
0648     } else {
0649         /* No previous measure. */
0650         new_sample = m << 3;
0651     }
0652
0653     tp->rcv_rtt_est.rtt_us = new_sample;
0654 }
0655
0656 static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp)
0657 {
0658     u32 delta_us;
0659
0660     if (tp->rcv_rtt_est.time == 0)
0661         goto new_measure;
0662     if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq))
0663         return;
0664     delta_us = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcv_rtt_est.time);
0665     if (!delta_us)
0666         delta_us = 1;
0667     tcp_rcv_rtt_update(tp, delta_us, 1);
0668
0669 new_measure:
0670     tp->rcv_rtt_est.seq = tp->rcv_nxt + tp->rcv_wnd;
0671     tp->rcv_rtt_est.time = tp->tcp_mstamp;
0672 }
0673
0674 static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,
0675                       const struct sk_buff *skb)
0676 {
0677     struct tcp_sock *tp = tcp_sk(sk);
0678
0679     if (tp->rx_opt.rcv_tsecr == tp->rcv_rtt_last_tsecr)
0680         return;
0681     tp->rcv_rtt_last_tsecr = tp->rx_opt.rcv_tsecr;
0682
0683     if (TCP_SKB_CB(skb)->end_seq -
0684         TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss) {
0685         u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr;
0686         u32 delta_us;
0687
0688         if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) {
0689             if (!delta)
0690                 delta = 1;
0691             delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ);
0692             tcp_rcv_rtt_update(tp, delta_us, 0);
0693         }
0694     }
0695 }
0696
0697 /*
0698  * This function should be called every time data is copied to user space.
0699  * It calculates the appropriate TCP receive buffer space.
0700  */
0701 void tcp_rcv_space_adjust(struct sock *sk)
0702 {
0703     struct tcp_sock *tp = tcp_sk(sk);
0704     u32 copied;
0705     int time;
0706
0707     trace_tcp_rcv_space_adjust(sk);
0708
0709     tcp_mstamp_refresh(tp);
0710     time = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcvq_space.time);
0711     if (time < (tp->rcv_rtt_est.rtt_us >> 3) || tp->rcv_rtt_est.rtt_us == 0)
0712         return;
0713
0714     /* Number of bytes copied to user in last RTT */
0715     copied = tp->copied_seq - tp->rcvq_space.seq;
0716     if (copied <= tp->rcvq_space.space)
0717         goto new_measure;
0718
0719     /* A bit of theory :
0720      * copied = bytes received in previous RTT, our base window
0721      * To cope with packet losses, we need a 2x factor
0722      * To cope with slow start, and sender growing its cwin by 100 %
0723      * every RTT, we need a 4x factor, because the ACK we are sending
0724      * now is for the next RTT, not the current one :
0725      * <prev RTT . ><current RTT .. ><next RTT .... >
0726      */
0727
0728     if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) &&
0729         !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
0730         int rcvmem, rcvbuf;
0731         u64 rcvwin, grow;
0732
0733         /* minimal window to cope with packet losses, assuming
0734          * steady state. Add some cushion because of small variations.
0735          */
0736         rcvwin = ((u64)copied << 1) + 16 * tp->advmss;
0737
0738         /* Accommodate for sender rate increase (eg. slow start) */
0739         grow = rcvwin * (copied - tp->rcvq_space.space);
0740         do_div(grow, tp->rcvq_space.space);
0741         rcvwin += (grow << 1);
0742
0743         rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER);
0744         while (tcp_win_from_space(sk, rcvmem) < tp->advmss)
0745             rcvmem += 128;
0746
0747         do_div(rcvwin, tp->advmss);
0748         rcvbuf = min_t(u64, rcvwin * rcvmem,
0749                    READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]));
0750         if (rcvbuf > sk->sk_rcvbuf) {
0751             WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
0752
0753             /* Make the window clamp follow along.  */
0754             tp->window_clamp = tcp_win_from_space(sk, rcvbuf);
0755         }
0756     }
0757     tp->rcvq_space.space = copied;
0758
0759 new_measure:
0760     tp->rcvq_space.seq = tp->copied_seq;
0761     tp->rcvq_space.time = tp->tcp_mstamp;
0762 }
0763
0764 /* There is something which you must keep in mind when you analyze the
0765  * behavior of the tp->ato delayed ack timeout interval.  When a
0766  * connection starts up, we want to ack as quickly as possible.  The
0767  * problem is that "good" TCP's do slow start at the beginning of data
0768  * transmission.  This means that until we send the first few ACKs the
0769  * sender will sit on his end and only queue most of his data, because
0770  * he can only send snd_cwnd unacked packets at any given time.  For
0771  * each ACK we send, he increments snd_cwnd and transmits more of his
0772  * queue.  -DaveM
0773  */
0774 static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
0775 {
0776     struct tcp_sock *tp = tcp_sk(sk);
0777     struct inet_connection_sock *icsk = inet_csk(sk);
0778     u32 now;
0779
0780     inet_csk_schedule_ack(sk);
0781
0782     tcp_measure_rcv_mss(sk, skb);
0783
0784     tcp_rcv_rtt_measure(tp);
0785
0786     now = tcp_jiffies32;
0787
0788     if (!icsk->icsk_ack.ato) {
0789         /* The _first_ data packet received, initialize
0790          * delayed ACK engine.
0791          */
0792         tcp_incr_quickack(sk, TCP_MAX_QUICKACKS);
0793         icsk->icsk_ack.ato = TCP_ATO_MIN;
0794     } else {
0795         int m = now - icsk->icsk_ack.lrcvtime;
0796
0797         if (m <= TCP_ATO_MIN / 2) {
0798             /* The fastest case is the first. */
0799             icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + TCP_ATO_MIN / 2;
0800         } else if (m < icsk->icsk_ack.ato) {
0801             icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + m;
0802             if (icsk->icsk_ack.ato > icsk->icsk_rto)
0803                 icsk->icsk_ack.ato = icsk->icsk_rto;
0804         } else if (m > icsk->icsk_rto) {
0805             /* Too long gap. Apparently sender failed to
0806              * restart window, so that we send ACKs quickly.
0807              */
0808             tcp_incr_quickack(sk, TCP_MAX_QUICKACKS);
0809         }
0810     }
0811     icsk->icsk_ack.lrcvtime = now;
0812
0813     tcp_ecn_check_ce(sk, skb);
0814
0815     if (skb->len >= 128)
0816         tcp_grow_window(sk, skb, true);
0817 }
0818
0819 /* Called to compute a smoothed rtt estimate. The data fed to this
0820  * routine either comes from timestamps, or from segments that were
0821  * known _not_ to have been retransmitted [see Karn/Partridge
0822  * Proceedings SIGCOMM 87]. The algorithm is from the SIGCOMM 88
0823  * piece by Van Jacobson.
0824  * NOTE: the next three routines used to be one big routine.
0825  * To save cycles in the RFC 1323 implementation it was better to break
0826  * it up into three procedures. -- erics
0827  */
0828 static void tcp_rtt_estimator(struct sock *sk, long mrtt_us)
0829 {
0830     struct tcp_sock *tp = tcp_sk(sk);
0831     long m = mrtt_us; /* RTT */
0832     u32 srtt = tp->srtt_us;
0833
0834     /*  The following amusing code comes from Jacobson's
0835      *  article in SIGCOMM '88.  Note that rtt and mdev
0836      *  are scaled versions of rtt and mean deviation.
0837      *  This is designed to be as fast as possible
0838      *  m stands for "measurement".
0839      *
0840      *  On a 1990 paper the rto value is changed to:
0841      *  RTO = rtt + 4 * mdev
0842      *
0843      * Funny. This algorithm seems to be very broken.
0844      * These formulae increase RTO, when it should be decreased, increase
0845      * too slowly, when it should be increased quickly, decrease too quickly
0846      * etc. I guess in BSD RTO takes ONE value, so that it is absolutely
0847      * does not matter how to _calculate_ it. Seems, it was trap
0848      * that VJ failed to avoid. 8)
0849      */
0850     if (srtt != 0) {
0851         m -= (srtt >> 3);   /* m is now error in rtt est */
0852         srtt += m;      /* rtt = 7/8 rtt + 1/8 new */
0853         if (m < 0) {
0854             m = -m;     /* m is now abs(error) */
0855             m -= (tp->mdev_us >> 2);   /* similar update on mdev */
0856             /* This is similar to one of Eifel findings.
0857              * Eifel blocks mdev updates when rtt decreases.
0858              * This solution is a bit different: we use finer gain
0859              * for mdev in this case (alpha*beta).
0860              * Like Eifel it also prevents growth of rto,
0861              * but also it limits too fast rto decreases,
0862              * happening in pure Eifel.
0863              */
0864             if (m > 0)
0865                 m >>= 3;
0866         } else {
0867             m -= (tp->mdev_us >> 2);   /* similar update on mdev */
0868         }
0869         tp->mdev_us += m;       /* mdev = 3/4 mdev + 1/4 new */
0870         if (tp->mdev_us > tp->mdev_max_us) {
0871             tp->mdev_max_us = tp->mdev_us;
0872             if (tp->mdev_max_us > tp->rttvar_us)
0873                 tp->rttvar_us = tp->mdev_max_us;
0874         }
0875         if (after(tp->snd_una, tp->rtt_seq)) {
0876             if (tp->mdev_max_us < tp->rttvar_us)
0877                 tp->rttvar_us -= (tp->rttvar_us - tp->mdev_max_us) >> 2;
0878             tp->rtt_seq = tp->snd_nxt;
0879             tp->mdev_max_us = tcp_rto_min_us(sk);
0880
0881             tcp_bpf_rtt(sk);
0882         }
0883     } else {
0884         /* no previous measure. */
0885         srtt = m << 3;      /* take the measured time to be rtt */
0886         tp->mdev_us = m << 1;   /* make sure rto = 3*rtt */
0887         tp->rttvar_us = max(tp->mdev_us, tcp_rto_min_us(sk));
0888         tp->mdev_max_us = tp->rttvar_us;
0889         tp->rtt_seq = tp->snd_nxt;
0890
0891         tcp_bpf_rtt(sk);
0892     }
0893     tp->srtt_us = max(1U, srtt);
0894 }
0895
0896 static void tcp_update_pacing_rate(struct sock *sk)
0897 {
0898     const struct tcp_sock *tp = tcp_sk(sk);
0899     u64 rate;
0900
0901     /* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */
0902     rate = (u64)tp->mss_cache * ((USEC_PER_SEC / 100) << 3);
0903
0904     /* current rate is (cwnd * mss) / srtt
0905      * In Slow Start [1], set sk_pacing_rate to 200 % the current rate.
0906      * In Congestion Avoidance phase, set it to 120 % the current rate.
0907      *
0908      * [1] : Normal Slow Start condition is (tp->snd_cwnd < tp->snd_ssthresh)
0909      *   If snd_cwnd >= (tp->snd_ssthresh / 2), we are approaching
0910      *   end of slow start and should slow down.
0911      */
0912     if (tcp_snd_cwnd(tp) < tp->snd_ssthresh / 2)
0913         rate *= READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_pacing_ss_ratio);
0914     else
0915         rate *= READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_pacing_ca_ratio);
0916
0917     rate *= max(tcp_snd_cwnd(tp), tp->packets_out);
0918
0919     if (likely(tp->srtt_us))
0920         do_div(rate, tp->srtt_us);
0921
0922     /* WRITE_ONCE() is needed because sch_fq fetches sk_pacing_rate
0923      * without any lock. We want to make sure compiler wont store
0924      * intermediate values in this location.
0925      */
0926     WRITE_ONCE(sk->sk_pacing_rate, min_t(u64, rate,
0927                          sk->sk_max_pacing_rate));
0928 }
0929
0930 /* Calculate rto without backoff.  This is the second half of Van Jacobson's
0931  * routine referred to above.
0932  */
0933 static void tcp_set_rto(struct sock *sk)
0934 {
0935     const struct tcp_sock *tp = tcp_sk(sk);
0936     /* Old crap is replaced with new one. 8)
0937      *
0938      * More seriously:
0939      * 1. If rtt variance happened to be less 50msec, it is hallucination.
0940      *    It cannot be less due to utterly erratic ACK generation made
0941      *    at least by solaris and freebsd. "Erratic ACKs" has _nothing_
0942      *    to do with delayed acks, because at cwnd>2 true delack timeout
0943      *    is invisible. Actually, Linux-2.4 also generates erratic
0944      *    ACKs in some circumstances.
0945      */
0946     inet_csk(sk)->icsk_rto = __tcp_set_rto(tp);
0947
0948     /* 2. Fixups made earlier cannot be right.
0949      *    If we do not estimate RTO correctly without them,
0950      *    all the algo is pure shit and should be replaced
0951      *    with correct one. It is exactly, which we pretend to do.
0952      */
0953
0954     /* NOTE: clamping at TCP_RTO_MIN is not required, current algo
0955      * guarantees that rto is higher.
0956      */
0957     tcp_bound_rto(sk);
0958 }
0959
0960 __u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst)
0961 {
0962     __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
0963
0964     if (!cwnd)
0965         cwnd = TCP_INIT_CWND;
0966     return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
0967 }
0968
0969 struct tcp_sacktag_state {
0970     /* Timestamps for earliest and latest never-retransmitted segment
0971      * that was SACKed. RTO needs the earliest RTT to stay conservative,
0972      * but congestion control should still get an accurate delay signal.
0973      */
0974     u64 first_sackt;
0975     u64 last_sackt;
0976     u32 reord;
0977     u32 sack_delivered;
0978     int flag;
0979     unsigned int mss_now;
0980     struct rate_sample *rate;
0981 };
0982
0983 /* Take a notice that peer is sending D-SACKs. Skip update of data delivery
0984  * and spurious retransmission information if this DSACK is unlikely caused by
0985  * sender's action:
0986  * - DSACKed sequence range is larger than maximum receiver's window.
0987  * - Total no. of DSACKed segments exceed the total no. of retransmitted segs.
0988  */
0989 static u32 tcp_dsack_seen(struct tcp_sock *tp, u32 start_seq,
0990               u32 end_seq, struct tcp_sacktag_state *state)
0991 {
0992     u32 seq_len, dup_segs = 1;
0993
0994     if (!before(start_seq, end_seq))
0995         return 0;
0996
0997     seq_len = end_seq - start_seq;
0998     /* Dubious DSACK: DSACKed range greater than maximum advertised rwnd */
0999     if (seq_len > tp->max_window)
1000         return 0;
1001     if (seq_len > tp->mss_cache)
1002         dup_segs = DIV_ROUND_UP(seq_len, tp->mss_cache);
1003     else if (tp->tlp_high_seq && tp->tlp_high_seq == end_seq)
1004         state->flag |= FLAG_DSACK_TLP;
1005
1006     tp->dsack_dups += dup_segs;
1007     /* Skip the DSACK if dup segs weren't retransmitted by sender */
1008     if (tp->dsack_dups > tp->total_retrans)
1009         return 0;
1010
1011     tp->rx_opt.sack_ok |= TCP_DSACK_SEEN;
1012     /* We increase the RACK ordering window in rounds where we receive
1013      * DSACKs that may have been due to reordering causing RACK to trigger
1014      * a spurious fast recovery. Thus RACK ignores DSACKs that happen
1015      * without having seen reordering, or that match TLP probes (TLP
1016      * is timer-driven, not triggered by RACK).
1017      */
1018     if (tp->reord_seen && !(state->flag & FLAG_DSACK_TLP))
1019         tp->rack.dsack_seen = 1;
1020
1021     state->flag |= FLAG_DSACKING_ACK;
1022     /* A spurious retransmission is delivered */
1023     state->sack_delivered += dup_segs;
1024
1025     return dup_segs;
1026 }
1027
1028 /* It's reordering when higher sequence was delivered (i.e. sacked) before
1029  * some lower never-retransmitted sequence ("low_seq"). The maximum reordering
1030  * distance is approximated in full-mss packet distance ("reordering").
1031  */
1032 static void tcp_check_sack_reordering(struct sock *sk, const u32 low_seq,
1033                       const int ts)
1034 {
1035     struct tcp_sock *tp = tcp_sk(sk);
1036     const u32 mss = tp->mss_cache;
1037     u32 fack, metric;
1038
1039     fack = tcp_highest_sack_seq(tp);
1040     if (!before(low_seq, fack))
1041         return;
1042
1043     metric = fack - low_seq;
1044     if ((metric > tp->reordering * mss) && mss) {
1045 #if FASTRETRANS_DEBUG > 1
1046         pr_debug("Disorder%d %d %u f%u s%u rr%d\n",
1047              tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state,
1048              tp->reordering,
1049              0,
1050              tp->sacked_out,
1051              tp->undo_marker ? tp->undo_retrans : 0);
1052 #endif
1053         tp->reordering = min_t(u32, (metric + mss - 1) / mss,
1054                        READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_max_reordering));
1055     }
1056
1057     /* This exciting event is worth to be remembered. 8) */
1058     tp->reord_seen++;
1059     NET_INC_STATS(sock_net(sk),
1060               ts ? LINUX_MIB_TCPTSREORDER : LINUX_MIB_TCPSACKREORDER);
1061 }
1062
1063  /* This must be called before lost_out or retrans_out are updated
1064   * on a new loss, because we want to know if all skbs previously
1065   * known to be lost have already been retransmitted, indicating
1066   * that this newly lost skb is our next skb to retransmit.
1067   */
1068 static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb)
1069 {
1070     if ((!tp->retransmit_skb_hint && tp->retrans_out >= tp->lost_out) ||
1071         (tp->retransmit_skb_hint &&
1072          before(TCP_SKB_CB(skb)->seq,
1073             TCP_SKB_CB(tp->retransmit_skb_hint)->seq)))
1074         tp->retransmit_skb_hint = skb;
1075 }
1076
1077 /* Sum the number of packets on the wire we have marked as lost, and
1078  * notify the congestion control module that the given skb was marked lost.
1079  */
1080 static void tcp_notify_skb_loss_event(struct tcp_sock *tp, const struct sk_buff *skb)
1081 {
1082     tp->lost += tcp_skb_pcount(skb);
1083 }
1084
1085 void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb)
1086 {
1087     __u8 sacked = TCP_SKB_CB(skb)->sacked;
1088     struct tcp_sock *tp = tcp_sk(sk);
1089
1090     if (sacked & TCPCB_SACKED_ACKED)
1091         return;
1092
1093     tcp_verify_retransmit_hint(tp, skb);
1094     if (sacked & TCPCB_LOST) {
1095         if (sacked & TCPCB_SACKED_RETRANS) {
1096             /* Account for retransmits that are lost again */
1097             TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
1098             tp->retrans_out -= tcp_skb_pcount(skb);
1099             NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT,
1100                       tcp_skb_pcount(skb));
1101             tcp_notify_skb_loss_event(tp, skb);
1102         }
1103     } else {
1104         tp->lost_out += tcp_skb_pcount(skb);
1105         TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
1106         tcp_notify_skb_loss_event(tp, skb);
1107     }
1108 }
1109
1110 /* Updates the delivered and delivered_ce counts */
1111 static void tcp_count_delivered(struct tcp_sock *tp, u32 delivered,
1112                 bool ece_ack)
1113 {
1114     tp->delivered += delivered;
1115     if (ece_ack)
1116         tp->delivered_ce += delivered;
1117 }
1118
1119 /* This procedure tags the retransmission queue when SACKs arrive.
1120  *
1121  * We have three tag bits: SACKED(S), RETRANS(R) and LOST(L).
1122  * Packets in queue with these bits set are counted in variables
1123  * sacked_out, retrans_out and lost_out, correspondingly.
1124  *
1125  * Valid combinations are:
1126  * Tag  InFlight    Description
1127  * 0    1       - orig segment is in flight.
1128  * S    0       - nothing flies, orig reached receiver.
1129  * L    0       - nothing flies, orig lost by net.
1130  * R    2       - both orig and retransmit are in flight.
1131  * L|R  1       - orig is lost, retransmit is in flight.
1132  * S|R  1       - orig reached receiver, retrans is still in flight.
1133  * (L|S|R is logically valid, it could occur when L|R is sacked,
1134  *  but it is equivalent to plain S and code short-circuits it to S.
1135  *  L|S is logically invalid, it would mean -1 packet in flight 8))
1136  *
1137  * These 6 states form finite state machine, controlled by the following events:
1138  * 1. New ACK (+SACK) arrives. (tcp_sacktag_write_queue())
1139  * 2. Retransmission. (tcp_retransmit_skb(), tcp_xmit_retransmit_queue())
1140  * 3. Loss detection event of two flavors:
1141  *  A. Scoreboard estimator decided the packet is lost.
1142  *     A'. Reno "three dupacks" marks head of queue lost.
1143  *  B. SACK arrives sacking SND.NXT at the moment, when the
1144  *     segment was retransmitted.
1145  * 4. D-SACK added new rule: D-SACK changes any tag to S.
1146  *
1147  * It is pleasant to note, that state diagram turns out to be commutative,
1148  * so that we are allowed not to be bothered by order of our actions,
1149  * when multiple events arrive simultaneously. (see the function below).
1150  *
1151  * Reordering detection.
1152  * --------------------
1153  * Reordering metric is maximal distance, which a packet can be displaced
1154  * in packet stream. With SACKs we can estimate it:
1155  *
1156  * 1. SACK fills old hole and the corresponding segment was not
1157  *    ever retransmitted -> reordering. Alas, we cannot use it
1158  *    when segment was retransmitted.
1159  * 2. The last flaw is solved with D-SACK. D-SACK arrives
1160  *    for retransmitted and already SACKed segment -> reordering..
1161  * Both of these heuristics are not used in Loss state, when we cannot
1162  * account for retransmits accurately.
1163  *
1164  * SACK block validation.
1165  * ----------------------
1166  *
1167  * SACK block range validation checks that the received SACK block fits to
1168  * the expected sequence limits, i.e., it is between SND.UNA and SND.NXT.
1169  * Note that SND.UNA is not included to the range though being valid because
1170  * it means that the receiver is rather inconsistent with itself reporting
1171  * SACK reneging when it should advance SND.UNA. Such a SACK block is
1172  * perfectly valid, however, in light of RFC2018 which explicitly states
1173  * that "SACK block MUST reflect the newest segment.  Even if the newest
1174  * segment is going to be discarded ...", not that it looks very clever
1175  * in case of head skb. Due to potential receiver driven attacks, we
1176  * choose to avoid immediate execution of a walk in write queue due to
1177  * reneging and defer head skb's loss recovery to standard loss recovery
1178  * procedure that will eventually trigger (nothing forbids us doing this).
1179  *
1180  * Implements also blockage to start_seq wrap-around. Problem lies in the
1181  * fact that though start_seq (s) is before end_seq (i.e., not reversed),
1182  * there's no guarantee that it will be before snd_nxt (n). The problem
1183  * happens when start_seq resides between end_seq wrap (e_w) and snd_nxt
1184  * wrap (s_w):
1185  *
1186  *         <- outs wnd ->                          <- wrapzone ->
1187  *         u     e      n                         u_w   e_w  s n_w
1188  *         |     |      |                          |     |   |  |
1189  * |<------------+------+----- TCP seqno space --------------+---------->|
1190  * ...-- <2^31 ->|                                           |<--------...
1191  * ...---- >2^31 ------>|                                    |<--------...
1192  *
1193  * Current code wouldn't be vulnerable but it's better still to discard such
1194  * crazy SACK blocks. Doing this check for start_seq alone closes somewhat
1195  * similar case (end_seq after snd_nxt wrap) as earlier reversed check in
1196  * snd_nxt wrap -> snd_una region will then become "well defined", i.e.,
1197  * equal to the ideal case (infinite seqno space without wrap caused issues).
1198  *
1199  * With D-SACK the lower bound is extended to cover sequence space below
1200  * SND.UNA down to undo_marker, which is the last point of interest. Yet
1201  * again, D-SACK block must not go across snd_una (for the same reason as
1202  * for the normal SACK blocks, explained above). But there all simplicity
1203  * ends, TCP might receive valid D-SACKs below that. As long as they reside
1204  * fully below undo_marker they do not affect behavior in any way and can
1205  * therefore be safely ignored. In rare cases (which are more or less
1206  * theoretical ones), the D-SACK will nicely cross that boundary due to skb
1207  * fragmentation and packet reordering past skb's retransmission. To consider
1208  * them correctly, the acceptable range must be extended even more though
1209  * the exact amount is rather hard to quantify. However, tp->max_window can
1210  * be used as an exaggerated estimate.
1211  */
1212 static bool tcp_is_sackblock_valid(struct tcp_sock *tp, bool is_dsack,
1213                    u32 start_seq, u32 end_seq)
1214 {
1215     /* Too far in future, or reversed (interpretation is ambiguous) */
1216     if (after(end_seq, tp->snd_nxt) || !before(start_seq, end_seq))
1217         return false;
1218
1219     /* Nasty start_seq wrap-around check (see comments above) */
1220     if (!before(start_seq, tp->snd_nxt))
1221         return false;
1222
1223     /* In outstanding window? ...This is valid exit for D-SACKs too.
1224      * start_seq == snd_una is non-sensical (see comments above)
1225      */
1226     if (after(start_seq, tp->snd_una))
1227         return true;
1228
1229     if (!is_dsack || !tp->undo_marker)
1230         return false;
1231
1232     /* ...Then it's D-SACK, and must reside below snd_una completely */
1233     if (after(end_seq, tp->snd_una))
1234         return false;
1235
1236     if (!before(start_seq, tp->undo_marker))
1237         return true;
1238
1239     /* Too old */
1240     if (!after(end_seq, tp->undo_marker))
1241         return false;
1242
1243     /* Undo_marker boundary crossing (overestimates a lot). Known already:
1244      *   start_seq < undo_marker and end_seq >= undo_marker.
1245      */
1246     return !before(start_seq, end_seq - tp->max_window);
1247 }
1248
1249 static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb,
1250                 struct tcp_sack_block_wire *sp, int num_sacks,
1251                 u32 prior_snd_una, struct tcp_sacktag_state *state)
1252 {
1253     struct tcp_sock *tp = tcp_sk(sk);
1254     u32 start_seq_0 = get_unaligned_be32(&sp[0].start_seq);
1255     u32 end_seq_0 = get_unaligned_be32(&sp[0].end_seq);
1256     u32 dup_segs;
1257
1258     if (before(start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) {
1259         NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKRECV);
1260     } else if (num_sacks > 1) {
1261         u32 end_seq_1 = get_unaligned_be32(&sp[1].end_seq);
1262         u32 start_seq_1 = get_unaligned_be32(&sp[1].start_seq);
1263
1264         if (after(end_seq_0, end_seq_1) || before(start_seq_0, start_seq_1))
1265             return false;
1266         NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKOFORECV);
1267     } else {
1268         return false;
1269     }
1270
1271     dup_segs = tcp_dsack_seen(tp, start_seq_0, end_seq_0, state);
1272     if (!dup_segs) {    /* Skip dubious DSACK */
1273         NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKIGNOREDDUBIOUS);
1274         return false;
1275     }
1276
1277     NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDSACKRECVSEGS, dup_segs);
1278
1279     /* D-SACK for already forgotten data... Do dumb counting. */
1280     if (tp->undo_marker && tp->undo_retrans > 0 &&
1281         !after(end_seq_0, prior_snd_una) &&
1282         after(end_seq_0, tp->undo_marker))
1283         tp->undo_retrans = max_t(int, 0, tp->undo_retrans - dup_segs);
1284
1285     return true;
1286 }
1287
1288 /* Check if skb is fully within the SACK block. In presence of GSO skbs,
1289  * the incoming SACK may not exactly match but we can find smaller MSS
1290  * aligned portion of it that matches. Therefore we might need to fragment
1291  * which may fail and creates some hassle (caller must handle error case
1292  * returns).
1293  *
1294  * FIXME: this could be merged to shift decision code
1295  */
1296 static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
1297                   u32 start_seq, u32 end_seq)
1298 {
1299     int err;
1300     bool in_sack;
1301     unsigned int pkt_len;
1302     unsigned int mss;
1303
1304     in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
1305           !before(end_seq, TCP_SKB_CB(skb)->end_seq);
1306
1307     if (tcp_skb_pcount(skb) > 1 && !in_sack &&
1308         after(TCP_SKB_CB(skb)->end_seq, start_seq)) {
1309         mss = tcp_skb_mss(skb);
1310         in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq);
1311
1312         if (!in_sack) {
1313             pkt_len = start_seq - TCP_SKB_CB(skb)->seq;
1314             if (pkt_len < mss)
1315                 pkt_len = mss;
1316         } else {
1317             pkt_len = end_seq - TCP_SKB_CB(skb)->seq;
1318             if (pkt_len < mss)
1319                 return -EINVAL;
1320         }
1321
1322         /* Round if necessary so that SACKs cover only full MSSes
1323          * and/or the remaining small portion (if present)
1324          */
1325         if (pkt_len > mss) {
1326             unsigned int new_len = (pkt_len / mss) * mss;
1327             if (!in_sack && new_len < pkt_len)
1328                 new_len += mss;
1329             pkt_len = new_len;
1330         }
1331
1332         if (pkt_len >= skb->len && !in_sack)
1333             return 0;
1334
1335         err = tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb,
1336                    pkt_len, mss, GFP_ATOMIC);
1337         if (err < 0)
1338             return err;
1339     }
1340
1341     return in_sack;
1342 }
1343
1344 /* Mark the given newly-SACKed range as such, adjusting counters and hints. */
1345 static u8 tcp_sacktag_one(struct sock *sk,
1346               struct tcp_sacktag_state *state, u8 sacked,
1347               u32 start_seq, u32 end_seq,
1348               int dup_sack, int pcount,
1349               u64 xmit_time)
1350 {
1351     struct tcp_sock *tp = tcp_sk(sk);
1352
1353     /* Account D-SACK for retransmitted packet. */
1354     if (dup_sack && (sacked & TCPCB_RETRANS)) {
1355         if (tp->undo_marker && tp->undo_retrans > 0 &&
1356             after(end_seq, tp->undo_marker))
1357             tp->undo_retrans = max_t(int, 0, tp->undo_retrans - pcount);
1358         if ((sacked & TCPCB_SACKED_ACKED) &&
1359             before(start_seq, state->reord))
1360                 state->reord = start_seq;
1361     }
1362
1363     /* Nothing to do; acked frame is about to be dropped (was ACKed). */
1364     if (!after(end_seq, tp->snd_una))
1365         return sacked;
1366
1367     if (!(sacked & TCPCB_SACKED_ACKED)) {
1368         tcp_rack_advance(tp, sacked, end_seq, xmit_time);
1369
1370         if (sacked & TCPCB_SACKED_RETRANS) {
1371             /* If the segment is not tagged as lost,
1372              * we do not clear RETRANS, believing
1373              * that retransmission is still in flight.
1374              */
1375             if (sacked & TCPCB_LOST) {
1376                 sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS);
1377                 tp->lost_out -= pcount;
1378                 tp->retrans_out -= pcount;
1379             }
1380         } else {
1381             if (!(sacked & TCPCB_RETRANS)) {
1382                 /* New sack for not retransmitted frame,
1383                  * which was in hole. It is reordering.
1384                  */
1385                 if (before(start_seq,
1386                        tcp_highest_sack_seq(tp)) &&
1387                     before(start_seq, state->reord))
1388                     state->reord = start_seq;
1389
1390                 if (!after(end_seq, tp->high_seq))
1391                     state->flag |= FLAG_ORIG_SACK_ACKED;
1392                 if (state->first_sackt == 0)
1393                     state->first_sackt = xmit_time;
1394                 state->last_sackt = xmit_time;
1395             }
1396
1397             if (sacked & TCPCB_LOST) {
1398                 sacked &= ~TCPCB_LOST;
1399                 tp->lost_out -= pcount;
1400             }
1401         }
1402
1403         sacked |= TCPCB_SACKED_ACKED;
1404         state->flag |= FLAG_DATA_SACKED;
1405         tp->sacked_out += pcount;
1406         /* Out-of-order packets delivered */
1407         state->sack_delivered += pcount;
1408
1409         /* Lost marker hint past SACKed? Tweak RFC3517 cnt */
1410         if (tp->lost_skb_hint &&
1411             before(start_seq, TCP_SKB_CB(tp->lost_skb_hint)->seq))
1412             tp->lost_cnt_hint += pcount;
1413     }
1414
1415     /* D-SACK. We can detect redundant retransmission in S|R and plain R
1416      * frames and clear it. undo_retrans is decreased above, L|R frames
1417      * are accounted above as well.
1418      */
1419     if (dup_sack && (sacked & TCPCB_SACKED_RETRANS)) {
1420         sacked &= ~TCPCB_SACKED_RETRANS;
1421         tp->retrans_out -= pcount;
1422     }
1423
1424     return sacked;
1425 }
1426
1427 /* Shift newly-SACKed bytes from this skb to the immediately previous
1428  * already-SACKed sk_buff. Mark the newly-SACKed bytes as such.
1429  */
1430 static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev,
1431                 struct sk_buff *skb,
1432                 struct tcp_sacktag_state *state,
1433                 unsigned int pcount, int shifted, int mss,
1434                 bool dup_sack)
1435 {
1436     struct tcp_sock *tp = tcp_sk(sk);
1437     u32 start_seq = TCP_SKB_CB(skb)->seq;   /* start of newly-SACKed */
1438     u32 end_seq = start_seq + shifted;  /* end of newly-SACKed */
1439
1440     BUG_ON(!pcount);
1441
1442     /* Adjust counters and hints for the newly sacked sequence
1443      * range but discard the return value since prev is already
1444      * marked. We must tag the range first because the seq
1445      * advancement below implicitly advances
1446      * tcp_highest_sack_seq() when skb is highest_sack.
1447      */
1448     tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked,
1449             start_seq, end_seq, dup_sack, pcount,
1450             tcp_skb_timestamp_us(skb));
1451     tcp_rate_skb_delivered(sk, skb, state->rate);
1452
1453     if (skb == tp->lost_skb_hint)
1454         tp->lost_cnt_hint += pcount;
1455
1456     TCP_SKB_CB(prev)->end_seq += shifted;
1457     TCP_SKB_CB(skb)->seq += shifted;
1458
1459     tcp_skb_pcount_add(prev, pcount);
1460     WARN_ON_ONCE(tcp_skb_pcount(skb) < pcount);
1461     tcp_skb_pcount_add(skb, -pcount);
1462
1463     /* When we're adding to gso_segs == 1, gso_size will be zero,
1464      * in theory this shouldn't be necessary but as long as DSACK
1465      * code can come after this skb later on it's better to keep
1466      * setting gso_size to something.
1467      */
1468     if (!TCP_SKB_CB(prev)->tcp_gso_size)
1469         TCP_SKB_CB(prev)->tcp_gso_size = mss;
1470
1471     /* CHECKME: To clear or not to clear? Mimics normal skb currently */
1472     if (tcp_skb_pcount(skb) <= 1)
1473         TCP_SKB_CB(skb)->tcp_gso_size = 0;
1474
1475     /* Difference in this won't matter, both ACKed by the same cumul. ACK */
1476     TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS);
1477
1478     if (skb->len > 0) {
1479         BUG_ON(!tcp_skb_pcount(skb));
1480         NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKSHIFTED);
1481         return false;
1482     }
1483
1484     /* Whole SKB was eaten :-) */
1485
1486     if (skb == tp->retransmit_skb_hint)
1487         tp->retransmit_skb_hint = prev;
1488     if (skb == tp->lost_skb_hint) {
1489         tp->lost_skb_hint = prev;
1490         tp->lost_cnt_hint -= tcp_skb_pcount(prev);
1491     }
1492
1493     TCP_SKB_CB(prev)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1494     TCP_SKB_CB(prev)->eor = TCP_SKB_CB(skb)->eor;
1495     if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
1496         TCP_SKB_CB(prev)->end_seq++;
1497
1498     if (skb == tcp_highest_sack(sk))
1499         tcp_advance_highest_sack(sk, skb);
1500
1501     tcp_skb_collapse_tstamp(prev, skb);
1502     if (unlikely(TCP_SKB_CB(prev)->tx.delivered_mstamp))
1503         TCP_SKB_CB(prev)->tx.delivered_mstamp = 0;
1504
1505     tcp_rtx_queue_unlink_and_free(skb, sk);
1506
1507     NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKMERGED);
1508
1509     return true;
1510 }
1511
1512 /* I wish gso_size would have a bit more sane initialization than
1513  * something-or-zero which complicates things
1514  */
1515 static int tcp_skb_seglen(const struct sk_buff *skb)
1516 {
1517     return tcp_skb_pcount(skb) == 1 ? skb->len : tcp_skb_mss(skb);
1518 }
1519
/* Shifting pages past head area doesn't work: only an skb with an empty
 * head area that is nonlinear qualifies. Returns 1 if so, 0 otherwise.
 */
static int skb_can_shift(const struct sk_buff *skb)
{
    if (skb_headlen(skb))
        return 0;

    return skb_is_nonlinear(skb) ? 1 : 0;
}
1525
1526 int tcp_skb_shift(struct sk_buff *to, struct sk_buff *from,
1527           int pcount, int shiftlen)
1528 {
1529     /* TCP min gso_size is 8 bytes (TCP_MIN_GSO_SIZE)
1530      * Since TCP_SKB_CB(skb)->tcp_gso_segs is 16 bits, we need
1531      * to make sure not storing more than 65535 * 8 bytes per skb,
1532      * even if current MSS is bigger.
1533      */
1534     if (unlikely(to->len + shiftlen >= 65535 * TCP_MIN_GSO_SIZE))
1535         return 0;
1536     if (unlikely(tcp_skb_pcount(to) + pcount > 65535))
1537         return 0;
1538     return skb_shift(to, from, shiftlen);
1539 }
1540
1541 /* Try collapsing SACK blocks spanning across multiple skbs to a single
1542  * skb.
      *
      * Attempts to move the part of @skb covered by [@start_seq, @end_seq)
      * into the preceding skb of the retransmit rbtree, provided that
      * predecessor is already tagged SACKed and the two may be collapsed.
      *
      * Return value, as consumed by tcp_sacktag_walk():
      *   - the previous skb (label "out"): data was shifted (or shifting was
      *     started); the walk resumes from there because @skb, and possibly
      *     its successor, may have been merged away;
      *   - @skb itself (label "noop"): nothing in this skb is to be shifted
      *     for this block; the caller then does not tag it either;
      *   - NULL (label "fallback"): shifting is not possible; the caller
      *     falls back to tcp_match_skb_to_sack(). This path increments
      *     LINUX_MIB_SACKSHIFTFALLBACK.
1543  */
1544 static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
1545                       struct tcp_sacktag_state *state,
1546                       u32 start_seq, u32 end_seq,
1547                       bool dup_sack)
1548 {
1549     struct tcp_sock *tp = tcp_sk(sk);
1550     struct sk_buff *prev;
1551     int mss;
1552     int pcount = 0;
1553     int len;
1554     int in_sack;
1555
1556     /* Normally R but no L won't result in plain S */
1557     if (!dup_sack &&
1558         (TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_RETRANS)) == TCPCB_SACKED_RETRANS)
1559         goto fallback;
1560     if (!skb_can_shift(skb))
1561         goto fallback;
1562     /* This frame is about to be dropped (was ACKed). */
1563     if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
1564         goto fallback;
1565
1566     /* Can only happen with delayed DSACK + discard craziness */
1567     prev = skb_rb_prev(skb);
1568     if (!prev)
1569         goto fallback;
1570
         /* The merge target must carry exactly the SACKED_ACKED tag. */
1571     if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED)
1572         goto fallback;
1573
1574     if (!tcp_skb_can_collapse(prev, skb))
1575         goto fallback;
1576
         /* Does the SACK block cover the whole skb? */
1577     in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
1578           !before(end_seq, TCP_SKB_CB(skb)->end_seq);
1579
1580     if (in_sack) {
1581         len = skb->len;
1582         pcount = tcp_skb_pcount(skb);
1583         mss = tcp_skb_seglen(skb);
1584
1585         /* TODO: Fix DSACKs to not fragment already SACKed and we can
1586          * drop this restriction as unnecessary
1587          */
1588         if (mss != tcp_skb_seglen(prev))
1589             goto fallback;
1590     } else {
             /* skb ends at or before the block starts: nothing to do. */
1591         if (!after(TCP_SKB_CB(skb)->end_seq, start_seq))
1592             goto noop;
1593         /* CHECKME: This is non-MSS split case only?, this will
1594          * cause skipped skbs due to advancing loop btw, original
1595          * has that feature too
1596          */
1597         if (tcp_skb_pcount(skb) <= 1)
1598             goto noop;
1599
1600         in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq);
1601         if (!in_sack) {
1602             /* TODO: head merge to next could be attempted here
1603              * if (!after(TCP_SKB_CB(skb)->end_seq, end_seq)),
1604              * though it might not be worth of the additional hassle
1605              *
1606              * ...we can probably just fallback to what was done
1607              * previously. We could try merging non-SACKed ones
1608              * as well but it probably isn't going to buy off
1609              * because later SACKs might again split them, and
1610              * it would make skb timestamp tracking considerably
1611              * harder problem.
1612              */
1613             goto fallback;
1614         }
1615
             /* Only the head of the skb, up to end_seq, is covered. */
1616         len = end_seq - TCP_SKB_CB(skb)->seq;
1617         BUG_ON(len < 0);
1618         BUG_ON(len > skb->len);
1619
1620         /* MSS boundaries should be honoured or else pcount will
1621          * severely break even though it makes things bit trickier.
1622          * Optimize common case to avoid most of the divides
1623          */
1624         mss = tcp_skb_mss(skb);
1625
1626         /* TODO: Fix DSACKs to not fragment already SACKed and we can
1627          * drop this restriction as unnecessary
1628          */
1629         if (mss != tcp_skb_seglen(prev))
1630             goto fallback;
1631
1632         if (len == mss) {
1633             pcount = 1;
1634         } else if (len < mss) {
1635             goto noop;
1636         } else {
                 /* Round down to a whole number of MSS-sized segments. */
1637             pcount = len / mss;
1638             len = pcount * mss;
1639         }
1640     }
1641
1642     /* tcp_sacktag_one() won't SACK-tag ranges below snd_una */
1643     if (!after(TCP_SKB_CB(skb)->seq + len, tp->snd_una))
1644         goto fallback;
1645
1646     if (!tcp_skb_shift(prev, skb, pcount, len))
1647         goto fallback;
1648     if (!tcp_shifted_skb(sk, prev, skb, state, pcount, len, mss, dup_sack))
1649         goto out;
1650
1651     /* Hole filled allows collapsing with the next as well, this is very
1652      * useful when hole on every nth skb pattern happens
1653      */
1654     skb = skb_rb_next(prev);
1655     if (!skb)
1656         goto out;
1657
1658     if (!skb_can_shift(skb) ||
1659         ((TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) ||
1660         (mss != tcp_skb_seglen(skb)))
1661         goto out;
1662
1663     if (!tcp_skb_can_collapse(prev, skb))
1664         goto out;
1665     len = skb->len;
1666     pcount = tcp_skb_pcount(skb);
1667     if (tcp_skb_shift(prev, skb, pcount, len))
1668         tcp_shifted_skb(sk, prev, skb, state, pcount,
1669                 len, mss, 0);
1670
1671 out:
1672     return prev;
1673
1674 noop:
1675     return skb;
1676
1677 fallback:
1678     NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKSHIFTFALLBACK);
1679     return NULL;
1680 }
1681
     /* Walk the retransmit queue rbtree starting at @skb and SACK-tag the skbs
      * that begin before @end_seq against the block [@start_seq, @end_seq).
      *
      * @next_dup, when non-NULL, is a DSACK block that is matched first against
      * every skb starting before its end; a positive match makes that skb's
      * tagging a duplicate SACK. @dup_sack_in is the initial duplicate-SACK
      * setting applied to each skb. When the @next_dup match is not positive,
      * tcp_shift_skb_data() is tried before falling back to
      * tcp_match_skb_to_sack() on the main block.
      *
      * Returns the skb at which the walk stopped (what that is when the tree
      * is exhausted depends on skb_rbtree_walk_from(), not visible here).
      */
1682 static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
1683                     struct tcp_sack_block *next_dup,
1684                     struct tcp_sacktag_state *state,
1685                     u32 start_seq, u32 end_seq,
1686                     bool dup_sack_in)
1687 {
1688     struct tcp_sock *tp = tcp_sk(sk);
1689     struct sk_buff *tmp;
1690
1691     skb_rbtree_walk_from(skb) {
1692         int in_sack = 0;
1693         bool dup_sack = dup_sack_in;
1694
1695         /* queue is in-order => we can short-circuit the walk early */
1696         if (!before(TCP_SKB_CB(skb)->seq, end_seq))
1697             break;
1698
1699         if (next_dup  &&
1700             before(TCP_SKB_CB(skb)->seq, next_dup->end_seq)) {
1701             in_sack = tcp_match_skb_to_sack(sk, skb,
1702                             next_dup->start_seq,
1703                             next_dup->end_seq);
1704             if (in_sack > 0)
1705                 dup_sack = true;
1706         }
1707
1708         /* skb reference here is a bit tricky to get right, since
1709          * shifting can eat and free both this skb and the next,
1710          * so not even _safe variant of the loop is enough.
1711          */
1712         if (in_sack <= 0) {
1713             tmp = tcp_shift_skb_data(sk, skb, state,
1714                          start_seq, end_seq, dup_sack);
1715             if (tmp) {
                     /* Data was shifted: resume from the returned skb. */
1716                 if (tmp != skb) {
1717                     skb = tmp;
1718                     continue;
1719                 }
1720
                     /* "noop" result: leave this skb untagged. */
1721                 in_sack = 0;
1722             } else {
1723                 in_sack = tcp_match_skb_to_sack(sk, skb,
1724                                 start_seq,
1725                                 end_seq);
1726             }
1727         }
1728
             /* A negative match result aborts the whole walk. */
1729         if (unlikely(in_sack < 0))
1730             break;
1731
1732         if (in_sack) {
1733             TCP_SKB_CB(skb)->sacked =
1734                 tcp_sacktag_one(sk,
1735                         state,
1736                         TCP_SKB_CB(skb)->sacked,
1737                         TCP_SKB_CB(skb)->seq,
1738                         TCP_SKB_CB(skb)->end_seq,
1739                         dup_sack,
1740                         tcp_skb_pcount(skb),
1741                         tcp_skb_timestamp_us(skb));
1742             tcp_rate_skb_delivered(sk, skb, state->rate);
1743             if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
1744                 list_del_init(&skb->tcp_tsorted_anchor);
1745
1746             if (!before(TCP_SKB_CB(skb)->seq,
1747                     tcp_highest_sack_seq(tp)))
1748                 tcp_advance_highest_sack(sk, skb);
1749         }
1750     }
1751     return skb;
1752 }
1753
     /* Binary-search sk->tcp_rtx_queue for the skb whose sequence range
      * [seq, end_seq) contains @seq.
      *
      * Returns that skb, or NULL when no skb in the tree covers @seq.
      */
1754 static struct sk_buff *tcp_sacktag_bsearch(struct sock *sk, u32 seq)
1755 {
1756     struct rb_node *parent, **p = &sk->tcp_rtx_queue.rb_node;
1757     struct sk_buff *skb;
1758
1759     while (*p) {
1760         parent = *p;
1761         skb = rb_to_skb(parent);
             /* @seq lies below this skb: descend left. */
1762         if (before(seq, TCP_SKB_CB(skb)->seq)) {
1763             p = &parent->rb_left;
1764             continue;
1765         }
             /* @seq lies at or beyond this skb's end: descend right. */
1766         if (!before(seq, TCP_SKB_CB(skb)->end_seq)) {
1767             p = &parent->rb_right;
1768             continue;
1769         }
1770         return skb;
1771     }
1772     return NULL;
1773 }
1774
     /* Position the walk at @skip_to_seq.
      *
      * If @skb is non-NULL and already starts after @skip_to_seq it is returned
      * unchanged; otherwise the retransmit queue is searched for the skb that
      * covers @skip_to_seq (NULL if there is none).
      */
1775 static struct sk_buff *tcp_sacktag_skip(struct sk_buff *skb, struct sock *sk,
1776                     u32 skip_to_seq)
1777 {
1778     if (skb && after(TCP_SKB_CB(skb)->seq, skip_to_seq))
1779         return skb;
1780
1781     return tcp_sacktag_bsearch(sk, skip_to_seq);
1782 }
1783
     /* Process the DSACK block @next_dup on its own when the caller is about to
      * skip ahead to @skip_to_seq and the DSACK block starts before that point.
      *
      * Does nothing and returns @skb when @next_dup is NULL or starts at or
      * after @skip_to_seq. Otherwise the queue is walked over
      * [next_dup->start_seq, next_dup->end_seq) with the duplicate-SACK flag
      * set, and the skb where that walk stopped is returned.
      */
1784 static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb,
1785                         struct sock *sk,
1786                         struct tcp_sack_block *next_dup,
1787                         struct tcp_sacktag_state *state,
1788                         u32 skip_to_seq)
1789 {
1790     if (!next_dup)
1791         return skb;
1792
1793     if (before(next_dup->start_seq, skip_to_seq)) {
1794         skb = tcp_sacktag_skip(skb, sk, next_dup->start_seq);
1795         skb = tcp_sacktag_walk(skb, sk, NULL, state,
1796                        next_dup->start_seq, next_dup->end_seq,
1797                        1);
1798     }
1799
1800     return skb;
1801 }
1802
     /* Returns non-zero while @cache still points below the one-past-the-end
      * address of tp->recv_sack_cache, i.e. it may still be dereferenced.
      */
1803 static int tcp_sack_cache_ok(const struct tcp_sock *tp, const struct tcp_sack_block *cache)
1804 {
1805     return cache < tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache);
1806 }
1807
     /* Process the SACK option carried by @ack_skb and tag the retransmit queue
      * accordingly.
      *
      * Steps, in order: run DSACK detection (tcp_check_dsack()), discard ACKs
      * older than prior_snd_una - max_window, copy and validate up to
      * TCP_NUM_SACKS blocks from the wire, sort them by start_seq, walk the
      * queue for each block (consulting tp->recv_sack_cache to skip ranges the
      * code treats as already processed), then store the blocks in
      * recv_sack_cache and check for reordering.
      *
      * @prior_snd_una: used to reject too-old ACKs and to ignore blocks that
      *                 end at or below it.
      * @state:         result accumulator; ->flag, ->reord and ->mss_now are
      *                 written here.
      *
      * Returns state->flag, or 0 when the ACK is rejected as too old.
      */
1808 static int
1809 tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
1810             u32 prior_snd_una, struct tcp_sacktag_state *state)
1811 {
1812     struct tcp_sock *tp = tcp_sk(sk);
         /* TCP_SKB_CB(ack_skb)->sacked is used here as the byte offset of the
          * SACK option from the transport header; the blocks start 2 bytes in.
          */
1813     const unsigned char *ptr = (skb_transport_header(ack_skb) +
1814                     TCP_SKB_CB(ack_skb)->sacked);
1815     struct tcp_sack_block_wire *sp_wire = (struct tcp_sack_block_wire *)(ptr+2);
1816     struct tcp_sack_block sp[TCP_NUM_SACKS];
1817     struct tcp_sack_block *cache;
1818     struct sk_buff *skb;
         /* ptr[1] is presumably the option length; >> 3 divides the remaining
          * bytes by the 8 bytes each block occupies.
          */
1819     int num_sacks = min(TCP_NUM_SACKS, (ptr[1] - TCPOLEN_SACK_BASE) >> 3);
1820     int used_sacks;
1821     bool found_dup_sack = false;
1822     int i, j;
1823     int first_sack_index;
1824
1825     state->flag = 0;
1826     state->reord = tp->snd_nxt;
1827
1828     if (!tp->sacked_out)
1829         tcp_highest_sack_reset(sk);
1830
1831     found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire,
1832                      num_sacks, prior_snd_una, state);
1833
1834     /* Eliminate too old ACKs, but take into
1835      * account more or less fresh ones, they can
1836      * contain valid SACK info.
1837      */
1838     if (before(TCP_SKB_CB(ack_skb)->ack_seq, prior_snd_una - tp->max_window))
1839         return 0;
1840
1841     if (!tp->packets_out)
1842         goto out;
1843
         /* Copy the wire blocks into sp[], keeping only the valid ones.
          * A rejected block leaves used_sacks unchanged, so its slot is
          * overwritten by the next candidate.
          */
1844     used_sacks = 0;
1845     first_sack_index = 0;
1846     for (i = 0; i < num_sacks; i++) {
1847         bool dup_sack = !i && found_dup_sack;
1848
1849         sp[used_sacks].start_seq = get_unaligned_be32(&sp_wire[i].start_seq);
1850         sp[used_sacks].end_seq = get_unaligned_be32(&sp_wire[i].end_seq);
1851
1852         if (!tcp_is_sackblock_valid(tp, dup_sack,
1853                         sp[used_sacks].start_seq,
1854                         sp[used_sacks].end_seq)) {
1855             int mib_idx;
1856
1857             if (dup_sack) {
1858                 if (!tp->undo_marker)
1859                     mib_idx = LINUX_MIB_TCPDSACKIGNOREDNOUNDO;
1860                 else
1861                     mib_idx = LINUX_MIB_TCPDSACKIGNOREDOLD;
1862             } else {
1863                 /* Don't count olds caused by ACK reordering */
1864                 if ((TCP_SKB_CB(ack_skb)->ack_seq != tp->snd_una) &&
1865                     !after(sp[used_sacks].end_seq, tp->snd_una))
1866                     continue;
1867                 mib_idx = LINUX_MIB_TCPSACKDISCARD;
1868             }
1869
1870             NET_INC_STATS(sock_net(sk), mib_idx);
                 /* First wire block dropped: no slot in sp[] maps to it. */
1871             if (i == 0)
1872                 first_sack_index = -1;
1873             continue;
1874         }
1875
1876         /* Ignore very old stuff early */
1877         if (!after(sp[used_sacks].end_seq, prior_snd_una)) {
1878             if (i == 0)
1879                 first_sack_index = -1;
1880             continue;
1881         }
1882
1883         used_sacks++;
1884     }
1885
1886     /* order SACK blocks to allow in order walk of the retrans queue */
1887     for (i = used_sacks - 1; i > 0; i--) {
1888         for (j = 0; j < i; j++) {
1889             if (after(sp[j].start_seq, sp[j + 1].start_seq)) {
1890                 swap(sp[j], sp[j + 1]);
1891
1892                 /* Track where the first SACK block goes to */
1893                 if (j == first_sack_index)
1894                     first_sack_index = j + 1;
1895             }
1896         }
1897     }
1898
1899     state->mss_now = tcp_current_mss(sk);
1900     skb = NULL;
1901     i = 0;
1902
1903     if (!tp->sacked_out) {
1904         /* It's already past, so skip checking against it */
1905         cache = tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache);
1906     } else {
1907         cache = tp->recv_sack_cache;
1908         /* Skip empty blocks in at head of the cache */
1909         while (tcp_sack_cache_ok(tp, cache) && !cache->start_seq &&
1910                !cache->end_seq)
1911             cache++;
1912     }
1913
         /* Note: i only advances at advance_sp; the "continue" in the cache
          * branch re-examines the same block against the next cache entry.
          */
1914     while (i < used_sacks) {
1915         u32 start_seq = sp[i].start_seq;
1916         u32 end_seq = sp[i].end_seq;
1917         bool dup_sack = (found_dup_sack && (i == first_sack_index));
1918         struct tcp_sack_block *next_dup = NULL;
1919
1920         if (found_dup_sack && ((i + 1) == first_sack_index))
1921             next_dup = &sp[i + 1];
1922
1923         /* Skip too early cached blocks */
1924         while (tcp_sack_cache_ok(tp, cache) &&
1925                !before(start_seq, cache->end_seq))
1926             cache++;
1927
1928         /* Can skip some work by looking recv_sack_cache? */
1929         if (tcp_sack_cache_ok(tp, cache) && !dup_sack &&
1930             after(end_seq, cache->start_seq)) {
1931
1932             /* Head todo? */
1933             if (before(start_seq, cache->start_seq)) {
1934                 skb = tcp_sacktag_skip(skb, sk, start_seq);
1935                 skb = tcp_sacktag_walk(skb, sk, next_dup,
1936                                state,
1937                                start_seq,
1938                                cache->start_seq,
1939                                dup_sack);
1940             }
1941
1942             /* Rest of the block already fully processed? */
1943             if (!after(end_seq, cache->end_seq))
1944                 goto advance_sp;
1945
1946             skb = tcp_maybe_skipping_dsack(skb, sk, next_dup,
1947                                state,
1948                                cache->end_seq);
1949
1950             /* ...tail remains todo... */
1951             if (tcp_highest_sack_seq(tp) == cache->end_seq) {
1952                 /* ...but better entrypoint exists! */
1953                 skb = tcp_highest_sack(sk);
1954                 if (!skb)
1955                     break;
1956                 cache++;
1957                 goto walk;
1958             }
1959
1960             skb = tcp_sacktag_skip(skb, sk, cache->end_seq);
1961             /* Check overlap against next cached too (past this one already) */
1962             cache++;
1963             continue;
1964         }
1965
1966         if (!before(start_seq, tcp_highest_sack_seq(tp))) {
1967             skb = tcp_highest_sack(sk);
1968             if (!skb)
1969                 break;
1970         }
1971         skb = tcp_sacktag_skip(skb, sk, start_seq);
1972
1973 walk:
1974         skb = tcp_sacktag_walk(skb, sk, next_dup, state,
1975                        start_seq, end_seq, dup_sack);
1976
1977 advance_sp:
1978         i++;
1979     }
1980
1981     /* Clear the head of the cache sack blocks so we can skip it next time */
1982     for (i = 0; i < ARRAY_SIZE(tp->recv_sack_cache) - used_sacks; i++) {
1983         tp->recv_sack_cache[i].start_seq = 0;
1984         tp->recv_sack_cache[i].end_seq = 0;
1985     }
         /* The sorted blocks fill the tail of the cache; i continues from above. */
1986     for (j = 0; j < used_sacks; j++)
1987         tp->recv_sack_cache[i++] = sp[j];
1988
1989     if (inet_csk(sk)->icsk_ca_state != TCP_CA_Loss || tp->undo_marker)
1990         tcp_check_sack_reordering(sk, state->reord, 0);
1991
1992     tcp_verify_left_out(tp);
1993 out:
1994
1995 #if FASTRETRANS_DEBUG > 0
1996     WARN_ON((int)tp->sacked_out < 0);
1997     WARN_ON((int)tp->lost_out < 0);
1998     WARN_ON((int)tp->retrans_out < 0);
1999     WARN_ON((int)tcp_packets_in_flight(tp) < 0);
2000 #endif
2001     return state->flag;
2002 }
2003
2004 /* Limits sacked_out so that sum with lost_out isn't ever larger than
2005  * packets_out. Returns false if sacked_out adjustment wasn't necessary.
      *
      * The number of holes is lost_out, raised to at least 1 and capped at
      * packets_out; when sacked_out + holes exceeds packets_out, sacked_out is
      * reduced to packets_out - holes and true is returned.
2006  */
2007 static bool tcp_limit_reno_sacked(struct tcp_sock *tp)
2008 {
2009     u32 holes;
2010
2011     holes = max(tp->lost_out, 1U);
2012     holes = min(holes, tp->packets_out);
2013
2014     if ((tp->sacked_out + holes) > tp->packets_out) {
2015         tp->sacked_out = tp->packets_out - holes;
2016         return true;
2017     }
2018     return false;
2019 }
2020
2021 /* If we receive more dupacks than we expected counting segments
2022  * in assumption of absent reordering, interpret this as reordering.
2023  * The only other reason could be a bug in the receiver's TCP.
      *
      * Acts only when tcp_limit_reno_sacked() had to clamp sacked_out: then
      * tp->reordering becomes min(packets_out + @addend,
      * sysctl_tcp_max_reordering), reord_seen is incremented and
      * LINUX_MIB_TCPRENOREORDER is counted.
2024  */
2025 static void tcp_check_reno_reordering(struct sock *sk, const int addend)
2026 {
2027     struct tcp_sock *tp = tcp_sk(sk);
2028
2029     if (!tcp_limit_reno_sacked(tp))
2030         return;
2031
2032     tp->reordering = min_t(u32, tp->packets_out + addend,
2033                    READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_max_reordering));
2034     tp->reord_seen++;
2035     NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRENOREORDER);
2036 }
2037
2038 /* Emulate SACKs for SACKless connection: account for a new dupack.
      *
      * Adds @num_dupack to sacked_out, lets tcp_check_reno_reordering() clamp
      * the result, and reports the net increase of sacked_out (if positive) to
      * tcp_count_delivered() together with @ece_ack. A zero @num_dupack is a
      * no-op.
      */
2039
2040 static void tcp_add_reno_sack(struct sock *sk, int num_dupack, bool ece_ack)
2041 {
2042     if (num_dupack) {
2043         struct tcp_sock *tp = tcp_sk(sk);
2044         u32 prior_sacked = tp->sacked_out;
2045         s32 delivered;
2046
2047         tp->sacked_out += num_dupack;
2048         tcp_check_reno_reordering(sk, 0);
             /* Measured after clamping, so it can be less than num_dupack. */
2049         delivered = tp->sacked_out - prior_sacked;
2050         if (delivered > 0)
2051             tcp_count_delivered(tp, delivered, ece_ack);
2052         tcp_verify_left_out(tp);
2053     }
2054 }
2055
2056 /* Account for ACK, ACKing some data in Reno Recovery phase.
      *
      * For @acked > 0: counts max(@acked - sacked_out, 1) packets as delivered
      * and lowers sacked_out by @acked - 1, not going below zero. Regardless
      * of @acked, the reordering check (with @acked as addend) and the
      * left_out verification are run afterwards.
      */
2057
2058 static void tcp_remove_reno_sacks(struct sock *sk, int acked, bool ece_ack)
2059 {
2060     struct tcp_sock *tp = tcp_sk(sk);
2061
2062     if (acked > 0) {
2063         /* One ACK acked hole. The rest eat duplicate ACKs. */
2064         tcp_count_delivered(tp, max_t(int, acked - tp->sacked_out, 1),
2065                     ece_ack);
2066         if (acked - 1 >= tp->sacked_out)
2067             tp->sacked_out = 0;
2068         else
2069             tp->sacked_out -= acked - 1;
2070     }
2071     tcp_check_reno_reordering(sk, acked);
2072     tcp_verify_left_out(tp);
2073 }
2074
     /* Forget the emulated (dupack-counted) SACK state: sacked_out = 0. */
2075 static inline void tcp_reset_reno_sack(struct tcp_sock *tp)
2076 {
2077     tp->sacked_out = 0;
2078 }
2079
     /* Reset the retransmission/loss accounting of @tp: retrans_out, lost_out,
      * undo_marker and sacked_out are zeroed and undo_retrans is set to -1.
      */
2080 void tcp_clear_retrans(struct tcp_sock *tp)
2081 {
2082     tp->retrans_out = 0;
2083     tp->lost_out = 0;
2084     tp->undo_marker = 0;
2085     tp->undo_retrans = -1;
2086     tp->sacked_out = 0;
2087 }
2088
     /* Arm the undo state: remember the current snd_una as undo_marker and
      * seed undo_retrans from retrans_out.
      */
2089 static inline void tcp_init_undo(struct tcp_sock *tp)
2090 {
2091     tp->undo_marker = tp->snd_una;
2092     /* Retransmission still in flight may cause DSACKs later. */
         /* GNU "a ? : b": retrans_out if non-zero, otherwise -1. */
2093     tp->undo_retrans = tp->retrans_out ? : -1;
2094 }
2095
     /* Returns true when the TCP_RACK_LOSS_DETECTION bit is set in this
      * socket's network namespace sysctl_tcp_recovery value.
      */
2096 static bool tcp_is_rack(const struct sock *sk)
2097 {
2098     return READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_recovery) &
2099         TCP_RACK_LOSS_DETECTION;
2100 }
2101
2102 /* If we detect SACK reneging, forget all SACK information
2103  * and reset tags completely, otherwise preserve SACKs. If receiver
2104  * dropped its ofo queue, we will know this due to reneging detection.
      *
      * Reneging is assumed when the head of the retransmit queue carries
      * TCPCB_SACKED_ACKED. Every skb in the queue is then passed to
      * tcp_mark_skb_lost(), except that without reneging and with RACK
      * enabled, non-head skbs for which tcp_rack_skb_timeout() is still
      * positive are left alone.
2105  */
2106 static void tcp_timeout_mark_lost(struct sock *sk)
2107 {
2108     struct tcp_sock *tp = tcp_sk(sk);
2109     struct sk_buff *skb, *head;
2110     bool is_reneg;          /* is receiver reneging on SACKs? */
2111
2112     head = tcp_rtx_queue_head(sk);
2113     is_reneg = head && (TCP_SKB_CB(head)->sacked & TCPCB_SACKED_ACKED);
2114     if (is_reneg) {
2115         NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
2116         tp->sacked_out = 0;
2117         /* Mark SACK reneging until we recover from this loss event. */
2118         tp->is_sack_reneg = 1;
2119     } else if (tcp_is_reno(tp)) {
2120         tcp_reset_reno_sack(tp);
2121     }
2122
2123     skb = head;
2124     skb_rbtree_walk_from(skb) {
2125         if (is_reneg)
2126             TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
2127         else if (tcp_is_rack(sk) && skb != head &&
2128              tcp_rack_skb_timeout(tp, skb, 0) > 0)
2129             continue; /* Don't mark recently sent ones lost yet */
2130         tcp_mark_skb_lost(sk, skb);
2131     }
2132     tcp_verify_left_out(tp);
2133     tcp_clear_all_retrans_hints(tp);
2134 }
2135
2136 /* Enter Loss state.
      *
      * Marks the queue lost via tcp_timeout_mark_lost(), reduces ssthresh and
      * arms undo state when that has not yet been done in this window, sets
      * cwnd to packets in flight + 1, possibly lowers tp->reordering, switches
      * the congestion state to TCP_CA_Loss, records high_seq = snd_nxt, queues
      * CWR and decides whether F-RTO is used for this timeout.
      */
2137 void tcp_enter_loss(struct sock *sk)
2138 {
2139     const struct inet_connection_sock *icsk = inet_csk(sk);
2140     struct tcp_sock *tp = tcp_sk(sk);
2141     struct net *net = sock_net(sk);
         /* Sampled before the state is changed to TCP_CA_Loss below. */
2142     bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery;
2143     u8 reordering;
2144
2145     tcp_timeout_mark_lost(sk);
2146
2147     /* Reduce ssthresh if it has not yet been made inside this window. */
2148     if (icsk->icsk_ca_state <= TCP_CA_Disorder ||
2149         !after(tp->high_seq, tp->snd_una) ||
2150         (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) {
2151         tp->prior_ssthresh = tcp_current_ssthresh(sk);
2152         tp->prior_cwnd = tcp_snd_cwnd(tp);
2153         tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
2154         tcp_ca_event(sk, CA_EVENT_LOSS);
2155         tcp_init_undo(tp);
2156     }
2157     tcp_snd_cwnd_set(tp, tcp_packets_in_flight(tp) + 1);
2158     tp->snd_cwnd_cnt   = 0;
2159     tp->snd_cwnd_stamp = tcp_jiffies32;
2160
2161     /* Timeout in disordered state after receiving substantial DUPACKs
2162      * suggests that the degree of reordering is over-estimated.
2163      */
         /* NOTE(review): the sysctl value is stored into a u8 local here;
          * confirm the sysctl's declared type/range makes that lossless.
          */
2164     reordering = READ_ONCE(net->ipv4.sysctl_tcp_reordering);
2165     if (icsk->icsk_ca_state <= TCP_CA_Disorder &&
2166         tp->sacked_out >= reordering)
2167         tp->reordering = min_t(unsigned int, tp->reordering,
2168                        reordering);
2169
2170     tcp_set_ca_state(sk, TCP_CA_Loss);
2171     tp->high_seq = tp->snd_nxt;
2172     tcp_ecn_queue_cwr(tp);
2173
2174     /* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous
2175      * loss recovery is underway except recurring timeout(s) on
2176      * the same SND.UNA (sec 3.2). Disable F-RTO on path MTU probing
2177      */
2178     tp->frto = READ_ONCE(net->ipv4.sysctl_tcp_frto) &&
2179            (new_recovery || icsk->icsk_retransmits) &&
2180            !inet_csk(sk)->icsk_mtup.probe_size;
2181 }
2182
2183 /* If ACK arrived pointing to a remembered SACK, it means that our
2184  * remembered SACKs do not reflect real state of receiver i.e.
2185  * receiver _host_ is heavily congested (or buggy).
2186  *
2187  * To avoid big spurious retransmission bursts due to transient SACK
2188  * scoreboard oddities that look like reneging, we give the receiver a
2189  * little time (max(RTT/2, 10ms)) to send us some more ACKs that will
2190  * restore sanity to the SACK scoreboard. If the apparent reneging
2191  * persists until this RTO then we'll clear the SACK scoreboard.
      *
      * Returns true, after re-arming the ICSK_TIME_RETRANS timer with that
      * delay, when FLAG_SACK_RENEGING is set in @flag; returns false and
      * changes nothing otherwise.
2192  */
2193 static bool tcp_check_sack_reneging(struct sock *sk, int flag)
2194 {
2195     if (flag & FLAG_SACK_RENEGING) {
2196         struct tcp_sock *tp = tcp_sk(sk);
             /* srtt_us >> 4 matches the "RTT/2" above if srtt_us holds the
              * smoothed RTT scaled by 8 -- TODO confirm against its definition.
              */
2197         unsigned long delay = max(usecs_to_jiffies(tp->srtt_us >> 4),
2198                       msecs_to_jiffies(10));
2199
2200         inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
2201                       delay, TCP_RTO_MAX);
2202         return true;
2203     }
2204     return false;
2205 }
2206
2207 /* Heuristics to calculate number of duplicate ACKs. There's no dupACKs
2208  * counter when SACK is enabled (without SACK, sacked_out is used for
2209  * that purpose).
2210  *
2211  * With reordering, holes may still be in flight, so RFC3517 recovery
2212  * uses pure sacked_out (total number of SACKed segments) even though
2213  * it violates the RFC that uses duplicate ACKs, often these are equal
2214  * but when e.g. out-of-window ACKs or packet duplication occurs,
2215  * they differ. Since neither occurs due to loss, TCP should really
2216  * ignore them.
      *
      * Returns sacked_out + 1.
2217  */
2218 static inline int tcp_dupack_heuristics(const struct tcp_sock *tp)
2219 {
2220     return tp->sacked_out + 1;
2221 }
2222
2223 /* Linux NewReno/SACK/ECN state machine.
2224  * --------------------------------------
2225  *
2226  * "Open"   Normal state, no dubious events, fast path.
2227  * "Disorder"   In all the respects it is "Open",
2228  *      but requires a bit more attention. It is entered when
2229  *      we see some SACKs or dupacks. It is split off from "Open"
2230  *      mainly to move some processing from fast path to slow one.
2231  * "CWR"    CWND was reduced due to some Congestion Notification event.
2232  *      It can be ECN, ICMP source quench, local device congestion.
2233  * "Recovery"   CWND was reduced, we are fast-retransmitting.
2234  * "Loss"   CWND was reduced due to RTO timeout or SACK reneging.
2235  *
2236  * tcp_fastretrans_alert() is entered:
2237  * - each incoming ACK, if state is not "Open"
2238  * - when arrived ACK is unusual, namely:
2239  *  * SACK
2240  *  * Duplicate ACK.
2241  *  * ECN ECE.
2242  *
2243  * Counting packets in flight is pretty simple.
2244  *
2245  *  in_flight = packets_out - left_out + retrans_out
2246  *
2247  *  packets_out is SND.NXT-SND.UNA counted in packets.
2248  *
2249  *  retrans_out is number of retransmitted segments.
2250  *
2251  *  left_out is number of segments left network, but not ACKed yet.
2252  *
2253  *      left_out = sacked_out + lost_out
2254  *
2255  *     sacked_out: Packets, which arrived to receiver out of order
2256  *         and hence not ACKed. With SACKs this number is simply
2257  *         amount of SACKed data. Even without SACKs
2258  *         it is easy to give pretty reliable estimate of this number,
2259  *         counting duplicate ACKs.
2260  *
2261  *       lost_out: Packets lost by network. TCP has no explicit
2262  *         "loss notification" feedback from network (for now).
2263  *         It means that this number can be only _guessed_.
2264  *         Actually, it is the heuristics to predict lossage that
2265  *         distinguishes different algorithms.
2266  *
2267  *  F.e. after RTO, when all the queue is considered as lost,
2268  *  lost_out = packets_out and in_flight = retrans_out.
2269  *
2270  *      Essentially, we have now a few algorithms detecting
2271  *      lost packets.
2272  *
2273  *      If the receiver supports SACK:
2274  *
2275  *      RFC6675/3517: It is the conventional algorithm. A packet is
2276  *      considered lost if the number of higher sequence packets
2277  *      SACKed is greater than or equal to the DUPACK threshold
2278  *      (reordering). This is implemented in tcp_mark_head_lost and
2279  *      tcp_update_scoreboard.
2280  *
2281  *      RACK (draft-ietf-tcpm-rack-01): it is a newer algorithm
2282  *      (2017-) that checks timing instead of counting DUPACKs.
2283  *      Essentially a packet is considered lost if it's not S/ACKed
2284  *      after RTT + reordering_window, where both metrics are
2285  *      dynamically measured and adjusted. This is implemented in
2286  *      tcp_rack_mark_lost.
2287  *
2288  *      If the receiver does not support SACK:
2289  *
2290  *      NewReno (RFC6582): in Recovery we assume that one segment
2291  *      is lost (classic Reno). While we are in Recovery and
2292  *      a partial ACK arrives, we assume that one more packet
2293  *      is lost (NewReno). These heuristics are the same in NewReno
2294  *      and SACK.
2295  *
2296  * Really tricky (and requiring careful tuning) part of algorithm
2297  * is hidden in functions tcp_time_to_recover() and tcp_xmit_retransmit_queue().
2298  * The first determines the moment _when_ we should reduce CWND and,
2299  * hence, slow down forward transmission. In fact, it determines the moment
2300  * when we decide that hole is caused by loss, rather than by a reorder.
2301  *
2302  * tcp_xmit_retransmit_queue() decides, _what_ we should retransmit to fill
2303  * holes, caused by lost packets.
2304  *
2305  * And the most logically complicated part of algorithm is undo
2306  * heuristics. We detect false retransmits due to both too early
2307  * fast retransmit (reordering) and underestimated RTO, analyzing
2308  * timestamps and D-SACKs. When we detect that some segments were
2309  * retransmitted by mistake and CWND reduction was wrong, we undo
2310  * window reduction and abort recovery phase. This logic is hidden
2311  * inside several functions named tcp_try_undo_<something>.
2312  */
2313
2314 /* This function decides, when we should leave Disordered state
2315  * and enter Recovery phase, reducing congestion window.
2316  *
2317  * Main question: may we further continue forward transmission
2318  * with the same cwnd?
      *
      * Returns true when lost_out is non-zero, or when RACK is disabled and
      * tcp_dupack_heuristics() exceeds tp->reordering; false otherwise.
      * @flag is not used by this body.
2319  */
2320 static bool tcp_time_to_recover(struct sock *sk, int flag)
2321 {
2322     struct tcp_sock *tp = tcp_sk(sk);
2323
2324     /* Trick#1: The loss is proven. */
2325     if (tp->lost_out)
2326         return true;
2327
2328     /* Not-A-Trick#2 : Classic rule... */
2329     if (!tcp_is_rack(sk) && tcp_dupack_heuristics(tp) > tp->reordering)
2330         return true;
2331
2332     return false;
2333 }
2334
2335 /* Detect loss in event "A" above by marking head of queue up as lost.
2336  * For RFC3517 SACK, a segment is considered lost if it
2337  * has at least tp->reordering SACKed segments above it; "packets" refers to
2338  * the maximum SACKed segments to pass before reaching this limit.
      *
      * The walk starts at tp->lost_skb_hint (with tp->lost_cnt_hint as the
      * running count) when that hint is set, otherwise at the queue head, and
      * updates both hints at every step. It stops at the first skb that ends
      * beyond snd_nxt or once more than @packets SACKed segments were passed.
      *
      * @mark_head: when non-zero, at most one skb is processed; if the hint
      *             already points past snd_una the function returns at once.
2339  */
2340 static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
2341 {
2342     struct tcp_sock *tp = tcp_sk(sk);
2343     struct sk_buff *skb;
2344     int cnt;
2345     /* Use SACK to deduce losses of new sequences sent during recovery */
2346     const u32 loss_high = tp->snd_nxt;
2347
2348     WARN_ON(packets > tp->packets_out);
2349     skb = tp->lost_skb_hint;
2350     if (skb) {
2351         /* Head already handled? */
2352         if (mark_head && after(TCP_SKB_CB(skb)->seq, tp->snd_una))
2353             return;
2354         cnt = tp->lost_cnt_hint;
2355     } else {
2356         skb = tcp_rtx_queue_head(sk);
2357         cnt = 0;
2358     }
2359
2360     skb_rbtree_walk_from(skb) {
2361         /* TODO: do this better */
2362         /* this is not the most efficient way to do this... */
2363         tp->lost_skb_hint = skb;
2364         tp->lost_cnt_hint = cnt;
2365
2366         if (after(TCP_SKB_CB(skb)->end_seq, loss_high))
2367             break;
2368
2369         if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
2370             cnt += tcp_skb_pcount(skb);
2371
2372         if (cnt > packets)
2373             break;
2374
2375         if (!(TCP_SKB_CB(skb)->sacked & TCPCB_LOST))
2376             tcp_mark_skb_lost(sk, skb);
2377
2378         if (mark_head)
2379             break;
2380     }
2381     tcp_verify_left_out(tp);
2382 }
2383
2384 /* Account newly detected lost packet(s)
      *
      * Only acts on SACK connections. With sacked_upto = sacked_out -
      * reordering: if it is non-negative, the head of the queue is marked lost
      * allowing that many SACKed segments to be passed; otherwise, when
      * @fast_rexmit is set, only the head skb is marked (packets = 1,
      * mark_head = 1).
      */
2385
2386 static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit)
2387 {
2388     struct tcp_sock *tp = tcp_sk(sk);
2389
2390     if (tcp_is_sack(tp)) {
2391         int sacked_upto = tp->sacked_out - tp->reordering;
2392         if (sacked_upto >= 0)
2393             tcp_mark_head_lost(sk, sacked_upto, 0);
2394         else if (fast_rexmit)
2395             tcp_mark_head_lost(sk, 1, 1);
2396     }
2397 }
2398
     /* Returns true when a timestamp option was seen (rx_opt.saw_tstamp), the
      * echoed value rx_opt.rcv_tsecr is non-zero, and that echo is before
      * @when.
      */
2399 static bool tcp_tsopt_ecr_before(const struct tcp_sock *tp, u32 when)
2400 {
2401     return tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
2402            before(tp->rx_opt.rcv_tsecr, when);
2403 }
2404
2405 /* skb was spuriously retransmitted if the returned timestamp echo
2406  * reply is prior to the skb transmission time
      *
      * Only skbs with a TCPCB_RETRANS bit set in ->sacked can qualify.
2407  */
2408 static bool tcp_skb_spurious_retrans(const struct tcp_sock *tp,
2409                      const struct sk_buff *skb)
2410 {
2411     return (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) &&
2412            tcp_tsopt_ecr_before(tp, tcp_skb_timestamp(skb));
2413 }
2414
2415 /* True when a retransmission timestamp is recorded (retrans_stamp is
2416  * non-zero) and the returned timestamp echo is earlier than it.
      *
      * NOTE(review): with retrans_stamp == 0 ("nothing was retransmitted")
      * this returns false; an earlier wording of this comment suggested the
      * opposite -- confirm the intended behaviour with the callers.
2417  */
2418 static inline bool tcp_packet_delayed(const struct tcp_sock *tp)
2419 {
2420     return tp->retrans_stamp &&
2421            tcp_tsopt_ecr_before(tp, tp->retrans_stamp);
2422 }
2423
2424 /* Undo procedures. */
2425
2426 /* We can clear retrans_stamp when there are no retransmissions in the
2427  * window. It would seem that it is trivially available for us in
2428  * tp->retrans_out, however, that kind of assumptions doesn't consider
2429  * what will happen if errors occur when sending retransmission for the
2430  * second time. ...It could the that such segment has only
2431  * TCPCB_EVER_RETRANS set at the present time. It seems that checking
2432  * the head skb is enough except for some reneging corner cases that
2433  * are not worth the effort.
2434  *
2435  * Main reason for all this complexity is the fact that connection dying
2436  * time now depends on the validity of the retrans_stamp, in particular,
2437  * that successive retransmissions of a segment must not advance
2438  * retrans_stamp under any conditions.
2439  */
2440 static bool tcp_any_retrans_done(const struct sock *sk)
2441 {
2442     const struct tcp_sock *tp = tcp_sk(sk);
2443     struct sk_buff *skb;
2444
2445     if (tp->retrans_out)
2446         return true;
2447
2448     skb = tcp_rtx_queue_head(sk);
2449     if (unlikely(skb && TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS))
2450         return true;
2451
2452     return false;
2453 }
2454
/* Debug trace for undo events. The body is compiled in only when
 * FASTRETRANS_DEBUG > 1; otherwise this is an empty function.
 * Prints the peer address/port, cwnd, left_out, current and prior
 * ssthresh, and packets_out, tagged with @msg.
 */
static void DBGUNDO(struct sock *sk, const char *msg)
{
#if FASTRETRANS_DEBUG > 1
    struct tcp_sock *tp = tcp_sk(sk);
    struct inet_sock *inet = inet_sk(sk);

    switch (sk->sk_family) {
    case AF_INET:
        pr_debug("Undo %s %pI4/%u c%u l%u ss%u/%u p%u\n",
                 msg,
                 &inet->inet_daddr, ntohs(inet->inet_dport),
                 tcp_snd_cwnd(tp), tcp_left_out(tp),
                 tp->snd_ssthresh, tp->prior_ssthresh,
                 tp->packets_out);
        break;
#if IS_ENABLED(CONFIG_IPV6)
    case AF_INET6:
        pr_debug("Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n",
                 msg,
                 &sk->sk_v6_daddr, ntohs(inet->inet_dport),
                 tcp_snd_cwnd(tp), tcp_left_out(tp),
                 tp->snd_ssthresh, tp->prior_ssthresh,
                 tp->packets_out);
        break;
#endif
    default:
        break;
    }
#endif
}
2481
2482 static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss)
2483 {
2484     struct tcp_sock *tp = tcp_sk(sk);
2485
2486     if (unmark_loss) {
2487         struct sk_buff *skb;
2488
2489         skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
2490             TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
2491         }
2492         tp->lost_out = 0;
2493         tcp_clear_all_retrans_hints(tp);
2494     }
2495
2496     if (tp->prior_ssthresh) {
2497         const struct inet_connection_sock *icsk = inet_csk(sk);
2498
2499         tcp_snd_cwnd_set(tp, icsk->icsk_ca_ops->undo_cwnd(sk));
2500
2501         if (tp->prior_ssthresh > tp->snd_ssthresh) {
2502             tp->snd_ssthresh = tp->prior_ssthresh;
2503             tcp_ecn_withdraw_cwr(tp);
2504         }
2505     }
2506     tp->snd_cwnd_stamp = tcp_jiffies32;
2507     tp->undo_marker = 0;
2508     tp->rack.advanced = 1; /* Force RACK to re-exam losses */
2509 }
2510
2511 static inline bool tcp_may_undo(const struct tcp_sock *tp)
2512 {
2513     return tp->undo_marker && (!tp->undo_retrans || tcp_packet_delayed(tp));
2514 }
2515
2516 static bool tcp_is_non_sack_preventing_reopen(struct sock *sk)
2517 {
2518     struct tcp_sock *tp = tcp_sk(sk);
2519
2520     if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) {
2521         /* Hold old state until something *above* high_seq
2522          * is ACKed. For Reno it is MUST to prevent false
2523          * fast retransmits (RFC2582). SACK TCP is safe. */
2524         if (!tcp_any_retrans_done(sk))
2525             tp->retrans_stamp = 0;
2526         return true;
2527     }
2528     return false;
2529 }
2530
2531 /* People celebrate: "We love our President!" */
2532 static bool tcp_try_undo_recovery(struct sock *sk)
2533 {
2534     struct tcp_sock *tp = tcp_sk(sk);
2535
2536     if (tcp_may_undo(tp)) {
2537         int mib_idx;
2538
2539         /* Happy end! We did not retransmit anything
2540          * or our original transmission succeeded.
2541          */
2542         DBGUNDO(sk, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans");
2543         tcp_undo_cwnd_reduction(sk, false);
2544         if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss)
2545             mib_idx = LINUX_MIB_TCPLOSSUNDO;
2546         else
2547             mib_idx = LINUX_MIB_TCPFULLUNDO;
2548
2549         NET_INC_STATS(sock_net(sk), mib_idx);
2550     } else if (tp->rack.reo_wnd_persist) {
2551         tp->rack.reo_wnd_persist--;
2552     }
2553     if (tcp_is_non_sack_preventing_reopen(sk))
2554         return true;
2555     tcp_set_ca_state(sk, TCP_CA_Open);
2556     tp->is_sack_reneg = 0;
2557     return false;
2558 }
2559
2560 /* Try to undo cwnd reduction, because D-SACKs acked all retransmitted data */
2561 static bool tcp_try_undo_dsack(struct sock *sk)
2562 {
2563     struct tcp_sock *tp = tcp_sk(sk);
2564
2565     if (tp->undo_marker && !tp->undo_retrans) {
2566         tp->rack.reo_wnd_persist = min(TCP_RACK_RECOVERY_THRESH,
2567                            tp->rack.reo_wnd_persist + 1);
2568         DBGUNDO(sk, "D-SACK");
2569         tcp_undo_cwnd_reduction(sk, false);
2570         NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKUNDO);
2571         return true;
2572     }
2573     return false;
2574 }
2575
2576 /* Undo during loss recovery after partial ACK or using F-RTO. */
2577 static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo)
2578 {
2579     struct tcp_sock *tp = tcp_sk(sk);
2580
2581     if (frto_undo || tcp_may_undo(tp)) {
2582         tcp_undo_cwnd_reduction(sk, true);
2583
2584         DBGUNDO(sk, "partial loss");
2585         NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSSUNDO);
2586         if (frto_undo)
2587             NET_INC_STATS(sock_net(sk),
2588                     LINUX_MIB_TCPSPURIOUSRTOS);
2589         inet_csk(sk)->icsk_retransmits = 0;
2590         if (tcp_is_non_sack_preventing_reopen(sk))
2591             return true;
2592         if (frto_undo || tcp_is_sack(tp)) {
2593             tcp_set_ca_state(sk, TCP_CA_Open);
2594             tp->is_sack_reneg = 0;
2595         }
2596         return true;
2597     }
2598     return false;
2599 }
2600
2601 /* The cwnd reduction in CWR and Recovery uses the PRR algorithm in RFC 6937.
2602  * It computes the number of packets to send (sndcnt) based on packets newly
2603  * delivered:
2604  *   1) If the packets in flight is larger than ssthresh, PRR spreads the
2605  *  cwnd reductions across a full RTT.
2606  *   2) Otherwise PRR uses packet conservation to send as much as delivered.
2607  *      But when SND_UNA is acked without further losses,
2608  *      slow starts cwnd up to ssthresh to speed up the recovery.
2609  */
2610 static void tcp_init_cwnd_reduction(struct sock *sk)
2611 {
2612     struct tcp_sock *tp = tcp_sk(sk);
2613
2614     tp->high_seq = tp->snd_nxt;
2615     tp->tlp_high_seq = 0;
2616     tp->snd_cwnd_cnt = 0;
2617     tp->prior_cwnd = tcp_snd_cwnd(tp);
2618     tp->prr_delivered = 0;
2619     tp->prr_out = 0;
2620     tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk);
2621     tcp_ecn_queue_cwr(tp);
2622 }
2623
2624 void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int newly_lost, int flag)
2625 {
2626     struct tcp_sock *tp = tcp_sk(sk);
2627     int sndcnt = 0;
2628     int delta = tp->snd_ssthresh - tcp_packets_in_flight(tp);
2629
2630     if (newly_acked_sacked <= 0 || WARN_ON_ONCE(!tp->prior_cwnd))
2631         return;
2632
2633     tp->prr_delivered += newly_acked_sacked;
2634     if (delta < 0) {
2635         u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered +
2636                    tp->prior_cwnd - 1;
2637         sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out;
2638     } else {
2639         sndcnt = max_t(int, tp->prr_delivered - tp->prr_out,
2640                    newly_acked_sacked);
2641         if (flag & FLAG_SND_UNA_ADVANCED && !newly_lost)
2642             sndcnt++;
2643         sndcnt = min(delta, sndcnt);
2644     }
2645     /* Force a fast retransmit upon entering fast recovery */
2646     sndcnt = max(sndcnt, (tp->prr_out ? 0 : 1));
2647     tcp_snd_cwnd_set(tp, tcp_packets_in_flight(tp) + sndcnt);
2648 }
2649
2650 static inline void tcp_end_cwnd_reduction(struct sock *sk)
2651 {
2652     struct tcp_sock *tp = tcp_sk(sk);
2653
2654     if (inet_csk(sk)->icsk_ca_ops->cong_control)
2655         return;
2656
2657     /* Reset cwnd to ssthresh in CWR or Recovery (unless it's undone) */
2658     if (tp->snd_ssthresh < TCP_INFINITE_SSTHRESH &&
2659         (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR || tp->undo_marker)) {
2660         tcp_snd_cwnd_set(tp, tp->snd_ssthresh);
2661         tp->snd_cwnd_stamp = tcp_jiffies32;
2662     }
2663     tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR);
2664 }
2665
2666 /* Enter CWR state. Disable cwnd undo since congestion is proven with ECN */
2667 void tcp_enter_cwr(struct sock *sk)
2668 {
2669     struct tcp_sock *tp = tcp_sk(sk);
2670
2671     tp->prior_ssthresh = 0;
2672     if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
2673         tp->undo_marker = 0;
2674         tcp_init_cwnd_reduction(sk);
2675         tcp_set_ca_state(sk, TCP_CA_CWR);
2676     }
2677 }
2678 EXPORT_SYMBOL(tcp_enter_cwr);
2679
2680 static void tcp_try_keep_open(struct sock *sk)
2681 {
2682     struct tcp_sock *tp = tcp_sk(sk);
2683     int state = TCP_CA_Open;
2684
2685     if (tcp_left_out(tp) || tcp_any_retrans_done(sk))
2686         state = TCP_CA_Disorder;
2687
2688     if (inet_csk(sk)->icsk_ca_state != state) {
2689         tcp_set_ca_state(sk, state);
2690         tp->high_seq = tp->snd_nxt;
2691     }
2692 }
2693
2694 static void tcp_try_to_open(struct sock *sk, int flag)
2695 {
2696     struct tcp_sock *tp = tcp_sk(sk);
2697
2698     tcp_verify_left_out(tp);
2699
2700     if (!tcp_any_retrans_done(sk))
2701         tp->retrans_stamp = 0;
2702
2703     if (flag & FLAG_ECE)
2704         tcp_enter_cwr(sk);
2705
2706     if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) {
2707         tcp_try_keep_open(sk);
2708     }
2709 }
2710
2711 static void tcp_mtup_probe_failed(struct sock *sk)
2712 {
2713     struct inet_connection_sock *icsk = inet_csk(sk);
2714
2715     icsk->icsk_mtup.search_high = icsk->icsk_mtup.probe_size - 1;
2716     icsk->icsk_mtup.probe_size = 0;
2717     NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMTUPFAIL);
2718 }
2719
2720 static void tcp_mtup_probe_success(struct sock *sk)
2721 {
2722     struct tcp_sock *tp = tcp_sk(sk);
2723     struct inet_connection_sock *icsk = inet_csk(sk);
2724     u64 val;
2725
2726     tp->prior_ssthresh = tcp_current_ssthresh(sk);
2727
2728     val = (u64)tcp_snd_cwnd(tp) * tcp_mss_to_mtu(sk, tp->mss_cache);
2729     do_div(val, icsk->icsk_mtup.probe_size);
2730     DEBUG_NET_WARN_ON_ONCE((u32)val != val);
2731     tcp_snd_cwnd_set(tp, max_t(u32, 1U, val));
2732
2733     tp->snd_cwnd_cnt = 0;
2734     tp->snd_cwnd_stamp = tcp_jiffies32;
2735     tp->snd_ssthresh = tcp_current_ssthresh(sk);
2736
2737     icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size;
2738     icsk->icsk_mtup.probe_size = 0;
2739     tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
2740     NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMTUPSUCCESS);
2741 }
2742
2743 /* Do a simple retransmit without using the backoff mechanisms in
2744  * tcp_timer. This is used for path mtu discovery.
2745  * The socket is already locked here.
2746  */
2747 void tcp_simple_retransmit(struct sock *sk)
2748 {
2749     const struct inet_connection_sock *icsk = inet_csk(sk);
2750     struct tcp_sock *tp = tcp_sk(sk);
2751     struct sk_buff *skb;
2752     int mss;
2753
2754     /* A fastopen SYN request is stored as two separate packets within
2755      * the retransmit queue, this is done by tcp_send_syn_data().
2756      * As a result simply checking the MSS of the frames in the queue
2757      * will not work for the SYN packet.
2758      *
2759      * Us being here is an indication of a path MTU issue so we can
2760      * assume that the fastopen SYN was lost and just mark all the
2761      * frames in the retransmit queue as lost. We will use an MSS of
2762      * -1 to mark all frames as lost, otherwise compute the current MSS.
2763      */
2764     if (tp->syn_data && sk->sk_state == TCP_SYN_SENT)
2765         mss = -1;
2766     else
2767         mss = tcp_current_mss(sk);
2768
2769     skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
2770         if (tcp_skb_seglen(skb) > mss)
2771             tcp_mark_skb_lost(sk, skb);
2772     }
2773
2774     tcp_clear_retrans_hints_partial(tp);
2775
2776     if (!tp->lost_out)
2777         return;
2778
2779     if (tcp_is_reno(tp))
2780         tcp_limit_reno_sacked(tp);
2781
2782     tcp_verify_left_out(tp);
2783
2784     /* Don't muck with the congestion window here.
2785      * Reason is that we do not increase amount of _data_
2786      * in network, but units changed and effective
2787      * cwnd/ssthresh really reduced now.
2788      */
2789     if (icsk->icsk_ca_state != TCP_CA_Loss) {
2790         tp->high_seq = tp->snd_nxt;
2791         tp->snd_ssthresh = tcp_current_ssthresh(sk);
2792         tp->prior_ssthresh = 0;
2793         tp->undo_marker = 0;
2794         tcp_set_ca_state(sk, TCP_CA_Loss);
2795     }
2796     tcp_xmit_retransmit_queue(sk);
2797 }
2798 EXPORT_SYMBOL(tcp_simple_retransmit);
2799
2800 void tcp_enter_recovery(struct sock *sk, bool ece_ack)
2801 {
2802     struct tcp_sock *tp = tcp_sk(sk);
2803     int mib_idx;
2804
2805     if (tcp_is_reno(tp))
2806         mib_idx = LINUX_MIB_TCPRENORECOVERY;
2807     else
2808         mib_idx = LINUX_MIB_TCPSACKRECOVERY;
2809
2810     NET_INC_STATS(sock_net(sk), mib_idx);
2811
2812     tp->prior_ssthresh = 0;
2813     tcp_init_undo(tp);
2814
2815     if (!tcp_in_cwnd_reduction(sk)) {
2816         if (!ece_ack)
2817             tp->prior_ssthresh = tcp_current_ssthresh(sk);
2818         tcp_init_cwnd_reduction(sk);
2819     }
2820     tcp_set_ca_state(sk, TCP_CA_Recovery);
2821 }
2822
2823 /* Process an ACK in CA_Loss state. Move to CA_Open if lost data are
2824  * recovered or spurious. Otherwise retransmits more on partial ACKs.
2825  */
2826 static void tcp_process_loss(struct sock *sk, int flag, int num_dupack,
2827                  int *rexmit)
2828 {
2829     struct tcp_sock *tp = tcp_sk(sk);
2830     bool recovered = !before(tp->snd_una, tp->high_seq);
2831
2832     if ((flag & FLAG_SND_UNA_ADVANCED || rcu_access_pointer(tp->fastopen_rsk)) &&
2833         tcp_try_undo_loss(sk, false))
2834         return;
2835
2836     if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */
2837         /* Step 3.b. A timeout is spurious if not all data are
2838          * lost, i.e., never-retransmitted data are (s)acked.
2839          */
2840         if ((flag & FLAG_ORIG_SACK_ACKED) &&
2841             tcp_try_undo_loss(sk, true))
2842             return;
2843
2844         if (after(tp->snd_nxt, tp->high_seq)) {
2845             if (flag & FLAG_DATA_SACKED || num_dupack)
2846                 tp->frto = 0; /* Step 3.a. loss was real */
2847         } else if (flag & FLAG_SND_UNA_ADVANCED && !recovered) {
2848             tp->high_seq = tp->snd_nxt;
2849             /* Step 2.b. Try send new data (but deferred until cwnd
2850              * is updated in tcp_ack()). Otherwise fall back to
2851              * the conventional recovery.
2852              */
2853             if (!tcp_write_queue_empty(sk) &&
2854                 after(tcp_wnd_end(tp), tp->snd_nxt)) {
2855                 *rexmit = REXMIT_NEW;
2856                 return;
2857             }
2858             tp->frto = 0;
2859         }
2860     }
2861
2862     if (recovered) {
2863         /* F-RTO RFC5682 sec 3.1 step 2.a and 1st part of step 3.a */
2864         tcp_try_undo_recovery(sk);
2865         return;
2866     }
2867     if (tcp_is_reno(tp)) {
2868         /* A Reno DUPACK means new data in F-RTO step 2.b above are
2869          * delivered. Lower inflight to clock out (re)tranmissions.
2870          */
2871         if (after(tp->snd_nxt, tp->high_seq) && num_dupack)
2872             tcp_add_reno_sack(sk, num_dupack, flag & FLAG_ECE);
2873         else if (flag & FLAG_SND_UNA_ADVANCED)
2874             tcp_reset_reno_sack(tp);
2875     }
2876     *rexmit = REXMIT_LOST;
2877 }
2878
2879 static bool tcp_force_fast_retransmit(struct sock *sk)
2880 {
2881     struct tcp_sock *tp = tcp_sk(sk);
2882
2883     return after(tcp_highest_sack_seq(tp),
2884              tp->snd_una + tp->reordering * tp->mss_cache);
2885 }
2886
2887 /* Undo during fast recovery after partial ACK. */
2888 static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una,
2889                  bool *do_lost)
2890 {
2891     struct tcp_sock *tp = tcp_sk(sk);
2892
2893     if (tp->undo_marker && tcp_packet_delayed(tp)) {
2894         /* Plain luck! Hole if filled with delayed
2895          * packet, rather than with a retransmit. Check reordering.
2896          */
2897         tcp_check_sack_reordering(sk, prior_snd_una, 1);
2898
2899         /* We are getting evidence that the reordering degree is higher
2900          * than we realized. If there are no retransmits out then we
2901          * can undo. Otherwise we clock out new packets but do not
2902          * mark more packets lost or retransmit more.
2903          */
2904         if (tp->retrans_out)
2905             return true;
2906
2907         if (!tcp_any_retrans_done(sk))
2908             tp->retrans_stamp = 0;
2909
2910         DBGUNDO(sk, "partial recovery");
2911         tcp_undo_cwnd_reduction(sk, true);
2912         NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO);
2913         tcp_try_keep_open(sk);
2914     } else {
2915         /* Partial ACK arrived. Force fast retransmit. */
2916         *do_lost = tcp_force_fast_retransmit(sk);
2917     }
2918     return false;
2919 }
2920
2921 static void tcp_identify_packet_loss(struct sock *sk, int *ack_flag)
2922 {
2923     struct tcp_sock *tp = tcp_sk(sk);
2924
2925     if (tcp_rtx_queue_empty(sk))
2926         return;
2927
2928     if (unlikely(tcp_is_reno(tp))) {
2929         tcp_newreno_mark_lost(sk, *ack_flag & FLAG_SND_UNA_ADVANCED);
2930     } else if (tcp_is_rack(sk)) {
2931         u32 prior_retrans = tp->retrans_out;
2932
2933         if (tcp_rack_mark_lost(sk))
2934             *ack_flag &= ~FLAG_SET_XMIT_TIMER;
2935         if (prior_retrans > tp->retrans_out)
2936             *ack_flag |= FLAG_LOST_RETRANS;
2937     }
2938 }
2939
2940 /* Process an event, which can update packets-in-flight not trivially.
2941  * Main goal of this function is to calculate new estimate for left_out,
2942  * taking into account both packets sitting in receiver's buffer and
2943  * packets lost by network.
2944  *
2945  * Besides that it updates the congestion state when packet loss or ECN
2946  * is detected. But it does not reduce the cwnd, it is done by the
2947  * congestion control later.
2948  *
2949  * It does _not_ decide what to send, it is made in function
2950  * tcp_xmit_retransmit_queue().
2951  */
2952 static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
2953                   int num_dupack, int *ack_flag, int *rexmit)
2954 {
2955     struct inet_connection_sock *icsk = inet_csk(sk);
2956     struct tcp_sock *tp = tcp_sk(sk);
2957     int fast_rexmit = 0, flag = *ack_flag;
2958     bool ece_ack = flag & FLAG_ECE;
2959     bool do_lost = num_dupack || ((flag & FLAG_DATA_SACKED) &&
2960                       tcp_force_fast_retransmit(sk));
2961
2962     if (!tp->packets_out && tp->sacked_out)
2963         tp->sacked_out = 0;
2964
2965     /* Now state machine starts.
2966      * A. ECE, hence prohibit cwnd undoing, the reduction is required. */
2967     if (ece_ack)
2968         tp->prior_ssthresh = 0;
2969
2970     /* B. In all the states check for reneging SACKs. */
2971     if (tcp_check_sack_reneging(sk, flag))
2972         return;
2973
2974     /* C. Check consistency of the current state. */
2975     tcp_verify_left_out(tp);
2976
2977     /* D. Check state exit conditions. State can be terminated
2978      *    when high_seq is ACKed. */
2979     if (icsk->icsk_ca_state == TCP_CA_Open) {
2980         WARN_ON(tp->retrans_out != 0 && !tp->syn_data);
2981         tp->retrans_stamp = 0;
2982     } else if (!before(tp->snd_una, tp->high_seq)) {
2983         switch (icsk->icsk_ca_state) {
2984         case TCP_CA_CWR:
2985             /* CWR is to be held something *above* high_seq
2986              * is ACKed for CWR bit to reach receiver. */
2987             if (tp->snd_una != tp->high_seq) {
2988                 tcp_end_cwnd_reduction(sk);
2989                 tcp_set_ca_state(sk, TCP_CA_Open);
2990             }
2991             break;
2992
2993         case TCP_CA_Recovery:
2994             if (tcp_is_reno(tp))
2995                 tcp_reset_reno_sack(tp);
2996             if (tcp_try_undo_recovery(sk))
2997                 return;
2998             tcp_end_cwnd_reduction(sk);
2999             break;
3000         }
3001     }
3002
3003     /* E. Process state. */
3004     switch (icsk->icsk_ca_state) {
3005     case TCP_CA_Recovery:
3006         if (!(flag & FLAG_SND_UNA_ADVANCED)) {
3007             if (tcp_is_reno(tp))
3008                 tcp_add_reno_sack(sk, num_dupack, ece_ack);
3009         } else if (tcp_try_undo_partial(sk, prior_snd_una, &do_lost))
3010             return;
3011
3012         if (tcp_try_undo_dsack(sk))
3013             tcp_try_keep_open(sk);
3014
3015         tcp_identify_packet_loss(sk, ack_flag);
3016         if (icsk->icsk_ca_state != TCP_CA_Recovery) {
3017             if (!tcp_time_to_recover(sk, flag))
3018                 return;
3019             /* Undo reverts the recovery state. If loss is evident,
3020              * starts a new recovery (e.g. reordering then loss);
3021              */
3022             tcp_enter_recovery(sk, ece_ack);
3023         }
3024         break;
3025     case TCP_CA_Loss:
3026         tcp_process_loss(sk, flag, num_dupack, rexmit);
3027         tcp_identify_packet_loss(sk, ack_flag);
3028         if (!(icsk->icsk_ca_state == TCP_CA_Open ||
3029               (*ack_flag & FLAG_LOST_RETRANS)))
3030             return;
3031         /* Change state if cwnd is undone or retransmits are lost */
3032         fallthrough;
3033     default:
3034         if (tcp_is_reno(tp)) {
3035             if (flag & FLAG_SND_UNA_ADVANCED)
3036                 tcp_reset_reno_sack(tp);
3037             tcp_add_reno_sack(sk, num_dupack, ece_ack);
3038         }
3039
3040         if (icsk->icsk_ca_state <= TCP_CA_Disorder)
3041             tcp_try_undo_dsack(sk);
3042
3043         tcp_identify_packet_loss(sk, ack_flag);
3044         if (!tcp_time_to_recover(sk, flag)) {
3045             tcp_try_to_open(sk, flag);
3046             return;
3047         }
3048
3049         /* MTU probe failure: don't reduce cwnd */
3050         if (icsk->icsk_ca_state < TCP_CA_CWR &&
3051             icsk->icsk_mtup.probe_size &&
3052             tp->snd_una == tp->mtu_probe.probe_seq_start) {
3053             tcp_mtup_probe_failed(sk);
3054             /* Restores the reduction we did in tcp_mtup_probe() */
3055             tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + 1);
3056             tcp_simple_retransmit(sk);
3057             return;
3058         }
3059
3060         /* Otherwise enter Recovery state */
3061         tcp_enter_recovery(sk, ece_ack);
3062         fast_rexmit = 1;
3063     }
3064
3065     if (!tcp_is_rack(sk) && do_lost)
3066         tcp_update_scoreboard(sk, fast_rexmit);
3067     *rexmit = REXMIT_LOST;
3068 }
3069
3070 static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us, const int flag)
3071 {
3072     u32 wlen = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_rtt_wlen) * HZ;
3073     struct tcp_sock *tp = tcp_sk(sk);
3074
3075     if ((flag & FLAG_ACK_MAYBE_DELAYED) && rtt_us > tcp_min_rtt(tp)) {
3076         /* If the remote keeps returning delayed ACKs, eventually
3077          * the min filter would pick it up and overestimate the
3078          * prop. delay when it expires. Skip suspected delayed ACKs.
3079          */
3080         return;
3081     }
3082     minmax_running_min(&tp->rtt_min, wlen, tcp_jiffies32,
3083                rtt_us ? : jiffies_to_usecs(1));
3084 }
3085
3086 static bool tcp_ack_update_rtt(struct sock *sk, const int flag,
3087                    long seq_rtt_us, long sack_rtt_us,
3088                    long ca_rtt_us, struct rate_sample *rs)
3089 {
3090     const struct tcp_sock *tp = tcp_sk(sk);
3091
3092     /* Prefer RTT measured from ACK's timing to TS-ECR. This is because
3093      * broken middle-boxes or peers may corrupt TS-ECR fields. But
3094      * Karn's algorithm forbids taking RTT if some retransmitted data
3095      * is acked (RFC6298).
3096      */
3097     if (seq_rtt_us < 0)
3098         seq_rtt_us = sack_rtt_us;
3099
3100     /* RTTM Rule: A TSecr value received in a segment is used to
3101      * update the averaged RTT measurement only if the segment
3102      * acknowledges some new data, i.e., only if it advances the
3103      * left edge of the send window.
3104      * See draft-ietf-tcplw-high-performance-00, section 3.3.
3105      */
3106     if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
3107         flag & FLAG_ACKED) {
3108         u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr;
3109
3110         if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) {
3111             if (!delta)
3112                 delta = 1;
3113             seq_rtt_us = delta * (USEC_PER_SEC / TCP_TS_HZ);
3114             ca_rtt_us = seq_rtt_us;
3115         }
3116     }
3117     rs->rtt_us = ca_rtt_us; /* RTT of last (S)ACKed packet (or -1) */
3118     if (seq_rtt_us < 0)
3119         return false;
3120
3121     /* ca_rtt_us >= 0 is counting on the invariant that ca_rtt_us is
3122      * always taken together with ACK, SACK, or TS-opts. Any negative
3123      * values will be skipped with the seq_rtt_us < 0 check above.
3124      */
3125     tcp_update_rtt_min(sk, ca_rtt_us, flag);
3126     tcp_rtt_estimator(sk, seq_rtt_us);
3127     tcp_set_rto(sk);
3128
3129     /* RFC6298: only reset backoff on valid RTT measurement. */
3130     inet_csk(sk)->icsk_backoff = 0;
3131     return true;
3132 }
3133
3134 /* Compute time elapsed between (last) SYNACK and the ACK completing 3WHS. */
3135 void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req)
3136 {
3137     struct rate_sample rs;
3138     long rtt_us = -1L;
3139
3140     if (req && !req->num_retrans && tcp_rsk(req)->snt_synack)
3141         rtt_us = tcp_stamp_us_delta(tcp_clock_us(), tcp_rsk(req)->snt_synack);
3142
3143     tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, rtt_us, -1L, rtt_us, &rs);
3144 }
3145
3146
3147 static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
3148 {
3149     const struct inet_connection_sock *icsk = inet_csk(sk);
3150
3151     icsk->icsk_ca_ops->cong_avoid(sk, ack, acked);
3152     tcp_sk(sk)->snd_cwnd_stamp = tcp_jiffies32;
3153 }
3154
3155 /* Restart timer after forward progress on connection.
3156  * RFC2988 recommends to restart timer to now+rto.
3157  */
3158 void tcp_rearm_rto(struct sock *sk)
3159 {
3160     const struct inet_connection_sock *icsk = inet_csk(sk);
3161     struct tcp_sock *tp = tcp_sk(sk);
3162
3163     /* If the retrans timer is currently being used by Fast Open
3164      * for SYN-ACK retrans purpose, stay put.
3165      */
3166     if (rcu_access_pointer(tp->fastopen_rsk))
3167         return;
3168
3169     if (!tp->packets_out) {
3170         inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
3171     } else {
3172         u32 rto = inet_csk(sk)->icsk_rto;
3173         /* Offset the time elapsed after installing regular RTO */
3174         if (icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
3175             icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
3176             s64 delta_us = tcp_rto_delta_us(sk);
3177             /* delta_us may not be positive if the socket is locked
3178              * when the retrans timer fires and is rescheduled.
3179              */
3180             rto = usecs_to_jiffies(max_t(int, delta_us, 1));
3181         }
3182         tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto,
3183                      TCP_RTO_MAX);
3184     }
3185 }
3186
3187 /* Try to schedule a loss probe; if that doesn't work, then schedule an RTO. */
3188 static void tcp_set_xmit_timer(struct sock *sk)
3189 {
3190     if (!tcp_schedule_loss_probe(sk, true))
3191         tcp_rearm_rto(sk);
3192 }
3193
3194 /* If we get here, the whole TSO packet has not been acked. */
3195 static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
3196 {
3197     struct tcp_sock *tp = tcp_sk(sk);
3198     u32 packets_acked;
3199
3200     BUG_ON(!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una));
3201
3202     packets_acked = tcp_skb_pcount(skb);
3203     if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
3204         return 0;
3205     packets_acked -= tcp_skb_pcount(skb);
3206
3207     if (packets_acked) {
3208         BUG_ON(tcp_skb_pcount(skb) == 0);
3209         BUG_ON(!before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq));
3210     }
3211
3212     return packets_acked;
3213 }
3214
3215 static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,
3216                const struct sk_buff *ack_skb, u32 prior_snd_una)
3217 {
3218     const struct skb_shared_info *shinfo;
3219
3220     /* Avoid cache line misses to get skb_shinfo() and shinfo->tx_flags */
3221     if (likely(!TCP_SKB_CB(skb)->txstamp_ack))
3222         return;
3223
3224     shinfo = skb_shinfo(skb);
3225     if (!before(shinfo->tskey, prior_snd_una) &&
3226         before(shinfo->tskey, tcp_sk(sk)->snd_una)) {
3227         tcp_skb_tsorted_save(skb) {
3228             __skb_tstamp_tx(skb, ack_skb, NULL, sk, SCM_TSTAMP_ACK);
3229         } tcp_skb_tsorted_restore(skb);
3230     }
3231 }
3232
3233 /* Remove acknowledged frames from the retransmission queue. If our packet
3234  * is before the ack sequence we can discard it as it's confirmed to have
3235  * arrived at the other end. Returns the FLAG_* bits collected on the way.
3236  */
3237 static int tcp_clean_rtx_queue(struct sock *sk, const struct sk_buff *ack_skb,
3238                    u32 prior_fack, u32 prior_snd_una,
3239                    struct tcp_sacktag_state *sack, bool ece_ack)
3240 {
3241     const struct inet_connection_sock *icsk = inet_csk(sk);
3242     u64 first_ackt, last_ackt;
3243     struct tcp_sock *tp = tcp_sk(sk);
3244     u32 prior_sacked = tp->sacked_out;
3245     u32 reord = tp->snd_nxt; /* lowest acked un-retx un-sacked seq */
3246     struct sk_buff *skb, *next;
3247     bool fully_acked = true;
3248     long sack_rtt_us = -1L;
3249     long seq_rtt_us = -1L;
3250     long ca_rtt_us = -1L;
3251     u32 pkts_acked = 0;
3252     bool rtt_update;
3253     int flag = 0;
3254
3255     first_ackt = 0; /* stays 0 until an skb without TCPCB_RETRANS/TCPCB_SACKED_ACKED is acked */
3256     /* Walk the queue from its head; every break leaves skb on the first skb that is kept. */
3257     for (skb = skb_rb_first(&sk->tcp_rtx_queue); skb; skb = next) {
3258         struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
3259         const u32 start_seq = scb->seq;
3260         u8 sacked = scb->sacked;
3261         u32 acked_pcount;
3262
3263         /* Determine how many packets and what bytes were acked, tso and else */
3264         if (after(scb->end_seq, tp->snd_una)) {
3265             if (tcp_skb_pcount(skb) == 1 ||
3266                 !after(tp->snd_una, scb->seq))
3267                 break;
3268
3269             acked_pcount = tcp_tso_acked(sk, skb);
3270             if (!acked_pcount)
3271                 break;
3272             fully_acked = false; /* skb ends beyond snd_una: only acked_pcount of it is counted */
3273         } else {
3274             acked_pcount = tcp_skb_pcount(skb);
3275         }
3276
3277         if (unlikely(sacked & TCPCB_RETRANS)) {
3278             if (sacked & TCPCB_SACKED_RETRANS)
3279                 tp->retrans_out -= acked_pcount;
3280             flag |= FLAG_RETRANS_DATA_ACKED;
3281         } else if (!(sacked & TCPCB_SACKED_ACKED)) {
3282             last_ackt = tcp_skb_timestamp_us(skb);
3283             WARN_ON_ONCE(last_ackt == 0);
3284             if (!first_ackt)
3285                 first_ackt = last_ackt;
3286
3287             if (before(start_seq, reord))
3288                 reord = start_seq;
3289             if (!after(scb->end_seq, tp->high_seq))
3290                 flag |= FLAG_ORIG_SACK_ACKED;
3291         }
3292
3293         if (sacked & TCPCB_SACKED_ACKED) {
3294             tp->sacked_out -= acked_pcount;
3295         } else if (tcp_is_sack(tp)) {
3296             tcp_count_delivered(tp, acked_pcount, ece_ack);
3297             if (!tcp_skb_spurious_retrans(tp, skb))
3298                 tcp_rack_advance(tp, sacked, scb->end_seq,
3299                          tcp_skb_timestamp_us(skb));
3300         }
3301         if (sacked & TCPCB_LOST)
3302             tp->lost_out -= acked_pcount;
3303
3304         tp->packets_out -= acked_pcount;
3305         pkts_acked += acked_pcount;
3306         tcp_rate_skb_delivered(sk, skb, sack->rate);
3307
3308         /* Initial outgoing SYN's get put onto the write_queue
3309          * just like anything else we transmit.  It is not
3310          * true data, and if we misinform our callers that
3311          * this ACK acks real data, we will erroneously exit
3312          * connection startup slow start one packet too
3313          * quickly.  This is severely frowned upon behavior.
3314          */
3315         if (likely(!(scb->tcp_flags & TCPHDR_SYN))) {
3316             flag |= FLAG_DATA_ACKED;
3317         } else {
3318             flag |= FLAG_SYN_ACKED;
3319             tp->retrans_stamp = 0;
3320         }
3321         /* A partly acked skb is not unlinked: stop with skb still pointing at it. */
3322         if (!fully_acked)
3323             break;
3324
3325         tcp_ack_tstamp(sk, skb, ack_skb, prior_snd_una);
3326
3327         next = skb_rb_next(skb);
3328         if (unlikely(skb == tp->retransmit_skb_hint))
3329             tp->retransmit_skb_hint = NULL;
3330         if (unlikely(skb == tp->lost_skb_hint))
3331             tp->lost_skb_hint = NULL;
3332         tcp_highest_sack_replace(sk, skb, next);
3333         tcp_rtx_queue_unlink_and_free(skb, sk);
3334     }
3335     /* skb is NULL only when the loop ran through the whole queue. */
3336     if (!skb)
3337         tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
3338
3339     if (likely(between(tp->snd_up, prior_snd_una, tp->snd_una)))
3340         tp->snd_up = tp->snd_una;
3341     /* Otherwise skb is the first skb that was left on the queue. */
3342     if (skb) {
3343         tcp_ack_tstamp(sk, skb, ack_skb, prior_snd_una);
3344         if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
3345             flag |= FLAG_SACK_RENEGING;
3346     }
3347     /* RTT samples from the skbs acked above, skipped if any acked skb had TCPCB_RETRANS set. */
3348     if (likely(first_ackt) && !(flag & FLAG_RETRANS_DATA_ACKED)) {
3349         seq_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, first_ackt);
3350         ca_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, last_ackt);
3351
3352         if (pkts_acked == 1 && fully_acked && !prior_sacked &&
3353             (tp->snd_una - prior_snd_una) < tp->mss_cache &&
3354             sack->rate->prior_delivered + 1 == tp->delivered &&
3355             !(flag & (FLAG_CA_ALERT | FLAG_SYN_ACKED))) {
3356             /* Conservatively mark a delayed ACK. It's typically
3357              * from a lone runt packet over the round trip to
3358              * a receiver w/o out-of-order or CE events.
3359              */
3360             flag |= FLAG_ACK_MAYBE_DELAYED;
3361         }
3362     }
3363     if (sack->first_sackt) { /* SACK timestamps, when present, override ca_rtt_us */
3364         sack_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, sack->first_sackt);
3365         ca_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, sack->last_sackt);
3366     }
3367     rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us,
3368                     ca_rtt_us, sack->rate);
3369
3370     if (flag & FLAG_ACKED) {
3371         flag |= FLAG_SET_XMIT_TIMER;  /* set TLP or RTO timer */
3372         if (unlikely(icsk->icsk_mtup.probe_size &&
3373                  !after(tp->mtu_probe.probe_seq_end, tp->snd_una))) {
3374             tcp_mtup_probe_success(sk);
3375         }
3376
3377         if (tcp_is_reno(tp)) {
3378             tcp_remove_reno_sacks(sk, pkts_acked, ece_ack);
3379
3380             /* If any of the cumulatively ACKed segments was
3381              * retransmitted, non-SACK case cannot confirm that
3382              * progress was due to original transmission due to
3383              * lack of TCPCB_SACKED_ACKED bits even if some of
3384              * the packets may have been never retransmitted.
3385              */
3386             if (flag & FLAG_RETRANS_DATA_ACKED)
3387                 flag &= ~FLAG_ORIG_SACK_ACKED;
3388         } else {
3389             int delta;
3390
3391             /* Non-retransmitted hole got filled? That's reordering */
3392             if (before(reord, prior_fack))
3393                 tcp_check_sack_reordering(sk, reord, 0);
3394
3395             delta = prior_sacked - tp->sacked_out;
3396             tp->lost_cnt_hint -= min(tp->lost_cnt_hint, delta);
3397         }
3398     } else if (skb && rtt_update && sack_rtt_us >= 0 &&
3399            sack_rtt_us > tcp_stamp_us_delta(tp->tcp_mstamp,
3400                             tcp_skb_timestamp_us(skb))) {
3401         /* Do not re-arm RTO if the sack RTT is measured from data sent
3402          * after when the head was last (re)transmitted. Otherwise the
3403          * timeout may continue to extend in loss recovery.
3404          */
3405         flag |= FLAG_SET_XMIT_TIMER;  /* set TLP or RTO timer */
3406     }
3407     /* Pass a per-ACK sample to the congestion control module if it has a pkts_acked hook. */
3408     if (icsk->icsk_ca_ops->pkts_acked) {
3409         struct ack_sample sample = { .pkts_acked = pkts_acked,
3410                          .rtt_us = sack->rate->rtt_us };
3411
3412         sample.in_flight = tp->mss_cache *
3413             (tp->delivered - sack->rate->prior_delivered);
3414         icsk->icsk_ca_ops->pkts_acked(sk, &sample);
3415     }
3416
3417 #if FASTRETRANS_DEBUG > 0
3418     WARN_ON((int)tp->sacked_out < 0);
3419     WARN_ON((int)tp->lost_out < 0);
3420     WARN_ON((int)tp->retrans_out < 0);
3421     if (!tp->packets_out && tcp_is_sack(tp)) {
3422         icsk = inet_csk(sk);
3423         if (tp->lost_out) {
3424             pr_debug("Leak l=%u %d\n",
3425                  tp->lost_out, icsk->icsk_ca_state);
3426             tp->lost_out = 0;
3427         }
3428         if (tp->sacked_out) {
3429             pr_debug("Leak s=%u %d\n",
3430                  tp->sacked_out, icsk->icsk_ca_state);
3431             tp->sacked_out = 0;
3432         }
3433         if (tp->retrans_out) {
3434             pr_debug("Leak r=%u %d\n",
3435                  tp->retrans_out, icsk->icsk_ca_state);
3436             tp->retrans_out = 0;
3437         }
3438     }
3439 #endif
3440     return flag;
3441 }
3442
3443 static void tcp_ack_probe(struct sock *sk)
3444 {
3445     struct sk_buff *pending = tcp_send_head(sk);
3446     struct inet_connection_sock *icsk = inet_csk(sk);
3447     unsigned long timeout;
3448
3449     /* With nothing queued for sending there is no probe state to adjust. */
3450     if (!pending)
3451         return;
3452
3453     if (after(TCP_SKB_CB(pending)->end_seq, tcp_wnd_end(tcp_sk(sk)))) {
3454         /* Head segment ends past the window end: re-arm the probe timer. */
3455         timeout = tcp_probe0_when(sk, TCP_RTO_MAX);
3456         timeout = tcp_clamp_probe0_to_user_timeout(sk, timeout);
3457         tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, timeout, TCP_RTO_MAX);
3458         return;
3459     }
3460
3461     /* Usable window: reset probe state; a later tcp_data_snd_check() must wake the socket. */
3462     icsk->icsk_backoff = 0;
3463     icsk->icsk_probes_tstamp = 0;
3464     inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0);
3465 }
3466
3467 static inline bool tcp_ack_is_dubious(const struct sock *sk, const int flag)
3468 {
3469     const bool ca_open = inet_csk(sk)->icsk_ca_state == TCP_CA_Open; /* any other state is dubious */
3470     return !ca_open || (flag & FLAG_CA_ALERT) || !(flag & FLAG_NOT_DUP);
3471 }
3472
3473 /* Decide whether the congestion control's cwnd increase function should
3474  * run for this ACK. */
3475 static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag)
3476 {
3477     /* With reordering above the sysctl threshold, grow cwnd whenever data
3478      * is delivered, in order or not. Otherwise stay conservative and grow
3479      * only on in-order delivery (RFC5681). A stretched ACK w/ new SACK or
3480      * ECE mark may advance cwnd here first and have it reduced later in
3481      * tcp_fastretrans_alert() based on more states.
3482      */
3483     const bool high_reord = tcp_sk(sk)->reordering >
3484         READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reordering);
3485
3486     return flag & (high_reord ? FLAG_FORWARD_PROGRESS : FLAG_DATA_ACKED);
3487 }
3488
3489 /* Per-ACK congestion control entry point, meant to run toward the end of
3490  * ACK processing with the rate sample @rs. A congestion control module
3491  * that provides ->cong_control() handles the ACK on its own; otherwise the
3492  * generic cwnd reduction / increase runs, followed by a pacing rate update.
3493  */
3494 static void tcp_cong_control(struct sock *sk, u32 ack, u32 acked_sacked,
3495                  int flag, const struct rate_sample *rs)
3496 {
3497     const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
3498
3499     if (ca_ops->cong_control) {
3500         /* The module handles this ACK itself; skip the generic logic. */
3501         ca_ops->cong_control(sk, rs);
3502         return;
3503     }
3504
3505     /* Reduce cwnd if the state mandates it, else advance it if allowed. */
3506     if (tcp_in_cwnd_reduction(sk))
3507         tcp_cwnd_reduction(sk, acked_sacked, rs->losses, flag);
3508     else if (tcp_may_raise_cwnd(sk, flag))
3509         tcp_cong_avoid(sk, ack, acked_sacked);
3510
3511     tcp_update_pacing_rate(sk);
3512 }
3513
3514 /* Accept a window update when the ACK is after snd_una, when the segment's
3515  * sequence number is after snd_wl1, or when it equals snd_wl1 and the new
3516  * window is larger. Assumes snd_una <= ack <= snd_nxt.
3517  */
3518 static inline bool tcp_may_update_window(const struct tcp_sock *tp, const u32 ack,
3519                      const u32 ack_seq, const u32 nwin)
3520 {
3521     if (after(ack, tp->snd_una) || after(ack_seq, tp->snd_wl1))
3522         return true;
3523     return ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd;
3524 }
3525
3526 /* Move tp->snd_una to @ack and add the distance covered to tp->bytes_acked. */
3527 static void tcp_snd_una_update(struct tcp_sock *tp, u32 ack)
3528 {
3529     struct sock *sk = (struct sock *)tp;
3530
3531     sock_owned_by_me(sk);
3532     tp->bytes_acked += (u32)(ack - tp->snd_una);
3533     tp->snd_una = ack;
3534 }
3535
3536 /* Move tp->rcv_nxt to @seq and add the distance covered to tp->bytes_received. */
3537 static void tcp_rcv_nxt_update(struct tcp_sock *tp, u32 seq)
3538 {
3539     struct sock *sk = (struct sock *)tp;
3540
3541     sock_owned_by_me(sk);
3542     tp->bytes_received += (u32)(seq - tp->rcv_nxt);
3543     WRITE_ONCE(tp->rcv_nxt, seq);
3544 }
3545
3546 /* Refresh the send window from an incoming ACK and advance snd_una to @ack.
3547  * Returns FLAG_WIN_UPDATE when the update was accepted, 0 otherwise.
3548  *
3549  * The RFC793/RFC1122 window update algorithm (used in linux-2.2 and in
3550  * FreeBSD; NetBSD's one is even worse) is wrong.
3551  */
3552 static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32 ack,
3553                  u32 ack_seq)
3554 {
3555     struct tcp_sock *tp = tcp_sk(sk);
3556     u32 nwin = ntohs(tcp_hdr(skb)->window);
3557     int flag = 0;
3558
3559     /* The advertised window is scaled except on segments carrying SYN. */
3560     if (likely(!tcp_hdr(skb)->syn))
3561         nwin <<= tp->rx_opt.snd_wscale;
3562
3563     if (!tcp_may_update_window(tp, ack, ack_seq, nwin))
3564         goto advance_una;
3565
3566     flag = FLAG_WIN_UPDATE;
3567     tcp_update_wl(tp, ack_seq);
3568     if (tp->snd_wnd == nwin)
3569         goto advance_una;
3570
3571     tp->snd_wnd = nwin;
3572
3573     /* Note, this is the only place where fast path is recovered for sending TCP. */
3574     tp->pred_flags = 0;
3575     tcp_fast_path_check(sk);
3576
3577     if (!tcp_write_queue_empty(sk))
3578         tcp_slow_start_after_idle_check(sk);
3579
3580     if (nwin > tp->max_window) {
3581         tp->max_window = nwin;
3582         tcp_sync_mss(sk, inet_csk(sk)->icsk_pmtu_cookie);
3583     }
3584 advance_una:
3585     tcp_snd_una_update(tp, ack);
3586     return flag;
3587 }
3588
3589 static bool __tcp_oow_rate_limited(struct net *net, int mib_idx,
3590                    u32 *last_oow_ack_time)
3591 {
3592     s32 since_last;
3593
3594     if (!*last_oow_ack_time)    /* zero stamp: skip the interval check */
3595         goto not_limited;
3596     since_last = (s32)(tcp_jiffies32 - *last_oow_ack_time);
3597     if (since_last >= 0 &&
3598         since_last < READ_ONCE(net->ipv4.sysctl_tcp_invalid_ratelimit)) {
3599         NET_INC_STATS(net, mib_idx);
3600         return true;    /* rate-limited: don't send yet! */
3601     }
3602 not_limited:
3603     *last_oow_ack_time = tcp_jiffies32;
3604     return false;   /* not rate-limited: go ahead, send dupack now! */
3605 }
3606
3607 /* Tell the caller whether a dupack for this out-of-window segment should be
3608  * held back (true) or may be sent right now (false). Dupacks answering
3609  * out-of-window SYNs or pure ACKs are rate-limited to mitigate ACK loops and
3610  * DoS attacks that repeat SYNs or ACKs for the same connection: no duplicate
3611  * SYNACK or ACK is sent while the remote endpoint emits them at a high rate.
3612  */
3613 bool tcp_oow_rate_limited(struct net *net, const struct sk_buff *skb,
3614               int mib_idx, u32 *last_oow_ack_time)
3615 {
3616     const struct tcp_skb_cb *cb = TCP_SKB_CB(skb);
3617     const bool nonempty = cb->seq != cb->end_seq;
3618
3619     /* Data packets without SYNs are not likely part of an ACK loop. */
3620     if (nonempty && !tcp_hdr(skb)->syn)
3621         return false;
3622     return __tcp_oow_rate_limited(net, mib_idx, last_oow_ack_time);
3623 }
3624
3625 /* RFC 5961 7 [ACK Throttling]: send a challenge ACK unless the per-socket
3626  * or the host-wide rate limit suppresses it.
3627  */
3628 static void tcp_send_challenge_ack(struct sock *sk)
3629 {
3630     struct net *net = sock_net(sk);
3631     u32 ack_limit, budget, now_sec;
3632
3633     /* The per-socket dupack rate limit is checked first. */
3634     if (__tcp_oow_rate_limited(net, LINUX_MIB_TCPACKSKIPPEDCHALLENGE,
3635                    &tcp_sk(sk)->last_oow_ack_time))
3636         return;
3637
3638     /* Host-wide RFC 5961 rate limit; a limit of INT_MAX bypasses it. */
3639     ack_limit = READ_ONCE(net->ipv4.sysctl_tcp_challenge_ack_limit);
3640     if (ack_limit != INT_MAX) {
3641         now_sec = jiffies / HZ;
3642         if (now_sec != READ_ONCE(net->ipv4.tcp_challenge_timestamp)) {
3643             /* The stored second differs: store it and draw a fresh, randomized budget. */
3644             WRITE_ONCE(net->ipv4.tcp_challenge_timestamp, now_sec);
3645             WRITE_ONCE(net->ipv4.tcp_challenge_count,
3646                    ((ack_limit + 1) >> 1) + prandom_u32_max(ack_limit));
3647         }
3648
3649         budget = READ_ONCE(net->ipv4.tcp_challenge_count);
3650         if (!budget)
3651             return;
3652         WRITE_ONCE(net->ipv4.tcp_challenge_count, budget - 1);
3653     }
3654
3655     NET_INC_STATS(net, LINUX_MIB_TCPCHALLENGEACK);
3656     tcp_send_ack(sk);
3657 }
3658
3659 static void tcp_store_ts_recent(struct tcp_sock *tp)
3660 {
3661     tp->rx_opt.ts_recent_stamp = ktime_get_seconds();   /* seconds clock at the time of the store */
3662     tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval;        /* TSval currently held in rx_opt */
3663 }
3664
3665 static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
3666 {
3667     /* Requires rx_opt.saw_tstamp to be set and @seq not to be after
3668      * rcv_wup. */
3669     if (!tp->rx_opt.saw_tstamp || after(seq, tp->rcv_wup))
3670         return;
3671
3672     /* PAWS bug workaround wrt. ACK frames, the PAWS discard extra check
3673      * below makes sure this can only happen for pure ACK frames (-DaveM).
3674      * Not only that: it also occurs for expired timestamps.
3675      */
3676     if (tcp_paws_check(&tp->rx_opt, 0))
3677         tcp_store_ts_recent(tp);
3678 }
3679
3680 /* Deals with an ACK arriving during a TLP episode; the episode ends when
3681  * tlp_high_seq is reset. Ref: TLP algorithm in draft-ietf-tcpm-rack
3682  */
3683 static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
3684 {
3685     const int not_pure_dupack = FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP |
3686                     FLAG_DATA_SACKED;
3687     struct tcp_sock *tp = tcp_sk(sk);
3688
3689     /* ACK is still before tlp_high_seq: nothing to decide yet. */
3690     if (before(ack, tp->tlp_high_seq))
3691         return;
3692
3693     if (tp->tlp_retrans && !(flag & FLAG_DSACK_TLP)) {
3694         if (after(ack, tp->tlp_high_seq)) {
3695             /* ACK advances: there was a loss, so reduce cwnd. Reset
3696              * tlp_high_seq in tcp_init_cwnd_reduction()
3697              */
3698             tcp_init_cwnd_reduction(sk);
3699             tcp_set_ca_state(sk, TCP_CA_CWR);
3700             tcp_end_cwnd_reduction(sk);
3701             tcp_try_keep_open(sk);
3702             NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSSPROBERECOVERY);
3703             return;
3704         }
3705         if (flag & not_pure_dupack)
3706             return;
3707     }
3708
3709     /* TLP of new data acked, DSACK for the probe, or pure dupack: no loss. */
3710     tp->tlp_high_seq = 0;
3711 }
3712
3713 static inline void tcp_in_ack_event(struct sock *sk, u32 flags)
3714 {
3715     const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
3716
3717     if (ca_ops->in_ack_event)   /* the hook is optional */
3718         ca_ops->in_ack_event(sk, flags);
3719 }
3720
3721 /* Congestion control has already updated cwnd for this ACK. While in loss
3722  * recovery, now do the sends that make sense: new data (for FRTO) or
3723  * retransmits (for CA_Loss or CA_recovery).
3724  */
3725 static void tcp_xmit_recovery(struct sock *sk, int rexmit)
3726 {
3727     struct tcp_sock *tp = tcp_sk(sk);
3728
3729     if (rexmit == REXMIT_NONE)
3730         return;
3731     if (sk->sk_state == TCP_SYN_SENT)
3732         return;
3733     if (unlikely(rexmit == REXMIT_NEW)) {
3734         __tcp_push_pending_frames(sk, tcp_current_mss(sk), TCP_NAGLE_OFF);
3735         if (after(tp->snd_nxt, tp->high_seq))
3736             return;     /* snd_nxt is past high_seq: skip the retransmit pass */
3737         tp->frto = 0;
3738     }
3739     tcp_xmit_retransmit_queue(sk);
3740 }
3741
3742 /* Returns the number of packets newly acked or sacked by the current ACK,
3743  * after adding it to TCPDELIVERED and, when FLAG_ECE is set, TCPDELIVEREDCE. */
3744 static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag)
3745 {
3746     const u32 newly = tcp_sk(sk)->delivered - prior_delivered;
3747     const struct net *net = sock_net(sk);
3748     const bool ece = flag & FLAG_ECE;
3749
3750     NET_ADD_STATS(net, LINUX_MIB_TCPDELIVERED, newly);
3751     if (ece)
3752         NET_ADD_STATS(net, LINUX_MIB_TCPDELIVEREDCE, newly);
3753
3754     return newly;
3755 }
3756
3757 /* This routine deals with incoming acks, but not outgoing ones. Returns 1, 0 (old ACK) or -SKB_DROP_REASON_*. */
3758 static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3759 {
3760     struct inet_connection_sock *icsk = inet_csk(sk);
3761     struct tcp_sock *tp = tcp_sk(sk);
3762     struct tcp_sacktag_state sack_state;
3763     struct rate_sample rs = { .prior_delivered = 0 };
3764     u32 prior_snd_una = tp->snd_una;
3765     bool is_sack_reneg = tp->is_sack_reneg;
3766     u32 ack_seq = TCP_SKB_CB(skb)->seq;
3767     u32 ack = TCP_SKB_CB(skb)->ack_seq;
3768     int num_dupack = 0;
3769     int prior_packets = tp->packets_out;
3770     u32 delivered = tp->delivered;
3771     u32 lost = tp->lost;
3772     int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */
3773     u32 prior_fack;
3774
3775     sack_state.first_sackt = 0;
3776     sack_state.rate = &rs;
3777     sack_state.sack_delivered = 0;
3778
3779     /* We very likely will need to access rtx queue. */
3780     prefetch(sk->tcp_rtx_queue.rb_node);
3781
3782     /* If the ack is older than previous acks
3783      * then we can probably ignore it.
3784      */
3785     if (before(ack, prior_snd_una)) {
3786         /* RFC 5961 5.2 [Blind Data Injection Attack].[Mitigation] */
3787         if (before(ack, prior_snd_una - tp->max_window)) {
3788             if (!(flag & FLAG_NO_CHALLENGE_ACK))
3789                 tcp_send_challenge_ack(sk);
3790             return -SKB_DROP_REASON_TCP_TOO_OLD_ACK;
3791         }
3792         goto old_ack;
3793     }
3794
3795     /* If the ack includes data we haven't sent yet, discard
3796      * this segment (RFC793 Section 3.9).
3797      */
3798     if (after(ack, tp->snd_nxt))
3799         return -SKB_DROP_REASON_TCP_ACK_UNSENT_DATA;
3800
3801     if (after(ack, prior_snd_una)) {
3802         flag |= FLAG_SND_UNA_ADVANCED;
3803         icsk->icsk_retransmits = 0;
3804
3805 #if IS_ENABLED(CONFIG_TLS_DEVICE)
3806         if (static_branch_unlikely(&clean_acked_data_enabled.key))
3807             if (icsk->icsk_clean_acked)
3808                 icsk->icsk_clean_acked(sk, ack);
3809 #endif
3810     }
3811     /* Both values are taken before snd_una and the rtx queue are updated below. */
3812     prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una;
3813     rs.prior_in_flight = tcp_packets_in_flight(tp);
3814
3815     /* ts_recent update must be made after we are sure that the packet
3816      * is in window.
3817      */
3818     if (flag & FLAG_UPDATE_TS_RECENT)
3819         tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
3820     /* First branch: FLAG_SLOWPATH is clear and snd_una advanced. */
3821     if ((flag & (FLAG_SLOWPATH | FLAG_SND_UNA_ADVANCED)) ==
3822         FLAG_SND_UNA_ADVANCED) {
3823         /* Window is constant, pure forward advance.
3824          * No more checks are required.
3825          * Note, we use the fact that SND.UNA>=SND.WL2.
3826          */
3827         tcp_update_wl(tp, ack_seq);
3828         tcp_snd_una_update(tp, ack);
3829         flag |= FLAG_WIN_UPDATE;
3830
3831         tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE);
3832
3833         NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPACKS);
3834     } else {
3835         u32 ack_ev_flags = CA_ACK_SLOWPATH;
3836
3837         if (ack_seq != TCP_SKB_CB(skb)->end_seq)
3838             flag |= FLAG_DATA;
3839         else
3840             NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPUREACKS);
3841
3842         flag |= tcp_ack_update_window(sk, skb, ack, ack_seq);
3843
3844         if (TCP_SKB_CB(skb)->sacked)
3845             flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
3846                             &sack_state);
3847
3848         if (tcp_ecn_rcv_ecn_echo(tp, tcp_hdr(skb))) {
3849             flag |= FLAG_ECE;
3850             ack_ev_flags |= CA_ACK_ECE;
3851         }
3852
3853         if (sack_state.sack_delivered)
3854             tcp_count_delivered(tp, sack_state.sack_delivered,
3855                         flag & FLAG_ECE);
3856
3857         if (flag & FLAG_WIN_UPDATE)
3858             ack_ev_flags |= CA_ACK_WIN_UPDATE;
3859
3860         tcp_in_ack_event(sk, ack_ev_flags);
3861     }
3862
3863     /* This is a deviation from RFC3168 since it states that:
3864      * "When the TCP data sender is ready to set the CWR bit after reducing
3865      * the congestion window, it SHOULD set the CWR bit only on the first
3866      * new data packet that it transmits."
3867      * We accept CWR on pure ACKs to be more robust
3868      * with widely-deployed TCP implementations that do this.
3869      */
3870     tcp_ecn_accept_cwr(sk, skb);
3871
3872     /* We passed data and got it acked, remove any soft error
3873      * log. Something worked...
3874      */
3875     sk->sk_err_soft = 0;
3876     icsk->icsk_probes_out = 0;
3877     tp->rcv_tstamp = tcp_jiffies32;
3878     if (!prior_packets) /* packets_out was 0 when this function was entered */
3879         goto no_queue;
3880
3881     /* See if we can take anything off of the retransmit queue. */
3882     flag |= tcp_clean_rtx_queue(sk, skb, prior_fack, prior_snd_una,
3883                     &sack_state, flag & FLAG_ECE);
3884
3885     tcp_rack_update_reo_wnd(sk, &rs);
3886
3887     if (tp->tlp_high_seq)
3888         tcp_process_tlp_ack(sk, ack, flag);
3889
3890     if (tcp_ack_is_dubious(sk, flag)) {
3891         if (!(flag & (FLAG_SND_UNA_ADVANCED |
3892                   FLAG_NOT_DUP | FLAG_DSACKING_ACK))) {
3893             num_dupack = 1;
3894             /* Consider if pure acks were aggregated in tcp_add_backlog() */
3895             if (!(flag & FLAG_DATA))
3896                 num_dupack = max_t(u16, 1, skb_shinfo(skb)->gso_segs);
3897         }
3898         tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag,
3899                       &rexmit);
3900     }
3901
3902     /* If needed, reset TLP/RTO timer when RACK doesn't set. */
3903     if (flag & FLAG_SET_XMIT_TIMER)
3904         tcp_set_xmit_timer(sk);
3905
3906     if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP))
3907         sk_dst_confirm(sk);
3908     /* delivered and lost become per-ACK deltas here and are passed to rate sampling and congestion control. */
3909     delivered = tcp_newly_delivered(sk, delivered, flag);
3910     lost = tp->lost - lost;         /* freshly marked lost */
3911     rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED);
3912     tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate);
3913     tcp_cong_control(sk, ack, delivered, flag, sack_state.rate);
3914     tcp_xmit_recovery(sk, rexmit);
3915     return 1;
3916
3917 no_queue:
3918     /* If data was DSACKed, see if we can undo a cwnd reduction. */
3919     if (flag & FLAG_DSACKING_ACK) {
3920         tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag,
3921                       &rexmit);
3922         tcp_newly_delivered(sk, delivered, flag);
3923     }
3924     /* If this ack opens up a zero window, clear backoff.  It was
3925      * being used to time the probes, and is probably far higher than
3926      * it needs to be for normal retransmission.
3927      */
3928     tcp_ack_probe(sk);
3929
3930     if (tp->tlp_high_seq)
3931         tcp_process_tlp_ack(sk, ack, flag);
3932     return 1;
3933
3934 old_ack:
3935     /* If data was SACKed, tag it and see if we should send more data.
3936      * If data was DSACKed, see if we can undo a cwnd reduction.
3937      */
3938     if (TCP_SKB_CB(skb)->sacked) {
3939         flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
3940                         &sack_state);
3941         tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag,
3942                       &rexmit);
3943         tcp_newly_delivered(sk, delivered, flag);
3944         tcp_xmit_recovery(sk, rexmit);
3945     }
3946
3947     return 0;
3948 }
3949
3950 static void tcp_parse_fastopen_option(int len, const unsigned char *cookie,
3951                       bool syn, struct tcp_fastopen_cookie *foc,
3952                       bool exp_opt)
3953 {
3954     /* Valid only in SYN or SYN-ACK, with a non-negative even length. */
3955     if (!(foc && syn) || len < 0 || (len & 1))
3956         return;
3957
3958     /* A cookie of valid size is copied; any other non-zero length is stored as -1. */
3959     if (len < TCP_FASTOPEN_COOKIE_MIN || len > TCP_FASTOPEN_COOKIE_MAX)
3960         len = len ? -1 : 0;
3961     else
3962         memcpy(foc->val, cookie, len);
3963     foc->len = len;
3964     foc->exp = exp_opt;
3965 }
3966
3967 static bool smc_parse_options(const struct tcphdr *th,
3968                   struct tcp_options_received *opt_rx,
3969                   const unsigned char *ptr, int opsize)
3970 {
3971 #if IS_ENABLED(CONFIG_SMC)
3972     /* Needs a SYN, an even opsize >= TCPOLEN_EXP_SMC_BASE and the SMC magic at @ptr. */
3973     if (static_branch_unlikely(&tcp_have_smc)) {
3974         const bool size_ok = !(opsize & 1) && opsize >= TCPOLEN_EXP_SMC_BASE;
3975
3976         if (th->syn && size_ok && get_unaligned_be32(ptr) == TCPOPT_SMC_MAGIC) {
3977             opt_rx->smc_ok = 1;
3978             return true;
3979         }
3980     }
3981 #endif
3982     return false;
3983 }
3984
3985 /* Scan the TCP options following @th for an MSS option. Returns the value of
3986  * the last well-formed, non-zero MSS option seen before parsing stopped,
3987  * lowered to @user_mss when that is non-zero and smaller; returns 0 when no
3988  * such option was found.
3989  */
3990 u16 tcp_parse_mss_option(const struct tcphdr *th, u16 user_mss)
3991 {
3992     const unsigned char *ptr = (const unsigned char *)(th + 1);
3993     int length = (th->doff * 4) - sizeof(struct tcphdr);
3994     u16 mss = 0;
3995
3996     while (length > 0) {
3997         int opcode = *ptr++;
3998         int opsize;
3999
4000         if (opcode == TCPOPT_EOL)
4001             break;
4002         if (opcode == TCPOPT_NOP) { /* Ref: RFC 793 section 3.1 */
4003             length--;
4004             continue;
4005         }
4006
4007         /* Stop on a truncated, "silly" (size < 2) or partial option. */
4008         if (length < 2)
4009             break;
4010         opsize = *ptr++;
4011         if (opsize < 2 || opsize > length)
4012             break;
4013
4014         /* Only an MSS option of size TCPOLEN_MSS is interpreted. */
4015         if (opcode == TCPOPT_MSS && opsize == TCPOLEN_MSS) {
4016             u16 in_mss = get_unaligned_be16(ptr);
4017
4018             if (in_mss)
4019                 mss = (user_mss && user_mss < in_mss) ? user_mss : in_mss;
4020         }
4021         ptr += opsize - 2;
4022         length -= opsize;
4023     }
4024
4025     return mss;
4026 }
4027 EXPORT_SYMBOL_GPL(tcp_parse_mss_option);
4028
4029 /* Look for tcp options. Normally only called on SYN and SYNACK packets.
4030  * But, this can also be called on packets in the established flow when
4031  * the fast version below fails.
4032  */
4033 void tcp_parse_options(const struct net *net,
4034                const struct sk_buff *skb,
4035                struct tcp_options_received *opt_rx, int estab,
4036                struct tcp_fastopen_cookie *foc)
4037 {
4038     const unsigned char *ptr;
4039     const struct tcphdr *th = tcp_hdr(skb);
4040     int length = (th->doff * 4) - sizeof(struct tcphdr);
4041
4042     ptr = (const unsigned char *)(th + 1);
4043     opt_rx->saw_tstamp = 0;
4044     opt_rx->saw_unknown = 0;
4045
4046     while (length > 0) {
4047         int opcode = *ptr++;
4048         int opsize;
4049
4050         switch (opcode) {
4051         case TCPOPT_EOL:
4052             return;
4053         case TCPOPT_NOP:    /* Ref: RFC 793 section 3.1 */
4054             length--;
4055             continue;
4056         default:
4057             if (length < 2)
4058                 return;
4059             opsize = *ptr++;
4060             if (opsize < 2) /* "silly options" */
4061                 return;
4062             if (opsize > length)
4063                 return; /* don't parse partial options */
4064             switch (opcode) {
4065             case TCPOPT_MSS:
4066                 if (opsize == TCPOLEN_MSS && th->syn && !estab) {
4067                     u16 in_mss = get_unaligned_be16(ptr);
4068                     if (in_mss) {
4069                         if (opt_rx->user_mss &&
4070                             opt_rx->user_mss < in_mss)
4071                             in_mss = opt_rx->user_mss;
4072                         opt_rx->mss_clamp = in_mss;
4073                     }
4074                 }
4075                 break;
4076             case TCPOPT_WINDOW:
4077                 if (opsize == TCPOLEN_WINDOW && th->syn &&
4078                     !estab && READ_ONCE(net->ipv4.sysctl_tcp_window_scaling)) {
4079                     __u8 snd_wscale = *(__u8 *)ptr;
4080                     opt_rx->wscale_ok = 1;
4081                     if (snd_wscale > TCP_MAX_WSCALE) {
4082                         net_info_ratelimited("%s: Illegal window scaling value %d > %u received\n",
4083                                      __func__,
4084                                      snd_wscale,
4085                                      TCP_MAX_WSCALE);
4086                         snd_wscale = TCP_MAX_WSCALE;
4087                     }
4088                     opt_rx->snd_wscale = snd_wscale;
4089                 }
4090                 break;
4091             case TCPOPT_TIMESTAMP:
4092                 if ((opsize == TCPOLEN_TIMESTAMP) &&
4093                     ((estab && opt_rx->tstamp_ok) ||
4094                      (!estab && READ_ONCE(net->ipv4.sysctl_tcp_timestamps)))) {
4095                     opt_rx->saw_tstamp = 1;
4096                     opt_rx->rcv_tsval = get_unaligned_be32(ptr);
4097                     opt_rx->rcv_tsecr = get_unaligned_be32(ptr + 4);
4098                 }
4099                 break;
4100             case TCPOPT_SACK_PERM:
4101                 if (opsize == TCPOLEN_SACK_PERM && th->syn &&
4102                     !estab && READ_ONCE(net->ipv4.sysctl_tcp_sack)) {
4103                     opt_rx->sack_ok = TCP_SACK_SEEN;
4104                     tcp_sack_reset(opt_rx);
4105                 }
4106                 break;
4107
4108             case TCPOPT_SACK:
4109                 if ((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) &&
4110                    !((opsize - TCPOLEN_SACK_BASE) % TCPOLEN_SACK_PERBLOCK) &&
4111                    opt_rx->sack_ok) {
4112                     TCP_SKB_CB(skb)->sacked = (ptr - 2) - (unsigned char *)th;
4113                 }
4114                 break;
4115 #ifdef CONFIG_TCP_MD5SIG
4116             case TCPOPT_MD5SIG:
4117                 /*
4118                  * The MD5 Hash has already been
4119                  * checked (see tcp_v{4,6}_do_rcv()).
4120                  */
4121                 break;
4122 #endif
4123             case TCPOPT_FASTOPEN:
4124                 tcp_parse_fastopen_option(
4125                     opsize - TCPOLEN_FASTOPEN_BASE,
4126                     ptr, th->syn, foc, false);
4127                 break;
4128
4129             case TCPOPT_EXP:
4130                 /* Fast Open option shares code 254 using a
4131                  * 16 bits magic number.
4132                  */
4133                 if (opsize >= TCPOLEN_EXP_FASTOPEN_BASE &&
4134                     get_unaligned_be16(ptr) ==
4135                     TCPOPT_FASTOPEN_MAGIC) {
4136                     tcp_parse_fastopen_option(opsize -
4137                         TCPOLEN_EXP_FASTOPEN_BASE,
4138                         ptr + 2, th->syn, foc, true);
4139                     break;
4140                 }
4141
4142                 if (smc_parse_options(th, opt_rx, ptr, opsize))
4143                     break;
4144
4145                 opt_rx->saw_unknown = 1;
4146                 break;
4147
4148             default:
4149                 opt_rx->saw_unknown = 1;
4150             }
4151             ptr += opsize-2;
4152             length -= opsize;
4153         }
4154     }
4155 }
4156 EXPORT_SYMBOL(tcp_parse_options);
4157
4158 static bool tcp_parse_aligned_timestamp(struct tcp_sock *tp, const struct tcphdr *th)
4159 {
4160     const __be32 *ptr = (const __be32 *)(th + 1);
4161
4162     if (*ptr == htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
4163               | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) {
4164         tp->rx_opt.saw_tstamp = 1;
4165         ++ptr;
4166         tp->rx_opt.rcv_tsval = ntohl(*ptr);
4167         ++ptr;
4168         if (*ptr)
4169             tp->rx_opt.rcv_tsecr = ntohl(*ptr) - tp->tsoffset;
4170         else
4171             tp->rx_opt.rcv_tsecr = 0;
4172         return true;
4173     }
4174     return false;
4175 }
4176
4177 /* Fast parse options. This hopes to only see timestamps.
4178  * If it is wrong it falls back on tcp_parse_options().
4179  */
4180 static bool tcp_fast_parse_options(const struct net *net,
4181                    const struct sk_buff *skb,
4182                    const struct tcphdr *th, struct tcp_sock *tp)
4183 {
4184     /* In the spirit of fast parsing, compare doff directly to constant
4185      * values.  Because equality is used, short doff can be ignored here.
4186      */
4187     if (th->doff == (sizeof(*th) / 4)) {
4188         tp->rx_opt.saw_tstamp = 0;
4189         return false;
4190     } else if (tp->rx_opt.tstamp_ok &&
4191            th->doff == ((sizeof(*th) + TCPOLEN_TSTAMP_ALIGNED) / 4)) {
4192         if (tcp_parse_aligned_timestamp(tp, th))
4193             return true;
4194     }
4195
4196     tcp_parse_options(net, skb, &tp->rx_opt, 1, NULL);
4197     if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
4198         tp->rx_opt.rcv_tsecr -= tp->tsoffset;
4199
4200     return true;
4201 }
4202
4203 #ifdef CONFIG_TCP_MD5SIG
4204 /*
4205  * Parse MD5 Signature option
4206  */
4207 const u8 *tcp_parse_md5sig_option(const struct tcphdr *th)
4208 {
4209     int length = (th->doff << 2) - sizeof(*th);
4210     const u8 *ptr = (const u8 *)(th + 1);
4211
4212     /* If not enough data remaining, we can short cut */
4213     while (length >= TCPOLEN_MD5SIG) {
4214         int opcode = *ptr++;
4215         int opsize;
4216
4217         switch (opcode) {
4218         case TCPOPT_EOL:
4219             return NULL;
4220         case TCPOPT_NOP:
4221             length--;
4222             continue;
4223         default:
4224             opsize = *ptr++;
4225             if (opsize < 2 || opsize > length)
4226                 return NULL;
4227             if (opcode == TCPOPT_MD5SIG)
4228                 return opsize == TCPOLEN_MD5SIG ? ptr : NULL;
4229         }
4230         ptr += opsize - 2;
4231         length -= opsize;
4232     }
4233     return NULL;
4234 }
4235 EXPORT_SYMBOL(tcp_parse_md5sig_option);
4236 #endif
4237
4238 /* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM
4239  *
4240  * It is not fatal. If this ACK does _not_ change critical state (seqs, window)
4241  * it can pass through stack. So, the following predicate verifies that
4242  * this segment is not used for anything but congestion avoidance or
4243  * fast retransmit. Moreover, we even are able to eliminate most of such
4244  * second order effects, if we apply some small "replay" window (~RTO)
4245  * to timestamp space.
4246  *
4247  * All these measures still do not guarantee that we reject wrapped ACKs
 * on networks with high bandwidth, when sequence space is recycled quickly,
4249  * but it guarantees that such events will be very rare and do not affect
4250  * connection seriously. This doesn't look nice, but alas, PAWS is really
4251  * buggy extension.
4252  *
4253  * [ Later note. Even worse! It is buggy for segments _with_ data. RFC
4254  * states that events when retransmit arrives after original data are rare.
4255  * It is a blatant lie. VJ forgot about fast retransmit! 8)8) It is
4256  * the biggest problem on large power networks even with minor reordering.
4257  * OK, let's give it small replay window. If peer clock is even 1hz, it is safe
4258  * up to bandwidth of 18Gigabit/sec. 8) ]
4259  */
4260
4261 static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb)
4262 {
4263     const struct tcp_sock *tp = tcp_sk(sk);
4264     const struct tcphdr *th = tcp_hdr(skb);
4265     u32 seq = TCP_SKB_CB(skb)->seq;
4266     u32 ack = TCP_SKB_CB(skb)->ack_seq;
4267
4268     return (/* 1. Pure ACK with correct sequence number. */
4269         (th->ack && seq == TCP_SKB_CB(skb)->end_seq && seq == tp->rcv_nxt) &&
4270
4271         /* 2. ... and duplicate ACK. */
4272         ack == tp->snd_una &&
4273
4274         /* 3. ... and does not update window. */
4275         !tcp_may_update_window(tp, ack, seq, ntohs(th->window) << tp->rx_opt.snd_wscale) &&
4276
4277         /* 4. ... and sits in replay window. */
4278         (s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (inet_csk(sk)->icsk_rto * 1024) / HZ);
4279 }
4280
4281 static inline bool tcp_paws_discard(const struct sock *sk,
4282                    const struct sk_buff *skb)
4283 {
4284     const struct tcp_sock *tp = tcp_sk(sk);
4285
4286     return !tcp_paws_check(&tp->rx_opt, TCP_PAWS_WINDOW) &&
4287            !tcp_disordered_ack(sk, skb);
4288 }
4289
4290 /* Check segment sequence number for validity.
4291  *
4292  * Segment controls are considered valid, if the segment
4293  * fits to the window after truncation to the window. Acceptability
4294  * of data (and SYN, FIN, of course) is checked separately.
4295  * See tcp_data_queue(), for example.
4296  *
 * Also, controls (RST is the main one) are accepted using RCV.WUP instead
 * of RCV.NXT. The peer has not yet advanced its SND.UNA while we were
 * delaying the ACK, so that its SND.UNA <= our RCV.WUP.
 * (borrowed from FreeBSD)
4301  */
4302
4303 static inline bool tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq)
4304 {
4305     return  !before(end_seq, tp->rcv_wup) &&
4306         !after(seq, tp->rcv_nxt + tcp_receive_window(tp));
4307 }
4308
/* When we get a reset we do this.
 *
 * Records an error code in sk->sk_err chosen from the current socket state,
 * purges the write queue and finishes the socket through tcp_done(). A
 * socket already in TCP_CLOSE is left untouched. If the socket is not
 * SOCK_DEAD, the error is reported with sk_error_report().
 */
void tcp_reset(struct sock *sk, struct sk_buff *skb)
{
    trace_tcp_receive_reset(sk);

    /* mptcp can't tell us to ignore reset pkts,
     * so just ignore the return value of mptcp_incoming_options().
     */
    if (sk_is_mptcp(sk))
        mptcp_incoming_options(sk, skb);

    /* We want the right error as BSD sees it (and indeed as we do).
     *
     * sk_err is consumed by tcp_poll() without the socket lock (see the
     * barrier pairing below), so store it with WRITE_ONCE() to avoid
     * store tearing and to mark the intentional data race.
     */
    switch (sk->sk_state) {
    case TCP_SYN_SENT:
        WRITE_ONCE(sk->sk_err, ECONNREFUSED);
        break;
    case TCP_CLOSE_WAIT:
        WRITE_ONCE(sk->sk_err, EPIPE);
        break;
    case TCP_CLOSE:
        return;
    default:
        WRITE_ONCE(sk->sk_err, ECONNRESET);
    }
    /* This barrier is coupled with smp_rmb() in tcp_poll() */
    smp_wmb();

    tcp_write_queue_purge(sk);
    tcp_done(sk);

    if (!sock_flag(sk, SOCK_DEAD))
        sk_error_report(sk);
}
4342
4343 /*
4344  *  Process the FIN bit. This now behaves as it is supposed to work
4345  *  and the FIN takes effect when it is validly part of sequence
4346  *  space. Not before when we get holes.
4347  *
4348  *  If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
4349  *  (and thence onto LAST-ACK and finally, CLOSE, we never enter
4350  *  TIME-WAIT)
4351  *
4352  *  If we are in FINWAIT-1, a received FIN indicates simultaneous
4353  *  close and we go into CLOSING (and later onto TIME-WAIT)
4354  *
4355  *  If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
4356  */
4357 void tcp_fin(struct sock *sk)
4358 {
4359     struct tcp_sock *tp = tcp_sk(sk);
4360
4361     inet_csk_schedule_ack(sk);
4362
4363     sk->sk_shutdown |= RCV_SHUTDOWN;
4364     sock_set_flag(sk, SOCK_DONE);
4365
4366     switch (sk->sk_state) {
4367     case TCP_SYN_RECV:
4368     case TCP_ESTABLISHED:
4369         /* Move to CLOSE_WAIT */
4370         tcp_set_state(sk, TCP_CLOSE_WAIT);
4371         inet_csk_enter_pingpong_mode(sk);
4372         break;
4373
4374     case TCP_CLOSE_WAIT:
4375     case TCP_CLOSING:
4376         /* Received a retransmission of the FIN, do
4377          * nothing.
4378          */
4379         break;
4380     case TCP_LAST_ACK:
4381         /* RFC793: Remain in the LAST-ACK state. */
4382         break;
4383
4384     case TCP_FIN_WAIT1:
4385         /* This case occurs when a simultaneous close
4386          * happens, we must ack the received FIN and
4387          * enter the CLOSING state.
4388          */
4389         tcp_send_ack(sk);
4390         tcp_set_state(sk, TCP_CLOSING);
4391         break;
4392     case TCP_FIN_WAIT2:
4393         /* Received a FIN -- send ACK and enter TIME_WAIT. */
4394         tcp_send_ack(sk);
4395         tcp_time_wait(sk, TCP_TIME_WAIT, 0);
4396         break;
4397     default:
4398         /* Only TCP_LISTEN and TCP_CLOSE are left, in these
4399          * cases we should never reach this piece of code.
4400          */
4401         pr_err("%s: Impossible, sk->sk_state=%d\n",
4402                __func__, sk->sk_state);
4403         break;
4404     }
4405
4406     /* It _is_ possible, that we have something out-of-order _after_ FIN.
4407      * Probably, we should reset in this case. For now drop them.
4408      */
4409     skb_rbtree_purge(&tp->out_of_order_queue);
4410     if (tcp_is_sack(tp))
4411         tcp_sack_reset(&tp->rx_opt);
4412
4413     if (!sock_flag(sk, SOCK_DEAD)) {
4414         sk->sk_state_change(sk);
4415
4416         /* Do not send POLL_HUP for half duplex close. */
4417         if (sk->sk_shutdown == SHUTDOWN_MASK ||
4418             sk->sk_state == TCP_CLOSE)
4419             sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP);
4420         else
4421             sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
4422     }
4423 }
4424
4425 static inline bool tcp_sack_extend(struct tcp_sack_block *sp, u32 seq,
4426                   u32 end_seq)
4427 {
4428     if (!after(seq, sp->end_seq) && !after(sp->start_seq, end_seq)) {
4429         if (before(seq, sp->start_seq))
4430             sp->start_seq = seq;
4431         if (after(end_seq, sp->end_seq))
4432             sp->end_seq = end_seq;
4433         return true;
4434     }
4435     return false;
4436 }
4437
4438 static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq)
4439 {
4440     struct tcp_sock *tp = tcp_sk(sk);
4441
4442     if (tcp_is_sack(tp) && READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_dsack)) {
4443         int mib_idx;
4444
4445         if (before(seq, tp->rcv_nxt))
4446             mib_idx = LINUX_MIB_TCPDSACKOLDSENT;
4447         else
4448             mib_idx = LINUX_MIB_TCPDSACKOFOSENT;
4449
4450         NET_INC_STATS(sock_net(sk), mib_idx);
4451
4452         tp->rx_opt.dsack = 1;
4453         tp->duplicate_sack[0].start_seq = seq;
4454         tp->duplicate_sack[0].end_seq = end_seq;
4455     }
4456 }
4457
4458 static void tcp_dsack_extend(struct sock *sk, u32 seq, u32 end_seq)
4459 {
4460     struct tcp_sock *tp = tcp_sk(sk);
4461
4462     if (!tp->rx_opt.dsack)
4463         tcp_dsack_set(sk, seq, end_seq);
4464     else
4465         tcp_sack_extend(tp->duplicate_sack, seq, end_seq);
4466 }
4467
4468 static void tcp_rcv_spurious_retrans(struct sock *sk, const struct sk_buff *skb)
4469 {
4470     /* When the ACK path fails or drops most ACKs, the sender would
4471      * timeout and spuriously retransmit the same segment repeatedly.
4472      * The receiver remembers and reflects via DSACKs. Leverage the
4473      * DSACK state and change the txhash to re-route speculatively.
4474      */
4475     if (TCP_SKB_CB(skb)->seq == tcp_sk(sk)->duplicate_sack[0].start_seq &&
4476         sk_rethink_txhash(sk))
4477         NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDUPLICATEDATAREHASH);
4478 }
4479
4480 static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb)
4481 {
4482     struct tcp_sock *tp = tcp_sk(sk);
4483
4484     if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
4485         before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
4486         NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
4487         tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
4488
4489         if (tcp_is_sack(tp) && READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_dsack)) {
4490             u32 end_seq = TCP_SKB_CB(skb)->end_seq;
4491
4492             tcp_rcv_spurious_retrans(sk, skb);
4493             if (after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))
4494                 end_seq = tp->rcv_nxt;
4495             tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, end_seq);
4496         }
4497     }
4498
4499     tcp_send_ack(sk);
4500 }
4501
4502 /* These routines update the SACK block as out-of-order packets arrive or
4503  * in-order packets close up the sequence space.
4504  */
4505 static void tcp_sack_maybe_coalesce(struct tcp_sock *tp)
4506 {
4507     int this_sack;
4508     struct tcp_sack_block *sp = &tp->selective_acks[0];
4509     struct tcp_sack_block *swalk = sp + 1;
4510
4511     /* See if the recent change to the first SACK eats into
4512      * or hits the sequence space of other SACK blocks, if so coalesce.
4513      */
4514     for (this_sack = 1; this_sack < tp->rx_opt.num_sacks;) {
4515         if (tcp_sack_extend(sp, swalk->start_seq, swalk->end_seq)) {
4516             int i;
4517
4518             /* Zap SWALK, by moving every further SACK up by one slot.
4519              * Decrease num_sacks.
4520              */
4521             tp->rx_opt.num_sacks--;
4522             for (i = this_sack; i < tp->rx_opt.num_sacks; i++)
4523                 sp[i] = sp[i + 1];
4524             continue;
4525         }
4526         this_sack++;
4527         swalk++;
4528     }
4529 }
4530
4531 static void tcp_sack_compress_send_ack(struct sock *sk)
4532 {
4533     struct tcp_sock *tp = tcp_sk(sk);
4534
4535     if (!tp->compressed_ack)
4536         return;
4537
4538     if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1)
4539         __sock_put(sk);
4540
4541     /* Since we have to send one ack finally,
4542      * substract one from tp->compressed_ack to keep
4543      * LINUX_MIB_TCPACKCOMPRESSED accurate.
4544      */
4545     NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED,
4546               tp->compressed_ack - 1);
4547
4548     tp->compressed_ack = 0;
4549     tcp_send_ack(sk);
4550 }
4551
4552 /* Reasonable amount of sack blocks included in TCP SACK option
4553  * The max is 4, but this becomes 3 if TCP timestamps are there.
4554  * Given that SACK packets might be lost, be conservative and use 2.
4555  */
4556 #define TCP_SACK_BLOCKS_EXPECTED 2
4557
4558 static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq)
4559 {
4560     struct tcp_sock *tp = tcp_sk(sk);
4561     struct tcp_sack_block *sp = &tp->selective_acks[0];
4562     int cur_sacks = tp->rx_opt.num_sacks;
4563     int this_sack;
4564
4565     if (!cur_sacks)
4566         goto new_sack;
4567
4568     for (this_sack = 0; this_sack < cur_sacks; this_sack++, sp++) {
4569         if (tcp_sack_extend(sp, seq, end_seq)) {
4570             if (this_sack >= TCP_SACK_BLOCKS_EXPECTED)
4571                 tcp_sack_compress_send_ack(sk);
4572             /* Rotate this_sack to the first one. */
4573             for (; this_sack > 0; this_sack--, sp--)
4574                 swap(*sp, *(sp - 1));
4575             if (cur_sacks > 1)
4576                 tcp_sack_maybe_coalesce(tp);
4577             return;
4578         }
4579     }
4580
4581     if (this_sack >= TCP_SACK_BLOCKS_EXPECTED)
4582         tcp_sack_compress_send_ack(sk);
4583
4584     /* Could not find an adjacent existing SACK, build a new one,
4585      * put it at the front, and shift everyone else down.  We
4586      * always know there is at least one SACK present already here.
4587      *
4588      * If the sack array is full, forget about the last one.
4589      */
4590     if (this_sack >= TCP_NUM_SACKS) {
4591         this_sack--;
4592         tp->rx_opt.num_sacks--;
4593         sp--;
4594     }
4595     for (; this_sack > 0; this_sack--, sp--)
4596         *sp = *(sp - 1);
4597
4598 new_sack:
4599     /* Build the new head SACK, and we're done. */
4600     sp->start_seq = seq;
4601     sp->end_seq = end_seq;
4602     tp->rx_opt.num_sacks++;
4603 }
4604
4605 /* RCV.NXT advances, some SACKs should be eaten. */
4606
4607 static void tcp_sack_remove(struct tcp_sock *tp)
4608 {
4609     struct tcp_sack_block *sp = &tp->selective_acks[0];
4610     int num_sacks = tp->rx_opt.num_sacks;
4611     int this_sack;
4612
4613     /* Empty ofo queue, hence, all the SACKs are eaten. Clear. */
4614     if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
4615         tp->rx_opt.num_sacks = 0;
4616         return;
4617     }
4618
4619     for (this_sack = 0; this_sack < num_sacks;) {
4620         /* Check if the start of the sack is covered by RCV.NXT. */
4621         if (!before(tp->rcv_nxt, sp->start_seq)) {
4622             int i;
4623
4624             /* RCV.NXT must cover all the block! */
4625             WARN_ON(before(tp->rcv_nxt, sp->end_seq));
4626
4627             /* Zap this SACK, by moving forward any other SACKS. */
4628             for (i = this_sack+1; i < num_sacks; i++)
4629                 tp->selective_acks[i-1] = tp->selective_acks[i];
4630             num_sacks--;
4631             continue;
4632         }
4633         this_sack++;
4634         sp++;
4635     }
4636     tp->rx_opt.num_sacks = num_sacks;
4637 }
4638
4639 /**
4640  * tcp_try_coalesce - try to merge skb to prior one
4641  * @sk: socket
4642  * @to: prior buffer
4643  * @from: buffer to add in queue
4644  * @fragstolen: pointer to boolean
4645  *
4646  * Before queueing skb @from after @to, try to merge them
4647  * to reduce overall memory use and queue lengths, if cost is small.
4648  * Packets in ofo or receive queues can stay a long time.
4649  * Better try to coalesce them right now to avoid future collapses.
4650  * Returns true if caller should free @from instead of queueing it
4651  */
4652 static bool tcp_try_coalesce(struct sock *sk,
4653                  struct sk_buff *to,
4654                  struct sk_buff *from,
4655                  bool *fragstolen)
4656 {
4657     int delta;
4658
4659     *fragstolen = false;
4660
4661     /* Its possible this segment overlaps with prior segment in queue */
4662     if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq)
4663         return false;
4664
4665     if (!mptcp_skb_can_collapse(to, from))
4666         return false;
4667
4668 #ifdef CONFIG_TLS_DEVICE
4669     if (from->decrypted != to->decrypted)
4670         return false;
4671 #endif
4672
4673     if (!skb_try_coalesce(to, from, fragstolen, &delta))
4674         return false;
4675
4676     atomic_add(delta, &sk->sk_rmem_alloc);
4677     sk_mem_charge(sk, delta);
4678     NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE);
4679     TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq;
4680     TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq;
4681     TCP_SKB_CB(to)->tcp_flags |= TCP_SKB_CB(from)->tcp_flags;
4682
4683     if (TCP_SKB_CB(from)->has_rxtstamp) {
4684         TCP_SKB_CB(to)->has_rxtstamp = true;
4685         to->tstamp = from->tstamp;
4686         skb_hwtstamps(to)->hwtstamp = skb_hwtstamps(from)->hwtstamp;
4687     }
4688
4689     return true;
4690 }
4691
4692 static bool tcp_ooo_try_coalesce(struct sock *sk,
4693                  struct sk_buff *to,
4694                  struct sk_buff *from,
4695                  bool *fragstolen)
4696 {
4697     bool res = tcp_try_coalesce(sk, to, from, fragstolen);
4698
4699     /* In case tcp_drop_reason() is called later, update to->gso_segs */
4700     if (res) {
4701         u32 gso_segs = max_t(u16, 1, skb_shinfo(to)->gso_segs) +
4702                    max_t(u16, 1, skb_shinfo(from)->gso_segs);
4703
4704         skb_shinfo(to)->gso_segs = min_t(u32, gso_segs, 0xFFFF);
4705     }
4706     return res;
4707 }
4708
4709 static void tcp_drop_reason(struct sock *sk, struct sk_buff *skb,
4710                 enum skb_drop_reason reason)
4711 {
4712     sk_drops_add(sk, skb);
4713     kfree_skb_reason(skb, reason);
4714 }
4715
4716 /* This one checks to see if we can put data from the
4717  * out_of_order queue into the receive_queue.
4718  */
4719 static void tcp_ofo_queue(struct sock *sk)
4720 {
4721     struct tcp_sock *tp = tcp_sk(sk);
4722     __u32 dsack_high = tp->rcv_nxt;
4723     bool fin, fragstolen, eaten;
4724     struct sk_buff *skb, *tail;
4725     struct rb_node *p;
4726
4727     p = rb_first(&tp->out_of_order_queue);
4728     while (p) {
4729         skb = rb_to_skb(p);
4730         if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
4731             break;
4732
4733         if (before(TCP_SKB_CB(skb)->seq, dsack_high)) {
4734             __u32 dsack = dsack_high;
4735             if (before(TCP_SKB_CB(skb)->end_seq, dsack_high))
4736                 dsack_high = TCP_SKB_CB(skb)->end_seq;
4737             tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack);
4738         }
4739         p = rb_next(p);
4740         rb_erase(&skb->rbnode, &tp->out_of_order_queue);
4741
4742         if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) {
4743             tcp_drop_reason(sk, skb, SKB_DROP_REASON_TCP_OFO_DROP);
4744             continue;
4745         }
4746
4747         tail = skb_peek_tail(&sk->sk_receive_queue);
4748         eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen);
4749         tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
4750         fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
4751         if (!eaten)
4752             __skb_queue_tail(&sk->sk_receive_queue, skb);
4753         else
4754             kfree_skb_partial(skb, fragstolen);
4755
4756         if (unlikely(fin)) {
4757             tcp_fin(sk);
4758             /* tcp_fin() purges tp->out_of_order_queue,
4759              * so we must end this loop right now.
4760              */
4761             break;
4762         }
4763     }
4764 }
4765
4766 static bool tcp_prune_ofo_queue(struct sock *sk);
4767 static int tcp_prune_queue(struct sock *sk);
4768
4769 static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
4770                  unsigned int size)
4771 {
4772     if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
4773         !sk_rmem_schedule(sk, skb, size)) {
4774
4775         if (tcp_prune_queue(sk) < 0)
4776             return -1;
4777
4778         while (!sk_rmem_schedule(sk, skb, size)) {
4779             if (!tcp_prune_ofo_queue(sk))
4780                 return -1;
4781         }
4782     }
4783     return 0;
4784 }
4785
/* Insert @skb into tp->out_of_order_queue (an rbtree of skbs).
 *
 * The skb is dropped if receive memory cannot be scheduled for it.
 * Otherwise it is either coalesced into an already queued skb, dropped
 * because queued data fully covers it, or linked into the tree, after
 * which queued segments to its right that it covers are removed. SACK and
 * D-SACK state is updated along the way when the flow uses SACK.
 *
 * After the "coalesce_done" and "all bits present" paths @skb is set to
 * NULL, so the final ownership/window handling at "end" is skipped.
 */
static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
{
    struct tcp_sock *tp = tcp_sk(sk);
    struct rb_node **p, *parent;
    struct sk_buff *skb1;
    u32 seq, end_seq;
    bool fragstolen;

    tcp_ecn_check_ce(sk, skb);

    /* No receive memory available for this skb: count and drop it. */
    if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) {
        NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFODROP);
        sk->sk_data_ready(sk);
        tcp_drop_reason(sk, skb, SKB_DROP_REASON_PROTO_MEM);
        return;
    }

    /* Disable header prediction. */
    tp->pred_flags = 0;
    inet_csk_schedule_ack(sk);

    /* Count at least one packet even when gso_segs is 0. */
    tp->rcv_ooopack += max_t(u16, 1, skb_shinfo(skb)->gso_segs);
    NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE);
    seq = TCP_SKB_CB(skb)->seq;
    end_seq = TCP_SKB_CB(skb)->end_seq;

    p = &tp->out_of_order_queue.rb_node;
    if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
        /* Initial out of order segment, build 1 SACK. */
        if (tcp_is_sack(tp)) {
            tp->rx_opt.num_sacks = 1;
            tp->selective_acks[0].start_seq = seq;
            tp->selective_acks[0].end_seq = end_seq;
        }
        rb_link_node(&skb->rbnode, NULL, p);
        rb_insert_color(&skb->rbnode, &tp->out_of_order_queue);
        tp->ooo_last_skb = skb;
        goto end;
    }

    /* In the typical case, we are adding an skb to the end of the list.
     * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup.
     */
    if (tcp_ooo_try_coalesce(sk, tp->ooo_last_skb,
                 skb, &fragstolen)) {
        /* Also reached by goto from the tree walk below when @skb was
         * coalesced into skb1.
         */
coalesce_done:
        /* For non sack flows, do not grow window to force DUPACK
         * and trigger fast retransmit.
         */
        if (tcp_is_sack(tp))
            tcp_grow_window(sk, skb, true);
        kfree_skb_partial(skb, fragstolen);
        skb = NULL;
        goto add_sack;
    }
    /* Can avoid an rbtree lookup if we are adding skb after ooo_last_skb */
    if (!before(seq, TCP_SKB_CB(tp->ooo_last_skb)->end_seq)) {
        parent = &tp->ooo_last_skb->rbnode;
        p = &parent->rb_right;
        goto insert;
    }

    /* Find place to insert this segment. Handle overlaps on the way. */
    parent = NULL;
    while (*p) {
        parent = *p;
        skb1 = rb_to_skb(parent);
        if (before(seq, TCP_SKB_CB(skb1)->seq)) {
            p = &parent->rb_left;
            continue;
        }
        /* Here seq >= skb1's seq; check whether @skb starts inside skb1. */
        if (before(seq, TCP_SKB_CB(skb1)->end_seq)) {
            if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
                /* All the bits are present. Drop. */
                NET_INC_STATS(sock_net(sk),
                          LINUX_MIB_TCPOFOMERGE);
                tcp_drop_reason(sk, skb,
                        SKB_DROP_REASON_TCP_OFOMERGE);
                skb = NULL;
                tcp_dsack_set(sk, seq, end_seq);
                goto add_sack;
            }
            if (after(seq, TCP_SKB_CB(skb1)->seq)) {
                /* Partial overlap. */
                tcp_dsack_set(sk, seq, TCP_SKB_CB(skb1)->end_seq);
            } else {
                /* skb's seq == skb1's seq and skb covers skb1.
                 * Replace skb1 with skb.
                 */
                rb_replace_node(&skb1->rbnode, &skb->rbnode,
                        &tp->out_of_order_queue);
                tcp_dsack_extend(sk,
                         TCP_SKB_CB(skb1)->seq,
                         TCP_SKB_CB(skb1)->end_seq);
                NET_INC_STATS(sock_net(sk),
                          LINUX_MIB_TCPOFOMERGE);
                tcp_drop_reason(sk, skb1,
                        SKB_DROP_REASON_TCP_OFOMERGE);
                goto merge_right;
            }
        } else if (tcp_ooo_try_coalesce(sk, skb1,
                        skb, &fragstolen)) {
            goto coalesce_done;
        }
        p = &parent->rb_right;
    }
insert:
    /* Insert segment into RB tree. */
    rb_link_node(&skb->rbnode, parent, p);
    rb_insert_color(&skb->rbnode, &tp->out_of_order_queue);

merge_right:
    /* Remove other segments covered by skb. */
    while ((skb1 = skb_rb_next(skb)) != NULL) {
        /* Next segment starts at or after our end: no overlap left. */
        if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
            break;
        /* Next segment extends past our end: only report the overlap. */
        if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
            tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
                     end_seq);
            break;
        }
        rb_erase(&skb1->rbnode, &tp->out_of_order_queue);
        tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
                 TCP_SKB_CB(skb1)->end_seq);
        NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
        tcp_drop_reason(sk, skb1, SKB_DROP_REASON_TCP_OFOMERGE);
    }
    /* If there is no skb after us, we are the last_skb ! */
    if (!skb1)
        tp->ooo_last_skb = skb;

add_sack:
    if (tcp_is_sack(tp))
        tcp_sack_new_ofo_skb(sk, seq, end_seq);
end:
    /* @skb is NULL here when it was coalesced or dropped above. */
    if (skb) {
        /* For non sack flows, do not grow window to force DUPACK
         * and trigger fast retransmit.
         */
        if (tcp_is_sack(tp))
            tcp_grow_window(sk, skb, false);
        skb_condense(skb);
        skb_set_owner_r(skb, sk);
    }
}
4931
4932 static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb,
4933                       bool *fragstolen)
4934 {
4935     int eaten;
4936     struct sk_buff *tail = skb_peek_tail(&sk->sk_receive_queue);
4937
4938     eaten = (tail &&
4939          tcp_try_coalesce(sk, tail,
4940                   skb, fragstolen)) ? 1 : 0;
4941     tcp_rcv_nxt_update(tcp_sk(sk), TCP_SKB_CB(skb)->end_seq);
4942     if (!eaten) {
4943         __skb_queue_tail(&sk->sk_receive_queue, skb);
4944         skb_set_owner_r(skb, sk);
4945     }
4946     return eaten;
4947 }
4948
4949 int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size)
4950 {
4951     struct sk_buff *skb;
4952     int err = -ENOMEM;
4953     int data_len = 0;
4954     bool fragstolen;
4955
4956     if (size == 0)
4957         return 0;
4958
4959     if (size > PAGE_SIZE) {
4960         int npages = min_t(size_t, size >> PAGE_SHIFT, MAX_SKB_FRAGS);
4961
4962         data_len = npages << PAGE_SHIFT;
4963         size = data_len + (size & ~PAGE_MASK);
4964     }
4965     skb = alloc_skb_with_frags(size - data_len, data_len,
4966                    PAGE_ALLOC_COSTLY_ORDER,
4967                    &err, sk->sk_allocation);
4968     if (!skb)
4969         goto err;
4970
4971     skb_put(skb, size - data_len);
4972     skb->data_len = data_len;
4973     skb->len = size;
4974
4975     if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) {
4976         NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP);
4977         goto err_free;
4978     }
4979
4980     err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
4981     if (err)
4982         goto err_free;
4983
4984     TCP_SKB_CB(skb)->seq = tcp_sk(sk)->rcv_nxt;
4985     TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + size;
4986     TCP_SKB_CB(skb)->ack_seq = tcp_sk(sk)->snd_una - 1;
4987
4988     if (tcp_queue_rcv(sk, skb, &fragstolen)) {
4989         WARN_ON_ONCE(fragstolen); /* should not happen */
4990         __kfree_skb(skb);
4991     }
4992     return size;
4993
4994 err_free:
4995     kfree_skb(skb);
4996 err:
4997     return err;
4998
4999 }
5000
5001 void tcp_data_ready(struct sock *sk)
5002 {
5003     if (tcp_epollin_ready(sk, sk->sk_rcvlowat) || sock_flag(sk, SOCK_DONE))
5004         sk->sk_data_ready(sk);
5005 }
5006
/* Dispatch the payload of an incoming segment @skb for socket @sk.
 *
 * Depending on where the segment sits relative to tp->rcv_nxt and the
 * receive window it is:
 *  - appended to sk->sk_receive_queue (in sequence, or overlapping rcv_nxt),
 *  - handed to tcp_data_queue_ofo() (ahead of rcv_nxt but inside the window),
 *  - or dropped with a drop reason (old data, zero/over window, no memory).
 *
 * The function consumes @skb on every path.
 */
static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
{
    struct tcp_sock *tp = tcp_sk(sk);
    enum skb_drop_reason reason;
    bool fragstolen;
    int eaten;

    /* If a subflow has been reset, the packet should not continue
     * to be processed, drop the packet.
     */
    if (sk_is_mptcp(sk) && !mptcp_incoming_options(sk, skb)) {
        __kfree_skb(skb);
        return;
    }

    /* Segment covers no sequence space: nothing to queue. */
    if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) {
        __kfree_skb(skb);
        return;
    }
    skb_dst_drop(skb);
    /* Strip the TCP header (doff is in 32-bit words). */
    __skb_pull(skb, tcp_hdr(skb)->doff * 4);

    reason = SKB_DROP_REASON_NOT_SPECIFIED;
    tp->rx_opt.dsack = 0;

    /*  Queue data for delivery to the user.
     *  Packets in sequence go to the receive queue.
     *  Out of sequence packets to the out_of_order_queue.
     */
    if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
        if (tcp_receive_window(tp) == 0) {
            reason = SKB_DROP_REASON_TCP_ZEROWINDOW;
            NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP);
            goto out_of_window;
        }

        /* Ok. In sequence. In window.
         * This label is also the target for segments that partially
         * overlap rcv_nxt (see the end of the function).
         */
queue_and_out:
        /* With an empty receive queue the memory is charged through
         * sk_forced_mem_schedule(), whose outcome is not checked here;
         * otherwise tcp_try_rmem_schedule() may refuse, in which case
         * the reader is woken and the segment is dropped.
         */
        if (skb_queue_len(&sk->sk_receive_queue) == 0)
            sk_forced_mem_schedule(sk, skb->truesize);
        else if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) {
            reason = SKB_DROP_REASON_PROTO_MEM;
            NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP);
            sk->sk_data_ready(sk);
            goto drop;
        }

        eaten = tcp_queue_rcv(sk, skb, &fragstolen);
        if (skb->len)
            tcp_event_data_recv(sk, skb);
        if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
            tcp_fin(sk);

        if (!RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
            tcp_ofo_queue(sk);

            /* RFC5681. 4.2. SHOULD send immediate ACK, when
             * gap in queue is filled.
             */
            if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
                inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
        }

        if (tp->rx_opt.num_sacks)
            tcp_sack_remove(tp);

        tcp_fast_path_check(sk);

        /* tcp_queue_rcv() merged the data into the queue tail, so the
         * skb itself was not queued and is released here.
         */
        if (eaten > 0)
            kfree_skb_partial(skb, fragstolen);
        if (!sock_flag(sk, SOCK_DEAD))
            tcp_data_ready(sk);
        return;
    }

    /* Whole segment lies at or before rcv_nxt: nothing new in it. */
    if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
        tcp_rcv_spurious_retrans(sk, skb);
        /* A retransmit, 2nd most common case.  Force an immediate ack. */
        reason = SKB_DROP_REASON_TCP_OLD_DATA;
        NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
        tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);

        /* Shared exit for unacceptable segments: enter quick-ack mode
         * and schedule an ACK before dropping.
         */
out_of_window:
        tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
        inet_csk_schedule_ack(sk);
        /* Shared exit that drops without touching the ACK state. */
drop:
        tcp_drop_reason(sk, skb, reason);
        return;
    }

    /* Out of window. F.e. zero window probe. */
    if (!before(TCP_SKB_CB(skb)->seq,
            tp->rcv_nxt + tcp_receive_window(tp))) {
        reason = SKB_DROP_REASON_TCP_OVERWINDOW;
        goto out_of_window;
    }

    if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
        /* Partial packet, seq < rcv_next < end_seq */
        tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, tp->rcv_nxt);

        /* If window is closed, drop tail of packet. But after
         * remembering D-SACK for its head made in previous line.
         */
        if (!tcp_receive_window(tp)) {
            reason = SKB_DROP_REASON_TCP_ZEROWINDOW;
            NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP);
            goto out_of_window;
        }
        goto queue_and_out;
    }

    /* Starts after rcv_nxt but inside the window: out-of-order data. */
    tcp_data_queue_ofo(sk, skb);
}
5121
5122 static struct sk_buff *tcp_skb_next(struct sk_buff *skb, struct sk_buff_head *list)
5123 {
5124     if (list)
5125         return !skb_queue_is_last(list, skb) ? skb->next : NULL;
5126
5127     return skb_rb_next(skb);
5128 }
5129
5130 static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb,
5131                     struct sk_buff_head *list,
5132                     struct rb_root *root)
5133 {
5134     struct sk_buff *next = tcp_skb_next(skb, list);
5135
5136     if (list)
5137         __skb_unlink(skb, list);
5138     else
5139         rb_erase(&skb->rbnode, root);
5140
5141     __kfree_skb(skb);
5142     NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED);
5143
5144     return next;
5145 }
5146
5147 /* Insert skb into rb tree, ordered by TCP_SKB_CB(skb)->seq */
5148 void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb)
5149 {
5150     struct rb_node **p = &root->rb_node;
5151     struct rb_node *parent = NULL;
5152     struct sk_buff *skb1;
5153
5154     while (*p) {
5155         parent = *p;
5156         skb1 = rb_to_skb(parent);
5157         if (before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq))
5158             p = &parent->rb_left;
5159         else
5160             p = &parent->rb_right;
5161     }
5162     rb_link_node(&skb->rbnode, parent, p);
5163     rb_insert_color(&skb->rbnode, root);
5164 }
5165
/* Collapse contiguous sequence of skbs head..tail with
 * sequence numbers start..end.
 *
 * If tail is NULL, this means until the end of the queue.
 *
 * Segments with FIN/SYN are not collapsed (only because this
 * simplifies code)
 *
 * The skbs live either on the linked list @list (when non-NULL) or in the
 * rbtree @root (when @list is NULL). The function first skips skbs that
 * are not worth collapsing, then copies the remaining payload into newly
 * allocated skbs, freeing the originals as they are fully consumed.
 * In the rbtree case the new skbs are collected on a private list and only
 * inserted into @root at the very end.
 */
static void
tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct rb_root *root,
         struct sk_buff *head, struct sk_buff *tail, u32 start, u32 end)
{
    struct sk_buff *skb = head, *n;
    struct sk_buff_head tmp;
    bool end_of_skbs;

    /* First, check that queue is collapsible and find
     * the point where collapsing can be useful.
     */
restart:
    for (end_of_skbs = true; skb != NULL && skb != tail; skb = n) {
        n = tcp_skb_next(skb, list);

        /* No new bits? It is possible on ofo queue. */
        if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
            /* Free it and rescan from its successor. */
            skb = tcp_collapse_one(sk, skb, list, root);
            if (!skb)
                break;
            goto restart;
        }

        /* The first skb to collapse is:
         * - not SYN/FIN and
         * - bloated or contains data before "start" or
         *   overlaps to the next one and mptcp allow collapsing.
         */
        if (!(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) &&
            (tcp_win_from_space(sk, skb->truesize) > skb->len ||
             before(TCP_SKB_CB(skb)->seq, start))) {
            end_of_skbs = false;
            break;
        }

        /* This skb does not end exactly where the next one begins. */
        if (n && n != tail && mptcp_skb_can_collapse(skb, n) &&
            TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(n)->seq) {
            end_of_skbs = false;
            break;
        }

        /* Decided to skip this, advance start seq. */
        start = TCP_SKB_CB(skb)->end_seq;
    }
    /* Nothing to collapse, or the candidate carries SYN/FIN. */
    if (end_of_skbs ||
        (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
        return;

    __skb_queue_head_init(&tmp);

    while (before(start, end)) {
        /* Each new skb holds at most SKB_MAX_ORDER(0, 0) bytes. */
        int copy = min_t(int, SKB_MAX_ORDER(0, 0), end - start);
        struct sk_buff *nskb;

        nskb = alloc_skb(copy, GFP_ATOMIC);
        if (!nskb)
            break;

        /* Start from the control block of the skb being copied from,
         * then reset the sequence range to an empty one at "start".
         */
        memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
#ifdef CONFIG_TLS_DEVICE
        nskb->decrypted = skb->decrypted;
#endif
        TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
        if (list)
            __skb_queue_before(list, skb, nskb);
        else
            __skb_queue_tail(&tmp, nskb); /* defer rbtree insertion */
        skb_set_owner_r(nskb, sk);
        mptcp_skb_ext_move(nskb, skb);

        /* Copy data, releasing collapsed skbs. */
        while (copy > 0) {
            int offset = start - TCP_SKB_CB(skb)->seq;
            int size = TCP_SKB_CB(skb)->end_seq - start;

            BUG_ON(offset < 0);
            if (size > 0) {
                size = min(copy, size);
                if (skb_copy_bits(skb, offset, skb_put(nskb, size), size))
                    BUG();
                TCP_SKB_CB(nskb)->end_seq += size;
                copy -= size;
                start += size;
            }
            /* Source skb fully consumed: free it and move on, stopping
             * when the range ends or the next skb may not be merged
             * into nskb.
             */
            if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
                skb = tcp_collapse_one(sk, skb, list, root);
                if (!skb ||
                    skb == tail ||
                    !mptcp_skb_can_collapse(nskb, skb) ||
                    (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
                    goto end;
#ifdef CONFIG_TLS_DEVICE
                if (skb->decrypted != nskb->decrypted)
                    goto end;
#endif
            }
        }
    }
end:
    /* Insert the deferred skbs; tmp stays empty when @list was used. */
    skb_queue_walk_safe(&tmp, skb, n)
        tcp_rbtree_insert(root, skb);
}
5276
/* Collapse ofo queue. Algorithm: select contiguous sequence of skbs
 * and tcp_collapse() them until all the queue is collapsed.
 *
 * Walks tp->out_of_order_queue from its first skb, growing a range
 * [start, end) while successive skbs touch or overlap it. Each finished
 * range is either passed to tcp_collapse() or, if it is a single small
 * skb, only accounted in sum_tiny. The walk gives up early once the tiny
 * skbs seen add up to more than sk_rcvbuf / 8 of truesize.
 * tp->ooo_last_skb is refreshed when the end of the queue is reached.
 */
static void tcp_collapse_ofo_queue(struct sock *sk)
{
    struct tcp_sock *tp = tcp_sk(sk);
    u32 range_truesize, sum_tiny = 0;
    struct sk_buff *skb, *head;
    u32 start, end;

    skb = skb_rb_first(&tp->out_of_order_queue);
new_range:
    if (!skb) {
        /* Whole queue processed; the last skb may have changed. */
        tp->ooo_last_skb = skb_rb_last(&tp->out_of_order_queue);
        return;
    }
    start = TCP_SKB_CB(skb)->seq;
    end = TCP_SKB_CB(skb)->end_seq;
    range_truesize = skb->truesize;

    for (head = skb;;) {
        skb = skb_rb_next(skb);

        /* Range is terminated when we see a gap or when
         * we are at the queue end.
         */
        if (!skb ||
            after(TCP_SKB_CB(skb)->seq, end) ||
            before(TCP_SKB_CB(skb)->end_seq, start)) {
            /* Do not attempt collapsing tiny skbs */
            if (range_truesize != head->truesize ||
                end - start >= SKB_WITH_OVERHEAD(PAGE_SIZE)) {
                tcp_collapse(sk, NULL, &tp->out_of_order_queue,
                         head, skb, start, end);
            } else {
                sum_tiny += range_truesize;
                /* Early return; ooo_last_skb is not updated here. */
                if (sum_tiny > sk->sk_rcvbuf >> 3)
                    return;
            }
            /* skb (possibly NULL) starts the next range. */
            goto new_range;
        }

        /* skb belongs to the current range: extend its bounds. */
        range_truesize += skb->truesize;
        if (unlikely(before(TCP_SKB_CB(skb)->seq, start)))
            start = TCP_SKB_CB(skb)->seq;
        if (after(TCP_SKB_CB(skb)->end_seq, end))
            end = TCP_SKB_CB(skb)->end_seq;
    }
}
5326
5327 /*
5328  * Clean the out-of-order queue to make room.
5329  * We drop high sequences packets to :
5330  * 1) Let a chance for holes to be filled.
5331  * 2) not add too big latencies if thousands of packets sit there.
5332  *    (But if application shrinks SO_RCVBUF, we could still end up
5333  *     freeing whole queue here)
5334  * 3) Drop at least 12.5 % of sk_rcvbuf to avoid malicious attacks.
5335  *
5336  * Return true if queue has shrunk.
5337  */
5338 static bool tcp_prune_ofo_queue(struct sock *sk)
5339 {
5340     struct tcp_sock *tp = tcp_sk(sk);
5341     struct rb_node *node, *prev;
5342     int goal;
5343
5344     if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
5345         return false;
5346
5347     NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED);
5348     goal = sk->sk_rcvbuf >> 3;
5349     node = &tp->ooo_last_skb->rbnode;
5350     do {
5351         prev = rb_prev(node);
5352         rb_erase(node, &tp->out_of_order_queue);
5353         goal -= rb_to_skb(node)->truesize;
5354         tcp_drop_reason(sk, rb_to_skb(node),
5355                 SKB_DROP_REASON_TCP_OFO_QUEUE_PRUNE);
5356         if (!prev || goal <= 0) {
5357             if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
5358                 !tcp_under_memory_pressure(sk))
5359                 break;
5360             goal = sk->sk_rcvbuf >> 3;
5361         }
5362         node = prev;
5363     } while (node);
5364     tp->ooo_last_skb = rb_to_skb(prev);
5365
5366     /* Reset SACK state.  A conforming SACK implementation will
5367      * do the same at a timeout based retransmit.  When a connection
5368      * is in a sad state like this, we care only about integrity
5369      * of the connection not performance.
5370      */
5371     if (tp->rx_opt.sack_ok)
5372         tcp_sack_reset(&tp->rx_opt);
5373     return true;
5374 }
5375
5376 /* Reduce allocated memory if we can, trying to get
5377  * the socket within its memory limits again.
5378  *
5379  * Return less than zero if we should start dropping frames
5380  * until the socket owning process reads some of the data
5381  * to stabilize the situation.
5382  */
5383 static int tcp_prune_queue(struct sock *sk)
5384 {
5385     struct tcp_sock *tp = tcp_sk(sk);
5386
5387     NET_INC_STATS(sock_net(sk), LINUX_MIB_PRUNECALLED);
5388
5389     if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
5390         tcp_clamp_window(sk);
5391     else if (tcp_under_memory_pressure(sk))
5392         tcp_adjust_rcv_ssthresh(sk);
5393
5394     if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
5395         return 0;
5396
5397     tcp_collapse_ofo_queue(sk);
5398     if (!skb_queue_empty(&sk->sk_receive_queue))
5399         tcp_collapse(sk, &sk->sk_receive_queue, NULL,
5400                  skb_peek(&sk->sk_receive_queue),
5401                  NULL,
5402                  tp->copied_seq, tp->rcv_nxt);
5403
5404     if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
5405         return 0;
5406
5407     /* Collapsing did not help, destructive actions follow.
5408      * This must not ever occur. */
5409
5410     tcp_prune_ofo_queue(sk);
5411
5412     if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
5413         return 0;
5414
5415     /* If we are really being abused, tell the caller to silently
5416      * drop receive data on the floor.  It will get retransmitted
5417      * and hopefully then we'll have sufficient space.
5418      */
5419     NET_INC_STATS(sock_net(sk), LINUX_MIB_RCVPRUNED);
5420
5421     /* Massive buffer overcommit. */
5422     tp->pred_flags = 0;
5423     return -1;
5424 }
5425
5426 static bool tcp_should_expand_sndbuf(struct sock *sk)
5427 {
5428     const struct tcp_sock *tp = tcp_sk(sk);
5429
5430     /* If the user specified a specific send buffer setting, do
5431      * not modify it.
5432      */
5433     if (sk->sk_userlocks & SOCK_SNDBUF_LOCK)
5434         return false;
5435
5436     /* If we are under global TCP memory pressure, do not expand.  */
5437     if (tcp_under_memory_pressure(sk)) {
5438         int unused_mem = sk_unused_reserved_mem(sk);
5439
5440         /* Adjust sndbuf according to reserved mem. But make sure
5441          * it never goes below SOCK_MIN_SNDBUF.
5442          * See sk_stream_moderate_sndbuf() for more details.
5443          */
5444         if (unused_mem > SOCK_MIN_SNDBUF)
5445             WRITE_ONCE(sk->sk_sndbuf, unused_mem);
5446
5447         return false;
5448     }
5449
5450     /* If we are under soft global TCP memory pressure, do not expand.  */
5451     if (sk_memory_allocated(sk) >= sk_prot_mem_limits(sk, 0))
5452         return false;
5453
5454     /* If we filled the congestion window, do not expand.  */
5455     if (tcp_packets_in_flight(tp) >= tcp_snd_cwnd(tp))
5456         return false;
5457
5458     return true;
5459 }
5460
5461 static void tcp_new_space(struct sock *sk)
5462 {
5463     struct tcp_sock *tp = tcp_sk(sk);
5464
5465     if (tcp_should_expand_sndbuf(sk)) {
5466         tcp_sndbuf_expand(sk);
5467         tp->snd_cwnd_stamp = tcp_jiffies32;
5468     }
5469
5470     INDIRECT_CALL_1(sk->sk_write_space, sk_stream_write_space, sk);
5471 }
5472
5473 /* Caller made space either from:
5474  * 1) Freeing skbs in rtx queues (after tp->snd_una has advanced)
5475  * 2) Sent skbs from output queue (and thus advancing tp->snd_nxt)
5476  *
5477  * We might be able to generate EPOLLOUT to the application if:
5478  * 1) Space consumed in output/rtx queues is below sk->sk_sndbuf/2
5479  * 2) notsent amount (tp->write_seq - tp->snd_nxt) became
5480  *    small enough that tcp_stream_memory_free() decides it
5481  *    is time to generate EPOLLOUT.
5482  */
5483 void tcp_check_space(struct sock *sk)
5484 {
5485     /* pairs with tcp_poll() */
5486     smp_mb();
5487     if (sk->sk_socket &&
5488         test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
5489         tcp_new_space(sk);
5490         if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
5491             tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
5492     }
5493 }
5494
/* Send-side follow-up: push any pending frames, then let
 * tcp_check_space() report newly available send space.
 */
static inline void tcp_data_snd_check(struct sock *sk)
{
    tcp_push_pending_frames(sk);
    tcp_check_space(sk);
}
5500
/*
 * Check if sending an ack is needed.
 *
 * Decides between sending an ACK right away, arming the delayed-ACK
 * machinery, or compressing the ACK behind tp->compressed_ack_timer.
 *
 * @ofo_possible: when zero, the out-of-order queue is not consulted and
 * anything that does not qualify for an immediate ACK gets a delayed ACK.
 */
static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
{
    struct tcp_sock *tp = tcp_sk(sk);
    unsigned long rtt, delay;

    /* More than one full frame received... */
    if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss &&
         /* ... and right edge of window advances far enough.
          * (tcp_recvmsg() will send ACK otherwise).
          * If application uses SO_RCVLOWAT, we want send ack now if
          * we have not received enough bytes to satisfy the condition.
          */
        (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat ||
         __tcp_select_window(sk) >= tp->rcv_wnd)) ||
        /* We ACK each frame or... */
        tcp_in_quickack_mode(sk) ||
        /* Protocol state mandates a one-time immediate ACK */
        inet_csk(sk)->icsk_ack.pending & ICSK_ACK_NOW) {
        /* Also reached by goto from the SACK-compression logic below. */
send_now:
        tcp_send_ack(sk);
        return;
    }

    if (!ofo_possible || RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
        tcp_send_delayed_ack(sk);
        return;
    }

    /* From here on the out-of-order queue is not empty.  Without SACK,
     * or once enough ACKs have been compressed, ACK immediately.
     */
    if (!tcp_is_sack(tp) ||
        tp->compressed_ack >= READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_nr))
        goto send_now;

    /* rcv_nxt moved since the last compressed ACK: restart the count. */
    if (tp->compressed_ack_rcv_nxt != tp->rcv_nxt) {
        tp->compressed_ack_rcv_nxt = tp->rcv_nxt;
        tp->dup_ack_counter = 0;
    }
    /* The first TCP_FASTRETRANS_THRESH ACKs for a given rcv_nxt are
     * sent uncompressed.
     */
    if (tp->dup_ack_counter < TCP_FASTRETRANS_THRESH) {
        tp->dup_ack_counter++;
        goto send_now;
    }
    tp->compressed_ack++;
    /* Timer already armed: this ACK rides along with it. */
    if (hrtimer_is_queued(&tp->compressed_ack_timer))
        return;

    /* compress ack timer : 5 % of rtt, but no more than tcp_comp_sack_delay_ns */

    /* Use the smaller of the receiver RTT estimate and a non-zero srtt.
     * NOTE(review): the ">> 3" below suggests both values are kept
     * scaled by 8 in usec - confirm against their definitions.
     */
    rtt = tp->rcv_rtt_est.rtt_us;
    if (tp->srtt_us && tp->srtt_us < rtt)
        rtt = tp->srtt_us;

    delay = min_t(unsigned long,
              READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_delay_ns),
              rtt * (NSEC_PER_USEC >> 3)/20);
    /* Take a socket reference before arming the timer.
     * NOTE(review): presumably released by the timer handler - confirm.
     */
    sock_hold(sk);
    hrtimer_start_range_ns(&tp->compressed_ack_timer, ns_to_ktime(delay),
                   READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_slack_ns),
                   HRTIMER_MODE_REL_PINNED_SOFT);
}
5562
/* Run the ACK decision logic, with out-of-order handling enabled, if an
 * ACK is currently scheduled on @sk.
 */
static inline void tcp_ack_snd_check(struct sock *sk)
{
    /* No ACK scheduled means we sent a data segment already. */
    if (inet_csk_ack_scheduled(sk))
        __tcp_ack_snd_check(sk, 1);
}
5571
/*
 *  This routine is only called when we have urgent data
 *  signaled. It's the 'slow' part of tcp_urg. It could be
 *  moved inline now as tcp_urg is only called from one
 *  place. We handle URGent data wrong. We have to - as
 *  BSD still doesn't use the correction from RFC961.
 *  For 1003.1g we should support a new option TCP_STDURG to permit
 *  either form (or just set the sysctl tcp_stdurg).
 *
 *  Computes the absolute sequence number the urgent pointer in @th
 *  refers to and, if it is new, signals SIGURG and records it in
 *  tp->urg_seq with tp->urg_data set to TCP_URG_NOTYET.
 */

static void tcp_check_urg(struct sock *sk, const struct tcphdr *th)
{
    struct tcp_sock *tp = tcp_sk(sk);
    u32 ptr = ntohs(th->urg_ptr);

    /* Unless the tcp_stdurg sysctl is set, a non-zero pointer is
     * moved back by one.
     */
    if (ptr && !READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_stdurg))
        ptr--;
    /* Turn the segment-relative offset into a sequence number. */
    ptr += ntohl(th->seq);

    /* Ignore urgent data that we've already seen and read. */
    if (after(tp->copied_seq, ptr))
        return;

    /* Do not replay urg ptr.
     *
     * NOTE: interesting situation not covered by specs.
     * A misbehaving sender may send an urg ptr pointing to a segment
     * which we already have in the ofo queue. We are not able to fetch
     * such data and will stay in TCP_URG_NOTYET until it is eaten
     * by recvmsg(). It seems we are not obliged to handle such wicked
     * situations. But it is worth thinking about the possibility of
     * DoSes using some hypothetical application level deadlock.
     */
    if (before(ptr, tp->rcv_nxt))
        return;

    /* Do we already have a newer (or duplicate) urgent pointer? */
    if (tp->urg_data && !after(ptr, tp->urg_seq))
        return;

    /* Tell the world about our new urgent pointer. */
    sk_send_sigurg(sk);

    /* We may be adding urgent data when the last byte read was
     * urgent. To do this requires some care. We cannot just ignore
     * tp->copied_seq since we would read the last urgent byte again
     * as data, nor can we alter copied_seq until this data arrives
     * or we break the semantics of SIOCATMARK (and thus sockatmark())
     *
     * NOTE. Double Dutch. Rendering to plain English: the author of the
     * comment above did something like send("A", MSG_OOB); send("B", MSG_OOB);
     * and expected both A and B to disappear from the stream. This is _wrong_.
     * Though this happens in BSD with high probability, it is accidental.
     * Any application relying on this is buggy. Note also that the fix "works"
     * only in this artificial test. Insert some normal data between A and B
     * and we will deviate from BSD again. Verdict: it is better to remove it
     * to trap buggy users.
     */
    if (tp->urg_seq == tp->copied_seq && tp->urg_data &&
        !sock_flag(sk, SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) {
        struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
        /* Step over the previous urgent byte and free the head skb
         * if that leaves nothing unread in it.
         */
        tp->copied_seq++;
        if (skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)) {
            __skb_unlink(skb, &sk->sk_receive_queue);
            __kfree_skb(skb);
        }
    }

    WRITE_ONCE(tp->urg_data, TCP_URG_NOTYET);
    WRITE_ONCE(tp->urg_seq, ptr);

    /* Disable header prediction. */
    tp->pred_flags = 0;
}
5646
5647 /* This is the 'fast' part of urgent handling. */
5648 static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th)
5649 {
5650     struct tcp_sock *tp = tcp_sk(sk);
5651
5652     /* Check if we get a new urgent pointer - normally not. */
5653     if (unlikely(th->urg))
5654         tcp_check_urg(sk, th);
5655
5656     /* Do we wait for any urgent data? - normally not... */
5657     if (unlikely(tp->urg_data == TCP_URG_NOTYET)) {
5658         u32 ptr = tp->urg_seq - ntohl(th->seq) + (th->doff * 4) -
5659               th->syn;
5660
5661         /* Is the urgent pointer pointing into this packet? */
5662         if (ptr < skb->len) {
5663             u8 tmp;
5664             if (skb_copy_bits(skb, ptr, &tmp, 1))
5665                 BUG();
5666             WRITE_ONCE(tp->urg_data, TCP_URG_VALID | tmp);
5667             if (!sock_flag(sk, SOCK_DEAD))
5668                 sk->sk_data_ready(sk);
5669         }
5670     }
5671 }
5672
5673 /* Accept RST for rcv_nxt - 1 after a FIN.
5674  * When tcp connections are abruptly terminated from Mac OSX (via ^C), a
5675  * FIN is sent followed by a RST packet. The RST is sent with the same
5676  * sequence number as the FIN, and thus according to RFC 5961 a challenge
5677  * ACK should be sent. However, Mac OSX rate limits replies to challenge
5678  * ACKs on the closed socket. In addition middleboxes can drop either the
5679  * challenge ACK or a subsequent RST.
5680  */
5681 static bool tcp_reset_check(const struct sock *sk, const struct sk_buff *skb)
5682 {
5683     struct tcp_sock *tp = tcp_sk(sk);
5684
5685     return unlikely(TCP_SKB_CB(skb)->seq == (tp->rcv_nxt - 1) &&
5686             (1 << sk->sk_state) & (TCPF_CLOSE_WAIT | TCPF_LAST_ACK |
5687                            TCPF_CLOSING));
5688 }
5689
/* Does PAWS and seqno based validation of an incoming segment, flags will
 * play significant role here.
 *
 * @syn_inerr: when non-zero, a challenged SYN is also counted in
 * TCP_MIB_INERRS.
 *
 * Returns true when the caller should go on processing @skb.
 * Returns false when @skb has been consumed here: either dropped with a
 * drop reason (possibly after a dupack or challenge ACK was sent), or
 * used to reset the connection.
 */
static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
                  const struct tcphdr *th, int syn_inerr)
{
    struct tcp_sock *tp = tcp_sk(sk);
    SKB_DR(reason);

    /* RFC1323: H1. Apply PAWS check first. */
    if (tcp_fast_parse_options(sock_net(sk), skb, th, tp) &&
        tp->rx_opt.saw_tstamp &&
        tcp_paws_discard(sk, skb)) {
        if (!th->rst) {
            NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
            /* The dupack is skipped when rate limited. */
            if (!tcp_oow_rate_limited(sock_net(sk), skb,
                          LINUX_MIB_TCPACKSKIPPEDPAWS,
                          &tp->last_oow_ack_time))
                tcp_send_dupack(sk, skb);
            SKB_DR_SET(reason, TCP_RFC7323_PAWS);
            goto discard;
        }
        /* Reset is accepted even if it did not pass PAWS. */
    }

    /* Step 1: check sequence number */
    if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) {
        /* RFC793, page 37: "In all states except SYN-SENT, all reset
         * (RST) segments are validated by checking their SEQ-fields."
         * And page 69: "If an incoming segment is not acceptable,
         * an acknowledgment should be sent in reply (unless the RST
         * bit is set, if so drop the segment and return)".
         */
        if (!th->rst) {
            if (th->syn)
                goto syn_challenge;
            if (!tcp_oow_rate_limited(sock_net(sk), skb,
                          LINUX_MIB_TCPACKSKIPPEDSEQ,
                          &tp->last_oow_ack_time))
                tcp_send_dupack(sk, skb);
        } else if (tcp_reset_check(sk, skb)) {
            /* RST at rcv_nxt - 1 after a FIN is still honoured. */
            goto reset;
        }
        SKB_DR_SET(reason, TCP_INVALID_SEQUENCE);
        goto discard;
    }

    /* Step 2: check RST bit */
    if (th->rst) {
        /* RFC 5961 3.2 (extend to match against (RCV.NXT - 1) after a
         * FIN and SACK too if available):
         * If seq num matches RCV.NXT or (RCV.NXT - 1) after a FIN, or
         * the right-most SACK block,
         * then
         *     RESET the connection
         * else
         *     Send a challenge ACK
         */
        if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt ||
            tcp_reset_check(sk, skb))
            goto reset;

        if (tcp_is_sack(tp) && tp->rx_opt.num_sacks > 0) {
            struct tcp_sack_block *sp = &tp->selective_acks[0];
            int max_sack = sp[0].end_seq;
            int this_sack;

            /* Find the highest end_seq among our SACK blocks. */
            for (this_sack = 1; this_sack < tp->rx_opt.num_sacks;
                 ++this_sack) {
                max_sack = after(sp[this_sack].end_seq,
                         max_sack) ?
                    sp[this_sack].end_seq : max_sack;
            }

            if (TCP_SKB_CB(skb)->seq == max_sack)
                goto reset;
        }

        /* Disable TFO if RST is out-of-order
         * and no data has been received
         * for current active TFO socket
         */
        if (tp->syn_fastopen && !tp->data_segs_in &&
            sk->sk_state == TCP_ESTABLISHED)
            tcp_fastopen_active_disable(sk);
        tcp_send_challenge_ack(sk);
        SKB_DR_SET(reason, TCP_RESET);
        goto discard;
    }

    /* step 3: check security and precedence [ignored] */

    /* step 4: Check for a SYN
     * RFC 5961 4.2 : Send a challenge ack
     */
    if (th->syn) {
        /* Also reached from step 1 for an out-of-sequence SYN. */
syn_challenge:
        if (syn_inerr)
            TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
        NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE);
        tcp_send_challenge_ack(sk);
        SKB_DR_SET(reason, TCP_INVALID_SYN);
        goto discard;
    }

    bpf_skops_parse_hdr(sk, skb);

    return true;

    /* Drop the segment, recording why. */
discard:
    tcp_drop_reason(sk, skb, reason);
    return false;

    /* Valid RST: reset the connection and free the segment. */
reset:
    tcp_reset(sk, skb);
    __kfree_skb(skb);
    return false;
}
5808
5809 /*
5810  *  TCP receive function for the ESTABLISHED state.
5811  *
5812  *  It is split into a fast path and a slow path. The fast path is
5813  *  disabled when:
5814  *  - A zero window was announced from us - zero window probing
5815  *        is only handled properly in the slow path.
5816  *  - Out of order segments arrived.
5817  *  - Urgent data is expected.
5818  *  - There is no buffer space left
5819  *  - Unexpected TCP flags/window values/header lengths are received
5820  *    (detected by checking the TCP header against pred_flags)
5821  *  - Data is sent in both directions. Fast path only supports pure senders
5822  *    or pure receivers (this means either the sequence number or the ack
5823  *    value must stay constant)
5824  *  - Unexpected TCP option.
5825  *
5826  *  When these conditions are not satisfied it drops into a standard
5827  *  receive procedure patterned after RFC793 to handle all cases.
5828  *  The first three cases are guaranteed by proper pred_flags setting,
5829  *  the rest is checked inline. Fast processing is turned on in
5830  *  tcp_data_queue when everything is OK.
      *
      *  @sk:  socket the segment was received on.
      *  @skb: the received segment; skb->data is read as the TCP header.
      *
      *  Nothing is returned. On every path below the skb is either
      *  freed, dropped with a reason, or handed to a helper
      *  (tcp_queue_rcv(), tcp_data_queue(), tcp_validate_incoming()).
5831  */
5832 void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
5833 {
5834     enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED;
5835     const struct tcphdr *th = (const struct tcphdr *)skb->data;
5836     struct tcp_sock *tp = tcp_sk(sk);
5837     unsigned int len = skb->len;
5838
5839     /* TCP congestion window tracking */
5840     trace_tcp_probe(sk, skb);
5841
5842     tcp_mstamp_refresh(tp);
         /* sk_rx_dst is unset: let the address-family ops set it from skb. */
5843     if (unlikely(!rcu_access_pointer(sk->sk_rx_dst)))
5844         inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb);
5845     /*
5846      *  Header prediction.
5847      *  The code loosely follows the one in the famous
5848      *  "30 instruction TCP receive" Van Jacobson mail.
5849      *
5850      *  Van's trick is to deposit buffers into socket queue
5851      *  on a device interrupt, to call tcp_recv function
5852      *  on the receive process context and checksum and copy
5853      *  the buffer to user space. smart...
5854      *
5855      *  Our current scheme is not silly either but we take the
5856      *  extra cost of the net_bh soft interrupt processing...
5857      *  We do checksum and copy also but from device to kernel.
5858      */
5859
5860     tp->rx_opt.saw_tstamp = 0;
5861
5862     /*  pred_flags is 0xS?10 << 16 + snd_wnd
5863      *  if header_prediction is to be made
5864      *  'S' will always be tp->tcp_header_len >> 2
5865      *  '?' will be 0 for the fast path, otherwise pred_flags is 0 to
5866      *  turn it off (when there are holes in the receive
5867      *   space for instance)
5868      *  PSH flag is ignored.
5869      */
5870
         /* Fast path only if all three hold: the flag word masked with
          * TCP_HP_BITS equals pred_flags, the segment starts exactly at
          * rcv_nxt, and its ack field is not beyond snd_nxt.
          */
5871     if ((tcp_flag_word(th) & TCP_HP_BITS) == tp->pred_flags &&
5872         TCP_SKB_CB(skb)->seq == tp->rcv_nxt &&
5873         !after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) {
5874         int tcp_header_len = tp->tcp_header_len;
5875
5876         /* Timestamp header prediction: tcp_header_len
5877          * is automatically equal to th->doff*4 due to pred_flags
5878          * match.
5879          */
5880
5881         /* Check timestamp */
5882         if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) {
5883             /* No? Slow path! */
5884             if (!tcp_parse_aligned_timestamp(tp, th))
5885                 goto slow_path;
5886
5887             /* If PAWS failed, check it more carefully in slow path */
5888             if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) < 0)
5889                 goto slow_path;
5890
5891             /* DO NOT update ts_recent here, if checksum fails
5892              * and timestamp was corrupted part, it will result
5893              * in a hung connection since we will drop all
5894              * future packets due to the PAWS test.
5895              */
5896         }
5897
             /* Nothing beyond the header: either a pure ACK (len equals the
              * header length) or a segment shorter than its header.
              */
5898         if (len <= tcp_header_len) {
5899             /* Bulk data transfer: sender */
5900             if (len == tcp_header_len) {
5901                 /* Predicted packet is in window by definition.
5902                  * seq == rcv_nxt and rcv_wup <= rcv_nxt.
5903                  * Hence, check seq<=rcv_wup reduces to:
5904                  */
5905                 if (tcp_header_len ==
5906                     (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
5907                     tp->rcv_nxt == tp->rcv_wup)
5908                     tcp_store_ts_recent(tp);
5909
5910                 /* We know that such packets are checksummed
5911                  * on entry.
5912                  */
5913                 tcp_ack(sk, skb, 0);
5914                 __kfree_skb(skb);
5915                 tcp_data_snd_check(sk);
5916                 /* When receiving pure ack in fast path, update
5917                  * last ts ecr directly instead of calling
5918                  * tcp_rcv_rtt_measure_ts()
5919                  */
5920                 tp->rcv_rtt_last_tsecr = tp->rx_opt.rcv_tsecr;
5921                 return;
5922             } else { /* Header too small */
5923                 reason = SKB_DROP_REASON_PKT_TOO_SMALL;
5924                 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
5925                 goto discard;
5926             }
5927         } else {
5928             int eaten = 0;
5929             bool fragstolen = false;
5930
5931             if (tcp_checksum_complete(skb))
5932                 goto csum_error;
5933
                 /* skb->truesize exceeds sk_forward_alloc: leave the fast
                  * path and continue at step5, i.e. without calling
                  * tcp_validate_incoming().
                  */
5934             if ((int)skb->truesize > sk->sk_forward_alloc)
5935                 goto step5;
5936
5937             /* Predicted packet is in window by definition.
5938              * seq == rcv_nxt and rcv_wup <= rcv_nxt.
5939              * Hence, check seq<=rcv_wup reduces to:
5940              */
5941             if (tcp_header_len ==
5942                 (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
5943                 tp->rcv_nxt == tp->rcv_wup)
5944                 tcp_store_ts_recent(tp);
5945
5946             tcp_rcv_rtt_measure_ts(sk, skb);
5947
5948             NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPHITS);
5949
5950             /* Bulk data transfer: receiver */
5951             skb_dst_drop(skb);
5952             __skb_pull(skb, tcp_header_len);
5953             eaten = tcp_queue_rcv(sk, skb, &fragstolen);
5954
5955             tcp_event_data_recv(sk, skb);
5956
                 /* tcp_ack() is only run when the ack field differs from
                  * snd_una; if no ACK is scheduled afterwards,
                  * __tcp_ack_snd_check() is skipped via no_ack.
                  */
5957             if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) {
5958                 /* Well, only one small jumplet in fast path... */
5959                 tcp_ack(sk, skb, FLAG_DATA);
5960                 tcp_data_snd_check(sk);
5961                 if (!inet_csk_ack_scheduled(sk))
5962                     goto no_ack;
5963             } else {
5964                 tcp_update_wl(tp, TCP_SKB_CB(skb)->seq);
5965             }
5966
5967             __tcp_ack_snd_check(sk, 0);
5968 no_ack:
                 /* NOTE(review): a non-zero 'eaten' presumably means
                  * tcp_queue_rcv() merged the payload into an skb already
                  * queued, so this one can be released - confirm.
                  */
5969             if (eaten)
5970                 kfree_skb_partial(skb, fragstolen);
5971             tcp_data_ready(sk);
5972             return;
5973         }
5974     }
5975
5976 slow_path:
         /* A segment shorter than its own data offset is accounted exactly
          * like a checksum failure (both jump to csum_error).
          */
5977     if (len < (th->doff << 2) || tcp_checksum_complete(skb))
5978         goto csum_error;
5979
5980     if (!th->ack && !th->rst && !th->syn) {
5981         reason = SKB_DROP_REASON_TCP_FLAGS;
5982         goto discard;
5983     }
5984
5985     /*
5986      *  Standard slow path.
5987      */
5988
5989     if (!tcp_validate_incoming(sk, skb, th, 1))
5990         return;
5991
5992 step5:
         /* A negative tcp_ack() result means "drop"; its magnitude is
          * used as the drop reason.
          */
5993     reason = tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT);
5994     if ((int)reason < 0) {
5995         reason = -reason;
5996         goto discard;
5997     }
5998     tcp_rcv_rtt_measure_ts(sk, skb);
5999
6000     /* Process urgent data. */
6001     tcp_urg(sk, skb, th);
6002
6003     /* step 7: process the segment text */
6004     tcp_data_queue(sk, skb);
6005
6006     tcp_data_snd_check(sk);
6007     tcp_ack_snd_check(sk);
6008     return;
6009
6010 csum_error:
6011     reason = SKB_DROP_REASON_TCP_CSUM;
6012     trace_tcp_bad_csum(skb);
6013     TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
6014     TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
         /* falls through to discard */
6015
6016 discard:
6017     tcp_drop_reason(sk, skb, reason);
6018 }
6019 EXPORT_SYMBOL(tcp_rcv_established);
6020
/* Set up transfer state for a connection that has just been established:
 * MTU probing, header rebuild, metrics, the initial congestion window,
 * the BPF "established" hook for @bpf_op, congestion control and buffer
 * space.  @skb is only passed on to bpf_skops_established().
 */
6021 void tcp_init_transfer(struct sock *sk, int bpf_op, struct sk_buff *skb)
6022 {
6023     struct inet_connection_sock *icsk = inet_csk(sk);
6024     struct tcp_sock *tp = tcp_sk(sk);
6025     u32 start_cwnd = 1;
6026
6027     tcp_mtup_init(sk);
6028     icsk->icsk_af_ops->rebuild_header(sk);
6029     tcp_init_metrics(sk);
6030
6031     /* RFC5681 asks for a cwnd of 1 once the SYN or SYN-ACK had to be
6032      * retransmitted.  Because the RFC6298 initRTO of 1sec is more
6033      * aggressive, that cut is only applied after more than one
6034      * SYN/SYN-ACK retransmission; otherwise the window comes from
6035      * tcp_init_cwnd() for the socket's dst.
6036      */
6037     if (tp->total_retrans <= 1 || !tp->undo_marker)
6038         start_cwnd = tcp_init_cwnd(tp, __sk_dst_get(sk));
6039     tcp_snd_cwnd_set(tp, start_cwnd);
6040     tp->snd_cwnd_stamp = tcp_jiffies32;
6041
6042     bpf_skops_established(sk, bpf_op, skb);
6043     /* BPF may have initialized congestion control already. */
6044     if (!icsk->icsk_ca_initialized)
6045         tcp_init_congestion_control(sk);
6046     tcp_init_buffer_space(sk);
6047 }
6048
/* Move the socket to ESTABLISHED and initialize the transfer with the
 * BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB hook.  @skb, when non-NULL, is used
 * for the rx dst, the security hook and the NAPI id.
 */
6049 void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
6050 {
6051     struct inet_connection_sock *icsk = inet_csk(sk);
6052     struct tcp_sock *tp = tcp_sk(sk);
6053
6054     tcp_set_state(sk, TCP_ESTABLISHED);
6055     icsk->icsk_ack.lrcvtime = tcp_jiffies32;
6056
6057     /* The per-packet hooks only run when a segment was supplied. */
6058     if (skb) {
6059         icsk->icsk_af_ops->sk_rx_dst_set(sk, skb);
6060         security_inet_conn_established(sk, skb);
6061         sk_mark_napi_id(sk, skb);
6062     }
6063     tcp_init_transfer(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB, skb);
6064
6065     /* Stamp the last-send time now so that the first data packet
6066      * does not cause a spurious tcp_cwnd_restart().
6067      */
6068     tp->lsndtime = tcp_jiffies32;
6069
6070     if (sock_flag(sk, SOCK_KEEPOPEN))
6071         inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp));
6072     /* Header prediction is switched on only when snd_wscale is zero. */
6073     if (tp->rx_opt.snd_wscale)
6074         tp->pred_flags = 0;
6075     else
6076         __tcp_fast_path_on(tp, tp->snd_wnd);
6077 }
6078
/* Evaluate the SYN-ACK that answers a Fast Open attempt.
 *
 * Updates the Fast Open cache through tcp_fastopen_cache_set() with the
 * MSS, the cookie and the syn_drop/try_exp hints computed below.
 *
 * Returns true when SYN data was still sitting unacked at the head of
 * the rtx queue and was retransmitted here, false otherwise (in which
 * case @synack is passed to tcp_fastopen_add_skb()).
 */
6079 static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
6080                     struct tcp_fastopen_cookie *cookie)
6081 {
6082     struct tcp_sock *tp = tcp_sk(sk);
6083     struct sk_buff *data = NULL;
6084     u16 mss = tp->rx_opt.mss_clamp;
6085     bool syn_drop = false;
6086     u16 try_exp = 0;
6087
6088     if (tp->syn_data)
6089         data = tcp_rtx_queue_head(sk);
6090     if (mss == tp->rx_opt.user_mss) {
6091         struct tcp_options_received opt;
6092
6093         /* mss_clamp came from the user MSS: re-read the SYNACK's own MSS */
6094         tcp_clear_options(&opt);
6095         opt.user_mss = opt.mss_clamp = 0;
6096         tcp_parse_options(sock_net(sk), synack, &opt, 0, NULL);
6097         mss = opt.mss_clamp;
6098     }
6099
6100     if (!tp->syn_fastopen) {
6101         cookie->len = -1;   /* unsolicited cookie: ignore it */
6102     } else if (tp->total_retrans) {
6103         /* The SYN timed out, and the SYN-ACK neither carries a cookie
6104          * nor acks the data.  Presumably only the retransmitted
6105          * (regular) SYNs reached the remote, i.e. the original
6106          * SYN-data or its SYN-ACK was dropped.
6107          */
6108         syn_drop = cookie->len < 0 && data;
6109     } else if (cookie->len < 0 && !tp->syn_data) {
6110         /* A cookie was requested but none came back.  Next time use
6111          * the (old) exp option format if we did not this time
6112          * (try_exp=1), else return to the RFC7413 option (try_exp=2).
6113          */
6114         if (tp->syn_fastopen_exp)
6115             try_exp = 2;
6116         else
6117             try_exp = 1;
6118     }
6119     tcp_fastopen_cache_set(sk, mss, cookie, syn_drop, try_exp);
6120
6121     if (!data) {
6122         tp->syn_data_acked = tp->syn_data;
6123         if (tp->syn_data_acked) {
6124             NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE);
6125             /* tcp_ack() counts SYN-data as two separate packets */
6126             if (tp->delivered > 1)
6127                 --tp->delivered;
6128         }
6129         tcp_fastopen_add_skb(sk, synack);
6130         return false;
6131     }
6132     /* Unacked data from the SYN: mark it lost and retransmit it. */
6133     tp->fastopen_client_fail = tp->total_retrans ? TFO_SYN_RETRANSMITTED :
6134                                TFO_DATA_NOT_ACKED;
6135     skb_rbtree_walk_from(data)
6136         tcp_mark_skb_lost(sk, data);
6137     tcp_xmit_retransmit_queue(sk);
6138     NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVEFAIL);
6139     return true;
6140 }
6141
/* Clear tp->syn_smc when it is set but the received options did not set
 * rx_opt.smc_ok.  Does nothing unless CONFIG_SMC is enabled and the
 * tcp_have_smc static branch is on.
 */
6142 static void smc_check_reset_syn(struct tcp_sock *tp)
6143 {
6144 #if IS_ENABLED(CONFIG_SMC)
6145     if (!static_branch_unlikely(&tcp_have_smc))
6146         return;
6147     if (tp->syn_smc && !tp->rx_opt.smc_ok)
6148         tp->syn_smc = 0;
6149 #endif
6150 }
6151
/* Drop the undo marker left by a SYN/SYNACK timeout when the received
 * timestamp echo shows that the timeout was spurious.
 */
6152 static void tcp_try_undo_spurious_syn(struct sock *sk)
6153 {
6154     struct tcp_sock *tp = tcp_sk(sk);
6155     u32 orig_syn_ts = tp->retrans_stamp;
6156
6157     /* undo_marker gets set when a SYN or SYNACK times out. */
6158     if (!tp->undo_marker || !orig_syn_ts)
6159         return;
6160     /* The timeout was spurious if the ACK's timestamp option echoes
6161      * the original SYN timestamp.
6162      */
6163     if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr == orig_syn_ts)
6164         tp->undo_marker = 0;
6165 }
6166
/* Process a segment received while the socket is in SYN-SENT.
 *
 * @sk:  the connecting socket.
 * @skb: the received segment.
 * @th:  TCP header of @skb.
 *
 * Return:
 *   0 - the segment was consumed or dropped here;
 *   1 - the ACK or its echoed timestamp was not acceptable: the received
 *       options were cleared again (reset_and_undo) and the skb was NOT
 *       freed here.  tcp_rcv_state_process() in this file hands any
 *       value >= 0 straight back to its own caller;
 *  -1 - the connection was established and the skb was not freed here;
 *       tcp_rcv_state_process() then does "step6 onward by hand"
 *       (tcp_urg(), frees the skb, tcp_data_snd_check()).
 */
6167 static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
6168                      const struct tcphdr *th)
6169 {
6170     struct inet_connection_sock *icsk = inet_csk(sk);
6171     struct tcp_sock *tp = tcp_sk(sk);
6172     struct tcp_fastopen_cookie foc = { .len = -1 };
6173     int saved_clamp = tp->rx_opt.mss_clamp;
6174     bool fastopen_fail;
6175     SKB_DR(reason);
6176
         /* Options are parsed straight into tp->rx_opt; the *_and_undo exits
          * clear them again and put saved_clamp back into mss_clamp.
          */
6177     tcp_parse_options(sock_net(sk), skb, &tp->rx_opt, 0, &foc);
         /* Remove our tsoffset from a non-zero echoed timestamp before it is
          * compared further down.
          */
6178     if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
6179         tp->rx_opt.rcv_tsecr -= tp->tsoffset;
6180
6181     if (th->ack) {
6182         /* rfc793:
6183          * "If the state is SYN-SENT then
6184          *    first check the ACK bit
6185          *      If the ACK bit is set
6186          *    If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send
6187          *        a reset (unless the RST bit is set, if so drop
6188          *        the segment and return)"
6189          */
             /* NOTE(review): the lower bound tested here is snd_una, where
              * the RFC text above says ISS.
              */
6190         if (!after(TCP_SKB_CB(skb)->ack_seq, tp->snd_una) ||
6191             after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) {
6192             /* Previous FIN/ACK or RST/ACK might be ignored. */
6193             if (icsk->icsk_retransmits == 0)
6194                 inet_csk_reset_xmit_timer(sk,
6195                         ICSK_TIME_RETRANS,
6196                         TCP_TIMEOUT_MIN, TCP_RTO_MAX);
6197             goto reset_and_undo;
6198         }
6199
             /* A non-zero echoed timestamp must lie between retrans_stamp
              * and the current tcp_time_stamp(); otherwise the segment is
              * counted as PAWSACTIVEREJECTED and refused.
              */
6200         if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
6201             !between(tp->rx_opt.rcv_tsecr, tp->retrans_stamp,
6202                  tcp_time_stamp(tp))) {
6203             NET_INC_STATS(sock_net(sk),
6204                     LINUX_MIB_PAWSACTIVEREJECTED);
6205             goto reset_and_undo;
6206         }
6207
6208         /* Now ACK is acceptable.
6209          *
6210          * "If the RST bit is set
6211          *    If the ACK was acceptable then signal the user "error:
6212          *    connection reset", drop the segment, enter CLOSED state,
6213          *    delete TCB, and return."
6214          */
6215
6216         if (th->rst) {
6217             tcp_reset(sk, skb);
                 /* 'consume' is also the target of later gotos in this
                  * function that free the skb and return 0.
                  */
6218 consume:
6219             __kfree_skb(skb);
6220             return 0;
6221         }
6222
6223         /* rfc793:
6224          *   "fifth, if neither of the SYN or RST bits is set then
6225          *    drop the segment and return."
6226          *
6227          *    See note below!
6228          *                                        --ANK(990513)
6229          */
6230         if (!th->syn) {
6231             SKB_DR_SET(reason, TCP_FLAGS);
6232             goto discard_and_undo;
6233         }
6234         /* rfc793:
6235          *   "If the SYN bit is on ...
6236          *    are acceptable then ...
6237          *    (our SYN has been ACKed), change the connection
6238          *    state to ESTABLISHED..."
6239          */
6240
6241         tcp_ecn_rcv_synack(tp, th);
6242
6243         tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
6244         tcp_try_undo_spurious_syn(sk);
6245         tcp_ack(sk, skb, FLAG_SLOWPATH);
6246
6247         /* Ok.. it's good. Set up sequence numbers and
6248          * move to established.
6249          */
6250         WRITE_ONCE(tp->rcv_nxt, TCP_SKB_CB(skb)->seq + 1);
6251         tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
6252
6253         /* RFC1323: The window in SYN & SYN/ACK segments is
6254          * never scaled.
6255          */
6256         tp->snd_wnd = ntohs(th->window);
6257
             /* Without window scaling both scale factors are zero and the
              * window clamp is limited to 65535.
              */
6258         if (!tp->rx_opt.wscale_ok) {
6259             tp->rx_opt.snd_wscale = tp->rx_opt.rcv_wscale = 0;
6260             tp->window_clamp = min(tp->window_clamp, 65535U);
6261         }
6262
             /* With timestamps in use the header grows by the aligned
              * timestamp option and advmss shrinks by the same amount.
              */
6263         if (tp->rx_opt.saw_tstamp) {
6264             tp->rx_opt.tstamp_ok       = 1;
6265             tp->tcp_header_len =
6266                 sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
6267             tp->advmss      -= TCPOLEN_TSTAMP_ALIGNED;
6268             tcp_store_ts_recent(tp);
6269         } else {
6270             tp->tcp_header_len = sizeof(struct tcphdr);
6271         }
6272
6273         tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
6274         tcp_initialize_rcv_mss(sk);
6275
6276         /* Remember, tcp_poll() does not lock socket!
6277          * Change state from SYN-SENT only after copied_seq
6278          * is initialized. */
6279         WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
6280
6281         smc_check_reset_syn(tp);
6282
6283         smp_mb();
6284
6285         tcp_finish_connect(sk, skb);
6286
             /* tcp_rcv_fastopen_synack() is only called when a Fast Open
              * request or SYN data was sent with our SYN.
              */
6287         fastopen_fail = (tp->syn_fastopen || tp->syn_data) &&
6288                 tcp_rcv_fastopen_synack(sk, skb, &foc);
6289
6290         if (!sock_flag(sk, SOCK_DEAD)) {
6291             sk->sk_state_change(sk);
6292             sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
6293         }
6294         if (fastopen_fail)
6295             return -1;
6296         if (sk->sk_write_pending ||
6297             icsk->icsk_accept_queue.rskq_defer_accept ||
6298             inet_csk_in_pingpong_mode(sk)) {
6299             /* Save one ACK. Data will be ready after
6300              * several ticks, if write_pending is set.
6301              *
6302              * It may be deleted, but with this feature tcpdumps
6303              * look so _wonderfully_ clever, that I was not able
6304              * to stand against the temptation 8)     --ANK
6305              */
6306             inet_csk_schedule_ack(sk);
6307             tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
6308             inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
6309                           TCP_DELACK_MAX, TCP_RTO_MAX);
6310             goto consume;
6311         }
6312         tcp_send_ack(sk);
6313         return -1;
6314     }
6315
6316     /* No ACK in the segment */
6317
6318     if (th->rst) {
6319         /* rfc793:
6320          * "If the RST bit is set
6321          *
6322          *      Otherwise (no ACK) drop the segment and return."
6323          */
6324         SKB_DR_SET(reason, TCP_RESET);
6325         goto discard_and_undo;
6326     }
6327
6328     /* PAWS check. */
6329     if (tp->rx_opt.ts_recent_stamp && tp->rx_opt.saw_tstamp &&
6330         tcp_paws_reject(&tp->rx_opt, 0)) {
6331         SKB_DR_SET(reason, TCP_RFC7323_PAWS);
6332         goto discard_and_undo;
6333     }
6334     if (th->syn) {
6335         /* We see SYN without ACK. It is attempt of
6336          * simultaneous connect with crossed SYNs.
6337          * Particularly, it can be connect to self.
6338          */
6339         tcp_set_state(sk, TCP_SYN_RECV);
6340
6341         if (tp->rx_opt.saw_tstamp) {
6342             tp->rx_opt.tstamp_ok = 1;
6343             tcp_store_ts_recent(tp);
6344             tp->tcp_header_len =
6345                 sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
6346         } else {
6347             tp->tcp_header_len = sizeof(struct tcphdr);
6348         }
6349
6350         WRITE_ONCE(tp->rcv_nxt, TCP_SKB_CB(skb)->seq + 1);
6351         WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
6352         tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
6353
6354         /* RFC1323: The window in SYN & SYN/ACK segments is
6355          * never scaled.
6356          */
6357         tp->snd_wnd    = ntohs(th->window);
6358         tp->snd_wl1    = TCP_SKB_CB(skb)->seq;
6359         tp->max_window = tp->snd_wnd;
6360
6361         tcp_ecn_rcv_syn(tp, th);
6362
6363         tcp_mtup_init(sk);
6364         tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
6365         tcp_initialize_rcv_mss(sk);
6366
6367         tcp_send_synack(sk);
6368 #if 0
6369         /* Note, we could accept data and URG from this segment.
6370          * There are no obstacles to make this (except that we must
6371          * either change tcp_recvmsg() to prevent it from returning data
6372          * before 3WHS completes per RFC793, or employ TCP Fast Open).
6373          *
6374          * However, if we ignore data in ACKless segments sometimes,
6375          * we have no reasons to accept it sometimes.
6376          * Also, seems the code doing it in step6 of tcp_rcv_state_process
6377          * is not flawless. So, discard packet for sanity.
6378          * Uncomment this return to process the data.
6379          */
6380         return -1;
6381 #else
6382         goto consume;
6383 #endif
6384     }
6385     /* "fifth, if neither of the SYN or RST bits is set then
6386      * drop the segment and return."
6387      */
6388
     /* Undo the option parsing done on entry, then drop the skb. */
6389 discard_and_undo:
6390     tcp_clear_options(&tp->rx_opt);
6391     tp->rx_opt.mss_clamp = saved_clamp;
6392     tcp_drop_reason(sk, skb, reason);
6393     return 0;
6394
     /* Same undo, but the skb is left untouched and 1 is returned. */
6395 reset_and_undo:
6396     tcp_clear_options(&tp->rx_opt);
6397     tp->rx_opt.mss_clamp = saved_clamp;
6398     return 1;
6399 }
6400
/* Fast Open bookkeeping for a socket that still has a fastopen request
 * sock attached: try to undo a loss state, clear the retransmit state,
 * release the request sock and re-arm the RTO timer.
 */
6401 static void tcp_rcv_synrecv_state_fastopen(struct sock *sk)
6402 {
6403     struct inet_connection_sock *icsk = inet_csk(sk);
6404     struct tcp_sock *tp = tcp_sk(sk);
6405     struct request_sock *req;
6406
6407     /* While still in SYNACK RTO recovery, the timestamp ECR may allow
6408      * an undo; not so when the peer's SACKs started fast recovery.
6409      */
6410     if (icsk->icsk_ca_state == TCP_CA_Loss)
6411         tcp_try_undo_loss(sk, false);
6412
6413     /* Clear rtx state so retransmits_timed_out() cannot fire spuriously */
6414     tp->retrans_stamp = 0;
6415     icsk->icsk_retransmits = 0;
6416
6417     /* The request sock is not needed any more once we leave
6418      * TCP_SYN_RECV or TCP_FIN_WAIT_1, so release it now.
6419      */
6420     req = rcu_dereference_protected(tp->fastopen_rsk,
6421                     lockdep_sock_is_held(sk));
6422     reqsk_fastopen_remove(sk, req, false);
6423
6424     /* Data may already have been sent, so re-arm the timer, much
6425      * like the regular case where new data has just been ack'ed.
6426      *
6427      * (TFO) - we could be more aggressive and retransmit data
6428      * sooner, based on when it was sent out.
6429      */
6430     tcp_rearm_rto(sk);
6431 }
6432
6433 /*
6434  *  This function implements the receiving procedure of RFC 793 for
6435  *  all states except ESTABLISHED and TIME_WAIT.
6436  *  It's called from both tcp_v4_rcv and tcp_v6_rcv and should be
6437  *  address independent.
      *
      *  @sk:  the socket the segment belongs to.
      *  @skb: the received segment.
      *
      *  Returns 0 once the segment has been queued, consumed or dropped
      *  here. A non-zero value (1 on the paths in this function, or the
      *  result of tcp_rcv_synsent_state_process() when that is >= 0) is
      *  returned without the code below freeing the skb.
      *  NOTE(review): the caller presumably answers such a segment with
      *  a reset (cf. the "send one RST" remark below) - confirm.
6438  */
6439
6440 int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
6441 {
6442     struct tcp_sock *tp = tcp_sk(sk);
6443     struct inet_connection_sock *icsk = inet_csk(sk);
6444     const struct tcphdr *th = tcp_hdr(skb);
6445     struct request_sock *req;
6446     int queued = 0;
6447     bool acceptable;
6448     SKB_DR(reason);
6449
         /* CLOSE, LISTEN and SYN_SENT are handled completely inside this
          * switch: every path returns or jumps to discard. All other
          * states continue after it.
          */
6450     switch (sk->sk_state) {
6451     case TCP_CLOSE:
6452         SKB_DR_SET(reason, TCP_CLOSE);
6453         goto discard;
6454
6455     case TCP_LISTEN:
6456         if (th->ack)
6457             return 1;
6458
6459         if (th->rst) {
6460             SKB_DR_SET(reason, TCP_RESET);
6461             goto discard;
6462         }
6463         if (th->syn) {
6464             if (th->fin) {
6465                 SKB_DR_SET(reason, TCP_FLAGS);
6466                 goto discard;
6467             }
6468             /* It is possible that we process SYN packets from backlog,
6469              * so we need to make sure to disable BH and RCU right there.
6470              */
6471             rcu_read_lock();
6472             local_bh_disable();
6473             acceptable = icsk->icsk_af_ops->conn_request(sk, skb) >= 0;
6474             local_bh_enable();
6475             rcu_read_unlock();
6476
6477             if (!acceptable)
6478                 return 1;
6479             consume_skb(skb);
6480             return 0;
6481         }
6482         SKB_DR_SET(reason, TCP_FLAGS);
6483         goto discard;
6484
6485     case TCP_SYN_SENT:
6486         tp->rx_opt.saw_tstamp = 0;
6487         tcp_mstamp_refresh(tp);
6488         queued = tcp_rcv_synsent_state_process(sk, skb, th);
             /* Non-negative results are final; a negative one means the
              * skb is still ours and the remaining steps are done below.
              */
6489         if (queued >= 0)
6490             return queued;
6491
6492         /* Do step6 onward by hand. */
6493         tcp_urg(sk, skb, th);
6494         __kfree_skb(skb);
6495         tcp_data_snd_check(sk);
6496         return 0;
6497     }
6498
6499     tcp_mstamp_refresh(tp);
6500     tp->rx_opt.saw_tstamp = 0;
         /* req is the Fast Open request sock attached to this socket, if
          * any; it is only expected in SYN_RECV or FIN_WAIT1 (see the
          * WARN_ON_ONCE below).
          */
6501     req = rcu_dereference_protected(tp->fastopen_rsk,
6502                     lockdep_sock_is_held(sk));
6503     if (req) {
6504         bool req_stolen;
6505
6506         WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV &&
6507             sk->sk_state != TCP_FIN_WAIT1);
6508
6509         if (!tcp_check_req(sk, skb, req, true, &req_stolen)) {
6510             SKB_DR_SET(reason, TCP_FASTOPEN);
6511             goto discard;
6512         }
6513     }
6514
         /* A segment with none of ACK, RST or SYN set is dropped. */
6515     if (!th->ack && !th->rst && !th->syn) {
6516         SKB_DR_SET(reason, TCP_FLAGS);
6517         goto discard;
6518     }
         /* On failure the skb is not touched again in this function. */
6519     if (!tcp_validate_incoming(sk, skb, th, 0))
6520         return 0;
6521
6522     /* step 5: check the ACK field */
6523     acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH |
6524                       FLAG_UPDATE_TS_RECENT |
6525                       FLAG_NO_CHALLENGE_ACK) > 0;
6526
6527     if (!acceptable) {
6528         if (sk->sk_state == TCP_SYN_RECV)
6529             return 1;   /* send one RST */
6530         tcp_send_challenge_ack(sk);
6531         SKB_DR_SET(reason, TCP_OLD_ACK);
6532         goto discard;
6533     }
         /* The ACK was acceptable: per-state handling. States without a
          * case here go straight on to step 6.
          */
6534     switch (sk->sk_state) {
6535     case TCP_SYN_RECV:
6536         tp->delivered++; /* SYN-ACK delivery isn't tracked in tcp_ack */
6537         if (!tp->srtt_us)
6538             tcp_synack_rtt_meas(sk, req);
6539
             /* With a Fast Open request sock only the fastopen bookkeeping
              * runs; otherwise the transfer state is initialized here.
              */
6540         if (req) {
6541             tcp_rcv_synrecv_state_fastopen(sk);
6542         } else {
6543             tcp_try_undo_spurious_syn(sk);
6544             tp->retrans_stamp = 0;
6545             tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB,
6546                       skb);
6547             WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
6548         }
6549         smp_mb();
6550         tcp_set_state(sk, TCP_ESTABLISHED);
6551         sk->sk_state_change(sk);
6552
6553         /* Note, that this wakeup is only for marginal crossed SYN case.
6554          * Passively open sockets are not woken up, because
6555          * sk->sk_sleep == NULL and sk->sk_socket == NULL.
6556          */
6557         if (sk->sk_socket)
6558             sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
6559
6560         tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
             /* Unlike in SYN/SYN-ACK handling, the window of this segment
              * is scaled by snd_wscale.
              */
6561         tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale;
6562         tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
6563
6564         if (tp->rx_opt.tstamp_ok)
6565             tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
6566
6567         if (!inet_csk(sk)->icsk_ca_ops->cong_control)
6568             tcp_update_pacing_rate(sk);
6569
6570         /* Prevent spurious tcp_cwnd_restart() on first data packet */
6571         tp->lsndtime = tcp_jiffies32;
6572
6573         tcp_initialize_rcv_mss(sk);
6574         tcp_fast_path_on(tp);
6575         break;
6576
6577     case TCP_FIN_WAIT1: {
6578         int tmo;
6579
6580         if (req)
6581             tcp_rcv_synrecv_state_fastopen(sk);
6582
             /* Stay in FIN_WAIT1 until snd_una has caught up with
              * write_seq (presumably: until our FIN has been acked).
              */
6583         if (tp->snd_una != tp->write_seq)
6584             break;
6585
6586         tcp_set_state(sk, TCP_FIN_WAIT2);
6587         sk->sk_shutdown |= SEND_SHUTDOWN;
6588
6589         sk_dst_confirm(sk);
6590
6591         if (!sock_flag(sk, SOCK_DEAD)) {
6592             /* Wake up lingering close() */
6593             sk->sk_state_change(sk);
6594             break;
6595         }
6596
             /* From here on the socket is SOCK_DEAD. */
6597         if (tp->linger2 < 0) {
6598             tcp_done(sk);
6599             NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
6600             return 1;
6601         }
6602         if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
6603             after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
6604             /* Receive out of order FIN after close() */
6605             if (tp->syn_fastopen && th->fin)
6606                 tcp_fastopen_active_disable(sk);
6607             tcp_done(sk);
6608             NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
6609             return 1;
6610         }
6611
             /* A timeout longer than TCP_TIMEWAIT_LEN, a FIN in this
              * segment, or a user-owned socket keeps the full socket and
              * uses the keepalive timer; otherwise tcp_time_wait() is
              * called for FIN_WAIT2.
              */
6612         tmo = tcp_fin_time(sk);
6613         if (tmo > TCP_TIMEWAIT_LEN) {
6614             inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN);
6615         } else if (th->fin || sock_owned_by_user(sk)) {
6616             /* Bad case. We could lose such FIN otherwise.
6617              * It is not a big problem, but it looks confusing
6618              * and not so rare event. We still can lose it now,
6619              * if it spins in bh_lock_sock(), but it is really
6620              * marginal case.
6621              */
6622             inet_csk_reset_keepalive_timer(sk, tmo);
6623         } else {
6624             tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
6625             goto consume;
6626         }
6627         break;
6628     }
6629
6630     case TCP_CLOSING:
6631         if (tp->snd_una == tp->write_seq) {
6632             tcp_time_wait(sk, TCP_TIME_WAIT, 0);
6633             goto consume;
6634         }
6635         break;
6636
6637     case TCP_LAST_ACK:
6638         if (tp->snd_una == tp->write_seq) {
6639             tcp_update_metrics(sk);
6640             tcp_done(sk);
6641             goto consume;
6642         }
6643         break;
6644     }
6645
6646     /* step 6: check the URG bit */
6647     tcp_urg(sk, skb, th);
6648
6649     /* step 7: process the segment text */
         /* States not listed in this switch skip tcp_data_queue(); with
          * 'queued' still 0 the skb is then dropped at the end.
          */
6650     switch (sk->sk_state) {
6651     case TCP_CLOSE_WAIT:
6652     case TCP_CLOSING:
6653     case TCP_LAST_ACK:
6654         if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
6655             /* If a subflow has been reset, the packet should not
6656              * continue to be processed, drop the packet.
6657              */
6658             if (sk_is_mptcp(sk) && !mptcp_incoming_options(sk, skb))
6659                 goto discard;
6660             break;
6661         }
6662         fallthrough;
6663     case TCP_FIN_WAIT1:
6664     case TCP_FIN_WAIT2:
6665         /* RFC 793 says to queue data in these states,
6666          * RFC 1122 says we MUST send a reset.
6667          * BSD 4.4 also does reset.
6668          */
6669         if (sk->sk_shutdown & RCV_SHUTDOWN) {
6670             if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
6671                 after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
6672                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
6673                 tcp_reset(sk, skb);
6674                 return 1;
6675             }
6676         }
6677         fallthrough;
6678     case TCP_ESTABLISHED:
6679         tcp_data_queue(sk, skb);
6680         queued = 1;
6681         break;
6682     }
6683
6684     /* tcp_data could move socket to TIME-WAIT */
6685     if (sk->sk_state != TCP_CLOSE) {
6686         tcp_data_snd_check(sk);
6687         tcp_ack_snd_check(sk);
6688     }
6689
         /* The discard label sits inside this block, so every explicit
          * "goto discard" shares the drop with the not-queued case.
          */
6690     if (!queued) {
6691 discard:
6692         tcp_drop_reason(sk, skb, reason);
6693     }
6694     return 0;
6695
6696 consume:
6697     __kfree_skb(skb);
6698     return 0;
6699 }
6700 EXPORT_SYMBOL(tcp_rcv_state_process);
6701
6702 static inline void pr_drop_req(struct request_sock *req, __u16 port, int family)
6703 {
6704     struct inet_request_sock *ireq = inet_rsk(req);
6705
6706     if (family == AF_INET)
6707         net_dbg_ratelimited("drop open request from %pI4/%u\n",
6708                     &ireq->ir_rmt_addr, port);
6709 #if IS_ENABLED(CONFIG_IPV6)
6710     else if (family == AF_INET6)
6711         net_dbg_ratelimited("drop open request from %pI6/%u\n",
6712                     &ireq->ir_v6_rmt_addr, port);
6713 #endif
6714 }
6715
6716 /* RFC3168 : 6.1.1 SYN packets must not have ECT/ECN bits set
6717  *
6718  * If we receive a SYN packet with these bits set, it means a
6719  * network is playing bad games with TOS bits. In order to
6720  * avoid possible false congestion notifications, we disable
6721  * TCP ECN negotiation.
6722  *
6723  * Exception: tcp_ca wants ECN. This is required for DCTCP
6724  * congestion control: Linux DCTCP asserts ECT on all packets,
6725  * including SYN, which is most optimal solution; however,
6726  * others, such as FreeBSD do not.
6727  *
6728  * Exception: At least one of the reserved bits of the TCP header (th->res1) is
6729  * set, indicating the use of a future TCP extension (such as AccECN). See
6730  * RFC8311 §4.3 which updates RFC3168 to allow the development of such
6731  * extensions.
6732  */
6733 static void tcp_ecn_create_request(struct request_sock *req,
6734                    const struct sk_buff *skb,
6735                    const struct sock *listen_sk,
6736                    const struct dst_entry *dst)
6737 {
6738     const struct tcphdr *th = tcp_hdr(skb);
6739     const struct net *net = sock_net(listen_sk);
6740     bool th_ecn = th->ece && th->cwr;
6741     bool ect, ecn_ok;
6742     u32 ecn_ok_dst;
6743
6744     if (!th_ecn)
6745         return;
6746
6747     ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield);
6748     ecn_ok_dst = dst_feature(dst, DST_FEATURE_ECN_MASK);
6749     ecn_ok = READ_ONCE(net->ipv4.sysctl_tcp_ecn) || ecn_ok_dst;
6750
6751     if (((!ect || th->res1) && ecn_ok) || tcp_ca_needs_ecn(listen_sk) ||
6752         (ecn_ok_dst & DST_FEATURE_ECN_CA) ||
6753         tcp_bpf_ca_needs_ecn((struct sock *)req))
6754         inet_rsk(req)->ecn_ok = 1;
6755 }
6756
6757 static void tcp_openreq_init(struct request_sock *req,
6758                  const struct tcp_options_received *rx_opt,
6759                  struct sk_buff *skb, const struct sock *sk)
6760 {
6761     struct inet_request_sock *ireq = inet_rsk(req);
6762
6763     req->rsk_rcv_wnd = 0;       /* So that tcp_send_synack() knows! */
6764     tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq;
6765     tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
6766     tcp_rsk(req)->snt_synack = 0;
6767     tcp_rsk(req)->last_oow_ack_time = 0;
6768     req->mss = rx_opt->mss_clamp;
6769     req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0;
6770     ireq->tstamp_ok = rx_opt->tstamp_ok;
6771     ireq->sack_ok = rx_opt->sack_ok;
6772     ireq->snd_wscale = rx_opt->snd_wscale;
6773     ireq->wscale_ok = rx_opt->wscale_ok;
6774     ireq->acked = 0;
6775     ireq->ecn_ok = 0;
6776     ireq->ir_rmt_port = tcp_hdr(skb)->source;
6777     ireq->ir_num = ntohs(tcp_hdr(skb)->dest);
6778     ireq->ir_mark = inet_request_mark(sk, skb);
6779 #if IS_ENABLED(CONFIG_SMC)
6780     ireq->smc_ok = rx_opt->smc_ok && !(tcp_sk(sk)->smc_hs_congested &&
6781             tcp_sk(sk)->smc_hs_congested(sk));
6782 #endif
6783 }
6784
6785 struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops,
6786                       struct sock *sk_listener,
6787                       bool attach_listener)
6788 {
6789     struct request_sock *req = reqsk_alloc(ops, sk_listener,
6790                            attach_listener);
6791
6792     if (req) {
6793         struct inet_request_sock *ireq = inet_rsk(req);
6794
6795         ireq->ireq_opt = NULL;
6796 #if IS_ENABLED(CONFIG_IPV6)
6797         ireq->pktopts = NULL;
6798 #endif
6799         atomic64_set(&ireq->ir_cookie, 0);
6800         ireq->ireq_state = TCP_NEW_SYN_RECV;
6801         write_pnet(&ireq->ireq_net, sock_net(sk_listener));
6802         ireq->ireq_family = sk_listener->sk_family;
6803         req->timeout = TCP_TIMEOUT_INIT;
6804     }
6805
6806     return req;
6807 }
6808 EXPORT_SYMBOL(inet_reqsk_alloc);
6809
6810 /*
6811  * Return true if a syncookie should be sent
6812  */
6813 static bool tcp_syn_flood_action(const struct sock *sk, const char *proto)
6814 {
6815     struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
6816     const char *msg = "Dropping request";
6817     struct net *net = sock_net(sk);
6818     bool want_cookie = false;
6819     u8 syncookies;
6820
6821     syncookies = READ_ONCE(net->ipv4.sysctl_tcp_syncookies);
6822
6823 #ifdef CONFIG_SYN_COOKIES
6824     if (syncookies) {
6825         msg = "Sending cookies";
6826         want_cookie = true;
6827         __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
6828     } else
6829 #endif
6830         __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
6831
6832     if (!queue->synflood_warned && syncookies != 2 &&
6833         xchg(&queue->synflood_warned, 1) == 0)
6834         net_info_ratelimited("%s: Possible SYN flooding on port %d. %s.  Check SNMP counters.\n",
6835                      proto, sk->sk_num, msg);
6836
6837     return want_cookie;
6838 }
6839
6840 static void tcp_reqsk_record_syn(const struct sock *sk,
6841                  struct request_sock *req,
6842                  const struct sk_buff *skb)
6843 {
6844     if (tcp_sk(sk)->save_syn) {
6845         u32 len = skb_network_header_len(skb) + tcp_hdrlen(skb);
6846         struct saved_syn *saved_syn;
6847         u32 mac_hdrlen;
6848         void *base;
6849
6850         if (tcp_sk(sk)->save_syn == 2) {  /* Save full header. */
6851             base = skb_mac_header(skb);
6852             mac_hdrlen = skb_mac_header_len(skb);
6853             len += mac_hdrlen;
6854         } else {
6855             base = skb_network_header(skb);
6856             mac_hdrlen = 0;
6857         }
6858
6859         saved_syn = kmalloc(struct_size(saved_syn, data, len),
6860                     GFP_ATOMIC);
6861         if (saved_syn) {
6862             saved_syn->mac_hdrlen = mac_hdrlen;
6863             saved_syn->network_hdrlen = skb_network_header_len(skb);
6864             saved_syn->tcp_hdrlen = tcp_hdrlen(skb);
6865             memcpy(saved_syn->data, base, len);
6866             req->saved_syn = saved_syn;
6867         }
6868     }
6869 }
6870
6871 /* If a SYN cookie is required and supported, returns a clamped MSS value to be
6872  * used for SYN cookie generation.
6873  */
6874 u16 tcp_get_syncookie_mss(struct request_sock_ops *rsk_ops,
6875               const struct tcp_request_sock_ops *af_ops,
6876               struct sock *sk, struct tcphdr *th)
6877 {
6878     struct tcp_sock *tp = tcp_sk(sk);
6879     u16 mss;
6880
6881     if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_syncookies) != 2 &&
6882         !inet_csk_reqsk_queue_is_full(sk))
6883         return 0;
6884
6885     if (!tcp_syn_flood_action(sk, rsk_ops->slab_name))
6886         return 0;
6887
6888     if (sk_acceptq_is_full(sk)) {
6889         NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
6890         return 0;
6891     }
6892
6893     mss = tcp_parse_mss_option(th, tp->rx_opt.user_mss);
6894     if (!mss)
6895         mss = af_ops->mss_clamp;
6896
6897     return mss;
6898 }
6899 EXPORT_SYMBOL_GPL(tcp_get_syncookie_mss);
6900
/*
 * tcp_conn_request - handle an incoming connection request on a listener.
 *
 * @rsk_ops: request sock operations; passed to inet_reqsk_alloc(), and its
 *           slab_name / family are used for log messages
 * @af_ops:  address-family callbacks used here (route_req, init_ts_off,
 *           init_seq, send_synack) plus the default mss_clamp
 * @sk:      listening socket
 * @skb:     received segment that triggered the request
 *
 * Allocates a request sock, initialises it from the options parsed out of
 * @skb, looks up a route, chooses the initial sequence number and hands a
 * SYN-ACK to af_ops->send_synack().  Afterwards the request is either
 * added to the request hash with its timeout, queued on the accept queue
 * together with a Fast Open child socket, or freed again right away when a
 * syncookie was sent.
 *
 * Always returns 0; the failure paths account the drop through
 * tcp_listendrop().
 */
6901 int tcp_conn_request(struct request_sock_ops *rsk_ops,
6902              const struct tcp_request_sock_ops *af_ops,
6903              struct sock *sk, struct sk_buff *skb)
6904 {
6905     struct tcp_fastopen_cookie foc = { .len = -1 };
6906     __u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn;
6907     struct tcp_options_received tmp_opt;
6908     struct tcp_sock *tp = tcp_sk(sk);
6909     struct net *net = sock_net(sk);
6910     struct sock *fastopen_sk = NULL;
6911     struct request_sock *req;
6912     bool want_cookie = false;
6913     struct dst_entry *dst;
6914     struct flowi fl;
6915     u8 syncookies;
6916
6917     syncookies = READ_ONCE(net->ipv4.sysctl_tcp_syncookies);
6918
6919     /* TW buckets are converted to open requests without
6920      * limitations, they conserve resources and peer is
6921      * evidently real one.
6922      */
6923     if ((syncookies == 2 || inet_csk_reqsk_queue_is_full(sk)) && !isn) {
6924         want_cookie = tcp_syn_flood_action(sk, rsk_ops->slab_name);
6925         if (!want_cookie)
6926             goto drop;
6927     }
6928
6929     if (sk_acceptq_is_full(sk)) {
6930         NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
6931         goto drop;
6932     }
6933
         /* The third argument is attach_listener: the listener is only
          * attached to the request when no syncookie will be sent.
          */
6934     req = inet_reqsk_alloc(rsk_ops, sk, !want_cookie);
6935     if (!req)
6936         goto drop;
6937
6938     req->syncookie = want_cookie;
6939     tcp_rsk(req)->af_specific = af_ops;
6940     tcp_rsk(req)->ts_off = 0;
6941 #if IS_ENABLED(CONFIG_MPTCP)
6942     tcp_rsk(req)->is_mptcp = 0;
6943 #endif
6944
         /* Parse the options of the segment into a scratch structure.
          * In the syncookie case NULL is passed instead of &foc, so no
          * Fast Open cookie storage is handed to the parser.
          */
6945     tcp_clear_options(&tmp_opt);
6946     tmp_opt.mss_clamp = af_ops->mss_clamp;
6947     tmp_opt.user_mss  = tp->rx_opt.user_mss;
6948     tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0,
6949               want_cookie ? NULL : &foc);
6950
         /* With a syncookie the parsed options are discarded unless a
          * timestamp option was seen.
          * NOTE(review): presumably because the other options can only be
          * carried via the timestamp in cookie mode - confirm.
          */
6951     if (want_cookie && !tmp_opt.saw_tstamp)
6952         tcp_clear_options(&tmp_opt);
6953
6954     if (IS_ENABLED(CONFIG_SMC) && want_cookie)
6955         tmp_opt.smc_ok = 0;
6956
6957     tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
6958     tcp_openreq_init(req, &tmp_opt, skb, sk);
6959     inet_rsk(req)->no_srccheck = inet_sk(sk)->transparent;
6960
6961     /* Note: tcp_v6_init_req() might override ir_iif for link locals */
6962     inet_rsk(req)->ir_iif = inet_request_bound_dev_if(sk, skb);
6963
         /* From here on a route is held in dst; the drop_and_release
          * label below is the exit that releases it.
          */
6964     dst = af_ops->route_req(sk, skb, &fl, req);
6965     if (!dst)
6966         goto drop_and_free;
6967
6968     if (tmp_opt.tstamp_ok)
6969         tcp_rsk(req)->ts_off = af_ops->init_ts_off(net, skb);
6970
         /* Only when no cookie is used and no ISN was supplied through
          * tcp_tw_isn: optionally refuse the request, else pick an ISN.
          */
6971     if (!want_cookie && !isn) {
6972         int max_syn_backlog = READ_ONCE(net->ipv4.sysctl_max_syn_backlog);
6973
6974         /* Kill the following clause, if you dislike this way. */
6975         if (!syncookies &&
6976             (max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
6977              (max_syn_backlog >> 2)) &&
6978             !tcp_peer_is_proven(req, dst)) {
6979             /* Without syncookies last quarter of
6980              * backlog is filled with destinations,
6981              * proven to be alive.
6982              * It means that we continue to communicate
6983              * to destinations, already remembered
6984              * to the moment of synflood.
6985              */
6986             pr_drop_req(req, ntohs(tcp_hdr(skb)->source),
6987                     rsk_ops->family);
6988             goto drop_and_release;
6989         }
6990
6991         isn = af_ops->init_seq(skb);
6992     }
6993
6994     tcp_ecn_create_request(req, skb, sk, dst);
6995
         /* In cookie mode the ISN comes from cookie_init_sequence(), which
          * is also given &req->mss; ECN is turned off again when
          * timestamps are not in use.
          */
6996     if (want_cookie) {
6997         isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);
6998         if (!tmp_opt.tstamp_ok)
6999             inet_rsk(req)->ecn_ok = 0;
7000     }
7001
7002     tcp_rsk(req)->snt_isn = isn;
7003     tcp_rsk(req)->txhash = net_tx_rndhash();
7004     tcp_rsk(req)->syn_tos = TCP_SKB_CB(skb)->ip_dsfield;
7005     tcp_openreq_init_rwin(req, sk, dst);
7006     sk_rx_queue_set(req_to_sk(req), skb);
         /* Saving the SYN and trying Fast Open are skipped for syncookies,
          * so fastopen_sk stays NULL in that case.
          */
7007     if (!want_cookie) {
7008         tcp_reqsk_record_syn(sk, req, skb);
7009         fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc, dst);
7010     }
7011     if (fastopen_sk) {
             /* Fast Open: the SYN-ACK is sent on behalf of the child
              * socket.  On both outcomes below the child is unlocked and
              * a reference on it is dropped.
              */
7012         af_ops->send_synack(fastopen_sk, dst, &fl, req,
7013                     &foc, TCP_SYNACK_FASTOPEN, skb);
7014         /* Add the child socket directly into the accept queue */
7015         if (!inet_csk_reqsk_queue_add(sk, req, fastopen_sk)) {
7016             reqsk_fastopen_remove(fastopen_sk, req, false);
7017             bh_unlock_sock(fastopen_sk);
7018             sock_put(fastopen_sk);
7019             goto drop_and_free;
7020         }
7021         sk->sk_data_ready(sk);
7022         bh_unlock_sock(fastopen_sk);
7023         sock_put(fastopen_sk);
7024     } else {
7025         tcp_rsk(req)->tfo_listener = false;
             /* Only non-cookie requests are added to the request hash,
              * using the timeout computed by tcp_timeout_init().
              */
7026         if (!want_cookie) {
7027             req->timeout = tcp_timeout_init((struct sock *)req);
7028             inet_csk_reqsk_queue_hash_add(sk, req, req->timeout);
7029         }
7030         af_ops->send_synack(sk, dst, &fl, req, &foc,
7031                     !want_cookie ? TCP_SYNACK_NORMAL :
7032                            TCP_SYNACK_COOKIE,
7033                     skb);
             /* A syncookie request was never hashed above, so it is
              * freed right after the SYN-ACK has been handed over.
              */
7034         if (want_cookie) {
7035             reqsk_free(req);
7036             return 0;
7037         }
7038     }
         /* Release a reference on the request before returning. */
7039     reqsk_put(req);
7040     return 0;
7041
     /* Error exits, from most to least state to undo: release the route,
      * free the request, then account the drop on the listener.
      */
7042 drop_and_release:
7043     dst_release(dst);
7044 drop_and_free:
7045     __reqsk_free(req);
7046 drop:
7047     tcp_listendrop(sk);
7048     return 0;
7049 }
7050 EXPORT_SYMBOL(tcp_conn_request);