// SPDX-License-Identifier: GPL-2.0-only
/*
 * INET     An implementation of the TCP/IP protocol suite for the LINUX
 *      operating system.  INET is implemented using the  BSD Socket
 *      interface as the means of communication with the user level.
 *
 *      Implementation of the Transmission Control Protocol(TCP).
 *
 * Authors: Ross Biro
 *      Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *      Mark Evans, <evansmp@uhura.aston.ac.uk>
 *      Corey Minyard <wf-rch!minyard@relay.EU.net>
 *      Florian La Roche, <flla@stud.uni-sb.de>
 *      Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *      Linus Torvalds, <torvalds@cs.helsinki.fi>
 *      Alan Cox, <gw4pts@gw4pts.ampr.org>
 *      Matthew Dillon, <dillon@apollo.west.oic.com>
 *      Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *      Jorge Cwik, <jorge@laser.satlink.net>
 */

#include <net/tcp.h>
#include <net/xfrm.h>
#include <net/busy_poll.h>

static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
{
    if (seq == s_win)
        return true;
    if (after(end_seq, s_win) && before(seq, e_win))
        return true;
    return seq == e_win && seq == end_seq;
}
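
/* Illustrative example (not part of the original logic): with s_win = 100
 * and e_win = 110, a segment with seq == 100 is always acceptable; a
 * segment with seq = 95, end_seq = 105 overlaps the window and is
 * acceptable; and a zero-length segment with seq == end_seq == 110 probes
 * the right edge and is also accepted.  after()/before() perform the
 * comparisons in wrap-safe 32-bit sequence arithmetic.
 */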

static enum tcp_tw_status
tcp_timewait_check_oow_rate_limit(struct inet_timewait_sock *tw,
                  const struct sk_buff *skb, int mib_idx)
{
    struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);

    if (!tcp_oow_rate_limited(twsk_net(tw), skb, mib_idx,
                  &tcptw->tw_last_oow_ack_time)) {
        /* Send ACK. Note that we do not put the bucket;
         * it will be released by the caller.
         */
        return TCP_TW_ACK;
    }

    /* We are rate-limiting, so just release the tw sock and drop skb. */
    inet_twsk_put(tw);
    return TCP_TW_SUCCESS;
}
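
/* Note (added for clarity): mib_idx selects the SNMP counter bumped when
 * an out-of-window ACK is rate-limited; this file passes
 * LINUX_MIB_TCPACKSKIPPEDFINWAIT2 or LINUX_MIB_TCPACKSKIPPEDTIMEWAIT
 * depending on which substate triggered the check.
 */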

/*
 * * The main purpose of the TIME-WAIT state is to close the connection
 *   gracefully, when one of the ends sits in LAST-ACK or CLOSING
 *   retransmitting its FIN (and, probably, a tail of data) and one or
 *   more of our ACKs are lost.
 * * What is the TIME-WAIT timeout? It is associated with the maximal
 *   packet lifetime in the internet, which leads to the wrong conclusion
 *   that it is set to catch "old duplicate segments" wandering out of
 *   their path.  That is not quite correct. This timeout is calculated
 *   so that it exceeds the maximal retransmission timeout by enough to
 *   allow for the loss of one (or more) segments sent by the peer and of
 *   our ACKs. This time may be calculated from the RTO.
 * * When a TIME-WAIT socket receives an RST, it means that the other end
 *   finally closed, and we are allowed to kill TIME-WAIT too.
 * * The second purpose of TIME-WAIT is catching old duplicate segments.
 *   Well, certainly it is pure paranoia, but if we load TIME-WAIT
 *   with this semantics, we MUST NOT kill TIME-WAIT state with RSTs.
 * * If we invented some more clever way to catch duplicates
 *   (e.g. based on PAWS), we could truncate TIME-WAIT to several RTOs.
 *
 * The algorithm below is based on a FORMAL INTERPRETATION of the RFCs.
 * When you compare it to the RFCs, please read the section SEGMENT ARRIVES
 * from the very beginning.
 *
 * NOTE. With recycling (and later with fin-wait-2) the TW bucket
 * is _not_ stateless. Strictly speaking, that means we must
 * spinlock it. I do not want to! The probability of misbehaviour
 * is ridiculously low and, it seems, we could use some mb() tricks
 * to avoid misreading sequence numbers, states etc.  --ANK
 *
 * We don't need to initialize tmp_opt.sack_ok as we don't use the results
 */
enum tcp_tw_status
tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
               const struct tcphdr *th)
{
    struct tcp_options_received tmp_opt;
    struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
    bool paws_reject = false;

    tmp_opt.saw_tstamp = 0;
    if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
        tcp_parse_options(twsk_net(tw), skb, &tmp_opt, 0, NULL);

        if (tmp_opt.saw_tstamp) {
            if (tmp_opt.rcv_tsecr)
                tmp_opt.rcv_tsecr -= tcptw->tw_ts_offset;
            tmp_opt.ts_recent   = tcptw->tw_ts_recent;
            tmp_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
            paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
        }
    }

    if (tw->tw_substate == TCP_FIN_WAIT2) {
        /* Just repeat all the checks of tcp_rcv_state_process() */

        /* Out of window, send ACK */
        if (paws_reject ||
            !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
                   tcptw->tw_rcv_nxt,
                   tcptw->tw_rcv_nxt + tcptw->tw_rcv_wnd))
            return tcp_timewait_check_oow_rate_limit(
                tw, skb, LINUX_MIB_TCPACKSKIPPEDFINWAIT2);

        if (th->rst)
            goto kill;

        if (th->syn && !before(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt))
            return TCP_TW_RST;

        /* Dup ACK? */
        if (!th->ack ||
            !after(TCP_SKB_CB(skb)->end_seq, tcptw->tw_rcv_nxt) ||
            TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) {
            inet_twsk_put(tw);
            return TCP_TW_SUCCESS;
        }

        /* New data or FIN. If new data arrives after a half-duplex close,
         * reset.
         */
        if (!th->fin ||
            TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1)
            return TCP_TW_RST;

        /* FIN arrived, enter true time-wait state. */
        tw->tw_substate   = TCP_TIME_WAIT;
        tcptw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq;
        if (tmp_opt.saw_tstamp) {
            tcptw->tw_ts_recent_stamp = ktime_get_seconds();
            tcptw->tw_ts_recent   = tmp_opt.rcv_tsval;
        }

        inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN);
        return TCP_TW_ACK;
    }
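
    /* Example (added for clarity): with tw_rcv_nxt = 1000, only a FIN
     * whose end_seq is exactly 1001 moves the socket from FIN_WAIT2 to
     * true TIME-WAIT here; anything carrying new data past rcv_nxt gets
     * TCP_TW_RST above.
     */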

    /*
     *  Now the real TIME-WAIT state.
     *
     *  RFC 1122:
     *  "When a connection is [...] in TIME-WAIT state [...]
     *  [a TCP] MAY accept a new SYN from the remote TCP to
     *  reopen the connection directly, if it:
     *
     *  (1)  assigns its initial sequence number for the new
     *  connection to be larger than the largest sequence
     *  number it used on the previous connection incarnation,
     *  and
     *
     *  (2)  returns to TIME-WAIT state if the SYN turns out
     *  to be an old duplicate".
     */

    if (!paws_reject &&
        (TCP_SKB_CB(skb)->seq == tcptw->tw_rcv_nxt &&
         (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) {
        /* An in-window segment; it may only be a reset or a bare ACK. */

        if (th->rst) {
            /* This is TIME_WAIT assassination, in two flavors.
             * Oh well... nobody has a sufficient solution to this
             * protocol bug yet.
             */
            if (!READ_ONCE(twsk_net(tw)->ipv4.sysctl_tcp_rfc1337)) {
kill:
                inet_twsk_deschedule_put(tw);
                return TCP_TW_SUCCESS;
            }
        } else {
            inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN);
        }

        if (tmp_opt.saw_tstamp) {
            tcptw->tw_ts_recent   = tmp_opt.rcv_tsval;
            tcptw->tw_ts_recent_stamp = ktime_get_seconds();
        }

        inet_twsk_put(tw);
        return TCP_TW_SUCCESS;
    }

    /* Out of window segment.

       All such segments are ACKed immediately.

       The only exception is a new SYN. We accept it, if it is
       not an old duplicate and we are not in danger of being killed
       by delayed old duplicates. The RFC check (that it carries a
       newer sequence number) works at rates < 40 Mbit/sec.
       However, if PAWS works, it is reliable and, even more,
       we may relax the silly seq space cutoff.

       RED-PEN: we violate the main RFC requirement here: if this SYN
       turns out to be an old duplicate (i.e. we receive an RST in reply
       to our SYN-ACK), we must return the socket to TIME-WAIT state.
       It is not good, but not fatal yet.
     */

    if (th->syn && !th->rst && !th->ack && !paws_reject &&
        (after(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt) ||
         (tmp_opt.saw_tstamp &&
          (s32)(tcptw->tw_ts_recent - tmp_opt.rcv_tsval) < 0))) {
        u32 isn = tcptw->tw_snd_nxt + 65535 + 2;
        if (isn == 0)
            isn++;
        TCP_SKB_CB(skb)->tcp_tw_isn = isn;
        return TCP_TW_SYN;
    }
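
    /* Note (added for clarity): the ISN suggested for the reopened
     * connection is tw_snd_nxt + 65535 + 2, placing it beyond anything
     * the old incarnation could have sent within a maximal unscaled
     * window, per clause (1) of the RFC 1122 rule quoted above.  Zero
     * is skipped because a zero tcp_tw_isn is treated as "no suggestion"
     * by the SYN processing path.
     */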

    if (paws_reject)
        __NET_INC_STATS(twsk_net(tw), LINUX_MIB_PAWSESTABREJECTED);

    if (!th->rst) {
        /* In this case we must reset the TIMEWAIT timer.
         *
         * If it is an ACKless SYN, it may be both an old duplicate
         * and a new good SYN with a random sequence number < rcv_nxt.
         * Do not reschedule in the latter case.
         */
        if (paws_reject || th->ack)
            inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN);

        return tcp_timewait_check_oow_rate_limit(
            tw, skb, LINUX_MIB_TCPACKSKIPPEDTIMEWAIT);
    }
    inet_twsk_put(tw);
    return TCP_TW_SUCCESS;
}
EXPORT_SYMBOL(tcp_timewait_state_process);
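
/* Sketch (added for illustration; not part of this file): callers in the
 * af-specific receive paths, e.g. tcp_v4_rcv(), dispatch on the returned
 * status roughly like:
 *
 *     switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
 *     case TCP_TW_SYN:     // try to find a listener and reopen
 *     case TCP_TW_ACK:     // send an ACK from the timewait socket
 *     case TCP_TW_RST:     // answer with a reset
 *     case TCP_TW_SUCCESS: // segment consumed, just drop it
 *     }
 *
 * For TCP_TW_ACK and TCP_TW_RST the timewait reference is still held and
 * must be released by the caller (see tcp_timewait_check_oow_rate_limit()
 * above).
 */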

/*
 * Move a socket to time-wait or dead fin-wait-2 state.
 */
void tcp_time_wait(struct sock *sk, int state, int timeo)
{
    const struct inet_connection_sock *icsk = inet_csk(sk);
    const struct tcp_sock *tp = tcp_sk(sk);
    struct inet_timewait_sock *tw;
    struct inet_timewait_death_row *tcp_death_row = sock_net(sk)->ipv4.tcp_death_row;

    tw = inet_twsk_alloc(sk, tcp_death_row, state);

    if (tw) {
        struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
        const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1);
        struct inet_sock *inet = inet_sk(sk);

        tw->tw_transparent  = inet->transparent;
        tw->tw_mark     = sk->sk_mark;
        tw->tw_priority     = sk->sk_priority;
        tw->tw_rcv_wscale   = tp->rx_opt.rcv_wscale;
        tcptw->tw_rcv_nxt   = tp->rcv_nxt;
        tcptw->tw_snd_nxt   = tp->snd_nxt;
        tcptw->tw_rcv_wnd   = tcp_receive_window(tp);
        tcptw->tw_ts_recent = tp->rx_opt.ts_recent;
        tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;
        tcptw->tw_ts_offset = tp->tsoffset;
        tcptw->tw_last_oow_ack_time = 0;
        tcptw->tw_tx_delay  = tp->tcp_tx_delay;
#if IS_ENABLED(CONFIG_IPV6)
        if (tw->tw_family == PF_INET6) {
            struct ipv6_pinfo *np = inet6_sk(sk);

            tw->tw_v6_daddr = sk->sk_v6_daddr;
            tw->tw_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
            tw->tw_tclass = np->tclass;
            tw->tw_flowlabel = be32_to_cpu(np->flow_label & IPV6_FLOWLABEL_MASK);
            tw->tw_txhash = sk->sk_txhash;
            tw->tw_ipv6only = sk->sk_ipv6only;
        }
#endif

#ifdef CONFIG_TCP_MD5SIG
        /*
         * The timewait bucket does not have the key DB from the
         * sock structure. We just make a quick copy of the
         * md5 key being used (if indeed we are using one)
         * so the timewait ack generating code has the key.
         */
        do {
            tcptw->tw_md5_key = NULL;
            if (static_branch_unlikely(&tcp_md5_needed)) {
                struct tcp_md5sig_key *key;

                key = tp->af_specific->md5_lookup(sk, sk);
                if (key) {
                    tcptw->tw_md5_key = kmemdup(key, sizeof(*key), GFP_ATOMIC);
                    BUG_ON(tcptw->tw_md5_key && !tcp_alloc_md5sig_pool());
                }
            }
        } while (0);
#endif

        /* Get the TIME_WAIT timeout firing. */
        if (timeo < rto)
            timeo = rto;

        if (state == TCP_TIME_WAIT)
            timeo = TCP_TIMEWAIT_LEN;
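
        /* Note (added for clarity): rto above is (4 - 1/2) * icsk_rto,
         * i.e. 3.5 RTO, so a FIN_WAIT2 timewait survives at least long
         * enough for the peer to retransmit its FIN; a true TIME_WAIT
         * always uses the fixed TCP_TIMEWAIT_LEN (60 seconds).
         */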

        /* tw_timer is pinned, so we need to make sure BH are disabled
         * in the following section, otherwise the timer handler could run
         * before we complete the initialization.
         */
        local_bh_disable();
        inet_twsk_schedule(tw, timeo);
        /* Linkage updates.
         * Note that access to tw after this point is illegal.
         */
        inet_twsk_hashdance(tw, sk, &tcp_hashinfo);
        local_bh_enable();
    } else {
        /* Sorry, if we're out of memory, just CLOSE this
         * socket up.  We've got bigger problems than
         * non-graceful socket closings.
         */
        NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPTIMEWAITOVERFLOW);
    }

    tcp_update_metrics(sk);
    tcp_done(sk);
}
EXPORT_SYMBOL(tcp_time_wait);

void tcp_twsk_destructor(struct sock *sk)
{
#ifdef CONFIG_TCP_MD5SIG
    if (static_branch_unlikely(&tcp_md5_needed)) {
        struct tcp_timewait_sock *twsk = tcp_twsk(sk);

        if (twsk->tw_md5_key)
            kfree_rcu(twsk->tw_md5_key, rcu);
    }
#endif
}
EXPORT_SYMBOL_GPL(tcp_twsk_destructor);

/* Warning : This function is called without sk_listener being locked.
 * Be sure to read socket fields once, as their value could change under us.
 */
void tcp_openreq_init_rwin(struct request_sock *req,
               const struct sock *sk_listener,
               const struct dst_entry *dst)
{
    struct inet_request_sock *ireq = inet_rsk(req);
    const struct tcp_sock *tp = tcp_sk(sk_listener);
    int full_space = tcp_full_space(sk_listener);
    u32 window_clamp;
    __u8 rcv_wscale;
    u32 rcv_wnd;
    int mss;

    mss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
    window_clamp = READ_ONCE(tp->window_clamp);
    /* Set this up on the first call only */
    req->rsk_window_clamp = window_clamp ? : dst_metric(dst, RTAX_WINDOW);

    /* limit the window selection if the user enforces a smaller rx buffer */
    if (sk_listener->sk_userlocks & SOCK_RCVBUF_LOCK &&
        (req->rsk_window_clamp > full_space || req->rsk_window_clamp == 0))
        req->rsk_window_clamp = full_space;

    rcv_wnd = tcp_rwnd_init_bpf((struct sock *)req);
    if (rcv_wnd == 0)
        rcv_wnd = dst_metric(dst, RTAX_INITRWND);
    else if (full_space < rcv_wnd * mss)
        full_space = rcv_wnd * mss;

    /* tcp_full_space because it is guaranteed to be the first packet */
    tcp_select_initial_window(sk_listener, full_space,
        mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
        &req->rsk_rcv_wnd,
        &req->rsk_window_clamp,
        ireq->wscale_ok,
        &rcv_wscale,
        rcv_wnd);
    ireq->rcv_wscale = rcv_wscale;
}
EXPORT_SYMBOL(tcp_openreq_init_rwin);
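
/* Note (added for clarity): when timestamps were negotiated (tstamp_ok),
 * the MSS handed to tcp_select_initial_window() is reduced by
 * TCPOLEN_TSTAMP_ALIGNED (12 bytes), since every segment will carry the
 * timestamp option.  The `window_clamp ? : ...` above is the GNU ?:
 * extension: use window_clamp if non-zero, otherwise fall back to the
 * route metric RTAX_WINDOW.
 */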

static void tcp_ecn_openreq_child(struct tcp_sock *tp,
                  const struct request_sock *req)
{
    tp->ecn_flags = inet_rsk(req)->ecn_ok ? TCP_ECN_OK : 0;
}

void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst)
{
    struct inet_connection_sock *icsk = inet_csk(sk);
    u32 ca_key = dst_metric(dst, RTAX_CC_ALGO);
    bool ca_got_dst = false;

    if (ca_key != TCP_CA_UNSPEC) {
        const struct tcp_congestion_ops *ca;

        rcu_read_lock();
        ca = tcp_ca_find_key(ca_key);
        if (likely(ca && bpf_try_module_get(ca, ca->owner))) {
            icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst);
            icsk->icsk_ca_ops = ca;
            ca_got_dst = true;
        }
        rcu_read_unlock();
    }

    /* If no valid choice has been made yet, assign the current system
     * default ca.
     */
    if (!ca_got_dst &&
        (!icsk->icsk_ca_setsockopt ||
         !bpf_try_module_get(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner)))
        tcp_assign_congestion_control(sk);

    tcp_set_ca_state(sk, TCP_CA_Open);
}
EXPORT_SYMBOL_GPL(tcp_ca_openreq_child);

static void smc_check_reset_syn_req(struct tcp_sock *oldtp,
                    struct request_sock *req,
                    struct tcp_sock *newtp)
{
#if IS_ENABLED(CONFIG_SMC)
    struct inet_request_sock *ireq;

    if (static_branch_unlikely(&tcp_have_smc)) {
        ireq = inet_rsk(req);
        if (oldtp->syn_smc && !ireq->smc_ok)
            newtp->syn_smc = 0;
    }
#endif
}

/* This is not only more efficient than what we used to do, it eliminates
 * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
 *
 * Actually, we could avoid a lot of the memory writes here: the tp of the
 * listening socket already contains all the necessary default parameters.
 */
struct sock *tcp_create_openreq_child(const struct sock *sk,
                      struct request_sock *req,
                      struct sk_buff *skb)
{
    struct sock *newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC);
    const struct inet_request_sock *ireq = inet_rsk(req);
    struct tcp_request_sock *treq = tcp_rsk(req);
    struct inet_connection_sock *newicsk;
    struct tcp_sock *oldtp, *newtp;
    u32 seq;

    if (!newsk)
        return NULL;

    newicsk = inet_csk(newsk);
    newtp = tcp_sk(newsk);
    oldtp = tcp_sk(sk);

    smc_check_reset_syn_req(oldtp, req, newtp);

    /* Now setup tcp_sock */
    newtp->pred_flags = 0;

    seq = treq->rcv_isn + 1;
    newtp->rcv_wup = seq;
    WRITE_ONCE(newtp->copied_seq, seq);
    WRITE_ONCE(newtp->rcv_nxt, seq);
    newtp->segs_in = 1;

    seq = treq->snt_isn + 1;
    newtp->snd_sml = newtp->snd_una = seq;
    WRITE_ONCE(newtp->snd_nxt, seq);
    newtp->snd_up = seq;

    INIT_LIST_HEAD(&newtp->tsq_node);
    INIT_LIST_HEAD(&newtp->tsorted_sent_queue);

    tcp_init_wl(newtp, treq->rcv_isn);

    minmax_reset(&newtp->rtt_min, tcp_jiffies32, ~0U);
    newicsk->icsk_ack.lrcvtime = tcp_jiffies32;

    newtp->lsndtime = tcp_jiffies32;
    newsk->sk_txhash = treq->txhash;
    newtp->total_retrans = req->num_retrans;

    tcp_init_xmit_timers(newsk);
    WRITE_ONCE(newtp->write_seq, newtp->pushed_seq = treq->snt_isn + 1);

    if (sock_flag(newsk, SOCK_KEEPOPEN))
        inet_csk_reset_keepalive_timer(newsk,
                           keepalive_time_when(newtp));

    newtp->rx_opt.tstamp_ok = ireq->tstamp_ok;
    newtp->rx_opt.sack_ok = ireq->sack_ok;
    newtp->window_clamp = req->rsk_window_clamp;
    newtp->rcv_ssthresh = req->rsk_rcv_wnd;
    newtp->rcv_wnd = req->rsk_rcv_wnd;
    newtp->rx_opt.wscale_ok = ireq->wscale_ok;
    if (newtp->rx_opt.wscale_ok) {
        newtp->rx_opt.snd_wscale = ireq->snd_wscale;
        newtp->rx_opt.rcv_wscale = ireq->rcv_wscale;
    } else {
        newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0;
        newtp->window_clamp = min(newtp->window_clamp, 65535U);
    }
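
    /* Note (added for clarity): without a negotiated window scale, the
     * 16-bit window field caps the advertised window at 65535 bytes
     * (see RFC 7323), hence the clamp in the else branch above.
     */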
    newtp->snd_wnd = ntohs(tcp_hdr(skb)->window) << newtp->rx_opt.snd_wscale;
    newtp->max_window = newtp->snd_wnd;

    if (newtp->rx_opt.tstamp_ok) {
        newtp->rx_opt.ts_recent = req->ts_recent;
        newtp->rx_opt.ts_recent_stamp = ktime_get_seconds();
        newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
    } else {
        newtp->rx_opt.ts_recent_stamp = 0;
        newtp->tcp_header_len = sizeof(struct tcphdr);
    }
    if (req->num_timeout) {
        newtp->undo_marker = treq->snt_isn;
        newtp->retrans_stamp = div_u64(treq->snt_synack,
                           USEC_PER_SEC / TCP_TS_HZ);
    }
    newtp->tsoffset = treq->ts_off;
#ifdef CONFIG_TCP_MD5SIG
    newtp->md5sig_info = NULL;  /*XXX*/
    if (treq->af_specific->req_md5_lookup(sk, req_to_sk(req)))
        newtp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED;
#endif
    if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len)
        newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len;
    newtp->rx_opt.mss_clamp = req->mss;
    tcp_ecn_openreq_child(newtp, req);
    newtp->fastopen_req = NULL;
    RCU_INIT_POINTER(newtp->fastopen_rsk, NULL);

    tcp_bpf_clone(sk, newsk);

    __TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS);

    return newsk;
}
EXPORT_SYMBOL(tcp_create_openreq_child);

/*
 * Process an incoming packet for SYN_RECV sockets represented as a
 * request_sock. Normally sk is the listener socket but for TFO it
 * points to the child socket.
 *
 * XXX (TFO) - The current impl contains a special check for ack
 * validation here and inside tcp_v4_reqsk_send_ack(). Can we do better?
 *
 * We don't need to initialize tmp_opt.sack_ok as we don't use the results
 */

struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
               struct request_sock *req,
               bool fastopen, bool *req_stolen)
{
    struct tcp_options_received tmp_opt;
    struct sock *child;
    const struct tcphdr *th = tcp_hdr(skb);
    __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
    bool paws_reject = false;
    bool own_req;

    tmp_opt.saw_tstamp = 0;
    if (th->doff > (sizeof(struct tcphdr)>>2)) {
        tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0, NULL);

        if (tmp_opt.saw_tstamp) {
            tmp_opt.ts_recent = req->ts_recent;
            if (tmp_opt.rcv_tsecr)
                tmp_opt.rcv_tsecr -= tcp_rsk(req)->ts_off;
            /* We do not store the true stamp, but it is not required;
             * it can be estimated (approximately) from other data.
             */
            tmp_opt.ts_recent_stamp = ktime_get_seconds() - reqsk_timeout(req, TCP_RTO_MAX) / HZ;
            paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
        }
    }

    /* Check for a pure retransmitted SYN. */
    if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn &&
        flg == TCP_FLAG_SYN &&
        !paws_reject) {
        /*
         * RFC793 draws this case (incorrectly! It was fixed in RFC1122)
         * on figure 6 and figure 8, but the formal protocol
         * description says NOTHING.
         * To be more exact, it says that we should send an ACK,
         * because this segment (at least, if it has no data)
         * is out of window.
         *
         *  CONCLUSION: RFC793 (even with RFC1122) DOES NOT
         *  describe the SYN-RECV state. All the description
         *  is wrong, we cannot trust it and should
         *  rely only on common sense and implementation
         *  experience.
         *
         * Enforce "SYN-ACK" according to figure 8, figure 6
         * of RFC793, fixed by RFC1122.
         *
         * Note that even if there is new data in the SYN packet
         * it will be thrown away too.
         *
         * Reset the timer after retransmitting the SYNACK, similar to
         * the idea of fast retransmit in recovery.
         */
        if (!tcp_oow_rate_limited(sock_net(sk), skb,
                      LINUX_MIB_TCPACKSKIPPEDSYNRECV,
                      &tcp_rsk(req)->last_oow_ack_time) &&
            !inet_rtx_syn_ack(sk, req)) {
            unsigned long expires = jiffies;

            expires += reqsk_timeout(req, TCP_RTO_MAX);
            if (!fastopen)
                mod_timer_pending(&req->rsk_timer, expires);
            else
                req->rsk_timer.expires = expires;
        }
        return NULL;
    }

    /* The following further reproduces the section "SEGMENT ARRIVES"
       for the SYN-RECEIVED state of RFC793.
       It is broken, however: it fails only when SYNs are crossed.

       You would think that SYN crossing is impossible here, since
       we should have a SYN_SENT socket (from connect()) on our end,
       but this is not true if the crossed SYNs were sent to both
       ends by a malicious third party.  We must defend against this,
       and to do that we first verify the ACK (as per RFC793, page
       36) and reset if it is invalid.  Is this a true full defense?
       To convince ourselves, let us consider a way in which the ACK
       test can still pass in this 'malicious crossed SYNs' case.
       A malicious sender sends identical SYNs (and thus identical
       sequence numbers) to both A and B:

        A: gets SYN, seq=7
        B: gets SYN, seq=7

       By our good fortune, both A and B select the same initial
       send sequence number of seven :-)

        A: sends SYN|ACK, seq=7, ack_seq=8
        B: sends SYN|ACK, seq=7, ack_seq=8

       So we are now A eating this SYN|ACK, and the ACK test passes.  So
       does the sequence test; the SYN is truncated, and thus we consider
       it a bare ACK.

       If icsk->icsk_accept_queue.rskq_defer_accept, we silently drop this
       bare ACK.  Otherwise, we create an established connection.  Both
       ends (listening sockets) accept the new incoming connection and try
       to talk to each other. 8-)

       Note: this case is both harmless and rare.  The possibility is about
       the same as us discovering intelligent life on another planet
       tomorrow.

       But generally, we should (RFC lies!) accept an ACK
       from a SYNACK both here and in tcp_rcv_state_process().
       tcp_rcv_state_process() does not, hence, we do not either.

       Note that the case is absolutely generic:
       we cannot optimize anything here without
       violating the protocol. All the checks must be made
       before an attempt to create a socket.
     */

    /* RFC793 page 36: "If the connection is in any non-synchronized state ...
     *                  and the incoming segment acknowledges something not yet
     *                  sent (the segment carries an unacceptable ACK) ...
     *                  a reset is sent."
     *
     * Invalid ACK: a reset will be sent by the listening socket.
     * Note that the ACK validity check for a Fast Open socket is done
     * elsewhere and is checked directly against the child socket rather
     * than req because user data may have been sent out.
     */
    if ((flg & TCP_FLAG_ACK) && !fastopen &&
        (TCP_SKB_CB(skb)->ack_seq !=
         tcp_rsk(req)->snt_isn + 1))
        return sk;

    /* Also, it would not be a bad idea to check rcv_tsecr, which
     * is essentially an ACK extension; too early or too late values
     * should cause a reset in unsynchronized states.
     */

    /* RFC793: "first check sequence number". */

    if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
                      tcp_rsk(req)->rcv_nxt, tcp_rsk(req)->rcv_nxt + req->rsk_rcv_wnd)) {
        /* Out of window: send ACK and drop. */
        if (!(flg & TCP_FLAG_RST) &&
            !tcp_oow_rate_limited(sock_net(sk), skb,
                      LINUX_MIB_TCPACKSKIPPEDSYNRECV,
                      &tcp_rsk(req)->last_oow_ack_time))
            req->rsk_ops->send_ack(sk, skb, req);
        if (paws_reject)
            __NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
        return NULL;
    }

    /* In sequence, PAWS is OK. */

    if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_nxt))
        req->ts_recent = tmp_opt.rcv_tsval;

    if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) {
        /* Truncate SYN, it is out of window starting
         * at tcp_rsk(req)->rcv_isn + 1.
         */
        flg &= ~TCP_FLAG_SYN;
    }
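
    /* Example (added for clarity): a segment that still carries the old
     * SYN (seq == rcv_isn) has its SYN flag cleared here; the remainder
     * is then judged purely as an ACK and/or data by the checks below.
     */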

    /* RFC793: "second check the RST bit" and
     *     "fourth, check the SYN bit"
     */
    if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN)) {
        __TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
        goto embryonic_reset;
    }

    /* ACK sequence verified above, just make sure ACK is
     * set.  If ACK not set, just silently drop the packet.
     *
     * XXX (TFO) - if we ever allow "data after SYN", the
     * following check needs to be removed.
     */
    if (!(flg & TCP_FLAG_ACK))
        return NULL;

    /* For Fast Open no more processing is needed (sk is the
     * child socket).
     */
    if (fastopen)
        return sk;

    /* While TCP_DEFER_ACCEPT is active, drop bare ACK. */
    if (req->num_timeout < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
        TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
        inet_rsk(req)->acked = 1;
        __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP);
        return NULL;
    }
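
    /* Note (added for clarity): end_seq == rcv_isn + 1 identifies the bare
     * third-handshake ACK (it covers the SYN and nothing else).  With
     * TCP_DEFER_ACCEPT the child socket is not created until the client
     * actually sends data or the deferral period, tracked via num_timeout,
     * runs out.
     */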

    /* OK, the ACK is valid, create the big socket and
     * feed this segment to it. It will repeat all
     * the tests. THIS SEGMENT MUST MOVE THE SOCKET TO
     * ESTABLISHED STATE. If it gets dropped after the
     * socket is created, expect trouble.
     */
    child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL,
                             req, &own_req);
    if (!child)
        goto listen_overflow;

    if (own_req && rsk_drop_req(req)) {
        reqsk_queue_removed(&inet_csk(req->rsk_listener)->icsk_accept_queue, req);
        inet_csk_reqsk_queue_drop_and_put(req->rsk_listener, req);
        return child;
    }

    sock_rps_save_rxhash(child, skb);
    tcp_synack_rtt_meas(child, req);
    *req_stolen = !own_req;
    return inet_csk_complete_hashdance(sk, child, req, own_req);

listen_overflow:
    if (sk != req->rsk_listener)
        __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE);

    if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_abort_on_overflow)) {
        inet_rsk(req)->acked = 1;
        return NULL;
    }

embryonic_reset:
    if (!(flg & TCP_FLAG_RST)) {
        /* Received a bad SYN pkt - for TFO we try not to reset
         * the local connection unless it's really necessary, to
         * avoid becoming vulnerable to outside attacks aiming at
         * resetting legit local connections.
         */
        req->rsk_ops->send_reset(sk, skb);
    } else if (fastopen) { /* received a valid RST pkt */
        reqsk_fastopen_remove(sk, req, true);
        tcp_reset(sk, skb);
    }
    if (!fastopen) {
        bool unlinked = inet_csk_reqsk_queue_drop(sk, req);

        if (unlinked)
            __NET_INC_STATS(sock_net(sk), LINUX_MIB_EMBRYONICRSTS);
        *req_stolen = !unlinked;
    }
    return NULL;
}
EXPORT_SYMBOL(tcp_check_req);
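
/* Summary (added for clarity, derived from the code above): tcp_check_req()
 * returns NULL when the segment was consumed or dropped (possibly after an
 * ACK or reset was emitted), returns sk to hand the segment back to the
 * caller (an unacceptable ACK on the listener path, or any segment on the
 * Fast Open child), and otherwise returns the newly created child socket
 * that this segment should be fed to.
 */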

/*
 * Queue the segment on the new socket if the new socket is active,
 * otherwise we just shortcircuit this and continue with
 * the new socket.
 *
 * For the vast majority of cases child->sk_state will be TCP_SYN_RECV
 * when entering. But other states are possible due to a race condition
 * where, after __inet_lookup_established() fails but before the listener
 * lock is obtained, other packets cause the same connection to
 * be created.
 */

int tcp_child_process(struct sock *parent, struct sock *child,
              struct sk_buff *skb)
    __releases(&((child)->sk_lock.slock))
{
    int ret = 0;
    int state = child->sk_state;

    /* record sk_napi_id and sk_rx_queue_mapping of child. */
    sk_mark_napi_id_set(child, skb);

    tcp_segs_in(tcp_sk(child), skb);
    if (!sock_owned_by_user(child)) {
        ret = tcp_rcv_state_process(child, skb);
        /* Wakeup parent, send SIGIO */
        if (state == TCP_SYN_RECV && child->sk_state != state)
            parent->sk_data_ready(parent);
    } else {
        /* Alas, it is possible again, because we do the lookup
         * in the main socket hash table and the lock on the listening
         * socket no longer protects us.
         */
        __sk_add_backlog(child, skb);
    }

    bh_unlock_sock(child);
    sock_put(child);
    return ret;
}
EXPORT_SYMBOL(tcp_child_process);