// SPDX-License-Identifier: GPL-2.0-only
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP):
 *		the output engine, which builds TCP headers and options and
 *		hands segments to the IP layer for transmission.
 */

0038 #define pr_fmt(fmt) "TCP: " fmt
0039
0040 #include <net/tcp.h>
0041 #include <net/mptcp.h>
0042
0043 #include <linux/compiler.h>
0044 #include <linux/gfp.h>
0045 #include <linux/module.h>
0046 #include <linux/static_key.h>
0047
0048 #include <trace/events/tcp.h>
0049
0050
0051
0052
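/* Refresh the socket's cached clocks: snapshot tcp_clock_ns() into
 * tp->tcp_clock_cache and derive the microsecond tp->tcp_mstamp from it.
 */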
0053 void tcp_mstamp_refresh(struct tcp_sock *tp)
0054 {
0055 u64 val = tcp_clock_ns();
0056
0057 tp->tcp_clock_cache = val;
0058 tp->tcp_mstamp = div_u64(val, NSEC_PER_USEC);
0059 }
0060
0061 static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
0062 int push_one, gfp_t gfp);
0063
0064
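/* Account for new data that has been sent to the network: advance snd_nxt,
 * move the skb from the write queue to the retransmit rb-tree, update
 * packets_out and (re)arm the retransmit timer if needed.
 */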
0065 static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb)
0066 {
0067 struct inet_connection_sock *icsk = inet_csk(sk);
0068 struct tcp_sock *tp = tcp_sk(sk);
0069 unsigned int prior_packets = tp->packets_out;
0070
0071 WRITE_ONCE(tp->snd_nxt, TCP_SKB_CB(skb)->end_seq);
0072
0073 __skb_unlink(skb, &sk->sk_write_queue);
0074 tcp_rbtree_insert(&sk->tcp_rtx_queue, skb);
0075
0076 if (tp->highest_sack == NULL)
0077 tp->highest_sack = skb;
0078
0079 tp->packets_out += tcp_skb_pcount(skb);
0080 if (!prior_packets || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
0081 tcp_rearm_rto(sk);
0082
0083 NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT,
0084 tcp_skb_pcount(skb));
0085 tcp_check_space(sk);
0086 }
0087
0088
0089
0090
0091
0092
0093
0094
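/* Select a sequence number for a pure control segment: use SND.NXT unless
 * it lies beyond the right edge of the send window by at least one
 * window-scale unit (i.e. the peer shrank the window), in which case use
 * the window edge instead.
 */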
0095 static inline __u32 tcp_acceptable_seq(const struct sock *sk)
0096 {
0097 const struct tcp_sock *tp = tcp_sk(sk);
0098
0099 if (!before(tcp_wnd_end(tp), tp->snd_nxt) ||
0100 (tp->rx_opt.wscale_ok &&
0101 ((tp->snd_nxt - tcp_wnd_end(tp)) < (1 << tp->rx_opt.rcv_wscale))))
0102 return tp->snd_nxt;
0103 else
0104 return tcp_wnd_end(tp);
0105 }
0106
0107
0108
0109
0110
0111
0112
0113
0114
0115
0116
0117
0118
0119
0120
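/* Calculate the MSS to advertise in a SYN segment, capping the cached
 * tp->advmss by the advertised-MSS metric of the cached route, if any.
 */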
0121 static __u16 tcp_advertise_mss(struct sock *sk)
0122 {
0123 struct tcp_sock *tp = tcp_sk(sk);
0124 const struct dst_entry *dst = __sk_dst_get(sk);
0125 int mss = tp->advmss;
0126
0127 if (dst) {
0128 unsigned int metric = dst_metric_advmss(dst);
0129
0130 if (metric < mss) {
0131 mss = metric;
0132 tp->advmss = mss;
0133 }
0134 }
0135
0136 return (__u16)mss;
0137 }
0138
0139
0140
0141
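/* Restart the congestion window after an idle period (RFC 2861 congestion
 * window validation): halve cwnd once per RTO contained in @delta, but
 * never below the restart (initial) window.
 */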
0142 void tcp_cwnd_restart(struct sock *sk, s32 delta)
0143 {
0144 struct tcp_sock *tp = tcp_sk(sk);
0145 u32 restart_cwnd = tcp_init_cwnd(tp, __sk_dst_get(sk));
0146 u32 cwnd = tcp_snd_cwnd(tp);
0147
0148 tcp_ca_event(sk, CA_EVENT_CWND_RESTART);
0149
0150 tp->snd_ssthresh = tcp_current_ssthresh(sk);
0151 restart_cwnd = min(restart_cwnd, cwnd);
0152
0153 while ((delta -= inet_csk(sk)->icsk_rto) > 0 && cwnd > restart_cwnd)
0154 cwnd >>= 1;
0155 tcp_snd_cwnd_set(tp, max(cwnd, restart_cwnd));
0156 tp->snd_cwnd_stamp = tcp_jiffies32;
0157 tp->snd_cwnd_used = 0;
0158 }
0159
0160
0161 static void tcp_event_data_sent(struct tcp_sock *tp,
0162 struct sock *sk)
0163 {
0164 struct inet_connection_sock *icsk = inet_csk(sk);
0165 const u32 now = tcp_jiffies32;
0166
0167 if (tcp_packets_in_flight(tp) == 0)
0168 tcp_ca_event(sk, CA_EVENT_TX_START);
0169
0170 tp->lsndtime = now;
0171
0172
0173
0174
0175 if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato)
0176 inet_csk_enter_pingpong_mode(sk);
0177 }
0178
0179
0180 static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts,
0181 u32 rcv_nxt)
0182 {
0183 struct tcp_sock *tp = tcp_sk(sk);
0184
0185 if (unlikely(tp->compressed_ack)) {
0186 NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED,
0187 tp->compressed_ack);
0188 tp->compressed_ack = 0;
0189 if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1)
0190 __sock_put(sk);
0191 }
0192
0193 if (unlikely(rcv_nxt != tp->rcv_nxt))
0194 return;
0195 tcp_dec_quickack_mode(sk, pkts);
0196 inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
0197 }
0198
0199
0200
0201
0202
0203
0204
0205
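/* Determine the window scaling factor and the initial receive window to
 * offer, based on the amount of receive buffer space available.  Results
 * are returned through @rcv_wnd, @window_clamp and @rcv_wscale; the window
 * is rounded down to a multiple of the MSS where possible.
 */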
0206 void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss,
0207 __u32 *rcv_wnd, __u32 *window_clamp,
0208 int wscale_ok, __u8 *rcv_wscale,
0209 __u32 init_rcv_wnd)
0210 {
0211 unsigned int space = (__space < 0 ? 0 : __space);
0212
0213
0214 if (*window_clamp == 0)
0215 (*window_clamp) = (U16_MAX << TCP_MAX_WSCALE);
0216 space = min(*window_clamp, space);
0217
0218
0219 if (space > mss)
0220 space = rounddown(space, mss);
0221
0222
0223
0224
0225
0226
0227
0228
0229
0230 if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows))
0231 (*rcv_wnd) = min(space, MAX_TCP_WINDOW);
0232 else
0233 (*rcv_wnd) = min_t(u32, space, U16_MAX);
0234
0235 if (init_rcv_wnd)
0236 *rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss);
0237
0238 *rcv_wscale = 0;
0239 if (wscale_ok) {
0240
0241 space = max_t(u32, space, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]));
0242 space = max_t(u32, space, READ_ONCE(sysctl_rmem_max));
0243 space = min_t(u32, space, *window_clamp);
0244 *rcv_wscale = clamp_t(int, ilog2(space) - 15,
0245 0, TCP_MAX_WSCALE);
0246 }
0247
0248 (*window_clamp) = min_t(__u32, U16_MAX << (*rcv_wscale), *window_clamp);
0249 }
0250 EXPORT_SYMBOL(tcp_select_initial_window);
0251
0252
0253
0254
0255
0256
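/* Choose the receive window to advertise on an outgoing segment and update
 * rcv_wnd/rcv_wup accordingly.  The offered window is never shrunk, and the
 * value returned is already right-shifted by the receive window scale.
 */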
0257 static u16 tcp_select_window(struct sock *sk)
0258 {
0259 struct tcp_sock *tp = tcp_sk(sk);
0260 u32 old_win = tp->rcv_wnd;
0261 u32 cur_win = tcp_receive_window(tp);
0262 u32 new_win = __tcp_select_window(sk);
0263
0264
0265 if (new_win < cur_win) {
0266
0267
0268
0269
0270
0271
0272
0273 if (new_win == 0)
0274 NET_INC_STATS(sock_net(sk),
0275 LINUX_MIB_TCPWANTZEROWINDOWADV);
0276 new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale);
0277 }
0278 tp->rcv_wnd = new_win;
0279 tp->rcv_wup = tp->rcv_nxt;
0280
0281
0282
0283
0284 if (!tp->rx_opt.rcv_wscale &&
0285 READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows))
0286 new_win = min(new_win, MAX_TCP_WINDOW);
0287 else
0288 new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale));
0289
0290
0291 new_win >>= tp->rx_opt.rcv_wscale;
0292
0293
0294 if (new_win == 0) {
0295 tp->pred_flags = 0;
0296 if (old_win)
0297 NET_INC_STATS(sock_net(sk),
0298 LINUX_MIB_TCPTOZEROWINDOWADV);
0299 } else if (old_win == 0) {
0300 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFROMZEROWINDOWADV);
0301 }
0302
0303 return new_win;
0304 }
0305
0306
0307 static void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb)
0308 {
0309 const struct tcp_sock *tp = tcp_sk(sk);
0310
0311 TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR;
0312 if (!(tp->ecn_flags & TCP_ECN_OK))
0313 TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE;
0314 else if (tcp_ca_needs_ecn(sk) ||
0315 tcp_bpf_ca_needs_ecn(sk))
0316 INET_ECN_xmit(sk);
0317 }
0318
0319
0320 static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb)
0321 {
0322 struct tcp_sock *tp = tcp_sk(sk);
0323 bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk);
0324 bool use_ecn = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn) == 1 ||
0325 tcp_ca_needs_ecn(sk) || bpf_needs_ecn;
0326
0327 if (!use_ecn) {
0328 const struct dst_entry *dst = __sk_dst_get(sk);
0329
0330 if (dst && dst_feature(dst, RTAX_FEATURE_ECN))
0331 use_ecn = true;
0332 }
0333
0334 tp->ecn_flags = 0;
0335
0336 if (use_ecn) {
0337 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR;
0338 tp->ecn_flags = TCP_ECN_OK;
0339 if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn)
0340 INET_ECN_xmit(sk);
0341 }
0342 }
0343
0344 static void tcp_ecn_clear_syn(struct sock *sk, struct sk_buff *skb)
0345 {
0346 if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback))
0347
0348
0349
0350 TCP_SKB_CB(skb)->tcp_flags &= ~(TCPHDR_ECE | TCPHDR_CWR);
0351 }
0352
0353 static void
0354 tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th)
0355 {
0356 if (inet_rsk(req)->ecn_ok)
0357 th->ece = 1;
0358 }
0359
0360
0361
0362
0363 static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb,
0364 struct tcphdr *th, int tcp_header_len)
0365 {
0366 struct tcp_sock *tp = tcp_sk(sk);
0367
0368 if (tp->ecn_flags & TCP_ECN_OK) {
0369
0370 if (skb->len != tcp_header_len &&
0371 !before(TCP_SKB_CB(skb)->seq, tp->snd_nxt)) {
0372 INET_ECN_xmit(sk);
0373 if (tp->ecn_flags & TCP_ECN_QUEUE_CWR) {
0374 tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR;
0375 th->cwr = 1;
0376 skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
0377 }
0378 } else if (!tcp_ca_needs_ecn(sk)) {
0379
0380 INET_ECN_dontxmit(sk);
0381 }
0382 if (tp->ecn_flags & TCP_ECN_DEMAND_CWR)
0383 th->ece = 1;
0384 }
0385 }
0386
0387
0388
0389
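/* Initialise the control fields of a non-data skb.  SYN and FIN occupy one
 * unit of sequence space, so end_seq is bumped for those flags.
 */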
0390 static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
0391 {
0392 skb->ip_summed = CHECKSUM_PARTIAL;
0393
0394 TCP_SKB_CB(skb)->tcp_flags = flags;
0395
0396 tcp_skb_pcount_set(skb, 1);
0397
0398 TCP_SKB_CB(skb)->seq = seq;
0399 if (flags & (TCPHDR_SYN | TCPHDR_FIN))
0400 seq++;
0401 TCP_SKB_CB(skb)->end_seq = seq;
0402 }
0403
0404 static inline bool tcp_urg_mode(const struct tcp_sock *tp)
0405 {
0406 return tp->snd_una != tp->snd_up;
0407 }
0408
0409 #define OPTION_SACK_ADVERTISE BIT(0)
0410 #define OPTION_TS BIT(1)
0411 #define OPTION_MD5 BIT(2)
0412 #define OPTION_WSCALE BIT(3)
0413 #define OPTION_FAST_OPEN_COOKIE BIT(8)
0414 #define OPTION_SMC BIT(9)
0415 #define OPTION_MPTCP BIT(10)
0416
0417 static void smc_options_write(__be32 *ptr, u16 *options)
0418 {
0419 #if IS_ENABLED(CONFIG_SMC)
0420 if (static_branch_unlikely(&tcp_have_smc)) {
0421 if (unlikely(OPTION_SMC & *options)) {
0422 *ptr++ = htonl((TCPOPT_NOP << 24) |
0423 (TCPOPT_NOP << 16) |
0424 (TCPOPT_EXP << 8) |
0425 (TCPOLEN_EXP_SMC_BASE));
0426 *ptr++ = htonl(TCPOPT_SMC_MAGIC);
0427 }
0428 }
0429 #endif
0430 }
0431
0432 struct tcp_out_options {
0433 u16 options;
0434 u16 mss;
0435 u8 ws;
0436 u8 num_sack_blocks;
0437 u8 hash_size;
0438 u8 bpf_opt_len;
0439 __u8 *hash_location;
0440 __u32 tsval, tsecr;
0441 struct tcp_fastopen_cookie *fastopen_cookie;
0442 struct mptcp_out_options mptcp;
0443 };
0444
0445 static void mptcp_options_write(struct tcphdr *th, __be32 *ptr,
0446 struct tcp_sock *tp,
0447 struct tcp_out_options *opts)
0448 {
0449 #if IS_ENABLED(CONFIG_MPTCP)
0450 if (unlikely(OPTION_MPTCP & opts->options))
0451 mptcp_write_options(th, ptr, tp, &opts->mptcp);
0452 #endif
0453 }
0454
0455 #ifdef CONFIG_CGROUP_BPF
0456 static int bpf_skops_write_hdr_opt_arg0(struct sk_buff *skb,
0457 enum tcp_synack_type synack_type)
0458 {
0459 if (unlikely(!skb))
0460 return BPF_WRITE_HDR_TCP_CURRENT_MSS;
0461
0462 if (unlikely(synack_type == TCP_SYNACK_COOKIE))
0463 return BPF_WRITE_HDR_TCP_SYNACK_COOKIE;
0464
0465 return 0;
0466 }
0467
0468
0469 static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb,
0470 struct request_sock *req,
0471 struct sk_buff *syn_skb,
0472 enum tcp_synack_type synack_type,
0473 struct tcp_out_options *opts,
0474 unsigned int *remaining)
0475 {
0476 struct bpf_sock_ops_kern sock_ops;
0477 int err;
0478
0479 if (likely(!BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk),
0480 BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG)) ||
0481 !*remaining)
0482 return;
0483
0484
0485
0486
0487 memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
0488
0489 sock_ops.op = BPF_SOCK_OPS_HDR_OPT_LEN_CB;
0490
0491 if (req) {
0492
0493
0494
0495
0496
0497
0498
0499
0500
0501
0502
0503
0504
0505 sock_ops.sk = (struct sock *)req;
0506 sock_ops.syn_skb = syn_skb;
0507 } else {
0508 sock_owned_by_me(sk);
0509
0510 sock_ops.is_fullsock = 1;
0511 sock_ops.sk = sk;
0512 }
0513
0514 sock_ops.args[0] = bpf_skops_write_hdr_opt_arg0(skb, synack_type);
0515 sock_ops.remaining_opt_len = *remaining;
0516
0517 if (skb)
0518 bpf_skops_init_skb(&sock_ops, skb, 0);
0519
0520 err = BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(&sock_ops, sk);
0521
0522 if (err || sock_ops.remaining_opt_len == *remaining)
0523 return;
0524
0525 opts->bpf_opt_len = *remaining - sock_ops.remaining_opt_len;
0526
0527 opts->bpf_opt_len = (opts->bpf_opt_len + 3) & ~3;
0528
0529 *remaining -= opts->bpf_opt_len;
0530 }
0531
0532 static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb,
0533 struct request_sock *req,
0534 struct sk_buff *syn_skb,
0535 enum tcp_synack_type synack_type,
0536 struct tcp_out_options *opts)
0537 {
0538 u8 first_opt_off, nr_written, max_opt_len = opts->bpf_opt_len;
0539 struct bpf_sock_ops_kern sock_ops;
0540 int err;
0541
0542 if (likely(!max_opt_len))
0543 return;
0544
0545 memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
0546
0547 sock_ops.op = BPF_SOCK_OPS_WRITE_HDR_OPT_CB;
0548
0549 if (req) {
0550 sock_ops.sk = (struct sock *)req;
0551 sock_ops.syn_skb = syn_skb;
0552 } else {
0553 sock_owned_by_me(sk);
0554
0555 sock_ops.is_fullsock = 1;
0556 sock_ops.sk = sk;
0557 }
0558
0559 sock_ops.args[0] = bpf_skops_write_hdr_opt_arg0(skb, synack_type);
0560 sock_ops.remaining_opt_len = max_opt_len;
0561 first_opt_off = tcp_hdrlen(skb) - max_opt_len;
0562 bpf_skops_init_skb(&sock_ops, skb, first_opt_off);
0563
0564 err = BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(&sock_ops, sk);
0565
0566 if (err)
0567 nr_written = 0;
0568 else
0569 nr_written = max_opt_len - sock_ops.remaining_opt_len;
0570
0571 if (nr_written < max_opt_len)
0572 memset(skb->data + first_opt_off + nr_written, TCPOPT_NOP,
0573 max_opt_len - nr_written);
0574 }
0575 #else
0576 static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb,
0577 struct request_sock *req,
0578 struct sk_buff *syn_skb,
0579 enum tcp_synack_type synack_type,
0580 struct tcp_out_options *opts,
0581 unsigned int *remaining)
0582 {
0583 }
0584
0585 static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb,
0586 struct request_sock *req,
0587 struct sk_buff *syn_skb,
0588 enum tcp_synack_type synack_type,
0589 struct tcp_out_options *opts)
0590 {
0591 }
0592 #endif
0593
0594
0595
0596
0597
0598
0599
0600
0601
0602
0603
0604
0605
0606
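/* Write previously computed TCP options into the header at @th.  Option
 * ordering is deliberate (parts of the Internet are sensitive to it), and
 * NOP padding keeps each option 32-bit aligned.
 */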
0607 static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp,
0608 struct tcp_out_options *opts)
0609 {
0610 __be32 *ptr = (__be32 *)(th + 1);
0611 u16 options = opts->options;
0612
0613 if (unlikely(OPTION_MD5 & options)) {
0614 *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
0615 (TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG);
0616
0617 opts->hash_location = (__u8 *)ptr;
0618 ptr += 4;
0619 }
0620
0621 if (unlikely(opts->mss)) {
0622 *ptr++ = htonl((TCPOPT_MSS << 24) |
0623 (TCPOLEN_MSS << 16) |
0624 opts->mss);
0625 }
0626
0627 if (likely(OPTION_TS & options)) {
0628 if (unlikely(OPTION_SACK_ADVERTISE & options)) {
0629 *ptr++ = htonl((TCPOPT_SACK_PERM << 24) |
0630 (TCPOLEN_SACK_PERM << 16) |
0631 (TCPOPT_TIMESTAMP << 8) |
0632 TCPOLEN_TIMESTAMP);
0633 options &= ~OPTION_SACK_ADVERTISE;
0634 } else {
0635 *ptr++ = htonl((TCPOPT_NOP << 24) |
0636 (TCPOPT_NOP << 16) |
0637 (TCPOPT_TIMESTAMP << 8) |
0638 TCPOLEN_TIMESTAMP);
0639 }
0640 *ptr++ = htonl(opts->tsval);
0641 *ptr++ = htonl(opts->tsecr);
0642 }
0643
0644 if (unlikely(OPTION_SACK_ADVERTISE & options)) {
0645 *ptr++ = htonl((TCPOPT_NOP << 24) |
0646 (TCPOPT_NOP << 16) |
0647 (TCPOPT_SACK_PERM << 8) |
0648 TCPOLEN_SACK_PERM);
0649 }
0650
0651 if (unlikely(OPTION_WSCALE & options)) {
0652 *ptr++ = htonl((TCPOPT_NOP << 24) |
0653 (TCPOPT_WINDOW << 16) |
0654 (TCPOLEN_WINDOW << 8) |
0655 opts->ws);
0656 }
0657
0658 if (unlikely(opts->num_sack_blocks)) {
0659 struct tcp_sack_block *sp = tp->rx_opt.dsack ?
0660 tp->duplicate_sack : tp->selective_acks;
0661 int this_sack;
0662
0663 *ptr++ = htonl((TCPOPT_NOP << 24) |
0664 (TCPOPT_NOP << 16) |
0665 (TCPOPT_SACK << 8) |
0666 (TCPOLEN_SACK_BASE + (opts->num_sack_blocks *
0667 TCPOLEN_SACK_PERBLOCK)));
0668
0669 for (this_sack = 0; this_sack < opts->num_sack_blocks;
0670 ++this_sack) {
0671 *ptr++ = htonl(sp[this_sack].start_seq);
0672 *ptr++ = htonl(sp[this_sack].end_seq);
0673 }
0674
0675 tp->rx_opt.dsack = 0;
0676 }
0677
0678 if (unlikely(OPTION_FAST_OPEN_COOKIE & options)) {
0679 struct tcp_fastopen_cookie *foc = opts->fastopen_cookie;
0680 u8 *p = (u8 *)ptr;
0681 u32 len;
0682
0683 if (foc->exp) {
0684 len = TCPOLEN_EXP_FASTOPEN_BASE + foc->len;
0685 *ptr = htonl((TCPOPT_EXP << 24) | (len << 16) |
0686 TCPOPT_FASTOPEN_MAGIC);
0687 p += TCPOLEN_EXP_FASTOPEN_BASE;
0688 } else {
0689 len = TCPOLEN_FASTOPEN_BASE + foc->len;
0690 *p++ = TCPOPT_FASTOPEN;
0691 *p++ = len;
0692 }
0693
0694 memcpy(p, foc->val, foc->len);
0695 if ((len & 3) == 2) {
0696 p[foc->len] = TCPOPT_NOP;
0697 p[foc->len + 1] = TCPOPT_NOP;
0698 }
0699 ptr += (len + 3) >> 2;
0700 }
0701
0702 smc_options_write(ptr, &options);
0703
0704 mptcp_options_write(th, ptr, tp, opts);
0705 }
0706
0707 static void smc_set_option(const struct tcp_sock *tp,
0708 struct tcp_out_options *opts,
0709 unsigned int *remaining)
0710 {
0711 #if IS_ENABLED(CONFIG_SMC)
0712 if (static_branch_unlikely(&tcp_have_smc)) {
0713 if (tp->syn_smc) {
0714 if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) {
0715 opts->options |= OPTION_SMC;
0716 *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED;
0717 }
0718 }
0719 }
0720 #endif
0721 }
0722
0723 static void smc_set_option_cond(const struct tcp_sock *tp,
0724 const struct inet_request_sock *ireq,
0725 struct tcp_out_options *opts,
0726 unsigned int *remaining)
0727 {
0728 #if IS_ENABLED(CONFIG_SMC)
0729 if (static_branch_unlikely(&tcp_have_smc)) {
0730 if (tp->syn_smc && ireq->smc_ok) {
0731 if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) {
0732 opts->options |= OPTION_SMC;
0733 *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED;
0734 }
0735 }
0736 }
0737 #endif
0738 }
0739
0740 static void mptcp_set_option_cond(const struct request_sock *req,
0741 struct tcp_out_options *opts,
0742 unsigned int *remaining)
0743 {
0744 if (rsk_is_mptcp(req)) {
0745 unsigned int size;
0746
0747 if (mptcp_synack_options(req, &size, &opts->mptcp)) {
0748 if (*remaining >= size) {
0749 opts->options |= OPTION_MPTCP;
0750 *remaining -= size;
0751 }
0752 }
0753 }
0754 }
0755
0756
0757
0758
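/* Compute TCP options for SYN packets.  This is not the final network wire
 * format yet; the return value is the number of option bytes that will be
 * used.
 */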
0759 static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
0760 struct tcp_out_options *opts,
0761 struct tcp_md5sig_key **md5)
0762 {
0763 struct tcp_sock *tp = tcp_sk(sk);
0764 unsigned int remaining = MAX_TCP_OPTION_SPACE;
0765 struct tcp_fastopen_request *fastopen = tp->fastopen_req;
0766
0767 *md5 = NULL;
0768 #ifdef CONFIG_TCP_MD5SIG
0769 if (static_branch_unlikely(&tcp_md5_needed) &&
0770 rcu_access_pointer(tp->md5sig_info)) {
0771 *md5 = tp->af_specific->md5_lookup(sk, sk);
0772 if (*md5) {
0773 opts->options |= OPTION_MD5;
0774 remaining -= TCPOLEN_MD5SIG_ALIGNED;
0775 }
0776 }
0777 #endif
0778
0779
0780
0781
0782
0783
0784
0785
0786
0787
0788 opts->mss = tcp_advertise_mss(sk);
0789 remaining -= TCPOLEN_MSS_ALIGNED;
0790
0791 if (likely(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_timestamps) && !*md5)) {
0792 opts->options |= OPTION_TS;
0793 opts->tsval = tcp_skb_timestamp(skb) + tp->tsoffset;
0794 opts->tsecr = tp->rx_opt.ts_recent;
0795 remaining -= TCPOLEN_TSTAMP_ALIGNED;
0796 }
0797 if (likely(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_window_scaling))) {
0798 opts->ws = tp->rx_opt.rcv_wscale;
0799 opts->options |= OPTION_WSCALE;
0800 remaining -= TCPOLEN_WSCALE_ALIGNED;
0801 }
0802 if (likely(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_sack))) {
0803 opts->options |= OPTION_SACK_ADVERTISE;
0804 if (unlikely(!(OPTION_TS & opts->options)))
0805 remaining -= TCPOLEN_SACKPERM_ALIGNED;
0806 }
0807
0808 if (fastopen && fastopen->cookie.len >= 0) {
0809 u32 need = fastopen->cookie.len;
0810
0811 need += fastopen->cookie.exp ? TCPOLEN_EXP_FASTOPEN_BASE :
0812 TCPOLEN_FASTOPEN_BASE;
0813 need = (need + 3) & ~3U;
0814 if (remaining >= need) {
0815 opts->options |= OPTION_FAST_OPEN_COOKIE;
0816 opts->fastopen_cookie = &fastopen->cookie;
0817 remaining -= need;
0818 tp->syn_fastopen = 1;
0819 tp->syn_fastopen_exp = fastopen->cookie.exp ? 1 : 0;
0820 }
0821 }
0822
0823 smc_set_option(tp, opts, &remaining);
0824
0825 if (sk_is_mptcp(sk)) {
0826 unsigned int size;
0827
0828 if (mptcp_syn_options(sk, skb, &size, &opts->mptcp)) {
0829 opts->options |= OPTION_MPTCP;
0830 remaining -= size;
0831 }
0832 }
0833
0834 bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining);
0835
0836 return MAX_TCP_OPTION_SPACE - remaining;
0837 }
0838
0839
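/* Set up TCP options for SYN-ACKs, built from the request_sock rather than
 * full socket state.
 */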
0840 static unsigned int tcp_synack_options(const struct sock *sk,
0841 struct request_sock *req,
0842 unsigned int mss, struct sk_buff *skb,
0843 struct tcp_out_options *opts,
0844 const struct tcp_md5sig_key *md5,
0845 struct tcp_fastopen_cookie *foc,
0846 enum tcp_synack_type synack_type,
0847 struct sk_buff *syn_skb)
0848 {
0849 struct inet_request_sock *ireq = inet_rsk(req);
0850 unsigned int remaining = MAX_TCP_OPTION_SPACE;
0851
0852 #ifdef CONFIG_TCP_MD5SIG
0853 if (md5) {
0854 opts->options |= OPTION_MD5;
0855 remaining -= TCPOLEN_MD5SIG_ALIGNED;
0856
0857
0858
0859
0860
0861
0862 if (synack_type != TCP_SYNACK_COOKIE)
0863 ireq->tstamp_ok &= !ireq->sack_ok;
0864 }
0865 #endif
0866
0867
0868 opts->mss = mss;
0869 remaining -= TCPOLEN_MSS_ALIGNED;
0870
0871 if (likely(ireq->wscale_ok)) {
0872 opts->ws = ireq->rcv_wscale;
0873 opts->options |= OPTION_WSCALE;
0874 remaining -= TCPOLEN_WSCALE_ALIGNED;
0875 }
0876 if (likely(ireq->tstamp_ok)) {
0877 opts->options |= OPTION_TS;
0878 opts->tsval = tcp_skb_timestamp(skb) + tcp_rsk(req)->ts_off;
0879 opts->tsecr = req->ts_recent;
0880 remaining -= TCPOLEN_TSTAMP_ALIGNED;
0881 }
0882 if (likely(ireq->sack_ok)) {
0883 opts->options |= OPTION_SACK_ADVERTISE;
0884 if (unlikely(!ireq->tstamp_ok))
0885 remaining -= TCPOLEN_SACKPERM_ALIGNED;
0886 }
0887 if (foc != NULL && foc->len >= 0) {
0888 u32 need = foc->len;
0889
0890 need += foc->exp ? TCPOLEN_EXP_FASTOPEN_BASE :
0891 TCPOLEN_FASTOPEN_BASE;
0892 need = (need + 3) & ~3U;
0893 if (remaining >= need) {
0894 opts->options |= OPTION_FAST_OPEN_COOKIE;
0895 opts->fastopen_cookie = foc;
0896 remaining -= need;
0897 }
0898 }
0899
0900 mptcp_set_option_cond(req, opts, &remaining);
0901
0902 smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining);
0903
0904 bpf_skops_hdr_opt_len((struct sock *)sk, skb, req, syn_skb,
0905 synack_type, opts, &remaining);
0906
0907 return MAX_TCP_OPTION_SPACE - remaining;
0908 }
0909
0910
0911
0912
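/* Compute TCP options for ESTABLISHED sockets and return the size of the
 * option block.  SACK blocks only get the space left after timestamp and
 * MPTCP options; BPF-written options are sized last.
 */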
0913 static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb,
0914 struct tcp_out_options *opts,
0915 struct tcp_md5sig_key **md5)
0916 {
0917 struct tcp_sock *tp = tcp_sk(sk);
0918 unsigned int size = 0;
0919 unsigned int eff_sacks;
0920
0921 opts->options = 0;
0922
0923 *md5 = NULL;
0924 #ifdef CONFIG_TCP_MD5SIG
0925 if (static_branch_unlikely(&tcp_md5_needed) &&
0926 rcu_access_pointer(tp->md5sig_info)) {
0927 *md5 = tp->af_specific->md5_lookup(sk, sk);
0928 if (*md5) {
0929 opts->options |= OPTION_MD5;
0930 size += TCPOLEN_MD5SIG_ALIGNED;
0931 }
0932 }
0933 #endif
0934
0935 if (likely(tp->rx_opt.tstamp_ok)) {
0936 opts->options |= OPTION_TS;
0937 opts->tsval = skb ? tcp_skb_timestamp(skb) + tp->tsoffset : 0;
0938 opts->tsecr = tp->rx_opt.ts_recent;
0939 size += TCPOLEN_TSTAMP_ALIGNED;
0940 }
0941
0942
0943
0944
0945
0946
0947
0948 if (sk_is_mptcp(sk)) {
0949 unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
0950 unsigned int opt_size = 0;
0951
0952 if (mptcp_established_options(sk, skb, &opt_size, remaining,
0953 &opts->mptcp)) {
0954 opts->options |= OPTION_MPTCP;
0955 size += opt_size;
0956 }
0957 }
0958
0959 eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack;
0960 if (unlikely(eff_sacks)) {
0961 const unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
0962 if (unlikely(remaining < TCPOLEN_SACK_BASE_ALIGNED +
0963 TCPOLEN_SACK_PERBLOCK))
0964 return size;
0965
0966 opts->num_sack_blocks =
0967 min_t(unsigned int, eff_sacks,
0968 (remaining - TCPOLEN_SACK_BASE_ALIGNED) /
0969 TCPOLEN_SACK_PERBLOCK);
0970
0971 size += TCPOLEN_SACK_BASE_ALIGNED +
0972 opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
0973 }
0974
0975 if (unlikely(BPF_SOCK_OPS_TEST_FLAG(tp,
0976 BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG))) {
0977 unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
0978
0979 bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining);
0980
0981 size = MAX_TCP_OPTION_SPACE - remaining;
0982 }
0983
0984 return size;
0985 }
0986
0987
0988
0989
0990
0991
0992
0993
0994
0995
0996
0997
0998
0999
1000
1001
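/* TCP Small Queues (TSQ).
 *
 * TSQ limits the number of skbs a flow can have queued in the qdisc and
 * device layers, to reduce RTT and bufferbloat.  The skb destructor
 * (tcp_wfree) detects completions; since transmitting from a destructor is
 * not allowed, a per-cpu tasklet later pushes more packets for sockets
 * that were throttled.
 */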
1002 struct tsq_tasklet {
1003 struct tasklet_struct tasklet;
1004 struct list_head head;
1005 };
1006 static DEFINE_PER_CPU(struct tsq_tasklet, tsq_tasklet);
1007
1008 static void tcp_tsq_write(struct sock *sk)
1009 {
1010 if ((1 << sk->sk_state) &
1011 (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING |
1012 TCPF_CLOSE_WAIT | TCPF_LAST_ACK)) {
1013 struct tcp_sock *tp = tcp_sk(sk);
1014
1015 if (tp->lost_out > tp->retrans_out &&
1016 tcp_snd_cwnd(tp) > tcp_packets_in_flight(tp)) {
1017 tcp_mstamp_refresh(tp);
1018 tcp_xmit_retransmit_queue(sk);
1019 }
1020
1021 tcp_write_xmit(sk, tcp_current_mss(sk), tp->nonagle,
1022 0, GFP_ATOMIC);
1023 }
1024 }
1025
1026 static void tcp_tsq_handler(struct sock *sk)
1027 {
1028 bh_lock_sock(sk);
1029 if (!sock_owned_by_user(sk))
1030 tcp_tsq_write(sk);
1031 else if (!test_and_set_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags))
1032 sock_hold(sk);
1033 bh_unlock_sock(sk);
1034 }
1035
1036
1037
1038
1039
1040
1041 static void tcp_tasklet_func(struct tasklet_struct *t)
1042 {
1043 struct tsq_tasklet *tsq = from_tasklet(tsq, t, tasklet);
1044 LIST_HEAD(list);
1045 unsigned long flags;
1046 struct list_head *q, *n;
1047 struct tcp_sock *tp;
1048 struct sock *sk;
1049
1050 local_irq_save(flags);
1051 list_splice_init(&tsq->head, &list);
1052 local_irq_restore(flags);
1053
1054 list_for_each_safe(q, n, &list) {
1055 tp = list_entry(q, struct tcp_sock, tsq_node);
1056 list_del(&tp->tsq_node);
1057
1058 sk = (struct sock *)tp;
1059 smp_mb__before_atomic();
1060 clear_bit(TSQ_QUEUED, &sk->sk_tsq_flags);
1061
1062 tcp_tsq_handler(sk);
1063 sk_free(sk);
1064 }
1065 }
1066
1067 #define TCP_DEFERRED_ALL (TCPF_TSQ_DEFERRED | \
1068 TCPF_WRITE_TIMER_DEFERRED | \
1069 TCPF_DELACK_TIMER_DEFERRED | \
1070 TCPF_MTU_REDUCED_DEFERRED)
1071
1072
1073
1074
1075
1076
1077
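/* tcp_release_cb - release_sock() callback.
 *
 * Runs work that was deferred while the socket was owned by the user:
 * TSQ transmit, write and delayed-ack timer handlers, and MTU reduction.
 */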
1078 void tcp_release_cb(struct sock *sk)
1079 {
1080 unsigned long flags, nflags;
1081
1082
1083 do {
1084 flags = sk->sk_tsq_flags;
1085 if (!(flags & TCP_DEFERRED_ALL))
1086 return;
1087 nflags = flags & ~TCP_DEFERRED_ALL;
1088 } while (cmpxchg(&sk->sk_tsq_flags, flags, nflags) != flags);
1089
1090 if (flags & TCPF_TSQ_DEFERRED) {
1091 tcp_tsq_write(sk);
1092 __sock_put(sk);
1093 }
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103 sock_release_ownership(sk);
1104
1105 if (flags & TCPF_WRITE_TIMER_DEFERRED) {
1106 tcp_write_timer_handler(sk);
1107 __sock_put(sk);
1108 }
1109 if (flags & TCPF_DELACK_TIMER_DEFERRED) {
1110 tcp_delack_timer_handler(sk);
1111 __sock_put(sk);
1112 }
1113 if (flags & TCPF_MTU_REDUCED_DEFERRED) {
1114 inet_csk(sk)->icsk_af_ops->mtu_reduced(sk);
1115 __sock_put(sk);
1116 }
1117 }
1118 EXPORT_SYMBOL(tcp_release_cb);
1119
1120 void __init tcp_tasklet_init(void)
1121 {
1122 int i;
1123
1124 for_each_possible_cpu(i) {
1125 struct tsq_tasklet *tsq = &per_cpu(tsq_tasklet, i);
1126
1127 INIT_LIST_HEAD(&tsq->head);
1128 tasklet_setup(&tsq->tasklet, tcp_tasklet_func);
1129 }
1130 }
1131
1132
1133
1134
1135
1136
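/* Write-buffer destructor for data skbs.  Transmitting new skbs from this
 * context is not allowed (a qdisc lock may already be held), so if the
 * socket was TSQ-throttled it is queued on the per-cpu tasklet, which will
 * push more packets later.
 */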
1137 void tcp_wfree(struct sk_buff *skb)
1138 {
1139 struct sock *sk = skb->sk;
1140 struct tcp_sock *tp = tcp_sk(sk);
1141 unsigned long flags, nval, oval;
1142
1143
1144
1145
1146 WARN_ON(refcount_sub_and_test(skb->truesize - 1, &sk->sk_wmem_alloc));
1147
1148
1149
1150
1151
1152
1153
1154
1155 if (refcount_read(&sk->sk_wmem_alloc) >= SKB_TRUESIZE(1) && this_cpu_ksoftirqd() == current)
1156 goto out;
1157
1158 for (oval = READ_ONCE(sk->sk_tsq_flags);; oval = nval) {
1159 struct tsq_tasklet *tsq;
1160 bool empty;
1161
1162 if (!(oval & TSQF_THROTTLED) || (oval & TSQF_QUEUED))
1163 goto out;
1164
1165 nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED;
1166 nval = cmpxchg(&sk->sk_tsq_flags, oval, nval);
1167 if (nval != oval)
1168 continue;
1169
1170
1171 local_irq_save(flags);
1172 tsq = this_cpu_ptr(&tsq_tasklet);
1173 empty = list_empty(&tsq->head);
1174 list_add(&tp->tsq_node, &tsq->head);
1175 if (empty)
1176 tasklet_schedule(&tsq->tasklet);
1177 local_irq_restore(flags);
1178 return;
1179 }
1180 out:
1181 sk_free(sk);
1182 }
1183
1184
1185
1186
1187 enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer)
1188 {
1189 struct tcp_sock *tp = container_of(timer, struct tcp_sock, pacing_timer);
1190 struct sock *sk = (struct sock *)tp;
1191
1192 tcp_tsq_handler(sk);
1193 sock_put(sk);
1194
1195 return HRTIMER_NORESTART;
1196 }
1197
1198 static void tcp_update_skb_after_send(struct sock *sk, struct sk_buff *skb,
1199 u64 prior_wstamp)
1200 {
1201 struct tcp_sock *tp = tcp_sk(sk);
1202
1203 if (sk->sk_pacing_status != SK_PACING_NONE) {
1204 unsigned long rate = sk->sk_pacing_rate;
1205
1206
1207
1208
1209
1210 if (rate != ~0UL && rate && tp->data_segs_out >= 10) {
1211 u64 len_ns = div64_ul((u64)skb->len * NSEC_PER_SEC, rate);
1212 u64 credit = tp->tcp_wstamp_ns - prior_wstamp;
1213
1214
1215 len_ns -= min_t(u64, len_ns / 2, credit);
1216 tp->tcp_wstamp_ns += len_ns;
1217 }
1218 }
1219 list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
1220 }
1221
1222 INDIRECT_CALLABLE_DECLARE(int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl));
1223 INDIRECT_CALLABLE_DECLARE(int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl));
1224 INDIRECT_CALLABLE_DECLARE(void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb));
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
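/* This routine actually transmits TCP packets queued in by
 * tcp_do_sendmsg().  It is used both for the initial transmission and for
 * possible later retransmissions.  All skbs seen here are completely
 * headerless: the TCP header and options are built here, then the segment
 * is handed to the af-specific queue_xmit, whose (net_xmit_eval-filtered)
 * result is returned.
 */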
1237 static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
1238 int clone_it, gfp_t gfp_mask, u32 rcv_nxt)
1239 {
1240 const struct inet_connection_sock *icsk = inet_csk(sk);
1241 struct inet_sock *inet;
1242 struct tcp_sock *tp;
1243 struct tcp_skb_cb *tcb;
1244 struct tcp_out_options opts;
1245 unsigned int tcp_options_size, tcp_header_size;
1246 struct sk_buff *oskb = NULL;
1247 struct tcp_md5sig_key *md5;
1248 struct tcphdr *th;
1249 u64 prior_wstamp;
1250 int err;
1251
1252 BUG_ON(!skb || !tcp_skb_pcount(skb));
1253 tp = tcp_sk(sk);
1254 prior_wstamp = tp->tcp_wstamp_ns;
1255 tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache);
1256 skb_set_delivery_time(skb, tp->tcp_wstamp_ns, true);
1257 if (clone_it) {
1258 oskb = skb;
1259
1260 tcp_skb_tsorted_save(oskb) {
1261 if (unlikely(skb_cloned(oskb)))
1262 skb = pskb_copy(oskb, gfp_mask);
1263 else
1264 skb = skb_clone(oskb, gfp_mask);
1265 } tcp_skb_tsorted_restore(oskb);
1266
1267 if (unlikely(!skb))
1268 return -ENOBUFS;
1269
1270
1271
1272 skb->dev = NULL;
1273 }
1274
1275 inet = inet_sk(sk);
1276 tcb = TCP_SKB_CB(skb);
1277 memset(&opts, 0, sizeof(opts));
1278
1279 if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) {
1280 tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5);
1281 } else {
1282 tcp_options_size = tcp_established_options(sk, skb, &opts,
1283 &md5);
1284
1285
1286
1287
1288
1289
1290
1291
1292 if (tcp_skb_pcount(skb) > 1)
1293 tcb->tcp_flags |= TCPHDR_PSH;
1294 }
1295 tcp_header_size = tcp_options_size + sizeof(struct tcphdr);
1296
1297
1298
1299
1300
1301
1302
1303
1304 skb->ooo_okay = sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1);
1305
1306
1307
1308
1309
1310
1311 skb->pfmemalloc = 0;
1312
1313 skb_push(skb, tcp_header_size);
1314 skb_reset_transport_header(skb);
1315
1316 skb_orphan(skb);
1317 skb->sk = sk;
1318 skb->destructor = skb_is_tcp_pure_ack(skb) ? __sock_wfree : tcp_wfree;
1319 refcount_add(skb->truesize, &sk->sk_wmem_alloc);
1320
1321 skb_set_dst_pending_confirm(skb, sk->sk_dst_pending_confirm);
1322
1323
1324 th = (struct tcphdr *)skb->data;
1325 th->source = inet->inet_sport;
1326 th->dest = inet->inet_dport;
1327 th->seq = htonl(tcb->seq);
1328 th->ack_seq = htonl(rcv_nxt);
1329 *(((__be16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) |
1330 tcb->tcp_flags);
1331
1332 th->check = 0;
1333 th->urg_ptr = 0;
1334
1335
1336 if (unlikely(tcp_urg_mode(tp) && before(tcb->seq, tp->snd_up))) {
1337 if (before(tp->snd_up, tcb->seq + 0x10000)) {
1338 th->urg_ptr = htons(tp->snd_up - tcb->seq);
1339 th->urg = 1;
1340 } else if (after(tcb->seq + 0xFFFF, tp->snd_nxt)) {
1341 th->urg_ptr = htons(0xFFFF);
1342 th->urg = 1;
1343 }
1344 }
1345
1346 skb_shinfo(skb)->gso_type = sk->sk_gso_type;
1347 if (likely(!(tcb->tcp_flags & TCPHDR_SYN))) {
1348 th->window = htons(tcp_select_window(sk));
1349 tcp_ecn_send(sk, skb, th, tcp_header_size);
1350 } else {
1351
1352
1353
1354 th->window = htons(min(tp->rcv_wnd, 65535U));
1355 }
1356
1357 tcp_options_write(th, tp, &opts);
1358
1359 #ifdef CONFIG_TCP_MD5SIG
1360
1361 if (md5) {
1362 sk_gso_disable(sk);
1363 tp->af_specific->calc_md5_hash(opts.hash_location,
1364 md5, sk, skb);
1365 }
1366 #endif
1367
1368
1369 bpf_skops_write_hdr_opt(sk, skb, NULL, NULL, 0, &opts);
1370
1371 INDIRECT_CALL_INET(icsk->icsk_af_ops->send_check,
1372 tcp_v6_send_check, tcp_v4_send_check,
1373 sk, skb);
1374
1375 if (likely(tcb->tcp_flags & TCPHDR_ACK))
1376 tcp_event_ack_sent(sk, tcp_skb_pcount(skb), rcv_nxt);
1377
1378 if (skb->len != tcp_header_size) {
1379 tcp_event_data_sent(tp, sk);
1380 tp->data_segs_out += tcp_skb_pcount(skb);
1381 tp->bytes_sent += skb->len - tcp_header_size;
1382 }
1383
1384 if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
1385 TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS,
1386 tcp_skb_pcount(skb));
1387
1388 tp->segs_out += tcp_skb_pcount(skb);
1389 skb_set_hash_from_sk(skb, sk);
1390
1391 skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb);
1392 skb_shinfo(skb)->gso_size = tcp_skb_mss(skb);
1393
1394
1395
1396
1397 memset(skb->cb, 0, max(sizeof(struct inet_skb_parm),
1398 sizeof(struct inet6_skb_parm)));
1399
1400 tcp_add_tx_delay(skb, tp);
1401
1402 err = INDIRECT_CALL_INET(icsk->icsk_af_ops->queue_xmit,
1403 inet6_csk_xmit, ip_queue_xmit,
1404 sk, skb, &inet->cork.fl);
1405
1406 if (unlikely(err > 0)) {
1407 tcp_enter_cwr(sk);
1408 err = net_xmit_eval(err);
1409 }
1410 if (!err && oskb) {
1411 tcp_update_skb_after_send(sk, oskb, prior_wstamp);
1412 tcp_rate_skb_sent(sk, oskb);
1413 }
1414 return err;
1415 }
1416
1417 static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
1418 gfp_t gfp_mask)
1419 {
1420 return __tcp_transmit_skb(sk, skb, clone_it, gfp_mask,
1421 tcp_sk(sk)->rcv_nxt);
1422 }
1423
1424
1425
1426
1427
1428
1429 static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
1430 {
1431 struct tcp_sock *tp = tcp_sk(sk);
1432
1433
1434 WRITE_ONCE(tp->write_seq, TCP_SKB_CB(skb)->end_seq);
1435 __skb_header_release(skb);
1436 tcp_add_write_queue_tail(sk, skb);
1437 sk_wmem_queued_add(sk, skb->truesize);
1438 sk_mem_charge(sk, skb->truesize);
1439 }
1440
1441
1442 static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now)
1443 {
1444 if (skb->len <= mss_now) {
1445
1446
1447
1448 tcp_skb_pcount_set(skb, 1);
1449 TCP_SKB_CB(skb)->tcp_gso_size = 0;
1450 } else {
1451 tcp_skb_pcount_set(skb, DIV_ROUND_UP(skb->len, mss_now));
1452 TCP_SKB_CB(skb)->tcp_gso_size = mss_now;
1453 }
1454 }
1455
1456
1457
1458
1459 static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr)
1460 {
1461 struct tcp_sock *tp = tcp_sk(sk);
1462
1463 tp->packets_out -= decr;
1464
1465 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
1466 tp->sacked_out -= decr;
1467 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)
1468 tp->retrans_out -= decr;
1469 if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST)
1470 tp->lost_out -= decr;
1471
1472
1473 if (tcp_is_reno(tp) && decr > 0)
1474 tp->sacked_out -= min_t(u32, tp->sacked_out, decr);
1475
1476 if (tp->lost_skb_hint &&
1477 before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->lost_skb_hint)->seq) &&
1478 (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
1479 tp->lost_cnt_hint -= decr;
1480
1481 tcp_verify_left_out(tp);
1482 }
1483
1484 static bool tcp_has_tx_tstamp(const struct sk_buff *skb)
1485 {
1486 return TCP_SKB_CB(skb)->txstamp_ack ||
1487 (skb_shinfo(skb)->tx_flags & SKBTX_ANY_TSTAMP);
1488 }
1489
1490 static void tcp_fragment_tstamp(struct sk_buff *skb, struct sk_buff *skb2)
1491 {
1492 struct skb_shared_info *shinfo = skb_shinfo(skb);
1493
1494 if (unlikely(tcp_has_tx_tstamp(skb)) &&
1495 !before(shinfo->tskey, TCP_SKB_CB(skb2)->seq)) {
1496 struct skb_shared_info *shinfo2 = skb_shinfo(skb2);
1497 u8 tsflags = shinfo->tx_flags & SKBTX_ANY_TSTAMP;
1498
1499 shinfo->tx_flags &= ~tsflags;
1500 shinfo2->tx_flags |= tsflags;
1501 swap(shinfo->tskey, shinfo2->tskey);
1502 TCP_SKB_CB(skb2)->txstamp_ack = TCP_SKB_CB(skb)->txstamp_ack;
1503 TCP_SKB_CB(skb)->txstamp_ack = 0;
1504 }
1505 }
1506
1507 static void tcp_skb_fragment_eor(struct sk_buff *skb, struct sk_buff *skb2)
1508 {
1509 TCP_SKB_CB(skb2)->eor = TCP_SKB_CB(skb)->eor;
1510 TCP_SKB_CB(skb)->eor = 0;
1511 }
1512
1513
1514 static void tcp_insert_write_queue_after(struct sk_buff *skb,
1515 struct sk_buff *buff,
1516 struct sock *sk,
1517 enum tcp_queue tcp_queue)
1518 {
1519 if (tcp_queue == TCP_FRAG_IN_WRITE_QUEUE)
1520 __skb_queue_after(&sk->sk_write_queue, skb, buff);
1521 else
1522 tcp_rbtree_insert(&sk->tcp_rtx_queue, buff);
1523 }
1524
1525
1526
1527
1528
1529
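/* Split @skb at @len bytes into two segments.  The tail goes into a new
 * skb inserted into the same (write or retransmit) queue, and sequence
 * numbers, flags and TSO/pcount accounting are fixed up for both halves.
 */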
1530 int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
1531 struct sk_buff *skb, u32 len,
1532 unsigned int mss_now, gfp_t gfp)
1533 {
1534 struct tcp_sock *tp = tcp_sk(sk);
1535 struct sk_buff *buff;
1536 int nsize, old_factor;
1537 long limit;
1538 int nlen;
1539 u8 flags;
1540
1541 if (WARN_ON(len > skb->len))
1542 return -EINVAL;
1543
1544 nsize = skb_headlen(skb) - len;
1545 if (nsize < 0)
1546 nsize = 0;
1547
1548
1549
1550
1551
1552
1553 limit = sk->sk_sndbuf + 2 * SKB_TRUESIZE(GSO_LEGACY_MAX_SIZE);
1554 if (unlikely((sk->sk_wmem_queued >> 1) > limit &&
1555 tcp_queue != TCP_FRAG_IN_WRITE_QUEUE &&
1556 skb != tcp_rtx_queue_head(sk) &&
1557 skb != tcp_rtx_queue_tail(sk))) {
1558 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPWQUEUETOOBIG);
1559 return -ENOMEM;
1560 }
1561
1562 if (skb_unclone_keeptruesize(skb, gfp))
1563 return -ENOMEM;
1564
1565
1566 buff = tcp_stream_alloc_skb(sk, nsize, gfp, true);
1567 if (!buff)
1568 return -ENOMEM;
1569 skb_copy_decrypted(buff, skb);
1570 mptcp_skb_ext_copy(buff, skb);
1571
1572 sk_wmem_queued_add(sk, buff->truesize);
1573 sk_mem_charge(sk, buff->truesize);
1574 nlen = skb->len - len - nsize;
1575 buff->truesize += nlen;
1576 skb->truesize -= nlen;
1577
1578
1579 TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
1580 TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
1581 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
1582
1583
1584 flags = TCP_SKB_CB(skb)->tcp_flags;
1585 TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
1586 TCP_SKB_CB(buff)->tcp_flags = flags;
1587 TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked;
1588 tcp_skb_fragment_eor(skb, buff);
1589
1590 skb_split(skb, buff, len);
1591
1592 skb_set_delivery_time(buff, skb->tstamp, true);
1593 tcp_fragment_tstamp(skb, buff);
1594
1595 old_factor = tcp_skb_pcount(skb);
1596
1597
1598 tcp_set_skb_tso_segs(skb, mss_now);
1599 tcp_set_skb_tso_segs(buff, mss_now);
1600
1601
1602 TCP_SKB_CB(buff)->tx = TCP_SKB_CB(skb)->tx;
1603
1604
1605
1606
1607 if (!before(tp->snd_nxt, TCP_SKB_CB(buff)->end_seq)) {
1608 int diff = old_factor - tcp_skb_pcount(skb) -
1609 tcp_skb_pcount(buff);
1610
1611 if (diff)
1612 tcp_adjust_pcount(sk, skb, diff);
1613 }
1614
1615
1616 __skb_header_release(buff);
1617 tcp_insert_write_queue_after(skb, buff, sk, tcp_queue);
1618 if (tcp_queue == TCP_FRAG_IN_RTX_QUEUE)
1619 list_add(&buff->tcp_tsorted_anchor, &skb->tcp_tsorted_anchor);
1620
1621 return 0;
1622 }
1623
1624
1625
1626
1627 static int __pskb_trim_head(struct sk_buff *skb, int len)
1628 {
1629 struct skb_shared_info *shinfo;
1630 int i, k, eat;
1631
1632 eat = min_t(int, len, skb_headlen(skb));
1633 if (eat) {
1634 __skb_pull(skb, eat);
1635 len -= eat;
1636 if (!len)
1637 return 0;
1638 }
1639 eat = len;
1640 k = 0;
1641 shinfo = skb_shinfo(skb);
1642 for (i = 0; i < shinfo->nr_frags; i++) {
1643 int size = skb_frag_size(&shinfo->frags[i]);
1644
1645 if (size <= eat) {
1646 skb_frag_unref(skb, i);
1647 eat -= size;
1648 } else {
1649 shinfo->frags[k] = shinfo->frags[i];
1650 if (eat) {
1651 skb_frag_off_add(&shinfo->frags[k], eat);
1652 skb_frag_size_sub(&shinfo->frags[k], eat);
1653 eat = 0;
1654 }
1655 k++;
1656 }
1657 }
1658 shinfo->nr_frags = k;
1659
1660 skb->data_len -= len;
1661 skb->len = skb->data_len;
1662 return len;
1663 }
1664
1665
1666 int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
1667 {
1668 u32 delta_truesize;
1669
1670 if (skb_unclone_keeptruesize(skb, GFP_ATOMIC))
1671 return -ENOMEM;
1672
1673 delta_truesize = __pskb_trim_head(skb, len);
1674
1675 TCP_SKB_CB(skb)->seq += len;
1676
1677 if (delta_truesize) {
1678 skb->truesize -= delta_truesize;
1679 sk_wmem_queued_add(sk, -delta_truesize);
1680 if (!skb_zcopy_pure(skb))
1681 sk_mem_uncharge(sk, delta_truesize);
1682 }
1683
1684
1685 if (tcp_skb_pcount(skb) > 1)
1686 tcp_set_skb_tso_segs(skb, tcp_skb_mss(skb));
1687
1688 return 0;
1689 }
1690
1691
1692 static inline int __tcp_mtu_to_mss(struct sock *sk, int pmtu)
1693 {
1694 const struct tcp_sock *tp = tcp_sk(sk);
1695 const struct inet_connection_sock *icsk = inet_csk(sk);
1696 int mss_now;
1697
1698
1699
1700
1701 mss_now = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr);
1702
1703
1704 if (icsk->icsk_af_ops->net_frag_header_len) {
1705 const struct dst_entry *dst = __sk_dst_get(sk);
1706
1707 if (dst && dst_allfrag(dst))
1708 mss_now -= icsk->icsk_af_ops->net_frag_header_len;
1709 }
1710
1711
1712 if (mss_now > tp->rx_opt.mss_clamp)
1713 mss_now = tp->rx_opt.mss_clamp;
1714
1715
1716 mss_now -= icsk->icsk_ext_hdr_len;
1717
1718
1719 mss_now = max(mss_now,
1720 READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_snd_mss));
1721 return mss_now;
1722 }
1723
1724
1725 int tcp_mtu_to_mss(struct sock *sk, int pmtu)
1726 {
1727
1728 return __tcp_mtu_to_mss(sk, pmtu) -
1729 (tcp_sk(sk)->tcp_header_len - sizeof(struct tcphdr));
1730 }
1731 EXPORT_SYMBOL(tcp_mtu_to_mss);
1732
1733
1734 int tcp_mss_to_mtu(struct sock *sk, int mss)
1735 {
1736 const struct tcp_sock *tp = tcp_sk(sk);
1737 const struct inet_connection_sock *icsk = inet_csk(sk);
1738 int mtu;
1739
1740 mtu = mss +
1741 tp->tcp_header_len +
1742 icsk->icsk_ext_hdr_len +
1743 icsk->icsk_af_ops->net_header_len;
1744
1745
1746 if (icsk->icsk_af_ops->net_frag_header_len) {
1747 const struct dst_entry *dst = __sk_dst_get(sk);
1748
1749 if (dst && dst_allfrag(dst))
1750 mtu += icsk->icsk_af_ops->net_frag_header_len;
1751 }
1752 return mtu;
1753 }
1754 EXPORT_SYMBOL(tcp_mss_to_mtu);
1755
1756
1757 void tcp_mtup_init(struct sock *sk)
1758 {
1759 struct tcp_sock *tp = tcp_sk(sk);
1760 struct inet_connection_sock *icsk = inet_csk(sk);
1761 struct net *net = sock_net(sk);
1762
1763 icsk->icsk_mtup.enabled = READ_ONCE(net->ipv4.sysctl_tcp_mtu_probing) > 1;
1764 icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) +
1765 icsk->icsk_af_ops->net_header_len;
1766 icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, READ_ONCE(net->ipv4.sysctl_tcp_base_mss));
1767 icsk->icsk_mtup.probe_size = 0;
1768 if (icsk->icsk_mtup.enabled)
1769 icsk->icsk_mtup.probe_timestamp = tcp_jiffies32;
1770 }
1771 EXPORT_SYMBOL(tcp_mtup_init);
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
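/* Synchronise the cached send MSS (tp->mss_cache) with the current path
 * MTU: caps the MTU-probing search_high, stores the pmtu cookie used to
 * detect later PMTU changes, and returns the new mss value.
 */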
1795 unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
1796 {
1797 struct tcp_sock *tp = tcp_sk(sk);
1798 struct inet_connection_sock *icsk = inet_csk(sk);
1799 int mss_now;
1800
1801 if (icsk->icsk_mtup.search_high > pmtu)
1802 icsk->icsk_mtup.search_high = pmtu;
1803
1804 mss_now = tcp_mtu_to_mss(sk, pmtu);
1805 mss_now = tcp_bound_to_half_wnd(tp, mss_now);
1806
1807
1808 icsk->icsk_pmtu_cookie = pmtu;
1809 if (icsk->icsk_mtup.enabled)
1810 mss_now = min(mss_now, tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low));
1811 tp->mss_cache = mss_now;
1812
1813 return mss_now;
1814 }
1815 EXPORT_SYMBOL(tcp_sync_mss);
1816
1817
1818
1819
1820 unsigned int tcp_current_mss(struct sock *sk)
1821 {
1822 const struct tcp_sock *tp = tcp_sk(sk);
1823 const struct dst_entry *dst = __sk_dst_get(sk);
1824 u32 mss_now;
1825 unsigned int header_len;
1826 struct tcp_out_options opts;
1827 struct tcp_md5sig_key *md5;
1828
1829 mss_now = tp->mss_cache;
1830
1831 if (dst) {
1832 u32 mtu = dst_mtu(dst);
1833 if (mtu != inet_csk(sk)->icsk_pmtu_cookie)
1834 mss_now = tcp_sync_mss(sk, mtu);
1835 }
1836
1837 header_len = tcp_established_options(sk, NULL, &opts, &md5) +
1838 sizeof(struct tcphdr);
1839
1840
1841
1842
1843 if (header_len != tp->tcp_header_len) {
1844 int delta = (int) header_len - tp->tcp_header_len;
1845 mss_now -= delta;
1846 }
1847
1848 return mss_now;
1849 }
1850
1851
1852
1853
1854
1855 static void tcp_cwnd_application_limited(struct sock *sk)
1856 {
1857 struct tcp_sock *tp = tcp_sk(sk);
1858
1859 if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open &&
1860 sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
1861
1862 u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk));
1863 u32 win_used = max(tp->snd_cwnd_used, init_win);
1864 if (win_used < tcp_snd_cwnd(tp)) {
1865 tp->snd_ssthresh = tcp_current_ssthresh(sk);
1866 tcp_snd_cwnd_set(tp, (tcp_snd_cwnd(tp) + win_used) >> 1);
1867 }
1868 tp->snd_cwnd_used = 0;
1869 }
1870 tp->snd_cwnd_stamp = tcp_jiffies32;
1871 }
1872
1873 static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited)
1874 {
1875 const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
1876 struct tcp_sock *tp = tcp_sk(sk);
1877
1878
1879
1880
1881 if (!before(tp->snd_una, tp->max_packets_seq) ||
1882 tp->packets_out > tp->max_packets_out ||
1883 is_cwnd_limited) {
1884 tp->max_packets_out = tp->packets_out;
1885 tp->max_packets_seq = tp->snd_nxt;
1886 tp->is_cwnd_limited = is_cwnd_limited;
1887 }
1888
1889 if (tcp_is_cwnd_limited(sk)) {
1890
1891 tp->snd_cwnd_used = 0;
1892 tp->snd_cwnd_stamp = tcp_jiffies32;
1893 } else {
1894
1895 if (tp->packets_out > tp->snd_cwnd_used)
1896 tp->snd_cwnd_used = tp->packets_out;
1897
1898 if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_slow_start_after_idle) &&
1899 (s32)(tcp_jiffies32 - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto &&
1900 !ca_ops->cong_control)
1901 tcp_cwnd_application_limited(sk);
1902
1903
1904
1905
1906
1907
1908
1909
1910 if (tcp_write_queue_empty(sk) && sk->sk_socket &&
1911 test_bit(SOCK_NOSPACE, &sk->sk_socket->flags) &&
1912 (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
1913 tcp_chrono_start(sk, TCP_CHRONO_SNDBUF_LIMITED);
1914 }
1915 }
1916
1917
1918 static bool tcp_minshall_check(const struct tcp_sock *tp)
1919 {
1920 return after(tp->snd_sml, tp->snd_una) &&
1921 !after(tp->snd_sml, tp->snd_nxt);
1922 }
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932 static void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now,
1933 const struct sk_buff *skb)
1934 {
1935 if (skb->len < tcp_skb_pcount(skb) * mss_now)
1936 tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
1937 }
1938
1939
1940
1941
1942
1943
1944
1945
1946 static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp,
1947 int nonagle)
1948 {
1949 return partial &&
1950 ((nonagle & TCP_NAGLE_CORK) ||
1951 (!nonagle && tp->packets_out && tcp_minshall_check(tp)));
1952 }
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968 static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now,
1969 int min_tso_segs)
1970 {
1971 unsigned long bytes;
1972 u32 r;
1973
1974 bytes = sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift);
1975
1976 r = tcp_min_rtt(tcp_sk(sk)) >> READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tso_rtt_log);
1977 if (r < BITS_PER_TYPE(sk->sk_gso_max_size))
1978 bytes += sk->sk_gso_max_size >> r;
1979
1980 bytes = min_t(unsigned long, bytes, sk->sk_gso_max_size);
1981
1982 return max_t(u32, bytes / mss_now, min_tso_segs);
1983 }
1984
1985
1986
1987
1988 static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now)
1989 {
1990 const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
1991 u32 min_tso, tso_segs;
1992
1993 min_tso = ca_ops->min_tso_segs ?
1994 ca_ops->min_tso_segs(sk) :
1995 READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs);
1996
1997 tso_segs = tcp_tso_autosize(sk, mss_now, min_tso);
1998 return min_t(u32, tso_segs, sk->sk_gso_max_segs);
1999 }
2000
2001
2002 static unsigned int tcp_mss_split_point(const struct sock *sk,
2003 const struct sk_buff *skb,
2004 unsigned int mss_now,
2005 unsigned int max_segs,
2006 int nonagle)
2007 {
2008 const struct tcp_sock *tp = tcp_sk(sk);
2009 u32 partial, needed, window, max_len;
2010
2011 window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
2012 max_len = mss_now * max_segs;
2013
2014 if (likely(max_len <= window && skb != tcp_write_queue_tail(sk)))
2015 return max_len;
2016
2017 needed = min(skb->len, window);
2018
2019 if (max_len <= needed)
2020 return max_len;
2021
2022 partial = needed % mss_now;
2023
2024
2025
2026
2027 if (tcp_nagle_check(partial != 0, tp, nonagle))
2028 return needed - partial;
2029
2030 return needed;
2031 }
2032
2033
2034
2035
2036 static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp,
2037 const struct sk_buff *skb)
2038 {
2039 u32 in_flight, cwnd, halfcwnd;
2040
2041
2042 if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) &&
2043 tcp_skb_pcount(skb) == 1)
2044 return 1;
2045
2046 in_flight = tcp_packets_in_flight(tp);
2047 cwnd = tcp_snd_cwnd(tp);
2048 if (in_flight >= cwnd)
2049 return 0;
2050
2051
2052
2053
2054 halfcwnd = max(cwnd >> 1, 1U);
2055 return min(halfcwnd, cwnd - in_flight);
2056 }
2057
2058
2059
2060
2061
2062 static int tcp_init_tso_segs(struct sk_buff *skb, unsigned int mss_now)
2063 {
2064 int tso_segs = tcp_skb_pcount(skb);
2065
2066 if (!tso_segs || (tso_segs > 1 && tcp_skb_mss(skb) != mss_now)) {
2067 tcp_set_skb_tso_segs(skb, mss_now);
2068 tso_segs = tcp_skb_pcount(skb);
2069 }
2070 return tso_segs;
2071 }
2072
2073
2074
2075
2076
2077 static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb,
2078 unsigned int cur_mss, int nonagle)
2079 {
2080
2081
2082
2083
2084
2085
2086 if (nonagle & TCP_NAGLE_PUSH)
2087 return true;
2088
2089
2090 if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
2091 return true;
2092
2093 if (!tcp_nagle_check(skb->len < cur_mss, tp, nonagle))
2094 return true;
2095
2096 return false;
2097 }
2098
2099
2100 static bool tcp_snd_wnd_test(const struct tcp_sock *tp,
2101 const struct sk_buff *skb,
2102 unsigned int cur_mss)
2103 {
2104 u32 end_seq = TCP_SKB_CB(skb)->end_seq;
2105
2106 if (skb->len > cur_mss)
2107 end_seq = TCP_SKB_CB(skb)->seq + cur_mss;
2108
2109 return !after(end_seq, tcp_wnd_end(tp));
2110 }
2111
2112
2113
2114
2115
2116
2117
2118
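/* Trim a TSO skb on the write queue to @len bytes, putting the remainder
 * into a new skb queued right after it.  Unlike tcp_fragment(), this is
 * only used for not-yet-sent data, so no retransmit bookkeeping is needed.
 */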
2119 static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
2120 unsigned int mss_now, gfp_t gfp)
2121 {
2122 int nlen = skb->len - len;
2123 struct sk_buff *buff;
2124 u8 flags;
2125
2126
2127 if (skb->len != skb->data_len)
2128 return tcp_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE,
2129 skb, len, mss_now, gfp);
2130
2131 buff = tcp_stream_alloc_skb(sk, 0, gfp, true);
2132 if (unlikely(!buff))
2133 return -ENOMEM;
2134 skb_copy_decrypted(buff, skb);
2135 mptcp_skb_ext_copy(buff, skb);
2136
2137 sk_wmem_queued_add(sk, buff->truesize);
2138 sk_mem_charge(sk, buff->truesize);
2139 buff->truesize += nlen;
2140 skb->truesize -= nlen;
2141
2142
2143 TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
2144 TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
2145 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
2146
2147
2148 flags = TCP_SKB_CB(skb)->tcp_flags;
2149 TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
2150 TCP_SKB_CB(buff)->tcp_flags = flags;
2151
2152 tcp_skb_fragment_eor(skb, buff);
2153
2154 skb_split(skb, buff, len);
2155 tcp_fragment_tstamp(skb, buff);
2156
2157
2158 tcp_set_skb_tso_segs(skb, mss_now);
2159 tcp_set_skb_tso_segs(buff, mss_now);
2160
2161
2162 __skb_header_release(buff);
2163 tcp_insert_write_queue_after(skb, buff, sk, TCP_FRAG_IN_WRITE_QUEUE);
2164
2165 return 0;
2166 }
2167
2168
2169
2170
2171
2172
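/* Decide whether to defer transmission of a TSO skb in the hope of sending
 * a larger burst later (a TSO flavour of the Nagle test).  Sets
 * *is_cwnd_limited / *is_rwnd_limited when the respective window is what
 * forces the deferral.
 */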
2173 static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
2174 bool *is_cwnd_limited,
2175 bool *is_rwnd_limited,
2176 u32 max_segs)
2177 {
2178 const struct inet_connection_sock *icsk = inet_csk(sk);
2179 u32 send_win, cong_win, limit, in_flight;
2180 struct tcp_sock *tp = tcp_sk(sk);
2181 struct sk_buff *head;
2182 int win_divisor;
2183 s64 delta;
2184
2185 if (icsk->icsk_ca_state >= TCP_CA_Recovery)
2186 goto send_now;
2187
2188
2189
2190
2191
2192
2193 delta = tp->tcp_clock_cache - tp->tcp_wstamp_ns - NSEC_PER_MSEC;
2194 if (delta > 0)
2195 goto send_now;
2196
2197 in_flight = tcp_packets_in_flight(tp);
2198
2199 BUG_ON(tcp_skb_pcount(skb) <= 1);
2200 BUG_ON(tcp_snd_cwnd(tp) <= in_flight);
2201
2202 send_win = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
2203
2204
2205 cong_win = (tcp_snd_cwnd(tp) - in_flight) * tp->mss_cache;
2206
2207 limit = min(send_win, cong_win);
2208
2209
2210 if (limit >= max_segs * tp->mss_cache)
2211 goto send_now;
2212
2213
2214 if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len))
2215 goto send_now;
2216
2217 win_divisor = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tso_win_divisor);
2218 if (win_divisor) {
2219 u32 chunk = min(tp->snd_wnd, tcp_snd_cwnd(tp) * tp->mss_cache);
2220
2221
2222
2223
2224 chunk /= win_divisor;
2225 if (limit >= chunk)
2226 goto send_now;
2227 } else {
2228
2229
2230
2231
2232
2233 if (limit > tcp_max_tso_deferred_mss(tp) * tp->mss_cache)
2234 goto send_now;
2235 }
2236
2237
2238 head = tcp_rtx_queue_head(sk);
2239 if (!head)
2240 goto send_now;
2241 delta = tp->tcp_clock_cache - head->tstamp;
2242
2243 if ((s64)(delta - (u64)NSEC_PER_USEC * (tp->srtt_us >> 4)) < 0)
2244 goto send_now;
2245
2246
2247
2248
2249
2250
2251
2252 if (cong_win < send_win) {
2253 if (cong_win <= skb->len) {
2254 *is_cwnd_limited = true;
2255 return true;
2256 }
2257 } else {
2258 if (send_win <= skb->len) {
2259 *is_rwnd_limited = true;
2260 return true;
2261 }
2262 }
2263
2264
2265 if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) ||
2266 TCP_SKB_CB(skb)->eor)
2267 goto send_now;
2268
2269 return true;
2270
2271 send_now:
2272 return false;
2273 }
2274
2275 static inline void tcp_mtu_check_reprobe(struct sock *sk)
2276 {
2277 struct inet_connection_sock *icsk = inet_csk(sk);
2278 struct tcp_sock *tp = tcp_sk(sk);
2279 struct net *net = sock_net(sk);
2280 u32 interval;
2281 s32 delta;
2282
2283 interval = READ_ONCE(net->ipv4.sysctl_tcp_probe_interval);
2284 delta = tcp_jiffies32 - icsk->icsk_mtup.probe_timestamp;
2285 if (unlikely(delta >= interval * HZ)) {
2286 int mss = tcp_current_mss(sk);
2287
2288
2289 icsk->icsk_mtup.probe_size = 0;
2290 icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp +
2291 sizeof(struct tcphdr) +
2292 icsk->icsk_af_ops->net_header_len;
2293 icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);
2294
2295
2296 icsk->icsk_mtup.probe_timestamp = tcp_jiffies32;
2297 }
2298 }
2299
2300 static bool tcp_can_coalesce_send_queue_head(struct sock *sk, int len)
2301 {
2302 struct sk_buff *skb, *next;
2303
2304 skb = tcp_send_head(sk);
2305 tcp_for_write_queue_from_safe(skb, next, sk) {
2306 if (len <= skb->len)
2307 break;
2308
2309 if (unlikely(TCP_SKB_CB(skb)->eor) ||
2310 tcp_has_tx_tstamp(skb) ||
2311 !skb_pure_zcopy_same(skb, next))
2312 return false;
2313
2314 len -= skb->len;
2315 }
2316
2317 return true;
2318 }
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
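/* Create a new MTU probe if we are ready.  Returns 0 if we should wait to
 * probe (e.g. no congestion or send window headroom yet), 1 if a probe was
 * sent, and -1 when probing is not possible or not enabled.
 */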
2329 static int tcp_mtu_probe(struct sock *sk)
2330 {
2331 struct inet_connection_sock *icsk = inet_csk(sk);
2332 struct tcp_sock *tp = tcp_sk(sk);
2333 struct sk_buff *skb, *nskb, *next;
2334 struct net *net = sock_net(sk);
2335 int probe_size;
2336 int size_needed;
2337 int copy, len;
2338 int mss_now;
2339 int interval;
2340
2341
2342
2343
2344
2345
2346 if (likely(!icsk->icsk_mtup.enabled ||
2347 icsk->icsk_mtup.probe_size ||
2348 inet_csk(sk)->icsk_ca_state != TCP_CA_Open ||
2349 tcp_snd_cwnd(tp) < 11 ||
2350 tp->rx_opt.num_sacks || tp->rx_opt.dsack))
2351 return -1;
2352
2353
2354
2355
2356
2357 mss_now = tcp_current_mss(sk);
2358 probe_size = tcp_mtu_to_mss(sk, (icsk->icsk_mtup.search_high +
2359 icsk->icsk_mtup.search_low) >> 1);
2360 size_needed = probe_size + (tp->reordering + 1) * tp->mss_cache;
2361 interval = icsk->icsk_mtup.search_high - icsk->icsk_mtup.search_low;
2362
2363
2364
2365
2366 if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high) ||
2367 interval < READ_ONCE(net->ipv4.sysctl_tcp_probe_threshold)) {
2368
2369
2370
2371 tcp_mtu_check_reprobe(sk);
2372 return -1;
2373 }
2374
2375
2376 if (tp->write_seq - tp->snd_nxt < size_needed)
2377 return -1;
2378
2379 if (tp->snd_wnd < size_needed)
2380 return -1;
2381 if (after(tp->snd_nxt + size_needed, tcp_wnd_end(tp)))
2382 return 0;
2383
2384
2385 if (tcp_packets_in_flight(tp) + 2 > tcp_snd_cwnd(tp)) {
2386 if (!tcp_packets_in_flight(tp))
2387 return -1;
2388 else
2389 return 0;
2390 }
2391
2392 if (!tcp_can_coalesce_send_queue_head(sk, probe_size))
2393 return -1;
2394
2395
2396 nskb = tcp_stream_alloc_skb(sk, probe_size, GFP_ATOMIC, false);
2397 if (!nskb)
2398 return -1;
2399 sk_wmem_queued_add(sk, nskb->truesize);
2400 sk_mem_charge(sk, nskb->truesize);
2401
2402 skb = tcp_send_head(sk);
2403 skb_copy_decrypted(nskb, skb);
2404 mptcp_skb_ext_copy(nskb, skb);
2405
2406 TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq;
2407 TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size;
2408 TCP_SKB_CB(nskb)->tcp_flags = TCPHDR_ACK;
2409
2410 tcp_insert_write_queue_before(nskb, skb, sk);
2411 tcp_highest_sack_replace(sk, skb, nskb);
2412
2413 len = 0;
2414 tcp_for_write_queue_from_safe(skb, next, sk) {
2415 copy = min_t(int, skb->len, probe_size - len);
2416 skb_copy_bits(skb, 0, skb_put(nskb, copy), copy);
2417
2418 if (skb->len <= copy) {
2419
2420
2421 TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
2422
2423
2424
2425 TCP_SKB_CB(nskb)->eor = TCP_SKB_CB(skb)->eor;
2426 tcp_skb_collapse_tstamp(nskb, skb);
2427 tcp_unlink_write_queue(skb, sk);
2428 tcp_wmem_free_skb(sk, skb);
2429 } else {
2430 TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags &
2431 ~(TCPHDR_FIN|TCPHDR_PSH);
2432 if (!skb_shinfo(skb)->nr_frags) {
2433 skb_pull(skb, copy);
2434 } else {
2435 __pskb_trim_head(skb, copy);
2436 tcp_set_skb_tso_segs(skb, mss_now);
2437 }
2438 TCP_SKB_CB(skb)->seq += copy;
2439 }
2440
2441 len += copy;
2442
2443 if (len >= probe_size)
2444 break;
2445 }
2446 tcp_init_tso_segs(nskb, nskb->len);
2447
2448
2449
2450
2451 if (!tcp_transmit_skb(sk, nskb, 1, GFP_ATOMIC)) {
2452
2453
2454 tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) - 1);
2455 tcp_event_new_data_sent(sk, nskb);
2456
2457 icsk->icsk_mtup.probe_size = tcp_mss_to_mtu(sk, nskb->len);
2458 tp->mtu_probe.probe_seq_start = TCP_SKB_CB(nskb)->seq;
2459 tp->mtu_probe.probe_seq_end = TCP_SKB_CB(nskb)->end_seq;
2460
2461 return 1;
2462 }
2463
2464 return -1;
2465 }
2466
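/* Returns true if the next transmission must wait for internal pacing;
 * in that case arm the high resolution pacing timer for the scheduled
 * send time unless it is already queued.
 */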
2467 static bool tcp_pacing_check(struct sock *sk)
2468 {
2469 struct tcp_sock *tp = tcp_sk(sk);
2470
2471 if (!tcp_needs_internal_pacing(sk))
2472 return false;
2473
2474 if (tp->tcp_wstamp_ns <= tp->tcp_clock_cache)
2475 return false;
2476
2477 if (!hrtimer_is_queued(&tp->pacing_timer)) {
2478 hrtimer_start(&tp->pacing_timer,
2479 ns_to_ktime(tp->tcp_wstamp_ns),
2480 HRTIMER_MODE_ABS_PINNED_SOFT);
2481 sock_hold(sk);
2482 }
2483 return true;
2484 }
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
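/* TCP Small Queues (TSQ): bound the amount of data queued below us in
 * qdisc/device layers (tracked through sk_wmem_alloc) to roughly two
 * skb truesizes or sk_pacing_rate >> sk_pacing_shift bytes, capped by
 * tcp_limit_output_bytes when no pacing is in use, so host queues stay
 * short and packet scheduler decisions remain effective.
 */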
2497 static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
2498 unsigned int factor)
2499 {
2500 unsigned long limit;
2501
2502 limit = max_t(unsigned long,
2503 2 * skb->truesize,
2504 sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift));
2505 if (sk->sk_pacing_status == SK_PACING_NONE)
2506 limit = min_t(unsigned long, limit,
2507 READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes));
2508 limit <<= factor;
2509
2510 if (static_branch_unlikely(&tcp_tx_delay_enabled) &&
2511 tcp_sk(sk)->tcp_tx_delay) {
2512 u64 extra_bytes = (u64)sk->sk_pacing_rate * tcp_sk(sk)->tcp_tx_delay;
2513
2514
2515
2516
2517
2518
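/* tcp_tx_delay is in usec while sk_pacing_rate is in bytes per second,
 * so the extra budget is rate * delay / USEC_PER_SEC; USEC_PER_SEC is
 * approximated by 2^20, and shifting by 19 instead doubles the budget
 * to absorb skb->truesize overhead.
 */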
2519 extra_bytes >>= (20 - 1);
2520 limit += extra_bytes;
2521 }
2522 if (refcount_read(&sk->sk_wmem_alloc) > limit) {
2523
2524
2525
2526
2527
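/* Always send when the rtx queue is empty: there is nothing in flight
 * whose TX completion could wake us up again, so throttling here could
 * stall the flow.
 */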
2528 if (tcp_rtx_queue_empty(sk))
2529 return false;
2530
2531 set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
2532
2533
2534
2535
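/* A TX completion may already have run before TSQ_THROTTLED was set,
 * so re-check the queued amount after the barrier instead of waiting
 * for a wakeup that might never come.
 */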
2536 smp_mb__after_atomic();
2537 if (refcount_read(&sk->sk_wmem_alloc) > limit)
2538 return true;
2539 }
2540 return false;
2541 }
2542
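/* Account the time spent in the previous chronograph state and switch
 * the socket to the new one.
 */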
2543 static void tcp_chrono_set(struct tcp_sock *tp, const enum tcp_chrono new)
2544 {
2545 const u32 now = tcp_jiffies32;
2546 enum tcp_chrono old = tp->chrono_type;
2547
2548 if (old > TCP_CHRONO_UNSPEC)
2549 tp->chrono_stat[old - 1] += now - tp->chrono_start;
2550 tp->chrono_start = now;
2551 tp->chrono_type = new;
2552 }
2553
2554 void tcp_chrono_start(struct sock *sk, const enum tcp_chrono type)
2555 {
2556 struct tcp_sock *tp = tcp_sk(sk);
2557
2558
2559
2560
2561
2562
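/* Busy states are ranked; only start timing @type if it outranks the
 * state currently being tracked.
 */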
2563 if (type > tp->chrono_type)
2564 tcp_chrono_set(tp, type);
2565 }
2566
2567 void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type)
2568 {
2569 struct tcp_sock *tp = tcp_sk(sk);
2570
2571
2572
2573
2574
2575
2576
2577
2578
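/* There can be several reasons for the socket being busy; when one of
 * them stops, fall back to plain BUSY if it was the state being timed,
 * and to UNSPEC only once both queues have drained completely.
 */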
2579 if (tcp_rtx_and_write_queues_empty(sk))
2580 tcp_chrono_set(tp, TCP_CHRONO_UNSPEC);
2581 else if (type == tp->chrono_type)
2582 tcp_chrono_set(tp, TCP_CHRONO_BUSY);
2583 }
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
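/* Write as much queued data as the congestion window, receive window,
 * Nagle, TSO deferral, pacing and TSQ allow.
 * push_one == 1 sends at most one skb; push_one == 2 is used by the
 * loss probe path and may override a zero cwnd quota.
 * Returns true when nothing was sent, nothing is in flight and data is
 * still queued, i.e. the zero window probe timer should be armed.
 */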
2599 static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
2600 int push_one, gfp_t gfp)
2601 {
2602 struct tcp_sock *tp = tcp_sk(sk);
2603 struct sk_buff *skb;
2604 unsigned int tso_segs, sent_pkts;
2605 int cwnd_quota;
2606 int result;
2607 bool is_cwnd_limited = false, is_rwnd_limited = false;
2608 u32 max_segs;
2609
2610 sent_pkts = 0;
2611
2612 tcp_mstamp_refresh(tp);
2613 if (!push_one) {
2614
2615 result = tcp_mtu_probe(sk);
2616 if (!result) {
2617 return false;
2618 } else if (result > 0) {
2619 sent_pkts = 1;
2620 }
2621 }
2622
2623 max_segs = tcp_tso_segs(sk, mss_now);
2624 while ((skb = tcp_send_head(sk))) {
2625 unsigned int limit;
2626
2627 if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
2628
2629 tp->tcp_wstamp_ns = tp->tcp_clock_cache;
2630 skb_set_delivery_time(skb, tp->tcp_wstamp_ns, true);
2631 list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
2632 tcp_init_tso_segs(skb, mss_now);
2633 goto repair;
2634 }
2635
2636 if (tcp_pacing_check(sk))
2637 break;
2638
2639 tso_segs = tcp_init_tso_segs(skb, mss_now);
2640 BUG_ON(!tso_segs);
2641
2642 cwnd_quota = tcp_cwnd_test(tp, skb);
2643 if (!cwnd_quota) {
2644 if (push_one == 2)
2645
2646 cwnd_quota = 1;
2647 else
2648 break;
2649 }
2650
2651 if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) {
2652 is_rwnd_limited = true;
2653 break;
2654 }
2655
2656 if (tso_segs == 1) {
2657 if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
2658 (tcp_skb_is_last(sk, skb) ?
2659 nonagle : TCP_NAGLE_PUSH))))
2660 break;
2661 } else {
2662 if (!push_one &&
2663 tcp_tso_should_defer(sk, skb, &is_cwnd_limited,
2664 &is_rwnd_limited, max_segs))
2665 break;
2666 }
2667
2668 limit = mss_now;
2669 if (tso_segs > 1 && !tcp_urg_mode(tp))
2670 limit = tcp_mss_split_point(sk, skb, mss_now,
2671 min_t(unsigned int,
2672 cwnd_quota,
2673 max_segs),
2674 nonagle);
2675
2676 if (skb->len > limit &&
2677 unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
2678 break;
2679
2680 if (tcp_small_queue_check(sk, skb, 0))
2681 break;
2682
2683
2684
2685
2686
2687
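/* An skb with end_seq == seq can show up while another thread is still
 * copying data into it in sendmsg(); sending it now would emit what
 * looks like a pure ACK, so stop and let the writer finish.
 */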
2688 if (TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq)
2689 break;
2690
2691 if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp)))
2692 break;
2693
2694 repair:
2695
2696
2697
2698 tcp_event_new_data_sent(sk, skb);
2699
2700 tcp_minshall_update(tp, mss_now, skb);
2701 sent_pkts += tcp_skb_pcount(skb);
2702
2703 if (push_one)
2704 break;
2705 }
2706
2707 if (is_rwnd_limited)
2708 tcp_chrono_start(sk, TCP_CHRONO_RWND_LIMITED);
2709 else
2710 tcp_chrono_stop(sk, TCP_CHRONO_RWND_LIMITED);
2711
2712 is_cwnd_limited |= (tcp_packets_in_flight(tp) >= tcp_snd_cwnd(tp));
2713 if (likely(sent_pkts || is_cwnd_limited))
2714 tcp_cwnd_validate(sk, is_cwnd_limited);
2715
2716 if (likely(sent_pkts)) {
2717 if (tcp_in_cwnd_reduction(sk))
2718 tp->prr_out += sent_pkts;
2719
2720
2721 if (push_one != 2)
2722 tcp_schedule_loss_probe(sk, false);
2723 return false;
2724 }
2725 return !tp->packets_out && !tcp_write_queue_empty(sk);
2726 }
2727
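/* Arm the Tail Loss Probe (TLP) timer so a probe segment goes out if
 * the tail of the transmitted data is not acknowledged soon, letting
 * the flow recover from tail losses without waiting for the full RTO.
 * Returns false if TLP cannot be used on this connection right now.
 */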
2728 bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto)
2729 {
2730 struct inet_connection_sock *icsk = inet_csk(sk);
2731 struct tcp_sock *tp = tcp_sk(sk);
2732 u32 timeout, rto_delta_us;
2733 int early_retrans;
2734
2735
2736
2737
2738 if (rcu_access_pointer(tp->fastopen_rsk))
2739 return false;
2740
2741 early_retrans = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_early_retrans);
2742
2743
2744
2745 if ((early_retrans != 3 && early_retrans != 4) ||
2746 !tp->packets_out || !tcp_is_sack(tp) ||
2747 (icsk->icsk_ca_state != TCP_CA_Open &&
2748 icsk->icsk_ca_state != TCP_CA_CWR))
2749 return false;
2750
2751
2752
2753
2754
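/* The probe timeout is roughly 2*SRTT, padded by the minimum RTO when
 * only one packet is outstanding (to allow for a delayed ACK) or by the
 * minimum timer granularity otherwise; with no RTT sample yet, fall
 * back to the initial RTO.
 */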
2755 if (tp->srtt_us) {
2756 timeout = usecs_to_jiffies(tp->srtt_us >> 2);
2757 if (tp->packets_out == 1)
2758 timeout += TCP_RTO_MIN;
2759 else
2760 timeout += TCP_TIMEOUT_MIN;
2761 } else {
2762 timeout = TCP_TIMEOUT_INIT;
2763 }
2764
2765
2766 rto_delta_us = advancing_rto ?
2767 jiffies_to_usecs(inet_csk(sk)->icsk_rto) :
2768 tcp_rto_delta_us(sk);
2769 if (rto_delta_us > 0)
2770 timeout = min_t(u32, timeout, usecs_to_jiffies(rto_delta_us));
2771
2772 tcp_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout, TCP_RTO_MAX);
2773 return true;
2774 }
2775
2776
2777
2778
2779
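/* Returns true if the fast clone of this skb is still referenced by
 * lower layers (qdisc/driver), i.e. the previous transmit has not left
 * the host yet, so retransmitting it now would be pointless.
 */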
2780 static bool skb_still_in_host_queue(struct sock *sk,
2781 const struct sk_buff *skb)
2782 {
2783 if (unlikely(skb_fclone_busy(sk, skb))) {
2784 set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
2785 smp_mb__after_atomic();
2786 if (skb_fclone_busy(sk, skb)) {
2787 NET_INC_STATS(sock_net(sk),
2788 LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
2789 return true;
2790 }
2791 }
2792 return false;
2793 }
2794
2795
2796
2797
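/* TLP timer handler: send one probe segment, preferably new data,
 * otherwise the last skb in the rtx queue, and record tlp_high_seq so
 * the ACK that answers the probe can be recognised later.
 */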
2798 void tcp_send_loss_probe(struct sock *sk)
2799 {
2800 struct tcp_sock *tp = tcp_sk(sk);
2801 struct sk_buff *skb;
2802 int pcount;
2803 int mss = tcp_current_mss(sk);
2804
2805
2806 if (tp->tlp_high_seq)
2807 goto rearm_timer;
2808
2809 tp->tlp_retrans = 0;
2810 skb = tcp_send_head(sk);
2811 if (skb && tcp_snd_wnd_test(tp, skb, mss)) {
2812 pcount = tp->packets_out;
2813 tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC);
2814 if (tp->packets_out > pcount)
2815 goto probe_sent;
2816 goto rearm_timer;
2817 }
2818 skb = skb_rb_last(&sk->tcp_rtx_queue);
2819 if (unlikely(!skb)) {
2820 WARN_ONCE(tp->packets_out,
2821 "invalid inflight: %u state %u cwnd %u mss %d\n",
2822 tp->packets_out, sk->sk_state, tcp_snd_cwnd(tp), mss);
2823 inet_csk(sk)->icsk_pending = 0;
2824 return;
2825 }
2826
2827 if (skb_still_in_host_queue(sk, skb))
2828 goto rearm_timer;
2829
2830 pcount = tcp_skb_pcount(skb);
2831 if (WARN_ON(!pcount))
2832 goto rearm_timer;
2833
2834 if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) {
2835 if (unlikely(tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb,
2836 (pcount - 1) * mss, mss,
2837 GFP_ATOMIC)))
2838 goto rearm_timer;
2839 skb = skb_rb_next(skb);
2840 }
2841
2842 if (WARN_ON(!skb || !tcp_skb_pcount(skb)))
2843 goto rearm_timer;
2844
2845 if (__tcp_retransmit_skb(sk, skb, 1))
2846 goto rearm_timer;
2847
2848 tp->tlp_retrans = 1;
2849
2850 probe_sent:
2851
2852 tp->tlp_high_seq = tp->snd_nxt;
2853
2854 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSSPROBES);
2855
2856 inet_csk(sk)->icsk_pending = 0;
2857 rearm_timer:
2858 tcp_rearm_rto(sk);
2859 }
2860
2861
2862
2863
2864
2865 void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
2866 int nonagle)
2867 {
2868
2869
2870
2871
2872 if (unlikely(sk->sk_state == TCP_CLOSE))
2873 return;
2874
2875 if (tcp_write_xmit(sk, cur_mss, nonagle, 0,
2876 sk_gfp_mask(sk, GFP_ATOMIC)))
2877 tcp_check_probe_timer(sk);
2878 }
2879
2880
2881
2882
2883 void tcp_push_one(struct sock *sk, unsigned int mss_now)
2884 {
2885 struct sk_buff *skb = tcp_send_head(sk);
2886
2887 BUG_ON(!skb || skb->len < mss_now);
2888
2889 tcp_write_xmit(sk, mss_now, TCP_NAGLE_PUSH, 1, sk->sk_allocation);
2890 }
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
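/* Compute the receive window to advertise, based on free receive space:
 * quench the sender (return 0) when space has run too low, keep the
 * result aligned to the window scale granularity (or rounded down to
 * whole MSS units when scaling is off), and never exceed rcv_ssthresh.
 */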
2944 u32 __tcp_select_window(struct sock *sk)
2945 {
2946 struct inet_connection_sock *icsk = inet_csk(sk);
2947 struct tcp_sock *tp = tcp_sk(sk);
2948
2949
2950
2951
2952
2953
2954 int mss = icsk->icsk_ack.rcv_mss;
2955 int free_space = tcp_space(sk);
2956 int allowed_space = tcp_full_space(sk);
2957 int full_space, window;
2958
2959 if (sk_is_mptcp(sk))
2960 mptcp_space(sk, &free_space, &allowed_space);
2961
2962 full_space = min_t(int, tp->window_clamp, allowed_space);
2963
2964 if (unlikely(mss > full_space)) {
2965 mss = full_space;
2966 if (mss <= 0)
2967 return 0;
2968 }
2969 if (free_space < (full_space >> 1)) {
2970 icsk->icsk_ack.quick = 0;
2971
2972 if (tcp_under_memory_pressure(sk))
2973 tcp_adjust_rcv_ssthresh(sk);
2974
2975
2976
2977
2978 free_space = round_down(free_space, 1 << tp->rx_opt.rcv_wscale);
2979
2980
2981
2982
2983
2984
2985
2986
2987 if (free_space < (allowed_space >> 4) || free_space < mss)
2988 return 0;
2989 }
2990
2991 if (free_space > tp->rcv_ssthresh)
2992 free_space = tp->rcv_ssthresh;
2993
2994
2995
2996
2997 if (tp->rx_opt.rcv_wscale) {
2998 window = free_space;
2999
3000
3001
3002
3003
3004 window = ALIGN(window, (1 << tp->rx_opt.rcv_wscale));
3005 } else {
3006 window = tp->rcv_wnd;
3007
3008
3009
3010
3011
3012
3013
3014
3015 if (window <= free_space - mss || window > free_space)
3016 window = rounddown(free_space, mss);
3017 else if (mss == full_space &&
3018 free_space > window + (full_space >> 1))
3019 window = free_space;
3020 }
3021
3022 return window;
3023 }
3024
3025 void tcp_skb_collapse_tstamp(struct sk_buff *skb,
3026 const struct sk_buff *next_skb)
3027 {
3028 if (unlikely(tcp_has_tx_tstamp(next_skb))) {
3029 const struct skb_shared_info *next_shinfo =
3030 skb_shinfo(next_skb);
3031 struct skb_shared_info *shinfo = skb_shinfo(skb);
3032
3033 shinfo->tx_flags |= next_shinfo->tx_flags & SKBTX_ANY_TSTAMP;
3034 shinfo->tskey = next_shinfo->tskey;
3035 TCP_SKB_CB(skb)->txstamp_ack |=
3036 TCP_SKB_CB(next_skb)->txstamp_ack;
3037 }
3038 }
3039
3040
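/* Collapse the next skb of the rtx queue into @skb (both must be single
 * segments), carrying over sequence range, flags, EOR and timestamp
 * state, then free the absorbed skb.
 */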
3041 static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
3042 {
3043 struct tcp_sock *tp = tcp_sk(sk);
3044 struct sk_buff *next_skb = skb_rb_next(skb);
3045 int next_skb_size;
3046
3047 next_skb_size = next_skb->len;
3048
3049 BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1);
3050
3051 if (next_skb_size && !tcp_skb_shift(skb, next_skb, 1, next_skb_size))
3052 return false;
3053
3054 tcp_highest_sack_replace(sk, next_skb, skb);
3055
3056
3057 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;
3058
3059
3060 TCP_SKB_CB(skb)->tcp_flags |= TCP_SKB_CB(next_skb)->tcp_flags;
3061
3062
3063
3064
3065 TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked & TCPCB_EVER_RETRANS;
3066 TCP_SKB_CB(skb)->eor = TCP_SKB_CB(next_skb)->eor;
3067
3068
3069 tcp_clear_retrans_hints_partial(tp);
3070 if (next_skb == tp->retransmit_skb_hint)
3071 tp->retransmit_skb_hint = skb;
3072
3073 tcp_adjust_pcount(sk, next_skb, tcp_skb_pcount(next_skb));
3074
3075 tcp_skb_collapse_tstamp(skb, next_skb);
3076
3077 tcp_rtx_queue_unlink_and_free(next_skb, sk);
3078 return true;
3079 }
3080
3081
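/* Check whether an skb may be collapsed into a retransmit candidate:
 * it must be a single, unshared segment with no SACKed data.
 */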
3082 static bool tcp_can_collapse(const struct sock *sk, const struct sk_buff *skb)
3083 {
3084 if (tcp_skb_pcount(skb) > 1)
3085 return false;
3086 if (skb_cloned(skb))
3087 return false;
3088
3089 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
3090 return false;
3091
3092 return true;
3093 }
3094
3095
3096
3097
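/* Collapse following skbs into @to, as long as they fit in @space and
 * inside the current send window, so a retransmission carries as much
 * data as possible.
 */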
3098 static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
3099 int space)
3100 {
3101 struct tcp_sock *tp = tcp_sk(sk);
3102 struct sk_buff *skb = to, *tmp;
3103 bool first = true;
3104
3105 if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_retrans_collapse))
3106 return;
3107 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
3108 return;
3109
3110 skb_rbtree_walk_from_safe(skb, tmp) {
3111 if (!tcp_can_collapse(sk, skb))
3112 break;
3113
3114 if (!tcp_skb_can_collapse(to, skb))
3115 break;
3116
3117 space -= skb->len;
3118
3119 if (first) {
3120 first = false;
3121 continue;
3122 }
3123
3124 if (space < 0)
3125 break;
3126
3127 if (after(TCP_SKB_CB(skb)->end_seq, tcp_wnd_end(tp)))
3128 break;
3129
3130 if (!tcp_collapse_retrans(sk, to))
3131 break;
3132 }
3133 }
3134
3135
3136
3137
3138
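/* Retransmit one skb: trim anything already acknowledged, refit the
 * data to the current MSS and available window (fragmenting or
 * collapsing as needed) and send it again.
 * Returns 0 on success or a negative error, e.g. -EBUSY when the prior
 * clone is still queued in the host, -EAGAIN when the peer's window
 * does not allow the retransmit.
 */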
3139 int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
3140 {
3141 struct inet_connection_sock *icsk = inet_csk(sk);
3142 struct tcp_sock *tp = tcp_sk(sk);
3143 unsigned int cur_mss;
3144 int diff, len, err;
3145 int avail_wnd;
3146
3147
3148 if (icsk->icsk_mtup.probe_size)
3149 icsk->icsk_mtup.probe_size = 0;
3150
3151 if (skb_still_in_host_queue(sk, skb))
3152 return -EBUSY;
3153
3154 if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) {
3155 if (unlikely(before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))) {
3156 WARN_ON_ONCE(1);
3157 return -EINVAL;
3158 }
3159 if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
3160 return -ENOMEM;
3161 }
3162
3163 if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
3164 return -EHOSTUNREACH;
3165
3166 cur_mss = tcp_current_mss(sk);
3167 avail_wnd = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
3168
3169
3170
3171
3172
3173
3174 if (avail_wnd <= 0) {
3175 if (TCP_SKB_CB(skb)->seq != tp->snd_una)
3176 return -EAGAIN;
3177 avail_wnd = cur_mss;
3178 }
3179
3180 len = cur_mss * segs;
3181 if (len > avail_wnd) {
3182 len = rounddown(avail_wnd, cur_mss);
3183 if (!len)
3184 len = avail_wnd;
3185 }
3186 if (skb->len > len) {
3187 if (tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, len,
3188 cur_mss, GFP_ATOMIC))
3189 return -ENOMEM;
3190 } else {
3191 if (skb_unclone_keeptruesize(skb, GFP_ATOMIC))
3192 return -ENOMEM;
3193
3194 diff = tcp_skb_pcount(skb);
3195 tcp_set_skb_tso_segs(skb, cur_mss);
3196 diff -= tcp_skb_pcount(skb);
3197 if (diff)
3198 tcp_adjust_pcount(sk, skb, diff);
3199 avail_wnd = min_t(int, avail_wnd, cur_mss);
3200 if (skb->len < avail_wnd)
3201 tcp_retrans_try_collapse(sk, skb, avail_wnd);
3202 }
3203
3204
3205 if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN)
3206 tcp_ecn_clear_syn(sk, skb);
3207
3208
3209 segs = tcp_skb_pcount(skb);
3210 TCP_ADD_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS, segs);
3211 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
3212 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
3213 tp->total_retrans += segs;
3214 tp->bytes_retrans += skb->len;
3215
3216
3217
3218
3219
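/* Some architectures need skb->data aligned, and collapsing or
 * ack-trimming may have grown the headroom beyond what the 16-bit
 * csum_start can cover; in either case retransmit from a fresh copy.
 */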
3220 if (unlikely((NET_IP_ALIGN && ((unsigned long)skb->data & 3)) ||
3221 skb_headroom(skb) >= 0xFFFF)) {
3222 struct sk_buff *nskb;
3223
3224 tcp_skb_tsorted_save(skb) {
3225 nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC);
3226 if (nskb) {
3227 nskb->dev = NULL;
3228 err = tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC);
3229 } else {
3230 err = -ENOBUFS;
3231 }
3232 } tcp_skb_tsorted_restore(skb);
3233
3234 if (!err) {
3235 tcp_update_skb_after_send(sk, skb, tp->tcp_wstamp_ns);
3236 tcp_rate_skb_sent(sk, skb);
3237 }
3238 } else {
3239 err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
3240 }
3241
3242
3243
3244
3245 TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;
3246
3247 if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RETRANS_CB_FLAG))
3248 tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RETRANS_CB,
3249 TCP_SKB_CB(skb)->seq, segs, err);
3250
3251 if (likely(!err)) {
3252 trace_tcp_retransmit_skb(sk, skb);
3253 } else if (err != -EBUSY) {
3254 NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL, segs);
3255 }
3256 return err;
3257 }
3258
3259 int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
3260 {
3261 struct tcp_sock *tp = tcp_sk(sk);
3262 int err = __tcp_retransmit_skb(sk, skb, segs);
3263
3264 if (err == 0) {
3265 #if FASTRETRANS_DEBUG > 0
3266 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
3267 net_dbg_ratelimited("retrans_out leaked\n");
3268 }
3269 #endif
3270 TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS;
3271 tp->retrans_out += tcp_skb_pcount(skb);
3272 }
3273
3274
3275 if (!tp->retrans_stamp)
3276 tp->retrans_stamp = tcp_skb_timestamp(skb);
3277
3278 if (tp->undo_retrans < 0)
3279 tp->undo_retrans = 0;
3280 tp->undo_retrans += tcp_skb_pcount(skb);
3281 return err;
3282 }
3283
3284
3285
3286
3287
3288
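/* Walk the rtx queue from the retransmit hint and resend every segment
 * marked lost, subject to pacing, cwnd headroom and TSQ; skip segments
 * that were already SACKed or retransmitted, and re-arm the retransmit
 * timer if the head of the queue had to be sent again.
 */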
3289 void tcp_xmit_retransmit_queue(struct sock *sk)
3290 {
3291 const struct inet_connection_sock *icsk = inet_csk(sk);
3292 struct sk_buff *skb, *rtx_head, *hole = NULL;
3293 struct tcp_sock *tp = tcp_sk(sk);
3294 bool rearm_timer = false;
3295 u32 max_segs;
3296 int mib_idx;
3297
3298 if (!tp->packets_out)
3299 return;
3300
3301 rtx_head = tcp_rtx_queue_head(sk);
3302 skb = tp->retransmit_skb_hint ?: rtx_head;
3303 max_segs = tcp_tso_segs(sk, tcp_current_mss(sk));
3304 skb_rbtree_walk_from(skb) {
3305 __u8 sacked;
3306 int segs;
3307
3308 if (tcp_pacing_check(sk))
3309 break;
3310
3311
3312 if (!hole)
3313 tp->retransmit_skb_hint = skb;
3314
3315 segs = tcp_snd_cwnd(tp) - tcp_packets_in_flight(tp);
3316 if (segs <= 0)
3317 break;
3318 sacked = TCP_SKB_CB(skb)->sacked;
3319
3320
3321
3322 segs = min_t(int, segs, max_segs);
3323
3324 if (tp->retrans_out >= tp->lost_out) {
3325 break;
3326 } else if (!(sacked & TCPCB_LOST)) {
3327 if (!hole && !(sacked & (TCPCB_SACKED_RETRANS|TCPCB_SACKED_ACKED)))
3328 hole = skb;
3329 continue;
3330
3331 } else {
3332 if (icsk->icsk_ca_state != TCP_CA_Loss)
3333 mib_idx = LINUX_MIB_TCPFASTRETRANS;
3334 else
3335 mib_idx = LINUX_MIB_TCPSLOWSTARTRETRANS;
3336 }
3337
3338 if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))
3339 continue;
3340
3341 if (tcp_small_queue_check(sk, skb, 1))
3342 break;
3343
3344 if (tcp_retransmit_skb(sk, skb, segs))
3345 break;
3346
3347 NET_ADD_STATS(sock_net(sk), mib_idx, tcp_skb_pcount(skb));
3348
3349 if (tcp_in_cwnd_reduction(sk))
3350 tp->prr_out += tcp_skb_pcount(skb);
3351
3352 if (skb == rtx_head &&
3353 icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT)
3354 rearm_timer = true;
3355
3356 }
3357 if (rearm_timer)
3358 tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
3359 inet_csk(sk)->icsk_rto,
3360 TCP_RTO_MAX);
3361 }
3362
3363
3364
3365
3366
3367
3368
3369
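/* Charge @size bytes of forward allocation to the socket even if this
 * exceeds memory limits, for skbs that must be queued regardless (the
 * FIN sent below, for instance).
 */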
3370 void sk_forced_mem_schedule(struct sock *sk, int size)
3371 {
3372 int delta, amt;
3373
3374 delta = size - sk->sk_forward_alloc;
3375 if (delta <= 0)
3376 return;
3377 amt = sk_mem_pages(delta);
3378 sk->sk_forward_alloc += amt << PAGE_SHIFT;
3379 sk_memory_allocated_add(sk, amt);
3380
3381 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
3382 mem_cgroup_charge_skmem(sk->sk_memcg, amt,
3383 gfp_memcg_charge() | __GFP_NOFAIL);
3384 }
3385
3386
3387
3388
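/* Send a FIN. Piggyback it on the tail of the write queue if possible
 * (or, under memory pressure, on the last skb of the rtx queue) so no
 * new skb is needed; otherwise allocate a dedicated FIN skb with forced
 * memory accounting.
 */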
3389 void tcp_send_fin(struct sock *sk)
3390 {
3391 struct sk_buff *skb, *tskb, *tail = tcp_write_queue_tail(sk);
3392 struct tcp_sock *tp = tcp_sk(sk);
3393
3394
3395
3396
3397
3398
3399 tskb = tail;
3400 if (!tskb && tcp_under_memory_pressure(sk))
3401 tskb = skb_rb_last(&sk->tcp_rtx_queue);
3402
3403 if (tskb) {
3404 TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN;
3405 TCP_SKB_CB(tskb)->end_seq++;
3406 tp->write_seq++;
3407 if (!tail) {
3408
3409
3410
3411
3412
3413
3414 WRITE_ONCE(tp->snd_nxt, tp->snd_nxt + 1);
3415 return;
3416 }
3417 } else {
3418 skb = alloc_skb_fclone(MAX_TCP_HEADER, sk->sk_allocation);
3419 if (unlikely(!skb))
3420 return;
3421
3422 INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
3423 skb_reserve(skb, MAX_TCP_HEADER);
3424 sk_forced_mem_schedule(sk, skb->truesize);
3425
3426 tcp_init_nondata_skb(skb, tp->write_seq,
3427 TCPHDR_ACK | TCPHDR_FIN);
3428 tcp_queue_skb(sk, skb);
3429 }
3430 __tcp_push_pending_frames(sk, tcp_current_mss(sk), TCP_NAGLE_OFF);
3431 }
3432
3433
3434
3435
3436
3437
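/* Send an active reset (RST|ACK) on this socket. The skb is allocated
 * with the given priority; if allocation or transmission fails, only
 * the failed-abort counter is bumped, a reset is best effort.
 */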
3438 void tcp_send_active_reset(struct sock *sk, gfp_t priority)
3439 {
3440 struct sk_buff *skb;
3441
3442 TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTRSTS);
3443
3444
3445 skb = alloc_skb(MAX_TCP_HEADER, priority);
3446 if (!skb) {
3447 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);
3448 return;
3449 }
3450
3451
3452 skb_reserve(skb, MAX_TCP_HEADER);
3453 tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk),
3454 TCPHDR_ACK | TCPHDR_RST);
3455 tcp_mstamp_refresh(tcp_sk(sk));
3456
3457 if (tcp_transmit_skb(sk, skb, 0, priority))
3458 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);
3459
3460
3461
3462
3463 trace_tcp_send_reset(sk, NULL);
3464 }
3465
3466
3467
3468
3469
3470
3471
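/* Send a SYN-ACK by reusing the SYN skb at the head of the rtx queue,
 * setting its ACK flag (and unsharing the skb first if it is cloned).
 */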
3472 int tcp_send_synack(struct sock *sk)
3473 {
3474 struct sk_buff *skb;
3475
3476 skb = tcp_rtx_queue_head(sk);
3477 if (!skb || !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
3478 pr_err("%s: wrong queue state\n", __func__);
3479 return -EFAULT;
3480 }
3481 if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) {
3482 if (skb_cloned(skb)) {
3483 struct sk_buff *nskb;
3484
3485 tcp_skb_tsorted_save(skb) {
3486 nskb = skb_copy(skb, GFP_ATOMIC);
3487 } tcp_skb_tsorted_restore(skb);
3488 if (!nskb)
3489 return -ENOMEM;
3490 INIT_LIST_HEAD(&nskb->tcp_tsorted_anchor);
3491 tcp_highest_sack_replace(sk, skb, nskb);
3492 tcp_rtx_queue_unlink_and_free(skb, sk);
3493 __skb_header_release(nskb);
3494 tcp_rbtree_insert(&sk->tcp_rtx_queue, nskb);
3495 sk_wmem_queued_add(sk, nskb->truesize);
3496 sk_mem_charge(sk, nskb->truesize);
3497 skb = nskb;
3498 }
3499
3500 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ACK;
3501 tcp_ecn_send_synack(sk, skb);
3502 }
3503 return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
3504 }
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
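/* Build a SYN-ACK for @req without attaching it to any queue: reserve
 * header room, clamp the advertised MSS to the route metric, fill in
 * the negotiated TCP options and return the skb to the caller for
 * transmission. The dst reference is released on allocation failure,
 * otherwise it is attached to the skb.
 */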
3516 struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
3517 struct request_sock *req,
3518 struct tcp_fastopen_cookie *foc,
3519 enum tcp_synack_type synack_type,
3520 struct sk_buff *syn_skb)
3521 {
3522 struct inet_request_sock *ireq = inet_rsk(req);
3523 const struct tcp_sock *tp = tcp_sk(sk);
3524 struct tcp_md5sig_key *md5 = NULL;
3525 struct tcp_out_options opts;
3526 struct sk_buff *skb;
3527 int tcp_header_size;
3528 struct tcphdr *th;
3529 int mss;
3530 u64 now;
3531
3532 skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
3533 if (unlikely(!skb)) {
3534 dst_release(dst);
3535 return NULL;
3536 }
3537
3538 skb_reserve(skb, MAX_TCP_HEADER);
3539
3540 switch (synack_type) {
3541 case TCP_SYNACK_NORMAL:
3542 skb_set_owner_w(skb, req_to_sk(req));
3543 break;
3544 case TCP_SYNACK_COOKIE:
3545
3546
3547
3548 break;
3549 case TCP_SYNACK_FASTOPEN:
3550
3551
3552
3553
3554 skb_set_owner_w(skb, (struct sock *)sk);
3555 break;
3556 }
3557 skb_dst_set(skb, dst);
3558
3559 mss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
3560
3561 memset(&opts, 0, sizeof(opts));
3562 now = tcp_clock_ns();
3563 #ifdef CONFIG_SYN_COOKIES
3564 if (unlikely(synack_type == TCP_SYNACK_COOKIE && ireq->tstamp_ok))
3565 skb_set_delivery_time(skb, cookie_init_timestamp(req, now),
3566 true);
3567 else
3568 #endif
3569 {
3570 skb_set_delivery_time(skb, now, true);
3571 if (!tcp_rsk(req)->snt_synack)
3572 tcp_rsk(req)->snt_synack = tcp_skb_timestamp_us(skb);
3573 }
3574
3575 #ifdef CONFIG_TCP_MD5SIG
3576 rcu_read_lock();
3577 md5 = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req));
3578 #endif
3579 skb_set_hash(skb, tcp_rsk(req)->txhash, PKT_HASH_TYPE_L4);
3580
3581 TCP_SKB_CB(skb)->tcp_flags = TCPHDR_SYN | TCPHDR_ACK;
3582 tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, md5,
3583 foc, synack_type,
3584 syn_skb) + sizeof(*th);
3585
3586 skb_push(skb, tcp_header_size);
3587 skb_reset_transport_header(skb);
3588
3589 th = (struct tcphdr *)skb->data;
3590 memset(th, 0, sizeof(struct tcphdr));
3591 th->syn = 1;
3592 th->ack = 1;
3593 tcp_ecn_make_synack(req, th);
3594 th->source = htons(ireq->ir_num);
3595 th->dest = ireq->ir_rmt_port;
3596 skb->mark = ireq->ir_mark;
3597 skb->ip_summed = CHECKSUM_PARTIAL;
3598 th->seq = htonl(tcp_rsk(req)->snt_isn);
3599
3600 th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt);
3601
3602
3603 th->window = htons(min(req->rsk_rcv_wnd, 65535U));
3604 tcp_options_write(th, NULL, &opts);
3605 th->doff = (tcp_header_size >> 2);
3606 __TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS);
3607
3608 #ifdef CONFIG_TCP_MD5SIG
3609
3610 if (md5)
3611 tcp_rsk(req)->af_specific->calc_md5_hash(opts.hash_location,
3612 md5, req_to_sk(req), skb);
3613 rcu_read_unlock();
3614 #endif
3615
3616 bpf_skops_write_hdr_opt((struct sock *)sk, skb, req, syn_skb,
3617 synack_type, &opts);
3618
3619 skb_set_delivery_time(skb, now, true);
3620 tcp_add_tx_delay(skb, tp);
3621
3622 return skb;
3623 }
3624 EXPORT_SYMBOL(tcp_make_synack);
3625
3626 static void tcp_ca_dst_init(struct sock *sk, const struct dst_entry *dst)
3627 {
3628 struct inet_connection_sock *icsk = inet_csk(sk);
3629 const struct tcp_congestion_ops *ca;
3630 u32 ca_key = dst_metric(dst, RTAX_CC_ALGO);
3631
3632 if (ca_key == TCP_CA_UNSPEC)
3633 return;
3634
3635 rcu_read_lock();
3636 ca = tcp_ca_find_key(ca_key);
3637 if (likely(ca && bpf_try_module_get(ca, ca->owner))) {
3638 bpf_module_put(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner);
3639 icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst);
3640 icsk->icsk_ca_ops = ca;
3641 }
3642 rcu_read_unlock();
3643 }
3644
3645
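/* Do all connect socket setup that can be done address-family
 * independent: header sizes, MSS and window clamps, the initial receive
 * window and window scale, and a reset of the send side state.
 */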
3646 static void tcp_connect_init(struct sock *sk)
3647 {
3648 const struct dst_entry *dst = __sk_dst_get(sk);
3649 struct tcp_sock *tp = tcp_sk(sk);
3650 __u8 rcv_wscale;
3651 u32 rcv_wnd;
3652
3653
3654
3655
3656 tp->tcp_header_len = sizeof(struct tcphdr);
3657 if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_timestamps))
3658 tp->tcp_header_len += TCPOLEN_TSTAMP_ALIGNED;
3659
3660 #ifdef CONFIG_TCP_MD5SIG
3661 if (tp->af_specific->md5_lookup(sk, sk))
3662 tp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED;
3663 #endif
3664
3665
3666 if (tp->rx_opt.user_mss)
3667 tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
3668 tp->max_window = 0;
3669 tcp_mtup_init(sk);
3670 tcp_sync_mss(sk, dst_mtu(dst));
3671
3672 tcp_ca_dst_init(sk, dst);
3673
3674 if (!tp->window_clamp)
3675 tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
3676 tp->advmss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
3677
3678 tcp_initialize_rcv_mss(sk);
3679
3680
3681 if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
3682 (tp->window_clamp > tcp_full_space(sk) || tp->window_clamp == 0))
3683 tp->window_clamp = tcp_full_space(sk);
3684
3685 rcv_wnd = tcp_rwnd_init_bpf(sk);
3686 if (rcv_wnd == 0)
3687 rcv_wnd = dst_metric(dst, RTAX_INITRWND);
3688
3689 tcp_select_initial_window(sk, tcp_full_space(sk),
3690 tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
3691 &tp->rcv_wnd,
3692 &tp->window_clamp,
3693 READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_window_scaling),
3694 &rcv_wscale,
3695 rcv_wnd);
3696
3697 tp->rx_opt.rcv_wscale = rcv_wscale;
3698 tp->rcv_ssthresh = tp->rcv_wnd;
3699
3700 sk->sk_err = 0;
3701 sock_reset_flag(sk, SOCK_DONE);
3702 tp->snd_wnd = 0;
3703 tcp_init_wl(tp, 0);
3704 tcp_write_queue_purge(sk);
3705 tp->snd_una = tp->write_seq;
3706 tp->snd_sml = tp->write_seq;
3707 tp->snd_up = tp->write_seq;
3708 WRITE_ONCE(tp->snd_nxt, tp->write_seq);
3709
3710 if (likely(!tp->repair))
3711 tp->rcv_nxt = 0;
3712 else
3713 tp->rcv_tstamp = tcp_jiffies32;
3714 tp->rcv_wup = tp->rcv_nxt;
3715 WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
3716
3717 inet_csk(sk)->icsk_rto = tcp_timeout_init(sk);
3718 inet_csk(sk)->icsk_retransmits = 0;
3719 tcp_clear_retrans(tp);
3720 }
3721
3722 static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb)
3723 {
3724 struct tcp_sock *tp = tcp_sk(sk);
3725 struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
3726
3727 tcb->end_seq += skb->len;
3728 __skb_header_release(skb);
3729 sk_wmem_queued_add(sk, skb->truesize);
3730 sk_mem_charge(sk, skb->truesize);
3731 WRITE_ONCE(tp->write_seq, tcb->end_seq);
3732 tp->packets_out += tcp_skb_pcount(skb);
3733 }
3734
3735
3736
3737
3738
3739
3740
3741
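/* TCP Fast Open: try to send data in the SYN. This needs a valid Fast
 * Open cookie, and the payload is limited by the MSS implied by the
 * cached PMTU minus worst-case option space; fall back to a plain SYN
 * if the cookie is missing or the copy/allocation fails.
 */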
3742 static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
3743 {
3744 struct inet_connection_sock *icsk = inet_csk(sk);
3745 struct tcp_sock *tp = tcp_sk(sk);
3746 struct tcp_fastopen_request *fo = tp->fastopen_req;
3747 int space, err = 0;
3748 struct sk_buff *syn_data;
3749
3750 tp->rx_opt.mss_clamp = tp->advmss;
3751 if (!tcp_fastopen_cookie_check(sk, &tp->rx_opt.mss_clamp, &fo->cookie))
3752 goto fallback;
3753
3754
3755
3756
3757
3758 tp->rx_opt.mss_clamp = tcp_mss_clamp(tp, tp->rx_opt.mss_clamp);
3759
3760 tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
3761
3762 space = __tcp_mtu_to_mss(sk, icsk->icsk_pmtu_cookie) -
3763 MAX_TCP_OPTION_SPACE;
3764
3765 space = min_t(size_t, space, fo->size);
3766
3767
3768 space = min_t(size_t, space, SKB_MAX_HEAD(MAX_TCP_HEADER));
3769
3770 syn_data = tcp_stream_alloc_skb(sk, space, sk->sk_allocation, false);
3771 if (!syn_data)
3772 goto fallback;
3773 memcpy(syn_data->cb, syn->cb, sizeof(syn->cb));
3774 if (space) {
3775 int copied = copy_from_iter(skb_put(syn_data, space), space,
3776 &fo->data->msg_iter);
3777 if (unlikely(!copied)) {
3778 tcp_skb_tsorted_anchor_cleanup(syn_data);
3779 kfree_skb(syn_data);
3780 goto fallback;
3781 }
3782 if (copied != space) {
3783 skb_trim(syn_data, copied);
3784 space = copied;
3785 }
3786 skb_zcopy_set(syn_data, fo->uarg, NULL);
3787 }
3788
3789 if (space == fo->size)
3790 fo->data = NULL;
3791 fo->copied = space;
3792
3793 tcp_connect_queue_skb(sk, syn_data);
3794 if (syn_data->len)
3795 tcp_chrono_start(sk, TCP_CHRONO_BUSY);
3796
3797 err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation);
3798
3799 skb_set_delivery_time(syn, syn_data->skb_mstamp_ns, true);
3800
3801
3802
3803
3804
3805
3806 TCP_SKB_CB(syn_data)->seq++;
3807 TCP_SKB_CB(syn_data)->tcp_flags = TCPHDR_ACK | TCPHDR_PSH;
3808 if (!err) {
3809 tp->syn_data = (fo->copied > 0);
3810 tcp_rbtree_insert(&sk->tcp_rtx_queue, syn_data);
3811 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT);
3812 goto done;
3813 }
3814
3815
3816 __skb_queue_tail(&sk->sk_write_queue, syn_data);
3817 tp->packets_out -= tcp_skb_pcount(syn_data);
3818
3819 fallback:
3820
3821 if (fo->cookie.len > 0)
3822 fo->cookie.len = 0;
3823 err = tcp_transmit_skb(sk, syn, 1, sk->sk_allocation);
3824 if (err)
3825 tp->syn_fastopen = 0;
3826 done:
3827 fo->cookie.len = -1;
3828 return err;
3829 }
3830
3831
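/* Build and send a SYN (or SYN plus data for Fast Open), queue it on
 * the rtx queue and arm the retransmit timer for the handshake.
 */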
3832 int tcp_connect(struct sock *sk)
3833 {
3834 struct tcp_sock *tp = tcp_sk(sk);
3835 struct sk_buff *buff;
3836 int err;
3837
3838 tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_CONNECT_CB, 0, NULL);
3839
3840 if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
3841 return -EHOSTUNREACH;
3842
3843 tcp_connect_init(sk);
3844
3845 if (unlikely(tp->repair)) {
3846 tcp_finish_connect(sk, NULL);
3847 return 0;
3848 }
3849
3850 buff = tcp_stream_alloc_skb(sk, 0, sk->sk_allocation, true);
3851 if (unlikely(!buff))
3852 return -ENOBUFS;
3853
3854 tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN);
3855 tcp_mstamp_refresh(tp);
3856 tp->retrans_stamp = tcp_time_stamp(tp);
3857 tcp_connect_queue_skb(sk, buff);
3858 tcp_ecn_send_syn(sk, buff);
3859 tcp_rbtree_insert(&sk->tcp_rtx_queue, buff);
3860
3861
3862 err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) :
3863 tcp_transmit_skb(sk, buff, 1, sk->sk_allocation);
3864 if (err == -ECONNREFUSED)
3865 return err;
3866
3867
3868
3869
3870 WRITE_ONCE(tp->snd_nxt, tp->write_seq);
3871 tp->pushed_seq = tp->write_seq;
3872 buff = tcp_send_head(sk);
3873 if (unlikely(buff)) {
3874 WRITE_ONCE(tp->snd_nxt, TCP_SKB_CB(buff)->seq);
3875 tp->pushed_seq = TCP_SKB_CB(buff)->seq;
3876 }
3877 TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS);
3878
3879
3880 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
3881 inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
3882 return 0;
3883 }
3884 EXPORT_SYMBOL(tcp_connect);
3885
3886
3887
3888
3889
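/* Schedule a delayed ACK: clamp the delay to at most half a second
 * (less in interactive/pingpong mode), further bounded by the measured
 * RTT and icsk_delack_max; send immediately if the already pending
 * timer is about to expire, otherwise (re)arm the delack timer.
 */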
3890 void tcp_send_delayed_ack(struct sock *sk)
3891 {
3892 struct inet_connection_sock *icsk = inet_csk(sk);
3893 int ato = icsk->icsk_ack.ato;
3894 unsigned long timeout;
3895
3896 if (ato > TCP_DELACK_MIN) {
3897 const struct tcp_sock *tp = tcp_sk(sk);
3898 int max_ato = HZ / 2;
3899
3900 if (inet_csk_in_pingpong_mode(sk) ||
3901 (icsk->icsk_ack.pending & ICSK_ACK_PUSHED))
3902 max_ato = TCP_DELACK_MAX;
3903
3904
3905
3906
3907
3908
3909
3910 if (tp->srtt_us) {
3911 int rtt = max_t(int, usecs_to_jiffies(tp->srtt_us >> 3),
3912 TCP_DELACK_MIN);
3913
3914 if (rtt < max_ato)
3915 max_ato = rtt;
3916 }
3917
3918 ato = min(ato, max_ato);
3919 }
3920
3921 ato = min_t(u32, ato, inet_csk(sk)->icsk_delack_max);
3922
3923
3924 timeout = jiffies + ato;
3925
3926
3927 if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) {
3928
3929 if (time_before_eq(icsk->icsk_ack.timeout, jiffies + (ato >> 2))) {
3930 tcp_send_ack(sk);
3931 return;
3932 }
3933
3934 if (!time_before(timeout, icsk->icsk_ack.timeout))
3935 timeout = icsk->icsk_ack.timeout;
3936 }
3937 icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER;
3938 icsk->icsk_ack.timeout = timeout;
3939 sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout);
3940 }
3941
3942
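/* Send an ACK. @rcv_nxt lets callers acknowledge a value other than
 * tp->rcv_nxt. If no memory is available, fall back to a delayed ACK
 * with exponential backoff of the retry delay.
 */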
3943 void __tcp_send_ack(struct sock *sk, u32 rcv_nxt)
3944 {
3945 struct sk_buff *buff;
3946
3947
3948 if (sk->sk_state == TCP_CLOSE)
3949 return;
3950
3951
3952
3953
3954
3955 buff = alloc_skb(MAX_TCP_HEADER,
3956 sk_gfp_mask(sk, GFP_ATOMIC | __GFP_NOWARN));
3957 if (unlikely(!buff)) {
3958 struct inet_connection_sock *icsk = inet_csk(sk);
3959 unsigned long delay;
3960
3961 delay = TCP_DELACK_MAX << icsk->icsk_ack.retry;
3962 if (delay < TCP_RTO_MAX)
3963 icsk->icsk_ack.retry++;
3964 inet_csk_schedule_ack(sk);
3965 icsk->icsk_ack.ato = TCP_ATO_MIN;
3966 inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, delay, TCP_RTO_MAX);
3967 return;
3968 }
3969
3970
3971 skb_reserve(buff, MAX_TCP_HEADER);
3972 tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPHDR_ACK);
3973
3974
3975
3976
3977
3978 skb_set_tcp_pure_ack(buff);
3979
3980
3981 __tcp_transmit_skb(sk, buff, 0, (__force gfp_t)0, rcv_nxt);
3982 }
3983 EXPORT_SYMBOL_GPL(__tcp_send_ack);
3984
3985 void tcp_send_ack(struct sock *sk)
3986 {
3987 __tcp_send_ack(sk, tcp_sk(sk)->rcv_nxt);
3988 }
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
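/* Send a bare ACK carrying an out-of-date sequence number (snd_una - 1,
 * or snd_una when @urgent is set) to force the peer to respond with a
 * fresh ACK / window update. The skb is neither queued nor cloned.
 */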
4001 static int tcp_xmit_probe_skb(struct sock *sk, int urgent, int mib)
4002 {
4003 struct tcp_sock *tp = tcp_sk(sk);
4004 struct sk_buff *skb;
4005
4006
4007 skb = alloc_skb(MAX_TCP_HEADER,
4008 sk_gfp_mask(sk, GFP_ATOMIC | __GFP_NOWARN));
4009 if (!skb)
4010 return -1;
4011
4012
4013 skb_reserve(skb, MAX_TCP_HEADER);
4014
4015
4016
4017
4018 tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK);
4019 NET_INC_STATS(sock_net(sk), mib);
4020 return tcp_transmit_skb(sk, skb, 0, (__force gfp_t)0);
4021 }
4022
4023
4024 void tcp_send_window_probe(struct sock *sk)
4025 {
4026 if (sk->sk_state == TCP_ESTABLISHED) {
4027 tcp_sk(sk)->snd_wl1 = tcp_sk(sk)->rcv_nxt - 1;
4028 tcp_mstamp_refresh(tcp_sk(sk));
4029 tcp_xmit_probe_skb(sk, 0, LINUX_MIB_TCPWINPROBE);
4030 }
4031 }
4032
4033
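/* Window probe / keepalive helper: if unsent data fits at least partly
 * in the current window, push one segment with PSH set (fragmenting it
 * to the window or MSS if needed); otherwise emit a bare probe, plus an
 * urgent-mode probe when the urgent pointer is in range.
 */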
4034 int tcp_write_wakeup(struct sock *sk, int mib)
4035 {
4036 struct tcp_sock *tp = tcp_sk(sk);
4037 struct sk_buff *skb;
4038
4039 if (sk->sk_state == TCP_CLOSE)
4040 return -1;
4041
4042 skb = tcp_send_head(sk);
4043 if (skb && before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) {
4044 int err;
4045 unsigned int mss = tcp_current_mss(sk);
4046 unsigned int seg_size = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
4047
4048 if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq))
4049 tp->pushed_seq = TCP_SKB_CB(skb)->end_seq;
4050
4051
4052
4053
4054
4055 if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq ||
4056 skb->len > mss) {
4057 seg_size = min(seg_size, mss);
4058 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
4059 if (tcp_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE,
4060 skb, seg_size, mss, GFP_ATOMIC))
4061 return -1;
4062 } else if (!tcp_skb_pcount(skb))
4063 tcp_set_skb_tso_segs(skb, mss);
4064
4065 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
4066 err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
4067 if (!err)
4068 tcp_event_new_data_sent(sk, skb);
4069 return err;
4070 } else {
4071 if (between(tp->snd_up, tp->snd_una + 1, tp->snd_una + 0xFFFF))
4072 tcp_xmit_probe_skb(sk, 1, mib);
4073 return tcp_xmit_probe_skb(sk, 0, mib);
4074 }
4075 }
4076
4077
4078
4079
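/* Zero window probe timer handler: send a probe and, while the peer
 * keeps advertising a zero window, back the probe timer off
 * exponentially, clamped to the user timeout when one is set.
 */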
4080 void tcp_send_probe0(struct sock *sk)
4081 {
4082 struct inet_connection_sock *icsk = inet_csk(sk);
4083 struct tcp_sock *tp = tcp_sk(sk);
4084 struct net *net = sock_net(sk);
4085 unsigned long timeout;
4086 int err;
4087
4088 err = tcp_write_wakeup(sk, LINUX_MIB_TCPWINPROBE);
4089
4090 if (tp->packets_out || tcp_write_queue_empty(sk)) {
4091
4092 icsk->icsk_probes_out = 0;
4093 icsk->icsk_backoff = 0;
4094 icsk->icsk_probes_tstamp = 0;
4095 return;
4096 }
4097
4098 icsk->icsk_probes_out++;
4099 if (err <= 0) {
4100 if (icsk->icsk_backoff < READ_ONCE(net->ipv4.sysctl_tcp_retries2))
4101 icsk->icsk_backoff++;
4102 timeout = tcp_probe0_when(sk, TCP_RTO_MAX);
4103 } else {
4104
4105
4106
4107 timeout = TCP_RESOURCE_PROBE_INTERVAL;
4108 }
4109
4110 timeout = tcp_clamp_probe0_to_user_timeout(sk, timeout);
4111 tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, timeout, TCP_RTO_MAX);
4112 }
4113
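/* Retransmit a SYN-ACK for a request socket, refreshing the tx hash if
 * rehashing is enabled and updating the retransmit counters.
 */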
4114 int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
4115 {
4116 const struct tcp_request_sock_ops *af_ops = tcp_rsk(req)->af_specific;
4117 struct flowi fl;
4118 int res;
4119
4120
4121 if (READ_ONCE(sk->sk_txrehash) == SOCK_TXREHASH_ENABLED)
4122 tcp_rsk(req)->txhash = net_tx_rndhash();
4123 res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL,
4124 NULL);
4125 if (!res) {
4126 TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
4127 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
4128 if (unlikely(tcp_passive_fastopen(sk)))
4129 tcp_sk(sk)->total_retrans++;
4130 trace_tcp_retransmit_synack(sk, req);
4131 }
4132 return res;
4133 }
4134 EXPORT_SYMBOL(tcp_rtx_synack);