0244 #define pr_fmt(fmt) "TCP: " fmt
0245
0246 #include <crypto/hash.h>
0247 #include <linux/kernel.h>
0248 #include <linux/module.h>
0249 #include <linux/types.h>
0250 #include <linux/fcntl.h>
0251 #include <linux/poll.h>
0252 #include <linux/inet_diag.h>
0253 #include <linux/init.h>
0254 #include <linux/fs.h>
0255 #include <linux/skbuff.h>
0256 #include <linux/scatterlist.h>
0257 #include <linux/splice.h>
0258 #include <linux/net.h>
0259 #include <linux/socket.h>
0260 #include <linux/random.h>
0261 #include <linux/memblock.h>
0262 #include <linux/highmem.h>
0263 #include <linux/cache.h>
0264 #include <linux/err.h>
0265 #include <linux/time.h>
0266 #include <linux/slab.h>
0267 #include <linux/errqueue.h>
0268 #include <linux/static_key.h>
0269 #include <linux/btf.h>
0270
0271 #include <net/icmp.h>
0272 #include <net/inet_common.h>
0273 #include <net/tcp.h>
0274 #include <net/mptcp.h>
0275 #include <net/xfrm.h>
0276 #include <net/ip.h>
0277 #include <net/sock.h>
0278
0279 #include <linux/uaccess.h>
0280 #include <asm/ioctls.h>
0281 #include <net/busy_poll.h>
0282
0283
0284 enum {
0285 TCP_CMSG_INQ = 1,
0286 TCP_CMSG_TS = 2
0287 };
0288
0289 DEFINE_PER_CPU(unsigned int, tcp_orphan_count);
0290 EXPORT_PER_CPU_SYMBOL_GPL(tcp_orphan_count);
0291
0292 long sysctl_tcp_mem[3] __read_mostly;
0293 EXPORT_SYMBOL(sysctl_tcp_mem);
0294
0295 atomic_long_t tcp_memory_allocated ____cacheline_aligned_in_smp;
0296 EXPORT_SYMBOL(tcp_memory_allocated);
0297 DEFINE_PER_CPU(int, tcp_memory_per_cpu_fw_alloc);
0298 EXPORT_PER_CPU_SYMBOL_GPL(tcp_memory_per_cpu_fw_alloc);
0299
0300 #if IS_ENABLED(CONFIG_SMC)
0301 DEFINE_STATIC_KEY_FALSE(tcp_have_smc);
0302 EXPORT_SYMBOL(tcp_have_smc);
0303 #endif
0304
/* Current number of TCP sockets. */
0308 struct percpu_counter tcp_sockets_allocated ____cacheline_aligned_in_smp;
0309 EXPORT_SYMBOL(tcp_sockets_allocated);
0310
/* TCP splice context */
0314 struct tcp_splice_state {
0315 struct pipe_inode_info *pipe;
0316 size_t len;
0317 unsigned int flags;
0318 };
0319
/* Global TCP memory-pressure flag: zero while we are not under pressure,
 * otherwise the jiffies timestamp of when pressure started (see
 * tcp_enter_memory_pressure()/tcp_leave_memory_pressure() below).
 */
0326 unsigned long tcp_memory_pressure __read_mostly;
0327 EXPORT_SYMBOL_GPL(tcp_memory_pressure);
0328
0329 void tcp_enter_memory_pressure(struct sock *sk)
0330 {
0331 unsigned long val;
0332
0333 if (READ_ONCE(tcp_memory_pressure))
0334 return;
0335 val = jiffies;
0336
0337 if (!val)
0338 val--;
0339 if (!cmpxchg(&tcp_memory_pressure, 0, val))
0340 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES);
0341 }
0342 EXPORT_SYMBOL_GPL(tcp_enter_memory_pressure);
0343
0344 void tcp_leave_memory_pressure(struct sock *sk)
0345 {
0346 unsigned long val;
0347
0348 if (!READ_ONCE(tcp_memory_pressure))
0349 return;
0350 val = xchg(&tcp_memory_pressure, 0);
0351 if (val)
0352 NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURESCHRONO,
0353 jiffies_to_msecs(jiffies - val));
0354 }
0355 EXPORT_SYMBOL_GPL(tcp_leave_memory_pressure);
0356
/* Convert seconds to retransmits based on initial and max timeout */
0358 static u8 secs_to_retrans(int seconds, int timeout, int rto_max)
0359 {
0360 u8 res = 0;
0361
0362 if (seconds > 0) {
0363 int period = timeout;
0364
0365 res = 1;
0366 while (seconds > period && res < 255) {
0367 res++;
0368 timeout <<= 1;
0369 if (timeout > rto_max)
0370 timeout = rto_max;
0371 period += timeout;
0372 }
0373 }
0374 return res;
0375 }
0376
/* Convert retransmits to seconds based on initial and max timeout */
0378 static int retrans_to_secs(u8 retrans, int timeout, int rto_max)
0379 {
0380 int period = 0;
0381
0382 if (retrans > 0) {
0383 period = timeout;
0384 while (--retrans) {
0385 timeout <<= 1;
0386 if (timeout > rto_max)
0387 timeout = rto_max;
0388 period += timeout;
0389 }
0390 }
0391 return period;
0392 }
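/* Worked example of the two conversions above (illustrative, assuming an
 * initial timeout of 1 and a cap of 120, i.e. TCP_TIMEOUT_INIT and
 * TCP_RTO_MAX expressed in seconds): retrans_to_secs(5, 1, 120) walks
 * 1 + 2 + 4 + 8 + 16 = 31, and secs_to_retrans(31, 1, 120) maps back to 5.
 * Once the doubled timeout hits the cap, every further retransmit simply
 * adds the capped value.
 */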
0393
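/* Delivery rate sample in bytes per second: rate_delivered packets of
 * mss_cache bytes each were delivered over rate_interval_us microseconds,
 * so rate64 = delivered * mss * USEC_PER_SEC / interval.  Returns 0 when
 * no valid sample is available.
 */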
0394 static u64 tcp_compute_delivery_rate(const struct tcp_sock *tp)
0395 {
0396 u32 rate = READ_ONCE(tp->rate_delivered);
0397 u32 intv = READ_ONCE(tp->rate_interval_us);
0398 u64 rate64 = 0;
0399
0400 if (rate && intv) {
0401 rate64 = (u64)rate * tp->mss_cache * USEC_PER_SEC;
0402 do_div(rate64, intv);
0403 }
0404 return rate64;
0405 }
0406
/* Address-family independent initialization for a tcp_sock.
 *
 * NOTE: a lot of fields are already zeroed by sk_alloc(), so they
 * need not be initialized here.
 */
0412 void tcp_init_sock(struct sock *sk)
0413 {
0414 struct inet_connection_sock *icsk = inet_csk(sk);
0415 struct tcp_sock *tp = tcp_sk(sk);
0416
0417 tp->out_of_order_queue = RB_ROOT;
0418 sk->tcp_rtx_queue = RB_ROOT;
0419 tcp_init_xmit_timers(sk);
0420 INIT_LIST_HEAD(&tp->tsq_node);
0421 INIT_LIST_HEAD(&tp->tsorted_sent_queue);
0422
0423 icsk->icsk_rto = TCP_TIMEOUT_INIT;
0424 icsk->icsk_rto_min = TCP_RTO_MIN;
0425 icsk->icsk_delack_max = TCP_DELACK_MAX;
0426 tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
0427 minmax_reset(&tp->rtt_min, tcp_jiffies32, ~0U);
0428
	/* So many TCP implementations out there (incorrectly) count the
	 * initial SYN frame in their delayed-ACK and congestion control
	 * algorithms that we must have the following bandaid to talk
	 * efficiently to them.  -DaveM
	 */
0434 tcp_snd_cwnd_set(tp, TCP_INIT_CWND);
0435
0436
0437 tp->app_limited = ~0U;
0438
0439
0440
0441
0442 tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
0443 tp->snd_cwnd_clamp = ~0;
0444 tp->mss_cache = TCP_MSS_DEFAULT;
0445
0446 tp->reordering = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reordering);
0447 tcp_assign_congestion_control(sk);
0448
0449 tp->tsoffset = 0;
0450 tp->rack.reo_wnd_steps = 1;
0451
0452 sk->sk_write_space = sk_stream_write_space;
0453 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
0454
0455 icsk->icsk_sync_mss = tcp_sync_mss;
0456
0457 WRITE_ONCE(sk->sk_sndbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[1]));
0458 WRITE_ONCE(sk->sk_rcvbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[1]));
0459
0460 sk_sockets_allocated_inc(sk);
0461 }
0462 EXPORT_SYMBOL(tcp_init_sock);
0463
0464 static void tcp_tx_timestamp(struct sock *sk, u16 tsflags)
0465 {
0466 struct sk_buff *skb = tcp_write_queue_tail(sk);
0467
0468 if (tsflags && skb) {
0469 struct skb_shared_info *shinfo = skb_shinfo(skb);
0470 struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
0471
0472 sock_tx_timestamp(sk, tsflags, &shinfo->tx_flags);
0473 if (tsflags & SOF_TIMESTAMPING_TX_ACK)
0474 tcb->txstamp_ack = 1;
0475 if (tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK)
0476 shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1;
0477 }
0478 }
0479
0480 static bool tcp_stream_is_readable(struct sock *sk, int target)
0481 {
0482 if (tcp_epollin_ready(sk, target))
0483 return true;
0484 return sk_is_readable(sk);
0485 }
0486
/*
 *	Wait for a TCP event.
 *
 *	Note that we don't need to lock the socket, as the upper poll layers
 *	take care of normal races (between the test and the event) and we don't
 *	go look at any of the socket buffers directly.
 */
0494 __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
0495 {
0496 __poll_t mask;
0497 struct sock *sk = sock->sk;
0498 const struct tcp_sock *tp = tcp_sk(sk);
0499 int state;
0500
0501 sock_poll_wait(file, sock, wait);
0502
0503 state = inet_sk_state_load(sk);
0504 if (state == TCP_LISTEN)
0505 return inet_csk_listen_poll(sk);
0506
	/* Socket is not locked. We are protected from async events
	 * by poll logic and correct handling of state changes
	 * made by other threads is impossible in any case.
	 */
0512 mask = 0;
0513
	/*
	 * EPOLLHUP is tricky: poll() has no notion of a half-closed
	 * connection, so we only report EPOLLHUP once both directions are
	 * shut down (or the socket has reached TCP_CLOSE).  A receive-side
	 * shutdown alone is reported as EPOLLIN | EPOLLRDNORM | EPOLLRDHUP
	 * below, so readers see EOF while writers may keep sending.
	 */
0541 if (sk->sk_shutdown == SHUTDOWN_MASK || state == TCP_CLOSE)
0542 mask |= EPOLLHUP;
0543 if (sk->sk_shutdown & RCV_SHUTDOWN)
0544 mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
0545
0546
0547 if (state != TCP_SYN_SENT &&
0548 (state != TCP_SYN_RECV || rcu_access_pointer(tp->fastopen_rsk))) {
0549 int target = sock_rcvlowat(sk, 0, INT_MAX);
0550 u16 urg_data = READ_ONCE(tp->urg_data);
0551
0552 if (unlikely(urg_data) &&
0553 READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq) &&
0554 !sock_flag(sk, SOCK_URGINLINE))
0555 target++;
0556
0557 if (tcp_stream_is_readable(sk, target))
0558 mask |= EPOLLIN | EPOLLRDNORM;
0559
0560 if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
0561 if (__sk_stream_is_writeable(sk, 1)) {
0562 mask |= EPOLLOUT | EPOLLWRNORM;
0563 } else {
0564 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
0565 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
			/* Race breaker: if send-buffer space is freed after
			 * the writeable test above but before SOCK_NOSPACE is
			 * set, the wakeup could be lost.  The barrier below
			 * pairs with the one on the wakeup side, so re-test
			 * writeability afterwards.
			 */
0572 smp_mb__after_atomic();
0573 if (__sk_stream_is_writeable(sk, 1))
0574 mask |= EPOLLOUT | EPOLLWRNORM;
0575 }
0576 } else
0577 mask |= EPOLLOUT | EPOLLWRNORM;
0578
0579 if (urg_data & TCP_URG_VALID)
0580 mask |= EPOLLPRI;
0581 } else if (state == TCP_SYN_SENT && inet_sk(sk)->defer_connect) {
		/* Active TCP fastopen socket with defer_connect.
		 * Return EPOLLOUT so the application can call write()
		 * in order for the kernel to generate SYN+data.
		 */
0586 mask |= EPOLLOUT | EPOLLWRNORM;
0587 }
0588
0589 smp_rmb();
0590 if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue))
0591 mask |= EPOLLERR;
0592
0593 return mask;
0594 }
0595 EXPORT_SYMBOL(tcp_poll);
0596
0597 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
0598 {
0599 struct tcp_sock *tp = tcp_sk(sk);
0600 int answ;
0601 bool slow;
0602
0603 switch (cmd) {
0604 case SIOCINQ:
0605 if (sk->sk_state == TCP_LISTEN)
0606 return -EINVAL;
0607
0608 slow = lock_sock_fast(sk);
0609 answ = tcp_inq(sk);
0610 unlock_sock_fast(sk, slow);
0611 break;
0612 case SIOCATMARK:
0613 answ = READ_ONCE(tp->urg_data) &&
0614 READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq);
0615 break;
0616 case SIOCOUTQ:
0617 if (sk->sk_state == TCP_LISTEN)
0618 return -EINVAL;
0619
0620 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
0621 answ = 0;
0622 else
0623 answ = READ_ONCE(tp->write_seq) - tp->snd_una;
0624 break;
0625 case SIOCOUTQNSD:
0626 if (sk->sk_state == TCP_LISTEN)
0627 return -EINVAL;
0628
0629 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
0630 answ = 0;
0631 else
0632 answ = READ_ONCE(tp->write_seq) -
0633 READ_ONCE(tp->snd_nxt);
0634 break;
0635 default:
0636 return -ENOIOCTLCMD;
0637 }
0638
0639 return put_user(answ, (int __user *)arg);
0640 }
0641 EXPORT_SYMBOL(tcp_ioctl);
0642
0643 void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
0644 {
0645 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
0646 tp->pushed_seq = tp->write_seq;
0647 }
0648
0649 static inline bool forced_push(const struct tcp_sock *tp)
0650 {
0651 return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
0652 }
0653
0654 void tcp_skb_entail(struct sock *sk, struct sk_buff *skb)
0655 {
0656 struct tcp_sock *tp = tcp_sk(sk);
0657 struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
0658
0659 tcb->seq = tcb->end_seq = tp->write_seq;
0660 tcb->tcp_flags = TCPHDR_ACK;
0661 __skb_header_release(skb);
0662 tcp_add_write_queue_tail(sk, skb);
0663 sk_wmem_queued_add(sk, skb->truesize);
0664 sk_mem_charge(sk, skb->truesize);
0665 if (tp->nonagle & TCP_NAGLE_PUSH)
0666 tp->nonagle &= ~TCP_NAGLE_PUSH;
0667
0668 tcp_slow_start_after_idle_check(sk);
0669 }
0670
0671 static inline void tcp_mark_urg(struct tcp_sock *tp, int flags)
0672 {
0673 if (flags & MSG_OOB)
0674 tp->snd_up = tp->write_seq;
0675 }
0676
/* Autocorking: if this skb is still smaller than size_goal and earlier data
 * is in flight (retransmit queue not empty and sk_wmem_alloc larger than
 * this skb alone), defer the actual transmit.  The TX completion of the
 * in-flight data (via TSQ) will flush the write queue later, which gives
 * the application a chance to append more payload and build bigger packets.
 */
0687 static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb,
0688 int size_goal)
0689 {
0690 return skb->len < size_goal &&
0691 READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_autocorking) &&
0692 !tcp_rtx_queue_empty(sk) &&
0693 refcount_read(&sk->sk_wmem_alloc) > skb->truesize &&
0694 tcp_skb_can_collapse_to(skb);
0695 }
0696
0697 void tcp_push(struct sock *sk, int flags, int mss_now,
0698 int nonagle, int size_goal)
0699 {
0700 struct tcp_sock *tp = tcp_sk(sk);
0701 struct sk_buff *skb;
0702
0703 skb = tcp_write_queue_tail(sk);
0704 if (!skb)
0705 return;
0706 if (!(flags & MSG_MORE) || forced_push(tp))
0707 tcp_mark_push(tp, skb);
0708
0709 tcp_mark_urg(tp, flags);
0710
0711 if (tcp_should_autocork(sk, skb, size_goal)) {
0712
0713
0714 if (!test_bit(TSQ_THROTTLED, &sk->sk_tsq_flags)) {
0715 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAUTOCORKING);
0716 set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
0717 }
0718
0719
0720
0721 if (refcount_read(&sk->sk_wmem_alloc) > skb->truesize)
0722 return;
0723 }
0724
0725 if (flags & MSG_MORE)
0726 nonagle = TCP_NAGLE_CORK;
0727
0728 __tcp_push_pending_frames(sk, mss_now, nonagle);
0729 }
0730
0731 static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb,
0732 unsigned int offset, size_t len)
0733 {
0734 struct tcp_splice_state *tss = rd_desc->arg.data;
0735 int ret;
0736
0737 ret = skb_splice_bits(skb, skb->sk, offset, tss->pipe,
0738 min(rd_desc->count, len), tss->flags);
0739 if (ret > 0)
0740 rd_desc->count -= ret;
0741 return ret;
0742 }
0743
0744 static int __tcp_splice_read(struct sock *sk, struct tcp_splice_state *tss)
0745 {
0746
0747 read_descriptor_t rd_desc = {
0748 .arg.data = tss,
0749 .count = tss->len,
0750 };
0751
0752 return tcp_read_sock(sk, &rd_desc, tcp_splice_data_recv);
0753 }
0754
/**
 *  tcp_splice_read - splice data from TCP socket to a pipe
 * @sock:	socket to splice from
 * @ppos:	position (not valid)
 * @pipe:	pipe to splice to
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Description:
 *    Will read pages from given socket and fill them into a pipe.
 *
 **/
0767 ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
0768 struct pipe_inode_info *pipe, size_t len,
0769 unsigned int flags)
0770 {
0771 struct sock *sk = sock->sk;
0772 struct tcp_splice_state tss = {
0773 .pipe = pipe,
0774 .len = len,
0775 .flags = flags,
0776 };
0777 long timeo;
0778 ssize_t spliced;
0779 int ret;
0780
0781 sock_rps_record_flow(sk);
0782
0783
0784
0785 if (unlikely(*ppos))
0786 return -ESPIPE;
0787
0788 ret = spliced = 0;
0789
0790 lock_sock(sk);
0791
0792 timeo = sock_rcvtimeo(sk, sock->file->f_flags & O_NONBLOCK);
0793 while (tss.len) {
0794 ret = __tcp_splice_read(sk, &tss);
0795 if (ret < 0)
0796 break;
0797 else if (!ret) {
0798 if (spliced)
0799 break;
0800 if (sock_flag(sk, SOCK_DONE))
0801 break;
0802 if (sk->sk_err) {
0803 ret = sock_error(sk);
0804 break;
0805 }
0806 if (sk->sk_shutdown & RCV_SHUTDOWN)
0807 break;
0808 if (sk->sk_state == TCP_CLOSE) {
0809
0810
0811
0812
0813 ret = -ENOTCONN;
0814 break;
0815 }
0816 if (!timeo) {
0817 ret = -EAGAIN;
0818 break;
0819 }
0820
0821
0822
0823
0824 if (!skb_queue_empty(&sk->sk_receive_queue))
0825 break;
0826 sk_wait_data(sk, &timeo, NULL);
0827 if (signal_pending(current)) {
0828 ret = sock_intr_errno(timeo);
0829 break;
0830 }
0831 continue;
0832 }
0833 tss.len -= ret;
0834 spliced += ret;
0835
0836 if (!timeo)
0837 break;
0838 release_sock(sk);
0839 lock_sock(sk);
0840
0841 if (sk->sk_err || sk->sk_state == TCP_CLOSE ||
0842 (sk->sk_shutdown & RCV_SHUTDOWN) ||
0843 signal_pending(current))
0844 break;
0845 }
0846
0847 release_sock(sk);
0848
0849 if (spliced)
0850 return spliced;
0851
0852 return ret;
0853 }
0854 EXPORT_SYMBOL(tcp_splice_read);
0855
0856 struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
0857 bool force_schedule)
0858 {
0859 struct sk_buff *skb;
0860
0861 skb = alloc_skb_fclone(size + MAX_TCP_HEADER, gfp);
0862 if (likely(skb)) {
0863 bool mem_scheduled;
0864
0865 skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
0866 if (force_schedule) {
0867 mem_scheduled = true;
0868 sk_forced_mem_schedule(sk, skb->truesize);
0869 } else {
0870 mem_scheduled = sk_wmem_schedule(sk, skb->truesize);
0871 }
0872 if (likely(mem_scheduled)) {
0873 skb_reserve(skb, MAX_TCP_HEADER);
0874 skb->ip_summed = CHECKSUM_PARTIAL;
0875 INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
0876 return skb;
0877 }
0878 __kfree_skb(skb);
0879 } else {
0880 sk->sk_prot->enter_memory_pressure(sk);
0881 sk_stream_moderate_sndbuf(sk);
0882 }
0883 return NULL;
0884 }
0885
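/* Compute how many payload bytes we aim to put in each skb handed to the
 * GSO engine: a whole number of MSS-sized segments (tp->gso_segs, capped by
 * sk_gso_max_segs), roughly bounded to half of the largest window the peer
 * has ever advertised so a single skb never consumes the whole window
 * (see tcp_bound_to_half_wnd()).
 */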
0886 static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
0887 int large_allowed)
0888 {
0889 struct tcp_sock *tp = tcp_sk(sk);
0890 u32 new_size_goal, size_goal;
0891
0892 if (!large_allowed)
0893 return mss_now;
0894
0895
0896 new_size_goal = tcp_bound_to_half_wnd(tp, sk->sk_gso_max_size);
0897
0898
0899 size_goal = tp->gso_segs * mss_now;
0900 if (unlikely(new_size_goal < size_goal ||
0901 new_size_goal >= size_goal + mss_now)) {
0902 tp->gso_segs = min_t(u16, new_size_goal / mss_now,
0903 sk->sk_gso_max_segs);
0904 size_goal = tp->gso_segs * mss_now;
0905 }
0906
0907 return max(size_goal, mss_now);
0908 }
0909
0910 int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
0911 {
0912 int mss_now;
0913
0914 mss_now = tcp_current_mss(sk);
0915 *size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));
0916
0917 return mss_now;
0918 }
0919
/* In some cases, both sendpage() and sendmsg() could have added
 * an skb to the write queue, but failed adding payload on it.
 * We need to remove it to consume less memory, but more
 * importantly be able to generate EPOLLOUT for Edge Triggered epoll().
 */
0926 void tcp_remove_empty_skb(struct sock *sk)
0927 {
0928 struct sk_buff *skb = tcp_write_queue_tail(sk);
0929
0930 if (skb && TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) {
0931 tcp_unlink_write_queue(skb, sk);
0932 if (tcp_write_queue_empty(sk))
0933 tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
0934 tcp_wmem_free_skb(sk, skb);
0935 }
0936 }
0937
0938
0939 static int tcp_downgrade_zcopy_pure(struct sock *sk, struct sk_buff *skb)
0940 {
0941 if (unlikely(skb_zcopy_pure(skb))) {
0942 u32 extra = skb->truesize -
0943 SKB_TRUESIZE(skb_end_offset(skb));
0944
0945 if (!sk_wmem_schedule(sk, extra))
0946 return -ENOMEM;
0947
0948 sk_mem_charge(sk, extra);
0949 skb_shinfo(skb)->flags &= ~SKBFL_PURE_ZEROCOPY;
0950 }
0951 return 0;
0952 }
0953
0954
0955 static int tcp_wmem_schedule(struct sock *sk, int copy)
0956 {
0957 int left;
0958
0959 if (likely(sk_wmem_schedule(sk, copy)))
0960 return copy;
0961
	/* We could be in trouble if we have nothing queued.
	 * Use whatever is left in sk->sk_forward_alloc and tcp_wmem[0]
	 * to guarantee some progress.
	 */
0966 left = sock_net(sk)->ipv4.sysctl_tcp_wmem[0] - sk->sk_wmem_queued;
0967 if (left > 0)
0968 sk_forced_mem_schedule(sk, min(left, copy));
0969 return min(copy, sk->sk_forward_alloc);
0970 }
0971
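/* Append up to *size bytes from the given page/offset to the tail skb of
 * the write queue, entailing a new skb if the tail cannot take more data
 * (or cannot be collapsed into).  On success the number of bytes actually
 * queued is written back through *size and the skb is returned; NULL means
 * the caller must wait for memory / send-buffer space.
 */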
0972 static struct sk_buff *tcp_build_frag(struct sock *sk, int size_goal, int flags,
0973 struct page *page, int offset, size_t *size)
0974 {
0975 struct sk_buff *skb = tcp_write_queue_tail(sk);
0976 struct tcp_sock *tp = tcp_sk(sk);
0977 bool can_coalesce;
0978 int copy, i;
0979
0980 if (!skb || (copy = size_goal - skb->len) <= 0 ||
0981 !tcp_skb_can_collapse_to(skb)) {
0982 new_segment:
0983 if (!sk_stream_memory_free(sk))
0984 return NULL;
0985
0986 skb = tcp_stream_alloc_skb(sk, 0, sk->sk_allocation,
0987 tcp_rtx_and_write_queues_empty(sk));
0988 if (!skb)
0989 return NULL;
0990
0991 #ifdef CONFIG_TLS_DEVICE
0992 skb->decrypted = !!(flags & MSG_SENDPAGE_DECRYPTED);
0993 #endif
0994 tcp_skb_entail(sk, skb);
0995 copy = size_goal;
0996 }
0997
0998 if (copy > *size)
0999 copy = *size;
1000
1001 i = skb_shinfo(skb)->nr_frags;
1002 can_coalesce = skb_can_coalesce(skb, i, page, offset);
1003 if (!can_coalesce && i >= READ_ONCE(sysctl_max_skb_frags)) {
1004 tcp_mark_push(tp, skb);
1005 goto new_segment;
1006 }
1007 if (tcp_downgrade_zcopy_pure(sk, skb))
1008 return NULL;
1009
1010 copy = tcp_wmem_schedule(sk, copy);
1011 if (!copy)
1012 return NULL;
1013
1014 if (can_coalesce) {
1015 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1016 } else {
1017 get_page(page);
1018 skb_fill_page_desc_noacc(skb, i, page, offset, copy);
1019 }
1020
1021 if (!(flags & MSG_NO_SHARED_FRAGS))
1022 skb_shinfo(skb)->flags |= SKBFL_SHARED_FRAG;
1023
1024 skb->len += copy;
1025 skb->data_len += copy;
1026 skb->truesize += copy;
1027 sk_wmem_queued_add(sk, copy);
1028 sk_mem_charge(sk, copy);
1029 WRITE_ONCE(tp->write_seq, tp->write_seq + copy);
1030 TCP_SKB_CB(skb)->end_seq += copy;
1031 tcp_skb_pcount_set(skb, 0);
1032
1033 *size = copy;
1034 return skb;
1035 }
1036
1037 ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
1038 size_t size, int flags)
1039 {
1040 struct tcp_sock *tp = tcp_sk(sk);
1041 int mss_now, size_goal;
1042 int err;
1043 ssize_t copied;
1044 long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
1045
1046 if (IS_ENABLED(CONFIG_DEBUG_VM) &&
1047 WARN_ONCE(!sendpage_ok(page),
1048 "page must not be a Slab one and have page_count > 0"))
1049 return -EINVAL;
1050
	/* Wait for a connection to finish. One exception is TCP Fast Open
	 * (passive side) where data is allowed to be sent before a
	 * connection is fully established.
	 */
1055 if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
1056 !tcp_passive_fastopen(sk)) {
1057 err = sk_stream_wait_connect(sk, &timeo);
1058 if (err != 0)
1059 goto out_err;
1060 }
1061
1062 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1063
1064 mss_now = tcp_send_mss(sk, &size_goal, flags);
1065 copied = 0;
1066
1067 err = -EPIPE;
1068 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
1069 goto out_err;
1070
1071 while (size > 0) {
1072 struct sk_buff *skb;
1073 size_t copy = size;
1074
		skb = tcp_build_frag(sk, size_goal, flags, page, offset, &copy);
1076 if (!skb)
1077 goto wait_for_space;
1078
1079 if (!copied)
1080 TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
1081
1082 copied += copy;
1083 offset += copy;
1084 size -= copy;
1085 if (!size)
1086 goto out;
1087
1088 if (skb->len < size_goal || (flags & MSG_OOB))
1089 continue;
1090
1091 if (forced_push(tp)) {
1092 tcp_mark_push(tp, skb);
1093 __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
1094 } else if (skb == tcp_send_head(sk))
1095 tcp_push_one(sk, mss_now);
1096 continue;
1097
1098 wait_for_space:
1099 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1100 tcp_push(sk, flags & ~MSG_MORE, mss_now,
1101 TCP_NAGLE_PUSH, size_goal);
1102
1103 err = sk_stream_wait_memory(sk, &timeo);
1104 if (err != 0)
1105 goto do_error;
1106
1107 mss_now = tcp_send_mss(sk, &size_goal, flags);
1108 }
1109
1110 out:
1111 if (copied) {
1112 tcp_tx_timestamp(sk, sk->sk_tsflags);
1113 if (!(flags & MSG_SENDPAGE_NOTLAST))
1114 tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
1115 }
1116 return copied;
1117
1118 do_error:
1119 tcp_remove_empty_skb(sk);
1120 if (copied)
1121 goto out;
1122 out_err:
1123
1124 if (unlikely(tcp_rtx_and_write_queues_empty(sk) && err == -EAGAIN)) {
1125 sk->sk_write_space(sk);
1126 tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
1127 }
1128 return sk_stream_error(sk, flags, err);
1129 }
1130 EXPORT_SYMBOL_GPL(do_tcp_sendpages);
1131
1132 int tcp_sendpage_locked(struct sock *sk, struct page *page, int offset,
1133 size_t size, int flags)
1134 {
1135 if (!(sk->sk_route_caps & NETIF_F_SG))
1136 return sock_no_sendpage_locked(sk, page, offset, size, flags);
1137
1138 tcp_rate_check_app_limited(sk);
1139
1140 return do_tcp_sendpages(sk, page, offset, size, flags);
1141 }
1142 EXPORT_SYMBOL_GPL(tcp_sendpage_locked);
1143
1144 int tcp_sendpage(struct sock *sk, struct page *page, int offset,
1145 size_t size, int flags)
1146 {
1147 int ret;
1148
1149 lock_sock(sk);
1150 ret = tcp_sendpage_locked(sk, page, offset, size, flags);
1151 release_sock(sk);
1152
1153 return ret;
1154 }
1155 EXPORT_SYMBOL(tcp_sendpage);
1156
1157 void tcp_free_fastopen_req(struct tcp_sock *tp)
1158 {
1159 if (tp->fastopen_req) {
1160 kfree(tp->fastopen_req);
1161 tp->fastopen_req = NULL;
1162 }
1163 }
1164
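/* TCP Fast Open on the active side: the first sendmsg() may carry both the
 * destination address and payload so the data can ride on the SYN.  Roughly,
 * from user space (a sketch, not a complete program):
 *
 *	sendto(fd, buf, len, MSG_FASTOPEN, (struct sockaddr *)&daddr, dlen);
 *
 * or, with the TCP_FASTOPEN_CONNECT socket option, connect() is deferred
 * (inet->defer_connect) and the SYN is generated by the first write.
 * Either way, the request built below stashes the msghdr so the payload can
 * be attached to the SYN when the connection is actually started.
 */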
1165 static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
1166 int *copied, size_t size,
1167 struct ubuf_info *uarg)
1168 {
1169 struct tcp_sock *tp = tcp_sk(sk);
1170 struct inet_sock *inet = inet_sk(sk);
1171 struct sockaddr *uaddr = msg->msg_name;
1172 int err, flags;
1173
1174 if (!(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen) &
1175 TFO_CLIENT_ENABLE) ||
1176 (uaddr && msg->msg_namelen >= sizeof(uaddr->sa_family) &&
1177 uaddr->sa_family == AF_UNSPEC))
1178 return -EOPNOTSUPP;
1179 if (tp->fastopen_req)
1180 return -EALREADY;
1181
1182 tp->fastopen_req = kzalloc(sizeof(struct tcp_fastopen_request),
1183 sk->sk_allocation);
1184 if (unlikely(!tp->fastopen_req))
1185 return -ENOBUFS;
1186 tp->fastopen_req->data = msg;
1187 tp->fastopen_req->size = size;
1188 tp->fastopen_req->uarg = uarg;
1189
1190 if (inet->defer_connect) {
1191 err = tcp_connect(sk);
1192
1193 if (err) {
1194 tcp_set_state(sk, TCP_CLOSE);
1195 inet->inet_dport = 0;
1196 sk->sk_route_caps = 0;
1197 }
1198 }
1199 flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0;
1200 err = __inet_stream_connect(sk->sk_socket, uaddr,
1201 msg->msg_namelen, flags, 1);
1202
1203
1204
1205 if (tp->fastopen_req) {
1206 *copied = tp->fastopen_req->copied;
1207 tcp_free_fastopen_req(tp);
1208 inet->defer_connect = 0;
1209 }
1210 return err;
1211 }
1212
1213 int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
1214 {
1215 struct tcp_sock *tp = tcp_sk(sk);
1216 struct ubuf_info *uarg = NULL;
1217 struct sk_buff *skb;
1218 struct sockcm_cookie sockc;
1219 int flags, err, copied = 0;
1220 int mss_now = 0, size_goal, copied_syn = 0;
1221 int process_backlog = 0;
1222 bool zc = false;
1223 long timeo;
1224
1225 flags = msg->msg_flags;
1226
1227 if ((flags & MSG_ZEROCOPY) && size) {
1228 skb = tcp_write_queue_tail(sk);
1229
1230 if (msg->msg_ubuf) {
1231 uarg = msg->msg_ubuf;
1232 net_zcopy_get(uarg);
1233 zc = sk->sk_route_caps & NETIF_F_SG;
1234 } else if (sock_flag(sk, SOCK_ZEROCOPY)) {
1235 uarg = msg_zerocopy_realloc(sk, size, skb_zcopy(skb));
1236 if (!uarg) {
1237 err = -ENOBUFS;
1238 goto out_err;
1239 }
1240 zc = sk->sk_route_caps & NETIF_F_SG;
1241 if (!zc)
1242 uarg->zerocopy = 0;
1243 }
1244 }
1245
1246 if (unlikely(flags & MSG_FASTOPEN || inet_sk(sk)->defer_connect) &&
1247 !tp->repair) {
1248 err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size, uarg);
1249 if (err == -EINPROGRESS && copied_syn > 0)
1250 goto out;
1251 else if (err)
1252 goto out_err;
1253 }
1254
1255 timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
1256
1257 tcp_rate_check_app_limited(sk);
1258
	/* Wait for a connection to finish. One exception is TCP Fast Open
	 * (passive side) where data is allowed to be sent before a
	 * connection is fully established.
	 */
1263 if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
1264 !tcp_passive_fastopen(sk)) {
1265 err = sk_stream_wait_connect(sk, &timeo);
1266 if (err != 0)
1267 goto do_error;
1268 }
1269
1270 if (unlikely(tp->repair)) {
1271 if (tp->repair_queue == TCP_RECV_QUEUE) {
1272 copied = tcp_send_rcvq(sk, msg, size);
1273 goto out_nopush;
1274 }
1275
1276 err = -EINVAL;
1277 if (tp->repair_queue == TCP_NO_QUEUE)
1278 goto out_err;
1279
1280
1281 }
1282
1283 sockcm_init(&sockc, sk);
1284 if (msg->msg_controllen) {
1285 err = sock_cmsg_send(sk, msg, &sockc);
1286 if (unlikely(err)) {
1287 err = -EINVAL;
1288 goto out_err;
1289 }
1290 }
1291
1292
1293 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1294
1295
1296 copied = 0;
1297
1298 restart:
1299 mss_now = tcp_send_mss(sk, &size_goal, flags);
1300
1301 err = -EPIPE;
1302 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
1303 goto do_error;
1304
1305 while (msg_data_left(msg)) {
1306 int copy = 0;
1307
1308 skb = tcp_write_queue_tail(sk);
1309 if (skb)
1310 copy = size_goal - skb->len;
1311
1312 if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) {
1313 bool first_skb;
1314
1315 new_segment:
1316 if (!sk_stream_memory_free(sk))
1317 goto wait_for_space;
1318
1319 if (unlikely(process_backlog >= 16)) {
1320 process_backlog = 0;
1321 if (sk_flush_backlog(sk))
1322 goto restart;
1323 }
1324 first_skb = tcp_rtx_and_write_queues_empty(sk);
1325 skb = tcp_stream_alloc_skb(sk, 0, sk->sk_allocation,
1326 first_skb);
1327 if (!skb)
1328 goto wait_for_space;
1329
1330 process_backlog++;
1331
1332 tcp_skb_entail(sk, skb);
1333 copy = size_goal;
1334
1335
1336
1337
1338
1339 if (tp->repair)
1340 TCP_SKB_CB(skb)->sacked |= TCPCB_REPAIRED;
1341 }
1342
1343
1344 if (copy > msg_data_left(msg))
1345 copy = msg_data_left(msg);
1346
1347 if (!zc) {
1348 bool merge = true;
1349 int i = skb_shinfo(skb)->nr_frags;
1350 struct page_frag *pfrag = sk_page_frag(sk);
1351
1352 if (!sk_page_frag_refill(sk, pfrag))
1353 goto wait_for_space;
1354
1355 if (!skb_can_coalesce(skb, i, pfrag->page,
1356 pfrag->offset)) {
1357 if (i >= READ_ONCE(sysctl_max_skb_frags)) {
1358 tcp_mark_push(tp, skb);
1359 goto new_segment;
1360 }
1361 merge = false;
1362 }
1363
1364 copy = min_t(int, copy, pfrag->size - pfrag->offset);
1365
1366 if (unlikely(skb_zcopy_pure(skb) || skb_zcopy_managed(skb))) {
1367 if (tcp_downgrade_zcopy_pure(sk, skb))
1368 goto wait_for_space;
1369 skb_zcopy_downgrade_managed(skb);
1370 }
1371
1372 copy = tcp_wmem_schedule(sk, copy);
1373 if (!copy)
1374 goto wait_for_space;
1375
1376 err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb,
1377 pfrag->page,
1378 pfrag->offset,
1379 copy);
1380 if (err)
1381 goto do_error;
1382
1383
1384 if (merge) {
1385 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1386 } else {
1387 skb_fill_page_desc(skb, i, pfrag->page,
1388 pfrag->offset, copy);
1389 page_ref_inc(pfrag->page);
1390 }
1391 pfrag->offset += copy;
1392 } else {
1393
1394
1395
1396 if (!skb->len)
1397 skb_shinfo(skb)->flags |= SKBFL_PURE_ZEROCOPY;
1398
1399 if (!skb_zcopy_pure(skb)) {
1400 copy = tcp_wmem_schedule(sk, copy);
1401 if (!copy)
1402 goto wait_for_space;
1403 }
1404
1405 err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg);
1406 if (err == -EMSGSIZE || err == -EEXIST) {
1407 tcp_mark_push(tp, skb);
1408 goto new_segment;
1409 }
1410 if (err < 0)
1411 goto do_error;
1412 copy = err;
1413 }
1414
1415 if (!copied)
1416 TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
1417
1418 WRITE_ONCE(tp->write_seq, tp->write_seq + copy);
1419 TCP_SKB_CB(skb)->end_seq += copy;
1420 tcp_skb_pcount_set(skb, 0);
1421
1422 copied += copy;
1423 if (!msg_data_left(msg)) {
1424 if (unlikely(flags & MSG_EOR))
1425 TCP_SKB_CB(skb)->eor = 1;
1426 goto out;
1427 }
1428
1429 if (skb->len < size_goal || (flags & MSG_OOB) || unlikely(tp->repair))
1430 continue;
1431
1432 if (forced_push(tp)) {
1433 tcp_mark_push(tp, skb);
1434 __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
1435 } else if (skb == tcp_send_head(sk))
1436 tcp_push_one(sk, mss_now);
1437 continue;
1438
1439 wait_for_space:
1440 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1441 if (copied)
1442 tcp_push(sk, flags & ~MSG_MORE, mss_now,
1443 TCP_NAGLE_PUSH, size_goal);
1444
1445 err = sk_stream_wait_memory(sk, &timeo);
1446 if (err != 0)
1447 goto do_error;
1448
1449 mss_now = tcp_send_mss(sk, &size_goal, flags);
1450 }
1451
1452 out:
1453 if (copied) {
1454 tcp_tx_timestamp(sk, sockc.tsflags);
1455 tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
1456 }
1457 out_nopush:
1458 net_zcopy_put(uarg);
1459 return copied + copied_syn;
1460
1461 do_error:
1462 tcp_remove_empty_skb(sk);
1463
1464 if (copied + copied_syn)
1465 goto out;
1466 out_err:
1467 net_zcopy_put_abort(uarg, true);
1468 err = sk_stream_error(sk, flags, err);
1469
1470 if (unlikely(tcp_rtx_and_write_queues_empty(sk) && err == -EAGAIN)) {
1471 sk->sk_write_space(sk);
1472 tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
1473 }
1474 return err;
1475 }
1476 EXPORT_SYMBOL_GPL(tcp_sendmsg_locked);
1477
1478 int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
1479 {
1480 int ret;
1481
1482 lock_sock(sk);
1483 ret = tcp_sendmsg_locked(sk, msg, size);
1484 release_sock(sk);
1485
1486 return ret;
1487 }
1488 EXPORT_SYMBOL(tcp_sendmsg);
1489
/*
 *	Handle reading urgent data. BSD has very simple semantics for
 *	this, no blocking and very strange errors 8)
 */
1495 static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags)
1496 {
1497 struct tcp_sock *tp = tcp_sk(sk);
1498
1499
1500 if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
1501 tp->urg_data == TCP_URG_READ)
1502 return -EINVAL;
1503
1504 if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
1505 return -ENOTCONN;
1506
1507 if (tp->urg_data & TCP_URG_VALID) {
1508 int err = 0;
1509 char c = tp->urg_data;
1510
1511 if (!(flags & MSG_PEEK))
1512 WRITE_ONCE(tp->urg_data, TCP_URG_READ);
1513
1514
1515 msg->msg_flags |= MSG_OOB;
1516
1517 if (len > 0) {
1518 if (!(flags & MSG_TRUNC))
1519 err = memcpy_to_msg(msg, &c, 1);
1520 len = 1;
1521 } else
1522 msg->msg_flags |= MSG_TRUNC;
1523
1524 return err ? -EFAULT : len;
1525 }
1526
1527 if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
1528 return 0;
1529
1530
1531
1532
1533
1534
1535
1536 return -EAGAIN;
1537 }
1538
1539 static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len)
1540 {
1541 struct sk_buff *skb;
1542 int copied = 0, err = 0;
1543
1544
1545
1546 skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
1547 err = skb_copy_datagram_msg(skb, 0, msg, skb->len);
1548 if (err)
1549 return err;
1550 copied += skb->len;
1551 }
1552
1553 skb_queue_walk(&sk->sk_write_queue, skb) {
1554 err = skb_copy_datagram_msg(skb, 0, msg, skb->len);
1555 if (err)
1556 break;
1557
1558 copied += skb->len;
1559 }
1560
1561 return err ?: copied;
1562 }
1563
/* Clean up the receive buffer for full frames taken by the user,
 * then send an ACK if necessary.  COPIED is the number of bytes
 * tcp_recvmsg has given to the user so far, it speeds up the
 * calculation of whether or not we must ACK for the sake of
 * a window update.
 */
1570 static void __tcp_cleanup_rbuf(struct sock *sk, int copied)
1571 {
1572 struct tcp_sock *tp = tcp_sk(sk);
1573 bool time_to_ack = false;
1574
1575 if (inet_csk_ack_scheduled(sk)) {
1576 const struct inet_connection_sock *icsk = inet_csk(sk);
1577
		if (/* Once-per-two-segments ACK was not sent by tcp_input.c */
		    tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
		    /* If this read emptied read buffer, we send ACK, if
		     * connection is not bidirectional, user drained
		     * receive buffer and there was a small segment
		     * in queue.
		     */
1586 (copied > 0 &&
1587 ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) ||
1588 ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
1589 !inet_csk_in_pingpong_mode(sk))) &&
1590 !atomic_read(&sk->sk_rmem_alloc)))
1591 time_to_ack = true;
1592 }
1593
1594
1595
1596
1597
1598
1599
1600 if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1601 __u32 rcv_window_now = tcp_receive_window(tp);
1602
1603
1604 if (2*rcv_window_now <= tp->window_clamp) {
1605 __u32 new_window = __tcp_select_window(sk);
1606
1607
1608
1609
1610
1611
1612 if (new_window && new_window >= 2 * rcv_window_now)
1613 time_to_ack = true;
1614 }
1615 }
1616 if (time_to_ack)
1617 tcp_send_ack(sk);
1618 }
1619
1620 void tcp_cleanup_rbuf(struct sock *sk, int copied)
1621 {
1622 struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
1623 struct tcp_sock *tp = tcp_sk(sk);
1624
1625 WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq),
1626 "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n",
1627 tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt);
1628 __tcp_cleanup_rbuf(sk, copied);
1629 }
1630
1631 static void tcp_eat_recv_skb(struct sock *sk, struct sk_buff *skb)
1632 {
1633 __skb_unlink(skb, &sk->sk_receive_queue);
1634 if (likely(skb->destructor == sock_rfree)) {
1635 sock_rfree(skb);
1636 skb->destructor = NULL;
1637 skb->sk = NULL;
1638 return skb_attempt_defer_free(skb);
1639 }
1640 __kfree_skb(skb);
1641 }
1642
1643 struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1644 {
1645 struct sk_buff *skb;
1646 u32 offset;
1647
1648 while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) {
1649 offset = seq - TCP_SKB_CB(skb)->seq;
1650 if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
1651 pr_err_once("%s: found a SYN, please report !\n", __func__);
1652 offset--;
1653 }
1654 if (offset < skb->len || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) {
1655 *off = offset;
1656 return skb;
1657 }
1658
1659
1660
1661
1662 tcp_eat_recv_skb(sk, skb);
1663 }
1664 return NULL;
1665 }
1666 EXPORT_SYMBOL(tcp_recv_skb);
1667
/*
 * This routine provides an alternative to tcp_recvmsg() for routines
 * that would like to handle copying from skbuffs directly in 'sendfile'
 * fashion.
 * Note:
 *	- It is assumed that the socket was locked by the caller.
 *	- The routine does not block.
 *	- At present, there is no support for reading OOB data
 *	  or for 'peeking' the socket using this routine
 *	  (although both would be easy to implement).
 */
1679 int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1680 sk_read_actor_t recv_actor)
1681 {
1682 struct sk_buff *skb;
1683 struct tcp_sock *tp = tcp_sk(sk);
1684 u32 seq = tp->copied_seq;
1685 u32 offset;
1686 int copied = 0;
1687
1688 if (sk->sk_state == TCP_LISTEN)
1689 return -ENOTCONN;
1690 while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1691 if (offset < skb->len) {
1692 int used;
1693 size_t len;
1694
1695 len = skb->len - offset;
1696
1697 if (unlikely(tp->urg_data)) {
1698 u32 urg_offset = tp->urg_seq - seq;
1699 if (urg_offset < len)
1700 len = urg_offset;
1701 if (!len)
1702 break;
1703 }
1704 used = recv_actor(desc, skb, offset, len);
1705 if (used <= 0) {
1706 if (!copied)
1707 copied = used;
1708 break;
1709 }
1710 if (WARN_ON_ONCE(used > len))
1711 used = len;
1712 seq += used;
1713 copied += used;
1714 offset += used;
1715
1716
1717
1718
1719
1720
1721 skb = tcp_recv_skb(sk, seq - 1, &offset);
1722 if (!skb)
1723 break;
1724
1725
1726
1727 if (offset + 1 != skb->len)
1728 continue;
1729 }
1730 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) {
1731 tcp_eat_recv_skb(sk, skb);
1732 ++seq;
1733 break;
1734 }
1735 tcp_eat_recv_skb(sk, skb);
1736 if (!desc->count)
1737 break;
1738 WRITE_ONCE(tp->copied_seq, seq);
1739 }
1740 WRITE_ONCE(tp->copied_seq, seq);
1741
1742 tcp_rcv_space_adjust(sk);
1743
1744
1745 if (copied > 0) {
1746 tcp_recv_skb(sk, seq, &offset);
1747 tcp_cleanup_rbuf(sk, copied);
1748 }
1749 return copied;
1750 }
1751 EXPORT_SYMBOL(tcp_read_sock);
1752
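/* Like tcp_read_sock(), but hands whole skbs (unlinked from the receive
 * queue) to the actor instead of offset/length ranges; intended for callers
 * such as BPF sockmap that want to consume or redirect complete skbs.
 */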
1753 int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
1754 {
1755 struct tcp_sock *tp = tcp_sk(sk);
1756 u32 seq = tp->copied_seq;
1757 struct sk_buff *skb;
1758 int copied = 0;
1759 u32 offset;
1760
1761 if (sk->sk_state == TCP_LISTEN)
1762 return -ENOTCONN;
1763
1764 while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1765 u8 tcp_flags;
1766 int used;
1767
1768 __skb_unlink(skb, &sk->sk_receive_queue);
1769 WARN_ON_ONCE(!skb_set_owner_sk_safe(skb, sk));
1770 tcp_flags = TCP_SKB_CB(skb)->tcp_flags;
1771 used = recv_actor(sk, skb);
1772 consume_skb(skb);
1773 if (used < 0) {
1774 if (!copied)
1775 copied = used;
1776 break;
1777 }
1778 seq += used;
1779 copied += used;
1780
1781 if (tcp_flags & TCPHDR_FIN) {
1782 ++seq;
1783 break;
1784 }
1785 }
1786 WRITE_ONCE(tp->copied_seq, seq);
1787
1788 tcp_rcv_space_adjust(sk);
1789
1790
1791 if (copied > 0)
1792 __tcp_cleanup_rbuf(sk, copied);
1793
1794 return copied;
1795 }
1796 EXPORT_SYMBOL(tcp_read_skb);
1797
1798 void tcp_read_done(struct sock *sk, size_t len)
1799 {
1800 struct tcp_sock *tp = tcp_sk(sk);
1801 u32 seq = tp->copied_seq;
1802 struct sk_buff *skb;
1803 size_t left;
1804 u32 offset;
1805
1806 if (sk->sk_state == TCP_LISTEN)
1807 return;
1808
1809 left = len;
1810 while (left && (skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1811 int used;
1812
1813 used = min_t(size_t, skb->len - offset, left);
1814 seq += used;
1815 left -= used;
1816
1817 if (skb->len > offset + used)
1818 break;
1819
1820 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) {
1821 tcp_eat_recv_skb(sk, skb);
1822 ++seq;
1823 break;
1824 }
1825 tcp_eat_recv_skb(sk, skb);
1826 }
1827 WRITE_ONCE(tp->copied_seq, seq);
1828
1829 tcp_rcv_space_adjust(sk);
1830
1831
1832 if (left != len)
1833 tcp_cleanup_rbuf(sk, len - left);
1834 }
1835 EXPORT_SYMBOL(tcp_read_done);
1836
1837 int tcp_peek_len(struct socket *sock)
1838 {
1839 return tcp_inq(sock->sk);
1840 }
1841 EXPORT_SYMBOL(tcp_peek_len);
1842
1843
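/* SO_RCVLOWAT for TCP: cap the requested watermark at roughly half of the
 * receive buffer limit (payload is roughly half of rmem once skb overhead
 * is accounted for), wake any reader that is already satisfied, and grow
 * sk_rcvbuf/window_clamp accordingly unless SO_RCVBUF locked the buffer.
 */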
1844 int tcp_set_rcvlowat(struct sock *sk, int val)
1845 {
1846 int cap;
1847
1848 if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
1849 cap = sk->sk_rcvbuf >> 1;
1850 else
1851 cap = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1;
1852 val = min(val, cap);
1853 WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1854
1855
1856 tcp_data_ready(sk);
1857
1858 if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
1859 return 0;
1860
1861 val <<= 1;
1862 if (val > sk->sk_rcvbuf) {
1863 WRITE_ONCE(sk->sk_rcvbuf, val);
1864 tcp_sk(sk)->window_clamp = tcp_win_from_space(sk, val);
1865 }
1866 return 0;
1867 }
1868 EXPORT_SYMBOL(tcp_set_rcvlowat);
1869
1870 void tcp_update_recv_tstamps(struct sk_buff *skb,
1871 struct scm_timestamping_internal *tss)
1872 {
1873 if (skb->tstamp)
1874 tss->ts[0] = ktime_to_timespec64(skb->tstamp);
1875 else
1876 tss->ts[0] = (struct timespec64) {0};
1877
1878 if (skb_hwtstamps(skb)->hwtstamp)
1879 tss->ts[2] = ktime_to_timespec64(skb_hwtstamps(skb)->hwtstamp);
1880 else
1881 tss->ts[2] = (struct timespec64) {0};
1882 }
1883
1884 #ifdef CONFIG_MMU
1885 static const struct vm_operations_struct tcp_vm_ops = {
1886 };
1887
1888 int tcp_mmap(struct file *file, struct socket *sock,
1889 struct vm_area_struct *vma)
1890 {
1891 if (vma->vm_flags & (VM_WRITE | VM_EXEC))
1892 return -EPERM;
1893 vma->vm_flags &= ~(VM_MAYWRITE | VM_MAYEXEC);
1894
1895
1896 vma->vm_flags |= VM_MIXEDMAP;
1897
1898 vma->vm_ops = &tcp_vm_ops;
1899 return 0;
1900 }
1901 EXPORT_SYMBOL(tcp_mmap);
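/* Receive zerocopy, user-space view (a sketch; see tcp_zerocopy_receive()
 * below and tools/testing/selftests/net/tcp_mmap.c):
 *
 *	addr = mmap(NULL, chunk, PROT_READ, MAP_SHARED, fd, 0);
 *	struct tcp_zerocopy_receive zc = {
 *		.address = (__u64)addr,
 *		.length  = chunk,
 *	};
 *	getsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, &zc, &zc_len);
 *
 * Full pages of payload are then mapped read-only into [addr, addr + zc.length);
 * leftover bytes are reported via recv_skip_hint (or copied through
 * copybuf_address when one is supplied).  The mapping must be PROT_READ
 * only, as enforced by tcp_mmap() above.
 */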
1902
1903 static skb_frag_t *skb_advance_to_frag(struct sk_buff *skb, u32 offset_skb,
1904 u32 *offset_frag)
1905 {
1906 skb_frag_t *frag;
1907
1908 if (unlikely(offset_skb >= skb->len))
1909 return NULL;
1910
1911 offset_skb -= skb_headlen(skb);
1912 if ((int)offset_skb < 0 || skb_has_frag_list(skb))
1913 return NULL;
1914
1915 frag = skb_shinfo(skb)->frags;
1916 while (offset_skb) {
1917 if (skb_frag_size(frag) > offset_skb) {
1918 *offset_frag = offset_skb;
1919 return frag;
1920 }
1921 offset_skb -= skb_frag_size(frag);
1922 ++frag;
1923 }
1924 *offset_frag = 0;
1925 return frag;
1926 }
1927
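/* A frag can be mapped into user space only if it covers exactly one full,
 * page-aligned page.  find_next_mappable_frag() returns how many bytes of
 * non-mappable frags must be skipped (and handled by ordinary copying)
 * before the next mappable frag.
 */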
1928 static bool can_map_frag(const skb_frag_t *frag)
1929 {
1930 return skb_frag_size(frag) == PAGE_SIZE && !skb_frag_off(frag);
1931 }
1932
1933 static int find_next_mappable_frag(const skb_frag_t *frag,
1934 int remaining_in_skb)
1935 {
1936 int offset = 0;
1937
1938 if (likely(can_map_frag(frag)))
1939 return 0;
1940
1941 while (offset < remaining_in_skb && !can_map_frag(frag)) {
1942 offset += skb_frag_size(frag);
1943 ++frag;
1944 }
1945 return offset;
1946 }
1947
1948 static void tcp_zerocopy_set_hint_for_skb(struct sock *sk,
1949 struct tcp_zerocopy_receive *zc,
1950 struct sk_buff *skb, u32 offset)
1951 {
1952 u32 frag_offset, partial_frag_remainder = 0;
1953 int mappable_offset;
1954 skb_frag_t *frag;
1955
1956
1957 zc->recv_skip_hint = skb->len - offset;
1958
1959
1960 frag = skb_advance_to_frag(skb, offset, &frag_offset);
1961 if (!frag)
1962 return;
1963
1964 if (frag_offset) {
1965 struct skb_shared_info *info = skb_shinfo(skb);
1966
1967
1968 if (frag == &info->frags[info->nr_frags - 1])
1969 return;
1970
1971
1972 partial_frag_remainder = skb_frag_size(frag) - frag_offset;
1973 zc->recv_skip_hint -= partial_frag_remainder;
1974 ++frag;
1975 }
1976
1977
1978
1979
1980
1981 mappable_offset = find_next_mappable_frag(frag, zc->recv_skip_hint);
1982 zc->recv_skip_hint = mappable_offset + partial_frag_remainder;
1983 }
1984
1985 static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len,
1986 int flags, struct scm_timestamping_internal *tss,
1987 int *cmsg_flags);
1988 static int receive_fallback_to_copy(struct sock *sk,
1989 struct tcp_zerocopy_receive *zc, int inq,
1990 struct scm_timestamping_internal *tss)
1991 {
1992 unsigned long copy_address = (unsigned long)zc->copybuf_address;
1993 struct msghdr msg = {};
1994 struct iovec iov;
1995 int err;
1996
1997 zc->length = 0;
1998 zc->recv_skip_hint = 0;
1999
2000 if (copy_address != zc->copybuf_address)
2001 return -EINVAL;
2002
2003 err = import_single_range(READ, (void __user *)copy_address,
2004 inq, &iov, &msg.msg_iter);
2005 if (err)
2006 return err;
2007
2008 err = tcp_recvmsg_locked(sk, &msg, inq, MSG_DONTWAIT,
2009 tss, &zc->msg_flags);
2010 if (err < 0)
2011 return err;
2012
2013 zc->copybuf_len = err;
2014 if (likely(zc->copybuf_len)) {
2015 struct sk_buff *skb;
2016 u32 offset;
2017
2018 skb = tcp_recv_skb(sk, tcp_sk(sk)->copied_seq, &offset);
2019 if (skb)
2020 tcp_zerocopy_set_hint_for_skb(sk, zc, skb, offset);
2021 }
2022 return 0;
2023 }
2024
2025 static int tcp_copy_straggler_data(struct tcp_zerocopy_receive *zc,
2026 struct sk_buff *skb, u32 copylen,
2027 u32 *offset, u32 *seq)
2028 {
2029 unsigned long copy_address = (unsigned long)zc->copybuf_address;
2030 struct msghdr msg = {};
2031 struct iovec iov;
2032 int err;
2033
2034 if (copy_address != zc->copybuf_address)
2035 return -EINVAL;
2036
2037 err = import_single_range(READ, (void __user *)copy_address,
2038 copylen, &iov, &msg.msg_iter);
2039 if (err)
2040 return err;
2041 err = skb_copy_datagram_msg(skb, *offset, &msg, copylen);
2042 if (err)
2043 return err;
2044 zc->recv_skip_hint -= copylen;
2045 *offset += copylen;
2046 *seq += copylen;
2047 return (__s32)copylen;
2048 }
2049
2050 static int tcp_zc_handle_leftover(struct tcp_zerocopy_receive *zc,
2051 struct sock *sk,
2052 struct sk_buff *skb,
2053 u32 *seq,
2054 s32 copybuf_len,
2055 struct scm_timestamping_internal *tss)
2056 {
2057 u32 offset, copylen = min_t(u32, copybuf_len, zc->recv_skip_hint);
2058
2059 if (!copylen)
2060 return 0;
2061
2062 if (skb) {
2063 offset = *seq - TCP_SKB_CB(skb)->seq;
2064 } else {
2065 skb = tcp_recv_skb(sk, *seq, &offset);
2066 if (TCP_SKB_CB(skb)->has_rxtstamp) {
2067 tcp_update_recv_tstamps(skb, tss);
2068 zc->msg_flags |= TCP_CMSG_TS;
2069 }
2070 }
2071
2072 zc->copybuf_len = tcp_copy_straggler_data(zc, skb, copylen, &offset,
2073 seq);
2074 return zc->copybuf_len < 0 ? 0 : copylen;
2075 }
2076
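/* Fallback after vm_insert_pages() failed part-way through a batch: if the
 * failure was -EBUSY and the caller asked for
 * TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT (so the range was not pre-zapped),
 * zap the not-yet-mapped tail of the range and retry the remaining pages
 * once; whatever still cannot be mapped is subtracted from *length and
 * handed back to the caller via recv_skip_hint.
 */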
2077 static int tcp_zerocopy_vm_insert_batch_error(struct vm_area_struct *vma,
2078 struct page **pending_pages,
2079 unsigned long pages_remaining,
2080 unsigned long *address,
2081 u32 *length,
2082 u32 *seq,
2083 struct tcp_zerocopy_receive *zc,
2084 u32 total_bytes_to_map,
2085 int err)
2086 {
2087
2088 if (err == -EBUSY &&
2089 zc->flags & TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT) {
2090 u32 maybe_zap_len;
2091
2092 maybe_zap_len = total_bytes_to_map -
2093 *length +
2094 (pages_remaining * PAGE_SIZE);
2095 zap_page_range(vma, *address, maybe_zap_len);
2096 err = 0;
2097 }
2098
2099 if (!err) {
2100 unsigned long leftover_pages = pages_remaining;
2101 int bytes_mapped;
2102
2103
2104 err = vm_insert_pages(vma, *address,
2105 pending_pages,
2106 &pages_remaining);
2107 bytes_mapped = PAGE_SIZE * (leftover_pages - pages_remaining);
2108 *seq += bytes_mapped;
2109 *address += bytes_mapped;
2110 }
2111 if (err) {
2112
2113
2114
2115
2116
2117 const int bytes_not_mapped = PAGE_SIZE * pages_remaining;
2118
2119 *length -= bytes_not_mapped;
2120 zc->recv_skip_hint += bytes_not_mapped;
2121 }
2122 return err;
2123 }
2124
2125 static int tcp_zerocopy_vm_insert_batch(struct vm_area_struct *vma,
2126 struct page **pages,
2127 unsigned int pages_to_map,
2128 unsigned long *address,
2129 u32 *length,
2130 u32 *seq,
2131 struct tcp_zerocopy_receive *zc,
2132 u32 total_bytes_to_map)
2133 {
2134 unsigned long pages_remaining = pages_to_map;
2135 unsigned int pages_mapped;
2136 unsigned int bytes_mapped;
2137 int err;
2138
2139 err = vm_insert_pages(vma, *address, pages, &pages_remaining);
2140 pages_mapped = pages_to_map - (unsigned int)pages_remaining;
2141 bytes_mapped = PAGE_SIZE * pages_mapped;
2142
2143
2144
2145 *seq += bytes_mapped;
2146 *address += bytes_mapped;
2147
2148 if (likely(!err))
2149 return 0;
2150
2151
2152 return tcp_zerocopy_vm_insert_batch_error(vma, pages + pages_mapped,
2153 pages_remaining, address, length, seq, zc, total_bytes_to_map,
2154 err);
2155 }
2156
2157 #define TCP_VALID_ZC_MSG_FLAGS (TCP_CMSG_TS)
2158 static void tcp_zc_finalize_rx_tstamp(struct sock *sk,
2159 struct tcp_zerocopy_receive *zc,
2160 struct scm_timestamping_internal *tss)
2161 {
2162 unsigned long msg_control_addr;
2163 struct msghdr cmsg_dummy;
2164
2165 msg_control_addr = (unsigned long)zc->msg_control;
2166 cmsg_dummy.msg_control = (void *)msg_control_addr;
2167 cmsg_dummy.msg_controllen =
2168 (__kernel_size_t)zc->msg_controllen;
2169 cmsg_dummy.msg_flags = in_compat_syscall()
2170 ? MSG_CMSG_COMPAT : 0;
2171 cmsg_dummy.msg_control_is_user = true;
2172 zc->msg_flags = 0;
2173 if (zc->msg_control == msg_control_addr &&
2174 zc->msg_controllen == cmsg_dummy.msg_controllen) {
2175 tcp_recv_timestamp(&cmsg_dummy, sk, tss);
2176 zc->msg_control = (__u64)
2177 ((uintptr_t)cmsg_dummy.msg_control);
2178 zc->msg_controllen =
2179 (__u64)cmsg_dummy.msg_controllen;
2180 zc->msg_flags = (__u32)cmsg_dummy.msg_flags;
2181 }
2182 }
2183
2184 #define TCP_ZEROCOPY_PAGE_BATCH_SIZE 32
2185 static int tcp_zerocopy_receive(struct sock *sk,
2186 struct tcp_zerocopy_receive *zc,
2187 struct scm_timestamping_internal *tss)
2188 {
2189 u32 length = 0, offset, vma_len, avail_len, copylen = 0;
2190 unsigned long address = (unsigned long)zc->address;
2191 struct page *pages[TCP_ZEROCOPY_PAGE_BATCH_SIZE];
2192 s32 copybuf_len = zc->copybuf_len;
2193 struct tcp_sock *tp = tcp_sk(sk);
2194 const skb_frag_t *frags = NULL;
2195 unsigned int pages_to_map = 0;
2196 struct vm_area_struct *vma;
2197 struct sk_buff *skb = NULL;
2198 u32 seq = tp->copied_seq;
2199 u32 total_bytes_to_map;
2200 int inq = tcp_inq(sk);
2201 int ret;
2202
2203 zc->copybuf_len = 0;
2204 zc->msg_flags = 0;
2205
2206 if (address & (PAGE_SIZE - 1) || address != zc->address)
2207 return -EINVAL;
2208
2209 if (sk->sk_state == TCP_LISTEN)
2210 return -ENOTCONN;
2211
2212 sock_rps_record_flow(sk);
2213
2214 if (inq && inq <= copybuf_len)
2215 return receive_fallback_to_copy(sk, zc, inq, tss);
2216
2217 if (inq < PAGE_SIZE) {
2218 zc->length = 0;
2219 zc->recv_skip_hint = inq;
2220 if (!inq && sock_flag(sk, SOCK_DONE))
2221 return -EIO;
2222 return 0;
2223 }
2224
2225 mmap_read_lock(current->mm);
2226
2227 vma = vma_lookup(current->mm, address);
2228 if (!vma || vma->vm_ops != &tcp_vm_ops) {
2229 mmap_read_unlock(current->mm);
2230 return -EINVAL;
2231 }
2232 vma_len = min_t(unsigned long, zc->length, vma->vm_end - address);
2233 avail_len = min_t(u32, vma_len, inq);
2234 total_bytes_to_map = avail_len & ~(PAGE_SIZE - 1);
2235 if (total_bytes_to_map) {
2236 if (!(zc->flags & TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT))
2237 zap_page_range(vma, address, total_bytes_to_map);
2238 zc->length = total_bytes_to_map;
2239 zc->recv_skip_hint = 0;
2240 } else {
2241 zc->length = avail_len;
2242 zc->recv_skip_hint = avail_len;
2243 }
2244 ret = 0;
2245 while (length + PAGE_SIZE <= zc->length) {
2246 int mappable_offset;
2247 struct page *page;
2248
2249 if (zc->recv_skip_hint < PAGE_SIZE) {
2250 u32 offset_frag;
2251
2252 if (skb) {
2253 if (zc->recv_skip_hint > 0)
2254 break;
2255 skb = skb->next;
2256 offset = seq - TCP_SKB_CB(skb)->seq;
2257 } else {
2258 skb = tcp_recv_skb(sk, seq, &offset);
2259 }
2260
2261 if (TCP_SKB_CB(skb)->has_rxtstamp) {
2262 tcp_update_recv_tstamps(skb, tss);
2263 zc->msg_flags |= TCP_CMSG_TS;
2264 }
2265 zc->recv_skip_hint = skb->len - offset;
2266 frags = skb_advance_to_frag(skb, offset, &offset_frag);
2267 if (!frags || offset_frag)
2268 break;
2269 }
2270
2271 mappable_offset = find_next_mappable_frag(frags,
2272 zc->recv_skip_hint);
2273 if (mappable_offset) {
2274 zc->recv_skip_hint = mappable_offset;
2275 break;
2276 }
2277 page = skb_frag_page(frags);
2278 prefetchw(page);
2279 pages[pages_to_map++] = page;
2280 length += PAGE_SIZE;
2281 zc->recv_skip_hint -= PAGE_SIZE;
2282 frags++;
2283 if (pages_to_map == TCP_ZEROCOPY_PAGE_BATCH_SIZE ||
2284 zc->recv_skip_hint < PAGE_SIZE) {
2285
2286
2287
2288 ret = tcp_zerocopy_vm_insert_batch(vma, pages,
2289 pages_to_map,
2290 &address, &length,
2291 &seq, zc,
2292 total_bytes_to_map);
2293 if (ret)
2294 goto out;
2295 pages_to_map = 0;
2296 }
2297 }
2298 if (pages_to_map) {
2299 ret = tcp_zerocopy_vm_insert_batch(vma, pages, pages_to_map,
2300 &address, &length, &seq,
2301 zc, total_bytes_to_map);
2302 }
2303 out:
2304 mmap_read_unlock(current->mm);
2305
2306 if (!ret)
2307 copylen = tcp_zc_handle_leftover(zc, sk, skb, &seq, copybuf_len, tss);
2308
2309 if (length + copylen) {
2310 WRITE_ONCE(tp->copied_seq, seq);
2311 tcp_rcv_space_adjust(sk);
2312
2313
2314 tcp_recv_skb(sk, seq, &offset);
2315 tcp_cleanup_rbuf(sk, length + copylen);
2316 ret = 0;
2317 if (length == zc->length)
2318 zc->recv_skip_hint = 0;
2319 } else {
2320 if (!zc->recv_skip_hint && sock_flag(sk, SOCK_DONE))
2321 ret = -EIO;
2322 }
2323 zc->length = length;
2324 return ret;
2325 }
2326 #endif
2327
/* Similar to __sock_recv_timestamp, but does not require an skb */
2329 void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk,
2330 struct scm_timestamping_internal *tss)
2331 {
2332 int new_tstamp = sock_flag(sk, SOCK_TSTAMP_NEW);
2333 bool has_timestamping = false;
2334
2335 if (tss->ts[0].tv_sec || tss->ts[0].tv_nsec) {
2336 if (sock_flag(sk, SOCK_RCVTSTAMP)) {
2337 if (sock_flag(sk, SOCK_RCVTSTAMPNS)) {
2338 if (new_tstamp) {
2339 struct __kernel_timespec kts = {
2340 .tv_sec = tss->ts[0].tv_sec,
2341 .tv_nsec = tss->ts[0].tv_nsec,
2342 };
2343 put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_NEW,
2344 sizeof(kts), &kts);
2345 } else {
2346 struct __kernel_old_timespec ts_old = {
2347 .tv_sec = tss->ts[0].tv_sec,
2348 .tv_nsec = tss->ts[0].tv_nsec,
2349 };
2350 put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_OLD,
2351 sizeof(ts_old), &ts_old);
2352 }
2353 } else {
2354 if (new_tstamp) {
2355 struct __kernel_sock_timeval stv = {
2356 .tv_sec = tss->ts[0].tv_sec,
2357 .tv_usec = tss->ts[0].tv_nsec / 1000,
2358 };
2359 put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_NEW,
2360 sizeof(stv), &stv);
2361 } else {
2362 struct __kernel_old_timeval tv = {
2363 .tv_sec = tss->ts[0].tv_sec,
2364 .tv_usec = tss->ts[0].tv_nsec / 1000,
2365 };
2366 put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_OLD,
2367 sizeof(tv), &tv);
2368 }
2369 }
2370 }
2371
2372 if (sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE)
2373 has_timestamping = true;
2374 else
2375 tss->ts[0] = (struct timespec64) {0};
2376 }
2377
2378 if (tss->ts[2].tv_sec || tss->ts[2].tv_nsec) {
2379 if (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE)
2380 has_timestamping = true;
2381 else
2382 tss->ts[2] = (struct timespec64) {0};
2383 }
2384
2385 if (has_timestamping) {
2386 tss->ts[1] = (struct timespec64) {0};
2387 if (sock_flag(sk, SOCK_TSTAMP_NEW))
2388 put_cmsg_scm_timestamping64(msg, tss);
2389 else
2390 put_cmsg_scm_timestamping(msg, tss);
2391 }
2392 }
2393
2394 static int tcp_inq_hint(struct sock *sk)
2395 {
2396 const struct tcp_sock *tp = tcp_sk(sk);
2397 u32 copied_seq = READ_ONCE(tp->copied_seq);
2398 u32 rcv_nxt = READ_ONCE(tp->rcv_nxt);
2399 int inq;
2400
2401 inq = rcv_nxt - copied_seq;
2402 if (unlikely(inq < 0 || copied_seq != READ_ONCE(tp->copied_seq))) {
2403 lock_sock(sk);
2404 inq = tp->rcv_nxt - tp->copied_seq;
2405 release_sock(sk);
2406 }
2407
2408
2409
2410 if (inq == 0 && sock_flag(sk, SOCK_DONE))
2411 inq = 1;
2412 return inq;
2413 }
2414
/*
 * Copy data from the socket receive queue into the user buffer.
 * Called with the socket locked; sleeps (subject to the timeout)
 * until at least the low-water-mark amount of data has been copied,
 * an error occurs, or the connection is shut down.
 */
2423 static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len,
2424 int flags, struct scm_timestamping_internal *tss,
2425 int *cmsg_flags)
2426 {
2427 struct tcp_sock *tp = tcp_sk(sk);
2428 int copied = 0;
2429 u32 peek_seq;
2430 u32 *seq;
2431 unsigned long used;
2432 int err;
2433 int target;
2434 long timeo;
2435 struct sk_buff *skb, *last;
2436 u32 urg_hole = 0;
2437
2438 err = -ENOTCONN;
2439 if (sk->sk_state == TCP_LISTEN)
2440 goto out;
2441
2442 if (tp->recvmsg_inq) {
2443 *cmsg_flags = TCP_CMSG_INQ;
2444 msg->msg_get_inq = 1;
2445 }
2446 timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
2447
2448
2449 if (flags & MSG_OOB)
2450 goto recv_urg;
2451
2452 if (unlikely(tp->repair)) {
2453 err = -EPERM;
2454 if (!(flags & MSG_PEEK))
2455 goto out;
2456
2457 if (tp->repair_queue == TCP_SEND_QUEUE)
2458 goto recv_sndq;
2459
2460 err = -EINVAL;
2461 if (tp->repair_queue == TCP_NO_QUEUE)
2462 goto out;
2463
2464
2465 }
2466
2467 seq = &tp->copied_seq;
2468 if (flags & MSG_PEEK) {
2469 peek_seq = tp->copied_seq;
2470 seq = &peek_seq;
2471 }
2472
2473 target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
2474
2475 do {
2476 u32 offset;
2477
2478
2479 if (unlikely(tp->urg_data) && tp->urg_seq == *seq) {
2480 if (copied)
2481 break;
2482 if (signal_pending(current)) {
2483 copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
2484 break;
2485 }
2486 }
2487
2488
2489
2490 last = skb_peek_tail(&sk->sk_receive_queue);
2491 skb_queue_walk(&sk->sk_receive_queue, skb) {
2492 last = skb;
2493
2494
2495
2496 if (WARN(before(*seq, TCP_SKB_CB(skb)->seq),
2497 "TCP recvmsg seq # bug: copied %X, seq %X, rcvnxt %X, fl %X\n",
2498 *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt,
2499 flags))
2500 break;
2501
2502 offset = *seq - TCP_SKB_CB(skb)->seq;
2503 if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
2504 pr_err_once("%s: found a SYN, please report !\n", __func__);
2505 offset--;
2506 }
2507 if (offset < skb->len)
2508 goto found_ok_skb;
2509 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
2510 goto found_fin_ok;
2511 WARN(!(flags & MSG_PEEK),
2512 "TCP recvmsg seq # bug 2: copied %X, seq %X, rcvnxt %X, fl %X\n",
2513 *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags);
2514 }
2515
2516
2517
2518 if (copied >= target && !READ_ONCE(sk->sk_backlog.tail))
2519 break;
2520
2521 if (copied) {
2522 if (!timeo ||
2523 sk->sk_err ||
2524 sk->sk_state == TCP_CLOSE ||
2525 (sk->sk_shutdown & RCV_SHUTDOWN) ||
2526 signal_pending(current))
2527 break;
2528 } else {
2529 if (sock_flag(sk, SOCK_DONE))
2530 break;
2531
2532 if (sk->sk_err) {
2533 copied = sock_error(sk);
2534 break;
2535 }
2536
2537 if (sk->sk_shutdown & RCV_SHUTDOWN)
2538 break;
2539
2540 if (sk->sk_state == TCP_CLOSE) {
2541
2542
2543
2544 copied = -ENOTCONN;
2545 break;
2546 }
2547
2548 if (!timeo) {
2549 copied = -EAGAIN;
2550 break;
2551 }
2552
2553 if (signal_pending(current)) {
2554 copied = sock_intr_errno(timeo);
2555 break;
2556 }
2557 }
2558
2559 if (copied >= target) {
2560
2561 __sk_flush_backlog(sk);
2562 } else {
2563 tcp_cleanup_rbuf(sk, copied);
2564 sk_wait_data(sk, &timeo, last);
2565 }
2566
2567 if ((flags & MSG_PEEK) &&
2568 (peek_seq - copied - urg_hole != tp->copied_seq)) {
2569 net_dbg_ratelimited("TCP(%s:%d): Application bug, race in MSG_PEEK\n",
2570 current->comm,
2571 task_pid_nr(current));
2572 peek_seq = tp->copied_seq;
2573 }
2574 continue;
2575
2576 found_ok_skb:
2577
2578 used = skb->len - offset;
2579 if (len < used)
2580 used = len;
2581
2582
2583 if (unlikely(tp->urg_data)) {
2584 u32 urg_offset = tp->urg_seq - *seq;
2585 if (urg_offset < used) {
2586 if (!urg_offset) {
2587 if (!sock_flag(sk, SOCK_URGINLINE)) {
2588 WRITE_ONCE(*seq, *seq + 1);
2589 urg_hole++;
2590 offset++;
2591 used--;
2592 if (!used)
2593 goto skip_copy;
2594 }
2595 } else
2596 used = urg_offset;
2597 }
2598 }
2599
2600 if (!(flags & MSG_TRUNC)) {
2601 err = skb_copy_datagram_msg(skb, offset, msg, used);
2602 if (err) {
2603
2604 if (!copied)
2605 copied = -EFAULT;
2606 break;
2607 }
2608 }
2609
2610 WRITE_ONCE(*seq, *seq + used);
2611 copied += used;
2612 len -= used;
2613
2614 tcp_rcv_space_adjust(sk);
2615
2616 skip_copy:
2617 if (unlikely(tp->urg_data) && after(tp->copied_seq, tp->urg_seq)) {
2618 WRITE_ONCE(tp->urg_data, 0);
2619 tcp_fast_path_check(sk);
2620 }
2621
2622 if (TCP_SKB_CB(skb)->has_rxtstamp) {
2623 tcp_update_recv_tstamps(skb, tss);
2624 *cmsg_flags |= TCP_CMSG_TS;
2625 }
2626
2627 if (used + offset < skb->len)
2628 continue;
2629
2630 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
2631 goto found_fin_ok;
2632 if (!(flags & MSG_PEEK))
2633 tcp_eat_recv_skb(sk, skb);
2634 continue;
2635
2636 found_fin_ok:
2637
2638 WRITE_ONCE(*seq, *seq + 1);
2639 if (!(flags & MSG_PEEK))
2640 tcp_eat_recv_skb(sk, skb);
2641 break;
2642 } while (len > 0);
2643
2644
2645
2646
2647
2648
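/* Clean up after the data we have read: this may send ACKs to reopen
 * the receive window.
 */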
2649 tcp_cleanup_rbuf(sk, copied);
2650 return copied;
2651
2652 out:
2653 return err;
2654
2655 recv_urg:
2656 err = tcp_recv_urg(sk, msg, len, flags);
2657 goto out;
2658
2659 recv_sndq:
2660 err = tcp_peek_sndq(sk, msg, len);
2661 goto out;
2662 }
2663
2664 int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags,
2665 int *addr_len)
2666 {
2667 int cmsg_flags = 0, ret;
2668 struct scm_timestamping_internal tss;
2669
2670 if (unlikely(flags & MSG_ERRQUEUE))
2671 return inet_recv_error(sk, msg, len, addr_len);
2672
2673 if (sk_can_busy_loop(sk) &&
2674 skb_queue_empty_lockless(&sk->sk_receive_queue) &&
2675 sk->sk_state == TCP_ESTABLISHED)
2676 sk_busy_loop(sk, flags & MSG_DONTWAIT);
2677
2678 lock_sock(sk);
2679 ret = tcp_recvmsg_locked(sk, msg, len, flags, &tss, &cmsg_flags);
2680 release_sock(sk);
2681
2682 if ((cmsg_flags || msg->msg_get_inq) && ret >= 0) {
2683 if (cmsg_flags & TCP_CMSG_TS)
2684 tcp_recv_timestamp(msg, sk, &tss);
2685 if (msg->msg_get_inq) {
2686 msg->msg_inq = tcp_inq_hint(sk);
2687 if (cmsg_flags & TCP_CMSG_INQ)
2688 put_cmsg(msg, SOL_TCP, TCP_CM_INQ,
2689 sizeof(msg->msg_inq), &msg->msg_inq);
2690 }
2691 }
2692 return ret;
2693 }
2694 EXPORT_SYMBOL(tcp_recvmsg);
2695
2696 void tcp_set_state(struct sock *sk, int state)
2697 {
2698 int oldstate = sk->sk_state;
2699
2700
2701
2702
2703
2704
2705
2706
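/* The BPF uapi defines BPF_TCP_* mirrors of the TCP states. The checks
 * below catch any divergence between the internal state values and the
 * values exported to BPF programs.
 */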
2707 BUILD_BUG_ON((int)BPF_TCP_ESTABLISHED != (int)TCP_ESTABLISHED);
2708 BUILD_BUG_ON((int)BPF_TCP_SYN_SENT != (int)TCP_SYN_SENT);
2709 BUILD_BUG_ON((int)BPF_TCP_SYN_RECV != (int)TCP_SYN_RECV);
2710 BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT1 != (int)TCP_FIN_WAIT1);
2711 BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT2 != (int)TCP_FIN_WAIT2);
2712 BUILD_BUG_ON((int)BPF_TCP_TIME_WAIT != (int)TCP_TIME_WAIT);
2713 BUILD_BUG_ON((int)BPF_TCP_CLOSE != (int)TCP_CLOSE);
2714 BUILD_BUG_ON((int)BPF_TCP_CLOSE_WAIT != (int)TCP_CLOSE_WAIT);
2715 BUILD_BUG_ON((int)BPF_TCP_LAST_ACK != (int)TCP_LAST_ACK);
2716 BUILD_BUG_ON((int)BPF_TCP_LISTEN != (int)TCP_LISTEN);
2717 BUILD_BUG_ON((int)BPF_TCP_CLOSING != (int)TCP_CLOSING);
2718 BUILD_BUG_ON((int)BPF_TCP_NEW_SYN_RECV != (int)TCP_NEW_SYN_RECV);
2719 BUILD_BUG_ON((int)BPF_TCP_MAX_STATES != (int)TCP_MAX_STATES);
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
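/* Emit the BPF_TCP_* enum into BTF so BPF programs can reference these
 * values by name, even though the kernel does not otherwise use the
 * enum type directly.
 */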
2730 BTF_TYPE_EMIT_ENUM(BPF_TCP_ESTABLISHED);
2731
2732 if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_STATE_CB_FLAG))
2733 tcp_call_bpf_2arg(sk, BPF_SOCK_OPS_STATE_CB, oldstate, state);
2734
2735 switch (state) {
2736 case TCP_ESTABLISHED:
2737 if (oldstate != TCP_ESTABLISHED)
2738 TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
2739 break;
2740
2741 case TCP_CLOSE:
2742 if (oldstate == TCP_CLOSE_WAIT || oldstate == TCP_ESTABLISHED)
2743 TCP_INC_STATS(sock_net(sk), TCP_MIB_ESTABRESETS);
2744
2745 sk->sk_prot->unhash(sk);
2746 if (inet_csk(sk)->icsk_bind_hash &&
2747 !(sk->sk_userlocks & SOCK_BINDPORT_LOCK))
2748 inet_put_port(sk);
2749 fallthrough;
2750 default:
2751 if (oldstate == TCP_ESTABLISHED)
2752 TCP_DEC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
2753 }
2754
2755
2756
2757
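/* Change the state only after the socket has been unhashed above, so a
 * closed socket never sits in the hash tables.
 */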
2758 inet_sk_state_store(sk, state);
2759 }
2760 EXPORT_SYMBOL_GPL(tcp_set_state);
2761
2762
2763
2764
2765
2766
2767
2768
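/* Close-time state transitions used by tcp_close_state(): for each
 * current state, the low bits give the next state and TCP_ACTION_FIN
 * indicates that a FIN must be sent.
 */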
2769 static const unsigned char new_state[16] = {
2770
2771 [0 /* (Invalid) */] = TCP_CLOSE,
2772 [TCP_ESTABLISHED] = TCP_FIN_WAIT1 | TCP_ACTION_FIN,
2773 [TCP_SYN_SENT] = TCP_CLOSE,
2774 [TCP_SYN_RECV] = TCP_FIN_WAIT1 | TCP_ACTION_FIN,
2775 [TCP_FIN_WAIT1] = TCP_FIN_WAIT1,
2776 [TCP_FIN_WAIT2] = TCP_FIN_WAIT2,
2777 [TCP_TIME_WAIT] = TCP_CLOSE,
2778 [TCP_CLOSE] = TCP_CLOSE,
2779 [TCP_CLOSE_WAIT] = TCP_LAST_ACK | TCP_ACTION_FIN,
2780 [TCP_LAST_ACK] = TCP_LAST_ACK,
2781 [TCP_LISTEN] = TCP_CLOSE,
2782 [TCP_CLOSING] = TCP_CLOSING,
2783 [TCP_NEW_SYN_RECV] = TCP_CLOSE,
2784 };
2785
2786 static int tcp_close_state(struct sock *sk)
2787 {
2788 int next = (int)new_state[sk->sk_state];
2789 int ns = next & TCP_STATE_MASK;
2790
2791 tcp_set_state(sk, ns);
2792
2793 return next & TCP_ACTION_FIN;
2794 }
2795
2796
2797
2798
2799
2800
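/* Shut down the sending side of a connection. Much like close(),
 * except that the receive side stays usable and the socket is not
 * marked dead.
 */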
2801 void tcp_shutdown(struct sock *sk, int how)
2802 {
2803
2804
2805
2806
2807 if (!(how & SEND_SHUTDOWN))
2808 return;
2809
2810
2811 if ((1 << sk->sk_state) &
2812 (TCPF_ESTABLISHED | TCPF_SYN_SENT |
2813 TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
2814
2815 if (tcp_close_state(sk))
2816 tcp_send_fin(sk);
2817 }
2818 }
2819 EXPORT_SYMBOL(tcp_shutdown);
2820
2821 int tcp_orphan_count_sum(void)
2822 {
2823 int i, total = 0;
2824
2825 for_each_possible_cpu(i)
2826 total += per_cpu(tcp_orphan_count, i);
2827
2828 return max(total, 0);
2829 }
2830
2831 static int tcp_orphan_cache;
2832 static struct timer_list tcp_orphan_timer;
2833 #define TCP_ORPHAN_TIMER_PERIOD msecs_to_jiffies(100)
2834
2835 static void tcp_orphan_update(struct timer_list *unused)
2836 {
2837 WRITE_ONCE(tcp_orphan_cache, tcp_orphan_count_sum());
2838 mod_timer(&tcp_orphan_timer, jiffies + TCP_ORPHAN_TIMER_PERIOD);
2839 }
2840
2841 static bool tcp_too_many_orphans(int shift)
2842 {
2843 return READ_ONCE(tcp_orphan_cache) << shift >
2844 READ_ONCE(sysctl_tcp_max_orphans);
2845 }
2846
2847 bool tcp_check_oom(struct sock *sk, int shift)
2848 {
2849 bool too_many_orphans, out_of_socket_memory;
2850
2851 too_many_orphans = tcp_too_many_orphans(shift);
2852 out_of_socket_memory = tcp_out_of_memory(sk);
2853
2854 if (too_many_orphans)
2855 net_info_ratelimited("too many orphaned sockets\n");
2856 if (out_of_socket_memory)
2857 net_info_ratelimited("out of memory -- consider tuning tcp_mem\n");
2858 return too_many_orphans || out_of_socket_memory;
2859 }
2860
2861 void __tcp_close(struct sock *sk, long timeout)
2862 {
2863 struct sk_buff *skb;
2864 int data_was_unread = 0;
2865 int state;
2866
2867 sk->sk_shutdown = SHUTDOWN_MASK;
2868
2869 if (sk->sk_state == TCP_LISTEN) {
2870 tcp_set_state(sk, TCP_CLOSE);
2871
2872
2873 inet_csk_listen_stop(sk);
2874
2875 goto adjudge_to_death;
2876 }
2877
2878
2879
2880
2881
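/* Flush the receive queue, counting any data the application never
 * read. A FIN occupies a sequence number but is not unread data.
 */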
2882 while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
2883 u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq;
2884
2885 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
2886 len--;
2887 data_was_unread += len;
2888 __kfree_skb(skb);
2889 }
2890
2891
2892 if (sk->sk_state == TCP_CLOSE)
2893 goto adjudge_to_death;
2894
2895
2896
2897
2898
2899
2900
2901
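/* Decide how to terminate. A repair-mode socket is simply
 * disconnected. If the application closed with data still unread,
 * that data is lost, so send a RST (cf. RFC 2525, sec. 2.17). A
 * zero-timeout SO_LINGER close also aborts. Otherwise walk the normal
 * close state machine and send a FIN if one is due.
 */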
2902 if (unlikely(tcp_sk(sk)->repair)) {
2903 sk->sk_prot->disconnect(sk, 0);
2904 } else if (data_was_unread) {
2905
2906 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
2907 tcp_set_state(sk, TCP_CLOSE);
2908 tcp_send_active_reset(sk, sk->sk_allocation);
2909 } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
2910
2911 sk->sk_prot->disconnect(sk, 0);
2912 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
2913 } else if (tcp_close_state(sk)) {
2942
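/* We entered a FIN-sending state: transmit our FIN now; the rest of
 * the teardown is driven by the state machine and its timers.
 */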
2943 tcp_send_fin(sk);
2944 }
2945
2946 sk_stream_wait_close(sk, timeout);
2947
2948 adjudge_to_death:
2949 state = sk->sk_state;
2950 sock_hold(sk);
2951 sock_orphan(sk);
2952
2953 local_bh_disable();
2954 bh_lock_sock(sk);
2955
2956 __release_sock(sk);
2957
2958 this_cpu_inc(tcp_orphan_count);
2959
2960
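/* Did a softirq or the backlog we just processed already close the
 * socket underneath us?
 */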
2961 if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE)
2962 goto out;
2963
2977
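/* An orphan must not linger in FIN_WAIT2 forever: a negative linger2
 * aborts with a RST, otherwise the FIN_WAIT2 timeout is enforced either
 * through the timewait machinery or, for longer timeouts, via the
 * keepalive timer.
 */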
2978 if (sk->sk_state == TCP_FIN_WAIT2) {
2979 struct tcp_sock *tp = tcp_sk(sk);
2980 if (tp->linger2 < 0) {
2981 tcp_set_state(sk, TCP_CLOSE);
2982 tcp_send_active_reset(sk, GFP_ATOMIC);
2983 __NET_INC_STATS(sock_net(sk),
2984 LINUX_MIB_TCPABORTONLINGER);
2985 } else {
2986 const int tmo = tcp_fin_time(sk);
2987
2988 if (tmo > TCP_TIMEWAIT_LEN) {
2989 inet_csk_reset_keepalive_timer(sk,
2990 tmo - TCP_TIMEWAIT_LEN);
2991 } else {
2992 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
2993 goto out;
2994 }
2995 }
2996 }
2997 if (sk->sk_state != TCP_CLOSE) {
2998 if (tcp_check_oom(sk, 0)) {
2999 tcp_set_state(sk, TCP_CLOSE);
3000 tcp_send_active_reset(sk, GFP_ATOMIC);
3001 __NET_INC_STATS(sock_net(sk),
3002 LINUX_MIB_TCPABORTONMEMORY);
3003 } else if (!check_net(sock_net(sk))) {
3004
3005 tcp_set_state(sk, TCP_CLOSE);
3006 }
3007 }
3008
3009 if (sk->sk_state == TCP_CLOSE) {
3010 struct request_sock *req;
3011
3012 req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk,
3013 lockdep_sock_is_held(sk));
3014
3015
3016
3017
3018 if (req)
3019 reqsk_fastopen_remove(sk, req, false);
3020 inet_csk_destroy_sock(sk);
3021 }
3022
3023
3024 out:
3025 bh_unlock_sock(sk);
3026 local_bh_enable();
3027 }
3028
3029 void tcp_close(struct sock *sk, long timeout)
3030 {
3031 lock_sock(sk);
3032 __tcp_close(sk, timeout);
3033 release_sock(sk);
3034 sock_put(sk);
3035 }
3036 EXPORT_SYMBOL(tcp_close);
3037
3038
3039
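/* Connection states in which an abort must emit a RST (RFC 793). */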
3040 static inline bool tcp_need_reset(int state)
3041 {
3042 return (1 << state) &
3043 (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
3044 TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
3045 }
3046
3047 static void tcp_rtx_queue_purge(struct sock *sk)
3048 {
3049 struct rb_node *p = rb_first(&sk->tcp_rtx_queue);
3050
3051 tcp_sk(sk)->highest_sack = NULL;
3052 while (p) {
3053 struct sk_buff *skb = rb_to_skb(p);
3054
3055 p = rb_next(p);
3056
3057
3058
3059 tcp_rtx_queue_unlink(skb, sk);
3060 tcp_wmem_free_skb(sk, skb);
3061 }
3062 }
3063
3064 void tcp_write_queue_purge(struct sock *sk)
3065 {
3066 struct sk_buff *skb;
3067
3068 tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
3069 while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
3070 tcp_skb_tsorted_anchor_cleanup(skb);
3071 tcp_wmem_free_skb(sk, skb);
3072 }
3073 tcp_rtx_queue_purge(sk);
3074 INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue);
3075 tcp_clear_all_retrans_hints(tcp_sk(sk));
3076 tcp_sk(sk)->packets_out = 0;
3077 inet_csk(sk)->icsk_backoff = 0;
3078 }
3079
3080 int tcp_disconnect(struct sock *sk, int flags)
3081 {
3082 struct inet_sock *inet = inet_sk(sk);
3083 struct inet_connection_sock *icsk = inet_csk(sk);
3084 struct tcp_sock *tp = tcp_sk(sk);
3085 int old_state = sk->sk_state;
3086 u32 seq;
3087
3088 if (old_state != TCP_CLOSE)
3089 tcp_set_state(sk, TCP_CLOSE);
3090
3091
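/* The ABORT function of RFC 793: stop a listener, or send a RST and
 * report the appropriate error for connected states.
 */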
3092 if (old_state == TCP_LISTEN) {
3093 inet_csk_listen_stop(sk);
3094 } else if (unlikely(tp->repair)) {
3095 sk->sk_err = ECONNABORTED;
3096 } else if (tcp_need_reset(old_state) ||
3097 (tp->snd_nxt != tp->write_seq &&
3098 (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
3099
3100
3101
3102 tcp_send_active_reset(sk, gfp_any());
3103 sk->sk_err = ECONNRESET;
3104 } else if (old_state == TCP_SYN_SENT)
3105 sk->sk_err = ECONNRESET;
3106
3107 tcp_clear_xmit_timers(sk);
3108 __skb_queue_purge(&sk->sk_receive_queue);
3109 WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
3110 WRITE_ONCE(tp->urg_data, 0);
3111 tcp_write_queue_purge(sk);
3112 tcp_fastopen_active_disable_ofo_check(sk);
3113 skb_rbtree_purge(&tp->out_of_order_queue);
3114
3115 inet->inet_dport = 0;
3116
3117 if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
3118 inet_reset_saddr(sk);
3119
3120 sk->sk_shutdown = 0;
3121 sock_reset_flag(sk, SOCK_DONE);
3122 tp->srtt_us = 0;
3123 tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
3124 tp->rcv_rtt_last_tsecr = 0;
3125
3126 seq = tp->write_seq + tp->max_window + 2;
3127 if (!seq)
3128 seq = 1;
3129 WRITE_ONCE(tp->write_seq, seq);
3130
3131 icsk->icsk_backoff = 0;
3132 icsk->icsk_probes_out = 0;
3133 icsk->icsk_probes_tstamp = 0;
3134 icsk->icsk_rto = TCP_TIMEOUT_INIT;
3135 icsk->icsk_rto_min = TCP_RTO_MIN;
3136 icsk->icsk_delack_max = TCP_DELACK_MAX;
3137 tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
3138 tcp_snd_cwnd_set(tp, TCP_INIT_CWND);
3139 tp->snd_cwnd_cnt = 0;
3140 tp->window_clamp = 0;
3141 tp->delivered = 0;
3142 tp->delivered_ce = 0;
3143 if (icsk->icsk_ca_ops->release)
3144 icsk->icsk_ca_ops->release(sk);
3145 memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
3146 icsk->icsk_ca_initialized = 0;
3147 tcp_set_ca_state(sk, TCP_CA_Open);
3148 tp->is_sack_reneg = 0;
3149 tcp_clear_retrans(tp);
3150 tp->total_retrans = 0;
3151 inet_csk_delack_init(sk);
3152
3153
3154
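/* Start rcv_mss from the minimum so receive-window calculations never
 * operate on a zero MSS after a disconnect.
 */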
3155 icsk->icsk_ack.rcv_mss = TCP_MIN_MSS;
3156 memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
3157 __sk_dst_reset(sk);
3158 dst_release(xchg((__force struct dst_entry **)&sk->sk_rx_dst, NULL));
3159 tcp_saved_syn_free(tp);
3160 tp->compressed_ack = 0;
3161 tp->segs_in = 0;
3162 tp->segs_out = 0;
3163 tp->bytes_sent = 0;
3164 tp->bytes_acked = 0;
3165 tp->bytes_received = 0;
3166 tp->bytes_retrans = 0;
3167 tp->data_segs_in = 0;
3168 tp->data_segs_out = 0;
3169 tp->duplicate_sack[0].start_seq = 0;
3170 tp->duplicate_sack[0].end_seq = 0;
3171 tp->dsack_dups = 0;
3172 tp->reord_seen = 0;
3173 tp->retrans_out = 0;
3174 tp->sacked_out = 0;
3175 tp->tlp_high_seq = 0;
3176 tp->last_oow_ack_time = 0;
3177
3178 tp->app_limited = ~0U;
3179 tp->rack.mstamp = 0;
3180 tp->rack.advanced = 0;
3181 tp->rack.reo_wnd_steps = 1;
3182 tp->rack.last_delivered = 0;
3183 tp->rack.reo_wnd_persist = 0;
3184 tp->rack.dsack_seen = 0;
3185 tp->syn_data_acked = 0;
3186 tp->rx_opt.saw_tstamp = 0;
3187 tp->rx_opt.dsack = 0;
3188 tp->rx_opt.num_sacks = 0;
3189 tp->rcv_ooopack = 0;
3190
3191
3192
3193 tcp_free_fastopen_req(tp);
3194 inet->defer_connect = 0;
3195 tp->fastopen_client_fail = 0;
3196
3197 WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);
3198
3199 if (sk->sk_frag.page) {
3200 put_page(sk->sk_frag.page);
3201 sk->sk_frag.page = NULL;
3202 sk->sk_frag.offset = 0;
3203 }
3204 sk_error_report(sk);
3205 return 0;
3206 }
3207 EXPORT_SYMBOL(tcp_disconnect);
3208
3209 static inline bool tcp_can_repair_sock(const struct sock *sk)
3210 {
3211 return ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) &&
3212 (sk->sk_state != TCP_LISTEN);
3213 }
3214
3215 static int tcp_repair_set_window(struct tcp_sock *tp, sockptr_t optbuf, int len)
3216 {
3217 struct tcp_repair_window opt;
3218
3219 if (!tp->repair)
3220 return -EPERM;
3221
3222 if (len != sizeof(opt))
3223 return -EINVAL;
3224
3225 if (copy_from_sockptr(&opt, optbuf, sizeof(opt)))
3226 return -EFAULT;
3227
3228 if (opt.max_window < opt.snd_wnd)
3229 return -EINVAL;
3230
3231 if (after(opt.snd_wl1, tp->rcv_nxt + opt.rcv_wnd))
3232 return -EINVAL;
3233
3234 if (after(opt.rcv_wup, tp->rcv_nxt))
3235 return -EINVAL;
3236
3237 tp->snd_wl1 = opt.snd_wl1;
3238 tp->snd_wnd = opt.snd_wnd;
3239 tp->max_window = opt.max_window;
3240
3241 tp->rcv_wnd = opt.rcv_wnd;
3242 tp->rcv_wup = opt.rcv_wup;
3243
3244 return 0;
3245 }
3246
3247 static int tcp_repair_options_est(struct sock *sk, sockptr_t optbuf,
3248 unsigned int len)
3249 {
3250 struct tcp_sock *tp = tcp_sk(sk);
3251 struct tcp_repair_opt opt;
3252 size_t offset = 0;
3253
3254 while (len >= sizeof(opt)) {
3255 if (copy_from_sockptr_offset(&opt, optbuf, offset, sizeof(opt)))
3256 return -EFAULT;
3257
3258 offset += sizeof(opt);
3259 len -= sizeof(opt);
3260
3261 switch (opt.opt_code) {
3262 case TCPOPT_MSS:
3263 tp->rx_opt.mss_clamp = opt.opt_val;
3264 tcp_mtup_init(sk);
3265 break;
3266 case TCPOPT_WINDOW:
3267 {
3268 u16 snd_wscale = opt.opt_val & 0xFFFF;
3269 u16 rcv_wscale = opt.opt_val >> 16;
3270
3271 if (snd_wscale > TCP_MAX_WSCALE || rcv_wscale > TCP_MAX_WSCALE)
3272 return -EFBIG;
3273
3274 tp->rx_opt.snd_wscale = snd_wscale;
3275 tp->rx_opt.rcv_wscale = rcv_wscale;
3276 tp->rx_opt.wscale_ok = 1;
3277 }
3278 break;
3279 case TCPOPT_SACK_PERM:
3280 if (opt.opt_val != 0)
3281 return -EINVAL;
3282
3283 tp->rx_opt.sack_ok |= TCP_SACK_SEEN;
3284 break;
3285 case TCPOPT_TIMESTAMP:
3286 if (opt.opt_val != 0)
3287 return -EINVAL;
3288
3289 tp->rx_opt.tstamp_ok = 1;
3290 break;
3291 }
3292 }
3293
3294 return 0;
3295 }
3296
3297 DEFINE_STATIC_KEY_FALSE(tcp_tx_delay_enabled);
3298 EXPORT_SYMBOL(tcp_tx_delay_enabled);
3299
3300 static void tcp_enable_tx_delay(void)
3301 {
3302 if (!static_branch_unlikely(&tcp_tx_delay_enabled)) {
3303 static int __tcp_tx_delay_enabled = 0;
3304
3305 if (cmpxchg(&__tcp_tx_delay_enabled, 0, 1) == 0) {
3306 static_branch_enable(&tcp_tx_delay_enabled);
3307 pr_info("TCP_TX_DELAY enabled\n");
3308 }
3309 }
3310 }
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
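/* TCP_CORK: queue up partial frames until the application clears the
 * option, then push whatever is pending. It is stronger than
 * TCP_NODELAY and the two may be combined.
 */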
3321 void __tcp_sock_set_cork(struct sock *sk, bool on)
3322 {
3323 struct tcp_sock *tp = tcp_sk(sk);
3324
3325 if (on) {
3326 tp->nonagle |= TCP_NAGLE_CORK;
3327 } else {
3328 tp->nonagle &= ~TCP_NAGLE_CORK;
3329 if (tp->nonagle & TCP_NAGLE_OFF)
3330 tp->nonagle |= TCP_NAGLE_PUSH;
3331 tcp_push_pending_frames(sk);
3332 }
3333 }
3334
3335 void tcp_sock_set_cork(struct sock *sk, bool on)
3336 {
3337 lock_sock(sk);
3338 __tcp_sock_set_cork(sk, on);
3339 release_sock(sk);
3340 }
3341 EXPORT_SYMBOL(tcp_sock_set_cork);
3342
3343
3344
3345
3346
3347
3348
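/* TCP_NODELAY is weaker than TCP_CORK: on a corked socket it is
 * remembered but only takes effect once the cork is removed. Setting
 * it pushes any queued frames immediately.
 */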
3349 void __tcp_sock_set_nodelay(struct sock *sk, bool on)
3350 {
3351 if (on) {
3352 tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
3353 tcp_push_pending_frames(sk);
3354 } else {
3355 tcp_sk(sk)->nonagle &= ~TCP_NAGLE_OFF;
3356 }
3357 }
3358
3359 void tcp_sock_set_nodelay(struct sock *sk)
3360 {
3361 lock_sock(sk);
3362 __tcp_sock_set_nodelay(sk, true);
3363 release_sock(sk);
3364 }
3365 EXPORT_SYMBOL(tcp_sock_set_nodelay);
3366
3367 static void __tcp_sock_set_quickack(struct sock *sk, int val)
3368 {
3369 if (!val) {
3370 inet_csk_enter_pingpong_mode(sk);
3371 return;
3372 }
3373
3374 inet_csk_exit_pingpong_mode(sk);
3375 if ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
3376 inet_csk_ack_scheduled(sk)) {
3377 inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_PUSHED;
3378 tcp_cleanup_rbuf(sk, 1);
3379 if (!(val & 1))
3380 inet_csk_enter_pingpong_mode(sk);
3381 }
3382 }
3383
3384 void tcp_sock_set_quickack(struct sock *sk, int val)
3385 {
3386 lock_sock(sk);
3387 __tcp_sock_set_quickack(sk, val);
3388 release_sock(sk);
3389 }
3390 EXPORT_SYMBOL(tcp_sock_set_quickack);
3391
3392 int tcp_sock_set_syncnt(struct sock *sk, int val)
3393 {
3394 if (val < 1 || val > MAX_TCP_SYNCNT)
3395 return -EINVAL;
3396
3397 lock_sock(sk);
3398 inet_csk(sk)->icsk_syn_retries = val;
3399 release_sock(sk);
3400 return 0;
3401 }
3402 EXPORT_SYMBOL(tcp_sock_set_syncnt);
3403
3404 void tcp_sock_set_user_timeout(struct sock *sk, u32 val)
3405 {
3406 lock_sock(sk);
3407 inet_csk(sk)->icsk_user_timeout = val;
3408 release_sock(sk);
3409 }
3410 EXPORT_SYMBOL(tcp_sock_set_user_timeout);
3411
3412 int tcp_sock_set_keepidle_locked(struct sock *sk, int val)
3413 {
3414 struct tcp_sock *tp = tcp_sk(sk);
3415
3416 if (val < 1 || val > MAX_TCP_KEEPIDLE)
3417 return -EINVAL;
3418
3419 tp->keepalive_time = val * HZ;
3420 if (sock_flag(sk, SOCK_KEEPOPEN) &&
3421 !((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) {
3422 u32 elapsed = keepalive_time_elapsed(tp);
3423
3424 if (tp->keepalive_time > elapsed)
3425 elapsed = tp->keepalive_time - elapsed;
3426 else
3427 elapsed = 0;
3428 inet_csk_reset_keepalive_timer(sk, elapsed);
3429 }
3430
3431 return 0;
3432 }
3433
3434 int tcp_sock_set_keepidle(struct sock *sk, int val)
3435 {
3436 int err;
3437
3438 lock_sock(sk);
3439 err = tcp_sock_set_keepidle_locked(sk, val);
3440 release_sock(sk);
3441 return err;
3442 }
3443 EXPORT_SYMBOL(tcp_sock_set_keepidle);
3444
3445 int tcp_sock_set_keepintvl(struct sock *sk, int val)
3446 {
3447 if (val < 1 || val > MAX_TCP_KEEPINTVL)
3448 return -EINVAL;
3449
3450 lock_sock(sk);
3451 tcp_sk(sk)->keepalive_intvl = val * HZ;
3452 release_sock(sk);
3453 return 0;
3454 }
3455 EXPORT_SYMBOL(tcp_sock_set_keepintvl);
3456
3457 int tcp_sock_set_keepcnt(struct sock *sk, int val)
3458 {
3459 if (val < 1 || val > MAX_TCP_KEEPCNT)
3460 return -EINVAL;
3461
3462 lock_sock(sk);
3463 tcp_sk(sk)->keepalive_probes = val;
3464 release_sock(sk);
3465 return 0;
3466 }
3467 EXPORT_SYMBOL(tcp_sock_set_keepcnt);
3468
3469 int tcp_set_window_clamp(struct sock *sk, int val)
3470 {
3471 struct tcp_sock *tp = tcp_sk(sk);
3472
3473 if (!val) {
3474 if (sk->sk_state != TCP_CLOSE)
3475 return -EINVAL;
3476 tp->window_clamp = 0;
3477 } else {
3478 tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
3479 SOCK_MIN_RCVBUF / 2 : val;
3480 tp->rcv_ssthresh = min(tp->rcv_wnd, tp->window_clamp);
3481 }
3482 return 0;
3483 }
3484
3485
3486
3487
3488 static int do_tcp_setsockopt(struct sock *sk, int level, int optname,
3489 sockptr_t optval, unsigned int optlen)
3490 {
3491 struct tcp_sock *tp = tcp_sk(sk);
3492 struct inet_connection_sock *icsk = inet_csk(sk);
3493 struct net *net = sock_net(sk);
3494 int val;
3495 int err = 0;
3496
3497
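/* Options taking a string or binary blob are handled first; everything
 * after this switch expects an int value.
 */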
3498 switch (optname) {
3499 case TCP_CONGESTION: {
3500 char name[TCP_CA_NAME_MAX];
3501
3502 if (optlen < 1)
3503 return -EINVAL;
3504
3505 val = strncpy_from_sockptr(name, optval,
3506 min_t(long, TCP_CA_NAME_MAX-1, optlen));
3507 if (val < 0)
3508 return -EFAULT;
3509 name[val] = 0;
3510
3511 lock_sock(sk);
3512 err = tcp_set_congestion_control(sk, name, true,
3513 ns_capable(sock_net(sk)->user_ns,
3514 CAP_NET_ADMIN));
3515 release_sock(sk);
3516 return err;
3517 }
3518 case TCP_ULP: {
3519 char name[TCP_ULP_NAME_MAX];
3520
3521 if (optlen < 1)
3522 return -EINVAL;
3523
3524 val = strncpy_from_sockptr(name, optval,
3525 min_t(long, TCP_ULP_NAME_MAX - 1,
3526 optlen));
3527 if (val < 0)
3528 return -EFAULT;
3529 name[val] = 0;
3530
3531 lock_sock(sk);
3532 err = tcp_set_ulp(sk, name);
3533 release_sock(sk);
3534 return err;
3535 }
3536 case TCP_FASTOPEN_KEY: {
3537 __u8 key[TCP_FASTOPEN_KEY_BUF_LENGTH];
3538 __u8 *backup_key = NULL;
3539
3540
3541
3542
3543 if (optlen != TCP_FASTOPEN_KEY_LENGTH &&
3544 optlen != TCP_FASTOPEN_KEY_BUF_LENGTH)
3545 return -EINVAL;
3546
3547 if (copy_from_sockptr(key, optval, optlen))
3548 return -EFAULT;
3549
3550 if (optlen == TCP_FASTOPEN_KEY_BUF_LENGTH)
3551 backup_key = key + TCP_FASTOPEN_KEY_LENGTH;
3552
3553 return tcp_fastopen_reset_cipher(net, sk, key, backup_key);
3554 }
3555 default:
3556
3557 break;
3558 }
3559
3560 if (optlen < sizeof(int))
3561 return -EINVAL;
3562
3563 if (copy_from_sockptr(&val, optval, sizeof(val)))
3564 return -EFAULT;
3565
3566 lock_sock(sk);
3567
3568 switch (optname) {
3569 case TCP_MAXSEG:
3570
3571
3572
3573
3574 if (val && (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW)) {
3575 err = -EINVAL;
3576 break;
3577 }
3578 tp->rx_opt.user_mss = val;
3579 break;
3580
3581 case TCP_NODELAY:
3582 __tcp_sock_set_nodelay(sk, val);
3583 break;
3584
3585 case TCP_THIN_LINEAR_TIMEOUTS:
3586 if (val < 0 || val > 1)
3587 err = -EINVAL;
3588 else
3589 tp->thin_lto = val;
3590 break;
3591
3592 case TCP_THIN_DUPACK:
3593 if (val < 0 || val > 1)
3594 err = -EINVAL;
3595 break;
3596
3597 case TCP_REPAIR:
3598 if (!tcp_can_repair_sock(sk))
3599 err = -EPERM;
3600 else if (val == TCP_REPAIR_ON) {
3601 tp->repair = 1;
3602 sk->sk_reuse = SK_FORCE_REUSE;
3603 tp->repair_queue = TCP_NO_QUEUE;
3604 } else if (val == TCP_REPAIR_OFF) {
3605 tp->repair = 0;
3606 sk->sk_reuse = SK_NO_REUSE;
3607 tcp_send_window_probe(sk);
3608 } else if (val == TCP_REPAIR_OFF_NO_WP) {
3609 tp->repair = 0;
3610 sk->sk_reuse = SK_NO_REUSE;
3611 } else
3612 err = -EINVAL;
3613
3614 break;
3615
3616 case TCP_REPAIR_QUEUE:
3617 if (!tp->repair)
3618 err = -EPERM;
3619 else if ((unsigned int)val < TCP_QUEUES_NR)
3620 tp->repair_queue = val;
3621 else
3622 err = -EINVAL;
3623 break;
3624
3625 case TCP_QUEUE_SEQ:
3626 if (sk->sk_state != TCP_CLOSE) {
3627 err = -EPERM;
3628 } else if (tp->repair_queue == TCP_SEND_QUEUE) {
3629 if (!tcp_rtx_queue_empty(sk))
3630 err = -EPERM;
3631 else
3632 WRITE_ONCE(tp->write_seq, val);
3633 } else if (tp->repair_queue == TCP_RECV_QUEUE) {
3634 if (tp->rcv_nxt != tp->copied_seq) {
3635 err = -EPERM;
3636 } else {
3637 WRITE_ONCE(tp->rcv_nxt, val);
3638 WRITE_ONCE(tp->copied_seq, val);
3639 }
3640 } else {
3641 err = -EINVAL;
3642 }
3643 break;
3644
3645 case TCP_REPAIR_OPTIONS:
3646 if (!tp->repair)
3647 err = -EINVAL;
3648 else if (sk->sk_state == TCP_ESTABLISHED)
3649 err = tcp_repair_options_est(sk, optval, optlen);
3650 else
3651 err = -EPERM;
3652 break;
3653
3654 case TCP_CORK:
3655 __tcp_sock_set_cork(sk, val);
3656 break;
3657
3658 case TCP_KEEPIDLE:
3659 err = tcp_sock_set_keepidle_locked(sk, val);
3660 break;
3661 case TCP_KEEPINTVL:
3662 if (val < 1 || val > MAX_TCP_KEEPINTVL)
3663 err = -EINVAL;
3664 else
3665 tp->keepalive_intvl = val * HZ;
3666 break;
3667 case TCP_KEEPCNT:
3668 if (val < 1 || val > MAX_TCP_KEEPCNT)
3669 err = -EINVAL;
3670 else
3671 tp->keepalive_probes = val;
3672 break;
3673 case TCP_SYNCNT:
3674 if (val < 1 || val > MAX_TCP_SYNCNT)
3675 err = -EINVAL;
3676 else
3677 icsk->icsk_syn_retries = val;
3678 break;
3679
3680 case TCP_SAVE_SYN:
3681
3682 if (val < 0 || val > 2)
3683 err = -EINVAL;
3684 else
3685 tp->save_syn = val;
3686 break;
3687
3688 case TCP_LINGER2:
3689 if (val < 0)
3690 tp->linger2 = -1;
3691 else if (val > TCP_FIN_TIMEOUT_MAX / HZ)
3692 tp->linger2 = TCP_FIN_TIMEOUT_MAX;
3693 else
3694 tp->linger2 = val * HZ;
3695 break;
3696
3697 case TCP_DEFER_ACCEPT:
3698
3699 icsk->icsk_accept_queue.rskq_defer_accept =
3700 secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ,
3701 TCP_RTO_MAX / HZ);
3702 break;
3703
3704 case TCP_WINDOW_CLAMP:
3705 err = tcp_set_window_clamp(sk, val);
3706 break;
3707
3708 case TCP_QUICKACK:
3709 __tcp_sock_set_quickack(sk, val);
3710 break;
3711
3712 #ifdef CONFIG_TCP_MD5SIG
3713 case TCP_MD5SIG:
3714 case TCP_MD5SIG_EXT:
3715 err = tp->af_specific->md5_parse(sk, optname, optval, optlen);
3716 break;
3717 #endif
3718 case TCP_USER_TIMEOUT:
3719
3720
3721
3722 if (val < 0)
3723 err = -EINVAL;
3724 else
3725 icsk->icsk_user_timeout = val;
3726 break;
3727
3728 case TCP_FASTOPEN:
3729 if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE |
3730 TCPF_LISTEN))) {
3731 tcp_fastopen_init_key_once(net);
3732
3733 fastopen_queue_tune(sk, val);
3734 } else {
3735 err = -EINVAL;
3736 }
3737 break;
3738 case TCP_FASTOPEN_CONNECT:
3739 if (val > 1 || val < 0) {
3740 err = -EINVAL;
3741 } else if (READ_ONCE(net->ipv4.sysctl_tcp_fastopen) &
3742 TFO_CLIENT_ENABLE) {
3743 if (sk->sk_state == TCP_CLOSE)
3744 tp->fastopen_connect = val;
3745 else
3746 err = -EINVAL;
3747 } else {
3748 err = -EOPNOTSUPP;
3749 }
3750 break;
3751 case TCP_FASTOPEN_NO_COOKIE:
3752 if (val > 1 || val < 0)
3753 err = -EINVAL;
3754 else if (!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
3755 err = -EINVAL;
3756 else
3757 tp->fastopen_no_cookie = val;
3758 break;
3759 case TCP_TIMESTAMP:
3760 if (!tp->repair)
3761 err = -EPERM;
3762 else
3763 tp->tsoffset = val - tcp_time_stamp_raw();
3764 break;
3765 case TCP_REPAIR_WINDOW:
3766 err = tcp_repair_set_window(tp, optval, optlen);
3767 break;
3768 case TCP_NOTSENT_LOWAT:
3769 tp->notsent_lowat = val;
3770 sk->sk_write_space(sk);
3771 break;
3772 case TCP_INQ:
3773 if (val > 1 || val < 0)
3774 err = -EINVAL;
3775 else
3776 tp->recvmsg_inq = val;
3777 break;
3778 case TCP_TX_DELAY:
3779 if (val)
3780 tcp_enable_tx_delay();
3781 tp->tcp_tx_delay = val;
3782 break;
3783 default:
3784 err = -ENOPROTOOPT;
3785 break;
3786 }
3787
3788 release_sock(sk);
3789 return err;
3790 }
3791
3792 int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
3793 unsigned int optlen)
3794 {
3795 const struct inet_connection_sock *icsk = inet_csk(sk);
3796
3797 if (level != SOL_TCP)
3798 return icsk->icsk_af_ops->setsockopt(sk, level, optname,
3799 optval, optlen);
3800 return do_tcp_setsockopt(sk, level, optname, optval, optlen);
3801 }
3802 EXPORT_SYMBOL(tcp_setsockopt);
3803
3804 static void tcp_get_info_chrono_stats(const struct tcp_sock *tp,
3805 struct tcp_info *info)
3806 {
3807 u64 stats[__TCP_CHRONO_MAX], total = 0;
3808 enum tcp_chrono i;
3809
3810 for (i = TCP_CHRONO_BUSY; i < __TCP_CHRONO_MAX; ++i) {
3811 stats[i] = tp->chrono_stat[i - 1];
3812 if (i == tp->chrono_type)
3813 stats[i] += tcp_jiffies32 - tp->chrono_start;
3814 stats[i] *= USEC_PER_SEC / HZ;
3815 total += stats[i];
3816 }
3817
3818 info->tcpi_busy_time = total;
3819 info->tcpi_rwnd_limited = stats[TCP_CHRONO_RWND_LIMITED];
3820 info->tcpi_sndbuf_limited = stats[TCP_CHRONO_SNDBUF_LIMITED];
3821 }
3822
3823
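/* Return information about the state of a TCP endpoint in struct
 * tcp_info format.
 */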
3824 void tcp_get_info(struct sock *sk, struct tcp_info *info)
3825 {
3826 const struct tcp_sock *tp = tcp_sk(sk);
3827 const struct inet_connection_sock *icsk = inet_csk(sk);
3828 unsigned long rate;
3829 u32 now;
3830 u64 rate64;
3831 bool slow;
3832
3833 memset(info, 0, sizeof(*info));
3834 if (sk->sk_type != SOCK_STREAM)
3835 return;
3836
3837 info->tcpi_state = inet_sk_state_load(sk);
3838
3839
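/* Report meaningful fields for all TCP states, including listeners. */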
3840 rate = READ_ONCE(sk->sk_pacing_rate);
3841 rate64 = (rate != ~0UL) ? rate : ~0ULL;
3842 info->tcpi_pacing_rate = rate64;
3843
3844 rate = READ_ONCE(sk->sk_max_pacing_rate);
3845 rate64 = (rate != ~0UL) ? rate : ~0ULL;
3846 info->tcpi_max_pacing_rate = rate64;
3847
3848 info->tcpi_reordering = tp->reordering;
3849 info->tcpi_snd_cwnd = tcp_snd_cwnd(tp);
3850
3851 if (info->tcpi_state == TCP_LISTEN) {
3852
3853
3854
3855
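/* For listeners, tcpi_unacked is reused for the number of children
 * ready to be accepted and tcpi_sacked for the maximum accept backlog.
 */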
3856 info->tcpi_unacked = READ_ONCE(sk->sk_ack_backlog);
3857 info->tcpi_sacked = READ_ONCE(sk->sk_max_ack_backlog);
3858 return;
3859 }
3860
3861 slow = lock_sock_fast(sk);
3862
3863 info->tcpi_ca_state = icsk->icsk_ca_state;
3864 info->tcpi_retransmits = icsk->icsk_retransmits;
3865 info->tcpi_probes = icsk->icsk_probes_out;
3866 info->tcpi_backoff = icsk->icsk_backoff;
3867
3868 if (tp->rx_opt.tstamp_ok)
3869 info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
3870 if (tcp_is_sack(tp))
3871 info->tcpi_options |= TCPI_OPT_SACK;
3872 if (tp->rx_opt.wscale_ok) {
3873 info->tcpi_options |= TCPI_OPT_WSCALE;
3874 info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
3875 info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
3876 }
3877
3878 if (tp->ecn_flags & TCP_ECN_OK)
3879 info->tcpi_options |= TCPI_OPT_ECN;
3880 if (tp->ecn_flags & TCP_ECN_SEEN)
3881 info->tcpi_options |= TCPI_OPT_ECN_SEEN;
3882 if (tp->syn_data_acked)
3883 info->tcpi_options |= TCPI_OPT_SYN_DATA;
3884
3885 info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
3886 info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato);
3887 info->tcpi_snd_mss = tp->mss_cache;
3888 info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;
3889
3890 info->tcpi_unacked = tp->packets_out;
3891 info->tcpi_sacked = tp->sacked_out;
3892
3893 info->tcpi_lost = tp->lost_out;
3894 info->tcpi_retrans = tp->retrans_out;
3895
3896 now = tcp_jiffies32;
3897 info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
3898 info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime);
3899 info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
3900
3901 info->tcpi_pmtu = icsk->icsk_pmtu_cookie;
3902 info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
3903 info->tcpi_rtt = tp->srtt_us >> 3;
3904 info->tcpi_rttvar = tp->mdev_us >> 2;
3905 info->tcpi_snd_ssthresh = tp->snd_ssthresh;
3906 info->tcpi_advmss = tp->advmss;
3907
3908 info->tcpi_rcv_rtt = tp->rcv_rtt_est.rtt_us >> 3;
3909 info->tcpi_rcv_space = tp->rcvq_space.space;
3910
3911 info->tcpi_total_retrans = tp->total_retrans;
3912
3913 info->tcpi_bytes_acked = tp->bytes_acked;
3914 info->tcpi_bytes_received = tp->bytes_received;
3915 info->tcpi_notsent_bytes = max_t(int, 0, tp->write_seq - tp->snd_nxt);
3916 tcp_get_info_chrono_stats(tp, info);
3917
3918 info->tcpi_segs_out = tp->segs_out;
3919
3920
3921 info->tcpi_segs_in = READ_ONCE(tp->segs_in);
3922 info->tcpi_data_segs_in = READ_ONCE(tp->data_segs_in);
3923
3924 info->tcpi_min_rtt = tcp_min_rtt(tp);
3925 info->tcpi_data_segs_out = tp->data_segs_out;
3926
3927 info->tcpi_delivery_rate_app_limited = tp->rate_app_limited ? 1 : 0;
3928 rate64 = tcp_compute_delivery_rate(tp);
3929 if (rate64)
3930 info->tcpi_delivery_rate = rate64;
3931 info->tcpi_delivered = tp->delivered;
3932 info->tcpi_delivered_ce = tp->delivered_ce;
3933 info->tcpi_bytes_sent = tp->bytes_sent;
3934 info->tcpi_bytes_retrans = tp->bytes_retrans;
3935 info->tcpi_dsack_dups = tp->dsack_dups;
3936 info->tcpi_reord_seen = tp->reord_seen;
3937 info->tcpi_rcv_ooopack = tp->rcv_ooopack;
3938 info->tcpi_snd_wnd = tp->snd_wnd;
3939 info->tcpi_fastopen_client_fail = tp->fastopen_client_fail;
3940 unlock_sock_fast(sk, slow);
3941 }
3942 EXPORT_SYMBOL_GPL(tcp_get_info);
3943
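/* Upper bound on the OPT_STATS skb size: one netlink attribute per
 * TCP_NLA_* value emitted by tcp_get_timestamping_opt_stats() below.
 */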
3944 static size_t tcp_opt_stats_get_size(void)
3945 {
3946 return
3947 nla_total_size_64bit(sizeof(u64)) +
3948 nla_total_size_64bit(sizeof(u64)) +
3949 nla_total_size_64bit(sizeof(u64)) +
3950 nla_total_size_64bit(sizeof(u64)) +
3951 nla_total_size_64bit(sizeof(u64)) +
3952 nla_total_size_64bit(sizeof(u64)) +
3953 nla_total_size_64bit(sizeof(u64)) +
3954 nla_total_size(sizeof(u32)) +
3955 nla_total_size(sizeof(u32)) +
3956 nla_total_size(sizeof(u32)) +
3957 nla_total_size(sizeof(u8)) +
3958 nla_total_size(sizeof(u8)) +
3959 nla_total_size(sizeof(u32)) +
3960 nla_total_size(sizeof(u8)) +
3961 nla_total_size(sizeof(u32)) +
3962 nla_total_size(sizeof(u32)) +
3963 nla_total_size(sizeof(u32)) +
3964 nla_total_size_64bit(sizeof(u64)) +
3965 nla_total_size_64bit(sizeof(u64)) +
3966 nla_total_size(sizeof(u32)) +
3967 nla_total_size(sizeof(u32)) +
3968 nla_total_size(sizeof(u32)) +
3969 nla_total_size(sizeof(u16)) +
3970 nla_total_size(sizeof(u32)) +
3971 nla_total_size_64bit(sizeof(u64)) +
3972 nla_total_size(sizeof(u8)) +
3973 0;
3974 }
3975
3976
3977 static u8 tcp_skb_ttl_or_hop_limit(const struct sk_buff *skb)
3978 {
3979 if (skb->protocol == htons(ETH_P_IP))
3980 return ip_hdr(skb)->ttl;
3981 else if (skb->protocol == htons(ETH_P_IPV6))
3982 return ipv6_hdr(skb)->hop_limit;
3983 else
3984 return 0;
3985 }
3986
3987 struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk,
3988 const struct sk_buff *orig_skb,
3989 const struct sk_buff *ack_skb)
3990 {
3991 const struct tcp_sock *tp = tcp_sk(sk);
3992 struct sk_buff *stats;
3993 struct tcp_info info;
3994 unsigned long rate;
3995 u64 rate64;
3996
3997 stats = alloc_skb(tcp_opt_stats_get_size(), GFP_ATOMIC);
3998 if (!stats)
3999 return NULL;
4000
4001 tcp_get_info_chrono_stats(tp, &info);
4002 nla_put_u64_64bit(stats, TCP_NLA_BUSY,
4003 info.tcpi_busy_time, TCP_NLA_PAD);
4004 nla_put_u64_64bit(stats, TCP_NLA_RWND_LIMITED,
4005 info.tcpi_rwnd_limited, TCP_NLA_PAD);
4006 nla_put_u64_64bit(stats, TCP_NLA_SNDBUF_LIMITED,
4007 info.tcpi_sndbuf_limited, TCP_NLA_PAD);
4008 nla_put_u64_64bit(stats, TCP_NLA_DATA_SEGS_OUT,
4009 tp->data_segs_out, TCP_NLA_PAD);
4010 nla_put_u64_64bit(stats, TCP_NLA_TOTAL_RETRANS,
4011 tp->total_retrans, TCP_NLA_PAD);
4012
4013 rate = READ_ONCE(sk->sk_pacing_rate);
4014 rate64 = (rate != ~0UL) ? rate : ~0ULL;
4015 nla_put_u64_64bit(stats, TCP_NLA_PACING_RATE, rate64, TCP_NLA_PAD);
4016
4017 rate64 = tcp_compute_delivery_rate(tp);
4018 nla_put_u64_64bit(stats, TCP_NLA_DELIVERY_RATE, rate64, TCP_NLA_PAD);
4019
4020 nla_put_u32(stats, TCP_NLA_SND_CWND, tcp_snd_cwnd(tp));
4021 nla_put_u32(stats, TCP_NLA_REORDERING, tp->reordering);
4022 nla_put_u32(stats, TCP_NLA_MIN_RTT, tcp_min_rtt(tp));
4023
4024 nla_put_u8(stats, TCP_NLA_RECUR_RETRANS, inet_csk(sk)->icsk_retransmits);
4025 nla_put_u8(stats, TCP_NLA_DELIVERY_RATE_APP_LMT, !!tp->rate_app_limited);
4026 nla_put_u32(stats, TCP_NLA_SND_SSTHRESH, tp->snd_ssthresh);
4027 nla_put_u32(stats, TCP_NLA_DELIVERED, tp->delivered);
4028 nla_put_u32(stats, TCP_NLA_DELIVERED_CE, tp->delivered_ce);
4029
4030 nla_put_u32(stats, TCP_NLA_SNDQ_SIZE, tp->write_seq - tp->snd_una);
4031 nla_put_u8(stats, TCP_NLA_CA_STATE, inet_csk(sk)->icsk_ca_state);
4032
4033 nla_put_u64_64bit(stats, TCP_NLA_BYTES_SENT, tp->bytes_sent,
4034 TCP_NLA_PAD);
4035 nla_put_u64_64bit(stats, TCP_NLA_BYTES_RETRANS, tp->bytes_retrans,
4036 TCP_NLA_PAD);
4037 nla_put_u32(stats, TCP_NLA_DSACK_DUPS, tp->dsack_dups);
4038 nla_put_u32(stats, TCP_NLA_REORD_SEEN, tp->reord_seen);
4039 nla_put_u32(stats, TCP_NLA_SRTT, tp->srtt_us >> 3);
4040 nla_put_u16(stats, TCP_NLA_TIMEOUT_REHASH, tp->timeout_rehash);
4041 nla_put_u32(stats, TCP_NLA_BYTES_NOTSENT,
4042 max_t(int, 0, tp->write_seq - tp->snd_nxt));
4043 nla_put_u64_64bit(stats, TCP_NLA_EDT, orig_skb->skb_mstamp_ns,
4044 TCP_NLA_PAD);
4045 if (ack_skb)
4046 nla_put_u8(stats, TCP_NLA_TTL,
4047 tcp_skb_ttl_or_hop_limit(ack_skb));
4048
4049 return stats;
4050 }
4051
4052 static int do_tcp_getsockopt(struct sock *sk, int level,
4053 int optname, char __user *optval, int __user *optlen)
4054 {
4055 struct inet_connection_sock *icsk = inet_csk(sk);
4056 struct tcp_sock *tp = tcp_sk(sk);
4057 struct net *net = sock_net(sk);
4058 int val, len;
4059
4060 if (get_user(len, optlen))
4061 return -EFAULT;
4062
4063 len = min_t(unsigned int, len, sizeof(int));
4064
4065 if (len < 0)
4066 return -EINVAL;
4067
4068 switch (optname) {
4069 case TCP_MAXSEG:
4070 val = tp->mss_cache;
4071 if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
4072 val = tp->rx_opt.user_mss;
4073 if (tp->repair)
4074 val = tp->rx_opt.mss_clamp;
4075 break;
4076 case TCP_NODELAY:
4077 val = !!(tp->nonagle&TCP_NAGLE_OFF);
4078 break;
4079 case TCP_CORK:
4080 val = !!(tp->nonagle&TCP_NAGLE_CORK);
4081 break;
4082 case TCP_KEEPIDLE:
4083 val = keepalive_time_when(tp) / HZ;
4084 break;
4085 case TCP_KEEPINTVL:
4086 val = keepalive_intvl_when(tp) / HZ;
4087 break;
4088 case TCP_KEEPCNT:
4089 val = keepalive_probes(tp);
4090 break;
4091 case TCP_SYNCNT:
4092 val = icsk->icsk_syn_retries ? :
4093 READ_ONCE(net->ipv4.sysctl_tcp_syn_retries);
4094 break;
4095 case TCP_LINGER2:
4096 val = tp->linger2;
4097 if (val >= 0)
4098 val = (val ? : READ_ONCE(net->ipv4.sysctl_tcp_fin_timeout)) / HZ;
4099 break;
4100 case TCP_DEFER_ACCEPT:
4101 val = retrans_to_secs(icsk->icsk_accept_queue.rskq_defer_accept,
4102 TCP_TIMEOUT_INIT / HZ, TCP_RTO_MAX / HZ);
4103 break;
4104 case TCP_WINDOW_CLAMP:
4105 val = tp->window_clamp;
4106 break;
4107 case TCP_INFO: {
4108 struct tcp_info info;
4109
4110 if (get_user(len, optlen))
4111 return -EFAULT;
4112
4113 tcp_get_info(sk, &info);
4114
4115 len = min_t(unsigned int, len, sizeof(info));
4116 if (put_user(len, optlen))
4117 return -EFAULT;
4118 if (copy_to_user(optval, &info, len))
4119 return -EFAULT;
4120 return 0;
4121 }
4122 case TCP_CC_INFO: {
4123 const struct tcp_congestion_ops *ca_ops;
4124 union tcp_cc_info info;
4125 size_t sz = 0;
4126 int attr;
4127
4128 if (get_user(len, optlen))
4129 return -EFAULT;
4130
4131 ca_ops = icsk->icsk_ca_ops;
4132 if (ca_ops && ca_ops->get_info)
4133 sz = ca_ops->get_info(sk, ~0U, &attr, &info);
4134
4135 len = min_t(unsigned int, len, sz);
4136 if (put_user(len, optlen))
4137 return -EFAULT;
4138 if (copy_to_user(optval, &info, len))
4139 return -EFAULT;
4140 return 0;
4141 }
4142 case TCP_QUICKACK:
4143 val = !inet_csk_in_pingpong_mode(sk);
4144 break;
4145
4146 case TCP_CONGESTION:
4147 if (get_user(len, optlen))
4148 return -EFAULT;
4149 len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
4150 if (put_user(len, optlen))
4151 return -EFAULT;
4152 if (copy_to_user(optval, icsk->icsk_ca_ops->name, len))
4153 return -EFAULT;
4154 return 0;
4155
4156 case TCP_ULP:
4157 if (get_user(len, optlen))
4158 return -EFAULT;
4159 len = min_t(unsigned int, len, TCP_ULP_NAME_MAX);
4160 if (!icsk->icsk_ulp_ops) {
4161 if (put_user(0, optlen))
4162 return -EFAULT;
4163 return 0;
4164 }
4165 if (put_user(len, optlen))
4166 return -EFAULT;
4167 if (copy_to_user(optval, icsk->icsk_ulp_ops->name, len))
4168 return -EFAULT;
4169 return 0;
4170
4171 case TCP_FASTOPEN_KEY: {
4172 u64 key[TCP_FASTOPEN_KEY_BUF_LENGTH / sizeof(u64)];
4173 unsigned int key_len;
4174
4175 if (get_user(len, optlen))
4176 return -EFAULT;
4177
4178 key_len = tcp_fastopen_get_cipher(net, icsk, key) *
4179 TCP_FASTOPEN_KEY_LENGTH;
4180 len = min_t(unsigned int, len, key_len);
4181 if (put_user(len, optlen))
4182 return -EFAULT;
4183 if (copy_to_user(optval, key, len))
4184 return -EFAULT;
4185 return 0;
4186 }
4187 case TCP_THIN_LINEAR_TIMEOUTS:
4188 val = tp->thin_lto;
4189 break;
4190
4191 case TCP_THIN_DUPACK:
4192 val = 0;
4193 break;
4194
4195 case TCP_REPAIR:
4196 val = tp->repair;
4197 break;
4198
4199 case TCP_REPAIR_QUEUE:
4200 if (tp->repair)
4201 val = tp->repair_queue;
4202 else
4203 return -EINVAL;
4204 break;
4205
4206 case TCP_REPAIR_WINDOW: {
4207 struct tcp_repair_window opt;
4208
4209 if (get_user(len, optlen))
4210 return -EFAULT;
4211
4212 if (len != sizeof(opt))
4213 return -EINVAL;
4214
4215 if (!tp->repair)
4216 return -EPERM;
4217
4218 opt.snd_wl1 = tp->snd_wl1;
4219 opt.snd_wnd = tp->snd_wnd;
4220 opt.max_window = tp->max_window;
4221 opt.rcv_wnd = tp->rcv_wnd;
4222 opt.rcv_wup = tp->rcv_wup;
4223
4224 if (copy_to_user(optval, &opt, len))
4225 return -EFAULT;
4226 return 0;
4227 }
4228 case TCP_QUEUE_SEQ:
4229 if (tp->repair_queue == TCP_SEND_QUEUE)
4230 val = tp->write_seq;
4231 else if (tp->repair_queue == TCP_RECV_QUEUE)
4232 val = tp->rcv_nxt;
4233 else
4234 return -EINVAL;
4235 break;
4236
4237 case TCP_USER_TIMEOUT:
4238 val = icsk->icsk_user_timeout;
4239 break;
4240
4241 case TCP_FASTOPEN:
4242 val = icsk->icsk_accept_queue.fastopenq.max_qlen;
4243 break;
4244
4245 case TCP_FASTOPEN_CONNECT:
4246 val = tp->fastopen_connect;
4247 break;
4248
4249 case TCP_FASTOPEN_NO_COOKIE:
4250 val = tp->fastopen_no_cookie;
4251 break;
4252
4253 case TCP_TX_DELAY:
4254 val = tp->tcp_tx_delay;
4255 break;
4256
4257 case TCP_TIMESTAMP:
4258 val = tcp_time_stamp_raw() + tp->tsoffset;
4259 break;
4260 case TCP_NOTSENT_LOWAT:
4261 val = tp->notsent_lowat;
4262 break;
4263 case TCP_INQ:
4264 val = tp->recvmsg_inq;
4265 break;
4266 case TCP_SAVE_SYN:
4267 val = tp->save_syn;
4268 break;
4269 case TCP_SAVED_SYN: {
4270 if (get_user(len, optlen))
4271 return -EFAULT;
4272
4273 lock_sock(sk);
4274 if (tp->saved_syn) {
4275 if (len < tcp_saved_syn_len(tp->saved_syn)) {
4276 if (put_user(tcp_saved_syn_len(tp->saved_syn),
4277 optlen)) {
4278 release_sock(sk);
4279 return -EFAULT;
4280 }
4281 release_sock(sk);
4282 return -EINVAL;
4283 }
4284 len = tcp_saved_syn_len(tp->saved_syn);
4285 if (put_user(len, optlen)) {
4286 release_sock(sk);
4287 return -EFAULT;
4288 }
4289 if (copy_to_user(optval, tp->saved_syn->data, len)) {
4290 release_sock(sk);
4291 return -EFAULT;
4292 }
4293 tcp_saved_syn_free(tp);
4294 release_sock(sk);
4295 } else {
4296 release_sock(sk);
4297 len = 0;
4298 if (put_user(len, optlen))
4299 return -EFAULT;
4300 }
4301 return 0;
4302 }
4303 #ifdef CONFIG_MMU
4304 case TCP_ZEROCOPY_RECEIVE: {
4305 struct scm_timestamping_internal tss;
4306 struct tcp_zerocopy_receive zc = {};
4307 int err;
4308
4309 if (get_user(len, optlen))
4310 return -EFAULT;
4311 if (len < 0 ||
4312 len < offsetofend(struct tcp_zerocopy_receive, length))
4313 return -EINVAL;
4314 if (unlikely(len > sizeof(zc))) {
4315 err = check_zeroed_user(optval + sizeof(zc),
4316 len - sizeof(zc));
4317 if (err < 1)
4318 return err == 0 ? -EINVAL : err;
4319 len = sizeof(zc);
4320 if (put_user(len, optlen))
4321 return -EFAULT;
4322 }
4323 if (copy_from_user(&zc, optval, len))
4324 return -EFAULT;
4325 if (zc.reserved)
4326 return -EINVAL;
4327 if (zc.msg_flags & ~(TCP_VALID_ZC_MSG_FLAGS))
4328 return -EINVAL;
4329 lock_sock(sk);
4330 err = tcp_zerocopy_receive(sk, &zc, &tss);
4331 err = BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sk, level, optname,
4332 &zc, &len, err);
4333 release_sock(sk);
4334 if (len >= offsetofend(struct tcp_zerocopy_receive, msg_flags))
4335 goto zerocopy_rcv_cmsg;
4336 switch (len) {
4337 case offsetofend(struct tcp_zerocopy_receive, msg_flags):
4338 goto zerocopy_rcv_cmsg;
4339 case offsetofend(struct tcp_zerocopy_receive, msg_controllen):
4340 case offsetofend(struct tcp_zerocopy_receive, msg_control):
4341 case offsetofend(struct tcp_zerocopy_receive, flags):
4342 case offsetofend(struct tcp_zerocopy_receive, copybuf_len):
4343 case offsetofend(struct tcp_zerocopy_receive, copybuf_address):
4344 case offsetofend(struct tcp_zerocopy_receive, err):
4345 goto zerocopy_rcv_sk_err;
4346 case offsetofend(struct tcp_zerocopy_receive, inq):
4347 goto zerocopy_rcv_inq;
4348 case offsetofend(struct tcp_zerocopy_receive, length):
4349 default:
4350 goto zerocopy_rcv_out;
4351 }
4352 zerocopy_rcv_cmsg:
4353 if (zc.msg_flags & TCP_CMSG_TS)
4354 tcp_zc_finalize_rx_tstamp(sk, &zc, &tss);
4355 else
4356 zc.msg_flags = 0;
4357 zerocopy_rcv_sk_err:
4358 if (!err)
4359 zc.err = sock_error(sk);
4360 zerocopy_rcv_inq:
4361 zc.inq = tcp_inq_hint(sk);
4362 zerocopy_rcv_out:
4363 if (!err && copy_to_user(optval, &zc, len))
4364 err = -EFAULT;
4365 return err;
4366 }
4367 #endif
4368 default:
4369 return -ENOPROTOOPT;
4370 }
4371
4372 if (put_user(len, optlen))
4373 return -EFAULT;
4374 if (copy_to_user(optval, &val, len))
4375 return -EFAULT;
4376 return 0;
4377 }
4378
4379 bool tcp_bpf_bypass_getsockopt(int level, int optname)
4380 {
4381
4382
4383
4384 if (level == SOL_TCP && optname == TCP_ZEROCOPY_RECEIVE)
4385 return true;
4386
4387 return false;
4388 }
4389 EXPORT_SYMBOL(tcp_bpf_bypass_getsockopt);
4390
4391 int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
4392 int __user *optlen)
4393 {
4394 struct inet_connection_sock *icsk = inet_csk(sk);
4395
4396 if (level != SOL_TCP)
4397 return icsk->icsk_af_ops->getsockopt(sk, level, optname,
4398 optval, optlen);
4399 return do_tcp_getsockopt(sk, level, optname, optval, optlen);
4400 }
4401 EXPORT_SYMBOL(tcp_getsockopt);
4402
4403 #ifdef CONFIG_TCP_MD5SIG
4404 static DEFINE_PER_CPU(struct tcp_md5sig_pool, tcp_md5sig_pool);
4405 static DEFINE_MUTEX(tcp_md5sig_mutex);
4406 static bool tcp_md5sig_pool_populated = false;
4407
4408 static void __tcp_alloc_md5sig_pool(void)
4409 {
4410 struct crypto_ahash *hash;
4411 int cpu;
4412
4413 hash = crypto_alloc_ahash("md5", 0, CRYPTO_ALG_ASYNC);
4414 if (IS_ERR(hash))
4415 return;
4416
4417 for_each_possible_cpu(cpu) {
4418 void *scratch = per_cpu(tcp_md5sig_pool, cpu).scratch;
4419 struct ahash_request *req;
4420
4421 if (!scratch) {
4422 scratch = kmalloc_node(sizeof(union tcp_md5sum_block) +
4423 sizeof(struct tcphdr),
4424 GFP_KERNEL,
4425 cpu_to_node(cpu));
4426 if (!scratch)
4427 return;
4428 per_cpu(tcp_md5sig_pool, cpu).scratch = scratch;
4429 }
4430 if (per_cpu(tcp_md5sig_pool, cpu).md5_req)
4431 continue;
4432
4433 req = ahash_request_alloc(hash, GFP_KERNEL);
4434 if (!req)
4435 return;
4436
4437 ahash_request_set_callback(req, 0, NULL, NULL);
4438
4439 per_cpu(tcp_md5sig_pool, cpu).md5_req = req;
4440 }
4441
4442
4443
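/* Make all pool writes visible before setting
 * tcp_md5sig_pool_populated; pairs with the smp_rmb() in
 * tcp_get_md5sig_pool().
 */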
4444 smp_wmb();
4445 tcp_md5sig_pool_populated = true;
4446 }
4447
4448 bool tcp_alloc_md5sig_pool(void)
4449 {
4450 if (unlikely(!tcp_md5sig_pool_populated)) {
4451 mutex_lock(&tcp_md5sig_mutex);
4452
4453 if (!tcp_md5sig_pool_populated) {
4454 __tcp_alloc_md5sig_pool();
4455 if (tcp_md5sig_pool_populated)
4456 static_branch_inc(&tcp_md5_needed);
4457 }
4458
4459 mutex_unlock(&tcp_md5sig_mutex);
4460 }
4461 return tcp_md5sig_pool_populated;
4462 }
4463 EXPORT_SYMBOL(tcp_alloc_md5sig_pool);
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473 struct tcp_md5sig_pool *tcp_get_md5sig_pool(void)
4474 {
4475 local_bh_disable();
4476
4477 if (tcp_md5sig_pool_populated) {
4478
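/* Pairs with the smp_wmb() in __tcp_alloc_md5sig_pool(). */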
4479 smp_rmb();
4480 return this_cpu_ptr(&tcp_md5sig_pool);
4481 }
4482 local_bh_enable();
4483 return NULL;
4484 }
4485 EXPORT_SYMBOL(tcp_get_md5sig_pool);
4486
4487 int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
4488 const struct sk_buff *skb, unsigned int header_len)
4489 {
4490 struct scatterlist sg;
4491 const struct tcphdr *tp = tcp_hdr(skb);
4492 struct ahash_request *req = hp->md5_req;
4493 unsigned int i;
4494 const unsigned int head_data_len = skb_headlen(skb) > header_len ?
4495 skb_headlen(skb) - header_len : 0;
4496 const struct skb_shared_info *shi = skb_shinfo(skb);
4497 struct sk_buff *frag_iter;
4498
4499 sg_init_table(&sg, 1);
4500
4501 sg_set_buf(&sg, ((u8 *) tp) + header_len, head_data_len);
4502 ahash_request_set_crypt(req, &sg, NULL, head_data_len);
4503 if (crypto_ahash_update(req))
4504 return 1;
4505
4506 for (i = 0; i < shi->nr_frags; ++i) {
4507 const skb_frag_t *f = &shi->frags[i];
4508 unsigned int offset = skb_frag_off(f);
4509 struct page *page = skb_frag_page(f) + (offset >> PAGE_SHIFT);
4510
4511 sg_set_page(&sg, page, skb_frag_size(f),
4512 offset_in_page(offset));
4513 ahash_request_set_crypt(req, &sg, NULL, skb_frag_size(f));
4514 if (crypto_ahash_update(req))
4515 return 1;
4516 }
4517
4518 skb_walk_frags(skb, frag_iter)
4519 if (tcp_md5_hash_skb_data(hp, frag_iter, 0))
4520 return 1;
4521
4522 return 0;
4523 }
4524 EXPORT_SYMBOL(tcp_md5_hash_skb_data);
4525
4526 int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, const struct tcp_md5sig_key *key)
4527 {
4528 u8 keylen = READ_ONCE(key->keylen);
4529 struct scatterlist sg;
4530
4531 sg_init_one(&sg, key->key, keylen);
4532 ahash_request_set_crypt(hp->md5_req, &sg, NULL, keylen);
4533
4534
4535 return data_race(crypto_ahash_update(hp->md5_req));
4536 }
4537 EXPORT_SYMBOL(tcp_md5_hash_key);
4538
4539
4540 enum skb_drop_reason
4541 tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb,
4542 const void *saddr, const void *daddr,
4543 int family, int dif, int sdif)
4544 {
4545
4546
4547
4548
4549
4550
4551
4552
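/* This runs for every incoming segment, so keep it cheap. Three drop
 * cases: no MD5 option when one is expected, an MD5 option when none
 * is expected, and a hash that does not verify.
 */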
4553 const __u8 *hash_location = NULL;
4554 struct tcp_md5sig_key *hash_expected;
4555 const struct tcphdr *th = tcp_hdr(skb);
4556 struct tcp_sock *tp = tcp_sk(sk);
4557 int genhash, l3index;
4558 u8 newhash[16];
4559
4560
4561
4562
4563 l3index = sdif ? dif : 0;
4564
4565 hash_expected = tcp_md5_do_lookup(sk, l3index, saddr, family);
4566 hash_location = tcp_parse_md5sig_option(th);
4567
4568
4569 if (!hash_expected && !hash_location)
4570 return SKB_NOT_DROPPED_YET;
4571
4572 if (hash_expected && !hash_location) {
4573 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
4574 return SKB_DROP_REASON_TCP_MD5NOTFOUND;
4575 }
4576
4577 if (!hash_expected && hash_location) {
4578 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
4579 return SKB_DROP_REASON_TCP_MD5UNEXPECTED;
4580 }
4581
4582
4583
4584
4585
4586 if (family == AF_INET)
4587 genhash = tcp_v4_md5_hash_skb(newhash,
4588 hash_expected,
4589 NULL, skb);
4590 else
4591 genhash = tp->af_specific->calc_md5_hash(newhash,
4592 hash_expected,
4593 NULL, skb);
4594
4595 if (genhash || memcmp(hash_location, newhash, 16) != 0) {
4596 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
4597 if (family == AF_INET) {
4598 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n",
4599 saddr, ntohs(th->source),
4600 daddr, ntohs(th->dest),
4601 genhash ? " tcp_v4_calc_md5_hash failed"
4602 : "", l3index);
4603 } else {
4604 net_info_ratelimited("MD5 Hash %s for [%pI6c]:%u->[%pI6c]:%u L3 index %d\n",
4605 genhash ? "failed" : "mismatch",
4606 saddr, ntohs(th->source),
4607 daddr, ntohs(th->dest), l3index);
4608 }
4609 return SKB_DROP_REASON_TCP_MD5FAILURE;
4610 }
4611 return SKB_NOT_DROPPED_YET;
4612 }
4613 EXPORT_SYMBOL(tcp_inbound_md5_hash);
4614
4615 #endif
4616
4617 void tcp_done(struct sock *sk)
4618 {
4619 struct request_sock *req;
4620
4621
4622
4623
4624
4625 req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk, 1);
4626
4627 if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
4628 TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
4629
4630 tcp_set_state(sk, TCP_CLOSE);
4631 tcp_clear_xmit_timers(sk);
4632 if (req)
4633 reqsk_fastopen_remove(sk, req, false);
4634
4635 sk->sk_shutdown = SHUTDOWN_MASK;
4636
4637 if (!sock_flag(sk, SOCK_DEAD))
4638 sk->sk_state_change(sk);
4639 else
4640 inet_csk_destroy_sock(sk);
4641 }
4642 EXPORT_SYMBOL_GPL(tcp_done);
4643
4644 int tcp_abort(struct sock *sk, int err)
4645 {
4646 int state = inet_sk_state_load(sk);
4647
4648 if (state == TCP_NEW_SYN_RECV) {
4649 struct request_sock *req = inet_reqsk(sk);
4650
4651 local_bh_disable();
4652 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
4653 local_bh_enable();
4654 return 0;
4655 }
4656 if (state == TCP_TIME_WAIT) {
4657 struct inet_timewait_sock *tw = inet_twsk(sk);
4658
4659 refcount_inc(&tw->tw_refcnt);
4660 local_bh_disable();
4661 inet_twsk_deschedule_put(tw);
4662 local_bh_enable();
4663 return 0;
4664 }
4665
4666
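/* Avoid racing with user-space initiated closes such as tcp_close(). */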
4667 lock_sock(sk);
4668
4669 if (sk->sk_state == TCP_LISTEN) {
4670 tcp_set_state(sk, TCP_CLOSE);
4671 inet_csk_listen_stop(sk);
4672 }
4673
4674
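/* Avoid racing with closes running in BH context, e.g.
 * inet_csk_listen_stop().
 */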
4675 local_bh_disable();
4676 bh_lock_sock(sk);
4677
4678 if (!sock_flag(sk, SOCK_DEAD)) {
4679 sk->sk_err = err;
4680
4681 smp_wmb();
4682 sk_error_report(sk);
4683 if (tcp_need_reset(sk->sk_state))
4684 tcp_send_active_reset(sk, GFP_ATOMIC);
4685 tcp_done(sk);
4686 }
4687
4688 bh_unlock_sock(sk);
4689 local_bh_enable();
4690 tcp_write_queue_purge(sk);
4691 release_sock(sk);
4692 return 0;
4693 }
4694 EXPORT_SYMBOL_GPL(tcp_abort);
4695
4696 extern struct tcp_congestion_ops tcp_reno;
4697
4698 static __initdata unsigned long thash_entries;
4699 static int __init set_thash_entries(char *str)
4700 {
4701 ssize_t ret;
4702
4703 if (!str)
4704 return 0;
4705
4706 ret = kstrtoul(str, 0, &thash_entries);
4707 if (ret)
4708 return 0;
4709
4710 return 1;
4711 }
4712 __setup("thash_entries=", set_thash_entries);
4713
4714 static void __init tcp_init_mem(void)
4715 {
4716 unsigned long limit = nr_free_buffer_pages() / 16;
4717
4718 limit = max(limit, 128UL);
4719 sysctl_tcp_mem[0] = limit / 4 * 3;
4720 sysctl_tcp_mem[1] = limit;
4721 sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2;
4722 }
4723
4724 void __init tcp_init(void)
4725 {
4726 int max_rshare, max_wshare, cnt;
4727 unsigned long limit;
4728 unsigned int i;
4729
4730 BUILD_BUG_ON(TCP_MIN_SND_MSS <= MAX_TCP_OPTION_SPACE);
4731 BUILD_BUG_ON(sizeof(struct tcp_skb_cb) >
4732 sizeof_field(struct sk_buff, cb));
4733
4734 percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL);
4735
4736 timer_setup(&tcp_orphan_timer, tcp_orphan_update, TIMER_DEFERRABLE);
4737 mod_timer(&tcp_orphan_timer, jiffies + TCP_ORPHAN_TIMER_PERIOD);
4738
4739 inet_hashinfo2_init(&tcp_hashinfo, "tcp_listen_portaddr_hash",
4740 thash_entries, 21,
4741 0, 64 * 1024);
4742 tcp_hashinfo.bind_bucket_cachep =
4743 kmem_cache_create("tcp_bind_bucket",
4744 sizeof(struct inet_bind_bucket), 0,
4745 SLAB_HWCACHE_ALIGN | SLAB_PANIC |
4746 SLAB_ACCOUNT,
4747 NULL);
4748
4749
4750
4751
4752
4753
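/* Size and allocate the main established and bind bucket hash tables. */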
4754 tcp_hashinfo.ehash =
4755 alloc_large_system_hash("TCP established",
4756 sizeof(struct inet_ehash_bucket),
4757 thash_entries,
4758 17,
4759 0,
4760 NULL,
4761 &tcp_hashinfo.ehash_mask,
4762 0,
4763 thash_entries ? 0 : 512 * 1024);
4764 for (i = 0; i <= tcp_hashinfo.ehash_mask; i++)
4765 INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i);
4766
4767 if (inet_ehash_locks_alloc(&tcp_hashinfo))
4768 panic("TCP: failed to alloc ehash_locks");
4769 tcp_hashinfo.bhash =
4770 alloc_large_system_hash("TCP bind",
4771 sizeof(struct inet_bind_hashbucket),
4772 tcp_hashinfo.ehash_mask + 1,
4773 17,
4774 0,
4775 &tcp_hashinfo.bhash_size,
4776 NULL,
4777 0,
4778 64 * 1024);
4779 tcp_hashinfo.bhash_size = 1U << tcp_hashinfo.bhash_size;
4780 for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
4781 spin_lock_init(&tcp_hashinfo.bhash[i].lock);
4782 INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
4783 }
4784
4785
4786 cnt = tcp_hashinfo.ehash_mask + 1;
4787 sysctl_tcp_max_orphans = cnt / 2;
4788
4789 tcp_init_mem();
4790
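/* Cap the per-socket send/receive buffer limits at roughly 1/128 of
 * available memory.
 */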
4791 limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7);
4792 max_wshare = min(4UL*1024*1024, limit);
4793 max_rshare = min(6UL*1024*1024, limit);
4794
4795 init_net.ipv4.sysctl_tcp_wmem[0] = PAGE_SIZE;
4796 init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024;
4797 init_net.ipv4.sysctl_tcp_wmem[2] = max(64*1024, max_wshare);
4798
4799 init_net.ipv4.sysctl_tcp_rmem[0] = PAGE_SIZE;
4800 init_net.ipv4.sysctl_tcp_rmem[1] = 131072;
4801 init_net.ipv4.sysctl_tcp_rmem[2] = max(131072, max_rshare);
4802
4803 pr_info("Hash tables configured (established %u bind %u)\n",
4804 tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
4805
4806 tcp_v4_init();
4807 tcp_metrics_init();
4808 BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0);
4809 tcp_tasklet_init();
4810 mptcp_init();
4811 }