// SPDX-License-Identifier: GPL-2.0-only
/*
 * Implementation of the Transmission Control Protocol (TCP).
 *
 * IPv4 specific functions (code split out of linux/ipv4/tcp.c,
 * tcp_input.c and tcp_output.c); see tcp.c for author information.
 */
0048 #define pr_fmt(fmt) "TCP: " fmt
0049
0050 #include <linux/bottom_half.h>
0051 #include <linux/types.h>
0052 #include <linux/fcntl.h>
0053 #include <linux/module.h>
0054 #include <linux/random.h>
0055 #include <linux/cache.h>
0056 #include <linux/jhash.h>
0057 #include <linux/init.h>
0058 #include <linux/times.h>
0059 #include <linux/slab.h>
0060
0061 #include <net/net_namespace.h>
0062 #include <net/icmp.h>
0063 #include <net/inet_hashtables.h>
0064 #include <net/tcp.h>
0065 #include <net/transp_v6.h>
0066 #include <net/ipv6.h>
0067 #include <net/inet_common.h>
0068 #include <net/timewait_sock.h>
0069 #include <net/xfrm.h>
0070 #include <net/secure_seq.h>
0071 #include <net/busy_poll.h>
0072
0073 #include <linux/inet.h>
0074 #include <linux/ipv6.h>
0075 #include <linux/stddef.h>
0076 #include <linux/proc_fs.h>
0077 #include <linux/seq_file.h>
0078 #include <linux/inetdevice.h>
0079 #include <linux/btf_ids.h>
0080
0081 #include <crypto/hash.h>
0082 #include <linux/scatterlist.h>
0083
0084 #include <trace/events/tcp.h>
0085
0086 #ifdef CONFIG_TCP_MD5SIG
0087 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
0088 __be32 daddr, __be32 saddr, const struct tcphdr *th);
0089 #endif
0090
0091 struct inet_hashinfo tcp_hashinfo;
0092 EXPORT_SYMBOL(tcp_hashinfo);
0093
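/* Per-CPU kernel control socket used by tcp_v4_send_reset() and
 * tcp_v4_send_ack() to emit replies that have no full socket of their own.
 */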
0094 static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);
0095
0096 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
0097 {
0098 return secure_tcp_seq(ip_hdr(skb)->daddr,
0099 ip_hdr(skb)->saddr,
0100 tcp_hdr(skb)->dest,
0101 tcp_hdr(skb)->source);
0102 }
0103
0104 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
0105 {
0106 return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
0107 }
0108
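/* Decide whether a TIME-WAIT socket that matches a new connection request
 * can be taken over (tcp_tw_reuse).  Returns 1 if the new connection may
 * reuse the four-tuple, 0 otherwise.
 */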
0109 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
0110 {
0111 int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
0112 const struct inet_timewait_sock *tw = inet_twsk(sktw);
0113 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
0114 struct tcp_sock *tp = tcp_sk(sk);
0115
0116 if (reuse == 2) {
/* tcp_tw_reuse == 2: only allow reuse of the TIME-WAIT socket for
 * loopback connections, i.e. when it is bound to the loopback device
 * or uses a loopback source or destination address.
 */
0121 bool loopback = false;
0122 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
0123 loopback = true;
0124 #if IS_ENABLED(CONFIG_IPV6)
0125 if (tw->tw_family == AF_INET6) {
0126 if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
0127 ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
0128 ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
0129 ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
0130 loopback = true;
0131 } else
0132 #endif
0133 {
0134 if (ipv4_is_loopback(tw->tw_daddr) ||
0135 ipv4_is_loopback(tw->tw_rcv_saddr))
0136 loopback = true;
0137 }
0138 if (!loopback)
0139 reuse = 0;
0140 }
/* The TIME-WAIT socket may be taken over if it recorded a TCP
 * timestamp and either the caller does not need it preserved
 * (twp == NULL) or tw_reuse is enabled and at least one second has
 * passed since the last segment was received, so PAWS will reject
 * stray duplicates from the old incarnation.
 */
0153 if (tcptw->tw_ts_recent_stamp &&
0154 (!twp || (reuse && time_after32(ktime_get_seconds(),
0155 tcptw->tw_ts_recent_stamp)))) {
/* In repair mode the sequence number and timestamp state were set up
 * by the repair process, so leave them untouched.  Otherwise start the
 * new connection 65535 + 2 bytes beyond tw_snd_nxt so it cannot collide
 * with segments of the old connection, and inherit the cached
 * timestamp state for PAWS.
 */
0167 if (likely(!tp->repair)) {
0168 u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
0169
0170 if (!seq)
0171 seq = 1;
0172 WRITE_ONCE(tp->write_seq, seq);
0173 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
0174 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
0175 }
0176 sock_hold(sktw);
0177 return 1;
0178 }
0179
0180 return 0;
0181 }
0182 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
0183
0184 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
0185 int addr_len)
0186 {
/* Validate addr_len before running the BPF cgroup hook below so the
 * program cannot read past the bytes supplied by user space;
 * tcp_v4_connect() repeats the same check.
 */
0191 if (addr_len < sizeof(struct sockaddr_in))
0192 return -EINVAL;
0193
0194 sock_owned_by_me(sk);
0195
0196 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
0197 }
0198
0199
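/* Initiate an outgoing connection: resolve the route, bind a local port
 * via inet_hash_connect(), choose the initial sequence number and
 * timestamp offset, then send the SYN with tcp_connect().
 */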
0200 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
0201 {
0202 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
0203 struct inet_sock *inet = inet_sk(sk);
0204 struct tcp_sock *tp = tcp_sk(sk);
0205 __be16 orig_sport, orig_dport;
0206 __be32 daddr, nexthop;
0207 struct flowi4 *fl4;
0208 struct rtable *rt;
0209 int err;
0210 struct ip_options_rcu *inet_opt;
0211 struct inet_timewait_death_row *tcp_death_row = sock_net(sk)->ipv4.tcp_death_row;
0212
0213 if (addr_len < sizeof(struct sockaddr_in))
0214 return -EINVAL;
0215
0216 if (usin->sin_family != AF_INET)
0217 return -EAFNOSUPPORT;
0218
0219 nexthop = daddr = usin->sin_addr.s_addr;
0220 inet_opt = rcu_dereference_protected(inet->inet_opt,
0221 lockdep_sock_is_held(sk));
0222 if (inet_opt && inet_opt->opt.srr) {
0223 if (!daddr)
0224 return -EINVAL;
0225 nexthop = inet_opt->opt.faddr;
0226 }
0227
0228 orig_sport = inet->inet_sport;
0229 orig_dport = usin->sin_port;
0230 fl4 = &inet->cork.fl.u.ip4;
0231 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
0232 sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
0233 orig_dport, sk);
0234 if (IS_ERR(rt)) {
0235 err = PTR_ERR(rt);
0236 if (err == -ENETUNREACH)
0237 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
0238 return err;
0239 }
0240
0241 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
0242 ip_rt_put(rt);
0243 return -ENETUNREACH;
0244 }
0245
0246 if (!inet_opt || !inet_opt->opt.srr)
0247 daddr = fl4->daddr;
0248
0249 if (!inet->inet_saddr)
0250 inet->inet_saddr = fl4->saddr;
0251 sk_rcv_saddr_set(sk, inet->inet_saddr);
0252
0253 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
0254
0255 tp->rx_opt.ts_recent = 0;
0256 tp->rx_opt.ts_recent_stamp = 0;
0257 if (likely(!tp->repair))
0258 WRITE_ONCE(tp->write_seq, 0);
0259 }
0260
0261 inet->inet_dport = usin->sin_port;
0262 sk_daddr_set(sk, daddr);
0263
0264 inet_csk(sk)->icsk_ext_hdr_len = 0;
0265 if (inet_opt)
0266 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
0267
0268 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
/* Socket identity is not yet final: the source port may still be zero.
 * Enter SYN-SENT and let inet_hash_connect() pick the port and insert
 * the socket into the hash tables; the remaining initialisation
 * happens below.
 */
0275 tcp_set_state(sk, TCP_SYN_SENT);
0276 err = inet_hash_connect(tcp_death_row, sk);
0277 if (err)
0278 goto failure;
0279
0280 sk_set_txhash(sk);
0281
0282 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
0283 inet->inet_sport, inet->inet_dport, sk);
0284 if (IS_ERR(rt)) {
0285 err = PTR_ERR(rt);
0286 rt = NULL;
0287 goto failure;
0288 }
0289
0290 sk->sk_gso_type = SKB_GSO_TCPV4;
0291 sk_setup_caps(sk, &rt->dst);
0292 rt = NULL;
0293
0294 if (likely(!tp->repair)) {
0295 if (!tp->write_seq)
0296 WRITE_ONCE(tp->write_seq,
0297 secure_tcp_seq(inet->inet_saddr,
0298 inet->inet_daddr,
0299 inet->inet_sport,
0300 usin->sin_port));
0301 tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
0302 inet->inet_saddr,
0303 inet->inet_daddr);
0304 }
0305
0306 inet->inet_id = prandom_u32();
0307
0308 if (tcp_fastopen_defer_connect(sk, &err))
0309 return err;
0310 if (err)
0311 goto failure;
0312
0313 err = tcp_connect(sk);
0314
0315 if (err)
0316 goto failure;
0317
0318 return 0;
0319
0320 failure:
/* Moving to TCP_CLOSE unhashes the socket and releases the local
 * port, if one was already reserved.
 */
0325 tcp_set_state(sk, TCP_CLOSE);
0326 ip_rt_put(rt);
0327 sk->sk_route_caps = 0;
0328 inet->inet_dport = 0;
0329 return err;
0330 }
0331 EXPORT_SYMBOL(tcp_v4_connect);
0332
/* React to a path MTU change signalled via ICMP_FRAG_NEEDED: update
 * the route, lower the MSS and retransmit what no longer fits.  Runs
 * either directly from tcp_v4_err() or deferred via tcp_release_cb()
 * when the socket was owned by user context at ICMP time.
 */
0338 void tcp_v4_mtu_reduced(struct sock *sk)
0339 {
0340 struct inet_sock *inet = inet_sk(sk);
0341 struct dst_entry *dst;
0342 u32 mtu;
0343
0344 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
0345 return;
0346 mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
0347 dst = inet_csk_update_pmtu(sk, mtu);
0348 if (!dst)
0349 return;
0350
0351
0352
0353
0354 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
0355 sk->sk_err_soft = EMSGSIZE;
0356
0357 mtu = dst_mtu(dst);
0358
0359 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
0360 ip_sk_accept_pmtu(sk) &&
0361 inet_csk(sk)->icsk_pmtu_cookie > mtu) {
0362 tcp_sync_mss(sk, mtu);
0363
0364
0365
0366
0367
0368
0369 tcp_simple_retransmit(sk);
0370 }
0371 }
0372 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
0373
0374 static void do_redirect(struct sk_buff *skb, struct sock *sk)
0375 {
0376 struct dst_entry *dst = __sk_dst_check(sk, 0);
0377
0378 if (dst)
0379 dst->ops->redirect(dst, sk, skb);
0380 }
0381
0382
0383
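/* Handle an ICMP error for a request socket (TCP_NEW_SYN_RECV): verify
 * that the quoted sequence number matches the sent ISN and, for hard
 * errors, drop the pending request from the listener's queue.
 */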
0384 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
0385 {
0386 struct request_sock *req = inet_reqsk(sk);
0387 struct net *net = sock_net(sk);
0388
0389
0390
0391
0392 if (seq != tcp_rsk(req)->snt_isn) {
0393 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
0394 } else if (abort) {
0395
0396
0397
0398
0399
0400
0401 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
0402 tcp_listendrop(req->rsk_listener);
0403 }
0404 reqsk_put(req);
0405 }
0406 EXPORT_SYMBOL(tcp_req_err);
0407
0408
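/* Revert the RTO backoff applied while the path was black-holed: an ICMP
 * error quoting the oldest unacknowledged segment suggests the
 * retransmissions were lost to a routing problem rather than congestion,
 * so restore the RTO and rearm (or immediately fire) the retransmit timer.
 */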
0409 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
0410 {
0411 struct inet_connection_sock *icsk = inet_csk(sk);
0412 struct tcp_sock *tp = tcp_sk(sk);
0413 struct sk_buff *skb;
0414 s32 remaining;
0415 u32 delta_us;
0416
0417 if (sock_owned_by_user(sk))
0418 return;
0419
0420 if (seq != tp->snd_una || !icsk->icsk_retransmits ||
0421 !icsk->icsk_backoff)
0422 return;
0423
0424 skb = tcp_rtx_queue_head(sk);
0425 if (WARN_ON_ONCE(!skb))
0426 return;
0427
0428 icsk->icsk_backoff--;
0429 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
0430 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
0431
0432 tcp_mstamp_refresh(tp);
0433 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
0434 remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
0435
0436 if (remaining > 0) {
0437 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
0438 remaining, TCP_RTO_MAX);
0439 } else {
0440
0441
0442
0443 tcp_retransmit_timer(sk);
0444 }
0445 }
0446 EXPORT_SYMBOL(tcp_ld_RTO_revert);
0447
/* ICMP error handler for TCP, called from the ICMP layer with the
 * offending ICMP message in @skb and its type-specific payload in
 * @info (e.g. the next-hop MTU for ICMP_FRAG_NEEDED).  Looks up the
 * socket the quoted segment belongs to and dispatches on the ICMP
 * type: redirects, PMTU updates, RTO reverts, soft errors that are
 * only recorded, and hard errors that abort a connection attempt.
 */
0464 int tcp_v4_err(struct sk_buff *skb, u32 info)
0465 {
0466 const struct iphdr *iph = (const struct iphdr *)skb->data;
0467 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
0468 struct tcp_sock *tp;
0469 struct inet_sock *inet;
0470 const int type = icmp_hdr(skb)->type;
0471 const int code = icmp_hdr(skb)->code;
0472 struct sock *sk;
0473 struct request_sock *fastopen;
0474 u32 seq, snd_una;
0475 int err;
0476 struct net *net = dev_net(skb->dev);
0477
0478 sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
0479 th->dest, iph->saddr, ntohs(th->source),
0480 inet_iif(skb), 0);
0481 if (!sk) {
0482 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
0483 return -ENOENT;
0484 }
0485 if (sk->sk_state == TCP_TIME_WAIT) {
0486 inet_twsk_put(inet_twsk(sk));
0487 return 0;
0488 }
0489 seq = ntohl(th->seq);
0490 if (sk->sk_state == TCP_NEW_SYN_RECV) {
0491 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
0492 type == ICMP_TIME_EXCEEDED ||
0493 (type == ICMP_DEST_UNREACH &&
0494 (code == ICMP_NET_UNREACH ||
0495 code == ICMP_HOST_UNREACH)));
0496 return 0;
0497 }
0498
0499 bh_lock_sock(sk);
0500
0501
0502
0503
0504
0505 if (sock_owned_by_user(sk)) {
0506 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
0507 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
0508 }
0509 if (sk->sk_state == TCP_CLOSE)
0510 goto out;
0511
0512 if (static_branch_unlikely(&ip4_min_ttl)) {
0513
0514 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
0515 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
0516 goto out;
0517 }
0518 }
0519
0520 tp = tcp_sk(sk);
0521
0522 fastopen = rcu_dereference(tp->fastopen_rsk);
0523 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
0524 if (sk->sk_state != TCP_LISTEN &&
0525 !between(seq, snd_una, tp->snd_nxt)) {
0526 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
0527 goto out;
0528 }
0529
0530 switch (type) {
0531 case ICMP_REDIRECT:
0532 if (!sock_owned_by_user(sk))
0533 do_redirect(skb, sk);
0534 goto out;
0535 case ICMP_SOURCE_QUENCH:
0536
0537 goto out;
0538 case ICMP_PARAMETERPROB:
0539 err = EPROTO;
0540 break;
0541 case ICMP_DEST_UNREACH:
0542 if (code > NR_ICMP_UNREACH)
0543 goto out;
0544
0545 if (code == ICMP_FRAG_NEEDED) {
0546
0547
0548
0549
0550 if (sk->sk_state == TCP_LISTEN)
0551 goto out;
0552
0553 WRITE_ONCE(tp->mtu_info, info);
0554 if (!sock_owned_by_user(sk)) {
0555 tcp_v4_mtu_reduced(sk);
0556 } else {
0557 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
0558 sock_hold(sk);
0559 }
0560 goto out;
0561 }
0562
0563 err = icmp_err_convert[code].errno;
0564
0565
0566
0567 if (!fastopen &&
0568 (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
0569 tcp_ld_RTO_revert(sk, seq);
0570 break;
0571 case ICMP_TIME_EXCEEDED:
0572 err = EHOSTUNREACH;
0573 break;
0574 default:
0575 goto out;
0576 }
0577
0578 switch (sk->sk_state) {
0579 case TCP_SYN_SENT:
0580 case TCP_SYN_RECV:
0581
0582
0583
0584 if (fastopen && !fastopen->sk)
0585 break;
0586
0587 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
0588
0589 if (!sock_owned_by_user(sk)) {
0590 sk->sk_err = err;
0591
0592 sk_error_report(sk);
0593
0594 tcp_done(sk);
0595 } else {
0596 sk->sk_err_soft = err;
0597 }
0598 goto out;
0599 }
0600
/* The error concerns an established (or closing) connection: hand it
 * to the application only if the socket is not owned by user context
 * and IP_RECVERR is enabled; otherwise record it as a soft error and
 * let the usual retransmit/timeout machinery decide when to give up.
 */
0617 inet = inet_sk(sk);
0618 if (!sock_owned_by_user(sk) && inet->recverr) {
0619 sk->sk_err = err;
0620 sk_error_report(sk);
0621 } else {
0622 sk->sk_err_soft = err;
0623 }
0624
0625 out:
0626 bh_unlock_sock(sk);
0627 sock_put(sk);
0628 return 0;
0629 }
0630
0631 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
0632 {
0633 struct tcphdr *th = tcp_hdr(skb);
0634
0635 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
0636 skb->csum_start = skb_transport_header(skb) - skb->head;
0637 skb->csum_offset = offsetof(struct tcphdr, check);
0638 }
0639
0640
0641 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
0642 {
0643 const struct inet_sock *inet = inet_sk(sk);
0644
0645 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
0646 }
0647 EXPORT_SYMBOL(tcp_v4_send_check);
0648
/* Send a RST in reply to an unexpected segment.  This has to work
 * without a full socket: everything needed is taken from the incoming
 * skb (plus the socket or timewait socket when one exists), the reply
 * goes out through the per-CPU control socket, and it may carry an
 * MD5 signature option when a matching key is configured.
 */
0662 #ifdef CONFIG_TCP_MD5SIG
0663 #define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
0664 #else
0665 #define OPTION_BYTES sizeof(__be32)
0666 #endif
0667
0668 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
0669 {
0670 const struct tcphdr *th = tcp_hdr(skb);
0671 struct {
0672 struct tcphdr th;
0673 __be32 opt[OPTION_BYTES / sizeof(__be32)];
0674 } rep;
0675 struct ip_reply_arg arg;
0676 #ifdef CONFIG_TCP_MD5SIG
0677 struct tcp_md5sig_key *key = NULL;
0678 const __u8 *hash_location = NULL;
0679 unsigned char newhash[16];
0680 int genhash;
0681 struct sock *sk1 = NULL;
0682 #endif
0683 u64 transmit_time = 0;
0684 struct sock *ctl_sk;
0685 struct net *net;
0686
0687
0688 if (th->rst)
0689 return;
0690
0691
0692
0693
0694 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
0695 return;
0696
0697
0698 memset(&rep, 0, sizeof(rep));
0699 rep.th.dest = th->source;
0700 rep.th.source = th->dest;
0701 rep.th.doff = sizeof(struct tcphdr) / 4;
0702 rep.th.rst = 1;
0703
0704 if (th->ack) {
0705 rep.th.seq = th->ack_seq;
0706 } else {
0707 rep.th.ack = 1;
0708 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
0709 skb->len - (th->doff << 2));
0710 }
0711
0712 memset(&arg, 0, sizeof(arg));
0713 arg.iov[0].iov_base = (unsigned char *)&rep;
0714 arg.iov[0].iov_len = sizeof(rep.th);
0715
0716 net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
0717 #ifdef CONFIG_TCP_MD5SIG
0718 rcu_read_lock();
0719 hash_location = tcp_parse_md5sig_option(th);
0720 if (sk && sk_fullsock(sk)) {
0721 const union tcp_md5_addr *addr;
0722 int l3index;
0723
0724
0725
0726
0727 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
0728 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
0729 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
0730 } else if (hash_location) {
0731 const union tcp_md5_addr *addr;
0732 int sdif = tcp_v4_sdif(skb);
0733 int dif = inet_iif(skb);
0734 int l3index;
0735
0736
0737
0738
0739
0740
0741
0742
0743 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
0744 ip_hdr(skb)->saddr,
0745 th->source, ip_hdr(skb)->daddr,
0746 ntohs(th->source), dif, sdif);
0747
0748 if (!sk1)
0749 goto out;
0750
0751
0752
0753
0754 l3index = sdif ? dif : 0;
0755 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
0756 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
0757 if (!key)
0758 goto out;
0759
0760
0761 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
0762 if (genhash || memcmp(hash_location, newhash, 16) != 0)
0763 goto out;
0764
0765 }
0766
0767 if (key) {
0768 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
0769 (TCPOPT_NOP << 16) |
0770 (TCPOPT_MD5SIG << 8) |
0771 TCPOLEN_MD5SIG);
0772
0773 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
0774 rep.th.doff = arg.iov[0].iov_len / 4;
0775
0776 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
0777 key, ip_hdr(skb)->saddr,
0778 ip_hdr(skb)->daddr, &rep.th);
0779 }
0780 #endif
0781
0782 if (rep.opt[0] == 0) {
0783 __be32 mrst = mptcp_reset_option(skb);
0784
0785 if (mrst) {
0786 rep.opt[0] = mrst;
0787 arg.iov[0].iov_len += sizeof(mrst);
0788 rep.th.doff = arg.iov[0].iov_len / 4;
0789 }
0790 }
0791
0792 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
0793 ip_hdr(skb)->saddr,
0794 arg.iov[0].iov_len, IPPROTO_TCP, 0);
0795 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
0796 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
0797
0798
0799
0800
0801
0802 if (sk) {
0803 arg.bound_dev_if = sk->sk_bound_dev_if;
0804 if (sk_fullsock(sk))
0805 trace_tcp_send_reset(sk, skb);
0806 }
0807
0808 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
0809 offsetof(struct inet_timewait_sock, tw_bound_dev_if));
0810
0811 arg.tos = ip_hdr(skb)->tos;
0812 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
0813 local_bh_disable();
0814 ctl_sk = this_cpu_read(ipv4_tcp_sk);
0815 sock_net_set(ctl_sk, net);
0816 if (sk) {
0817 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
0818 inet_twsk(sk)->tw_mark : sk->sk_mark;
0819 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
0820 inet_twsk(sk)->tw_priority : sk->sk_priority;
0821 transmit_time = tcp_transmit_time(sk);
0822 xfrm_sk_clone_policy(ctl_sk, sk);
0823 }
0824 ip_send_unicast_reply(ctl_sk,
0825 skb, &TCP_SKB_CB(skb)->header.h4.opt,
0826 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
0827 &arg, arg.iov[0].iov_len,
0828 transmit_time);
0829
0830 ctl_sk->sk_mark = 0;
0831 xfrm_sk_free_policy(ctl_sk);
0832 sock_net_set(ctl_sk, &init_net);
0833 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
0834 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
0835 local_bh_enable();
0836
0837 #ifdef CONFIG_TCP_MD5SIG
0838 out:
0839 rcu_read_unlock();
0840 #endif
0841 }
0842
0843
0844
0845
0846
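/* Build and send an ACK outside full socket context (TIME-WAIT and
 * request-socket paths), using only data from the incoming skb and the
 * per-CPU control socket.
 */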
0847 static void tcp_v4_send_ack(const struct sock *sk,
0848 struct sk_buff *skb, u32 seq, u32 ack,
0849 u32 win, u32 tsval, u32 tsecr, int oif,
0850 struct tcp_md5sig_key *key,
0851 int reply_flags, u8 tos)
0852 {
0853 const struct tcphdr *th = tcp_hdr(skb);
0854 struct {
0855 struct tcphdr th;
0856 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
0857 #ifdef CONFIG_TCP_MD5SIG
0858 + (TCPOLEN_MD5SIG_ALIGNED >> 2)
0859 #endif
0860 ];
0861 } rep;
0862 struct net *net = sock_net(sk);
0863 struct ip_reply_arg arg;
0864 struct sock *ctl_sk;
0865 u64 transmit_time;
0866
0867 memset(&rep.th, 0, sizeof(struct tcphdr));
0868 memset(&arg, 0, sizeof(arg));
0869
0870 arg.iov[0].iov_base = (unsigned char *)&rep;
0871 arg.iov[0].iov_len = sizeof(rep.th);
0872 if (tsecr) {
0873 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
0874 (TCPOPT_TIMESTAMP << 8) |
0875 TCPOLEN_TIMESTAMP);
0876 rep.opt[1] = htonl(tsval);
0877 rep.opt[2] = htonl(tsecr);
0878 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
0879 }
0880
0881
0882 rep.th.dest = th->source;
0883 rep.th.source = th->dest;
0884 rep.th.doff = arg.iov[0].iov_len / 4;
0885 rep.th.seq = htonl(seq);
0886 rep.th.ack_seq = htonl(ack);
0887 rep.th.ack = 1;
0888 rep.th.window = htons(win);
0889
0890 #ifdef CONFIG_TCP_MD5SIG
0891 if (key) {
0892 int offset = (tsecr) ? 3 : 0;
0893
0894 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
0895 (TCPOPT_NOP << 16) |
0896 (TCPOPT_MD5SIG << 8) |
0897 TCPOLEN_MD5SIG);
0898 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
0899 rep.th.doff = arg.iov[0].iov_len/4;
0900
0901 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
0902 key, ip_hdr(skb)->saddr,
0903 ip_hdr(skb)->daddr, &rep.th);
0904 }
0905 #endif
0906 arg.flags = reply_flags;
0907 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
0908 ip_hdr(skb)->saddr,
0909 arg.iov[0].iov_len, IPPROTO_TCP, 0);
0910 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
0911 if (oif)
0912 arg.bound_dev_if = oif;
0913 arg.tos = tos;
0914 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
0915 local_bh_disable();
0916 ctl_sk = this_cpu_read(ipv4_tcp_sk);
0917 sock_net_set(ctl_sk, net);
0918 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
0919 inet_twsk(sk)->tw_mark : sk->sk_mark;
0920 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
0921 inet_twsk(sk)->tw_priority : sk->sk_priority;
0922 transmit_time = tcp_transmit_time(sk);
0923 ip_send_unicast_reply(ctl_sk,
0924 skb, &TCP_SKB_CB(skb)->header.h4.opt,
0925 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
0926 &arg, arg.iov[0].iov_len,
0927 transmit_time);
0928
0929 ctl_sk->sk_mark = 0;
0930 sock_net_set(ctl_sk, &init_net);
0931 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
0932 local_bh_enable();
0933 }
0934
0935 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
0936 {
0937 struct inet_timewait_sock *tw = inet_twsk(sk);
0938 struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
0939
0940 tcp_v4_send_ack(sk, skb,
0941 tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
0942 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
0943 tcp_time_stamp_raw() + tcptw->tw_ts_offset,
0944 tcptw->tw_ts_recent,
0945 tw->tw_bound_dev_if,
0946 tcp_twsk_md5_key(tcptw),
0947 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
0948 tw->tw_tos
0949 );
0950
0951 inet_twsk_put(tw);
0952 }
0953
0954 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
0955 struct request_sock *req)
0956 {
0957 const union tcp_md5_addr *addr;
0958 int l3index;
0959
0960
0961
0962
0963 u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
0964 tcp_sk(sk)->snd_nxt;
0965
0966
0967
0968
0969
0970
0971 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
0972 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
0973 tcp_v4_send_ack(sk, skb, seq,
0974 tcp_rsk(req)->rcv_nxt,
0975 req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
0976 tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
0977 req->ts_recent,
0978 0,
0979 tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
0980 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
0981 ip_hdr(skb)->tos);
0982 }
0983
0984
0985
0986
0987
0988
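/* Send a SYN-ACK in answer to a received SYN.  This still operates on
 * the request_sock only, not on a full socket.
 */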
0989 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
0990 struct flowi *fl,
0991 struct request_sock *req,
0992 struct tcp_fastopen_cookie *foc,
0993 enum tcp_synack_type synack_type,
0994 struct sk_buff *syn_skb)
0995 {
0996 const struct inet_request_sock *ireq = inet_rsk(req);
0997 struct flowi4 fl4;
0998 int err = -1;
0999 struct sk_buff *skb;
1000 u8 tos;
1001
1002
1003 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1004 return -1;
1005
1006 skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1007
1008 if (skb) {
1009 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1010
1011 tos = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ?
1012 (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1013 (inet_sk(sk)->tos & INET_ECN_MASK) :
1014 inet_sk(sk)->tos;
1015
1016 if (!INET_ECN_is_capable(tos) &&
1017 tcp_bpf_ca_needs_ecn((struct sock *)req))
1018 tos |= INET_ECN_ECT_0;
1019
1020 rcu_read_lock();
1021 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1022 ireq->ir_rmt_addr,
1023 rcu_dereference(ireq->ireq_opt),
1024 tos);
1025 rcu_read_unlock();
1026 err = net_xmit_eval(err);
1027 }
1028
1029 return err;
1030 }
1031
1032
1033
1034
1035 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1036 {
1037 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1038 }
1039
1040 #ifdef CONFIG_TCP_MD5SIG
/* RFC 2385 TCP-MD5 signature support.  Keys are kept per socket in
 * tp->md5sig_info; the tcp_md5_needed static key lets the fast path
 * skip MD5 processing entirely while no keys are in use.
 */
1047 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
1048 EXPORT_SYMBOL(tcp_md5_needed);
1049
1050 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1051 {
1052 if (!old)
1053 return true;
1054
1055
1056 if (old->l3index && new->l3index == 0)
1057 return false;
1058 if (old->l3index == 0 && new->l3index)
1059 return true;
1060
1061 return old->prefixlen < new->prefixlen;
1062 }
1063
1064
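/* Find the best-matching MD5 key for the given address, family and L3
 * index; prefixed (subnet) keys match by prefix, and better_md5_match()
 * prefers L3-bound keys and longer prefixes.
 */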
1065 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1066 const union tcp_md5_addr *addr,
1067 int family)
1068 {
1069 const struct tcp_sock *tp = tcp_sk(sk);
1070 struct tcp_md5sig_key *key;
1071 const struct tcp_md5sig_info *md5sig;
1072 __be32 mask;
1073 struct tcp_md5sig_key *best_match = NULL;
1074 bool match;
1075
1076
1077 md5sig = rcu_dereference_check(tp->md5sig_info,
1078 lockdep_sock_is_held(sk));
1079 if (!md5sig)
1080 return NULL;
1081
1082 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1083 lockdep_sock_is_held(sk)) {
1084 if (key->family != family)
1085 continue;
1086 if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index)
1087 continue;
1088 if (family == AF_INET) {
1089 mask = inet_make_mask(key->prefixlen);
1090 match = (key->addr.a4.s_addr & mask) ==
1091 (addr->a4.s_addr & mask);
1092 #if IS_ENABLED(CONFIG_IPV6)
1093 } else if (family == AF_INET6) {
1094 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1095 key->prefixlen);
1096 #endif
1097 } else {
1098 match = false;
1099 }
1100
1101 if (match && better_md5_match(best_match, key))
1102 best_match = key;
1103 }
1104 return best_match;
1105 }
1106 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1107
1108 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1109 const union tcp_md5_addr *addr,
1110 int family, u8 prefixlen,
1111 int l3index, u8 flags)
1112 {
1113 const struct tcp_sock *tp = tcp_sk(sk);
1114 struct tcp_md5sig_key *key;
1115 unsigned int size = sizeof(struct in_addr);
1116 const struct tcp_md5sig_info *md5sig;
1117
1118
1119 md5sig = rcu_dereference_check(tp->md5sig_info,
1120 lockdep_sock_is_held(sk));
1121 if (!md5sig)
1122 return NULL;
1123 #if IS_ENABLED(CONFIG_IPV6)
1124 if (family == AF_INET6)
1125 size = sizeof(struct in6_addr);
1126 #endif
1127 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1128 lockdep_sock_is_held(sk)) {
1129 if (key->family != family)
1130 continue;
1131 if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1132 continue;
1133 if (key->l3index != l3index)
1134 continue;
1135 if (!memcmp(&key->addr, addr, size) &&
1136 key->prefixlen == prefixlen)
1137 return key;
1138 }
1139 return NULL;
1140 }
1141
1142 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1143 const struct sock *addr_sk)
1144 {
1145 const union tcp_md5_addr *addr;
1146 int l3index;
1147
1148 l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1149 addr_sk->sk_bound_dev_if);
1150 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1151 return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1152 }
1153 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1154
1155
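/* Install (or update) an MD5 signature key for a peer address.  Callers
 * hold the socket lock (or have exclusive access to a socket that is not
 * yet visible); the key list itself is RCU-protected for readers on the
 * receive path.
 */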
1156 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1157 int family, u8 prefixlen, int l3index, u8 flags,
1158 const u8 *newkey, u8 newkeylen, gfp_t gfp)
1159 {
1160
1161 struct tcp_md5sig_key *key;
1162 struct tcp_sock *tp = tcp_sk(sk);
1163 struct tcp_md5sig_info *md5sig;
1164
1165 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1166 if (key) {
1167
1168
1169
1170
1171
1172
1173 data_race(memcpy(key->key, newkey, newkeylen));
1174
1175
1176
1177
1178
1179
1180 WRITE_ONCE(key->keylen, newkeylen);
1181
1182 return 0;
1183 }
1184
1185 md5sig = rcu_dereference_protected(tp->md5sig_info,
1186 lockdep_sock_is_held(sk));
1187 if (!md5sig) {
1188 md5sig = kmalloc(sizeof(*md5sig), gfp);
1189 if (!md5sig)
1190 return -ENOMEM;
1191
1192 sk_gso_disable(sk);
1193 INIT_HLIST_HEAD(&md5sig->head);
1194 rcu_assign_pointer(tp->md5sig_info, md5sig);
1195 }
1196
1197 key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1198 if (!key)
1199 return -ENOMEM;
1200 if (!tcp_alloc_md5sig_pool()) {
1201 sock_kfree_s(sk, key, sizeof(*key));
1202 return -ENOMEM;
1203 }
1204
1205 memcpy(key->key, newkey, newkeylen);
1206 key->keylen = newkeylen;
1207 key->family = family;
1208 key->prefixlen = prefixlen;
1209 key->l3index = l3index;
1210 key->flags = flags;
1211 memcpy(&key->addr, addr,
1212 (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
1213 sizeof(struct in_addr));
1214 hlist_add_head_rcu(&key->node, &md5sig->head);
1215 return 0;
1216 }
1217 EXPORT_SYMBOL(tcp_md5_do_add);
1218
1219 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1220 u8 prefixlen, int l3index, u8 flags)
1221 {
1222 struct tcp_md5sig_key *key;
1223
1224 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1225 if (!key)
1226 return -ENOENT;
1227 hlist_del_rcu(&key->node);
1228 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1229 kfree_rcu(key, rcu);
1230 return 0;
1231 }
1232 EXPORT_SYMBOL(tcp_md5_do_del);
1233
1234 static void tcp_clear_md5_list(struct sock *sk)
1235 {
1236 struct tcp_sock *tp = tcp_sk(sk);
1237 struct tcp_md5sig_key *key;
1238 struct hlist_node *n;
1239 struct tcp_md5sig_info *md5sig;
1240
1241 md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1242
1243 hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1244 hlist_del_rcu(&key->node);
1245 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1246 kfree_rcu(key, rcu);
1247 }
1248 }
1249
1250 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1251 sockptr_t optval, int optlen)
1252 {
1253 struct tcp_md5sig cmd;
1254 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1255 const union tcp_md5_addr *addr;
1256 u8 prefixlen = 32;
1257 int l3index = 0;
1258 u8 flags;
1259
1260 if (optlen < sizeof(cmd))
1261 return -EINVAL;
1262
1263 if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1264 return -EFAULT;
1265
1266 if (sin->sin_family != AF_INET)
1267 return -EINVAL;
1268
1269 flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1270
1271 if (optname == TCP_MD5SIG_EXT &&
1272 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1273 prefixlen = cmd.tcpm_prefixlen;
1274 if (prefixlen > 32)
1275 return -EINVAL;
1276 }
1277
1278 if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1279 cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1280 struct net_device *dev;
1281
1282 rcu_read_lock();
1283 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1284 if (dev && netif_is_l3_master(dev))
1285 l3index = dev->ifindex;
1286
1287 rcu_read_unlock();
1288
1289
1290
1291
1292 if (!dev || !l3index)
1293 return -EINVAL;
1294 }
1295
1296 addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1297
1298 if (!cmd.tcpm_keylen)
1299 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1300
1301 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1302 return -EINVAL;
1303
1304 return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1305 cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1306 }
1307
1308 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1309 __be32 daddr, __be32 saddr,
1310 const struct tcphdr *th, int nbytes)
1311 {
1312 struct tcp4_pseudohdr *bp;
1313 struct scatterlist sg;
1314 struct tcphdr *_th;
1315
1316 bp = hp->scratch;
1317 bp->saddr = saddr;
1318 bp->daddr = daddr;
1319 bp->pad = 0;
1320 bp->protocol = IPPROTO_TCP;
1321 bp->len = cpu_to_be16(nbytes);
1322
1323 _th = (struct tcphdr *)(bp + 1);
1324 memcpy(_th, th, sizeof(*th));
1325 _th->check = 0;
1326
1327 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1328 ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1329 sizeof(*bp) + sizeof(*th));
1330 return crypto_ahash_update(hp->md5_req);
1331 }
1332
1333 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1334 __be32 daddr, __be32 saddr, const struct tcphdr *th)
1335 {
1336 struct tcp_md5sig_pool *hp;
1337 struct ahash_request *req;
1338
1339 hp = tcp_get_md5sig_pool();
1340 if (!hp)
1341 goto clear_hash_noput;
1342 req = hp->md5_req;
1343
1344 if (crypto_ahash_init(req))
1345 goto clear_hash;
1346 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1347 goto clear_hash;
1348 if (tcp_md5_hash_key(hp, key))
1349 goto clear_hash;
1350 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1351 if (crypto_ahash_final(req))
1352 goto clear_hash;
1353
1354 tcp_put_md5sig_pool();
1355 return 0;
1356
1357 clear_hash:
1358 tcp_put_md5sig_pool();
1359 clear_hash_noput:
1360 memset(md5_hash, 0, 16);
1361 return 1;
1362 }
1363
1364 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1365 const struct sock *sk,
1366 const struct sk_buff *skb)
1367 {
1368 struct tcp_md5sig_pool *hp;
1369 struct ahash_request *req;
1370 const struct tcphdr *th = tcp_hdr(skb);
1371 __be32 saddr, daddr;
1372
1373 if (sk) {
1374 saddr = sk->sk_rcv_saddr;
1375 daddr = sk->sk_daddr;
1376 } else {
1377 const struct iphdr *iph = ip_hdr(skb);
1378 saddr = iph->saddr;
1379 daddr = iph->daddr;
1380 }
1381
1382 hp = tcp_get_md5sig_pool();
1383 if (!hp)
1384 goto clear_hash_noput;
1385 req = hp->md5_req;
1386
1387 if (crypto_ahash_init(req))
1388 goto clear_hash;
1389
1390 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1391 goto clear_hash;
1392 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1393 goto clear_hash;
1394 if (tcp_md5_hash_key(hp, key))
1395 goto clear_hash;
1396 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1397 if (crypto_ahash_final(req))
1398 goto clear_hash;
1399
1400 tcp_put_md5sig_pool();
1401 return 0;
1402
1403 clear_hash:
1404 tcp_put_md5sig_pool();
1405 clear_hash_noput:
1406 memset(md5_hash, 0, 16);
1407 return 1;
1408 }
1409 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1410
1411 #endif
1412
1413 static void tcp_v4_init_req(struct request_sock *req,
1414 const struct sock *sk_listener,
1415 struct sk_buff *skb)
1416 {
1417 struct inet_request_sock *ireq = inet_rsk(req);
1418 struct net *net = sock_net(sk_listener);
1419
1420 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1421 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1422 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1423 }
1424
1425 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1426 struct sk_buff *skb,
1427 struct flowi *fl,
1428 struct request_sock *req)
1429 {
1430 tcp_v4_init_req(req, sk, skb);
1431
1432 if (security_inet_conn_request(sk, skb, req))
1433 return NULL;
1434
1435 return inet_csk_route_req(sk, &fl->u.ip4, req);
1436 }
1437
1438 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1439 .family = PF_INET,
1440 .obj_size = sizeof(struct tcp_request_sock),
1441 .rtx_syn_ack = tcp_rtx_synack,
1442 .send_ack = tcp_v4_reqsk_send_ack,
1443 .destructor = tcp_v4_reqsk_destructor,
1444 .send_reset = tcp_v4_send_reset,
1445 .syn_ack_timeout = tcp_syn_ack_timeout,
1446 };
1447
1448 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1449 .mss_clamp = TCP_MSS_DEFAULT,
1450 #ifdef CONFIG_TCP_MD5SIG
1451 .req_md5_lookup = tcp_v4_md5_lookup,
1452 .calc_md5_hash = tcp_v4_md5_hash_skb,
1453 #endif
1454 #ifdef CONFIG_SYN_COOKIES
1455 .cookie_init_seq = cookie_v4_init_sequence,
1456 #endif
1457 .route_req = tcp_v4_route_req,
1458 .init_seq = tcp_v4_init_seq,
1459 .init_ts_off = tcp_v4_init_ts_off,
1460 .send_synack = tcp_v4_send_synack,
1461 };
1462
1463 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1464 {
1465
1466 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1467 goto drop;
1468
1469 return tcp_conn_request(&tcp_request_sock_ops,
1470 &tcp_request_sock_ipv4_ops, sk, skb);
1471
1472 drop:
1473 tcp_listendrop(sk);
1474 return 0;
1475 }
1476 EXPORT_SYMBOL(tcp_v4_conn_request);
1477
1478
1479
1480
1481
1482
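/* The three-way handshake has completed: create the child socket for
 * the now-established connection described by req.
 */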
1483 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1484 struct request_sock *req,
1485 struct dst_entry *dst,
1486 struct request_sock *req_unhash,
1487 bool *own_req)
1488 {
1489 struct inet_request_sock *ireq;
1490 bool found_dup_sk = false;
1491 struct inet_sock *newinet;
1492 struct tcp_sock *newtp;
1493 struct sock *newsk;
1494 #ifdef CONFIG_TCP_MD5SIG
1495 const union tcp_md5_addr *addr;
1496 struct tcp_md5sig_key *key;
1497 int l3index;
1498 #endif
1499 struct ip_options_rcu *inet_opt;
1500
1501 if (sk_acceptq_is_full(sk))
1502 goto exit_overflow;
1503
1504 newsk = tcp_create_openreq_child(sk, req, skb);
1505 if (!newsk)
1506 goto exit_nonewsk;
1507
1508 newsk->sk_gso_type = SKB_GSO_TCPV4;
1509 inet_sk_rx_dst_set(newsk, skb);
1510
1511 newtp = tcp_sk(newsk);
1512 newinet = inet_sk(newsk);
1513 ireq = inet_rsk(req);
1514 sk_daddr_set(newsk, ireq->ir_rmt_addr);
1515 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1516 newsk->sk_bound_dev_if = ireq->ir_iif;
1517 newinet->inet_saddr = ireq->ir_loc_addr;
1518 inet_opt = rcu_dereference(ireq->ireq_opt);
1519 RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1520 newinet->mc_index = inet_iif(skb);
1521 newinet->mc_ttl = ip_hdr(skb)->ttl;
1522 newinet->rcv_tos = ip_hdr(skb)->tos;
1523 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1524 if (inet_opt)
1525 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1526 newinet->inet_id = prandom_u32();
1527
1528
1529
1530
1531 if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1532 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1533
1534 if (!dst) {
1535 dst = inet_csk_route_child_sock(sk, newsk, req);
1536 if (!dst)
1537 goto put_and_exit;
1538 } else {
1539
1540 }
1541 sk_setup_caps(newsk, dst);
1542
1543 tcp_ca_openreq_child(newsk, dst);
1544
1545 tcp_sync_mss(newsk, dst_mtu(dst));
1546 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1547
1548 tcp_initialize_rcv_mss(newsk);
1549
1550 #ifdef CONFIG_TCP_MD5SIG
1551 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1552
1553 addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1554 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1555 if (key) {
1556
1557
1558
1559
1560
1561
1562 tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, key->flags,
1563 key->key, key->keylen, GFP_ATOMIC);
1564 sk_gso_disable(newsk);
1565 }
1566 #endif
1567
1568 if (__inet_inherit_port(sk, newsk) < 0)
1569 goto put_and_exit;
1570 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1571 &found_dup_sk);
1572 if (likely(*own_req)) {
1573 tcp_move_syn(newtp, req);
1574 ireq->ireq_opt = NULL;
1575 } else {
1576 newinet->inet_opt = NULL;
1577
1578 if (!req_unhash && found_dup_sk) {
1579
1580
1581
1582 bh_unlock_sock(newsk);
1583 sock_put(newsk);
1584 newsk = NULL;
1585 }
1586 }
1587 return newsk;
1588
1589 exit_overflow:
1590 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1591 exit_nonewsk:
1592 dst_release(dst);
1593 exit:
1594 tcp_listendrop(sk);
1595 return NULL;
1596 put_and_exit:
1597 newinet->inet_opt = NULL;
1598 inet_csk_prepare_forced_close(newsk);
1599 tcp_done(newsk);
1600 goto exit;
1601 }
1602 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1603
1604 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1605 {
1606 #ifdef CONFIG_SYN_COOKIES
1607 const struct tcphdr *th = tcp_hdr(skb);
1608
1609 if (!th->syn)
1610 sk = cookie_v4_check(sk, skb);
1611 #endif
1612 return sk;
1613 }
1614
1615 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1616 struct tcphdr *th, u32 *cookie)
1617 {
1618 u16 mss = 0;
1619 #ifdef CONFIG_SYN_COOKIES
1620 mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1621 &tcp_request_sock_ipv4_ops, sk, th);
1622 if (mss) {
1623 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1624 tcp_synq_overflow(sk);
1625 }
1626 #endif
1627 return mss;
1628 }
1629
1630 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1631 u32));
1632
1633
1634
1635
1636
1637
1638
1639
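/* Per-socket receive path once the owning socket is known.  The caller
 * holds the socket spinlock (bh_lock_sock) unless sk is a listening
 * socket; a failing state transition results in a RST being sent back.
 */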
1640 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1641 {
1642 enum skb_drop_reason reason;
1643 struct sock *rsk;
1644
1645 if (sk->sk_state == TCP_ESTABLISHED) {
1646 struct dst_entry *dst;
1647
1648 dst = rcu_dereference_protected(sk->sk_rx_dst,
1649 lockdep_sock_is_held(sk));
1650
1651 sock_rps_save_rxhash(sk, skb);
1652 sk_mark_napi_id(sk, skb);
1653 if (dst) {
1654 if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1655 !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1656 dst, 0)) {
1657 RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1658 dst_release(dst);
1659 }
1660 }
1661 tcp_rcv_established(sk, skb);
1662 return 0;
1663 }
1664
1665 reason = SKB_DROP_REASON_NOT_SPECIFIED;
1666 if (tcp_checksum_complete(skb))
1667 goto csum_err;
1668
1669 if (sk->sk_state == TCP_LISTEN) {
1670 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1671
1672 if (!nsk)
1673 goto discard;
1674 if (nsk != sk) {
1675 if (tcp_child_process(sk, nsk, skb)) {
1676 rsk = nsk;
1677 goto reset;
1678 }
1679 return 0;
1680 }
1681 } else
1682 sock_rps_save_rxhash(sk, skb);
1683
1684 if (tcp_rcv_state_process(sk, skb)) {
1685 rsk = sk;
1686 goto reset;
1687 }
1688 return 0;
1689
1690 reset:
1691 tcp_v4_send_reset(rsk, skb);
1692 discard:
1693 kfree_skb_reason(skb, reason);
1694
1695
1696
1697
1698
1699 return 0;
1700
1701 csum_err:
1702 reason = SKB_DROP_REASON_TCP_CSUM;
1703 trace_tcp_bad_csum(skb);
1704 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1705 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1706 goto discard;
1707 }
1708 EXPORT_SYMBOL(tcp_v4_do_rcv);
1709
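/* Early demux: find an established socket at IP receive time so its
 * cached rx dst can be attached to the skb before the full receive path
 * runs.
 */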
1710 int tcp_v4_early_demux(struct sk_buff *skb)
1711 {
1712 const struct iphdr *iph;
1713 const struct tcphdr *th;
1714 struct sock *sk;
1715
1716 if (skb->pkt_type != PACKET_HOST)
1717 return 0;
1718
1719 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1720 return 0;
1721
1722 iph = ip_hdr(skb);
1723 th = tcp_hdr(skb);
1724
1725 if (th->doff < sizeof(struct tcphdr) / 4)
1726 return 0;
1727
1728 sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1729 iph->saddr, th->source,
1730 iph->daddr, ntohs(th->dest),
1731 skb->skb_iif, inet_sdif(skb));
1732 if (sk) {
1733 skb->sk = sk;
1734 skb->destructor = sock_edemux;
1735 if (sk_fullsock(sk)) {
1736 struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1737
1738 if (dst)
1739 dst = dst_check(dst, 0);
1740 if (dst &&
1741 sk->sk_rx_dst_ifindex == skb->skb_iif)
1742 skb_dst_set_noref(skb, dst);
1743 }
1744 }
1745 return 0;
1746 }
1747
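/* Called when the socket is owned by user context: try to coalesce the
 * segment with the tail of the backlog queue, otherwise append it,
 * returning true when the caller must drop the segment (bad checksum or
 * backlog limit exceeded).
 */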
1748 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
1749 enum skb_drop_reason *reason)
1750 {
1751 u32 limit, tail_gso_size, tail_gso_segs;
1752 struct skb_shared_info *shinfo;
1753 const struct tcphdr *th;
1754 struct tcphdr *thtail;
1755 struct sk_buff *tail;
1756 unsigned int hdrlen;
1757 bool fragstolen;
1758 u32 gso_segs;
1759 u32 gso_size;
1760 int delta;
1761
1762
1763
1764
1765
1766
1767
1768 skb_condense(skb);
1769
1770 skb_dst_drop(skb);
1771
1772 if (unlikely(tcp_checksum_complete(skb))) {
1773 bh_unlock_sock(sk);
1774 trace_tcp_bad_csum(skb);
1775 *reason = SKB_DROP_REASON_TCP_CSUM;
1776 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1777 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1778 return true;
1779 }
1780
1781
1782
1783
1784
1785 th = (const struct tcphdr *)skb->data;
1786 hdrlen = th->doff * 4;
1787
1788 tail = sk->sk_backlog.tail;
1789 if (!tail)
1790 goto no_coalesce;
1791 thtail = (struct tcphdr *)tail->data;
1792
1793 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1794 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1795 ((TCP_SKB_CB(tail)->tcp_flags |
1796 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1797 !((TCP_SKB_CB(tail)->tcp_flags &
1798 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1799 ((TCP_SKB_CB(tail)->tcp_flags ^
1800 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1801 #ifdef CONFIG_TLS_DEVICE
1802 tail->decrypted != skb->decrypted ||
1803 #endif
1804 thtail->doff != th->doff ||
1805 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1806 goto no_coalesce;
1807
1808 __skb_pull(skb, hdrlen);
1809
1810 shinfo = skb_shinfo(skb);
1811 gso_size = shinfo->gso_size ?: skb->len;
1812 gso_segs = shinfo->gso_segs ?: 1;
1813
1814 shinfo = skb_shinfo(tail);
1815 tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1816 tail_gso_segs = shinfo->gso_segs ?: 1;
1817
1818 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1819 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1820
1821 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1822 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1823 thtail->window = th->window;
1824 }
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834 thtail->fin |= th->fin;
1835 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1836
1837 if (TCP_SKB_CB(skb)->has_rxtstamp) {
1838 TCP_SKB_CB(tail)->has_rxtstamp = true;
1839 tail->tstamp = skb->tstamp;
1840 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1841 }
1842
1843
1844 shinfo->gso_size = max(gso_size, tail_gso_size);
1845 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
1846
1847 sk->sk_backlog.len += delta;
1848 __NET_INC_STATS(sock_net(sk),
1849 LINUX_MIB_TCPBACKLOGCOALESCE);
1850 kfree_skb_partial(skb, fragstolen);
1851 return false;
1852 }
1853 __skb_push(skb, hdrlen);
1854
1855 no_coalesce:
1856
1857
1858
1859
1860 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf) + 64*1024;
1861
1862 if (unlikely(sk_add_backlog(sk, skb, limit))) {
1863 bh_unlock_sock(sk);
1864 *reason = SKB_DROP_REASON_SOCKET_BACKLOG;
1865 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1866 return true;
1867 }
1868 return false;
1869 }
1870 EXPORT_SYMBOL(tcp_add_backlog);
1871
1872 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1873 {
1874 struct tcphdr *th = (struct tcphdr *)skb->data;
1875
1876 return sk_filter_trim_cap(sk, skb, th->doff * 4);
1877 }
1878 EXPORT_SYMBOL(tcp_filter);
1879
1880 static void tcp_v4_restore_cb(struct sk_buff *skb)
1881 {
1882 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1883 sizeof(struct inet_skb_parm));
1884 }
1885
1886 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1887 const struct tcphdr *th)
1888 {
1889
1890
1891
1892 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1893 sizeof(struct inet_skb_parm));
1894 barrier();
1895
1896 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1897 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1898 skb->len - th->doff * 4);
1899 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1900 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1901 TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1902 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1903 TCP_SKB_CB(skb)->sacked = 0;
1904 TCP_SKB_CB(skb)->has_rxtstamp =
1905 skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1906 }
1907
1908
1909
1910
1911
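/* Receive entry point for IPv4 TCP segments: validate header length and
 * checksum, look up the owning socket (established, request or timewait)
 * and hand the segment to the right handler, or answer with a RST when
 * no socket wants it.
 */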
1912 int tcp_v4_rcv(struct sk_buff *skb)
1913 {
1914 struct net *net = dev_net(skb->dev);
1915 enum skb_drop_reason drop_reason;
1916 int sdif = inet_sdif(skb);
1917 int dif = inet_iif(skb);
1918 const struct iphdr *iph;
1919 const struct tcphdr *th;
1920 bool refcounted;
1921 struct sock *sk;
1922 int ret;
1923
1924 drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
1925 if (skb->pkt_type != PACKET_HOST)
1926 goto discard_it;
1927
1928
1929 __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1930
1931 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1932 goto discard_it;
1933
1934 th = (const struct tcphdr *)skb->data;
1935
1936 if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
1937 drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
1938 goto bad_packet;
1939 }
1940 if (!pskb_may_pull(skb, th->doff * 4))
1941 goto discard_it;
1942
1943
1944
1945
1946
1947
1948 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1949 goto csum_error;
1950
1951 th = (const struct tcphdr *)skb->data;
1952 iph = ip_hdr(skb);
1953 lookup:
1954 sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1955 th->dest, sdif, &refcounted);
1956 if (!sk)
1957 goto no_tcp_socket;
1958
1959 process:
1960 if (sk->sk_state == TCP_TIME_WAIT)
1961 goto do_time_wait;
1962
1963 if (sk->sk_state == TCP_NEW_SYN_RECV) {
1964 struct request_sock *req = inet_reqsk(sk);
1965 bool req_stolen = false;
1966 struct sock *nsk;
1967
1968 sk = req->rsk_listener;
1969 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1970 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
1971 else
1972 drop_reason = tcp_inbound_md5_hash(sk, skb,
1973 &iph->saddr, &iph->daddr,
1974 AF_INET, dif, sdif);
1975 if (unlikely(drop_reason)) {
1976 sk_drops_add(sk, skb);
1977 reqsk_put(req);
1978 goto discard_it;
1979 }
1980 if (tcp_checksum_complete(skb)) {
1981 reqsk_put(req);
1982 goto csum_error;
1983 }
1984 if (unlikely(sk->sk_state != TCP_LISTEN)) {
1985 nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
1986 if (!nsk) {
1987 inet_csk_reqsk_queue_drop_and_put(sk, req);
1988 goto lookup;
1989 }
1990 sk = nsk;
1991
1992
1993
1994 } else {
1995
1996
1997
1998 sock_hold(sk);
1999 }
2000 refcounted = true;
2001 nsk = NULL;
2002 if (!tcp_filter(sk, skb)) {
2003 th = (const struct tcphdr *)skb->data;
2004 iph = ip_hdr(skb);
2005 tcp_v4_fill_cb(skb, iph, th);
2006 nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2007 } else {
2008 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2009 }
2010 if (!nsk) {
2011 reqsk_put(req);
2012 if (req_stolen) {
2013
2014
2015
2016
2017
2018 tcp_v4_restore_cb(skb);
2019 sock_put(sk);
2020 goto lookup;
2021 }
2022 goto discard_and_relse;
2023 }
2024 nf_reset_ct(skb);
2025 if (nsk == sk) {
2026 reqsk_put(req);
2027 tcp_v4_restore_cb(skb);
2028 } else if (tcp_child_process(sk, nsk, skb)) {
2029 tcp_v4_send_reset(nsk, skb);
2030 goto discard_and_relse;
2031 } else {
2032 sock_put(sk);
2033 return 0;
2034 }
2035 }
2036
2037 if (static_branch_unlikely(&ip4_min_ttl)) {
2038
2039 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
2040 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2041 goto discard_and_relse;
2042 }
2043 }
2044
2045 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
2046 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2047 goto discard_and_relse;
2048 }
2049
2050 drop_reason = tcp_inbound_md5_hash(sk, skb, &iph->saddr,
2051 &iph->daddr, AF_INET, dif, sdif);
2052 if (drop_reason)
2053 goto discard_and_relse;
2054
2055 nf_reset_ct(skb);
2056
2057 if (tcp_filter(sk, skb)) {
2058 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2059 goto discard_and_relse;
2060 }
2061 th = (const struct tcphdr *)skb->data;
2062 iph = ip_hdr(skb);
2063 tcp_v4_fill_cb(skb, iph, th);
2064
2065 skb->dev = NULL;
2066
2067 if (sk->sk_state == TCP_LISTEN) {
2068 ret = tcp_v4_do_rcv(sk, skb);
2069 goto put_and_return;
2070 }
2071
2072 sk_incoming_cpu_update(sk);
2073
2074 bh_lock_sock_nested(sk);
2075 tcp_segs_in(tcp_sk(sk), skb);
2076 ret = 0;
2077 if (!sock_owned_by_user(sk)) {
2078 ret = tcp_v4_do_rcv(sk, skb);
2079 } else {
2080 if (tcp_add_backlog(sk, skb, &drop_reason))
2081 goto discard_and_relse;
2082 }
2083 bh_unlock_sock(sk);
2084
2085 put_and_return:
2086 if (refcounted)
2087 sock_put(sk);
2088
2089 return ret;
2090
2091 no_tcp_socket:
2092 drop_reason = SKB_DROP_REASON_NO_SOCKET;
2093 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2094 goto discard_it;
2095
2096 tcp_v4_fill_cb(skb, iph, th);
2097
2098 if (tcp_checksum_complete(skb)) {
2099 csum_error:
2100 drop_reason = SKB_DROP_REASON_TCP_CSUM;
2101 trace_tcp_bad_csum(skb);
2102 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2103 bad_packet:
2104 __TCP_INC_STATS(net, TCP_MIB_INERRS);
2105 } else {
2106 tcp_v4_send_reset(NULL, skb);
2107 }
2108
2109 discard_it:
2110 SKB_DR_OR(drop_reason, NOT_SPECIFIED);
2111
2112 kfree_skb_reason(skb, drop_reason);
2113 return 0;
2114
2115 discard_and_relse:
2116 sk_drops_add(sk, skb);
2117 if (refcounted)
2118 sock_put(sk);
2119 goto discard_it;
2120
2121 do_time_wait:
2122 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2123 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2124 inet_twsk_put(inet_twsk(sk));
2125 goto discard_it;
2126 }
2127
2128 tcp_v4_fill_cb(skb, iph, th);
2129
2130 if (tcp_checksum_complete(skb)) {
2131 inet_twsk_put(inet_twsk(sk));
2132 goto csum_error;
2133 }
2134 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2135 case TCP_TW_SYN: {
2136 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2137 &tcp_hashinfo, skb,
2138 __tcp_hdrlen(th),
2139 iph->saddr, th->source,
2140 iph->daddr, th->dest,
2141 inet_iif(skb),
2142 sdif);
2143 if (sk2) {
2144 inet_twsk_deschedule_put(inet_twsk(sk));
2145 sk = sk2;
2146 tcp_v4_restore_cb(skb);
2147 refcounted = false;
2148 goto process;
2149 }
2150 }
2151
2152 fallthrough;
2153 case TCP_TW_ACK:
2154 tcp_v4_timewait_ack(sk, skb);
2155 break;
2156 case TCP_TW_RST:
2157 tcp_v4_send_reset(sk, skb);
2158 inet_twsk_deschedule_put(inet_twsk(sk));
2159 goto discard_it;
2160 case TCP_TW_SUCCESS:;
2161 }
2162 goto discard_it;
2163 }
2164
2165 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2166 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
2167 .twsk_unique = tcp_twsk_unique,
2168 .twsk_destructor= tcp_twsk_destructor,
2169 };
2170
2171 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2172 {
2173 struct dst_entry *dst = skb_dst(skb);
2174
2175 if (dst && dst_hold_safe(dst)) {
2176 rcu_assign_pointer(sk->sk_rx_dst, dst);
2177 sk->sk_rx_dst_ifindex = skb->skb_iif;
2178 }
2179 }
2180 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2181
2182 const struct inet_connection_sock_af_ops ipv4_specific = {
2183 .queue_xmit = ip_queue_xmit,
2184 .send_check = tcp_v4_send_check,
2185 .rebuild_header = inet_sk_rebuild_header,
2186 .sk_rx_dst_set = inet_sk_rx_dst_set,
2187 .conn_request = tcp_v4_conn_request,
2188 .syn_recv_sock = tcp_v4_syn_recv_sock,
2189 .net_header_len = sizeof(struct iphdr),
2190 .setsockopt = ip_setsockopt,
2191 .getsockopt = ip_getsockopt,
2192 .addr2sockaddr = inet_csk_addr2sockaddr,
2193 .sockaddr_len = sizeof(struct sockaddr_in),
2194 .mtu_reduced = tcp_v4_mtu_reduced,
2195 };
2196 EXPORT_SYMBOL(ipv4_specific);
2197
2198 #ifdef CONFIG_TCP_MD5SIG
2199 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2200 .md5_lookup = tcp_v4_md5_lookup,
2201 .calc_md5_hash = tcp_v4_md5_hash_skb,
2202 .md5_parse = tcp_v4_parse_md5_keys,
2203 };
2204 #endif
2205
2206
2207
2208
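/* Address-family specific socket setup; the protocol-generic part is
 * done in tcp_init_sock().
 */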
2209 static int tcp_v4_init_sock(struct sock *sk)
2210 {
2211 struct inet_connection_sock *icsk = inet_csk(sk);
2212
2213 tcp_init_sock(sk);
2214
2215 icsk->icsk_af_ops = &ipv4_specific;
2216
2217 #ifdef CONFIG_TCP_MD5SIG
2218 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2219 #endif
2220
2221 return 0;
2222 }
2223
2224 void tcp_v4_destroy_sock(struct sock *sk)
2225 {
2226 struct tcp_sock *tp = tcp_sk(sk);
2227
2228 trace_tcp_destroy_sock(sk);
2229
2230 tcp_clear_xmit_timers(sk);
2231
2232 tcp_cleanup_congestion_control(sk);
2233
2234 tcp_cleanup_ulp(sk);
2235
2236
2237 tcp_write_queue_purge(sk);
2238
2239
2240 tcp_fastopen_active_disable_ofo_check(sk);
2241
2242
2243 skb_rbtree_purge(&tp->out_of_order_queue);
2244
2245 #ifdef CONFIG_TCP_MD5SIG
2246
2247 if (tp->md5sig_info) {
2248 tcp_clear_md5_list(sk);
2249 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2250 tp->md5sig_info = NULL;
2251 }
2252 #endif
2253
2254
2255 if (inet_csk(sk)->icsk_bind_hash)
2256 inet_put_port(sk);
2257
2258 BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2259
2260
2261 tcp_free_fastopen_req(tp);
2262 tcp_fastopen_destroy_cipher(sk);
2263 tcp_saved_syn_free(tp);
2264
2265 sk_sockets_allocated_dec(sk);
2266 }
2267 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2268
2269 #ifdef CONFIG_PROC_FS
/* TCP socket dumping for /proc/net/tcp and the BPF socket iterator:
 * walk the listening (lhash2) and established (ehash) hash tables.
 */
2272 static unsigned short seq_file_family(const struct seq_file *seq);
2273
2274 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2275 {
2276 unsigned short family = seq_file_family(seq);
2277
2278
2279 return ((family == AF_UNSPEC || family == sk->sk_family) &&
2280 net_eq(sock_net(sk), seq_file_net(seq)));
2281 }
2282
/* Find the first matching socket in a non-empty listening bucket,
 * starting from st->bucket.  The bucket lock is held when a socket is
 * returned.
 */
2286 static void *listening_get_first(struct seq_file *seq)
2287 {
2288 struct tcp_iter_state *st = seq->private;
2289
2290 st->offset = 0;
2291 for (; st->bucket <= tcp_hashinfo.lhash2_mask; st->bucket++) {
2292 struct inet_listen_hashbucket *ilb2;
2293 struct hlist_nulls_node *node;
2294 struct sock *sk;
2295
2296 ilb2 = &tcp_hashinfo.lhash2[st->bucket];
2297 if (hlist_nulls_empty(&ilb2->nulls_head))
2298 continue;
2299
2300 spin_lock(&ilb2->lock);
2301 sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
2302 if (seq_sk_match(seq, sk))
2303 return sk;
2304 }
2305 spin_unlock(&ilb2->lock);
2306 }
2307
2308 return NULL;
2309 }
2310
2311
2312
2313
2314
2315
2316 static void *listening_get_next(struct seq_file *seq, void *cur)
2317 {
2318 struct tcp_iter_state *st = seq->private;
2319 struct inet_listen_hashbucket *ilb2;
2320 struct hlist_nulls_node *node;
2321 struct sock *sk = cur;
2322
2323 ++st->num;
2324 ++st->offset;
2325
2326 sk = sk_nulls_next(sk);
2327 sk_nulls_for_each_from(sk, node) {
2328 if (seq_sk_match(seq, sk))
2329 return sk;
2330 }
2331
2332 ilb2 = &tcp_hashinfo.lhash2[st->bucket];
2333 spin_unlock(&ilb2->lock);
2334 ++st->bucket;
2335 return listening_get_first(seq);
2336 }
2337
2338 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2339 {
2340 struct tcp_iter_state *st = seq->private;
2341 void *rc;
2342
2343 st->bucket = 0;
2344 st->offset = 0;
2345 rc = listening_get_first(seq);
2346
2347 while (rc && *pos) {
2348 rc = listening_get_next(seq, rc);
2349 --*pos;
2350 }
2351 return rc;
2352 }
2353
2354 static inline bool empty_bucket(const struct tcp_iter_state *st)
2355 {
2356 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2357 }
2358
2359 /*
2360  * Get the first established socket, starting from the bucket given in
2361  * st->bucket.  If st->bucket is zero, the very first socket in the hash is returned.
2362  */
2363 static void *established_get_first(struct seq_file *seq)
2364 {
2365 struct tcp_iter_state *st = seq->private;
2366
2367 st->offset = 0;
2368 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2369 struct sock *sk;
2370 struct hlist_nulls_node *node;
2371 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2372
2373 /* Lockless fast path for the common case of empty buckets. */
2374 if (empty_bucket(st))
2375 continue;
2376
2377 spin_lock_bh(lock);
2378 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2379 if (seq_sk_match(seq, sk))
2380 return sk;
2381 }
2382 spin_unlock_bh(lock);
2383 }
2384
2385 return NULL;
2386 }
2387
2388 static void *established_get_next(struct seq_file *seq, void *cur)
2389 {
2390 struct sock *sk = cur;
2391 struct hlist_nulls_node *node;
2392 struct tcp_iter_state *st = seq->private;
2393
2394 ++st->num;
2395 ++st->offset;
2396
2397 sk = sk_nulls_next(sk);
2398
2399 sk_nulls_for_each_from(sk, node) {
2400 if (seq_sk_match(seq, sk))
2401 return sk;
2402 }
2403
2404 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2405 ++st->bucket;
2406 return established_get_first(seq);
2407 }
2408
2409 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2410 {
2411 struct tcp_iter_state *st = seq->private;
2412 void *rc;
2413
2414 st->bucket = 0;
2415 rc = established_get_first(seq);
2416
2417 while (rc && pos) {
2418 rc = established_get_next(seq, rc);
2419 --pos;
2420 }
2421 return rc;
2422 }
2423
2424 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2425 {
2426 void *rc;
2427 struct tcp_iter_state *st = seq->private;
2428
2429 st->state = TCP_SEQ_STATE_LISTENING;
2430 rc = listening_get_idx(seq, &pos);
2431
2432 if (!rc) {
2433 st->state = TCP_SEQ_STATE_ESTABLISHED;
2434 rc = established_get_idx(seq, pos);
2435 }
2436
2437 return rc;
2438 }
2439
2440 static void *tcp_seek_last_pos(struct seq_file *seq)
2441 {
2442 struct tcp_iter_state *st = seq->private;
2443 int bucket = st->bucket;
2444 int offset = st->offset;
2445 int orig_num = st->num;
2446 void *rc = NULL;
2447
2448 switch (st->state) {
2449 case TCP_SEQ_STATE_LISTENING:
2450 if (st->bucket > tcp_hashinfo.lhash2_mask)
2451 break;
2452 st->state = TCP_SEQ_STATE_LISTENING;
2453 rc = listening_get_first(seq);
2454 while (offset-- && rc && bucket == st->bucket)
2455 rc = listening_get_next(seq, rc);
2456 if (rc)
2457 break;
2458 st->bucket = 0;
2459 st->state = TCP_SEQ_STATE_ESTABLISHED;
2460 fallthrough;
2461 case TCP_SEQ_STATE_ESTABLISHED:
2462 if (st->bucket > tcp_hashinfo.ehash_mask)
2463 break;
2464 rc = established_get_first(seq);
2465 while (offset-- && rc && bucket == st->bucket)
2466 rc = established_get_next(seq, rc);
2467 }
2468
2469 st->num = orig_num;
2470
2471 return rc;
2472 }
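
/* Illustrative resume scenario for the logic above: if a previous read of the
 * seq_file stopped at the third socket of listening bucket 7, the iterator
 * keeps st->bucket == 7 and st->offset == 3, so the next read re-walks only
 * that bucket and skips three entries instead of restarting from bucket 0.
 * st->num is restored because the skipped walk must not disturb the running
 * entry counter used for the "sl" column.
 */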
2473
2474 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2475 {
2476 struct tcp_iter_state *st = seq->private;
2477 void *rc;
2478
2479 if (*pos && *pos == st->last_pos) {
2480 rc = tcp_seek_last_pos(seq);
2481 if (rc)
2482 goto out;
2483 }
2484
2485 st->state = TCP_SEQ_STATE_LISTENING;
2486 st->num = 0;
2487 st->bucket = 0;
2488 st->offset = 0;
2489 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2490
2491 out:
2492 st->last_pos = *pos;
2493 return rc;
2494 }
2495 EXPORT_SYMBOL(tcp_seq_start);
2496
2497 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2498 {
2499 struct tcp_iter_state *st = seq->private;
2500 void *rc = NULL;
2501
2502 if (v == SEQ_START_TOKEN) {
2503 rc = tcp_get_idx(seq, 0);
2504 goto out;
2505 }
2506
2507 switch (st->state) {
2508 case TCP_SEQ_STATE_LISTENING:
2509 rc = listening_get_next(seq, v);
2510 if (!rc) {
2511 st->state = TCP_SEQ_STATE_ESTABLISHED;
2512 st->bucket = 0;
2513 st->offset = 0;
2514 rc = established_get_first(seq);
2515 }
2516 break;
2517 case TCP_SEQ_STATE_ESTABLISHED:
2518 rc = established_get_next(seq, v);
2519 break;
2520 }
2521 out:
2522 ++*pos;
2523 st->last_pos = *pos;
2524 return rc;
2525 }
2526 EXPORT_SYMBOL(tcp_seq_next);
2527
2528 void tcp_seq_stop(struct seq_file *seq, void *v)
2529 {
2530 struct tcp_iter_state *st = seq->private;
2531
2532 switch (st->state) {
2533 case TCP_SEQ_STATE_LISTENING:
2534 if (v != SEQ_START_TOKEN)
2535 spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
2536 break;
2537 case TCP_SEQ_STATE_ESTABLISHED:
2538 if (v)
2539 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2540 break;
2541 }
2542 }
2543 EXPORT_SYMBOL(tcp_seq_stop);
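
/* The three hooks above implement the seq_file start/next/stop protocol
 * behind /proc/net/tcp.  A userspace consumer only needs sequential reads;
 * minimal sketch (illustrative, plain libc):
 *
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		char line[256];
 *		FILE *f = fopen("/proc/net/tcp", "r");
 *
 *		if (!f)
 *			return 1;
 *		while (fgets(line, sizeof(line), f))
 *			fputs(line, stdout);
 *		fclose(f);
 *		return 0;
 *	}
 */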
2544
2545 static void get_openreq4(const struct request_sock *req,
2546 struct seq_file *f, int i)
2547 {
2548 const struct inet_request_sock *ireq = inet_rsk(req);
2549 long delta = req->rsk_timer.expires - jiffies;
2550
2551 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2552 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2553 i,
2554 ireq->ir_loc_addr,
2555 ireq->ir_num,
2556 ireq->ir_rmt_addr,
2557 ntohs(ireq->ir_rmt_port),
2558 TCP_SYN_RECV,
2559 0, 0, /* could print option size, but that is af dependent */
2560 1,    /* timers active (only the expire timer) */
2561 jiffies_delta_to_clock_t(delta),
2562 req->num_timeout,
2563 from_kuid_munged(seq_user_ns(f),
2564 sock_i_uid(req->rsk_listener)),
2565 0,    /* non standard timer */
2566 0,    /* open_requests have no inode */
2567 0,
2568 req);
2569 }
2570
2571 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2572 {
2573 int timer_active;
2574 unsigned long timer_expires;
2575 const struct tcp_sock *tp = tcp_sk(sk);
2576 const struct inet_connection_sock *icsk = inet_csk(sk);
2577 const struct inet_sock *inet = inet_sk(sk);
2578 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2579 __be32 dest = inet->inet_daddr;
2580 __be32 src = inet->inet_rcv_saddr;
2581 __u16 destp = ntohs(inet->inet_dport);
2582 __u16 srcp = ntohs(inet->inet_sport);
2583 int rx_queue;
2584 int state;
2585
2586 if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2587 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2588 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2589 timer_active = 1;
2590 timer_expires = icsk->icsk_timeout;
2591 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2592 timer_active = 4;
2593 timer_expires = icsk->icsk_timeout;
2594 } else if (timer_pending(&sk->sk_timer)) {
2595 timer_active = 2;
2596 timer_expires = sk->sk_timer.expires;
2597 } else {
2598 timer_active = 0;
2599 timer_expires = jiffies;
2600 }
2601
2602 state = inet_sk_state_load(sk);
2603 if (state == TCP_LISTEN)
2604 rx_queue = READ_ONCE(sk->sk_ack_backlog);
2605 else
2606 /* Because we don't lock the socket,
2607  * we might find a transient negative value.
2608  */
2609 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2610 READ_ONCE(tp->copied_seq), 0);
2611
2612 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2613 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2614 i, src, srcp, dest, destp, state,
2615 READ_ONCE(tp->write_seq) - tp->snd_una,
2616 rx_queue,
2617 timer_active,
2618 jiffies_delta_to_clock_t(timer_expires - jiffies),
2619 icsk->icsk_retransmits,
2620 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2621 icsk->icsk_probes_out,
2622 sock_i_ino(sk),
2623 refcount_read(&sk->sk_refcnt), sk,
2624 jiffies_to_clock_t(icsk->icsk_rto),
2625 jiffies_to_clock_t(icsk->icsk_ack.ato),
2626 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2627 tcp_snd_cwnd(tp),
2628 state == TCP_LISTEN ?
2629 fastopenq->max_qlen :
2630 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2631 }
2632
2633 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2634 struct seq_file *f, int i)
2635 {
2636 long delta = tw->tw_timer.expires - jiffies;
2637 __be32 dest, src;
2638 __u16 destp, srcp;
2639
2640 dest = tw->tw_daddr;
2641 src = tw->tw_rcv_saddr;
2642 destp = ntohs(tw->tw_dport);
2643 srcp = ntohs(tw->tw_sport);
2644
2645 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2646 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2647 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2648 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2649 refcount_read(&tw->tw_refcnt), tw);
2650 }
2651
2652 #define TMPSZ 150
2653
2654 static int tcp4_seq_show(struct seq_file *seq, void *v)
2655 {
2656 struct tcp_iter_state *st;
2657 struct sock *sk = v;
2658
2659 seq_setwidth(seq, TMPSZ - 1);
2660 if (v == SEQ_START_TOKEN) {
2661 seq_puts(seq, " sl local_address rem_address st tx_queue "
2662 "rx_queue tr tm->when retrnsmt uid timeout "
2663 "inode");
2664 goto out;
2665 }
2666 st = seq->private;
2667
2668 if (sk->sk_state == TCP_TIME_WAIT)
2669 get_timewait4_sock(v, seq, st->num);
2670 else if (sk->sk_state == TCP_NEW_SYN_RECV)
2671 get_openreq4(v, seq, st->num);
2672 else
2673 get_tcp4_sock(v, seq, st->num);
2674 out:
2675 seq_pad(seq, '\n');
2676 return 0;
2677 }
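
/* Example of the resulting /proc/net/tcp layout (values are illustrative;
 * every entry is padded to TMPSZ - 1 characters by seq_pad() above):
 *
 *   sl  local_address rem_address   st tx_queue rx_queue tr tm->when retrnsmt   uid  timeout inode
 *    0: 0100007F:0CEA 00000000:0000 0A 00000000:00000000 00:00000000 00000000  1000        0 12345 ...
 *
 * "0100007F:0CEA" is 127.0.0.1:3306 shown as hex in host byte order, and
 * "0A" is TCP_LISTEN printed with %02X.
 */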
2678
2679 #ifdef CONFIG_BPF_SYSCALL
2680 struct bpf_tcp_iter_state {
2681 struct tcp_iter_state state;
2682 unsigned int cur_sk;
2683 unsigned int end_sk;
2684 unsigned int max_sk;
2685 struct sock **batch;
2686 bool st_bucket_done;
2687 };
2688
2689 struct bpf_iter__tcp {
2690 __bpf_md_ptr(struct bpf_iter_meta *, meta);
2691 __bpf_md_ptr(struct sock_common *, sk_common);
2692 uid_t uid __aligned(8);
2693 };
2694
2695 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2696 struct sock_common *sk_common, uid_t uid)
2697 {
2698 struct bpf_iter__tcp ctx;
2699
2700 meta->seq_num--;  /* skip SEQ_START_TOKEN */
2701 ctx.meta = meta;
2702 ctx.sk_common = sk_common;
2703 ctx.uid = uid;
2704 return bpf_iter_run_prog(prog, &ctx);
2705 }
2706
2707 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
2708 {
2709 while (iter->cur_sk < iter->end_sk)
2710 sock_put(iter->batch[iter->cur_sk++]);
2711 }
2712
2713 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
2714 unsigned int new_batch_sz)
2715 {
2716 struct sock **new_batch;
2717
2718 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
2719 GFP_USER | __GFP_NOWARN);
2720 if (!new_batch)
2721 return -ENOMEM;
2722
2723 bpf_iter_tcp_put_batch(iter);
2724 kvfree(iter->batch);
2725 iter->batch = new_batch;
2726 iter->max_sk = new_batch_sz;
2727
2728 return 0;
2729 }
2730
2731 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
2732 struct sock *start_sk)
2733 {
2734 struct bpf_tcp_iter_state *iter = seq->private;
2735 struct tcp_iter_state *st = &iter->state;
2736 struct hlist_nulls_node *node;
2737 unsigned int expected = 1;
2738 struct sock *sk;
2739
2740 sock_hold(start_sk);
2741 iter->batch[iter->end_sk++] = start_sk;
2742
2743 sk = sk_nulls_next(start_sk);
2744 sk_nulls_for_each_from(sk, node) {
2745 if (seq_sk_match(seq, sk)) {
2746 if (iter->end_sk < iter->max_sk) {
2747 sock_hold(sk);
2748 iter->batch[iter->end_sk++] = sk;
2749 }
2750 expected++;
2751 }
2752 }
2753 spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
2754
2755 return expected;
2756 }
2757
2758 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
2759 struct sock *start_sk)
2760 {
2761 struct bpf_tcp_iter_state *iter = seq->private;
2762 struct tcp_iter_state *st = &iter->state;
2763 struct hlist_nulls_node *node;
2764 unsigned int expected = 1;
2765 struct sock *sk;
2766
2767 sock_hold(start_sk);
2768 iter->batch[iter->end_sk++] = start_sk;
2769
2770 sk = sk_nulls_next(start_sk);
2771 sk_nulls_for_each_from(sk, node) {
2772 if (seq_sk_match(seq, sk)) {
2773 if (iter->end_sk < iter->max_sk) {
2774 sock_hold(sk);
2775 iter->batch[iter->end_sk++] = sk;
2776 }
2777 expected++;
2778 }
2779 }
2780 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2781
2782 return expected;
2783 }
2784
2785 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
2786 {
2787 struct bpf_tcp_iter_state *iter = seq->private;
2788 struct tcp_iter_state *st = &iter->state;
2789 unsigned int expected;
2790 bool resized = false;
2791 struct sock *sk;
2792
2793 /* The st->bucket is done.  Directly advance to the next
2794  * bucket instead of having tcp_seek_last_pos() skip
2795  * sockets one by one in the current bucket until it eventually
2796  * finds out it has to advance to the next bucket.
2797  */
2798 if (iter->st_bucket_done) {
2799 st->offset = 0;
2800 st->bucket++;
2801 if (st->state == TCP_SEQ_STATE_LISTENING &&
2802 st->bucket > tcp_hashinfo.lhash2_mask) {
2803 st->state = TCP_SEQ_STATE_ESTABLISHED;
2804 st->bucket = 0;
2805 }
2806 }
2807
2808 again:
2809 /* Get a new batch */
2810 iter->cur_sk = 0;
2811 iter->end_sk = 0;
2812 iter->st_bucket_done = false;
2813
2814 sk = tcp_seek_last_pos(seq);
2815 if (!sk)
2816 return NULL;
2817
2818 if (st->state == TCP_SEQ_STATE_LISTENING)
2819 expected = bpf_iter_tcp_listening_batch(seq, sk);
2820 else
2821 expected = bpf_iter_tcp_established_batch(seq, sk);
2822
2823 if (iter->end_sk == expected) {
2824 iter->st_bucket_done = true;
2825 return sk;
2826 }
2827
2828 if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
2829 resized = true;
2830 goto again;
2831 }
2832
2833 return sk;
2834 }
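
/* Batching sketch: the target bucket is walked once under its lock, counting
 * every matching socket ("expected") while taking references on at most
 * iter->max_sk of them.  If the batch array turned out to be too small, it is
 * grown to 3/2 of the expected count and the bucket is walked once more via
 * the "again:" label, so a bucket is locked at most twice per batch.
 */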
2835
2836 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
2837 {
2838 /* bpf iter does not support lseek, so it always
2839  * continues from where it was stop()-ped.
2840  */
2841 if (*pos)
2842 return bpf_iter_tcp_batch(seq);
2843
2844 return SEQ_START_TOKEN;
2845 }
2846
2847 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2848 {
2849 struct bpf_tcp_iter_state *iter = seq->private;
2850 struct tcp_iter_state *st = &iter->state;
2851 struct sock *sk;
2852
2853 /* Whenever seq_next() is called, iter->cur_sk is
2854  * done with seq_show(), so advance to the next sk in
2855  * the batch.
2856  */
2857 if (iter->cur_sk < iter->end_sk) {
2858 /* Keep st->num consistent with tcp_iter_state.
2859  * bpf_iter_tcp does not use st->num;
2860  * meta.seq_num is used instead.
2861  */
2862 st->num++;
2863 /* Move st->offset to the next sk in the bucket such that
2864  * a future start() will resume at st->offset in
2865  * st->bucket.  See tcp_seek_last_pos().
2866  */
2867 st->offset++;
2868 sock_put(iter->batch[iter->cur_sk++]);
2869 }
2870
2871 if (iter->cur_sk < iter->end_sk)
2872 sk = iter->batch[iter->cur_sk];
2873 else
2874 sk = bpf_iter_tcp_batch(seq);
2875
2876 ++*pos;
2877 /* Keep st->last_pos consistent with tcp_iter_state;
2878  * bpf iter does not do lseek, so st->last_pos always equals *pos.
2879  */
2880 st->last_pos = *pos;
2881 return sk;
2882 }
2883
2884 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2885 {
2886 struct bpf_iter_meta meta;
2887 struct bpf_prog *prog;
2888 struct sock *sk = v;
2889 bool slow;
2890 uid_t uid;
2891 int ret;
2892
2893 if (v == SEQ_START_TOKEN)
2894 return 0;
2895
2896 if (sk_fullsock(sk))
2897 slow = lock_sock_fast(sk);
2898
2899 if (unlikely(sk_unhashed(sk))) {
2900 ret = SEQ_SKIP;
2901 goto unlock;
2902 }
2903
2904 if (sk->sk_state == TCP_TIME_WAIT) {
2905 uid = 0;
2906 } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2907 const struct request_sock *req = v;
2908
2909 uid = from_kuid_munged(seq_user_ns(seq),
2910 sock_i_uid(req->rsk_listener));
2911 } else {
2912 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2913 }
2914
2915 meta.seq = seq;
2916 prog = bpf_iter_get_info(&meta, false);
2917 ret = tcp_prog_seq_show(prog, &meta, v, uid);
2918
2919 unlock:
2920 if (sk_fullsock(sk))
2921 unlock_sock_fast(sk, slow);
2922 return ret;
2924 }
2925
2926 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
2927 {
2928 struct bpf_tcp_iter_state *iter = seq->private;
2929 struct bpf_iter_meta meta;
2930 struct bpf_prog *prog;
2931
2932 if (!v) {
2933 meta.seq = seq;
2934 prog = bpf_iter_get_info(&meta, true);
2935 if (prog)
2936 (void)tcp_prog_seq_show(prog, &meta, v, 0);
2937 }
2938
2939 if (iter->cur_sk < iter->end_sk) {
2940 bpf_iter_tcp_put_batch(iter);
2941 iter->st_bucket_done = false;
2942 }
2943 }
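
/* If the iterator is stopped before the batch is fully consumed (e.g. the
 * reader closed the file after a partial read), the per-socket references
 * taken while batching are dropped here instead of in seq_next().
 */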
2944
2945 static const struct seq_operations bpf_iter_tcp_seq_ops = {
2946 .show = bpf_iter_tcp_seq_show,
2947 .start = bpf_iter_tcp_seq_start,
2948 .next = bpf_iter_tcp_seq_next,
2949 .stop = bpf_iter_tcp_seq_stop,
2950 };
2951 #endif
2952 static unsigned short seq_file_family(const struct seq_file *seq)
2953 {
2954 const struct tcp_seq_afinfo *afinfo;
2955
2956 #ifdef CONFIG_BPF_SYSCALL
2957 /* Iterated from bpf_iter; let the bpf prog filter instead. */
2958 if (seq->op == &bpf_iter_tcp_seq_ops)
2959 return AF_UNSPEC;
2960 #endif
2961
2962 /* Iterated from proc fs. */
2963 afinfo = pde_data(file_inode(seq->file));
2964 return afinfo->family;
2965 }
2966
2967 static const struct seq_operations tcp4_seq_ops = {
2968 .show = tcp4_seq_show,
2969 .start = tcp_seq_start,
2970 .next = tcp_seq_next,
2971 .stop = tcp_seq_stop,
2972 };
2973
2974 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2975 .family = AF_INET,
2976 };
2977
2978 static int __net_init tcp4_proc_init_net(struct net *net)
2979 {
2980 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2981 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2982 return -ENOMEM;
2983 return 0;
2984 }
2985
2986 static void __net_exit tcp4_proc_exit_net(struct net *net)
2987 {
2988 remove_proc_entry("tcp", net->proc_net);
2989 }
2990
2991 static struct pernet_operations tcp4_net_ops = {
2992 .init = tcp4_proc_init_net,
2993 .exit = tcp4_proc_exit_net,
2994 };
2995
2996 int __init tcp4_proc_init(void)
2997 {
2998 return register_pernet_subsys(&tcp4_net_ops);
2999 }
3000
3001 void tcp4_proc_exit(void)
3002 {
3003 unregister_pernet_subsys(&tcp4_net_ops);
3004 }
3005 #endif /* CONFIG_PROC_FS */
3006
3007 /* @wake is one when sk_stream_write_space() calls us.
3008  * This sends EPOLLOUT only if notsent_bytes is half the limit.
3009  * This mimics the strategy used in sock_def_write_space().
3010  */
3011 bool tcp_stream_memory_free(const struct sock *sk, int wake)
3012 {
3013 const struct tcp_sock *tp = tcp_sk(sk);
3014 u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3015 READ_ONCE(tp->snd_nxt);
3016
3017 return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3018 }
3019 EXPORT_SYMBOL(tcp_stream_memory_free);
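
/* Worked example (illustrative numbers): with TCP_NOTSENT_LOWAT or
 * net.ipv4.tcp_notsent_lowat set to 131072, the poll()/EPOLLOUT path calls
 * this with wake == 1, so the socket reports writable only while
 * (notsent_bytes << 1) < 131072, i.e. while less than 64 KiB is still
 * unsent.  Userspace sketch for setting the limit:
 *
 *	int lowat = 128 * 1024;
 *
 *	setsockopt(fd, IPPROTO_TCP, TCP_NOTSENT_LOWAT, &lowat, sizeof(lowat));
 */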
3020
3021 struct proto tcp_prot = {
3022 .name = "TCP",
3023 .owner = THIS_MODULE,
3024 .close = tcp_close,
3025 .pre_connect = tcp_v4_pre_connect,
3026 .connect = tcp_v4_connect,
3027 .disconnect = tcp_disconnect,
3028 .accept = inet_csk_accept,
3029 .ioctl = tcp_ioctl,
3030 .init = tcp_v4_init_sock,
3031 .destroy = tcp_v4_destroy_sock,
3032 .shutdown = tcp_shutdown,
3033 .setsockopt = tcp_setsockopt,
3034 .getsockopt = tcp_getsockopt,
3035 .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt,
3036 .keepalive = tcp_set_keepalive,
3037 .recvmsg = tcp_recvmsg,
3038 .sendmsg = tcp_sendmsg,
3039 .sendpage = tcp_sendpage,
3040 .backlog_rcv = tcp_v4_do_rcv,
3041 .release_cb = tcp_release_cb,
3042 .hash = inet_hash,
3043 .unhash = inet_unhash,
3044 .get_port = inet_csk_get_port,
3045 .put_port = inet_put_port,
3046 #ifdef CONFIG_BPF_SYSCALL
3047 .psock_update_sk_prot = tcp_bpf_update_proto,
3048 #endif
3049 .enter_memory_pressure = tcp_enter_memory_pressure,
3050 .leave_memory_pressure = tcp_leave_memory_pressure,
3051 .stream_memory_free = tcp_stream_memory_free,
3052 .sockets_allocated = &tcp_sockets_allocated,
3053 .orphan_count = &tcp_orphan_count,
3054
3055 .memory_allocated = &tcp_memory_allocated,
3056 .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc,
3057
3058 .memory_pressure = &tcp_memory_pressure,
3059 .sysctl_mem = sysctl_tcp_mem,
3060 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
3061 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
3062 .max_header = MAX_TCP_HEADER,
3063 .obj_size = sizeof(struct tcp_sock),
3064 .slab_flags = SLAB_TYPESAFE_BY_RCU,
3065 .twsk_prot = &tcp_timewait_sock_ops,
3066 .rsk_prot = &tcp_request_sock_ops,
3067 .h.hashinfo = &tcp_hashinfo,
3068 .no_autobind = true,
3069 .diag_destroy = tcp_abort,
3070 };
3071 EXPORT_SYMBOL(tcp_prot);
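
/* tcp_prot is registered from inet_init() (net/ipv4/af_inet.c) via
 * proto_register(&tcp_prot, 1) and is selected through the inetsw table for
 * SOCK_STREAM/IPPROTO_TCP, so an ordinary
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *
 * ends up on the callbacks above (.init = tcp_v4_init_sock at creation,
 * .connect = tcp_v4_connect from connect(), and so on).
 */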
3072
3073 static void __net_exit tcp_sk_exit(struct net *net)
3074 {
3075 struct inet_timewait_death_row *tcp_death_row = net->ipv4.tcp_death_row;
3076
3077 if (net->ipv4.tcp_congestion_control)
3078 bpf_module_put(net->ipv4.tcp_congestion_control,
3079 net->ipv4.tcp_congestion_control->owner);
3080 if (refcount_dec_and_test(&tcp_death_row->tw_refcount))
3081 kfree(tcp_death_row);
3082 }
3083
3084 static int __net_init tcp_sk_init(struct net *net)
3085 {
3086 int cnt;
3087
3088 net->ipv4.sysctl_tcp_ecn = 2;
3089 net->ipv4.sysctl_tcp_ecn_fallback = 1;
3090
3091 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3092 net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3093 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3094 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3095 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3096
3097 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3098 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3099 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3100
3101 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3102 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3103 net->ipv4.sysctl_tcp_syncookies = 1;
3104 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3105 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3106 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3107 net->ipv4.sysctl_tcp_orphan_retries = 0;
3108 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3109 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3110 net->ipv4.sysctl_tcp_tw_reuse = 2;
3111 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3112
3113 net->ipv4.tcp_death_row = kzalloc(sizeof(struct inet_timewait_death_row), GFP_KERNEL);
3114 if (!net->ipv4.tcp_death_row)
3115 return -ENOMEM;
3116 refcount_set(&net->ipv4.tcp_death_row->tw_refcount, 1);
3117 cnt = tcp_hashinfo.ehash_mask + 1;
3118 net->ipv4.tcp_death_row->sysctl_max_tw_buckets = cnt / 2;
3119 net->ipv4.tcp_death_row->hashinfo = &tcp_hashinfo;
3120
3121 net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
3122 net->ipv4.sysctl_tcp_sack = 1;
3123 net->ipv4.sysctl_tcp_window_scaling = 1;
3124 net->ipv4.sysctl_tcp_timestamps = 1;
3125 net->ipv4.sysctl_tcp_early_retrans = 3;
3126 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3127 net->ipv4.sysctl_tcp_slow_start_after_idle = 1;
3128 net->ipv4.sysctl_tcp_retrans_collapse = 1;
3129 net->ipv4.sysctl_tcp_max_reordering = 300;
3130 net->ipv4.sysctl_tcp_dsack = 1;
3131 net->ipv4.sysctl_tcp_app_win = 31;
3132 net->ipv4.sysctl_tcp_adv_win_scale = 1;
3133 net->ipv4.sysctl_tcp_frto = 2;
3134 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3135
3136 /* Limit the share of the congestion window a single TSO frame may
3137  * consume; overly large TSO frames can make TCP streams bursty.
3138  */
3139 net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3140
3141 net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
3142
3143 /* RFC 5961 challenge ACK rate limiting (effectively unlimited by default). */
3144 net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;
3145
3146 net->ipv4.sysctl_tcp_min_tso_segs = 2;
3147 net->ipv4.sysctl_tcp_tso_rtt_log = 9;
3148 net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3149 net->ipv4.sysctl_tcp_autocorking = 1;
3150 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3151 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3152 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3153 if (net != &init_net) {
3154 memcpy(net->ipv4.sysctl_tcp_rmem,
3155 init_net.ipv4.sysctl_tcp_rmem,
3156 sizeof(init_net.ipv4.sysctl_tcp_rmem));
3157 memcpy(net->ipv4.sysctl_tcp_wmem,
3158 init_net.ipv4.sysctl_tcp_wmem,
3159 sizeof(init_net.ipv4.sysctl_tcp_wmem));
3160 }
3161 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3162 net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3163 net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3164 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3165 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3166 atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3167
3168 /* Reno is always built in. */
3169 if (!net_eq(net, &init_net) &&
3170 bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3171 init_net.ipv4.tcp_congestion_control->owner))
3172 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3173 else
3174 net->ipv4.tcp_congestion_control = &tcp_reno;
3175
3176 return 0;
3177 }
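
/* Most of the defaults above are exposed per network namespace under
 * /proc/sys/net/ipv4/.  For instance, with the initialization above and no
 * administrative overrides, "cat /proc/sys/net/ipv4/tcp_syncookies" prints 1
 * and "cat /proc/sys/net/ipv4/tcp_notsent_lowat" prints 4294967295
 * (UINT_MAX).
 */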
3178
3179 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3180 {
3181 struct net *net;
3182
3183 inet_twsk_purge(&tcp_hashinfo, AF_INET);
3184
3185 list_for_each_entry(net, net_exit_list, exit_list)
3186 tcp_fastopen_ctx_destroy(net);
3187 }
3188
3189 static struct pernet_operations __net_initdata tcp_sk_ops = {
3190 .init = tcp_sk_init,
3191 .exit = tcp_sk_exit,
3192 .exit_batch = tcp_sk_exit_batch,
3193 };
3194
3195 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3196 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3197 struct sock_common *sk_common, uid_t uid)
3198
3199 #define INIT_BATCH_SZ 16
3200
3201 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3202 {
3203 struct bpf_tcp_iter_state *iter = priv_data;
3204 int err;
3205
3206 err = bpf_iter_init_seq_net(priv_data, aux);
3207 if (err)
3208 return err;
3209
3210 err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
3211 if (err) {
3212 bpf_iter_fini_seq_net(priv_data);
3213 return err;
3214 }
3215
3216 return 0;
3217 }
3218
3219 static void bpf_iter_fini_tcp(void *priv_data)
3220 {
3221 struct bpf_tcp_iter_state *iter = priv_data;
3222
3223 bpf_iter_fini_seq_net(priv_data);
3224 kvfree(iter->batch);
3225 }
3226
3227 static const struct bpf_iter_seq_info tcp_seq_info = {
3228 .seq_ops = &bpf_iter_tcp_seq_ops,
3229 .init_seq_private = bpf_iter_init_tcp,
3230 .fini_seq_private = bpf_iter_fini_tcp,
3231 .seq_priv_size = sizeof(struct bpf_tcp_iter_state),
3232 };
3233
3234 static const struct bpf_func_proto *
3235 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3236 const struct bpf_prog *prog)
3237 {
3238 switch (func_id) {
3239 case BPF_FUNC_setsockopt:
3240 return &bpf_sk_setsockopt_proto;
3241 case BPF_FUNC_getsockopt:
3242 return &bpf_sk_getsockopt_proto;
3243 default:
3244 return NULL;
3245 }
3246 }
3247
3248 static struct bpf_iter_reg tcp_reg_info = {
3249 .target = "tcp",
3250 .ctx_arg_info_size = 1,
3251 .ctx_arg_info = {
3252 { offsetof(struct bpf_iter__tcp, sk_common),
3253 PTR_TO_BTF_ID_OR_NULL },
3254 },
3255 .get_func_proto = bpf_iter_tcp_get_func_proto,
3256 .seq_info = &tcp_seq_info,
3257 };
3258
3259 static void __init bpf_iter_register(void)
3260 {
3261 tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3262 if (bpf_iter_reg_target(&tcp_reg_info))
3263 pr_warn("Warning: could not register bpf iterator tcp\n");
3264 }
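
/* Usage sketch for the "tcp" iterator target registered above (illustrative;
 * modeled on the bpf_iter selftests, program and section names are examples,
 * built against vmlinux.h with a GPL license declaration, and BPF_SEQ_PRINTF
 * comes from libbpf's <bpf/bpf_tracing.h>):
 *
 *	SEC("iter/tcp")
 *	int dump_tcp(struct bpf_iter__tcp *ctx)
 *	{
 *		struct sock_common *skc = ctx->sk_common;
 *
 *		if (!skc)
 *			return 0;
 *		BPF_SEQ_PRINTF(ctx->meta->seq, "family=%u state=%u\n",
 *			       skc->skc_family, skc->skc_state);
 *		return 0;
 *	}
 *
 * Pinning the attached iterator (e.g. with "bpftool iter pin") and reading
 * the pinned file runs the program once per socket, batched by
 * bpf_iter_tcp_batch() above.
 */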
3265
3266 #endif
3267
3268 void __init tcp_v4_init(void)
3269 {
3270 int cpu, res;
3271
3272 for_each_possible_cpu(cpu) {
3273 struct sock *sk;
3274
3275 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3276 IPPROTO_TCP, &init_net);
3277 if (res)
3278 panic("Failed to create the TCP control socket.\n");
3279 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3280
3281 /* Enforce IP_DF and IPID == 0 for RST and
3282  * ACK segments sent in SYN_RECV and TIME_WAIT state.
3283  */
3284 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3285
3286 per_cpu(ipv4_tcp_sk, cpu) = sk;
3287 }
3288 if (register_pernet_subsys(&tcp_sk_ops))
3289 panic("Failed to create the TCP control socket.\n");
3290
3291 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3292 bpf_iter_register();
3293 #endif
3294 }