// SPDX-License-Identifier: GPL-2.0-only
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		The Internet Protocol (IP) output module.
 */
#include <linux/uaccess.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/highmem.h>
#include <linux/slab.h>

#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/proc_fs.h>
#include <linux/stat.h>
#include <linux/init.h>

#include <net/snmp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/xfrm.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/arp.h>
#include <net/icmp.h>
#include <net/checksum.h>
#include <net/inetpeer.h>
#include <net/inet_ecn.h>
#include <net/lwtunnel.h>
#include <linux/bpf-cgroup.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_bridge.h>
#include <linux/netlink.h>
#include <linux/tcp.h>

static int
ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
	    unsigned int mtu,
	    int (*output)(struct net *, struct sock *, struct sk_buff *));

/* Generate a checksum for an outgoing IP datagram. */
void ip_send_check(struct iphdr *iph)
{
	iph->check = 0;
	iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
}
EXPORT_SYMBOL(ip_send_check);

int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct iphdr *iph = ip_hdr(skb);

	iph->tot_len = htons(skb->len);
	ip_send_check(iph);

	/* if egress device is enslaved to an L3 master device pass the
	 * skb to its handler for processing
	 */
	skb = l3mdev_ip_out(sk, skb);
	if (unlikely(!skb))
		return 0;

	skb->protocol = htons(ETH_P_IP);

	return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT,
		       net, sk, skb, NULL, skb_dst(skb)->dev,
		       dst_output);
}

int ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	int err;

	err = __ip_local_out(net, sk, skb);
	if (likely(err == 1))
		err = dst_output(net, sk, skb);

	return err;
}
EXPORT_SYMBOL_GPL(ip_local_out);
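
/* Note on the return convention: the inline nf_hook() above returns 1 when
 * the NF_INET_LOCAL_OUT hooks accept the packet without stealing or
 * queueing it, in which case nothing has been transmitted yet and
 * ip_local_out() must call dst_output() itself.  Any other value means the
 * packet was consumed by netfilter (dropped, queued, or the okfn will be
 * invoked on reinject), so it must not be touched again here.
 */
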
0131
0132 static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
0133 {
0134 int ttl = inet->uc_ttl;
0135
0136 if (ttl < 0)
0137 ttl = ip4_dst_hoplimit(dst);
0138 return ttl;
0139 }
0140
0141
0142
0143
0144
0145 int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk,
0146 __be32 saddr, __be32 daddr, struct ip_options_rcu *opt,
0147 u8 tos)
0148 {
0149 struct inet_sock *inet = inet_sk(sk);
0150 struct rtable *rt = skb_rtable(skb);
0151 struct net *net = sock_net(sk);
0152 struct iphdr *iph;
0153
0154
0155 skb_push(skb, sizeof(struct iphdr) + (opt ? opt->opt.optlen : 0));
0156 skb_reset_network_header(skb);
0157 iph = ip_hdr(skb);
0158 iph->version = 4;
0159 iph->ihl = 5;
0160 iph->tos = tos;
0161 iph->ttl = ip_select_ttl(inet, &rt->dst);
0162 iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr);
0163 iph->saddr = saddr;
0164 iph->protocol = sk->sk_protocol;
0165
0166 if (skb->len <= IPV4_MIN_MTU || ip_dont_fragment(sk, &rt->dst)) {
0167 iph->frag_off = htons(IP_DF);
0168 iph->id = 0;
0169 } else {
0170 iph->frag_off = 0;
0171
0172
0173
0174 if (sk->sk_protocol == IPPROTO_TCP)
0175 iph->id = (__force __be16)prandom_u32();
0176 else
0177 __ip_select_ident(net, iph, 1);
0178 }
0179
0180 if (opt && opt->opt.optlen) {
0181 iph->ihl += opt->opt.optlen>>2;
0182 ip_options_build(skb, &opt->opt, daddr, rt);
0183 }
0184
0185 skb->priority = sk->sk_priority;
0186 if (!skb->mark)
0187 skb->mark = sk->sk_mark;
0188
0189
0190 return ip_local_out(net, skb->sk, skb);
0191 }
0192 EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
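
/* A minimal sketch of a caller, modeled on how TCP transmits SYN-ACKs
 * (identifier names are illustrative, not a fixed kernel API):
 *
 *	// skb already carries the TCP header and has its route attached
 *	// via skb_dst_set(); the listener socket supplies the protocol,
 *	// TTL and priority defaults.
 *	err = ip_build_and_send_pkt(skb, sk, local_addr, remote_addr,
 *				    ireq_opt, tos);
 *	err = net_xmit_eval(err);	// treat NET_XMIT_CN as success
 *
 * The skb is consumed on all paths, so the caller must not use it after
 * the call returns.
 */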

static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct rtable *rt = (struct rtable *)dst;
	struct net_device *dev = dst->dev;
	unsigned int hh_len = LL_RESERVED_SPACE(dev);
	struct neighbour *neigh;
	bool is_v6gw = false;

	if (rt->rt_type == RTN_MULTICAST) {
		IP_UPD_PO_STATS(net, IPSTATS_MIB_OUTMCAST, skb->len);
	} else if (rt->rt_type == RTN_BROADCAST)
		IP_UPD_PO_STATS(net, IPSTATS_MIB_OUTBCAST, skb->len);

	/* Be paranoid, rather than too clever. */
	if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
		skb = skb_expand_head(skb, hh_len);
		if (!skb)
			return -ENOMEM;
	}

	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
		int res = lwtunnel_xmit(skb);

		if (res < 0 || res == LWTUNNEL_XMIT_DONE)
			return res;
	}

	rcu_read_lock_bh();
	neigh = ip_neigh_for_gw(rt, skb, &is_v6gw);
	if (!IS_ERR(neigh)) {
		int res;

		sock_confirm_neigh(skb, neigh);
		/* if crossing protocols, can not use the cached header */
		res = neigh_output(neigh, skb, is_v6gw);
		rcu_read_unlock_bh();
		return res;
	}
	rcu_read_unlock_bh();

	net_dbg_ratelimited("%s: No header cache and no neighbour!\n",
			    __func__);
	kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL);
	return -EINVAL;
}
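
/* ip_finish_output2() is the last IP-level step on the output path: by the
 * time it runs the packet already fits the egress device (it was small
 * enough, software-segmented, or fragmented), so what remains is resolving
 * the L2 neighbour for the nexthop - ARP for IPv4, or an IPv6 neighbour
 * when the route uses an IPv6 gateway for IPv4 traffic (RFC 5549), which
 * is what is_v6gw tracks - and handing the skb to neigh_output().
 */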

static int ip_finish_output_gso(struct net *net, struct sock *sk,
				struct sk_buff *skb, unsigned int mtu)
{
	struct sk_buff *segs, *nskb;
	netdev_features_t features;
	int ret = 0;

	/* common case: seglen is <= mtu */
	if (skb_gso_validate_network_len(skb, mtu))
		return ip_finish_output2(net, sk, skb);

	/* Slowpath - GSO segment length exceeds the egress MTU.
	 *
	 * This can happen in several cases:
	 *  - Forwarding of a TCP GRO skb, when DF flag is not set.
	 *  - Forwarding of an skb that arrived on a virtualization-enabled
	 *    interface and from a device that does not support GSO.
	 *  - Local GSO skb transmitted on an NETIF_F_IP_CSUM enabled device.
	 *  - Local GRO skb transmitted on an aggregated tunnel (regain gso_size).
	 */
	features = netif_skb_features(skb);
	BUILD_BUG_ON(sizeof(*IPCB(skb)) > SKB_GSO_CB_OFFSET);
	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
	if (IS_ERR_OR_NULL(segs)) {
		kfree_skb(skb);
		return -ENOMEM;
	}

	consume_skb(skb);

	skb_list_walk_safe(segs, segs, nskb) {
		int err;

		skb_mark_not_on_list(segs);
		err = ip_fragment(net, sk, segs, mtu, ip_finish_output2);

		if (err && ret == 0)
			ret = err;
	}

	return ret;
}

static int __ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	unsigned int mtu;

#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	if (skb_dst(skb)->xfrm) {
		IPCB(skb)->flags |= IPSKB_REROUTED;
		return dst_output(net, sk, skb);
	}
#endif
	mtu = ip_skb_dst_mtu(sk, skb);
	if (skb_is_gso(skb))
		return ip_finish_output_gso(net, sk, skb, mtu);

	if (skb->len > mtu || IPCB(skb)->frag_max_size)
		return ip_fragment(net, sk, skb, mtu, ip_finish_output2);

	return ip_finish_output2(net, sk, skb);
}

static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	int ret;

	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
	switch (ret) {
	case NET_XMIT_SUCCESS:
		return __ip_finish_output(net, sk, skb);
	case NET_XMIT_CN:
		return __ip_finish_output(net, sk, skb) ? : ret;
	default:
		kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS);
		return ret;
	}
}
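
/* Verdict mapping for the cgroup egress program run above: NET_XMIT_SUCCESS
 * means pass; NET_XMIT_CN means pass but report congestion to the caller
 * (the "?:" keeps a real transmit error if one occurs, else returns CN);
 * anything else (NET_XMIT_DROP or -EPERM) means the program rejected the
 * packet and it is freed here.
 */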

static int ip_mc_finish_output(struct net *net, struct sock *sk,
			       struct sk_buff *skb)
{
	struct rtable *new_rt;
	bool do_cn = false;
	int ret, err;

	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
	switch (ret) {
	case NET_XMIT_CN:
		do_cn = true;
		fallthrough;
	case NET_XMIT_SUCCESS:
		break;
	default:
		kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS);
		return ret;
	}

	/* Reset rt_iif so that inet_iif() will return skb->skb_iif. Setting
	 * this to non-zero causes ipi_ifindex in in_pktinfo to be overwritten,
	 * see ipv4_pktinfo_prepare().
	 */
	new_rt = rt_dst_clone(net->loopback_dev, skb_rtable(skb));
	if (new_rt) {
		new_rt->rt_iif = 0;
		skb_dst_drop(skb);
		skb_dst_set(skb, &new_rt->dst);
	}

	err = dev_loopback_xmit(net, sk, skb);
	return (do_cn && err) ? ret : err;
}

int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct net_device *dev = rt->dst.dev;

	/*
	 *	If the indicated interface is up and running, send the packet.
	 */
	IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len);

	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);

	/*
	 *	Multicasts are looped back for other local users
	 */

	if (rt->rt_flags&RTCF_MULTICAST) {
		if (sk_mc_loop(sk)
#ifdef CONFIG_IP_MROUTE
		/* Small optimization: do not loopback not local frames,
		   which returned after forwarding; they will be dropped
		   by ip_mr_input in any case.
		   Note, that local frames are looped back to be delivered
		   to local recipients.

		   This check is duplicated in ip_mr_input at the moment.
		 */
		    &&
		    ((rt->rt_flags & RTCF_LOCAL) ||
		     !(IPCB(skb)->flags & IPSKB_FORWARDED))
#endif
		   ) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
			if (newskb)
				NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
					net, sk, newskb, NULL, newskb->dev,
					ip_mc_finish_output);
		}

		/* Multicasts with ttl 0 must not go beyond the host */

		if (ip_hdr(skb)->ttl == 0) {
			kfree_skb(skb);
			return 0;
		}
	}

	if (rt->rt_flags&RTCF_BROADCAST) {
		struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
		if (newskb)
			NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
				net, sk, newskb, NULL, newskb->dev,
				ip_mc_finish_output);
	}

	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
			    net, sk, skb, NULL, skb->dev,
			    ip_finish_output,
			    !(IPCB(skb)->flags & IPSKB_REROUTED));
}

int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;

	IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len);

	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);

	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
			    net, sk, skb, indev, dev,
			    ip_finish_output,
			    !(IPCB(skb)->flags & IPSKB_REROUTED));
}
EXPORT_SYMBOL(ip_output);
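
/* ip_output() runs the NF_INET_POST_ROUTING hooks conditionally: when an
 * XFRM transform has already re-routed the packet, __ip_finish_output()
 * set IPSKB_REROUTED and the hooks were traversed on the first pass, so
 * NF_HOOK_COND skips them and calls ip_finish_output() directly.
 */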

/*
 * copy saddr and daddr, possibly using 64bit load/store
 * Equivalent to :
 *	iph->saddr = fl4->saddr;
 *	iph->daddr = fl4->daddr;
 */
static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4)
{
	BUILD_BUG_ON(offsetof(typeof(*fl4), daddr) !=
		     offsetof(typeof(*fl4), saddr) + sizeof(fl4->saddr));

	iph->saddr = fl4->saddr;
	iph->daddr = fl4->daddr;
}

/* Note: skb->sk can be different from sk, in case of tunnels */
int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
		    __u8 tos)
{
	struct inet_sock *inet = inet_sk(sk);
	struct net *net = sock_net(sk);
	struct ip_options_rcu *inet_opt;
	struct flowi4 *fl4;
	struct rtable *rt;
	struct iphdr *iph;
	int res;

	/* Skip all of this if the packet is already routed,
	 * f.e. by something like SCTP.
	 */
	rcu_read_lock();
	inet_opt = rcu_dereference(inet->inet_opt);
	fl4 = &fl->u.ip4;
	rt = skb_rtable(skb);
	if (rt)
		goto packet_routed;

	/* Make sure we can route this packet. */
	rt = (struct rtable *)__sk_dst_check(sk, 0);
	if (!rt) {
		__be32 daddr;

		/* Use correct destination address if we have options. */
		daddr = inet->inet_daddr;
		if (inet_opt && inet_opt->opt.srr)
			daddr = inet_opt->opt.faddr;

		/* If this fails, retransmit mechanism of transport layer will
		 * keep trying until route appears or the connection times
		 * itself out.
		 */
		rt = ip_route_output_ports(net, fl4, sk,
					   daddr, inet->inet_saddr,
					   inet->inet_dport,
					   inet->inet_sport,
					   sk->sk_protocol,
					   RT_CONN_FLAGS_TOS(sk, tos),
					   sk->sk_bound_dev_if);
		if (IS_ERR(rt))
			goto no_route;
		sk_setup_caps(sk, &rt->dst);
	}
	skb_dst_set_noref(skb, &rt->dst);

packet_routed:
	if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_uses_gateway)
		goto no_route;

	/* OK, we know where to send it, allocate and build IP header. */
	skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0));
	skb_reset_network_header(skb);
	iph = ip_hdr(skb);
	*((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (tos & 0xff));
	if (ip_dont_fragment(sk, &rt->dst) && !skb->ignore_df)
		iph->frag_off = htons(IP_DF);
	else
		iph->frag_off = 0;
	iph->ttl = ip_select_ttl(inet, &rt->dst);
	iph->protocol = sk->sk_protocol;
	ip_copy_addrs(iph, fl4);

	/* Transport layer set skb->h.foo itself. */

	if (inet_opt && inet_opt->opt.optlen) {
		iph->ihl += inet_opt->opt.optlen >> 2;
		ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt);
	}

	ip_select_ident_segs(net, skb, sk,
			     skb_shinfo(skb)->gso_segs ?: 1);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	res = ip_local_out(net, sk, skb);
	rcu_read_unlock();
	return res;

no_route:
	rcu_read_unlock();
	IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
	kfree_skb_reason(skb, SKB_DROP_REASON_IP_OUTNOROUTES);
	return -EHOSTUNREACH;
}
EXPORT_SYMBOL(__ip_queue_xmit);

int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)
{
	return __ip_queue_xmit(sk, skb, fl, inet_sk(sk)->tos);
}
EXPORT_SYMBOL(ip_queue_xmit);
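
/* The single 16-bit store in __ip_queue_xmit() packs version, header length
 * and TOS in one write.  Spelled out (a worked example, not code the
 * kernel needs):
 *
 *	(4 << 12) | (5 << 8) | tos  ==  (0x45 << 8) | tos
 *
 * so after htons() the first header byte on the wire is 0x45 (IPv4, ihl 5,
 * i.e. a 20-byte header, grown later if IP options are appended) followed
 * by the TOS byte.
 */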

static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	to->skb_iif = from->skb_iif;
	skb_dst_drop(to);
	skb_dst_copy(to, from);
	to->dev = from->dev;
	to->mark = from->mark;

	skb_copy_hash(to, from);

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
	skb_ext_copy(to, from);
#if IS_ENABLED(CONFIG_IP_VS)
	to->ipvs_property = from->ipvs_property;
#endif
	skb_copy_secmark(to, from);
}

static int ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
		       unsigned int mtu,
		       int (*output)(struct net *, struct sock *, struct sk_buff *))
{
	struct iphdr *iph = ip_hdr(skb);

	if ((iph->frag_off & htons(IP_DF)) == 0)
		return ip_do_fragment(net, sk, skb, output);

	if (unlikely(!skb->ignore_df ||
		     (IPCB(skb)->frag_max_size &&
		      IPCB(skb)->frag_max_size > mtu))) {
		IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
			  htonl(mtu));
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	return ip_do_fragment(net, sk, skb, output);
}
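
/* When the datagram carries DF and may not be fragmented locally (ignore_df
 * is clear, or a netfilter-recorded frag_max_size exceeds the route MTU),
 * ip_fragment() above answers with an ICMP "fragmentation needed" error
 * carrying the MTU, which is what drives RFC 1191 path MTU discovery on
 * the sender.
 */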

void ip_fraglist_init(struct sk_buff *skb, struct iphdr *iph,
		      unsigned int hlen, struct ip_fraglist_iter *iter)
{
	unsigned int first_len = skb_pagelen(skb);

	iter->frag = skb_shinfo(skb)->frag_list;
	skb_frag_list_init(skb);

	iter->offset = 0;
	iter->iph = iph;
	iter->hlen = hlen;

	skb->data_len = first_len - skb_headlen(skb);
	skb->len = first_len;
	iph->tot_len = htons(first_len);
	iph->frag_off = htons(IP_MF);
	ip_send_check(iph);
}
EXPORT_SYMBOL(ip_fraglist_init);

void ip_fraglist_prepare(struct sk_buff *skb, struct ip_fraglist_iter *iter)
{
	unsigned int hlen = iter->hlen;
	struct iphdr *iph = iter->iph;
	struct sk_buff *frag;

	frag = iter->frag;
	frag->ip_summed = CHECKSUM_NONE;
	skb_reset_transport_header(frag);
	__skb_push(frag, hlen);
	skb_reset_network_header(frag);
	memcpy(skb_network_header(frag), iph, hlen);
	iter->iph = ip_hdr(frag);
	iph = iter->iph;
	iph->tot_len = htons(frag->len);
	ip_copy_metadata(frag, skb);
	iter->offset += skb->len - hlen;
	iph->frag_off = htons(iter->offset >> 3);
	if (frag->next)
		iph->frag_off |= htons(IP_MF);
	/* Ready, complete checksum */
	ip_send_check(iph);
}
EXPORT_SYMBOL(ip_fraglist_prepare);
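
/* ip_fraglist_init()/ip_fraglist_prepare()/ip_fraglist_next() implement the
 * fast fragmentation path: when an skb already carries a well-formed
 * frag_list (every entry fragment-sized and 8-byte aligned), the list
 * members become the fragments directly - each just gets an IP header
 * pushed and fixed up, and no payload is copied.
 */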

void ip_frag_init(struct sk_buff *skb, unsigned int hlen,
		  unsigned int ll_rs, unsigned int mtu, bool DF,
		  struct ip_frag_state *state)
{
	struct iphdr *iph = ip_hdr(skb);

	state->DF = DF;
	state->hlen = hlen;
	state->ll_rs = ll_rs;
	state->mtu = mtu;

	state->left = skb->len - hlen;	/* Space per frame */
	state->ptr = hlen;		/* Where to start from */

	state->offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
	state->not_last_frag = iph->frag_off & htons(IP_MF);
}
EXPORT_SYMBOL(ip_frag_init);

static void ip_frag_ipcb(struct sk_buff *from, struct sk_buff *to,
			 bool first_frag)
{
	/* Copy the flags to each fragment. */
	IPCB(to)->flags = IPCB(from)->flags;

	/* IP options may be carried in full only by the first fragment;
	 * ip_options_fragment() rewrites the non-copied options in the
	 * original header with NOPs so that subsequent fragments built
	 * from it comply with RFC 791.
	 */
	if (first_frag)
		ip_options_fragment(from);
}

struct sk_buff *ip_frag_next(struct sk_buff *skb, struct ip_frag_state *state)
{
	unsigned int len = state->left;
	struct sk_buff *skb2;
	struct iphdr *iph;

	/* IF: it doesn't fit, use 'mtu' - the data space left */
	if (len > state->mtu)
		len = state->mtu;
	/* IF: we are not sending up to and including the packet end
	   then align the next start on an eight byte boundary */
	if (len < state->left) {
		len &= ~7;
	}

	/* Allocate buffer */
	skb2 = alloc_skb(len + state->hlen + state->ll_rs, GFP_ATOMIC);
	if (!skb2)
		return ERR_PTR(-ENOMEM);

	/*
	 *	Set up data on packet
	 */

	ip_copy_metadata(skb2, skb);
	skb_reserve(skb2, state->ll_rs);
	skb_put(skb2, len + state->hlen);
	skb_reset_network_header(skb2);
	skb2->transport_header = skb2->network_header + state->hlen;

	/*
	 *	Charge the memory for the fragment to any owner
	 *	it might possess
	 */

	if (skb->sk)
		skb_set_owner_w(skb2, skb->sk);

	/*
	 *	Copy the packet header into the new buffer.
	 */

	skb_copy_from_linear_data(skb, skb_network_header(skb2), state->hlen);

	/*
	 *	Copy a block of the IP datagram.
	 */
	if (skb_copy_bits(skb, state->ptr, skb_transport_header(skb2), len))
		BUG();
	state->left -= len;

	/*
	 *	Fill in the new header fields.
	 */
	iph = ip_hdr(skb2);
	iph->frag_off = htons((state->offset >> 3));
	if (state->DF)
		iph->frag_off |= htons(IP_DF);

	/*
	 *	If we are fragmenting a fragment that's not the
	 *	last fragment then keep MF on each fragment
	 */
	if (state->left > 0 || state->not_last_frag)
		iph->frag_off |= htons(IP_MF);
	state->ptr += len;
	state->offset += len;

	iph->tot_len = htons(len + state->hlen);

	ip_send_check(iph);

	return skb2;
}
EXPORT_SYMBOL(ip_frag_next);
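
/* A sketch of the intended driver loop for the slow path (this mirrors
 * ip_do_fragment() below; the names are only illustrative):
 *
 *	struct ip_frag_state state;
 *
 *	ip_frag_init(skb, hlen, ll_rs, mtu, df, &state);
 *	while (state.left > 0) {
 *		struct sk_buff *frag = ip_frag_next(skb, &state);
 *
 *		if (IS_ERR(frag))
 *			return PTR_ERR(frag);
 *		err = output(net, sk, frag);	// consumes frag
 *		if (err)
 *			return err;
 *	}
 *	consume_skb(skb);	// all data has been copied out
 */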

/*
 *	This IP datagram is too large to be sent in one piece.  Break it up into
 *	smaller pieces (each of size equal to IP header plus
 *	a block of the data of the original IP data part) that will yet fit in a
 *	single device frame, and queue such a frame for sending.
 */
int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
		   int (*output)(struct net *, struct sock *, struct sk_buff *))
{
	struct iphdr *iph;
	struct sk_buff *skb2;
	bool mono_delivery_time = skb->mono_delivery_time;
	struct rtable *rt = skb_rtable(skb);
	unsigned int mtu, hlen, ll_rs;
	struct ip_fraglist_iter iter;
	ktime_t tstamp = skb->tstamp;
	struct ip_frag_state state;
	int err = 0;

	/* for offloaded checksums cleanup checksum before fragmentation */
	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    (err = skb_checksum_help(skb)))
		goto fail;

	/*
	 *	Point into the IP datagram header.
	 */

	iph = ip_hdr(skb);

	mtu = ip_skb_dst_mtu(sk, skb);
	if (IPCB(skb)->frag_max_size && IPCB(skb)->frag_max_size < mtu)
		mtu = IPCB(skb)->frag_max_size;

	/*
	 *	Setup starting values.
	 */

	hlen = iph->ihl * 4;
	mtu = mtu - hlen;	/* Size of data space */
	IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
	ll_rs = LL_RESERVED_SPACE(rt->dst.dev);

	/* When frag_list is given, use it. First, check its validity:
	 * some transformers could create wrong frag_list or break existing
	 * one, it is not prohibited. In this case fall back to copying.
	 *
	 * LATER: this step can be merged to real generation of fragments,
	 * we can switch to copy when see the first bad fragment.
	 */
	if (skb_has_frag_list(skb)) {
		struct sk_buff *frag, *frag2;
		unsigned int first_len = skb_pagelen(skb);

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    ip_is_fragment(iph) ||
		    skb_cloned(skb) ||
		    skb_headroom(skb) < ll_rs)
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen + ll_rs)
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		/* Everything is OK. Generate! */
		ip_fraglist_init(skb, iph, hlen, &iter);

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (iter.frag) {
				bool first_frag = (iter.offset == 0);

				IPCB(iter.frag)->flags = IPCB(skb)->flags;
				ip_fraglist_prepare(skb, &iter);
				if (first_frag && IPCB(skb)->opt.optlen) {
					/* ipcb->opt is not populated for frags
					 * coming from __ip_make_skb(),
					 * ip_options_fragment() needs optlen
					 */
					IPCB(iter.frag)->opt.optlen =
						IPCB(skb)->opt.optlen;
					ip_options_fragment(iter.frag);
					ip_send_check(iter.iph);
				}
			}

			skb_set_delivery_time(skb, tstamp, mono_delivery_time);
			err = output(net, sk, skb);

			if (!err)
				IP_INC_STATS(net, IPSTATS_MIB_FRAGCREATES);
			if (err || !iter.frag)
				break;

			skb = ip_fraglist_next(&iter);
		}

		if (err == 0) {
			IP_INC_STATS(net, IPSTATS_MIB_FRAGOKS);
			return 0;
		}

		kfree_skb_list(iter.frag);

		IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	/*
	 *	Fragment the datagram.
	 */

	ip_frag_init(skb, hlen, ll_rs, mtu, IPCB(skb)->flags & IPSKB_FRAG_PMTU,
		     &state);

	/*
	 *	Keep copying data until we run out.
	 */

	while (state.left > 0) {
		bool first_frag = (state.offset == 0);

		skb2 = ip_frag_next(skb, &state);
		if (IS_ERR(skb2)) {
			err = PTR_ERR(skb2);
			goto fail;
		}
		ip_frag_ipcb(skb, skb2, first_frag);

		/*
		 *	Put this fragment into the sending queue.
		 */
		skb_set_delivery_time(skb2, tstamp, mono_delivery_time);
		err = output(net, sk, skb2);
		if (err)
			goto fail;

		IP_INC_STATS(net, IPSTATS_MIB_FRAGCREATES);
	}
	consume_skb(skb);
	IP_INC_STATS(net, IPSTATS_MIB_FRAGOKS);
	return err;

fail:
	kfree_skb(skb);
	IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
	return err;
}
EXPORT_SYMBOL(ip_do_fragment);

int
ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
{
	struct msghdr *msg = from;

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		if (!copy_from_iter_full(to, len, &msg->msg_iter))
			return -EFAULT;
	} else {
		__wsum csum = 0;
		if (!csum_and_copy_from_iter_full(to, len, &csum, &msg->msg_iter))
			return -EFAULT;
		skb->csum = csum_block_add(skb->csum, csum, odd);
	}
	return 0;
}
EXPORT_SYMBOL(ip_generic_getfrag);
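
/* The "odd" argument above is the byte offset of this chunk within the
 * checksummed area of the packet.  csum_block_add() needs it because the
 * Internet checksum is computed over 16-bit words: a block that starts at
 * an odd offset must have its partial sum byte-swapped before it is folded
 * into skb->csum.
 */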

static inline __wsum
csum_page(struct page *page, int offset, int copy)
{
	char *kaddr;
	__wsum csum;
	kaddr = kmap(page);
	csum = csum_partial(kaddr + offset, copy, 0);
	kunmap(page);
	return csum;
}

static int __ip_append_data(struct sock *sk,
			    struct flowi4 *fl4,
			    struct sk_buff_head *queue,
			    struct inet_cork *cork,
			    struct page_frag *pfrag,
			    int getfrag(void *from, char *to, int offset,
					int len, int odd, struct sk_buff *skb),
			    void *from, int length, int transhdrlen,
			    unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ubuf_info *uarg = NULL;
	struct sk_buff *skb;
	struct ip_options *opt = cork->opt;
	int hh_len;
	int exthdrlen;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	bool zc = false;
	unsigned int maxfraglen, fragheaderlen, maxnonfragsize;
	int csummode = CHECKSUM_NONE;
	struct rtable *rt = (struct rtable *)cork->dst;
	unsigned int wmem_alloc_delta = 0;
	bool paged, extra_uref = false;
	u32 tskey = 0;

	skb = skb_peek_tail(queue);

	exthdrlen = !skb ? rt->dst.header_len : 0;
	mtu = cork->gso_size ? IP_MAX_MTU : cork->fragsize;
	paged = !!cork->gso_size;

	if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
	    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
		tskey = atomic_inc_return(&sk->sk_tskey) - 1;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
	maxnonfragsize = ip_sk_ignore_df(sk) ? IP_MAX_MTU : mtu;

	if (cork->length + length > maxnonfragsize - fragheaderlen) {
		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
			       mtu - (opt ? opt->optlen : 0));
		return -EMSGSIZE;
	}

	/*
	 * transhdrlen > 0 means that this is the first fragment and we wish
	 * it won't be fragmented in the future.
	 */
	if (transhdrlen &&
	    length + fragheaderlen <= mtu &&
	    rt->dst.dev->features & (NETIF_F_HW_CSUM | NETIF_F_IP_CSUM) &&
	    (!(flags & MSG_MORE) || cork->gso_size) &&
	    (!exthdrlen || (rt->dst.dev->features & NETIF_F_HW_ESP_TX_CSUM)))
		csummode = CHECKSUM_PARTIAL;

	if ((flags & MSG_ZEROCOPY) && length) {
		struct msghdr *msg = from;

		if (getfrag == ip_generic_getfrag && msg->msg_ubuf) {
			if (skb_zcopy(skb) && msg->msg_ubuf != skb_zcopy(skb))
				return -EINVAL;

			/* Leave uarg NULL if can't zerocopy, callers should
			 * be able to handle it.
			 */
			if ((rt->dst.dev->features & NETIF_F_SG) &&
			    csummode == CHECKSUM_PARTIAL) {
				paged = true;
				zc = true;
				uarg = msg->msg_ubuf;
			}
		} else if (sock_flag(sk, SOCK_ZEROCOPY)) {
			uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
			if (!uarg)
				return -ENOBUFS;
			extra_uref = !skb_zcopy(skb);	/* only ref on new uarg */
			if (rt->dst.dev->features & NETIF_F_SG &&
			    csummode == CHECKSUM_PARTIAL) {
				paged = true;
				zc = true;
			} else {
				uarg->zerocopy = 0;
				skb_zcopy_set(skb, uarg, &extra_uref);
			}
		}
	}

	cork->length += length;

	/* So, what's going on in the loop below?
	 *
	 * We use calculated fragment length to generate chained skb,
	 * each of segments is IP fragment ready for sending to network after
	 * adding appropriate IP header.
	 */

	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = mtu - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;
		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen, alloc_extra;
			unsigned int pagedlen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > mtu - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;
			fraglen = datalen + fragheaderlen;
			pagedlen = 0;

			alloc_extra = hh_len + 15;
			alloc_extra += exthdrlen;

			/* The last fragment gets additional space at tail.
			 * Note, with MSG_MORE we overallocate on fragments,
			 * because we have no idea what fragment will be
			 * the last.
			 */
			if (datalen == length + fraggap)
				alloc_extra += rt->dst.trailer_len;

			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else if (!paged &&
				 (fraglen + alloc_extra < SKB_MAX_ALLOC ||
				  !(rt->dst.dev->features & NETIF_F_SG)))
				alloclen = fraglen;
			else if (!zc) {
				alloclen = min_t(int, fraglen, MAX_HEADER);
				pagedlen = fraglen - alloclen;
			} else {
				alloclen = fragheaderlen + transhdrlen;
				pagedlen = datalen - transhdrlen;
			}

			alloclen += alloc_extra;

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk, alloclen,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
				    2 * sk->sk_sndbuf)
					skb = alloc_skb(alloclen,
							sk->sk_allocation);
				if (unlikely(!skb))
					err = -ENOBUFS;
			}
			if (!skb)
				goto error;

			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			skb_reserve(skb, hh_len);

			/*
			 *	Find where to start putting bytes.
			 */
			data = skb_put(skb, fraglen + exthdrlen - pagedlen);
			skb_set_network_header(skb, exthdrlen);
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			data += fragheaderlen + exthdrlen;

			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}

			copy = datalen - transhdrlen - fraggap - pagedlen;
			if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= copy + transhdrlen;
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/* only the initial fragment is time stamped */
			skb_shinfo(skb)->tx_flags = cork->tx_flags;
			cork->tx_flags = 0;
			skb_shinfo(skb)->tskey = tskey;
			tskey = 0;
			skb_zcopy_set(skb, uarg, &extra_uref);

			if ((flags & MSG_CONFIRM) && !skb_prev)
				skb_set_dst_pending_confirm(skb, 1);

			/*
			 * Put the packet on the pending queue.
			 */
			if (!skb->destructor) {
				skb->destructor = sock_wfree;
				skb->sk = sk;
				wmem_alloc_delta += skb->truesize;
			}
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG) &&
		    skb_tailroom(skb) >= copy) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
					offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else if (!zc) {
			int i = skb_shinfo(skb)->nr_frags;

			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;

			skb_zcopy_downgrade_managed(skb);
			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (getfrag(from,
				    page_address(pfrag->page) + pfrag->offset,
				    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb_len_add(skb, copy);
			wmem_alloc_delta += copy;
		} else {
			err = skb_zerocopy_iter_dgram(skb, from, copy);
			if (err < 0)
				goto error;
		}
		offset += copy;
		length -= copy;
	}

	if (wmem_alloc_delta)
		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return 0;

error_efault:
	err = -EFAULT;
error:
	net_zcopy_put_abort(uarg, extra_uref);
	cork->length -= length;
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return err;
}

static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
			 struct ipcm_cookie *ipc, struct rtable **rtp)
{
	struct ip_options_rcu *opt;
	struct rtable *rt;

	rt = *rtp;
	if (unlikely(!rt))
		return -EFAULT;

	/*
	 * setup for corking.
	 */
	opt = ipc->opt;
	if (opt) {
		if (!cork->opt) {
			cork->opt = kmalloc(sizeof(struct ip_options) + 40,
					    sk->sk_allocation);
			if (unlikely(!cork->opt))
				return -ENOBUFS;
		}
		memcpy(cork->opt, &opt->opt, sizeof(struct ip_options) + opt->opt.optlen);
		cork->flags |= IPCORK_OPT;
		cork->addr = ipc->addr;
	}

	cork->fragsize = ip_sk_use_pmtu(sk) ?
			 dst_mtu(&rt->dst) : READ_ONCE(rt->dst.dev->mtu);

	if (!inetdev_valid_mtu(cork->fragsize))
		return -ENETUNREACH;

	cork->gso_size = ipc->gso_size;

	cork->dst = &rt->dst;
	/* We stole this route, caller should not release it. */
	*rtp = NULL;

	cork->length = 0;
	cork->ttl = ipc->ttl;
	cork->tos = ipc->tos;
	cork->mark = ipc->sockc.mark;
	cork->priority = ipc->priority;
	cork->transmit_time = ipc->sockc.transmit_time;
	cork->tx_flags = 0;
	sock_tx_timestamp(sk, ipc->sockc.tsflags, &cork->tx_flags);

	return 0;
}
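
/* ip_setup_cork() snapshots everything a (possibly multi-call) transmission
 * needs - route, options, fragment size, gso size, TTL/TOS/mark - into the
 * inet_cork, so later ip_append_data() calls and the final __ip_make_skb()
 * see consistent values even if socket options change mid-message.  The
 * extra 40 bytes in the kmalloc() above is the maximum length of IPv4
 * options (60-byte header minus the fixed 20 bytes).
 */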

/*
 *	ip_append_data() and ip_append_page() can make one large IP datagram
 *	from many pieces of data. Each piece will be held on the socket
 *	until ip_push_pending_frames() is called. Each piece can be a page
 *	or non-page data.
 *
 *	Not only UDP, other transport protocols - e.g. raw sockets - can use
 *	this interface potentially.
 *
 *	LATER: length must be adjusted by pad at tail, when it is required.
 */
int ip_append_data(struct sock *sk, struct flowi4 *fl4,
		   int getfrag(void *from, char *to, int offset, int len,
			       int odd, struct sk_buff *skb),
		   void *from, int length, int transhdrlen,
		   struct ipcm_cookie *ipc, struct rtable **rtp,
		   unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	int err;

	if (flags&MSG_PROBE)
		return 0;

	if (skb_queue_empty(&sk->sk_write_queue)) {
		err = ip_setup_cork(sk, &inet->cork.base, ipc, rtp);
		if (err)
			return err;
	} else {
		transhdrlen = 0;
	}

	return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base,
				sk_page_frag(sk), getfrag,
				from, length, transhdrlen, flags);
}
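
/* A condensed sketch of the classic corked-UDP use of this API, with error
 * handling trimmed (udp_sendmsg() is the canonical caller):
 *
 *	lock_sock(sk);
 *	err = ip_append_data(sk, &fl4, getfrag, msg, len,
 *			     sizeof(struct udphdr), &ipc, &rt,
 *			     msg->msg_flags | MSG_MORE);
 *	...					// possibly more appends
 *	if (!corkreq)
 *		err = udp_push_pending_frames(sk);	// build and send
 *	release_sock(sk);
 *
 * Each append grows the per-socket write queue; nothing reaches the wire
 * until ip_push_pending_frames() (or a protocol wrapper around it) runs.
 */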

ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
		       int offset, size_t size, int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;
	struct rtable *rt;
	struct ip_options *opt = NULL;
	struct inet_cork *cork;
	int hh_len;
	int mtu;
	int len;
	int err;
	unsigned int maxfraglen, fragheaderlen, fraggap, maxnonfragsize;

	if (inet->hdrincl)
		return -EPERM;

	if (flags&MSG_PROBE)
		return 0;

	if (skb_queue_empty(&sk->sk_write_queue))
		return -EINVAL;

	cork = &inet->cork.base;
	rt = (struct rtable *)cork->dst;
	if (cork->flags & IPCORK_OPT)
		opt = cork->opt;

	if (!(rt->dst.dev->features & NETIF_F_SG))
		return -EOPNOTSUPP;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
	mtu = cork->gso_size ? IP_MAX_MTU : cork->fragsize;

	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
	maxnonfragsize = ip_sk_ignore_df(sk) ? 0xFFFF : mtu;

	if (cork->length + size > maxnonfragsize - fragheaderlen) {
		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
			       mtu - (opt ? opt->optlen : 0));
		return -EMSGSIZE;
	}

	skb = skb_peek_tail(&sk->sk_write_queue);
	if (!skb)
		return -EINVAL;

	cork->length += size;

	while (size > 0) {
		/* Check if the remaining data fits into current packet. */
		len = mtu - skb->len;
		if (len < size)
			len = maxfraglen - skb->len;

		if (len <= 0) {
			struct sk_buff *skb_prev;
			int alloclen;

			skb_prev = skb;
			fraggap = skb_prev->len - maxfraglen;

			alloclen = fragheaderlen + hh_len + fraggap + 15;
			skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
			if (unlikely(!skb)) {
				err = -ENOBUFS;
				goto error;
			}

			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = CHECKSUM_NONE;
			skb->csum = 0;
			skb_reserve(skb, hh_len);

			/*
			 *	Find where to start putting bytes.
			 */
			skb_put(skb, fragheaderlen + fraggap);
			skb_reset_network_header(skb);
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(skb_prev,
								   maxfraglen,
						    skb_transport_header(skb),
								   fraggap);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				pskb_trim_unique(skb_prev, maxfraglen);
			}

			/*
			 * Put the packet on the pending queue.
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (len > size)
			len = size;

		if (skb_append_pagefrags(skb, page, offset, len)) {
			err = -EMSGSIZE;
			goto error;
		}

		if (skb->ip_summed == CHECKSUM_NONE) {
			__wsum csum;
			csum = csum_page(page, offset, len);
			skb->csum = csum_block_add(skb->csum, csum, skb->len);
		}

		skb_len_add(skb, len);
		refcount_add(len, &sk->sk_wmem_alloc);
		offset += len;
		size -= len;
	}
	return 0;

error:
	cork->length -= size;
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
	return err;
}

static void ip_cork_release(struct inet_cork *cork)
{
	cork->flags &= ~IPCORK_OPT;
	kfree(cork->opt);
	cork->opt = NULL;
	dst_release(cork->dst);
	cork->dst = NULL;
}

/*
 *	Combined all pending IP fragments on the socket as one IP datagram
 *	and push them out.
 */
struct sk_buff *__ip_make_skb(struct sock *sk,
			      struct flowi4 *fl4,
			      struct sk_buff_head *queue,
			      struct inet_cork *cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct inet_sock *inet = inet_sk(sk);
	struct net *net = sock_net(sk);
	struct ip_options *opt = NULL;
	struct rtable *rt = (struct rtable *)cork->dst;
	struct iphdr *iph;
	__be16 df = 0;
	__u8 ttl;

	skb = __skb_dequeue(queue);
	if (!skb)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow
	 * to fragment the frame generated here. No matter, what transforms
	 * how transforms change size of the packet, it will come out.
	 */
	skb->ignore_df = ip_sk_ignore_df(sk);

	/* DF bit is set when we want to see DF on outgoing frames.
	 * If ignore_df is set too, we still allow to fragment this frame
	 * locally. */
	if (inet->pmtudisc == IP_PMTUDISC_DO ||
	    inet->pmtudisc == IP_PMTUDISC_PROBE ||
	    (skb->len <= dst_mtu(&rt->dst) &&
	     ip_dont_fragment(sk, &rt->dst)))
		df = htons(IP_DF);

	if (cork->flags & IPCORK_OPT)
		opt = cork->opt;

	if (cork->ttl != 0)
		ttl = cork->ttl;
	else if (rt->rt_type == RTN_MULTICAST)
		ttl = inet->mc_ttl;
	else
		ttl = ip_select_ttl(inet, &rt->dst);

	iph = ip_hdr(skb);
	iph->version = 4;
	iph->ihl = 5;
	iph->tos = (cork->tos != -1) ? cork->tos : inet->tos;
	iph->frag_off = df;
	iph->ttl = ttl;
	iph->protocol = sk->sk_protocol;
	ip_copy_addrs(iph, fl4);
	ip_select_ident(net, skb, sk);

	if (opt) {
		iph->ihl += opt->optlen >> 2;
		ip_options_build(skb, opt, cork->addr, rt);
	}

	skb->priority = (cork->tos != -1) ? cork->priority : sk->sk_priority;
	skb->mark = cork->mark;
	skb->tstamp = cork->transmit_time;
	/*
	 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
	 * on dst refcount
	 */
	cork->dst = NULL;
	skb_dst_set(skb, &rt->dst);

	if (iph->protocol == IPPROTO_ICMP)
		icmp_out_count(net, ((struct icmphdr *)
			skb_transport_header(skb))->type);

	ip_cork_release(cork);
out:
	return skb;
}

int ip_send_skb(struct net *net, struct sk_buff *skb)
{
	int err;

	err = ip_local_out(net, skb->sk, skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
	}

	return err;
}

int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4)
{
	struct sk_buff *skb;

	skb = ip_finish_skb(sk, fl4);
	if (!skb)
		return 0;

	/* Netfilter gets whole the not fragmented skb. */
	return ip_send_skb(sock_net(sk), skb);
}

/*
 *	Throw away all pending data on the socket.
 */
static void __ip_flush_pending_frames(struct sock *sk,
				      struct sk_buff_head *queue,
				      struct inet_cork *cork)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(queue)) != NULL)
		kfree_skb(skb);

	ip_cork_release(cork);
}

void ip_flush_pending_frames(struct sock *sk)
{
	__ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork.base);
}

struct sk_buff *ip_make_skb(struct sock *sk,
			    struct flowi4 *fl4,
			    int getfrag(void *from, char *to, int offset,
					int len, int odd, struct sk_buff *skb),
			    void *from, int length, int transhdrlen,
			    struct ipcm_cookie *ipc, struct rtable **rtp,
			    struct inet_cork *cork, unsigned int flags)
{
	struct sk_buff_head queue;
	int err;

	if (flags & MSG_PROBE)
		return NULL;

	__skb_queue_head_init(&queue);

	cork->flags = 0;
	cork->addr = 0;
	cork->opt = NULL;
	err = ip_setup_cork(sk, cork, ipc, rtp);
	if (err)
		return ERR_PTR(err);

	err = __ip_append_data(sk, fl4, &queue, cork,
			       &current->task_frag, getfrag,
			       from, length, transhdrlen, flags);
	if (err) {
		__ip_flush_pending_frames(sk, &queue, cork);
		return ERR_PTR(err);
	}

	return __ip_make_skb(sk, fl4, &queue, cork);
}
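
/* ip_make_skb() is the uncorked counterpart of ip_append_data() plus
 * __ip_make_skb(): it assembles the whole datagram on a private queue with
 * a caller-provided cork (typically on the stack), so a single-shot send -
 * e.g. UDP without MSG_MORE - never touches sk->sk_write_queue and needs
 * no socket-level cork state shared with other senders.
 */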

/*
 *	Fetch data from kernel space and fill in checksum if needed.
 */
static int ip_reply_glue_bits(void *dptr, char *to, int offset,
			      int len, int odd, struct sk_buff *skb)
{
	__wsum csum;

	csum = csum_partial_copy_nocheck(dptr+offset, to, len);
	skb->csum = csum_block_add(skb->csum, csum, odd);
	return 0;
}

/*
 *	Generic function to send a packet as reply to another packet.
 *	Used to send some TCP resets/acks so far.
 */
void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
			   const struct ip_options *sopt,
			   __be32 daddr, __be32 saddr,
			   const struct ip_reply_arg *arg,
			   unsigned int len, u64 transmit_time)
{
	struct ip_options_data replyopts;
	struct ipcm_cookie ipc;
	struct flowi4 fl4;
	struct rtable *rt = skb_rtable(skb);
	struct net *net = sock_net(sk);
	struct sk_buff *nskb;
	int err;
	int oif;

	if (__ip_options_echo(net, &replyopts.opt.opt, skb, sopt))
		return;

	ipcm_init(&ipc);
	ipc.addr = daddr;
	ipc.sockc.transmit_time = transmit_time;

	if (replyopts.opt.opt.optlen) {
		ipc.opt = &replyopts.opt;

		if (replyopts.opt.opt.srr)
			daddr = replyopts.opt.opt.faddr;
	}

	oif = arg->bound_dev_if;
	if (!oif && netif_index_is_l3_master(net, skb->skb_iif))
		oif = skb->skb_iif;

	flowi4_init_output(&fl4, oif,
			   IP4_REPLY_MARK(net, skb->mark) ?: sk->sk_mark,
			   RT_TOS(arg->tos),
			   RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol,
			   ip_reply_arg_flowi_flags(arg),
			   daddr, saddr,
			   tcp_hdr(skb)->source, tcp_hdr(skb)->dest,
			   arg->uid);
	security_skb_classify_flow(skb, flowi4_to_flowi_common(&fl4));
	rt = ip_route_output_flow(net, &fl4, sk);
	if (IS_ERR(rt))
		return;

	inet_sk(sk)->tos = arg->tos & ~INET_ECN_MASK;

	sk->sk_protocol = ip_hdr(skb)->protocol;
	sk->sk_bound_dev_if = arg->bound_dev_if;
	sk->sk_sndbuf = READ_ONCE(sysctl_wmem_default);
	ipc.sockc.mark = fl4.flowi4_mark;
	err = ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base,
			     len, 0, &ipc, &rt, MSG_DONTWAIT);
	if (unlikely(err)) {
		ip_flush_pending_frames(sk);
		goto out;
	}

	nskb = skb_peek(&sk->sk_write_queue);
	if (nskb) {
		if (arg->csumoffset >= 0)
			*((__sum16 *)skb_transport_header(nskb) +
			  arg->csumoffset) = csum_fold(csum_add(nskb->csum,
								arg->csum));
		nskb->ip_summed = CHECKSUM_NONE;
		nskb->mono_delivery_time = !!transmit_time;
		ip_push_pending_frames(sk, &fl4);
	}
out:
	ip_rt_put(rt);
}

void __init ip_init(void)
{
	ip_rt_init();
	inet_initpeers();

#if defined(CONFIG_IP_MULTICAST)
	igmp_mc_init();
#endif
}