0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018
0019
0020
0021
0022
0023
/* Format wrapper for this file: puts the literal "IPv4: " in front of @fmt. */
0024 #define pr_fmt(fmt) "IPv4: " fmt
0025
0026 #include <linux/compiler.h>
0027 #include <linux/module.h>
0028 #include <linux/types.h>
0029 #include <linux/mm.h>
0030 #include <linux/jiffies.h>
0031 #include <linux/skbuff.h>
0032 #include <linux/list.h>
0033 #include <linux/ip.h>
0034 #include <linux/icmp.h>
0035 #include <linux/netdevice.h>
0036 #include <linux/jhash.h>
0037 #include <linux/random.h>
0038 #include <linux/slab.h>
0039 #include <net/route.h>
0040 #include <net/dst.h>
0041 #include <net/sock.h>
0042 #include <net/ip.h>
0043 #include <net/icmp.h>
0044 #include <net/checksum.h>
0045 #include <net/inetpeer.h>
0046 #include <net/inet_frag.h>
0047 #include <linux/tcp.h>
0048 #include <linux/udp.h>
0049 #include <linux/inet.h>
0050 #include <linux/netfilter_ipv4.h>
0051 #include <net/inet_ecn.h>
0052 #include <net/l3mdev.h>
0053
0054
0055
0056
0057
/* Name stored in ip4_frags.frags_cache_name by ipfrag_init(). */
0058 static const char ip_frag_cache_name[] = "ip4-frags";
0059
0060
/* IPv4 reassembly queue: the generic inet_frag_queue plus IPv4-only state. */
0061 struct ipq {
/* Generic queue; the code uses container_of() on it to get back to the ipq. */
0062 struct inet_frag_queue q;
0063
/* OR of ip4_frag_ecn() over every fragment accepted by ip_frag_queue(). */
0064 u8 ecn;
/* Largest (payload + IP header) size among fragments that had IP_DF set. */
0065 u16 max_df_size;
/* ifindex of the device of the last accepted fragment that had skb->dev set. */
0066 int iif;
/* Value of peer->rid sampled the last time ip_frag_too_far() ran. */
0067 unsigned int rid;
/* Peer entry for the source address; NULL when fqdir->max_dist was 0 at init. */
0068 struct inet_peer *peer;
0069 };
0070
0071 static u8 ip4_frag_ecn(u8 tos)
0072 {
0073 return 1 << (tos & INET_ECN_MASK);
0074 }
0075
/* IPv4 fragment descriptor; its fields are filled in by ipfrag_init(). */
0076 static struct inet_frags ip4_frags;
0077
/* Forward declaration: ip_frag_queue() calls this before its definition. */
0078 static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
0079 struct sk_buff *prev_tail, struct net_device *dev);
0080
0081
0082 static void ip4_frag_init(struct inet_frag_queue *q, const void *a)
0083 {
0084 struct ipq *qp = container_of(q, struct ipq, q);
0085 struct net *net = q->fqdir->net;
0086
0087 const struct frag_v4_compare_key *key = a;
0088
0089 q->key.v4 = *key;
0090 qp->ecn = 0;
0091 qp->peer = q->fqdir->max_dist ?
0092 inet_getpeer_v4(net->ipv4.peers, key->saddr, key->vif, 1) :
0093 NULL;
0094 }
0095
0096 static void ip4_frag_free(struct inet_frag_queue *q)
0097 {
0098 struct ipq *qp;
0099
0100 qp = container_of(q, struct ipq, q);
0101 if (qp->peer)
0102 inet_putpeer(qp->peer);
0103 }
0104
0105
0106
0107
0108 static void ipq_put(struct ipq *ipq)
0109 {
0110 inet_frag_put(&ipq->q);
0111 }
0112
0113
0114
0115
0116 static void ipq_kill(struct ipq *ipq)
0117 {
0118 inet_frag_kill(&ipq->q);
0119 }
0120
0121 static bool frag_expire_skip_icmp(u32 user)
0122 {
0123 return user == IP_DEFRAG_AF_PACKET ||
0124 ip_defrag_user_in_between(user, IP_DEFRAG_CONNTRACK_IN,
0125 __IP_DEFRAG_CONNTRACK_IN_END) ||
0126 ip_defrag_user_in_between(user, IP_DEFRAG_CONNTRACK_BRIDGE_IN,
0127 __IP_DEFRAG_CONNTRACK_BRIDGE_IN);
0128 }
0129
0130
0131
0132
/*
 * ip_expire - timer callback installed as ip4_frags.frag_expire.
 * @t: the timer embedded in the inet_frag_queue that expired
 *
 * Kills the queue, bumps the REASMFAILS and REASMTIMEOUT counters and, when
 * the first fragment is present and a route for it can be found, sends an
 * ICMP_TIME_EXCEEDED / ICMP_EXC_FRAGTIME message built from that fragment.
 * Every path ends by freeing the pulled head skb (if any) and calling
 * ipq_put() on the queue.
 */
0133 static void ip_expire(struct timer_list *t)
0134 {
0135 struct inet_frag_queue *frag = from_timer(frag, t, timer);
0136 const struct iphdr *iph;
/* Stays NULL on the early exits, so the final kfree_skb() gets NULL there. */
0137 struct sk_buff *head = NULL;
0138 struct net *net;
0139 struct ipq *qp;
0140 int err;
0141
0142 qp = container_of(frag, struct ipq, q);
0143 net = qp->q.fqdir->net;
0144
0145 rcu_read_lock();
0146
0147
/* Do nothing but the final cleanup when the fqdir is already marked dead. */
0148 if (READ_ONCE(qp->q.fqdir->dead))
0149 goto out_rcu_unlock;
0150
0151 spin_lock(&qp->q.lock);
0152
/* The queue was already completed/killed elsewhere; nothing to expire. */
0153 if (qp->q.flags & INET_FRAG_COMPLETE)
0154 goto out;
0155
0156 ipq_kill(qp);
0157 __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
0158 __IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT);
0159
/* Without the offset-0 fragment there is no header to quote in an ICMP. */
0160 if (!(qp->q.flags & INET_FRAG_FIRST_IN))
0161 goto out;
0162
0163
0164
0165
0166
/*
 * Take the head skb out of the queue before writing head->dev.
 * NOTE(review): presumably required because skb->dev shares storage with
 * the rbtree node while the skb is queued -- confirm against struct sk_buff.
 */
0167 head = inet_frag_pull_head(&qp->q);
0168 if (!head)
0169 goto out;
/* Look the input device up again from the ifindex saved by ip_frag_queue(). */
0170 head->dev = dev_get_by_index_rcu(net, qp->iif);
0171 if (!head->dev)
0172 goto out;
0173
0174
0175
/* Route lookup for the head fragment; skb_rtable(head) is read below. */
0176 iph = ip_hdr(head);
0177 err = ip_route_input_noref(head, iph->daddr, iph->saddr,
0178 iph->tos, head->dev);
0179 if (err)
0180 goto out;
0181
0182
0183
0184
/*
 * For the users listed in frag_expire_skip_icmp(), send the ICMP only when
 * the route found above is of type RTN_LOCAL.
 */
0185 if (frag_expire_skip_icmp(qp->q.key.v4.user) &&
0186 (skb_rtable(head)->rt_type != RTN_LOCAL))
0187 goto out;
0188
/* The queue lock is dropped before icmp_send(); RCU is still held. */
0189 spin_unlock(&qp->q.lock);
0190 icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0);
0191 goto out_rcu_unlock;
0192
0193 out:
0194 spin_unlock(&qp->q.lock);
0195 out_rcu_unlock:
0196 rcu_read_unlock();
0197 kfree_skb(head);
0198 ipq_put(qp);
0199 }
0200
0201
0202
0203
0204 static struct ipq *ip_find(struct net *net, struct iphdr *iph,
0205 u32 user, int vif)
0206 {
0207 struct frag_v4_compare_key key = {
0208 .saddr = iph->saddr,
0209 .daddr = iph->daddr,
0210 .user = user,
0211 .vif = vif,
0212 .id = iph->id,
0213 .protocol = iph->protocol,
0214 };
0215 struct inet_frag_queue *q;
0216
0217 q = inet_frag_find(net->ipv4.fqdir, &key);
0218 if (!q)
0219 return NULL;
0220
0221 return container_of(q, struct ipq, q);
0222 }
0223
0224
0225 static int ip_frag_too_far(struct ipq *qp)
0226 {
0227 struct inet_peer *peer = qp->peer;
0228 unsigned int max = qp->q.fqdir->max_dist;
0229 unsigned int start, end;
0230
0231 int rc;
0232
0233 if (!peer || !max)
0234 return 0;
0235
0236 start = qp->rid;
0237 end = atomic_inc_return(&peer->rid);
0238 qp->rid = end;
0239
0240 rc = qp->q.fragments_tail && (end - start) > max;
0241
0242 if (rc)
0243 __IP_INC_STATS(qp->q.fqdir->net, IPSTATS_MIB_REASMFAILS);
0244
0245 return rc;
0246 }
0247
0248 static int ip_frag_reinit(struct ipq *qp)
0249 {
0250 unsigned int sum_truesize = 0;
0251
0252 if (!mod_timer(&qp->q.timer, jiffies + qp->q.fqdir->timeout)) {
0253 refcount_inc(&qp->q.refcnt);
0254 return -ETIMEDOUT;
0255 }
0256
0257 sum_truesize = inet_frag_rbtree_purge(&qp->q.rb_fragments);
0258 sub_frag_mem_limit(qp->q.fqdir, sum_truesize);
0259
0260 qp->q.flags = 0;
0261 qp->q.len = 0;
0262 qp->q.meat = 0;
0263 qp->q.rb_fragments = RB_ROOT;
0264 qp->q.fragments_tail = NULL;
0265 qp->q.last_run_head = NULL;
0266 qp->iif = 0;
0267 qp->ecn = 0;
0268
0269 return 0;
0270 }
0271
0272
/*
 * ip_frag_queue - add one fragment to the reassembly queue @qp.
 * @qp:  queue found by ip_find(); ip_defrag() calls this with qp->q.lock held
 * @skb: the fragment
 *
 * Returns -EINPROGRESS when the fragment was queued and more are needed, the
 * result of ip_frag_reasm() when this fragment completed the datagram, and a
 * negative errno on failure (-ENOENT for an already complete queue, the
 * ip_frag_reinit() error, -EINVAL, -ENOMEM or the pskb_trim_rcsum() error).
 * The err, discard_qp and insert_error paths free @skb here.
 */
0273 static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
0274 {
0275 struct net *net = qp->q.fqdir->net;
0276 int ihl, end, flags, offset;
0277 struct sk_buff *prev_tail;
0278 struct net_device *dev;
0279 unsigned int fragsize;
0280 int err = -ENOENT;
0281 u8 ecn;
0282
0283 if (qp->q.flags & INET_FRAG_COMPLETE)
0284 goto err;
0285
/*
 * Unless the skb is flagged IPSKB_FRAG_COMPLETE, run the peer-distance
 * check; if the queue is "too far" try to reinitialise it, and kill it if
 * that fails. Note the deliberate assignment to err inside the condition.
 */
0286 if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) &&
0287 unlikely(ip_frag_too_far(qp)) &&
0288 unlikely(err = ip_frag_reinit(qp))) {
0289 ipq_kill(qp);
0290 goto err;
0291 }
0292
/* Split frag_off into flag bits and the offset, scaled by 8 to bytes. */
0293 ecn = ip4_frag_ecn(ip_hdr(skb)->tos);
0294 offset = ntohs(ip_hdr(skb)->frag_off);
0295 flags = offset & ~IP_OFFSET;
0296 offset &= IP_OFFSET;
0297 offset <<= 3;
0298 ihl = ip_hdrlen(skb);
0299
0300
/* Byte position just past this fragment's payload in the full datagram. */
0301 end = offset + skb->len - skb_network_offset(skb) - ihl;
0302 err = -EINVAL;
0303
0304
/* IP_MF clear: this is the last fragment, so it fixes the total length. */
0305 if ((flags & IP_MF) == 0) {
0306
0307
0308
/* Reject if data beyond @end was already seen or a different end was set. */
0309 if (end < qp->q.len ||
0310 ((qp->q.flags & INET_FRAG_LAST_IN) && end != qp->q.len))
0311 goto discard_qp;
0312 qp->q.flags |= INET_FRAG_LAST_IN;
0313 qp->q.len = end;
0314 } else {
/* Non-final fragment: round end down to a multiple of 8; the trim below
 * then changes the data, so a non-UNNECESSARY checksum state is reset.
 */
0315 if (end&7) {
0316 end &= ~7;
0317 if (skb->ip_summed != CHECKSUM_UNNECESSARY)
0318 skb->ip_summed = CHECKSUM_NONE;
0319 }
0320 if (end > qp->q.len) {
0321
/* Data past the end already fixed by the last fragment: reject. */
0322 if (qp->q.flags & INET_FRAG_LAST_IN)
0323 goto discard_qp;
0324 qp->q.len = end;
0325 }
0326 }
/* A fragment that carries no payload is rejected. */
0327 if (end == offset)
0328 goto discard_qp;
0329
0330 err = -ENOMEM;
/* Strip everything up to and including the IP header ... */
0331 if (!pskb_pull(skb, skb_network_offset(skb) + ihl))
0332 goto discard_qp;
0333
/* ... and trim the payload to exactly end - offset bytes. */
0334 err = pskb_trim_rcsum(skb, end - offset);
0335 if (err)
0336 goto discard_qp;
0337
0338
/*
 * skb->dev is saved before the insert and the barrier() keeps the compiler
 * from moving the read. NOTE(review): presumably because the insert reuses
 * the storage of skb->dev for the rbtree node -- confirm in struct sk_buff.
 */
0339 dev = skb->dev;
0340
0341 barrier();
0342
/* Tail as it was before this insert; ip_frag_reasm() needs it. */
0343 prev_tail = qp->q.fragments_tail;
0344 err = inet_frag_queue_insert(&qp->q, skb, offset, end);
0345 if (err)
0346 goto insert_error;
0347
0348 if (dev)
0349 qp->iif = dev->ifindex;
0350
/* Per-queue bookkeeping for the fragment that was just accepted. */
0351 qp->q.stamp = skb->tstamp;
0352 qp->q.mono_delivery_time = skb->mono_delivery_time;
0353 qp->q.meat += skb->len;
0354 qp->ecn |= ecn;
0355 add_frag_mem_limit(qp->q.fqdir, skb->truesize);
0356 if (offset == 0)
0357 qp->q.flags |= INET_FRAG_FIRST_IN;
0358
/* Size of this fragment including its IP header (payload was pulled). */
0359 fragsize = skb->len + ihl;
0360
0361 if (fragsize > qp->q.max_size)
0362 qp->q.max_size = fragsize;
0363
/* Track separately the largest fragment that carried IP_DF. */
0364 if (ip_hdr(skb)->frag_off & htons(IP_DF) &&
0365 fragsize > qp->max_df_size)
0366 qp->max_df_size = fragsize;
0367
/* First and last fragment seen and no byte missing: reassemble now. */
0368 if (qp->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
0369 qp->q.meat == qp->q.len) {
/* The dst reference is hidden from ip_frag_reasm() and restored afterwards. */
0370 unsigned long orefdst = skb->_skb_refdst;
0371
0372 skb->_skb_refdst = 0UL;
0373 err = ip_frag_reasm(qp, skb, prev_tail, dev);
0374 skb->_skb_refdst = orefdst;
0375 if (err)
0376 inet_frag_kill(&qp->q);
0377 return err;
0378 }
0379
/* Fragment stays queued: drop its dst and report "more to come". */
0380 skb_dst_drop(skb);
0381 return -EINPROGRESS;
0382
0383 insert_error:
/* A duplicate is freed without killing the queue or counting a failure. */
0384 if (err == IPFRAG_DUP) {
0385 kfree_skb(skb);
0386 return -EINVAL;
0387 }
/* Any other insert error is counted as an overlap and kills the queue. */
0388 err = -EINVAL;
0389 __IP_INC_STATS(net, IPSTATS_MIB_REASM_OVERLAPS);
0390 discard_qp:
0391 inet_frag_kill(&qp->q);
0392 __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
0393 err:
0394 kfree_skb(skb);
0395 return err;
0396 }
0397
0398 static bool ip_frag_coalesce_ok(const struct ipq *qp)
0399 {
0400 return qp->q.key.v4.user == IP_DEFRAG_LOCAL_DELIVER;
0401 }
0402
0403
/*
 * ip_frag_reasm - turn a complete queue into one datagram.
 * @qp:        the queue; it is killed via ipq_kill() before anything else
 * @skb:       fragment passed in by ip_frag_queue(); on success its IP header
 *             is rewritten to describe the whole datagram
 * @prev_tail: qp->q.fragments_tail as sampled before @skb was inserted
 * @dev:       device stored into skb->dev on success
 *
 * Returns 0 on success. Failures bump REASMFAILS and return -EINVAL (the ECN
 * combination maps to 0xff in ip_frag_ecn_table), -ENOMEM
 * (inet_frag_reasm_prepare() returned NULL) or -E2BIG (header plus payload
 * exceeds 65535 bytes).
 */
0404 static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
0405 struct sk_buff *prev_tail, struct net_device *dev)
0406 {
0407 struct net *net = qp->q.fqdir->net;
0408 struct iphdr *iph;
0409 void *reasm_data;
0410 int len, err;
0411 u8 ecn;
0412
0413 ipq_kill(qp);
0414
/* qp->ecn is the OR of one-hot ECN bits; the table entry 0xff means invalid. */
0415 ecn = ip_frag_ecn_table[qp->ecn];
0416 if (unlikely(ecn == 0xff)) {
0417 err = -EINVAL;
0418 goto out_fail;
0419 }
0420
0421
0422 reasm_data = inet_frag_reasm_prepare(&qp->q, skb, prev_tail);
0423 if (!reasm_data)
0424 goto out_nomem;
0425
/* Total length is carried in a 16-bit header field, hence the 65535 cap. */
0426 len = ip_hdrlen(skb) + qp->q.len;
0427 err = -E2BIG;
0428 if (len > 65535)
0429 goto out_oversize;
0430
0431 inet_frag_reasm_finish(&qp->q, skb, reasm_data,
0432 ip_frag_coalesce_ok(qp));
0433
0434 skb->dev = dev;
/* Record the largest fragment size seen, DF or not. */
0435 IPCB(skb)->frag_max_size = max(qp->max_df_size, qp->q.max_size);
0436
0437 iph = ip_hdr(skb);
0438 iph->tot_len = htons(len);
0439 iph->tos |= ecn;
0440
0441
0442
0443
0444
0445
0446
0447
0448
/*
 * DF (and IPSKB_FRAG_PMTU) is set on the result only when the largest
 * fragment overall was itself a DF fragment; otherwise frag_off is cleared.
 */
0449 if (qp->max_df_size == qp->q.max_size) {
0450 IPCB(skb)->flags |= IPSKB_FRAG_PMTU;
0451 iph->frag_off = htons(IP_DF);
0452 } else {
0453 iph->frag_off = 0;
0454 }
0455
/* Header fields changed above, so the header checksum is recomputed. */
0456 ip_send_check(iph);
0457
0458 __IP_INC_STATS(net, IPSTATS_MIB_REASMOKS);
/* The queue no longer refers to any fragment. */
0459 qp->q.rb_fragments = RB_ROOT;
0460 qp->q.fragments_tail = NULL;
0461 qp->q.last_run_head = NULL;
0462 return 0;
0463
0464 out_nomem:
0465 net_dbg_ratelimited("queue_glue: no memory for gluing queue %p\n", qp);
0466 err = -ENOMEM;
0467 goto out_fail;
0468 out_oversize:
0469 net_info_ratelimited("Oversized IP packet from %pI4\n", &qp->q.key.v4.saddr);
0470 out_fail:
0471 __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
0472 return err;
0473 }
0474
0475
0476 int ip_defrag(struct net *net, struct sk_buff *skb, u32 user)
0477 {
0478 struct net_device *dev = skb->dev ? : skb_dst(skb)->dev;
0479 int vif = l3mdev_master_ifindex_rcu(dev);
0480 struct ipq *qp;
0481
0482 __IP_INC_STATS(net, IPSTATS_MIB_REASMREQDS);
0483 skb_orphan(skb);
0484
0485
0486 qp = ip_find(net, ip_hdr(skb), user, vif);
0487 if (qp) {
0488 int ret;
0489
0490 spin_lock(&qp->q.lock);
0491
0492 ret = ip_frag_queue(qp, skb);
0493
0494 spin_unlock(&qp->q.lock);
0495 ipq_put(qp);
0496 return ret;
0497 }
0498
0499 __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
0500 kfree_skb(skb);
0501 return -ENOMEM;
0502 }
0503 EXPORT_SYMBOL(ip_defrag);
0504
0505 struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *skb, u32 user)
0506 {
0507 struct iphdr iph;
0508 int netoff;
0509 u32 len;
0510
0511 if (skb->protocol != htons(ETH_P_IP))
0512 return skb;
0513
0514 netoff = skb_network_offset(skb);
0515
0516 if (skb_copy_bits(skb, netoff, &iph, sizeof(iph)) < 0)
0517 return skb;
0518
0519 if (iph.ihl < 5 || iph.version != 4)
0520 return skb;
0521
0522 len = ntohs(iph.tot_len);
0523 if (skb->len < netoff + len || len < (iph.ihl * 4))
0524 return skb;
0525
0526 if (ip_is_fragment(&iph)) {
0527 skb = skb_share_check(skb, GFP_ATOMIC);
0528 if (skb) {
0529 if (!pskb_may_pull(skb, netoff + iph.ihl * 4)) {
0530 kfree_skb(skb);
0531 return NULL;
0532 }
0533 if (pskb_trim_rcsum(skb, netoff + len)) {
0534 kfree_skb(skb);
0535 return NULL;
0536 }
0537 memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
0538 if (ip_defrag(net, skb, user))
0539 return NULL;
0540 skb_clear_hash(skb);
0541 }
0542 }
0543 return skb;
0544 }
0545 EXPORT_SYMBOL(ip_check_defrag);
0546
0547 #ifdef CONFIG_SYSCTL
0548 static int dist_min;
0549
0550 static struct ctl_table ip4_frags_ns_ctl_table[] = {
0551 {
0552 .procname = "ipfrag_high_thresh",
0553 .maxlen = sizeof(unsigned long),
0554 .mode = 0644,
0555 .proc_handler = proc_doulongvec_minmax,
0556 },
0557 {
0558 .procname = "ipfrag_low_thresh",
0559 .maxlen = sizeof(unsigned long),
0560 .mode = 0644,
0561 .proc_handler = proc_doulongvec_minmax,
0562 },
0563 {
0564 .procname = "ipfrag_time",
0565 .maxlen = sizeof(int),
0566 .mode = 0644,
0567 .proc_handler = proc_dointvec_jiffies,
0568 },
0569 {
0570 .procname = "ipfrag_max_dist",
0571 .maxlen = sizeof(int),
0572 .mode = 0644,
0573 .proc_handler = proc_dointvec_minmax,
0574 .extra1 = &dist_min,
0575 },
0576 { }
0577 };
0578
0579
0580 static int ip4_frags_secret_interval_unused;
0581 static struct ctl_table ip4_frags_ctl_table[] = {
0582 {
0583 .procname = "ipfrag_secret_interval",
0584 .data = &ip4_frags_secret_interval_unused,
0585 .maxlen = sizeof(int),
0586 .mode = 0644,
0587 .proc_handler = proc_dointvec_jiffies,
0588 },
0589 { }
0590 };
0591
0592 static int __net_init ip4_frags_ns_ctl_register(struct net *net)
0593 {
0594 struct ctl_table *table;
0595 struct ctl_table_header *hdr;
0596
0597 table = ip4_frags_ns_ctl_table;
0598 if (!net_eq(net, &init_net)) {
0599 table = kmemdup(table, sizeof(ip4_frags_ns_ctl_table), GFP_KERNEL);
0600 if (!table)
0601 goto err_alloc;
0602
0603 }
0604 table[0].data = &net->ipv4.fqdir->high_thresh;
0605 table[0].extra1 = &net->ipv4.fqdir->low_thresh;
0606 table[1].data = &net->ipv4.fqdir->low_thresh;
0607 table[1].extra2 = &net->ipv4.fqdir->high_thresh;
0608 table[2].data = &net->ipv4.fqdir->timeout;
0609 table[3].data = &net->ipv4.fqdir->max_dist;
0610
0611 hdr = register_net_sysctl(net, "net/ipv4", table);
0612 if (!hdr)
0613 goto err_reg;
0614
0615 net->ipv4.frags_hdr = hdr;
0616 return 0;
0617
0618 err_reg:
0619 if (!net_eq(net, &init_net))
0620 kfree(table);
0621 err_alloc:
0622 return -ENOMEM;
0623 }
0624
0625 static void __net_exit ip4_frags_ns_ctl_unregister(struct net *net)
0626 {
0627 struct ctl_table *table;
0628
0629 table = net->ipv4.frags_hdr->ctl_table_arg;
0630 unregister_net_sysctl_table(net->ipv4.frags_hdr);
0631 kfree(table);
0632 }
0633
0634 static void __init ip4_frags_ctl_register(void)
0635 {
0636 register_net_sysctl(&init_net, "net/ipv4", ip4_frags_ctl_table);
0637 }
0638 #else
/* Variant built when CONFIG_SYSCTL is not defined: nothing to register, reports success. */
static int ip4_frags_ns_ctl_register(struct net *net)
{
	return 0;
}
0643
/* Variant built when CONFIG_SYSCTL is not defined: nothing was registered, so nothing to undo. */
static void ip4_frags_ns_ctl_unregister(struct net *net)
{
}
0647
0648 static void __init ip4_frags_ctl_register(void)
0649 {
0650 }
0651 #endif
0652
0653 static int __net_init ipv4_frags_init_net(struct net *net)
0654 {
0655 int res;
0656
0657 res = fqdir_init(&net->ipv4.fqdir, &ip4_frags, net);
0658 if (res < 0)
0659 return res;
0660
0661
0662
0663
0664
0665
0666
0667
0668
0669
0670
0671
0672
0673
0674 net->ipv4.fqdir->high_thresh = 4 * 1024 * 1024;
0675 net->ipv4.fqdir->low_thresh = 3 * 1024 * 1024;
0676
0677
0678
0679
0680
0681 net->ipv4.fqdir->timeout = IP_FRAG_TIME;
0682
0683 net->ipv4.fqdir->max_dist = 64;
0684
0685 res = ip4_frags_ns_ctl_register(net);
0686 if (res < 0)
0687 fqdir_exit(net->ipv4.fqdir);
0688 return res;
0689 }
0690
0691 static void __net_exit ipv4_frags_pre_exit_net(struct net *net)
0692 {
0693 fqdir_pre_exit(net->ipv4.fqdir);
0694 }
0695
0696 static void __net_exit ipv4_frags_exit_net(struct net *net)
0697 {
0698 ip4_frags_ns_ctl_unregister(net);
0699 fqdir_exit(net->ipv4.fqdir);
0700 }
0701
0702 static struct pernet_operations ip4_frags_ops = {
0703 .init = ipv4_frags_init_net,
0704 .pre_exit = ipv4_frags_pre_exit_net,
0705 .exit = ipv4_frags_exit_net,
0706 };
0707
0708
0709 static u32 ip4_key_hashfn(const void *data, u32 len, u32 seed)
0710 {
0711 return jhash2(data,
0712 sizeof(struct frag_v4_compare_key) / sizeof(u32), seed);
0713 }
0714
0715 static u32 ip4_obj_hashfn(const void *data, u32 len, u32 seed)
0716 {
0717 const struct inet_frag_queue *fq = data;
0718
0719 return jhash2((const u32 *)&fq->key.v4,
0720 sizeof(struct frag_v4_compare_key) / sizeof(u32), seed);
0721 }
0722
0723 static int ip4_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr)
0724 {
0725 const struct frag_v4_compare_key *key = arg->key;
0726 const struct inet_frag_queue *fq = ptr;
0727
0728 return !!memcmp(&fq->key, key, sizeof(*key));
0729 }
0730
0731 static const struct rhashtable_params ip4_rhash_params = {
0732 .head_offset = offsetof(struct inet_frag_queue, node),
0733 .key_offset = offsetof(struct inet_frag_queue, key),
0734 .key_len = sizeof(struct frag_v4_compare_key),
0735 .hashfn = ip4_key_hashfn,
0736 .obj_hashfn = ip4_obj_hashfn,
0737 .obj_cmpfn = ip4_obj_cmpfn,
0738 .automatic_shrinking = true,
0739 };
0740
0741 void __init ipfrag_init(void)
0742 {
0743 ip4_frags.constructor = ip4_frag_init;
0744 ip4_frags.destructor = ip4_frag_free;
0745 ip4_frags.qsize = sizeof(struct ipq);
0746 ip4_frags.frag_expire = ip_expire;
0747 ip4_frags.frags_cache_name = ip_frag_cache_name;
0748 ip4_frags.rhash_params = ip4_rhash_params;
0749 if (inet_frags_init(&ip4_frags))
0750 panic("IP: failed to allocate ip4_frags cache\n");
0751 ip4_frags_ctl_register();
0752 register_pernet_subsys(&ip4_frags_ops);
0753 }