0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018
0019
0020
0021
0022
0023
0024
0025
0026
0027
0028
0029
0030
0031
0032
0033
0034
0035
0036
0037
0038
0039
0040
0041
0042
0043
0044
0045
0046
0047
0048
0049
0050
0051
0052
0053
0054
0055
0056
0057
0058
0059
0060
0061 #define pr_fmt(fmt) "IPv4: " fmt
0062
0063 #include <linux/module.h>
0064 #include <linux/bitops.h>
0065 #include <linux/kernel.h>
0066 #include <linux/mm.h>
0067 #include <linux/memblock.h>
0068 #include <linux/socket.h>
0069 #include <linux/errno.h>
0070 #include <linux/in.h>
0071 #include <linux/inet.h>
0072 #include <linux/netdevice.h>
0073 #include <linux/proc_fs.h>
0074 #include <linux/init.h>
0075 #include <linux/skbuff.h>
0076 #include <linux/inetdevice.h>
0077 #include <linux/igmp.h>
0078 #include <linux/pkt_sched.h>
0079 #include <linux/mroute.h>
0080 #include <linux/netfilter_ipv4.h>
0081 #include <linux/random.h>
0082 #include <linux/rcupdate.h>
0083 #include <linux/slab.h>
0084 #include <linux/jhash.h>
0085 #include <net/dst.h>
0086 #include <net/dst_metadata.h>
0087 #include <net/inet_dscp.h>
0088 #include <net/net_namespace.h>
0089 #include <net/ip.h>
0090 #include <net/route.h>
0091 #include <net/inetpeer.h>
0092 #include <net/sock.h>
0093 #include <net/ip_fib.h>
0094 #include <net/nexthop.h>
0095 #include <net/tcp.h>
0096 #include <net/icmp.h>
0097 #include <net/xfrm.h>
0098 #include <net/lwtunnel.h>
0099 #include <net/netevent.h>
0100 #include <net/rtnetlink.h>
0101 #ifdef CONFIG_SYSCTL
0102 #include <linux/sysctl.h>
0103 #endif
0104 #include <net/secure_seq.h>
0105 #include <net/ip_tunnels.h>
0106
0107 #include "fib_lookup.h"
0108
0109 #define RT_FL_TOS(oldflp4) \
0110 ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
0111
0112 #define RT_GC_TIMEOUT (300*HZ)
0113
0114 #define DEFAULT_MIN_PMTU (512 + 20 + 20)
0115 #define DEFAULT_MTU_EXPIRES (10 * 60 * HZ)
0116 #define DEFAULT_MIN_ADVMSS 256
0117 static int ip_rt_max_size;
0118 static int ip_rt_redirect_number __read_mostly = 9;
0119 static int ip_rt_redirect_load __read_mostly = HZ / 50;
0120 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
0121 static int ip_rt_error_cost __read_mostly = HZ;
0122 static int ip_rt_error_burst __read_mostly = 5 * HZ;
0123
0124 static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
0125
0126
0127
0128
0129
0130 INDIRECT_CALLABLE_SCOPE
0131 struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
0132 static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
0133 INDIRECT_CALLABLE_SCOPE
0134 unsigned int ipv4_mtu(const struct dst_entry *dst);
0135 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
0136 static void ipv4_link_failure(struct sk_buff *skb);
0137 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
0138 struct sk_buff *skb, u32 mtu,
0139 bool confirm_neigh);
0140 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk,
0141 struct sk_buff *skb);
0142 static void ipv4_dst_destroy(struct dst_entry *dst);
0143
0144 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
0145 {
0146 WARN_ON(1);
0147 return NULL;
0148 }
0149
0150 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
0151 struct sk_buff *skb,
0152 const void *daddr);
0153 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
0154
0155 static struct dst_ops ipv4_dst_ops = {
0156 .family = AF_INET,
0157 .check = ipv4_dst_check,
0158 .default_advmss = ipv4_default_advmss,
0159 .mtu = ipv4_mtu,
0160 .cow_metrics = ipv4_cow_metrics,
0161 .destroy = ipv4_dst_destroy,
0162 .negative_advice = ipv4_negative_advice,
0163 .link_failure = ipv4_link_failure,
0164 .update_pmtu = ip_rt_update_pmtu,
0165 .redirect = ip_do_redirect,
0166 .local_out = __ip_local_out,
0167 .neigh_lookup = ipv4_neigh_lookup,
0168 .confirm_neigh = ipv4_confirm_neigh,
0169 };
0170
0171 #define ECN_OR_COST(class) TC_PRIO_##class
0172
0173 const __u8 ip_tos2prio[16] = {
0174 TC_PRIO_BESTEFFORT,
0175 ECN_OR_COST(BESTEFFORT),
0176 TC_PRIO_BESTEFFORT,
0177 ECN_OR_COST(BESTEFFORT),
0178 TC_PRIO_BULK,
0179 ECN_OR_COST(BULK),
0180 TC_PRIO_BULK,
0181 ECN_OR_COST(BULK),
0182 TC_PRIO_INTERACTIVE,
0183 ECN_OR_COST(INTERACTIVE),
0184 TC_PRIO_INTERACTIVE,
0185 ECN_OR_COST(INTERACTIVE),
0186 TC_PRIO_INTERACTIVE_BULK,
0187 ECN_OR_COST(INTERACTIVE_BULK),
0188 TC_PRIO_INTERACTIVE_BULK,
0189 ECN_OR_COST(INTERACTIVE_BULK)
0190 };
0191 EXPORT_SYMBOL(ip_tos2prio);
0192
0193 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
0194 #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
0195
0196 #ifdef CONFIG_PROC_FS
0197 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
0198 {
0199 if (*pos)
0200 return NULL;
0201 return SEQ_START_TOKEN;
0202 }
0203
0204 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
0205 {
0206 ++*pos;
0207 return NULL;
0208 }
0209
0210 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
0211 {
0212 }
0213
0214 static int rt_cache_seq_show(struct seq_file *seq, void *v)
0215 {
0216 if (v == SEQ_START_TOKEN)
0217 seq_printf(seq, "%-127s\n",
0218 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
0219 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
0220 "HHUptod\tSpecDst");
0221 return 0;
0222 }
0223
0224 static const struct seq_operations rt_cache_seq_ops = {
0225 .start = rt_cache_seq_start,
0226 .next = rt_cache_seq_next,
0227 .stop = rt_cache_seq_stop,
0228 .show = rt_cache_seq_show,
0229 };
0230
0231 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
0232 {
0233 int cpu;
0234
0235 if (*pos == 0)
0236 return SEQ_START_TOKEN;
0237
0238 for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
0239 if (!cpu_possible(cpu))
0240 continue;
0241 *pos = cpu+1;
0242 return &per_cpu(rt_cache_stat, cpu);
0243 }
0244 return NULL;
0245 }
0246
0247 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
0248 {
0249 int cpu;
0250
0251 for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
0252 if (!cpu_possible(cpu))
0253 continue;
0254 *pos = cpu+1;
0255 return &per_cpu(rt_cache_stat, cpu);
0256 }
0257 (*pos)++;
0258 return NULL;
0259
0260 }
0261
0262 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
0263 {
0264
0265 }
0266
0267 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
0268 {
0269 struct rt_cache_stat *st = v;
0270
0271 if (v == SEQ_START_TOKEN) {
0272 seq_puts(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
0273 return 0;
0274 }
0275
0276 seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x "
0277 "%08x %08x %08x %08x %08x %08x "
0278 "%08x %08x %08x %08x\n",
0279 dst_entries_get_slow(&ipv4_dst_ops),
0280 0,
0281 st->in_slow_tot,
0282 st->in_slow_mc,
0283 st->in_no_route,
0284 st->in_brd,
0285 st->in_martian_dst,
0286 st->in_martian_src,
0287
0288 0,
0289 st->out_slow_tot,
0290 st->out_slow_mc,
0291
0292 0,
0293 0,
0294 0,
0295 0,
0296 0,
0297 0
0298 );
0299 return 0;
0300 }
0301
0302 static const struct seq_operations rt_cpu_seq_ops = {
0303 .start = rt_cpu_seq_start,
0304 .next = rt_cpu_seq_next,
0305 .stop = rt_cpu_seq_stop,
0306 .show = rt_cpu_seq_show,
0307 };
0308
0309 #ifdef CONFIG_IP_ROUTE_CLASSID
0310 static int rt_acct_proc_show(struct seq_file *m, void *v)
0311 {
0312 struct ip_rt_acct *dst, *src;
0313 unsigned int i, j;
0314
0315 dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
0316 if (!dst)
0317 return -ENOMEM;
0318
0319 for_each_possible_cpu(i) {
0320 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
0321 for (j = 0; j < 256; j++) {
0322 dst[j].o_bytes += src[j].o_bytes;
0323 dst[j].o_packets += src[j].o_packets;
0324 dst[j].i_bytes += src[j].i_bytes;
0325 dst[j].i_packets += src[j].i_packets;
0326 }
0327 }
0328
0329 seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
0330 kfree(dst);
0331 return 0;
0332 }
0333 #endif
0334
0335 static int __net_init ip_rt_do_proc_init(struct net *net)
0336 {
0337 struct proc_dir_entry *pde;
0338
0339 pde = proc_create_seq("rt_cache", 0444, net->proc_net,
0340 &rt_cache_seq_ops);
0341 if (!pde)
0342 goto err1;
0343
0344 pde = proc_create_seq("rt_cache", 0444, net->proc_net_stat,
0345 &rt_cpu_seq_ops);
0346 if (!pde)
0347 goto err2;
0348
0349 #ifdef CONFIG_IP_ROUTE_CLASSID
0350 pde = proc_create_single("rt_acct", 0, net->proc_net,
0351 rt_acct_proc_show);
0352 if (!pde)
0353 goto err3;
0354 #endif
0355 return 0;
0356
0357 #ifdef CONFIG_IP_ROUTE_CLASSID
0358 err3:
0359 remove_proc_entry("rt_cache", net->proc_net_stat);
0360 #endif
0361 err2:
0362 remove_proc_entry("rt_cache", net->proc_net);
0363 err1:
0364 return -ENOMEM;
0365 }
0366
0367 static void __net_exit ip_rt_do_proc_exit(struct net *net)
0368 {
0369 remove_proc_entry("rt_cache", net->proc_net_stat);
0370 remove_proc_entry("rt_cache", net->proc_net);
0371 #ifdef CONFIG_IP_ROUTE_CLASSID
0372 remove_proc_entry("rt_acct", net->proc_net);
0373 #endif
0374 }
0375
0376 static struct pernet_operations ip_rt_proc_ops __net_initdata = {
0377 .init = ip_rt_do_proc_init,
0378 .exit = ip_rt_do_proc_exit,
0379 };
0380
0381 static int __init ip_rt_proc_init(void)
0382 {
0383 return register_pernet_subsys(&ip_rt_proc_ops);
0384 }
0385
0386 #else
0387 static inline int ip_rt_proc_init(void)
0388 {
0389 return 0;
0390 }
0391 #endif
0392
0393 static inline bool rt_is_expired(const struct rtable *rth)
0394 {
0395 return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
0396 }
0397
0398 void rt_cache_flush(struct net *net)
0399 {
0400 rt_genid_bump_ipv4(net);
0401 }
0402
0403 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
0404 struct sk_buff *skb,
0405 const void *daddr)
0406 {
0407 const struct rtable *rt = container_of(dst, struct rtable, dst);
0408 struct net_device *dev = dst->dev;
0409 struct neighbour *n;
0410
0411 rcu_read_lock_bh();
0412
0413 if (likely(rt->rt_gw_family == AF_INET)) {
0414 n = ip_neigh_gw4(dev, rt->rt_gw4);
0415 } else if (rt->rt_gw_family == AF_INET6) {
0416 n = ip_neigh_gw6(dev, &rt->rt_gw6);
0417 } else {
0418 __be32 pkey;
0419
0420 pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr);
0421 n = ip_neigh_gw4(dev, pkey);
0422 }
0423
0424 if (!IS_ERR(n) && !refcount_inc_not_zero(&n->refcnt))
0425 n = NULL;
0426
0427 rcu_read_unlock_bh();
0428
0429 return n;
0430 }
0431
0432 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
0433 {
0434 const struct rtable *rt = container_of(dst, struct rtable, dst);
0435 struct net_device *dev = dst->dev;
0436 const __be32 *pkey = daddr;
0437
0438 if (rt->rt_gw_family == AF_INET) {
0439 pkey = (const __be32 *)&rt->rt_gw4;
0440 } else if (rt->rt_gw_family == AF_INET6) {
0441 return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);
0442 } else if (!daddr ||
0443 (rt->rt_flags &
0444 (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) {
0445 return;
0446 }
0447 __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
0448 }
0449
0450
0451
0452
0453 static u32 ip_idents_mask __read_mostly;
0454 static atomic_t *ip_idents __read_mostly;
0455 static u32 *ip_tstamps __read_mostly;
0456
0457
0458
0459
0460
0461 static u32 ip_idents_reserve(u32 hash, int segs)
0462 {
0463 u32 bucket, old, now = (u32)jiffies;
0464 atomic_t *p_id;
0465 u32 *p_tstamp;
0466 u32 delta = 0;
0467
0468 bucket = hash & ip_idents_mask;
0469 p_tstamp = ip_tstamps + bucket;
0470 p_id = ip_idents + bucket;
0471 old = READ_ONCE(*p_tstamp);
0472
0473 if (old != now && cmpxchg(p_tstamp, old, now) == old)
0474 delta = prandom_u32_max(now - old);
0475
0476
0477
0478
0479
0480 return atomic_add_return(segs + delta, p_id) - segs;
0481 }
0482
0483 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
0484 {
0485 u32 hash, id;
0486
0487
0488 if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
0489 get_random_bytes(&net->ipv4.ip_id_key,
0490 sizeof(net->ipv4.ip_id_key));
0491
0492 hash = siphash_3u32((__force u32)iph->daddr,
0493 (__force u32)iph->saddr,
0494 iph->protocol,
0495 &net->ipv4.ip_id_key);
0496 id = ip_idents_reserve(hash, segs);
0497 iph->id = htons(id);
0498 }
0499 EXPORT_SYMBOL(__ip_select_ident);
0500
0501 static void ip_rt_fix_tos(struct flowi4 *fl4)
0502 {
0503 __u8 tos = RT_FL_TOS(fl4);
0504
0505 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
0506 if (tos & RTO_ONLINK)
0507 fl4->flowi4_scope = RT_SCOPE_LINK;
0508 }
0509
0510 static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
0511 const struct sock *sk, const struct iphdr *iph,
0512 int oif, __u8 tos, u8 prot, u32 mark,
0513 int flow_flags)
0514 {
0515 __u8 scope = RT_SCOPE_UNIVERSE;
0516
0517 if (sk) {
0518 const struct inet_sock *inet = inet_sk(sk);
0519
0520 oif = sk->sk_bound_dev_if;
0521 mark = sk->sk_mark;
0522 tos = ip_sock_rt_tos(sk);
0523 scope = ip_sock_rt_scope(sk);
0524 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
0525 }
0526
0527 flowi4_init_output(fl4, oif, mark, tos & IPTOS_RT_MASK, scope,
0528 prot, flow_flags, iph->daddr, iph->saddr, 0, 0,
0529 sock_net_uid(net, sk));
0530 }
0531
0532 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
0533 const struct sock *sk)
0534 {
0535 const struct net *net = dev_net(skb->dev);
0536 const struct iphdr *iph = ip_hdr(skb);
0537 int oif = skb->dev->ifindex;
0538 u8 prot = iph->protocol;
0539 u32 mark = skb->mark;
0540 __u8 tos = iph->tos;
0541
0542 __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
0543 }
0544
0545 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
0546 {
0547 const struct inet_sock *inet = inet_sk(sk);
0548 const struct ip_options_rcu *inet_opt;
0549 __be32 daddr = inet->inet_daddr;
0550
0551 rcu_read_lock();
0552 inet_opt = rcu_dereference(inet->inet_opt);
0553 if (inet_opt && inet_opt->opt.srr)
0554 daddr = inet_opt->opt.faddr;
0555 flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
0556 ip_sock_rt_tos(sk) & IPTOS_RT_MASK,
0557 ip_sock_rt_scope(sk),
0558 inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
0559 inet_sk_flowi_flags(sk),
0560 daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
0561 rcu_read_unlock();
0562 }
0563
0564 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
0565 const struct sk_buff *skb)
0566 {
0567 if (skb)
0568 build_skb_flow_key(fl4, skb, sk);
0569 else
0570 build_sk_flow_key(fl4, sk);
0571 }
0572
0573 static DEFINE_SPINLOCK(fnhe_lock);
0574
0575 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
0576 {
0577 struct rtable *rt;
0578
0579 rt = rcu_dereference(fnhe->fnhe_rth_input);
0580 if (rt) {
0581 RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
0582 dst_dev_put(&rt->dst);
0583 dst_release(&rt->dst);
0584 }
0585 rt = rcu_dereference(fnhe->fnhe_rth_output);
0586 if (rt) {
0587 RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
0588 dst_dev_put(&rt->dst);
0589 dst_release(&rt->dst);
0590 }
0591 }
0592
0593 static void fnhe_remove_oldest(struct fnhe_hash_bucket *hash)
0594 {
0595 struct fib_nh_exception __rcu **fnhe_p, **oldest_p;
0596 struct fib_nh_exception *fnhe, *oldest = NULL;
0597
0598 for (fnhe_p = &hash->chain; ; fnhe_p = &fnhe->fnhe_next) {
0599 fnhe = rcu_dereference_protected(*fnhe_p,
0600 lockdep_is_held(&fnhe_lock));
0601 if (!fnhe)
0602 break;
0603 if (!oldest ||
0604 time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) {
0605 oldest = fnhe;
0606 oldest_p = fnhe_p;
0607 }
0608 }
0609 fnhe_flush_routes(oldest);
0610 *oldest_p = oldest->fnhe_next;
0611 kfree_rcu(oldest, rcu);
0612 }
0613
0614 static u32 fnhe_hashfun(__be32 daddr)
0615 {
0616 static siphash_aligned_key_t fnhe_hash_key;
0617 u64 hval;
0618
0619 net_get_random_once(&fnhe_hash_key, sizeof(fnhe_hash_key));
0620 hval = siphash_1u32((__force u32)daddr, &fnhe_hash_key);
0621 return hash_64(hval, FNHE_HASH_SHIFT);
0622 }
0623
0624 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
0625 {
0626 rt->rt_pmtu = fnhe->fnhe_pmtu;
0627 rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
0628 rt->dst.expires = fnhe->fnhe_expires;
0629
0630 if (fnhe->fnhe_gw) {
0631 rt->rt_flags |= RTCF_REDIRECTED;
0632 rt->rt_uses_gateway = 1;
0633 rt->rt_gw_family = AF_INET;
0634 rt->rt_gw4 = fnhe->fnhe_gw;
0635 }
0636 }
0637
0638 static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
0639 __be32 gw, u32 pmtu, bool lock,
0640 unsigned long expires)
0641 {
0642 struct fnhe_hash_bucket *hash;
0643 struct fib_nh_exception *fnhe;
0644 struct rtable *rt;
0645 u32 genid, hval;
0646 unsigned int i;
0647 int depth;
0648
0649 genid = fnhe_genid(dev_net(nhc->nhc_dev));
0650 hval = fnhe_hashfun(daddr);
0651
0652 spin_lock_bh(&fnhe_lock);
0653
0654 hash = rcu_dereference(nhc->nhc_exceptions);
0655 if (!hash) {
0656 hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
0657 if (!hash)
0658 goto out_unlock;
0659 rcu_assign_pointer(nhc->nhc_exceptions, hash);
0660 }
0661
0662 hash += hval;
0663
0664 depth = 0;
0665 for (fnhe = rcu_dereference(hash->chain); fnhe;
0666 fnhe = rcu_dereference(fnhe->fnhe_next)) {
0667 if (fnhe->fnhe_daddr == daddr)
0668 break;
0669 depth++;
0670 }
0671
0672 if (fnhe) {
0673 if (fnhe->fnhe_genid != genid)
0674 fnhe->fnhe_genid = genid;
0675 if (gw)
0676 fnhe->fnhe_gw = gw;
0677 if (pmtu) {
0678 fnhe->fnhe_pmtu = pmtu;
0679 fnhe->fnhe_mtu_locked = lock;
0680 }
0681 fnhe->fnhe_expires = max(1UL, expires);
0682
0683 rt = rcu_dereference(fnhe->fnhe_rth_input);
0684 if (rt)
0685 fill_route_from_fnhe(rt, fnhe);
0686 rt = rcu_dereference(fnhe->fnhe_rth_output);
0687 if (rt)
0688 fill_route_from_fnhe(rt, fnhe);
0689 } else {
0690
0691 int max_depth = FNHE_RECLAIM_DEPTH +
0692 prandom_u32_max(FNHE_RECLAIM_DEPTH);
0693
0694 while (depth > max_depth) {
0695 fnhe_remove_oldest(hash);
0696 depth--;
0697 }
0698
0699 fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
0700 if (!fnhe)
0701 goto out_unlock;
0702
0703 fnhe->fnhe_next = hash->chain;
0704
0705 fnhe->fnhe_genid = genid;
0706 fnhe->fnhe_daddr = daddr;
0707 fnhe->fnhe_gw = gw;
0708 fnhe->fnhe_pmtu = pmtu;
0709 fnhe->fnhe_mtu_locked = lock;
0710 fnhe->fnhe_expires = max(1UL, expires);
0711
0712 rcu_assign_pointer(hash->chain, fnhe);
0713
0714
0715
0716
0717
0718 rt = rcu_dereference(nhc->nhc_rth_input);
0719 if (rt)
0720 rt->dst.obsolete = DST_OBSOLETE_KILL;
0721
0722 for_each_possible_cpu(i) {
0723 struct rtable __rcu **prt;
0724
0725 prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
0726 rt = rcu_dereference(*prt);
0727 if (rt)
0728 rt->dst.obsolete = DST_OBSOLETE_KILL;
0729 }
0730 }
0731
0732 fnhe->fnhe_stamp = jiffies;
0733
0734 out_unlock:
0735 spin_unlock_bh(&fnhe_lock);
0736 }
0737
0738 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
0739 bool kill_route)
0740 {
0741 __be32 new_gw = icmp_hdr(skb)->un.gateway;
0742 __be32 old_gw = ip_hdr(skb)->saddr;
0743 struct net_device *dev = skb->dev;
0744 struct in_device *in_dev;
0745 struct fib_result res;
0746 struct neighbour *n;
0747 struct net *net;
0748
0749 switch (icmp_hdr(skb)->code & 7) {
0750 case ICMP_REDIR_NET:
0751 case ICMP_REDIR_NETTOS:
0752 case ICMP_REDIR_HOST:
0753 case ICMP_REDIR_HOSTTOS:
0754 break;
0755
0756 default:
0757 return;
0758 }
0759
0760 if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw)
0761 return;
0762
0763 in_dev = __in_dev_get_rcu(dev);
0764 if (!in_dev)
0765 return;
0766
0767 net = dev_net(dev);
0768 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
0769 ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
0770 ipv4_is_zeronet(new_gw))
0771 goto reject_redirect;
0772
0773 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
0774 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
0775 goto reject_redirect;
0776 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
0777 goto reject_redirect;
0778 } else {
0779 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
0780 goto reject_redirect;
0781 }
0782
0783 n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
0784 if (!n)
0785 n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
0786 if (!IS_ERR(n)) {
0787 if (!(n->nud_state & NUD_VALID)) {
0788 neigh_event_send(n, NULL);
0789 } else {
0790 if (fib_lookup(net, fl4, &res, 0) == 0) {
0791 struct fib_nh_common *nhc;
0792
0793 fib_select_path(net, &res, fl4, skb);
0794 nhc = FIB_RES_NHC(res);
0795 update_or_create_fnhe(nhc, fl4->daddr, new_gw,
0796 0, false,
0797 jiffies + ip_rt_gc_timeout);
0798 }
0799 if (kill_route)
0800 rt->dst.obsolete = DST_OBSOLETE_KILL;
0801 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
0802 }
0803 neigh_release(n);
0804 }
0805 return;
0806
0807 reject_redirect:
0808 #ifdef CONFIG_IP_ROUTE_VERBOSE
0809 if (IN_DEV_LOG_MARTIANS(in_dev)) {
0810 const struct iphdr *iph = (const struct iphdr *) skb->data;
0811 __be32 daddr = iph->daddr;
0812 __be32 saddr = iph->saddr;
0813
0814 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
0815 " Advised path = %pI4 -> %pI4\n",
0816 &old_gw, dev->name, &new_gw,
0817 &saddr, &daddr);
0818 }
0819 #endif
0820 ;
0821 }
0822
0823 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
0824 {
0825 struct rtable *rt;
0826 struct flowi4 fl4;
0827 const struct iphdr *iph = (const struct iphdr *) skb->data;
0828 struct net *net = dev_net(skb->dev);
0829 int oif = skb->dev->ifindex;
0830 u8 prot = iph->protocol;
0831 u32 mark = skb->mark;
0832 __u8 tos = iph->tos;
0833
0834 rt = (struct rtable *) dst;
0835
0836 __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
0837 __ip_do_redirect(rt, skb, &fl4, true);
0838 }
0839
0840 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
0841 {
0842 struct rtable *rt = (struct rtable *)dst;
0843 struct dst_entry *ret = dst;
0844
0845 if (rt) {
0846 if (dst->obsolete > 0) {
0847 ip_rt_put(rt);
0848 ret = NULL;
0849 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
0850 rt->dst.expires) {
0851 ip_rt_put(rt);
0852 ret = NULL;
0853 }
0854 }
0855 return ret;
0856 }
0857
0858
0859
0860
0861
0862
0863
0864
0865
0866
0867
0868
0869
0870
0871
0872
0873
0874 void ip_rt_send_redirect(struct sk_buff *skb)
0875 {
0876 struct rtable *rt = skb_rtable(skb);
0877 struct in_device *in_dev;
0878 struct inet_peer *peer;
0879 struct net *net;
0880 int log_martians;
0881 int vif;
0882
0883 rcu_read_lock();
0884 in_dev = __in_dev_get_rcu(rt->dst.dev);
0885 if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
0886 rcu_read_unlock();
0887 return;
0888 }
0889 log_martians = IN_DEV_LOG_MARTIANS(in_dev);
0890 vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
0891 rcu_read_unlock();
0892
0893 net = dev_net(rt->dst.dev);
0894 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
0895 if (!peer) {
0896 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
0897 rt_nexthop(rt, ip_hdr(skb)->daddr));
0898 return;
0899 }
0900
0901
0902
0903
0904 if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
0905 peer->rate_tokens = 0;
0906 peer->n_redirects = 0;
0907 }
0908
0909
0910
0911
0912 if (peer->n_redirects >= ip_rt_redirect_number) {
0913 peer->rate_last = jiffies;
0914 goto out_put_peer;
0915 }
0916
0917
0918
0919
0920 if (peer->n_redirects == 0 ||
0921 time_after(jiffies,
0922 (peer->rate_last +
0923 (ip_rt_redirect_load << peer->n_redirects)))) {
0924 __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
0925
0926 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
0927 peer->rate_last = jiffies;
0928 ++peer->n_redirects;
0929 #ifdef CONFIG_IP_ROUTE_VERBOSE
0930 if (log_martians &&
0931 peer->n_redirects == ip_rt_redirect_number)
0932 net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
0933 &ip_hdr(skb)->saddr, inet_iif(skb),
0934 &ip_hdr(skb)->daddr, &gw);
0935 #endif
0936 }
0937 out_put_peer:
0938 inet_putpeer(peer);
0939 }
0940
0941 static int ip_error(struct sk_buff *skb)
0942 {
0943 struct rtable *rt = skb_rtable(skb);
0944 struct net_device *dev = skb->dev;
0945 struct in_device *in_dev;
0946 struct inet_peer *peer;
0947 unsigned long now;
0948 struct net *net;
0949 SKB_DR(reason);
0950 bool send;
0951 int code;
0952
0953 if (netif_is_l3_master(skb->dev)) {
0954 dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
0955 if (!dev)
0956 goto out;
0957 }
0958
0959 in_dev = __in_dev_get_rcu(dev);
0960
0961
0962 if (!in_dev)
0963 goto out;
0964
0965 net = dev_net(rt->dst.dev);
0966 if (!IN_DEV_FORWARD(in_dev)) {
0967 switch (rt->dst.error) {
0968 case EHOSTUNREACH:
0969 SKB_DR_SET(reason, IP_INADDRERRORS);
0970 __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
0971 break;
0972
0973 case ENETUNREACH:
0974 SKB_DR_SET(reason, IP_INNOROUTES);
0975 __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
0976 break;
0977 }
0978 goto out;
0979 }
0980
0981 switch (rt->dst.error) {
0982 case EINVAL:
0983 default:
0984 goto out;
0985 case EHOSTUNREACH:
0986 code = ICMP_HOST_UNREACH;
0987 break;
0988 case ENETUNREACH:
0989 code = ICMP_NET_UNREACH;
0990 SKB_DR_SET(reason, IP_INNOROUTES);
0991 __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
0992 break;
0993 case EACCES:
0994 code = ICMP_PKT_FILTERED;
0995 break;
0996 }
0997
0998 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
0999 l3mdev_master_ifindex(skb->dev), 1);
1000
1001 send = true;
1002 if (peer) {
1003 now = jiffies;
1004 peer->rate_tokens += now - peer->rate_last;
1005 if (peer->rate_tokens > ip_rt_error_burst)
1006 peer->rate_tokens = ip_rt_error_burst;
1007 peer->rate_last = now;
1008 if (peer->rate_tokens >= ip_rt_error_cost)
1009 peer->rate_tokens -= ip_rt_error_cost;
1010 else
1011 send = false;
1012 inet_putpeer(peer);
1013 }
1014 if (send)
1015 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1016
1017 out: kfree_skb_reason(skb, reason);
1018 return 0;
1019 }
1020
1021 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1022 {
1023 struct dst_entry *dst = &rt->dst;
1024 struct net *net = dev_net(dst->dev);
1025 struct fib_result res;
1026 bool lock = false;
1027 u32 old_mtu;
1028
1029 if (ip_mtu_locked(dst))
1030 return;
1031
1032 old_mtu = ipv4_mtu(dst);
1033 if (old_mtu < mtu)
1034 return;
1035
1036 if (mtu < net->ipv4.ip_rt_min_pmtu) {
1037 lock = true;
1038 mtu = min(old_mtu, net->ipv4.ip_rt_min_pmtu);
1039 }
1040
1041 if (rt->rt_pmtu == mtu && !lock &&
1042 time_before(jiffies, dst->expires - net->ipv4.ip_rt_mtu_expires / 2))
1043 return;
1044
1045 rcu_read_lock();
1046 if (fib_lookup(net, fl4, &res, 0) == 0) {
1047 struct fib_nh_common *nhc;
1048
1049 fib_select_path(net, &res, fl4, NULL);
1050 nhc = FIB_RES_NHC(res);
1051 update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
1052 jiffies + net->ipv4.ip_rt_mtu_expires);
1053 }
1054 rcu_read_unlock();
1055 }
1056
1057 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1058 struct sk_buff *skb, u32 mtu,
1059 bool confirm_neigh)
1060 {
1061 struct rtable *rt = (struct rtable *) dst;
1062 struct flowi4 fl4;
1063
1064 ip_rt_build_flow_key(&fl4, sk, skb);
1065
1066
1067 if (skb && netif_is_any_bridge_port(skb->dev))
1068 fl4.flowi4_oif = 0;
1069
1070 __ip_rt_update_pmtu(rt, &fl4, mtu);
1071 }
1072
1073 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1074 int oif, u8 protocol)
1075 {
1076 const struct iphdr *iph = (const struct iphdr *)skb->data;
1077 struct flowi4 fl4;
1078 struct rtable *rt;
1079 u32 mark = IP4_REPLY_MARK(net, skb->mark);
1080
1081 __build_flow_key(net, &fl4, NULL, iph, oif, iph->tos, protocol, mark,
1082 0);
1083 rt = __ip_route_output_key(net, &fl4);
1084 if (!IS_ERR(rt)) {
1085 __ip_rt_update_pmtu(rt, &fl4, mtu);
1086 ip_rt_put(rt);
1087 }
1088 }
1089 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1090
1091 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1092 {
1093 const struct iphdr *iph = (const struct iphdr *)skb->data;
1094 struct flowi4 fl4;
1095 struct rtable *rt;
1096
1097 __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1098
1099 if (!fl4.flowi4_mark)
1100 fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1101
1102 rt = __ip_route_output_key(sock_net(sk), &fl4);
1103 if (!IS_ERR(rt)) {
1104 __ip_rt_update_pmtu(rt, &fl4, mtu);
1105 ip_rt_put(rt);
1106 }
1107 }
1108
1109 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1110 {
1111 const struct iphdr *iph = (const struct iphdr *)skb->data;
1112 struct flowi4 fl4;
1113 struct rtable *rt;
1114 struct dst_entry *odst = NULL;
1115 bool new = false;
1116 struct net *net = sock_net(sk);
1117
1118 bh_lock_sock(sk);
1119
1120 if (!ip_sk_accept_pmtu(sk))
1121 goto out;
1122
1123 odst = sk_dst_get(sk);
1124
1125 if (sock_owned_by_user(sk) || !odst) {
1126 __ipv4_sk_update_pmtu(skb, sk, mtu);
1127 goto out;
1128 }
1129
1130 __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1131
1132 rt = (struct rtable *)odst;
1133 if (odst->obsolete && !odst->ops->check(odst, 0)) {
1134 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1135 if (IS_ERR(rt))
1136 goto out;
1137
1138 new = true;
1139 }
1140
1141 __ip_rt_update_pmtu((struct rtable *)xfrm_dst_path(&rt->dst), &fl4, mtu);
1142
1143 if (!dst_check(&rt->dst, 0)) {
1144 if (new)
1145 dst_release(&rt->dst);
1146
1147 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1148 if (IS_ERR(rt))
1149 goto out;
1150
1151 new = true;
1152 }
1153
1154 if (new)
1155 sk_dst_set(sk, &rt->dst);
1156
1157 out:
1158 bh_unlock_sock(sk);
1159 dst_release(odst);
1160 }
1161 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1162
1163 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1164 int oif, u8 protocol)
1165 {
1166 const struct iphdr *iph = (const struct iphdr *)skb->data;
1167 struct flowi4 fl4;
1168 struct rtable *rt;
1169
1170 __build_flow_key(net, &fl4, NULL, iph, oif, iph->tos, protocol, 0, 0);
1171 rt = __ip_route_output_key(net, &fl4);
1172 if (!IS_ERR(rt)) {
1173 __ip_do_redirect(rt, skb, &fl4, false);
1174 ip_rt_put(rt);
1175 }
1176 }
1177 EXPORT_SYMBOL_GPL(ipv4_redirect);
1178
1179 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1180 {
1181 const struct iphdr *iph = (const struct iphdr *)skb->data;
1182 struct flowi4 fl4;
1183 struct rtable *rt;
1184 struct net *net = sock_net(sk);
1185
1186 __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1187 rt = __ip_route_output_key(net, &fl4);
1188 if (!IS_ERR(rt)) {
1189 __ip_do_redirect(rt, skb, &fl4, false);
1190 ip_rt_put(rt);
1191 }
1192 }
1193 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1194
1195 INDIRECT_CALLABLE_SCOPE struct dst_entry *ipv4_dst_check(struct dst_entry *dst,
1196 u32 cookie)
1197 {
1198 struct rtable *rt = (struct rtable *) dst;
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208 if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1209 return NULL;
1210 return dst;
1211 }
1212 EXPORT_INDIRECT_CALLABLE(ipv4_dst_check);
1213
1214 static void ipv4_send_dest_unreach(struct sk_buff *skb)
1215 {
1216 struct ip_options opt;
1217 int res;
1218
1219
1220
1221
1222 if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
1223 ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
1224 return;
1225
1226 memset(&opt, 0, sizeof(opt));
1227 if (ip_hdr(skb)->ihl > 5) {
1228 if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
1229 return;
1230 opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);
1231
1232 rcu_read_lock();
1233 res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL);
1234 rcu_read_unlock();
1235
1236 if (res)
1237 return;
1238 }
1239 __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
1240 }
1241
1242 static void ipv4_link_failure(struct sk_buff *skb)
1243 {
1244 struct rtable *rt;
1245
1246 ipv4_send_dest_unreach(skb);
1247
1248 rt = skb_rtable(skb);
1249 if (rt)
1250 dst_set_expires(&rt->dst, 0);
1251 }
1252
1253 static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1254 {
1255 pr_debug("%s: %pI4 -> %pI4, %s\n",
1256 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1257 skb->dev ? skb->dev->name : "?");
1258 kfree_skb(skb);
1259 WARN_ON(1);
1260 return 0;
1261 }
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1273 {
1274 __be32 src;
1275
1276 if (rt_is_output_route(rt))
1277 src = ip_hdr(skb)->saddr;
1278 else {
1279 struct fib_result res;
1280 struct iphdr *iph = ip_hdr(skb);
1281 struct flowi4 fl4 = {
1282 .daddr = iph->daddr,
1283 .saddr = iph->saddr,
1284 .flowi4_tos = RT_TOS(iph->tos),
1285 .flowi4_oif = rt->dst.dev->ifindex,
1286 .flowi4_iif = skb->dev->ifindex,
1287 .flowi4_mark = skb->mark,
1288 };
1289
1290 rcu_read_lock();
1291 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1292 src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
1293 else
1294 src = inet_select_addr(rt->dst.dev,
1295 rt_nexthop(rt, iph->daddr),
1296 RT_SCOPE_UNIVERSE);
1297 rcu_read_unlock();
1298 }
1299 memcpy(addr, &src, 4);
1300 }
1301
1302 #ifdef CONFIG_IP_ROUTE_CLASSID
1303 static void set_class_tag(struct rtable *rt, u32 tag)
1304 {
1305 if (!(rt->dst.tclassid & 0xFFFF))
1306 rt->dst.tclassid |= tag & 0xFFFF;
1307 if (!(rt->dst.tclassid & 0xFFFF0000))
1308 rt->dst.tclassid |= tag & 0xFFFF0000;
1309 }
1310 #endif
1311
1312 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1313 {
1314 struct net *net = dev_net(dst->dev);
1315 unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1316 unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
1317 net->ipv4.ip_rt_min_advmss);
1318
1319 return min(advmss, IPV4_MAX_PMTU - header_size);
1320 }
1321
1322 INDIRECT_CALLABLE_SCOPE unsigned int ipv4_mtu(const struct dst_entry *dst)
1323 {
1324 return ip_dst_mtu_maybe_forward(dst, false);
1325 }
1326 EXPORT_INDIRECT_CALLABLE(ipv4_mtu);
1327
1328 static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr)
1329 {
1330 struct fnhe_hash_bucket *hash;
1331 struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1332 u32 hval = fnhe_hashfun(daddr);
1333
1334 spin_lock_bh(&fnhe_lock);
1335
1336 hash = rcu_dereference_protected(nhc->nhc_exceptions,
1337 lockdep_is_held(&fnhe_lock));
1338 hash += hval;
1339
1340 fnhe_p = &hash->chain;
1341 fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1342 while (fnhe) {
1343 if (fnhe->fnhe_daddr == daddr) {
1344 rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1345 fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1346
1347
1348
1349 fnhe->fnhe_daddr = 0;
1350 fnhe_flush_routes(fnhe);
1351 kfree_rcu(fnhe, rcu);
1352 break;
1353 }
1354 fnhe_p = &fnhe->fnhe_next;
1355 fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1356 lockdep_is_held(&fnhe_lock));
1357 }
1358
1359 spin_unlock_bh(&fnhe_lock);
1360 }
1361
1362 static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc,
1363 __be32 daddr)
1364 {
1365 struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions);
1366 struct fib_nh_exception *fnhe;
1367 u32 hval;
1368
1369 if (!hash)
1370 return NULL;
1371
1372 hval = fnhe_hashfun(daddr);
1373
1374 for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1375 fnhe = rcu_dereference(fnhe->fnhe_next)) {
1376 if (fnhe->fnhe_daddr == daddr) {
1377 if (fnhe->fnhe_expires &&
1378 time_after(jiffies, fnhe->fnhe_expires)) {
1379 ip_del_fnhe(nhc, daddr);
1380 break;
1381 }
1382 return fnhe;
1383 }
1384 }
1385 return NULL;
1386 }
1387
1388
1389
1390
1391
1392
1393
1394 u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
1395 {
1396 struct fib_nh_common *nhc = res->nhc;
1397 struct net_device *dev = nhc->nhc_dev;
1398 struct fib_info *fi = res->fi;
1399 u32 mtu = 0;
1400
1401 if (READ_ONCE(dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu) ||
1402 fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
1403 mtu = fi->fib_mtu;
1404
1405 if (likely(!mtu)) {
1406 struct fib_nh_exception *fnhe;
1407
1408 fnhe = find_exception(nhc, daddr);
1409 if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
1410 mtu = fnhe->fnhe_pmtu;
1411 }
1412
1413 if (likely(!mtu))
1414 mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);
1415
1416 return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
1417 }
1418
1419 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1420 __be32 daddr, const bool do_cache)
1421 {
1422 bool ret = false;
1423
1424 spin_lock_bh(&fnhe_lock);
1425
1426 if (daddr == fnhe->fnhe_daddr) {
1427 struct rtable __rcu **porig;
1428 struct rtable *orig;
1429 int genid = fnhe_genid(dev_net(rt->dst.dev));
1430
1431 if (rt_is_input_route(rt))
1432 porig = &fnhe->fnhe_rth_input;
1433 else
1434 porig = &fnhe->fnhe_rth_output;
1435 orig = rcu_dereference(*porig);
1436
1437 if (fnhe->fnhe_genid != genid) {
1438 fnhe->fnhe_genid = genid;
1439 fnhe->fnhe_gw = 0;
1440 fnhe->fnhe_pmtu = 0;
1441 fnhe->fnhe_expires = 0;
1442 fnhe->fnhe_mtu_locked = false;
1443 fnhe_flush_routes(fnhe);
1444 orig = NULL;
1445 }
1446 fill_route_from_fnhe(rt, fnhe);
1447 if (!rt->rt_gw4) {
1448 rt->rt_gw4 = daddr;
1449 rt->rt_gw_family = AF_INET;
1450 }
1451
1452 if (do_cache) {
1453 dst_hold(&rt->dst);
1454 rcu_assign_pointer(*porig, rt);
1455 if (orig) {
1456 dst_dev_put(&orig->dst);
1457 dst_release(&orig->dst);
1458 }
1459 ret = true;
1460 }
1461
1462 fnhe->fnhe_stamp = jiffies;
1463 }
1464 spin_unlock_bh(&fnhe_lock);
1465
1466 return ret;
1467 }
1468
1469 static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
1470 {
1471 struct rtable *orig, *prev, **p;
1472 bool ret = true;
1473
1474 if (rt_is_input_route(rt)) {
1475 p = (struct rtable **)&nhc->nhc_rth_input;
1476 } else {
1477 p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
1478 }
1479 orig = *p;
1480
1481
1482
1483
1484 dst_hold(&rt->dst);
1485 prev = cmpxchg(p, orig, rt);
1486 if (prev == orig) {
1487 if (orig) {
1488 rt_add_uncached_list(orig);
1489 dst_release(&orig->dst);
1490 }
1491 } else {
1492 dst_release(&rt->dst);
1493 ret = false;
1494 }
1495
1496 return ret;
1497 }
1498
1499 struct uncached_list {
1500 spinlock_t lock;
1501 struct list_head head;
1502 struct list_head quarantine;
1503 };
1504
1505 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1506
1507 void rt_add_uncached_list(struct rtable *rt)
1508 {
1509 struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1510
1511 rt->rt_uncached_list = ul;
1512
1513 spin_lock_bh(&ul->lock);
1514 list_add_tail(&rt->rt_uncached, &ul->head);
1515 spin_unlock_bh(&ul->lock);
1516 }
1517
1518 void rt_del_uncached_list(struct rtable *rt)
1519 {
1520 if (!list_empty(&rt->rt_uncached)) {
1521 struct uncached_list *ul = rt->rt_uncached_list;
1522
1523 spin_lock_bh(&ul->lock);
1524 list_del_init(&rt->rt_uncached);
1525 spin_unlock_bh(&ul->lock);
1526 }
1527 }
1528
1529 static void ipv4_dst_destroy(struct dst_entry *dst)
1530 {
1531 struct rtable *rt = (struct rtable *)dst;
1532
1533 ip_dst_metrics_put(dst);
1534 rt_del_uncached_list(rt);
1535 }
1536
1537 void rt_flush_dev(struct net_device *dev)
1538 {
1539 struct rtable *rt, *safe;
1540 int cpu;
1541
1542 for_each_possible_cpu(cpu) {
1543 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1544
1545 if (list_empty(&ul->head))
1546 continue;
1547
1548 spin_lock_bh(&ul->lock);
1549 list_for_each_entry_safe(rt, safe, &ul->head, rt_uncached) {
1550 if (rt->dst.dev != dev)
1551 continue;
1552 rt->dst.dev = blackhole_netdev;
1553 netdev_ref_replace(dev, blackhole_netdev,
1554 &rt->dst.dev_tracker, GFP_ATOMIC);
1555 list_move(&rt->rt_uncached, &ul->quarantine);
1556 }
1557 spin_unlock_bh(&ul->lock);
1558 }
1559 }
1560
1561 static bool rt_cache_valid(const struct rtable *rt)
1562 {
1563 return rt &&
1564 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1565 !rt_is_expired(rt);
1566 }
1567
1568 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1569 const struct fib_result *res,
1570 struct fib_nh_exception *fnhe,
1571 struct fib_info *fi, u16 type, u32 itag,
1572 const bool do_cache)
1573 {
1574 bool cached = false;
1575
1576 if (fi) {
1577 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1578
1579 if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
1580 rt->rt_uses_gateway = 1;
1581 rt->rt_gw_family = nhc->nhc_gw_family;
1582
1583 if (likely(nhc->nhc_gw_family == AF_INET))
1584 rt->rt_gw4 = nhc->nhc_gw.ipv4;
1585 else
1586 rt->rt_gw6 = nhc->nhc_gw.ipv6;
1587 }
1588
1589 ip_dst_init_metrics(&rt->dst, fi->fib_metrics);
1590
1591 #ifdef CONFIG_IP_ROUTE_CLASSID
1592 if (nhc->nhc_family == AF_INET) {
1593 struct fib_nh *nh;
1594
1595 nh = container_of(nhc, struct fib_nh, nh_common);
1596 rt->dst.tclassid = nh->nh_tclassid;
1597 }
1598 #endif
1599 rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
1600 if (unlikely(fnhe))
1601 cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1602 else if (do_cache)
1603 cached = rt_cache_route(nhc, rt);
1604 if (unlikely(!cached)) {
1605
1606
1607
1608
1609
1610 if (!rt->rt_gw4) {
1611 rt->rt_gw_family = AF_INET;
1612 rt->rt_gw4 = daddr;
1613 }
1614 rt_add_uncached_list(rt);
1615 }
1616 } else
1617 rt_add_uncached_list(rt);
1618
1619 #ifdef CONFIG_IP_ROUTE_CLASSID
1620 #ifdef CONFIG_IP_MULTIPLE_TABLES
1621 set_class_tag(rt, res->tclassid);
1622 #endif
1623 set_class_tag(rt, itag);
1624 #endif
1625 }
1626
1627 struct rtable *rt_dst_alloc(struct net_device *dev,
1628 unsigned int flags, u16 type,
1629 bool noxfrm)
1630 {
1631 struct rtable *rt;
1632
1633 rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1634 (noxfrm ? DST_NOXFRM : 0));
1635
1636 if (rt) {
1637 rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1638 rt->rt_flags = flags;
1639 rt->rt_type = type;
1640 rt->rt_is_input = 0;
1641 rt->rt_iif = 0;
1642 rt->rt_pmtu = 0;
1643 rt->rt_mtu_locked = 0;
1644 rt->rt_uses_gateway = 0;
1645 rt->rt_gw_family = 0;
1646 rt->rt_gw4 = 0;
1647 INIT_LIST_HEAD(&rt->rt_uncached);
1648
1649 rt->dst.output = ip_output;
1650 if (flags & RTCF_LOCAL)
1651 rt->dst.input = ip_local_deliver;
1652 }
1653
1654 return rt;
1655 }
1656 EXPORT_SYMBOL(rt_dst_alloc);
1657
1658 struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
1659 {
1660 struct rtable *new_rt;
1661
1662 new_rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1663 rt->dst.flags);
1664
1665 if (new_rt) {
1666 new_rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1667 new_rt->rt_flags = rt->rt_flags;
1668 new_rt->rt_type = rt->rt_type;
1669 new_rt->rt_is_input = rt->rt_is_input;
1670 new_rt->rt_iif = rt->rt_iif;
1671 new_rt->rt_pmtu = rt->rt_pmtu;
1672 new_rt->rt_mtu_locked = rt->rt_mtu_locked;
1673 new_rt->rt_gw_family = rt->rt_gw_family;
1674 if (rt->rt_gw_family == AF_INET)
1675 new_rt->rt_gw4 = rt->rt_gw4;
1676 else if (rt->rt_gw_family == AF_INET6)
1677 new_rt->rt_gw6 = rt->rt_gw6;
1678 INIT_LIST_HEAD(&new_rt->rt_uncached);
1679
1680 new_rt->dst.input = rt->dst.input;
1681 new_rt->dst.output = rt->dst.output;
1682 new_rt->dst.error = rt->dst.error;
1683 new_rt->dst.lastuse = jiffies;
1684 new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate);
1685 }
1686 return new_rt;
1687 }
1688 EXPORT_SYMBOL(rt_dst_clone);
1689
1690
1691 int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1692 u8 tos, struct net_device *dev,
1693 struct in_device *in_dev, u32 *itag)
1694 {
1695 int err;
1696
1697
1698 if (!in_dev)
1699 return -EINVAL;
1700
1701 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1702 skb->protocol != htons(ETH_P_IP))
1703 return -EINVAL;
1704
1705 if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1706 return -EINVAL;
1707
1708 if (ipv4_is_zeronet(saddr)) {
1709 if (!ipv4_is_local_multicast(daddr) &&
1710 ip_hdr(skb)->protocol != IPPROTO_IGMP)
1711 return -EINVAL;
1712 } else {
1713 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1714 in_dev, itag);
1715 if (err < 0)
1716 return err;
1717 }
1718 return 0;
1719 }
1720
1721
1722 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1723 u8 tos, struct net_device *dev, int our)
1724 {
1725 struct in_device *in_dev = __in_dev_get_rcu(dev);
1726 unsigned int flags = RTCF_MULTICAST;
1727 struct rtable *rth;
1728 u32 itag = 0;
1729 int err;
1730
1731 err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1732 if (err)
1733 return err;
1734
1735 if (our)
1736 flags |= RTCF_LOCAL;
1737
1738 if (IN_DEV_ORCONF(in_dev, NOPOLICY))
1739 IPCB(skb)->flags |= IPSKB_NOPOLICY;
1740
1741 rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1742 false);
1743 if (!rth)
1744 return -ENOBUFS;
1745
1746 #ifdef CONFIG_IP_ROUTE_CLASSID
1747 rth->dst.tclassid = itag;
1748 #endif
1749 rth->dst.output = ip_rt_bug;
1750 rth->rt_is_input= 1;
1751
1752 #ifdef CONFIG_IP_MROUTE
1753 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1754 rth->dst.input = ip_mr_input;
1755 #endif
1756 RT_CACHE_STAT_INC(in_slow_mc);
1757
1758 skb_dst_drop(skb);
1759 skb_dst_set(skb, &rth->dst);
1760 return 0;
1761 }
1762
1763
1764 static void ip_handle_martian_source(struct net_device *dev,
1765 struct in_device *in_dev,
1766 struct sk_buff *skb,
1767 __be32 daddr,
1768 __be32 saddr)
1769 {
1770 RT_CACHE_STAT_INC(in_martian_src);
1771 #ifdef CONFIG_IP_ROUTE_VERBOSE
1772 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1773
1774
1775
1776
1777 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1778 &daddr, &saddr, dev->name);
1779 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1780 print_hex_dump(KERN_WARNING, "ll header: ",
1781 DUMP_PREFIX_OFFSET, 16, 1,
1782 skb_mac_header(skb),
1783 dev->hard_header_len, false);
1784 }
1785 }
1786 #endif
1787 }
1788
1789
1790 static int __mkroute_input(struct sk_buff *skb,
1791 const struct fib_result *res,
1792 struct in_device *in_dev,
1793 __be32 daddr, __be32 saddr, u32 tos)
1794 {
1795 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1796 struct net_device *dev = nhc->nhc_dev;
1797 struct fib_nh_exception *fnhe;
1798 struct rtable *rth;
1799 int err;
1800 struct in_device *out_dev;
1801 bool do_cache;
1802 u32 itag = 0;
1803
1804
1805 out_dev = __in_dev_get_rcu(dev);
1806 if (!out_dev) {
1807 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1808 return -EINVAL;
1809 }
1810
1811 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1812 in_dev->dev, in_dev, &itag);
1813 if (err < 0) {
1814 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1815 saddr);
1816
1817 goto cleanup;
1818 }
1819
1820 do_cache = res->fi && !itag;
1821 if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1822 skb->protocol == htons(ETH_P_IP)) {
1823 __be32 gw;
1824
1825 gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
1826 if (IN_DEV_SHARED_MEDIA(out_dev) ||
1827 inet_addr_onlink(out_dev, saddr, gw))
1828 IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1829 }
1830
1831 if (skb->protocol != htons(ETH_P_IP)) {
1832
1833
1834
1835
1836
1837
1838
1839 if (out_dev == in_dev &&
1840 IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1841 err = -EINVAL;
1842 goto cleanup;
1843 }
1844 }
1845
1846 if (IN_DEV_ORCONF(in_dev, NOPOLICY))
1847 IPCB(skb)->flags |= IPSKB_NOPOLICY;
1848
1849 fnhe = find_exception(nhc, daddr);
1850 if (do_cache) {
1851 if (fnhe)
1852 rth = rcu_dereference(fnhe->fnhe_rth_input);
1853 else
1854 rth = rcu_dereference(nhc->nhc_rth_input);
1855 if (rt_cache_valid(rth)) {
1856 skb_dst_set_noref(skb, &rth->dst);
1857 goto out;
1858 }
1859 }
1860
1861 rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1862 IN_DEV_ORCONF(out_dev, NOXFRM));
1863 if (!rth) {
1864 err = -ENOBUFS;
1865 goto cleanup;
1866 }
1867
1868 rth->rt_is_input = 1;
1869 RT_CACHE_STAT_INC(in_slow_tot);
1870
1871 rth->dst.input = ip_forward;
1872
1873 rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1874 do_cache);
1875 lwtunnel_set_redirect(&rth->dst);
1876 skb_dst_set(skb, &rth->dst);
1877 out:
1878 err = 0;
1879 cleanup:
1880 return err;
1881 }
1882
1883 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1884
1885
1886
1887 static void ip_multipath_l3_keys(const struct sk_buff *skb,
1888 struct flow_keys *hash_keys)
1889 {
1890 const struct iphdr *outer_iph = ip_hdr(skb);
1891 const struct iphdr *key_iph = outer_iph;
1892 const struct iphdr *inner_iph;
1893 const struct icmphdr *icmph;
1894 struct iphdr _inner_iph;
1895 struct icmphdr _icmph;
1896
1897 if (likely(outer_iph->protocol != IPPROTO_ICMP))
1898 goto out;
1899
1900 if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1901 goto out;
1902
1903 icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1904 &_icmph);
1905 if (!icmph)
1906 goto out;
1907
1908 if (!icmp_is_err(icmph->type))
1909 goto out;
1910
1911 inner_iph = skb_header_pointer(skb,
1912 outer_iph->ihl * 4 + sizeof(_icmph),
1913 sizeof(_inner_iph), &_inner_iph);
1914 if (!inner_iph)
1915 goto out;
1916
1917 key_iph = inner_iph;
1918 out:
1919 hash_keys->addrs.v4addrs.src = key_iph->saddr;
1920 hash_keys->addrs.v4addrs.dst = key_iph->daddr;
1921 }
1922
1923 static u32 fib_multipath_custom_hash_outer(const struct net *net,
1924 const struct sk_buff *skb,
1925 bool *p_has_inner)
1926 {
1927 u32 hash_fields = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_fields);
1928 struct flow_keys keys, hash_keys;
1929
1930 if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK))
1931 return 0;
1932
1933 memset(&hash_keys, 0, sizeof(hash_keys));
1934 skb_flow_dissect_flow_keys(skb, &keys, FLOW_DISSECTOR_F_STOP_AT_ENCAP);
1935
1936 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1937 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP)
1938 hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
1939 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP)
1940 hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
1941 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO)
1942 hash_keys.basic.ip_proto = keys.basic.ip_proto;
1943 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT)
1944 hash_keys.ports.src = keys.ports.src;
1945 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT)
1946 hash_keys.ports.dst = keys.ports.dst;
1947
1948 *p_has_inner = !!(keys.control.flags & FLOW_DIS_ENCAPSULATION);
1949 return flow_hash_from_keys(&hash_keys);
1950 }
1951
1952 static u32 fib_multipath_custom_hash_inner(const struct net *net,
1953 const struct sk_buff *skb,
1954 bool has_inner)
1955 {
1956 u32 hash_fields = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_fields);
1957 struct flow_keys keys, hash_keys;
1958
1959
1960
1961
1962
1963 if (!has_inner)
1964 return 0;
1965
1966 if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_MASK))
1967 return 0;
1968
1969 memset(&hash_keys, 0, sizeof(hash_keys));
1970 skb_flow_dissect_flow_keys(skb, &keys, 0);
1971
1972 if (!(keys.control.flags & FLOW_DIS_ENCAPSULATION))
1973 return 0;
1974
1975 if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
1976 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1977 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP)
1978 hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
1979 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP)
1980 hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
1981 } else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
1982 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1983 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP)
1984 hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
1985 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP)
1986 hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
1987 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_FLOWLABEL)
1988 hash_keys.tags.flow_label = keys.tags.flow_label;
1989 }
1990
1991 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_IP_PROTO)
1992 hash_keys.basic.ip_proto = keys.basic.ip_proto;
1993 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_PORT)
1994 hash_keys.ports.src = keys.ports.src;
1995 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_PORT)
1996 hash_keys.ports.dst = keys.ports.dst;
1997
1998 return flow_hash_from_keys(&hash_keys);
1999 }
2000
2001 static u32 fib_multipath_custom_hash_skb(const struct net *net,
2002 const struct sk_buff *skb)
2003 {
2004 u32 mhash, mhash_inner;
2005 bool has_inner = true;
2006
2007 mhash = fib_multipath_custom_hash_outer(net, skb, &has_inner);
2008 mhash_inner = fib_multipath_custom_hash_inner(net, skb, has_inner);
2009
2010 return jhash_2words(mhash, mhash_inner, 0);
2011 }
2012
2013 static u32 fib_multipath_custom_hash_fl4(const struct net *net,
2014 const struct flowi4 *fl4)
2015 {
2016 u32 hash_fields = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_fields);
2017 struct flow_keys hash_keys;
2018
2019 if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK))
2020 return 0;
2021
2022 memset(&hash_keys, 0, sizeof(hash_keys));
2023 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2024 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP)
2025 hash_keys.addrs.v4addrs.src = fl4->saddr;
2026 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP)
2027 hash_keys.addrs.v4addrs.dst = fl4->daddr;
2028 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO)
2029 hash_keys.basic.ip_proto = fl4->flowi4_proto;
2030 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT)
2031 hash_keys.ports.src = fl4->fl4_sport;
2032 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT)
2033 hash_keys.ports.dst = fl4->fl4_dport;
2034
2035 return flow_hash_from_keys(&hash_keys);
2036 }
2037
2038
2039 int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
2040 const struct sk_buff *skb, struct flow_keys *flkeys)
2041 {
2042 u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
2043 struct flow_keys hash_keys;
2044 u32 mhash = 0;
2045
2046 switch (READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_policy)) {
2047 case 0:
2048 memset(&hash_keys, 0, sizeof(hash_keys));
2049 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2050 if (skb) {
2051 ip_multipath_l3_keys(skb, &hash_keys);
2052 } else {
2053 hash_keys.addrs.v4addrs.src = fl4->saddr;
2054 hash_keys.addrs.v4addrs.dst = fl4->daddr;
2055 }
2056 mhash = flow_hash_from_keys(&hash_keys);
2057 break;
2058 case 1:
2059
2060 if (skb) {
2061 unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
2062 struct flow_keys keys;
2063
2064
2065 if (skb->l4_hash)
2066 return skb_get_hash_raw(skb) >> 1;
2067
2068 memset(&hash_keys, 0, sizeof(hash_keys));
2069
2070 if (!flkeys) {
2071 skb_flow_dissect_flow_keys(skb, &keys, flag);
2072 flkeys = &keys;
2073 }
2074
2075 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2076 hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
2077 hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
2078 hash_keys.ports.src = flkeys->ports.src;
2079 hash_keys.ports.dst = flkeys->ports.dst;
2080 hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
2081 } else {
2082 memset(&hash_keys, 0, sizeof(hash_keys));
2083 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2084 hash_keys.addrs.v4addrs.src = fl4->saddr;
2085 hash_keys.addrs.v4addrs.dst = fl4->daddr;
2086 hash_keys.ports.src = fl4->fl4_sport;
2087 hash_keys.ports.dst = fl4->fl4_dport;
2088 hash_keys.basic.ip_proto = fl4->flowi4_proto;
2089 }
2090 mhash = flow_hash_from_keys(&hash_keys);
2091 break;
2092 case 2:
2093 memset(&hash_keys, 0, sizeof(hash_keys));
2094
2095 if (skb) {
2096 struct flow_keys keys;
2097
2098 skb_flow_dissect_flow_keys(skb, &keys, 0);
2099
2100 if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
2101 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2102 hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
2103 hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
2104 } else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
2105 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2106 hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
2107 hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
2108 hash_keys.tags.flow_label = keys.tags.flow_label;
2109 hash_keys.basic.ip_proto = keys.basic.ip_proto;
2110 } else {
2111
2112 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2113 ip_multipath_l3_keys(skb, &hash_keys);
2114 }
2115 } else {
2116
2117 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2118 hash_keys.addrs.v4addrs.src = fl4->saddr;
2119 hash_keys.addrs.v4addrs.dst = fl4->daddr;
2120 }
2121 mhash = flow_hash_from_keys(&hash_keys);
2122 break;
2123 case 3:
2124 if (skb)
2125 mhash = fib_multipath_custom_hash_skb(net, skb);
2126 else
2127 mhash = fib_multipath_custom_hash_fl4(net, fl4);
2128 break;
2129 }
2130
2131 if (multipath_hash)
2132 mhash = jhash_2words(mhash, multipath_hash, 0);
2133
2134 return mhash >> 1;
2135 }
2136 #endif
2137
2138 static int ip_mkroute_input(struct sk_buff *skb,
2139 struct fib_result *res,
2140 struct in_device *in_dev,
2141 __be32 daddr, __be32 saddr, u32 tos,
2142 struct flow_keys *hkeys)
2143 {
2144 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2145 if (res->fi && fib_info_num_path(res->fi) > 1) {
2146 int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
2147
2148 fib_select_multipath(res, h);
2149 }
2150 #endif
2151
2152
2153 return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
2154 }
2155
2156
2157
2158
2159
2160 int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2161 u8 tos, struct net_device *dev,
2162 const struct sk_buff *hint)
2163 {
2164 struct in_device *in_dev = __in_dev_get_rcu(dev);
2165 struct rtable *rt = skb_rtable(hint);
2166 struct net *net = dev_net(dev);
2167 int err = -EINVAL;
2168 u32 tag = 0;
2169
2170 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2171 goto martian_source;
2172
2173 if (ipv4_is_zeronet(saddr))
2174 goto martian_source;
2175
2176 if (ipv4_is_loopback(saddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2177 goto martian_source;
2178
2179 if (rt->rt_type != RTN_LOCAL)
2180 goto skip_validate_source;
2181
2182 tos &= IPTOS_RT_MASK;
2183 err = fib_validate_source(skb, saddr, daddr, tos, 0, dev, in_dev, &tag);
2184 if (err < 0)
2185 goto martian_source;
2186
2187 skip_validate_source:
2188 skb_dst_copy(skb, hint);
2189 return 0;
2190
2191 martian_source:
2192 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2193 return err;
2194 }
2195
2196
2197 static struct net_device *ip_rt_get_dev(struct net *net,
2198 const struct fib_result *res)
2199 {
2200 struct fib_nh_common *nhc = res->fi ? res->nhc : NULL;
2201 struct net_device *dev = NULL;
2202
2203 if (nhc)
2204 dev = l3mdev_master_dev_rcu(nhc->nhc_dev);
2205
2206 return dev ? : net->loopback_dev;
2207 }
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2223 u8 tos, struct net_device *dev,
2224 struct fib_result *res)
2225 {
2226 struct in_device *in_dev = __in_dev_get_rcu(dev);
2227 struct flow_keys *flkeys = NULL, _flkeys;
2228 struct net *net = dev_net(dev);
2229 struct ip_tunnel_info *tun_info;
2230 int err = -EINVAL;
2231 unsigned int flags = 0;
2232 u32 itag = 0;
2233 struct rtable *rth;
2234 struct flowi4 fl4;
2235 bool do_cache = true;
2236
2237
2238
2239 if (!in_dev)
2240 goto out;
2241
2242
2243
2244
2245
2246 tun_info = skb_tunnel_info(skb);
2247 if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2248 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
2249 else
2250 fl4.flowi4_tun_key.tun_id = 0;
2251 skb_dst_drop(skb);
2252
2253 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2254 goto martian_source;
2255
2256 res->fi = NULL;
2257 res->table = NULL;
2258 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2259 goto brd_input;
2260
2261
2262
2263
2264 if (ipv4_is_zeronet(saddr))
2265 goto martian_source;
2266
2267 if (ipv4_is_zeronet(daddr))
2268 goto martian_destination;
2269
2270
2271
2272
2273 if (ipv4_is_loopback(daddr)) {
2274 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2275 goto martian_destination;
2276 } else if (ipv4_is_loopback(saddr)) {
2277 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2278 goto martian_source;
2279 }
2280
2281
2282
2283
2284 fl4.flowi4_l3mdev = 0;
2285 fl4.flowi4_oif = 0;
2286 fl4.flowi4_iif = dev->ifindex;
2287 fl4.flowi4_mark = skb->mark;
2288 fl4.flowi4_tos = tos;
2289 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2290 fl4.flowi4_flags = 0;
2291 fl4.daddr = daddr;
2292 fl4.saddr = saddr;
2293 fl4.flowi4_uid = sock_net_uid(net, NULL);
2294 fl4.flowi4_multipath_hash = 0;
2295
2296 if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
2297 flkeys = &_flkeys;
2298 } else {
2299 fl4.flowi4_proto = 0;
2300 fl4.fl4_sport = 0;
2301 fl4.fl4_dport = 0;
2302 }
2303
2304 err = fib_lookup(net, &fl4, res, 0);
2305 if (err != 0) {
2306 if (!IN_DEV_FORWARD(in_dev))
2307 err = -EHOSTUNREACH;
2308 goto no_route;
2309 }
2310
2311 if (res->type == RTN_BROADCAST) {
2312 if (IN_DEV_BFORWARD(in_dev))
2313 goto make_route;
2314
2315 if (IPV4_DEVCONF_ALL(net, BC_FORWARDING))
2316 do_cache = false;
2317 goto brd_input;
2318 }
2319
2320 if (res->type == RTN_LOCAL) {
2321 err = fib_validate_source(skb, saddr, daddr, tos,
2322 0, dev, in_dev, &itag);
2323 if (err < 0)
2324 goto martian_source;
2325 goto local_input;
2326 }
2327
2328 if (!IN_DEV_FORWARD(in_dev)) {
2329 err = -EHOSTUNREACH;
2330 goto no_route;
2331 }
2332 if (res->type != RTN_UNICAST)
2333 goto martian_destination;
2334
2335 make_route:
2336 err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
2337 out: return err;
2338
2339 brd_input:
2340 if (skb->protocol != htons(ETH_P_IP))
2341 goto e_inval;
2342
2343 if (!ipv4_is_zeronet(saddr)) {
2344 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2345 in_dev, &itag);
2346 if (err < 0)
2347 goto martian_source;
2348 }
2349 flags |= RTCF_BROADCAST;
2350 res->type = RTN_BROADCAST;
2351 RT_CACHE_STAT_INC(in_brd);
2352
2353 local_input:
2354 if (IN_DEV_ORCONF(in_dev, NOPOLICY))
2355 IPCB(skb)->flags |= IPSKB_NOPOLICY;
2356
2357 do_cache &= res->fi && !itag;
2358 if (do_cache) {
2359 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2360
2361 rth = rcu_dereference(nhc->nhc_rth_input);
2362 if (rt_cache_valid(rth)) {
2363 skb_dst_set_noref(skb, &rth->dst);
2364 err = 0;
2365 goto out;
2366 }
2367 }
2368
2369 rth = rt_dst_alloc(ip_rt_get_dev(net, res),
2370 flags | RTCF_LOCAL, res->type, false);
2371 if (!rth)
2372 goto e_nobufs;
2373
2374 rth->dst.output= ip_rt_bug;
2375 #ifdef CONFIG_IP_ROUTE_CLASSID
2376 rth->dst.tclassid = itag;
2377 #endif
2378 rth->rt_is_input = 1;
2379
2380 RT_CACHE_STAT_INC(in_slow_tot);
2381 if (res->type == RTN_UNREACHABLE) {
2382 rth->dst.input= ip_error;
2383 rth->dst.error= -err;
2384 rth->rt_flags &= ~RTCF_LOCAL;
2385 }
2386
2387 if (do_cache) {
2388 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2389
2390 rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
2391 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2392 WARN_ON(rth->dst.input == lwtunnel_input);
2393 rth->dst.lwtstate->orig_input = rth->dst.input;
2394 rth->dst.input = lwtunnel_input;
2395 }
2396
2397 if (unlikely(!rt_cache_route(nhc, rth)))
2398 rt_add_uncached_list(rth);
2399 }
2400 skb_dst_set(skb, &rth->dst);
2401 err = 0;
2402 goto out;
2403
2404 no_route:
2405 RT_CACHE_STAT_INC(in_no_route);
2406 res->type = RTN_UNREACHABLE;
2407 res->fi = NULL;
2408 res->table = NULL;
2409 goto local_input;
2410
2411
2412
2413
2414 martian_destination:
2415 RT_CACHE_STAT_INC(in_martian_dst);
2416 #ifdef CONFIG_IP_ROUTE_VERBOSE
2417 if (IN_DEV_LOG_MARTIANS(in_dev))
2418 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2419 &daddr, &saddr, dev->name);
2420 #endif
2421
2422 e_inval:
2423 err = -EINVAL;
2424 goto out;
2425
2426 e_nobufs:
2427 err = -ENOBUFS;
2428 goto out;
2429
2430 martian_source:
2431 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2432 goto out;
2433 }
2434
2435
2436 static int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2437 u8 tos, struct net_device *dev, struct fib_result *res)
2438 {
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450 if (ipv4_is_multicast(daddr)) {
2451 struct in_device *in_dev = __in_dev_get_rcu(dev);
2452 int our = 0;
2453 int err = -EINVAL;
2454
2455 if (!in_dev)
2456 return err;
2457 our = ip_check_mc_rcu(in_dev, daddr, saddr,
2458 ip_hdr(skb)->protocol);
2459
2460
2461 if (!our && netif_is_l3_slave(dev)) {
2462 struct in_device *l3_in_dev;
2463
2464 l3_in_dev = __in_dev_get_rcu(skb->dev);
2465 if (l3_in_dev)
2466 our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2467 ip_hdr(skb)->protocol);
2468 }
2469
2470 if (our
2471 #ifdef CONFIG_IP_MROUTE
2472 ||
2473 (!ipv4_is_local_multicast(daddr) &&
2474 IN_DEV_MFORWARD(in_dev))
2475 #endif
2476 ) {
2477 err = ip_route_input_mc(skb, daddr, saddr,
2478 tos, dev, our);
2479 }
2480 return err;
2481 }
2482
2483 return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2484 }
2485
2486 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2487 u8 tos, struct net_device *dev)
2488 {
2489 struct fib_result res;
2490 int err;
2491
2492 tos &= IPTOS_RT_MASK;
2493 rcu_read_lock();
2494 err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2495 rcu_read_unlock();
2496
2497 return err;
2498 }
2499 EXPORT_SYMBOL(ip_route_input_noref);
2500
2501
2502 static struct rtable *__mkroute_output(const struct fib_result *res,
2503 const struct flowi4 *fl4, int orig_oif,
2504 struct net_device *dev_out,
2505 unsigned int flags)
2506 {
2507 struct fib_info *fi = res->fi;
2508 struct fib_nh_exception *fnhe;
2509 struct in_device *in_dev;
2510 u16 type = res->type;
2511 struct rtable *rth;
2512 bool do_cache;
2513
2514 in_dev = __in_dev_get_rcu(dev_out);
2515 if (!in_dev)
2516 return ERR_PTR(-EINVAL);
2517
2518 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2519 if (ipv4_is_loopback(fl4->saddr) &&
2520 !(dev_out->flags & IFF_LOOPBACK) &&
2521 !netif_is_l3_master(dev_out))
2522 return ERR_PTR(-EINVAL);
2523
2524 if (ipv4_is_lbcast(fl4->daddr))
2525 type = RTN_BROADCAST;
2526 else if (ipv4_is_multicast(fl4->daddr))
2527 type = RTN_MULTICAST;
2528 else if (ipv4_is_zeronet(fl4->daddr))
2529 return ERR_PTR(-EINVAL);
2530
2531 if (dev_out->flags & IFF_LOOPBACK)
2532 flags |= RTCF_LOCAL;
2533
2534 do_cache = true;
2535 if (type == RTN_BROADCAST) {
2536 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2537 fi = NULL;
2538 } else if (type == RTN_MULTICAST) {
2539 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2540 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2541 fl4->flowi4_proto))
2542 flags &= ~RTCF_LOCAL;
2543 else
2544 do_cache = false;
2545
2546
2547
2548
2549 if (fi && res->prefixlen < 4)
2550 fi = NULL;
2551 } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2552 (orig_oif != dev_out->ifindex)) {
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562 do_cache = false;
2563 }
2564
2565 fnhe = NULL;
2566 do_cache &= fi != NULL;
2567 if (fi) {
2568 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2569 struct rtable __rcu **prth;
2570
2571 fnhe = find_exception(nhc, fl4->daddr);
2572 if (!do_cache)
2573 goto add;
2574 if (fnhe) {
2575 prth = &fnhe->fnhe_rth_output;
2576 } else {
2577 if (unlikely(fl4->flowi4_flags &
2578 FLOWI_FLAG_KNOWN_NH &&
2579 !(nhc->nhc_gw_family &&
2580 nhc->nhc_scope == RT_SCOPE_LINK))) {
2581 do_cache = false;
2582 goto add;
2583 }
2584 prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
2585 }
2586 rth = rcu_dereference(*prth);
2587 if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2588 return rth;
2589 }
2590
2591 add:
2592 rth = rt_dst_alloc(dev_out, flags, type,
2593 IN_DEV_ORCONF(in_dev, NOXFRM));
2594 if (!rth)
2595 return ERR_PTR(-ENOBUFS);
2596
2597 rth->rt_iif = orig_oif;
2598
2599 RT_CACHE_STAT_INC(out_slow_tot);
2600
2601 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2602 if (flags & RTCF_LOCAL &&
2603 !(dev_out->flags & IFF_LOOPBACK)) {
2604 rth->dst.output = ip_mc_output;
2605 RT_CACHE_STAT_INC(out_slow_mc);
2606 }
2607 #ifdef CONFIG_IP_MROUTE
2608 if (type == RTN_MULTICAST) {
2609 if (IN_DEV_MFORWARD(in_dev) &&
2610 !ipv4_is_local_multicast(fl4->daddr)) {
2611 rth->dst.input = ip_mr_input;
2612 rth->dst.output = ip_mc_output;
2613 }
2614 }
2615 #endif
2616 }
2617
2618 rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2619 lwtunnel_set_redirect(&rth->dst);
2620
2621 return rth;
2622 }
2623
2624
2625
2626
2627
2628 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2629 const struct sk_buff *skb)
2630 {
2631 struct fib_result res = {
2632 .type = RTN_UNSPEC,
2633 .fi = NULL,
2634 .table = NULL,
2635 .tclassid = 0,
2636 };
2637 struct rtable *rth;
2638
2639 fl4->flowi4_iif = LOOPBACK_IFINDEX;
2640 ip_rt_fix_tos(fl4);
2641
2642 rcu_read_lock();
2643 rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2644 rcu_read_unlock();
2645
2646 return rth;
2647 }
2648 EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2649
2650 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2651 struct fib_result *res,
2652 const struct sk_buff *skb)
2653 {
2654 struct net_device *dev_out = NULL;
2655 int orig_oif = fl4->flowi4_oif;
2656 unsigned int flags = 0;
2657 struct rtable *rth;
2658 int err;
2659
2660 if (fl4->saddr) {
2661 if (ipv4_is_multicast(fl4->saddr) ||
2662 ipv4_is_lbcast(fl4->saddr) ||
2663 ipv4_is_zeronet(fl4->saddr)) {
2664 rth = ERR_PTR(-EINVAL);
2665 goto out;
2666 }
2667
2668 rth = ERR_PTR(-ENETUNREACH);
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678 if (fl4->flowi4_oif == 0 &&
2679 (ipv4_is_multicast(fl4->daddr) ||
2680 ipv4_is_lbcast(fl4->daddr))) {
2681
2682 dev_out = __ip_dev_find(net, fl4->saddr, false);
2683 if (!dev_out)
2684 goto out;
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701 fl4->flowi4_oif = dev_out->ifindex;
2702 goto make_route;
2703 }
2704
2705 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2706
2707 if (!__ip_dev_find(net, fl4->saddr, false))
2708 goto out;
2709 }
2710 }
2711
2712
2713 if (fl4->flowi4_oif) {
2714 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2715 rth = ERR_PTR(-ENODEV);
2716 if (!dev_out)
2717 goto out;
2718
2719
2720 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2721 rth = ERR_PTR(-ENETUNREACH);
2722 goto out;
2723 }
2724 if (ipv4_is_local_multicast(fl4->daddr) ||
2725 ipv4_is_lbcast(fl4->daddr) ||
2726 fl4->flowi4_proto == IPPROTO_IGMP) {
2727 if (!fl4->saddr)
2728 fl4->saddr = inet_select_addr(dev_out, 0,
2729 RT_SCOPE_LINK);
2730 goto make_route;
2731 }
2732 if (!fl4->saddr) {
2733 if (ipv4_is_multicast(fl4->daddr))
2734 fl4->saddr = inet_select_addr(dev_out, 0,
2735 fl4->flowi4_scope);
2736 else if (!fl4->daddr)
2737 fl4->saddr = inet_select_addr(dev_out, 0,
2738 RT_SCOPE_HOST);
2739 }
2740 }
2741
2742 if (!fl4->daddr) {
2743 fl4->daddr = fl4->saddr;
2744 if (!fl4->daddr)
2745 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2746 dev_out = net->loopback_dev;
2747 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2748 res->type = RTN_LOCAL;
2749 flags |= RTCF_LOCAL;
2750 goto make_route;
2751 }
2752
2753 err = fib_lookup(net, fl4, res, 0);
2754 if (err) {
2755 res->fi = NULL;
2756 res->table = NULL;
2757 if (fl4->flowi4_oif &&
2758 (ipv4_is_multicast(fl4->daddr) || !fl4->flowi4_l3mdev)) {
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777 if (fl4->saddr == 0)
2778 fl4->saddr = inet_select_addr(dev_out, 0,
2779 RT_SCOPE_LINK);
2780 res->type = RTN_UNICAST;
2781 goto make_route;
2782 }
2783 rth = ERR_PTR(err);
2784 goto out;
2785 }
2786
2787 if (res->type == RTN_LOCAL) {
2788 if (!fl4->saddr) {
2789 if (res->fi->fib_prefsrc)
2790 fl4->saddr = res->fi->fib_prefsrc;
2791 else
2792 fl4->saddr = fl4->daddr;
2793 }
2794
2795
2796 dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2797 net->loopback_dev;
2798
2799
2800
2801
2802 orig_oif = FIB_RES_OIF(*res);
2803
2804 fl4->flowi4_oif = dev_out->ifindex;
2805 flags |= RTCF_LOCAL;
2806 goto make_route;
2807 }
2808
2809 fib_select_path(net, res, fl4, skb);
2810
2811 dev_out = FIB_RES_DEV(*res);
2812
2813 make_route:
2814 rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2815
2816 out:
2817 return rth;
2818 }
2819
2820 static struct dst_ops ipv4_dst_blackhole_ops = {
2821 .family = AF_INET,
2822 .default_advmss = ipv4_default_advmss,
2823 .neigh_lookup = ipv4_neigh_lookup,
2824 .check = dst_blackhole_check,
2825 .cow_metrics = dst_blackhole_cow_metrics,
2826 .update_pmtu = dst_blackhole_update_pmtu,
2827 .redirect = dst_blackhole_redirect,
2828 .mtu = dst_blackhole_mtu,
2829 };
2830
2831 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2832 {
2833 struct rtable *ort = (struct rtable *) dst_orig;
2834 struct rtable *rt;
2835
2836 rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2837 if (rt) {
2838 struct dst_entry *new = &rt->dst;
2839
2840 new->__use = 1;
2841 new->input = dst_discard;
2842 new->output = dst_discard_out;
2843
2844 new->dev = net->loopback_dev;
2845 netdev_hold(new->dev, &new->dev_tracker, GFP_ATOMIC);
2846
2847 rt->rt_is_input = ort->rt_is_input;
2848 rt->rt_iif = ort->rt_iif;
2849 rt->rt_pmtu = ort->rt_pmtu;
2850 rt->rt_mtu_locked = ort->rt_mtu_locked;
2851
2852 rt->rt_genid = rt_genid_ipv4(net);
2853 rt->rt_flags = ort->rt_flags;
2854 rt->rt_type = ort->rt_type;
2855 rt->rt_uses_gateway = ort->rt_uses_gateway;
2856 rt->rt_gw_family = ort->rt_gw_family;
2857 if (rt->rt_gw_family == AF_INET)
2858 rt->rt_gw4 = ort->rt_gw4;
2859 else if (rt->rt_gw_family == AF_INET6)
2860 rt->rt_gw6 = ort->rt_gw6;
2861
2862 INIT_LIST_HEAD(&rt->rt_uncached);
2863 }
2864
2865 dst_release(dst_orig);
2866
2867 return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2868 }
2869
2870 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2871 const struct sock *sk)
2872 {
2873 struct rtable *rt = __ip_route_output_key(net, flp4);
2874
2875 if (IS_ERR(rt))
2876 return rt;
2877
2878 if (flp4->flowi4_proto) {
2879 flp4->flowi4_oif = rt->dst.dev->ifindex;
2880 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2881 flowi4_to_flowi(flp4),
2882 sk, 0);
2883 }
2884
2885 return rt;
2886 }
2887 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2888
2889 struct rtable *ip_route_output_tunnel(struct sk_buff *skb,
2890 struct net_device *dev,
2891 struct net *net, __be32 *saddr,
2892 const struct ip_tunnel_info *info,
2893 u8 protocol, bool use_cache)
2894 {
2895 #ifdef CONFIG_DST_CACHE
2896 struct dst_cache *dst_cache;
2897 #endif
2898 struct rtable *rt = NULL;
2899 struct flowi4 fl4;
2900 __u8 tos;
2901
2902 #ifdef CONFIG_DST_CACHE
2903 dst_cache = (struct dst_cache *)&info->dst_cache;
2904 if (use_cache) {
2905 rt = dst_cache_get_ip4(dst_cache, saddr);
2906 if (rt)
2907 return rt;
2908 }
2909 #endif
2910 memset(&fl4, 0, sizeof(fl4));
2911 fl4.flowi4_mark = skb->mark;
2912 fl4.flowi4_proto = protocol;
2913 fl4.daddr = info->key.u.ipv4.dst;
2914 fl4.saddr = info->key.u.ipv4.src;
2915 tos = info->key.tos;
2916 fl4.flowi4_tos = RT_TOS(tos);
2917
2918 rt = ip_route_output_key(net, &fl4);
2919 if (IS_ERR(rt)) {
2920 netdev_dbg(dev, "no route to %pI4\n", &fl4.daddr);
2921 return ERR_PTR(-ENETUNREACH);
2922 }
2923 if (rt->dst.dev == dev) {
2924 netdev_dbg(dev, "circular route to %pI4\n", &fl4.daddr);
2925 ip_rt_put(rt);
2926 return ERR_PTR(-ELOOP);
2927 }
2928 #ifdef CONFIG_DST_CACHE
2929 if (use_cache)
2930 dst_cache_set_ip4(dst_cache, &rt->dst, fl4.saddr);
2931 #endif
2932 *saddr = fl4.saddr;
2933 return rt;
2934 }
2935 EXPORT_SYMBOL_GPL(ip_route_output_tunnel);
2936
2937
2938 static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2939 struct rtable *rt, u32 table_id, struct flowi4 *fl4,
2940 struct sk_buff *skb, u32 portid, u32 seq,
2941 unsigned int flags)
2942 {
2943 struct rtmsg *r;
2944 struct nlmsghdr *nlh;
2945 unsigned long expires = 0;
2946 u32 error;
2947 u32 metrics[RTAX_MAX];
2948
2949 nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), flags);
2950 if (!nlh)
2951 return -EMSGSIZE;
2952
2953 r = nlmsg_data(nlh);
2954 r->rtm_family = AF_INET;
2955 r->rtm_dst_len = 32;
2956 r->rtm_src_len = 0;
2957 r->rtm_tos = fl4 ? fl4->flowi4_tos : 0;
2958 r->rtm_table = table_id < 256 ? table_id : RT_TABLE_COMPAT;
2959 if (nla_put_u32(skb, RTA_TABLE, table_id))
2960 goto nla_put_failure;
2961 r->rtm_type = rt->rt_type;
2962 r->rtm_scope = RT_SCOPE_UNIVERSE;
2963 r->rtm_protocol = RTPROT_UNSPEC;
2964 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2965 if (rt->rt_flags & RTCF_NOTIFY)
2966 r->rtm_flags |= RTM_F_NOTIFY;
2967 if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2968 r->rtm_flags |= RTCF_DOREDIRECT;
2969
2970 if (nla_put_in_addr(skb, RTA_DST, dst))
2971 goto nla_put_failure;
2972 if (src) {
2973 r->rtm_src_len = 32;
2974 if (nla_put_in_addr(skb, RTA_SRC, src))
2975 goto nla_put_failure;
2976 }
2977 if (rt->dst.dev &&
2978 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2979 goto nla_put_failure;
2980 if (rt->dst.lwtstate &&
2981 lwtunnel_fill_encap(skb, rt->dst.lwtstate, RTA_ENCAP, RTA_ENCAP_TYPE) < 0)
2982 goto nla_put_failure;
2983 #ifdef CONFIG_IP_ROUTE_CLASSID
2984 if (rt->dst.tclassid &&
2985 nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2986 goto nla_put_failure;
2987 #endif
2988 if (fl4 && !rt_is_input_route(rt) &&
2989 fl4->saddr != src) {
2990 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2991 goto nla_put_failure;
2992 }
2993 if (rt->rt_uses_gateway) {
2994 if (rt->rt_gw_family == AF_INET &&
2995 nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
2996 goto nla_put_failure;
2997 } else if (rt->rt_gw_family == AF_INET6) {
2998 int alen = sizeof(struct in6_addr);
2999 struct nlattr *nla;
3000 struct rtvia *via;
3001
3002 nla = nla_reserve(skb, RTA_VIA, alen + 2);
3003 if (!nla)
3004 goto nla_put_failure;
3005
3006 via = nla_data(nla);
3007 via->rtvia_family = AF_INET6;
3008 memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
3009 }
3010 }
3011
3012 expires = rt->dst.expires;
3013 if (expires) {
3014 unsigned long now = jiffies;
3015
3016 if (time_before(now, expires))
3017 expires -= now;
3018 else
3019 expires = 0;
3020 }
3021
3022 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
3023 if (rt->rt_pmtu && expires)
3024 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
3025 if (rt->rt_mtu_locked && expires)
3026 metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
3027 if (rtnetlink_put_metrics(skb, metrics) < 0)
3028 goto nla_put_failure;
3029
3030 if (fl4) {
3031 if (fl4->flowi4_mark &&
3032 nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
3033 goto nla_put_failure;
3034
3035 if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
3036 nla_put_u32(skb, RTA_UID,
3037 from_kuid_munged(current_user_ns(),
3038 fl4->flowi4_uid)))
3039 goto nla_put_failure;
3040
3041 if (rt_is_input_route(rt)) {
3042 #ifdef CONFIG_IP_MROUTE
3043 if (ipv4_is_multicast(dst) &&
3044 !ipv4_is_local_multicast(dst) &&
3045 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
3046 int err = ipmr_get_route(net, skb,
3047 fl4->saddr, fl4->daddr,
3048 r, portid);
3049
3050 if (err <= 0) {
3051 if (err == 0)
3052 return 0;
3053 goto nla_put_failure;
3054 }
3055 } else
3056 #endif
3057 if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
3058 goto nla_put_failure;
3059 }
3060 }
3061
3062 error = rt->dst.error;
3063
3064 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
3065 goto nla_put_failure;
3066
3067 nlmsg_end(skb, nlh);
3068 return 0;
3069
3070 nla_put_failure:
3071 nlmsg_cancel(skb, nlh);
3072 return -EMSGSIZE;
3073 }
3074
3075 static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
3076 struct netlink_callback *cb, u32 table_id,
3077 struct fnhe_hash_bucket *bucket, int genid,
3078 int *fa_index, int fa_start, unsigned int flags)
3079 {
3080 int i;
3081
3082 for (i = 0; i < FNHE_HASH_SIZE; i++) {
3083 struct fib_nh_exception *fnhe;
3084
3085 for (fnhe = rcu_dereference(bucket[i].chain); fnhe;
3086 fnhe = rcu_dereference(fnhe->fnhe_next)) {
3087 struct rtable *rt;
3088 int err;
3089
3090 if (*fa_index < fa_start)
3091 goto next;
3092
3093 if (fnhe->fnhe_genid != genid)
3094 goto next;
3095
3096 if (fnhe->fnhe_expires &&
3097 time_after(jiffies, fnhe->fnhe_expires))
3098 goto next;
3099
3100 rt = rcu_dereference(fnhe->fnhe_rth_input);
3101 if (!rt)
3102 rt = rcu_dereference(fnhe->fnhe_rth_output);
3103 if (!rt)
3104 goto next;
3105
3106 err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt,
3107 table_id, NULL, skb,
3108 NETLINK_CB(cb->skb).portid,
3109 cb->nlh->nlmsg_seq, flags);
3110 if (err)
3111 return err;
3112 next:
3113 (*fa_index)++;
3114 }
3115 }
3116
3117 return 0;
3118 }
3119
3120 int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
3121 u32 table_id, struct fib_info *fi,
3122 int *fa_index, int fa_start, unsigned int flags)
3123 {
3124 struct net *net = sock_net(cb->skb->sk);
3125 int nhsel, genid = fnhe_genid(net);
3126
3127 for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
3128 struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel);
3129 struct fnhe_hash_bucket *bucket;
3130 int err;
3131
3132 if (nhc->nhc_flags & RTNH_F_DEAD)
3133 continue;
3134
3135 rcu_read_lock();
3136 bucket = rcu_dereference(nhc->nhc_exceptions);
3137 err = 0;
3138 if (bucket)
3139 err = fnhe_dump_bucket(net, skb, cb, table_id, bucket,
3140 genid, fa_index, fa_start,
3141 flags);
3142 rcu_read_unlock();
3143 if (err)
3144 return err;
3145 }
3146
3147 return 0;
3148 }
3149
3150 static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
3151 u8 ip_proto, __be16 sport,
3152 __be16 dport)
3153 {
3154 struct sk_buff *skb;
3155 struct iphdr *iph;
3156
3157 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3158 if (!skb)
3159 return NULL;
3160
3161
3162
3163
3164 skb_reset_mac_header(skb);
3165 skb_reset_network_header(skb);
3166 skb->protocol = htons(ETH_P_IP);
3167 iph = skb_put(skb, sizeof(struct iphdr));
3168 iph->protocol = ip_proto;
3169 iph->saddr = src;
3170 iph->daddr = dst;
3171 iph->version = 0x4;
3172 iph->frag_off = 0;
3173 iph->ihl = 0x5;
3174 skb_set_transport_header(skb, skb->len);
3175
3176 switch (iph->protocol) {
3177 case IPPROTO_UDP: {
3178 struct udphdr *udph;
3179
3180 udph = skb_put_zero(skb, sizeof(struct udphdr));
3181 udph->source = sport;
3182 udph->dest = dport;
3183 udph->len = htons(sizeof(struct udphdr));
3184 udph->check = 0;
3185 break;
3186 }
3187 case IPPROTO_TCP: {
3188 struct tcphdr *tcph;
3189
3190 tcph = skb_put_zero(skb, sizeof(struct tcphdr));
3191 tcph->source = sport;
3192 tcph->dest = dport;
3193 tcph->doff = sizeof(struct tcphdr) / 4;
3194 tcph->rst = 1;
3195 tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
3196 src, dst, 0);
3197 break;
3198 }
3199 case IPPROTO_ICMP: {
3200 struct icmphdr *icmph;
3201
3202 icmph = skb_put_zero(skb, sizeof(struct icmphdr));
3203 icmph->type = ICMP_ECHO;
3204 icmph->code = 0;
3205 }
3206 }
3207
3208 return skb;
3209 }
3210
3211 static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
3212 const struct nlmsghdr *nlh,
3213 struct nlattr **tb,
3214 struct netlink_ext_ack *extack)
3215 {
3216 struct rtmsg *rtm;
3217 int i, err;
3218
3219 if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
3220 NL_SET_ERR_MSG(extack,
3221 "ipv4: Invalid header for route get request");
3222 return -EINVAL;
3223 }
3224
3225 if (!netlink_strict_get_check(skb))
3226 return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
3227 rtm_ipv4_policy, extack);
3228
3229 rtm = nlmsg_data(nlh);
3230 if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
3231 (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
3232 rtm->rtm_table || rtm->rtm_protocol ||
3233 rtm->rtm_scope || rtm->rtm_type) {
3234 NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request");
3235 return -EINVAL;
3236 }
3237
3238 if (rtm->rtm_flags & ~(RTM_F_NOTIFY |
3239 RTM_F_LOOKUP_TABLE |
3240 RTM_F_FIB_MATCH)) {
3241 NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request");
3242 return -EINVAL;
3243 }
3244
3245 err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
3246 rtm_ipv4_policy, extack);
3247 if (err)
3248 return err;
3249
3250 if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
3251 (tb[RTA_DST] && !rtm->rtm_dst_len)) {
3252 NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
3253 return -EINVAL;
3254 }
3255
3256 for (i = 0; i <= RTA_MAX; i++) {
3257 if (!tb[i])
3258 continue;
3259
3260 switch (i) {
3261 case RTA_IIF:
3262 case RTA_OIF:
3263 case RTA_SRC:
3264 case RTA_DST:
3265 case RTA_IP_PROTO:
3266 case RTA_SPORT:
3267 case RTA_DPORT:
3268 case RTA_MARK:
3269 case RTA_UID:
3270 break;
3271 default:
3272 NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request");
3273 return -EINVAL;
3274 }
3275 }
3276
3277 return 0;
3278 }
3279
3280 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
3281 struct netlink_ext_ack *extack)
3282 {
3283 struct net *net = sock_net(in_skb->sk);
3284 struct nlattr *tb[RTA_MAX+1];
3285 u32 table_id = RT_TABLE_MAIN;
3286 __be16 sport = 0, dport = 0;
3287 struct fib_result res = {};
3288 u8 ip_proto = IPPROTO_UDP;
3289 struct rtable *rt = NULL;
3290 struct sk_buff *skb;
3291 struct rtmsg *rtm;
3292 struct flowi4 fl4 = {};
3293 __be32 dst = 0;
3294 __be32 src = 0;
3295 kuid_t uid;
3296 u32 iif;
3297 int err;
3298 int mark;
3299
3300 err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
3301 if (err < 0)
3302 return err;
3303
3304 rtm = nlmsg_data(nlh);
3305 src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
3306 dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
3307 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3308 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3309 if (tb[RTA_UID])
3310 uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
3311 else
3312 uid = (iif ? INVALID_UID : current_uid());
3313
3314 if (tb[RTA_IP_PROTO]) {
3315 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
3316 &ip_proto, AF_INET, extack);
3317 if (err)
3318 return err;
3319 }
3320
3321 if (tb[RTA_SPORT])
3322 sport = nla_get_be16(tb[RTA_SPORT]);
3323
3324 if (tb[RTA_DPORT])
3325 dport = nla_get_be16(tb[RTA_DPORT]);
3326
3327 skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
3328 if (!skb)
3329 return -ENOBUFS;
3330
3331 fl4.daddr = dst;
3332 fl4.saddr = src;
3333 fl4.flowi4_tos = rtm->rtm_tos & IPTOS_RT_MASK;
3334 fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
3335 fl4.flowi4_mark = mark;
3336 fl4.flowi4_uid = uid;
3337 if (sport)
3338 fl4.fl4_sport = sport;
3339 if (dport)
3340 fl4.fl4_dport = dport;
3341 fl4.flowi4_proto = ip_proto;
3342
3343 rcu_read_lock();
3344
3345 if (iif) {
3346 struct net_device *dev;
3347
3348 dev = dev_get_by_index_rcu(net, iif);
3349 if (!dev) {
3350 err = -ENODEV;
3351 goto errout_rcu;
3352 }
3353
3354 fl4.flowi4_iif = iif;
3355 skb->dev = dev;
3356 skb->mark = mark;
3357 err = ip_route_input_rcu(skb, dst, src,
3358 rtm->rtm_tos & IPTOS_RT_MASK, dev,
3359 &res);
3360
3361 rt = skb_rtable(skb);
3362 if (err == 0 && rt->dst.error)
3363 err = -rt->dst.error;
3364 } else {
3365 fl4.flowi4_iif = LOOPBACK_IFINDEX;
3366 skb->dev = net->loopback_dev;
3367 rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
3368 err = 0;
3369 if (IS_ERR(rt))
3370 err = PTR_ERR(rt);
3371 else
3372 skb_dst_set(skb, &rt->dst);
3373 }
3374
3375 if (err)
3376 goto errout_rcu;
3377
3378 if (rtm->rtm_flags & RTM_F_NOTIFY)
3379 rt->rt_flags |= RTCF_NOTIFY;
3380
3381 if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
3382 table_id = res.table ? res.table->tb_id : 0;
3383
3384
3385 skb_trim(skb, 0);
3386 skb_reset_network_header(skb);
3387 skb_reset_transport_header(skb);
3388 skb_reset_mac_header(skb);
3389
3390 if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
3391 struct fib_rt_info fri;
3392
3393 if (!res.fi) {
3394 err = fib_props[res.type].error;
3395 if (!err)
3396 err = -EHOSTUNREACH;
3397 goto errout_rcu;
3398 }
3399 fri.fi = res.fi;
3400 fri.tb_id = table_id;
3401 fri.dst = res.prefix;
3402 fri.dst_len = res.prefixlen;
3403 fri.dscp = inet_dsfield_to_dscp(fl4.flowi4_tos);
3404 fri.type = rt->rt_type;
3405 fri.offload = 0;
3406 fri.trap = 0;
3407 fri.offload_failed = 0;
3408 if (res.fa_head) {
3409 struct fib_alias *fa;
3410
3411 hlist_for_each_entry_rcu(fa, res.fa_head, fa_list) {
3412 u8 slen = 32 - fri.dst_len;
3413
3414 if (fa->fa_slen == slen &&
3415 fa->tb_id == fri.tb_id &&
3416 fa->fa_dscp == fri.dscp &&
3417 fa->fa_info == res.fi &&
3418 fa->fa_type == fri.type) {
3419 fri.offload = READ_ONCE(fa->offload);
3420 fri.trap = READ_ONCE(fa->trap);
3421 break;
3422 }
3423 }
3424 }
3425 err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
3426 nlh->nlmsg_seq, RTM_NEWROUTE, &fri, 0);
3427 } else {
3428 err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
3429 NETLINK_CB(in_skb).portid,
3430 nlh->nlmsg_seq, 0);
3431 }
3432 if (err < 0)
3433 goto errout_rcu;
3434
3435 rcu_read_unlock();
3436
3437 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3438
3439 errout_free:
3440 return err;
3441 errout_rcu:
3442 rcu_read_unlock();
3443 kfree_skb(skb);
3444 goto errout_free;
3445 }
3446
3447 void ip_rt_multicast_event(struct in_device *in_dev)
3448 {
3449 rt_cache_flush(dev_net(in_dev->dev));
3450 }
3451
3452 #ifdef CONFIG_SYSCTL
3453 static int ip_rt_gc_interval __read_mostly = 60 * HZ;
3454 static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
3455 static int ip_rt_gc_elasticity __read_mostly = 8;
3456 static int ip_min_valid_pmtu __read_mostly = IPV4_MIN_MTU;
3457
3458 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
3459 void *buffer, size_t *lenp, loff_t *ppos)
3460 {
3461 struct net *net = (struct net *)__ctl->extra1;
3462
3463 if (write) {
3464 rt_cache_flush(net);
3465 fnhe_genid_bump(net);
3466 return 0;
3467 }
3468
3469 return -EINVAL;
3470 }
3471
3472 static struct ctl_table ipv4_route_table[] = {
3473 {
3474 .procname = "gc_thresh",
3475 .data = &ipv4_dst_ops.gc_thresh,
3476 .maxlen = sizeof(int),
3477 .mode = 0644,
3478 .proc_handler = proc_dointvec,
3479 },
3480 {
3481 .procname = "max_size",
3482 .data = &ip_rt_max_size,
3483 .maxlen = sizeof(int),
3484 .mode = 0644,
3485 .proc_handler = proc_dointvec,
3486 },
3487 {
3488
3489
3490 .procname = "gc_min_interval",
3491 .data = &ip_rt_gc_min_interval,
3492 .maxlen = sizeof(int),
3493 .mode = 0644,
3494 .proc_handler = proc_dointvec_jiffies,
3495 },
3496 {
3497 .procname = "gc_min_interval_ms",
3498 .data = &ip_rt_gc_min_interval,
3499 .maxlen = sizeof(int),
3500 .mode = 0644,
3501 .proc_handler = proc_dointvec_ms_jiffies,
3502 },
3503 {
3504 .procname = "gc_timeout",
3505 .data = &ip_rt_gc_timeout,
3506 .maxlen = sizeof(int),
3507 .mode = 0644,
3508 .proc_handler = proc_dointvec_jiffies,
3509 },
3510 {
3511 .procname = "gc_interval",
3512 .data = &ip_rt_gc_interval,
3513 .maxlen = sizeof(int),
3514 .mode = 0644,
3515 .proc_handler = proc_dointvec_jiffies,
3516 },
3517 {
3518 .procname = "redirect_load",
3519 .data = &ip_rt_redirect_load,
3520 .maxlen = sizeof(int),
3521 .mode = 0644,
3522 .proc_handler = proc_dointvec,
3523 },
3524 {
3525 .procname = "redirect_number",
3526 .data = &ip_rt_redirect_number,
3527 .maxlen = sizeof(int),
3528 .mode = 0644,
3529 .proc_handler = proc_dointvec,
3530 },
3531 {
3532 .procname = "redirect_silence",
3533 .data = &ip_rt_redirect_silence,
3534 .maxlen = sizeof(int),
3535 .mode = 0644,
3536 .proc_handler = proc_dointvec,
3537 },
3538 {
3539 .procname = "error_cost",
3540 .data = &ip_rt_error_cost,
3541 .maxlen = sizeof(int),
3542 .mode = 0644,
3543 .proc_handler = proc_dointvec,
3544 },
3545 {
3546 .procname = "error_burst",
3547 .data = &ip_rt_error_burst,
3548 .maxlen = sizeof(int),
3549 .mode = 0644,
3550 .proc_handler = proc_dointvec,
3551 },
3552 {
3553 .procname = "gc_elasticity",
3554 .data = &ip_rt_gc_elasticity,
3555 .maxlen = sizeof(int),
3556 .mode = 0644,
3557 .proc_handler = proc_dointvec,
3558 },
3559 { }
3560 };
3561
3562 static const char ipv4_route_flush_procname[] = "flush";
3563
3564 static struct ctl_table ipv4_route_netns_table[] = {
3565 {
3566 .procname = ipv4_route_flush_procname,
3567 .maxlen = sizeof(int),
3568 .mode = 0200,
3569 .proc_handler = ipv4_sysctl_rtcache_flush,
3570 },
3571 {
3572 .procname = "min_pmtu",
3573 .data = &init_net.ipv4.ip_rt_min_pmtu,
3574 .maxlen = sizeof(int),
3575 .mode = 0644,
3576 .proc_handler = proc_dointvec_minmax,
3577 .extra1 = &ip_min_valid_pmtu,
3578 },
3579 {
3580 .procname = "mtu_expires",
3581 .data = &init_net.ipv4.ip_rt_mtu_expires,
3582 .maxlen = sizeof(int),
3583 .mode = 0644,
3584 .proc_handler = proc_dointvec_jiffies,
3585 },
3586 {
3587 .procname = "min_adv_mss",
3588 .data = &init_net.ipv4.ip_rt_min_advmss,
3589 .maxlen = sizeof(int),
3590 .mode = 0644,
3591 .proc_handler = proc_dointvec,
3592 },
3593 { },
3594 };
3595
3596 static __net_init int sysctl_route_net_init(struct net *net)
3597 {
3598 struct ctl_table *tbl;
3599
3600 tbl = ipv4_route_netns_table;
3601 if (!net_eq(net, &init_net)) {
3602 int i;
3603
3604 tbl = kmemdup(tbl, sizeof(ipv4_route_netns_table), GFP_KERNEL);
3605 if (!tbl)
3606 goto err_dup;
3607
3608
3609 if (net->user_ns != &init_user_ns) {
3610 if (tbl[0].procname != ipv4_route_flush_procname)
3611 tbl[0].procname = NULL;
3612 }
3613
3614
3615
3616
3617 for (i = 1; i < ARRAY_SIZE(ipv4_route_netns_table) - 1; i++)
3618 tbl[i].data += (void *)net - (void *)&init_net;
3619 }
3620 tbl[0].extra1 = net;
3621
3622 net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3623 if (!net->ipv4.route_hdr)
3624 goto err_reg;
3625 return 0;
3626
3627 err_reg:
3628 if (tbl != ipv4_route_netns_table)
3629 kfree(tbl);
3630 err_dup:
3631 return -ENOMEM;
3632 }
3633
3634 static __net_exit void sysctl_route_net_exit(struct net *net)
3635 {
3636 struct ctl_table *tbl;
3637
3638 tbl = net->ipv4.route_hdr->ctl_table_arg;
3639 unregister_net_sysctl_table(net->ipv4.route_hdr);
3640 BUG_ON(tbl == ipv4_route_netns_table);
3641 kfree(tbl);
3642 }
3643
3644 static __net_initdata struct pernet_operations sysctl_route_ops = {
3645 .init = sysctl_route_net_init,
3646 .exit = sysctl_route_net_exit,
3647 };
3648 #endif
3649
3650 static __net_init int netns_ip_rt_init(struct net *net)
3651 {
3652
3653 net->ipv4.ip_rt_min_pmtu = DEFAULT_MIN_PMTU;
3654 net->ipv4.ip_rt_mtu_expires = DEFAULT_MTU_EXPIRES;
3655 net->ipv4.ip_rt_min_advmss = DEFAULT_MIN_ADVMSS;
3656 return 0;
3657 }
3658
3659 static struct pernet_operations __net_initdata ip_rt_ops = {
3660 .init = netns_ip_rt_init,
3661 };
3662
3663 static __net_init int rt_genid_init(struct net *net)
3664 {
3665 atomic_set(&net->ipv4.rt_genid, 0);
3666 atomic_set(&net->fnhe_genid, 0);
3667 atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
3668 return 0;
3669 }
3670
3671 static __net_initdata struct pernet_operations rt_genid_ops = {
3672 .init = rt_genid_init,
3673 };
3674
3675 static int __net_init ipv4_inetpeer_init(struct net *net)
3676 {
3677 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3678
3679 if (!bp)
3680 return -ENOMEM;
3681 inet_peer_base_init(bp);
3682 net->ipv4.peers = bp;
3683 return 0;
3684 }
3685
3686 static void __net_exit ipv4_inetpeer_exit(struct net *net)
3687 {
3688 struct inet_peer_base *bp = net->ipv4.peers;
3689
3690 net->ipv4.peers = NULL;
3691 inetpeer_invalidate_tree(bp);
3692 kfree(bp);
3693 }
3694
3695 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3696 .init = ipv4_inetpeer_init,
3697 .exit = ipv4_inetpeer_exit,
3698 };
3699
3700 #ifdef CONFIG_IP_ROUTE_CLASSID
3701 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3702 #endif
3703
3704 int __init ip_rt_init(void)
3705 {
3706 void *idents_hash;
3707 int cpu;
3708
3709
3710 idents_hash = alloc_large_system_hash("IP idents",
3711 sizeof(*ip_idents) + sizeof(*ip_tstamps),
3712 0,
3713 16,
3714 HASH_ZERO,
3715 NULL,
3716 &ip_idents_mask,
3717 2048,
3718 256*1024);
3719
3720 ip_idents = idents_hash;
3721
3722 prandom_bytes(ip_idents, (ip_idents_mask + 1) * sizeof(*ip_idents));
3723
3724 ip_tstamps = idents_hash + (ip_idents_mask + 1) * sizeof(*ip_idents);
3725
3726 for_each_possible_cpu(cpu) {
3727 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3728
3729 INIT_LIST_HEAD(&ul->head);
3730 INIT_LIST_HEAD(&ul->quarantine);
3731 spin_lock_init(&ul->lock);
3732 }
3733 #ifdef CONFIG_IP_ROUTE_CLASSID
3734 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3735 if (!ip_rt_acct)
3736 panic("IP: failed to allocate ip_rt_acct\n");
3737 #endif
3738
3739 ipv4_dst_ops.kmem_cachep =
3740 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3741 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3742
3743 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3744
3745 if (dst_entries_init(&ipv4_dst_ops) < 0)
3746 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3747
3748 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3749 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3750
3751 ipv4_dst_ops.gc_thresh = ~0;
3752 ip_rt_max_size = INT_MAX;
3753
3754 devinet_init();
3755 ip_fib_init();
3756
3757 if (ip_rt_proc_init())
3758 pr_err("Unable to create route proc files\n");
3759 #ifdef CONFIG_XFRM
3760 xfrm_init();
3761 xfrm4_init();
3762 #endif
3763 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3764 RTNL_FLAG_DOIT_UNLOCKED);
3765
3766 #ifdef CONFIG_SYSCTL
3767 register_pernet_subsys(&sysctl_route_ops);
3768 #endif
3769 register_pernet_subsys(&ip_rt_ops);
3770 register_pernet_subsys(&rt_genid_ops);
3771 register_pernet_subsys(&ipv4_inetpeer_ops);
3772 return 0;
3773 }
3774
3775 #ifdef CONFIG_SYSCTL
3776
3777
3778
3779
3780 void __init ip_static_sysctl_init(void)
3781 {
3782 register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3783 }
3784 #endif