// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET     An implementation of the TCP/IP protocol suite for the LINUX
 *      operating system.  INET is implemented using the  BSD Socket
 *      interface as the means of communication with the user level.
 *
 *      ROUTE - implementation of the IP router.
 *
 * Authors: Ross Biro
 *      Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *      Alan Cox, <gw4pts@gw4pts.ampr.org>
 *      Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *      Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *      Alan Cox    :   Verify area fixes.
 *      Alan Cox    :   cli() protects routing changes
 *      Rui Oliveira    :   ICMP routing table updates
 *      (rco@di.uminho.pt)  Routing table insertion and update
 *      Linus Torvalds  :   Rewrote bits to be sensible
 *      Alan Cox    :   Added BSD route gw semantics
 *      Alan Cox    :   Super /proc >4K
 *      Alan Cox    :   MTU in route table
 *      Alan Cox    :   MSS actually. Also added the window
 *                  clamper.
 *      Sam Lantinga    :   Fixed route matching in rt_del()
 *      Alan Cox    :   Routing cache support.
 *      Alan Cox    :   Removed compatibility cruft.
 *      Alan Cox    :   RTF_REJECT support.
 *      Alan Cox    :   TCP irtt support.
 *      Jonathan Naylor :   Added Metric support.
 *      Miquel van Smoorenburg  :   BSD API fixes.
 *      Miquel van Smoorenburg  :   Metrics.
 *      Alan Cox    :   Use __u32 properly
 *      Alan Cox    :   Aligned routing errors more closely with BSD;
 *                  our system is still very different.
 *      Alan Cox    :   Faster /proc handling
 *      Alexey Kuznetsov    :   Massive rework to support tree based routing,
 *                  routing caches and better behaviour.
 *
 *      Olaf Erb    :   irtt wasn't being copied right.
 *      Bjorn Ekwall    :   Kerneld route support.
 *      Alan Cox    :   Multicast fixed (I hope)
 *      Pavel Krauz :   Limited broadcast fixed
 *      Mike McLagan    :   Routing by source
 *      Alexey Kuznetsov    :   End of old history. Split to fib.c and
 *                  route.c and rewritten from scratch.
 *      Andi Kleen  :   Load-limit warning messages.
 *      Vitaly E. Lavrov    :   Transparent proxy revived after year coma.
 *      Vitaly E. Lavrov    :   Race condition in ip_route_input_slow.
 *      Tobias Ringstrom    :   Uninitialized res.type in ip_route_output_slow.
 *      Vladimir V. Ivanov  :   IP rule info (flowid) is really useful.
 *      Marc Boucher    :   routing by fwmark
 *      Robert Olsson   :   Added rt_cache statistics
 *      Arnaldo C. Melo :   Convert proc stuff to seq_file
 *      Eric Dumazet    :   hashed spinlocks and rt_check_expire() fixes.
 *      Ilia Sotnikov   :   Ignore TOS on PMTUD and Redirect
 *      Ilia Sotnikov   :   Removed TOS from hash calculations
 */

#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/module.h>
#include <linux/bitops.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/memblock.h>
#include <linux/socket.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/inet_dscp.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/nexthop.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/lwtunnel.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
#include <net/secure_seq.h>
#include <net/ip_tunnels.h>

#include "fib_lookup.h"

#define RT_FL_TOS(oldflp4) \
    ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

#define RT_GC_TIMEOUT (300*HZ)

#define DEFAULT_MIN_PMTU (512 + 20 + 20)
#define DEFAULT_MTU_EXPIRES (10 * 60 * HZ)
#define DEFAULT_MIN_ADVMSS 256
static int ip_rt_max_size;
static int ip_rt_redirect_number __read_mostly  = 9;
static int ip_rt_redirect_load __read_mostly    = HZ / 50;
static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly   = HZ;
static int ip_rt_error_burst __read_mostly  = 5 * HZ;

static int ip_rt_gc_timeout __read_mostly   = RT_GC_TIMEOUT;

/*
 *  Interface to generic destination cache.
 */

INDIRECT_CALLABLE_SCOPE
struct dst_entry    *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int  ipv4_default_advmss(const struct dst_entry *dst);
INDIRECT_CALLABLE_SCOPE
unsigned int        ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void      ipv4_link_failure(struct sk_buff *skb);
static void      ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                       struct sk_buff *skb, u32 mtu,
                       bool confirm_neigh);
static void      ip_do_redirect(struct dst_entry *dst, struct sock *sk,
                    struct sk_buff *skb);
static void     ipv4_dst_destroy(struct dst_entry *dst);

static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
    WARN_ON(1);
    return NULL;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                       struct sk_buff *skb,
                       const void *daddr);
static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);

static struct dst_ops ipv4_dst_ops = {
    .family =       AF_INET,
    .check =        ipv4_dst_check,
    .default_advmss =   ipv4_default_advmss,
    .mtu =          ipv4_mtu,
    .cow_metrics =      ipv4_cow_metrics,
    .destroy =      ipv4_dst_destroy,
    .negative_advice =  ipv4_negative_advice,
    .link_failure =     ipv4_link_failure,
    .update_pmtu =      ip_rt_update_pmtu,
    .redirect =     ip_do_redirect,
    .local_out =        __ip_local_out,
    .neigh_lookup =     ipv4_neigh_lookup,
    .confirm_neigh =    ipv4_confirm_neigh,
};

#define ECN_OR_COST(class)  TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
    TC_PRIO_BESTEFFORT,
    ECN_OR_COST(BESTEFFORT),
    TC_PRIO_BESTEFFORT,
    ECN_OR_COST(BESTEFFORT),
    TC_PRIO_BULK,
    ECN_OR_COST(BULK),
    TC_PRIO_BULK,
    ECN_OR_COST(BULK),
    TC_PRIO_INTERACTIVE,
    ECN_OR_COST(INTERACTIVE),
    TC_PRIO_INTERACTIVE,
    ECN_OR_COST(INTERACTIVE),
    TC_PRIO_INTERACTIVE_BULK,
    ECN_OR_COST(INTERACTIVE_BULK),
    TC_PRIO_INTERACTIVE_BULK,
    ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);

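/* Usage sketch: how the table above is consumed. The index is the legacy
 * four-bit TOS field, (tos & IPTOS_TOS_MASK) >> 1, so adjacent entries pair
 * a plain priority with its ECN_OR_COST() variant. This mirrors
 * rt_tos2priority() in include/net/route.h; the helper name here is a
 * hypothetical restatement, not part of this file's API.
 */
static inline char example_tos2priority(u8 tos)
{
    return ip_tos2prio[IPTOS_TOS(tos) >> 1];
}
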
static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)

#ifdef CONFIG_PROC_FS
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
    if (*pos)
        return NULL;
    return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
    ++*pos;
    return NULL;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
    if (v == SEQ_START_TOKEN)
        seq_printf(seq, "%-127s\n",
               "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
               "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
               "HHUptod\tSpecDst");
    return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
    .start  = rt_cache_seq_start,
    .next   = rt_cache_seq_next,
    .stop   = rt_cache_seq_stop,
    .show   = rt_cache_seq_show,
};

static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
    int cpu;

    if (*pos == 0)
        return SEQ_START_TOKEN;

    for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
        if (!cpu_possible(cpu))
            continue;
        *pos = cpu+1;
        return &per_cpu(rt_cache_stat, cpu);
    }
    return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
    int cpu;

    for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
        if (!cpu_possible(cpu))
            continue;
        *pos = cpu+1;
        return &per_cpu(rt_cache_stat, cpu);
    }
    (*pos)++;
    return NULL;
}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
    struct rt_cache_stat *st = v;

    if (v == SEQ_START_TOKEN) {
        seq_puts(seq, "entries  in_hit   in_slow_tot in_slow_mc in_no_route in_brd   in_martian_dst in_martian_src out_hit  out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
        return 0;
    }

    seq_printf(seq, "%08x %08x %08x    %08x   %08x    %08x %08x       "
            "%08x       %08x %08x     %08x    %08x %08x   "
            "%08x     %08x        %08x        %08x\n",
           dst_entries_get_slow(&ipv4_dst_ops),
           0, /* st->in_hit */
           st->in_slow_tot,
           st->in_slow_mc,
           st->in_no_route,
           st->in_brd,
           st->in_martian_dst,
           st->in_martian_src,

           0, /* st->out_hit */
           st->out_slow_tot,
           st->out_slow_mc,

           0, /* st->gc_total */
           0, /* st->gc_ignored */
           0, /* st->gc_goal_miss */
           0, /* st->gc_dst_overflow */
           0, /* st->in_hlist_search */
           0  /* st->out_hlist_search */
        );
    return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
    .start  = rt_cpu_seq_start,
    .next   = rt_cpu_seq_next,
    .stop   = rt_cpu_seq_stop,
    .show   = rt_cpu_seq_show,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
    struct ip_rt_acct *dst, *src;
    unsigned int i, j;

    dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
    if (!dst)
        return -ENOMEM;

    for_each_possible_cpu(i) {
        src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
        for (j = 0; j < 256; j++) {
            dst[j].o_bytes   += src[j].o_bytes;
            dst[j].o_packets += src[j].o_packets;
            dst[j].i_bytes   += src[j].i_bytes;
            dst[j].i_packets += src[j].i_packets;
        }
    }

    seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
    kfree(dst);
    return 0;
}
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
    struct proc_dir_entry *pde;

    pde = proc_create_seq("rt_cache", 0444, net->proc_net,
                  &rt_cache_seq_ops);
    if (!pde)
        goto err1;

    pde = proc_create_seq("rt_cache", 0444, net->proc_net_stat,
                  &rt_cpu_seq_ops);
    if (!pde)
        goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
    pde = proc_create_single("rt_acct", 0, net->proc_net,
            rt_acct_proc_show);
    if (!pde)
        goto err3;
#endif
    return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
    remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
    remove_proc_entry("rt_cache", net->proc_net);
err1:
    return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
    remove_proc_entry("rt_cache", net->proc_net_stat);
    remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
    remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
    .init = ip_rt_do_proc_init,
    .exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
    return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
    return 0;
}
#endif /* CONFIG_PROC_FS */

static inline bool rt_is_expired(const struct rtable *rth)
{
    return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
}

void rt_cache_flush(struct net *net)
{
    rt_genid_bump_ipv4(net);
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                       struct sk_buff *skb,
                       const void *daddr)
{
    const struct rtable *rt = container_of(dst, struct rtable, dst);
    struct net_device *dev = dst->dev;
    struct neighbour *n;

    rcu_read_lock_bh();

    if (likely(rt->rt_gw_family == AF_INET)) {
        n = ip_neigh_gw4(dev, rt->rt_gw4);
    } else if (rt->rt_gw_family == AF_INET6) {
        n = ip_neigh_gw6(dev, &rt->rt_gw6);
    } else {
        __be32 pkey;

        pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr);
        n = ip_neigh_gw4(dev, pkey);
    }

    if (!IS_ERR(n) && !refcount_inc_not_zero(&n->refcnt))
        n = NULL;

    rcu_read_unlock_bh();

    return n;
}

static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
    const struct rtable *rt = container_of(dst, struct rtable, dst);
    struct net_device *dev = dst->dev;
    const __be32 *pkey = daddr;

    if (rt->rt_gw_family == AF_INET) {
        pkey = (const __be32 *)&rt->rt_gw4;
    } else if (rt->rt_gw_family == AF_INET6) {
        return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);
    } else if (!daddr ||
         (rt->rt_flags &
          (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) {
        return;
    }
    __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
}

/* Hash tables of size 2048..262144 depending on RAM size.
 * Each bucket uses 8 bytes.
 */
static u32 ip_idents_mask __read_mostly;
static atomic_t *ip_idents __read_mostly;
static u32 *ip_tstamps __read_mostly;

/* In order to protect privacy, we add a perturbation to identifiers
 * if one generator is seldom used. This makes it hard for an attacker
 * to infer how many packets were sent between two points in time.
 */
static u32 ip_idents_reserve(u32 hash, int segs)
{
    u32 bucket, old, now = (u32)jiffies;
    atomic_t *p_id;
    u32 *p_tstamp;
    u32 delta = 0;

    bucket = hash & ip_idents_mask;
    p_tstamp = ip_tstamps + bucket;
    p_id = ip_idents + bucket;
    old = READ_ONCE(*p_tstamp);

    if (old != now && cmpxchg(p_tstamp, old, now) == old)
        delta = prandom_u32_max(now - old);

    /* If UBSAN reports an error here, please make sure your compiler
     * supports -fno-strict-overflow before reporting it: that was a bug
     * in UBSAN, and it has been fixed in GCC-8.
     */
    return atomic_add_return(segs + delta, p_id) - segs;
}

void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
{
    u32 hash, id;

    /* Note the following code is not safe, but this is okay. */
    if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
        get_random_bytes(&net->ipv4.ip_id_key,
                 sizeof(net->ipv4.ip_id_key));

    hash = siphash_3u32((__force u32)iph->daddr,
                (__force u32)iph->saddr,
                iph->protocol,
                &net->ipv4.ip_id_key);
    id = ip_idents_reserve(hash, segs);
    iph->id = htons(id);
}
EXPORT_SYMBOL(__ip_select_ident);

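/* Usage sketch, assuming a caller that has just built a fresh IPv4 header;
 * a simplified rendition of ip_select_ident_segs() in include/net/ip.h
 * (the helper name is hypothetical). Atomic (DF) datagrams may leave the
 * ID at zero; everything else draws IDs from the hashed generator above so
 * that consecutive IDs of one flow reveal little to an observer.
 */
static inline void example_fill_ip_id(struct net *net, struct sk_buff *skb,
                      int segs)
{
    struct iphdr *iph = ip_hdr(skb);

    if (iph->frag_off & htons(IP_DF))
        iph->id = 0;    /* never fragmented, so no unique ID needed */
    else
        __ip_select_ident(net, iph, segs);
}
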
static void ip_rt_fix_tos(struct flowi4 *fl4)
{
    __u8 tos = RT_FL_TOS(fl4);

    fl4->flowi4_tos = tos & IPTOS_RT_MASK;
    if (tos & RTO_ONLINK)
        fl4->flowi4_scope = RT_SCOPE_LINK;
}

static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
                 const struct sock *sk, const struct iphdr *iph,
                 int oif, __u8 tos, u8 prot, u32 mark,
                 int flow_flags)
{
    __u8 scope = RT_SCOPE_UNIVERSE;

    if (sk) {
        const struct inet_sock *inet = inet_sk(sk);

        oif = sk->sk_bound_dev_if;
        mark = sk->sk_mark;
        tos = ip_sock_rt_tos(sk);
        scope = ip_sock_rt_scope(sk);
        prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
    }

    flowi4_init_output(fl4, oif, mark, tos & IPTOS_RT_MASK, scope,
               prot, flow_flags, iph->daddr, iph->saddr, 0, 0,
               sock_net_uid(net, sk));
}

static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
                   const struct sock *sk)
{
    const struct net *net = dev_net(skb->dev);
    const struct iphdr *iph = ip_hdr(skb);
    int oif = skb->dev->ifindex;
    u8 prot = iph->protocol;
    u32 mark = skb->mark;
    __u8 tos = iph->tos;

    __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
}

static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
{
    const struct inet_sock *inet = inet_sk(sk);
    const struct ip_options_rcu *inet_opt;
    __be32 daddr = inet->inet_daddr;

    rcu_read_lock();
    inet_opt = rcu_dereference(inet->inet_opt);
    if (inet_opt && inet_opt->opt.srr)
        daddr = inet_opt->opt.faddr;
    flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
               ip_sock_rt_tos(sk) & IPTOS_RT_MASK,
               ip_sock_rt_scope(sk),
               inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
               inet_sk_flowi_flags(sk),
               daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
    rcu_read_unlock();
}

static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
                 const struct sk_buff *skb)
{
    if (skb)
        build_skb_flow_key(fl4, skb, sk);
    else
        build_sk_flow_key(fl4, sk);
}

static DEFINE_SPINLOCK(fnhe_lock);

static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
{
    struct rtable *rt;

    rt = rcu_dereference(fnhe->fnhe_rth_input);
    if (rt) {
        RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
        dst_dev_put(&rt->dst);
        dst_release(&rt->dst);
    }
    rt = rcu_dereference(fnhe->fnhe_rth_output);
    if (rt) {
        RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
        dst_dev_put(&rt->dst);
        dst_release(&rt->dst);
    }
}

static void fnhe_remove_oldest(struct fnhe_hash_bucket *hash)
{
    struct fib_nh_exception __rcu **fnhe_p, **oldest_p;
    struct fib_nh_exception *fnhe, *oldest = NULL;

    for (fnhe_p = &hash->chain; ; fnhe_p = &fnhe->fnhe_next) {
        fnhe = rcu_dereference_protected(*fnhe_p,
                         lockdep_is_held(&fnhe_lock));
        if (!fnhe)
            break;
        if (!oldest ||
            time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) {
            oldest = fnhe;
            oldest_p = fnhe_p;
        }
    }
    fnhe_flush_routes(oldest);
    *oldest_p = oldest->fnhe_next;
    kfree_rcu(oldest, rcu);
}

static u32 fnhe_hashfun(__be32 daddr)
{
    static siphash_aligned_key_t fnhe_hash_key;
    u64 hval;

    net_get_random_once(&fnhe_hash_key, sizeof(fnhe_hash_key));
    hval = siphash_1u32((__force u32)daddr, &fnhe_hash_key);
    return hash_64(hval, FNHE_HASH_SHIFT);
}

static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
{
    rt->rt_pmtu = fnhe->fnhe_pmtu;
    rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
    rt->dst.expires = fnhe->fnhe_expires;

    if (fnhe->fnhe_gw) {
        rt->rt_flags |= RTCF_REDIRECTED;
        rt->rt_uses_gateway = 1;
        rt->rt_gw_family = AF_INET;
        rt->rt_gw4 = fnhe->fnhe_gw;
    }
}

static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
                  __be32 gw, u32 pmtu, bool lock,
                  unsigned long expires)
{
    struct fnhe_hash_bucket *hash;
    struct fib_nh_exception *fnhe;
    struct rtable *rt;
    u32 genid, hval;
    unsigned int i;
    int depth;

    genid = fnhe_genid(dev_net(nhc->nhc_dev));
    hval = fnhe_hashfun(daddr);

    spin_lock_bh(&fnhe_lock);

    hash = rcu_dereference(nhc->nhc_exceptions);
    if (!hash) {
        hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
        if (!hash)
            goto out_unlock;
        rcu_assign_pointer(nhc->nhc_exceptions, hash);
    }

    hash += hval;

    depth = 0;
    for (fnhe = rcu_dereference(hash->chain); fnhe;
         fnhe = rcu_dereference(fnhe->fnhe_next)) {
        if (fnhe->fnhe_daddr == daddr)
            break;
        depth++;
    }

    if (fnhe) {
        if (fnhe->fnhe_genid != genid)
            fnhe->fnhe_genid = genid;
        if (gw)
            fnhe->fnhe_gw = gw;
        if (pmtu) {
            fnhe->fnhe_pmtu = pmtu;
            fnhe->fnhe_mtu_locked = lock;
        }
        fnhe->fnhe_expires = max(1UL, expires);
        /* Update all cached dsts too */
        rt = rcu_dereference(fnhe->fnhe_rth_input);
        if (rt)
            fill_route_from_fnhe(rt, fnhe);
        rt = rcu_dereference(fnhe->fnhe_rth_output);
        if (rt)
            fill_route_from_fnhe(rt, fnhe);
    } else {
        /* Randomize max depth to avoid some side channel attacks. */
        int max_depth = FNHE_RECLAIM_DEPTH +
                prandom_u32_max(FNHE_RECLAIM_DEPTH);

        while (depth > max_depth) {
            fnhe_remove_oldest(hash);
            depth--;
        }

        fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
        if (!fnhe)
            goto out_unlock;

        fnhe->fnhe_next = hash->chain;

        fnhe->fnhe_genid = genid;
        fnhe->fnhe_daddr = daddr;
        fnhe->fnhe_gw = gw;
        fnhe->fnhe_pmtu = pmtu;
        fnhe->fnhe_mtu_locked = lock;
        fnhe->fnhe_expires = max(1UL, expires);

        rcu_assign_pointer(hash->chain, fnhe);

        /* Exception created; mark the cached routes for the nexthop
         * stale, so anyone caching it rechecks if this exception
         * applies to them.
         */
        rt = rcu_dereference(nhc->nhc_rth_input);
        if (rt)
            rt->dst.obsolete = DST_OBSOLETE_KILL;

        for_each_possible_cpu(i) {
            struct rtable __rcu **prt;

            prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
            rt = rcu_dereference(*prt);
            if (rt)
                rt->dst.obsolete = DST_OBSOLETE_KILL;
        }
    }

    fnhe->fnhe_stamp = jiffies;

out_unlock:
    spin_unlock_bh(&fnhe_lock);
}

static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
                 bool kill_route)
{
    __be32 new_gw = icmp_hdr(skb)->un.gateway;
    __be32 old_gw = ip_hdr(skb)->saddr;
    struct net_device *dev = skb->dev;
    struct in_device *in_dev;
    struct fib_result res;
    struct neighbour *n;
    struct net *net;

    switch (icmp_hdr(skb)->code & 7) {
    case ICMP_REDIR_NET:
    case ICMP_REDIR_NETTOS:
    case ICMP_REDIR_HOST:
    case ICMP_REDIR_HOSTTOS:
        break;

    default:
        return;
    }

    if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw)
        return;

    in_dev = __in_dev_get_rcu(dev);
    if (!in_dev)
        return;

    net = dev_net(dev);
    if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
        ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
        ipv4_is_zeronet(new_gw))
        goto reject_redirect;

    if (!IN_DEV_SHARED_MEDIA(in_dev)) {
        if (!inet_addr_onlink(in_dev, new_gw, old_gw))
            goto reject_redirect;
        if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
            goto reject_redirect;
    } else {
        if (inet_addr_type(net, new_gw) != RTN_UNICAST)
            goto reject_redirect;
    }

    n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
    if (!n)
        n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
    if (!IS_ERR(n)) {
        if (!(n->nud_state & NUD_VALID)) {
            neigh_event_send(n, NULL);
        } else {
            if (fib_lookup(net, fl4, &res, 0) == 0) {
                struct fib_nh_common *nhc;

                fib_select_path(net, &res, fl4, skb);
                nhc = FIB_RES_NHC(res);
                update_or_create_fnhe(nhc, fl4->daddr, new_gw,
                        0, false,
                        jiffies + ip_rt_gc_timeout);
            }
            if (kill_route)
                rt->dst.obsolete = DST_OBSOLETE_KILL;
            call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
        }
        neigh_release(n);
    }
    return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
    if (IN_DEV_LOG_MARTIANS(in_dev)) {
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        __be32 daddr = iph->daddr;
        __be32 saddr = iph->saddr;

        net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
                     "  Advised path = %pI4 -> %pI4\n",
                     &old_gw, dev->name, &new_gw,
                     &saddr, &daddr);
    }
#endif
    ;
}

static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
    struct rtable *rt;
    struct flowi4 fl4;
    const struct iphdr *iph = (const struct iphdr *) skb->data;
    struct net *net = dev_net(skb->dev);
    int oif = skb->dev->ifindex;
    u8 prot = iph->protocol;
    u32 mark = skb->mark;
    __u8 tos = iph->tos;

    rt = (struct rtable *) dst;

    __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
    __ip_do_redirect(rt, skb, &fl4, true);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
    struct rtable *rt = (struct rtable *)dst;
    struct dst_entry *ret = dst;

    if (rt) {
        if (dst->obsolete > 0) {
            ip_rt_put(rt);
            ret = NULL;
        } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
               rt->dst.expires) {
            ip_rt_put(rt);
            ret = NULL;
        }
    }
    return ret;
}

/*
 * Algorithm:
 *  1. The first ip_rt_redirect_number redirects are sent
 *     with exponential backoff, then we stop sending them at all,
 *     assuming that the host ignores our redirects.
 *  2. If we did not see packets requiring redirects
 *     during ip_rt_redirect_silence, we assume that the host
 *     forgot the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */

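/* Worked example of the backoff below, assuming HZ = 1000 and the defaults
 * above (ip_rt_redirect_load = HZ / 50, ip_rt_redirect_number = 9): after
 * the first redirect, further ones are spaced at least 20 ms << n_redirects
 * apart (40 ms, 80 ms, ... up to ~5 s), and sending stops entirely after
 * the ninth until the sender has been quiet for ip_rt_redirect_silence,
 * roughly 20 s. The helper below is a hypothetical restatement of the gate
 * used in ip_rt_send_redirect(), not part of this file's API.
 */
static inline bool example_redirect_allowed(const struct inet_peer *peer)
{
    return peer->n_redirects == 0 ||
           time_after(jiffies, peer->rate_last +
                  (ip_rt_redirect_load << peer->n_redirects));
}
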
void ip_rt_send_redirect(struct sk_buff *skb)
{
    struct rtable *rt = skb_rtable(skb);
    struct in_device *in_dev;
    struct inet_peer *peer;
    struct net *net;
    int log_martians;
    int vif;

    rcu_read_lock();
    in_dev = __in_dev_get_rcu(rt->dst.dev);
    if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
        rcu_read_unlock();
        return;
    }
    log_martians = IN_DEV_LOG_MARTIANS(in_dev);
    vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
    rcu_read_unlock();

    net = dev_net(rt->dst.dev);
    peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
    if (!peer) {
        icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
              rt_nexthop(rt, ip_hdr(skb)->daddr));
        return;
    }

    /* No redirected packets during ip_rt_redirect_silence;
     * reset the algorithm.
     */
    if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
        peer->rate_tokens = 0;
        peer->n_redirects = 0;
    }

    /* Too many ignored redirects; do not send anything.
     * Set peer->rate_last to the last seen redirected packet.
     */
    if (peer->n_redirects >= ip_rt_redirect_number) {
        peer->rate_last = jiffies;
        goto out_put_peer;
    }

    /* Check for load limit; set rate_last to the latest sent
     * redirect.
     */
    if (peer->n_redirects == 0 ||
        time_after(jiffies,
               (peer->rate_last +
            (ip_rt_redirect_load << peer->n_redirects)))) {
        __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

        icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
        peer->rate_last = jiffies;
        ++peer->n_redirects;
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (log_martians &&
            peer->n_redirects == ip_rt_redirect_number)
            net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
                         &ip_hdr(skb)->saddr, inet_iif(skb),
                         &ip_hdr(skb)->daddr, &gw);
#endif
    }
out_put_peer:
    inet_putpeer(peer);
}

static int ip_error(struct sk_buff *skb)
{
    struct rtable *rt = skb_rtable(skb);
    struct net_device *dev = skb->dev;
    struct in_device *in_dev;
    struct inet_peer *peer;
    unsigned long now;
    struct net *net;
    SKB_DR(reason);
    bool send;
    int code;

    if (netif_is_l3_master(skb->dev)) {
        dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
        if (!dev)
            goto out;
    }

    in_dev = __in_dev_get_rcu(dev);

    /* IP on this device is disabled. */
    if (!in_dev)
        goto out;

    net = dev_net(rt->dst.dev);
    if (!IN_DEV_FORWARD(in_dev)) {
        switch (rt->dst.error) {
        case EHOSTUNREACH:
            SKB_DR_SET(reason, IP_INADDRERRORS);
            __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
            break;

        case ENETUNREACH:
            SKB_DR_SET(reason, IP_INNOROUTES);
            __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
            break;
        }
        goto out;
    }

    switch (rt->dst.error) {
    case EINVAL:
    default:
        goto out;
    case EHOSTUNREACH:
        code = ICMP_HOST_UNREACH;
        break;
    case ENETUNREACH:
        code = ICMP_NET_UNREACH;
        SKB_DR_SET(reason, IP_INNOROUTES);
        __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
        break;
    case EACCES:
        code = ICMP_PKT_FILTERED;
        break;
    }

    peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
                   l3mdev_master_ifindex(skb->dev), 1);

    send = true;
    if (peer) {
        now = jiffies;
        peer->rate_tokens += now - peer->rate_last;
        if (peer->rate_tokens > ip_rt_error_burst)
            peer->rate_tokens = ip_rt_error_burst;
        peer->rate_last = now;
        if (peer->rate_tokens >= ip_rt_error_cost)
            peer->rate_tokens -= ip_rt_error_cost;
        else
            send = false;
        inet_putpeer(peer);
    }
    if (send)
        icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:    kfree_skb_reason(skb, reason);
    return 0;
}

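/* Worked example of the token bucket in ip_error() above, using the
 * defaults ip_rt_error_cost = HZ and ip_rt_error_burst = 5 * HZ: tokens
 * accrue at one per jiffy up to five seconds' worth, and each ICMP error
 * spends HZ of them, so a peer can trigger a burst of at most five errors
 * and then roughly one per second after that.
 */
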
static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
    struct dst_entry *dst = &rt->dst;
    struct net *net = dev_net(dst->dev);
    struct fib_result res;
    bool lock = false;
    u32 old_mtu;

    if (ip_mtu_locked(dst))
        return;

    old_mtu = ipv4_mtu(dst);
    if (old_mtu < mtu)
        return;

    if (mtu < net->ipv4.ip_rt_min_pmtu) {
        lock = true;
        mtu = min(old_mtu, net->ipv4.ip_rt_min_pmtu);
    }

    if (rt->rt_pmtu == mtu && !lock &&
        time_before(jiffies, dst->expires - net->ipv4.ip_rt_mtu_expires / 2))
        return;

    rcu_read_lock();
    if (fib_lookup(net, fl4, &res, 0) == 0) {
        struct fib_nh_common *nhc;

        fib_select_path(net, &res, fl4, NULL);
        nhc = FIB_RES_NHC(res);
        update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
                      jiffies + net->ipv4.ip_rt_mtu_expires);
    }
    rcu_read_unlock();
}

static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                  struct sk_buff *skb, u32 mtu,
                  bool confirm_neigh)
{
    struct rtable *rt = (struct rtable *) dst;
    struct flowi4 fl4;

    ip_rt_build_flow_key(&fl4, sk, skb);

    /* Don't make lookup fail for bridged encapsulations */
    if (skb && netif_is_any_bridge_port(skb->dev))
        fl4.flowi4_oif = 0;

    __ip_rt_update_pmtu(rt, &fl4, mtu);
}

void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
              int oif, u8 protocol)
{
    const struct iphdr *iph = (const struct iphdr *)skb->data;
    struct flowi4 fl4;
    struct rtable *rt;
    u32 mark = IP4_REPLY_MARK(net, skb->mark);

    __build_flow_key(net, &fl4, NULL, iph, oif, iph->tos, protocol, mark,
             0);
    rt = __ip_route_output_key(net, &fl4);
    if (!IS_ERR(rt)) {
        __ip_rt_update_pmtu(rt, &fl4, mtu);
        ip_rt_put(rt);
    }
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);

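/* Usage sketch for the export above: a hypothetical transport error
 * handler reacting to an ICMP "fragmentation needed" message would pass
 * the MTU carried in the ICMP payload, in the style of the real callers
 * in the UDP and raw socket error paths. Names below are illustrative.
 */
static inline void example_handle_frag_needed(struct sk_buff *icmp_skb,
                          u32 new_mtu)
{
    /* icmp_skb->data points at the embedded offending IPv4 header */
    ipv4_update_pmtu(icmp_skb, dev_net(icmp_skb->dev), new_mtu,
             0, IPPROTO_UDP);
}
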
static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
    const struct iphdr *iph = (const struct iphdr *)skb->data;
    struct flowi4 fl4;
    struct rtable *rt;

    __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);

    if (!fl4.flowi4_mark)
        fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);

    rt = __ip_route_output_key(sock_net(sk), &fl4);
    if (!IS_ERR(rt)) {
        __ip_rt_update_pmtu(rt, &fl4, mtu);
        ip_rt_put(rt);
    }
}

void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
    const struct iphdr *iph = (const struct iphdr *)skb->data;
    struct flowi4 fl4;
    struct rtable *rt;
    struct dst_entry *odst = NULL;
    bool new = false;
    struct net *net = sock_net(sk);

    bh_lock_sock(sk);

    if (!ip_sk_accept_pmtu(sk))
        goto out;

    odst = sk_dst_get(sk);

    if (sock_owned_by_user(sk) || !odst) {
        __ipv4_sk_update_pmtu(skb, sk, mtu);
        goto out;
    }

    __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);

    rt = (struct rtable *)odst;
    if (odst->obsolete && !odst->ops->check(odst, 0)) {
        rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
        if (IS_ERR(rt))
            goto out;

        new = true;
    }

    __ip_rt_update_pmtu((struct rtable *)xfrm_dst_path(&rt->dst), &fl4, mtu);

    if (!dst_check(&rt->dst, 0)) {
        if (new)
            dst_release(&rt->dst);

        rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
        if (IS_ERR(rt))
            goto out;

        new = true;
    }

    if (new)
        sk_dst_set(sk, &rt->dst);

out:
    bh_unlock_sock(sk);
    dst_release(odst);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);

void ipv4_redirect(struct sk_buff *skb, struct net *net,
           int oif, u8 protocol)
{
    const struct iphdr *iph = (const struct iphdr *)skb->data;
    struct flowi4 fl4;
    struct rtable *rt;

    __build_flow_key(net, &fl4, NULL, iph, oif, iph->tos, protocol, 0, 0);
    rt = __ip_route_output_key(net, &fl4);
    if (!IS_ERR(rt)) {
        __ip_do_redirect(rt, skb, &fl4, false);
        ip_rt_put(rt);
    }
}
EXPORT_SYMBOL_GPL(ipv4_redirect);

void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
    const struct iphdr *iph = (const struct iphdr *)skb->data;
    struct flowi4 fl4;
    struct rtable *rt;
    struct net *net = sock_net(sk);

    __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
    rt = __ip_route_output_key(net, &fl4);
    if (!IS_ERR(rt)) {
        __ip_do_redirect(rt, skb, &fl4, false);
        ip_rt_put(rt);
    }
}
EXPORT_SYMBOL_GPL(ipv4_sk_redirect);

INDIRECT_CALLABLE_SCOPE struct dst_entry *ipv4_dst_check(struct dst_entry *dst,
                             u32 cookie)
{
    struct rtable *rt = (struct rtable *) dst;

    /* All IPV4 dsts are created with ->obsolete set to the value
     * DST_OBSOLETE_FORCE_CHK which forces validation calls down
     * into this function always.
     *
     * When a PMTU/redirect information update invalidates a route,
     * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
     * DST_OBSOLETE_DEAD.
     */
    if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
        return NULL;
    return dst;
}
EXPORT_INDIRECT_CALLABLE(ipv4_dst_check);

static void ipv4_send_dest_unreach(struct sk_buff *skb)
{
    struct ip_options opt;
    int res;

    /* Recompile ip options since IPCB may not be valid anymore.
     * Also check we have a reasonable ipv4 header.
     */
    if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
        ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
        return;

    memset(&opt, 0, sizeof(opt));
    if (ip_hdr(skb)->ihl > 5) {
        if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
            return;
        opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);

        rcu_read_lock();
        res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL);
        rcu_read_unlock();

        if (res)
            return;
    }
    __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
}

static void ipv4_link_failure(struct sk_buff *skb)
{
    struct rtable *rt;

    ipv4_send_dest_unreach(skb);

    rt = skb_rtable(skb);
    if (rt)
        dst_set_expires(&rt->dst, 0);
}

static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
{
    pr_debug("%s: %pI4 -> %pI4, %s\n",
         __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
         skb->dev ? skb->dev->name : "?");
    kfree_skb(skb);
    WARN_ON(1);
    return 0;
}

/*
 * We do not cache the source address of the outgoing interface,
 * because it is used only by IP RR, TS and SRR options,
 * so it is out of the fast path.
 *
 * BTW remember: "addr" is allowed to be unaligned
 * in IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
    __be32 src;

    if (rt_is_output_route(rt))
        src = ip_hdr(skb)->saddr;
    else {
        struct fib_result res;
        struct iphdr *iph = ip_hdr(skb);
        struct flowi4 fl4 = {
            .daddr = iph->daddr,
            .saddr = iph->saddr,
            .flowi4_tos = RT_TOS(iph->tos),
            .flowi4_oif = rt->dst.dev->ifindex,
            .flowi4_iif = skb->dev->ifindex,
            .flowi4_mark = skb->mark,
        };

        rcu_read_lock();
        if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
            src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
        else
            src = inet_select_addr(rt->dst.dev,
                           rt_nexthop(rt, iph->daddr),
                           RT_SCOPE_UNIVERSE);
        rcu_read_unlock();
    }
    memcpy(addr, &src, 4);
}

#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
    if (!(rt->dst.tclassid & 0xFFFF))
        rt->dst.tclassid |= tag & 0xFFFF;
    if (!(rt->dst.tclassid & 0xFFFF0000))
        rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
    struct net *net = dev_net(dst->dev);
    unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
    unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
                    net->ipv4.ip_rt_min_advmss);

    return min(advmss, IPV4_MAX_PMTU - header_size);
}

INDIRECT_CALLABLE_SCOPE unsigned int ipv4_mtu(const struct dst_entry *dst)
{
    return ip_dst_mtu_maybe_forward(dst, false);
}
EXPORT_INDIRECT_CALLABLE(ipv4_mtu);

static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr)
{
    struct fnhe_hash_bucket *hash;
    struct fib_nh_exception *fnhe, __rcu **fnhe_p;
    u32 hval = fnhe_hashfun(daddr);

    spin_lock_bh(&fnhe_lock);

    hash = rcu_dereference_protected(nhc->nhc_exceptions,
                     lockdep_is_held(&fnhe_lock));
    hash += hval;

    fnhe_p = &hash->chain;
    fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
    while (fnhe) {
        if (fnhe->fnhe_daddr == daddr) {
            rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
                fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
            /* set fnhe_daddr to 0 to ensure it won't bind with
             * new dsts in rt_bind_exception().
             */
            fnhe->fnhe_daddr = 0;
            fnhe_flush_routes(fnhe);
            kfree_rcu(fnhe, rcu);
            break;
        }
        fnhe_p = &fnhe->fnhe_next;
        fnhe = rcu_dereference_protected(fnhe->fnhe_next,
                         lockdep_is_held(&fnhe_lock));
    }

    spin_unlock_bh(&fnhe_lock);
}

static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc,
                           __be32 daddr)
{
    struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions);
    struct fib_nh_exception *fnhe;
    u32 hval;

    if (!hash)
        return NULL;

    hval = fnhe_hashfun(daddr);

    for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
         fnhe = rcu_dereference(fnhe->fnhe_next)) {
        if (fnhe->fnhe_daddr == daddr) {
            if (fnhe->fnhe_expires &&
                time_after(jiffies, fnhe->fnhe_expires)) {
                ip_del_fnhe(nhc, daddr);
                break;
            }
            return fnhe;
        }
    }
    return NULL;
}

/* MTU selection:
 * 1. mtu on route is locked - use it
 * 2. mtu from nexthop exception
 * 3. mtu from egress device
 */

u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
{
    struct fib_nh_common *nhc = res->nhc;
    struct net_device *dev = nhc->nhc_dev;
    struct fib_info *fi = res->fi;
    u32 mtu = 0;

    if (READ_ONCE(dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu) ||
        fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
        mtu = fi->fib_mtu;

    if (likely(!mtu)) {
        struct fib_nh_exception *fnhe;

        fnhe = find_exception(nhc, daddr);
        if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
            mtu = fnhe->fnhe_pmtu;
    }

    if (likely(!mtu))
        mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);

    return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
}

static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
                  __be32 daddr, const bool do_cache)
{
    bool ret = false;

    spin_lock_bh(&fnhe_lock);

    if (daddr == fnhe->fnhe_daddr) {
        struct rtable __rcu **porig;
        struct rtable *orig;
        int genid = fnhe_genid(dev_net(rt->dst.dev));

        if (rt_is_input_route(rt))
            porig = &fnhe->fnhe_rth_input;
        else
            porig = &fnhe->fnhe_rth_output;
        orig = rcu_dereference(*porig);

        if (fnhe->fnhe_genid != genid) {
            fnhe->fnhe_genid = genid;
            fnhe->fnhe_gw = 0;
            fnhe->fnhe_pmtu = 0;
            fnhe->fnhe_expires = 0;
            fnhe->fnhe_mtu_locked = false;
            fnhe_flush_routes(fnhe);
            orig = NULL;
        }
        fill_route_from_fnhe(rt, fnhe);
        if (!rt->rt_gw4) {
            rt->rt_gw4 = daddr;
            rt->rt_gw_family = AF_INET;
        }

        if (do_cache) {
            dst_hold(&rt->dst);
            rcu_assign_pointer(*porig, rt);
            if (orig) {
                dst_dev_put(&orig->dst);
                dst_release(&orig->dst);
            }
            ret = true;
        }

        fnhe->fnhe_stamp = jiffies;
    }
    spin_unlock_bh(&fnhe_lock);

    return ret;
}

static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
{
    struct rtable *orig, *prev, **p;
    bool ret = true;

    if (rt_is_input_route(rt)) {
        p = (struct rtable **)&nhc->nhc_rth_input;
    } else {
        p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
    }
    orig = *p;

    /* hold dst before doing cmpxchg() to avoid race condition
     * on this dst
     */
    dst_hold(&rt->dst);
    prev = cmpxchg(p, orig, rt);
    if (prev == orig) {
        if (orig) {
            rt_add_uncached_list(orig);
            dst_release(&orig->dst);
        }
    } else {
        dst_release(&rt->dst);
        ret = false;
    }

    return ret;
}

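/* Design note on rt_cache_route() above: the extra reference is taken
 * before the cmpxchg() so the route is never reachable through *p without
 * a reference backing it; a concurrent reader therefore sees either the
 * old entry or a fully referenced new one. If another CPU published a
 * different route first, the speculative reference is simply dropped.
 */
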
1499 struct uncached_list {
1500     spinlock_t      lock;
1501     struct list_head    head;
1502     struct list_head    quarantine;
1503 };
1504 
1505 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1506 
1507 void rt_add_uncached_list(struct rtable *rt)
1508 {
1509     struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1510 
1511     rt->rt_uncached_list = ul;
1512 
1513     spin_lock_bh(&ul->lock);
1514     list_add_tail(&rt->rt_uncached, &ul->head);
1515     spin_unlock_bh(&ul->lock);
1516 }
1517 
1518 void rt_del_uncached_list(struct rtable *rt)
1519 {
1520     if (!list_empty(&rt->rt_uncached)) {
1521         struct uncached_list *ul = rt->rt_uncached_list;
1522 
1523         spin_lock_bh(&ul->lock);
1524         list_del_init(&rt->rt_uncached);
1525         spin_unlock_bh(&ul->lock);
1526     }
1527 }
1528 
1529 static void ipv4_dst_destroy(struct dst_entry *dst)
1530 {
1531     struct rtable *rt = (struct rtable *)dst;
1532 
1533     ip_dst_metrics_put(dst);
1534     rt_del_uncached_list(rt);
1535 }
1536 
1537 void rt_flush_dev(struct net_device *dev)
1538 {
1539     struct rtable *rt, *safe;
1540     int cpu;
1541 
1542     for_each_possible_cpu(cpu) {
1543         struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1544 
1545         if (list_empty(&ul->head))
1546             continue;
1547 
1548         spin_lock_bh(&ul->lock);
1549         list_for_each_entry_safe(rt, safe, &ul->head, rt_uncached) {
1550             if (rt->dst.dev != dev)
1551                 continue;
1552             rt->dst.dev = blackhole_netdev;
1553             netdev_ref_replace(dev, blackhole_netdev,
1554                        &rt->dst.dev_tracker, GFP_ATOMIC);
1555             list_move(&rt->rt_uncached, &ul->quarantine);
1556         }
1557         spin_unlock_bh(&ul->lock);
1558     }
1559 }
1560 
1561 static bool rt_cache_valid(const struct rtable *rt)
1562 {
1563     return  rt &&
1564         rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1565         !rt_is_expired(rt);
1566 }
1567 
1568 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1569                const struct fib_result *res,
1570                struct fib_nh_exception *fnhe,
1571                struct fib_info *fi, u16 type, u32 itag,
1572                const bool do_cache)
1573 {
1574     bool cached = false;
1575 
1576     if (fi) {
1577         struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1578 
1579         if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
1580             rt->rt_uses_gateway = 1;
1581             rt->rt_gw_family = nhc->nhc_gw_family;
1582             /* only INET and INET6 are supported */
1583             if (likely(nhc->nhc_gw_family == AF_INET))
1584                 rt->rt_gw4 = nhc->nhc_gw.ipv4;
1585             else
1586                 rt->rt_gw6 = nhc->nhc_gw.ipv6;
1587         }
1588 
1589         ip_dst_init_metrics(&rt->dst, fi->fib_metrics);
1590 
1591 #ifdef CONFIG_IP_ROUTE_CLASSID
1592         if (nhc->nhc_family == AF_INET) {
1593             struct fib_nh *nh;
1594 
1595             nh = container_of(nhc, struct fib_nh, nh_common);
1596             rt->dst.tclassid = nh->nh_tclassid;
1597         }
1598 #endif
1599         rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
1600         if (unlikely(fnhe))
1601             cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1602         else if (do_cache)
1603             cached = rt_cache_route(nhc, rt);
1604         if (unlikely(!cached)) {
1605             /* Routes we intend to cache in nexthop exception or
1606              * FIB nexthop have the DST_NOCACHE bit clear.
1607              * However, if we are unsuccessful at storing this
1608              * route into the cache we really need to set it.
1609              */
1610             if (!rt->rt_gw4) {
1611                 rt->rt_gw_family = AF_INET;
1612                 rt->rt_gw4 = daddr;
1613             }
1614             rt_add_uncached_list(rt);
1615         }
1616     } else
1617         rt_add_uncached_list(rt);
1618 
1619 #ifdef CONFIG_IP_ROUTE_CLASSID
1620 #ifdef CONFIG_IP_MULTIPLE_TABLES
1621     set_class_tag(rt, res->tclassid);
1622 #endif
1623     set_class_tag(rt, itag);
1624 #endif
1625 }
1626 
1627 struct rtable *rt_dst_alloc(struct net_device *dev,
1628                 unsigned int flags, u16 type,
1629                 bool noxfrm)
1630 {
1631     struct rtable *rt;
1632 
1633     rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1634                (noxfrm ? DST_NOXFRM : 0));
1635 
1636     if (rt) {
1637         rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1638         rt->rt_flags = flags;
1639         rt->rt_type = type;
1640         rt->rt_is_input = 0;
1641         rt->rt_iif = 0;
1642         rt->rt_pmtu = 0;
1643         rt->rt_mtu_locked = 0;
1644         rt->rt_uses_gateway = 0;
1645         rt->rt_gw_family = 0;
1646         rt->rt_gw4 = 0;
1647         INIT_LIST_HEAD(&rt->rt_uncached);
1648 
1649         rt->dst.output = ip_output;
1650         if (flags & RTCF_LOCAL)
1651             rt->dst.input = ip_local_deliver;
1652     }
1653 
1654     return rt;
1655 }
1656 EXPORT_SYMBOL(rt_dst_alloc);
1657 
1658 struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
1659 {
1660     struct rtable *new_rt;
1661 
1662     new_rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1663                rt->dst.flags);
1664 
1665     if (new_rt) {
1666         new_rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1667         new_rt->rt_flags = rt->rt_flags;
1668         new_rt->rt_type = rt->rt_type;
1669         new_rt->rt_is_input = rt->rt_is_input;
1670         new_rt->rt_iif = rt->rt_iif;
1671         new_rt->rt_pmtu = rt->rt_pmtu;
1672         new_rt->rt_mtu_locked = rt->rt_mtu_locked;
1673         new_rt->rt_gw_family = rt->rt_gw_family;
1674         if (rt->rt_gw_family == AF_INET)
1675             new_rt->rt_gw4 = rt->rt_gw4;
1676         else if (rt->rt_gw_family == AF_INET6)
1677             new_rt->rt_gw6 = rt->rt_gw6;
1678         INIT_LIST_HEAD(&new_rt->rt_uncached);
1679 
1680         new_rt->dst.input = rt->dst.input;
1681         new_rt->dst.output = rt->dst.output;
1682         new_rt->dst.error = rt->dst.error;
1683         new_rt->dst.lastuse = jiffies;
1684         new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate);
1685     }
1686     return new_rt;
1687 }
1688 EXPORT_SYMBOL(rt_dst_clone);
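
/* A hypothetical sketch of when rt_dst_clone() helps: give an skb its own
 * copy of the current route, re-bound to @dev, before handing the packet
 * to a different device.  example_retarget_dst() is an invented name.
 */
static int example_retarget_dst(struct sk_buff *skb, struct net_device *dev)
{
    struct rtable *rt = skb_rtable(skb);
    struct rtable *new_rt;

    new_rt = rt_dst_clone(dev, rt);
    if (!new_rt)
        return -ENOBUFS;

    skb_dst_drop(skb);          /* drop the shared reference first */
    skb_dst_set(skb, &new_rt->dst);
    return 0;
}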
1689 
1690 /* called in rcu_read_lock() section */
1691 int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1692               u8 tos, struct net_device *dev,
1693               struct in_device *in_dev, u32 *itag)
1694 {
1695     int err;
1696 
1697     /* Primary sanity checks. */
1698     if (!in_dev)
1699         return -EINVAL;
1700 
1701     if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1702         skb->protocol != htons(ETH_P_IP))
1703         return -EINVAL;
1704 
1705     if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1706         return -EINVAL;
1707 
1708     if (ipv4_is_zeronet(saddr)) {
1709         if (!ipv4_is_local_multicast(daddr) &&
1710             ip_hdr(skb)->protocol != IPPROTO_IGMP)
1711             return -EINVAL;
1712     } else {
1713         err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1714                       in_dev, itag);
1715         if (err < 0)
1716             return err;
1717     }
1718     return 0;
1719 }
1720 
1721 /* called in rcu_read_lock() section */
1722 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1723                  u8 tos, struct net_device *dev, int our)
1724 {
1725     struct in_device *in_dev = __in_dev_get_rcu(dev);
1726     unsigned int flags = RTCF_MULTICAST;
1727     struct rtable *rth;
1728     u32 itag = 0;
1729     int err;
1730 
1731     err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1732     if (err)
1733         return err;
1734 
1735     if (our)
1736         flags |= RTCF_LOCAL;
1737 
1738     if (IN_DEV_ORCONF(in_dev, NOPOLICY))
1739         IPCB(skb)->flags |= IPSKB_NOPOLICY;
1740 
1741     rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1742                false);
1743     if (!rth)
1744         return -ENOBUFS;
1745 
1746 #ifdef CONFIG_IP_ROUTE_CLASSID
1747     rth->dst.tclassid = itag;
1748 #endif
1749     rth->dst.output = ip_rt_bug;
1750     rth->rt_is_input = 1;
1751 
1752 #ifdef CONFIG_IP_MROUTE
1753     if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1754         rth->dst.input = ip_mr_input;
1755 #endif
1756     RT_CACHE_STAT_INC(in_slow_mc);
1757 
1758     skb_dst_drop(skb);
1759     skb_dst_set(skb, &rth->dst);
1760     return 0;
1761 }
1762 
1763 
1764 static void ip_handle_martian_source(struct net_device *dev,
1765                      struct in_device *in_dev,
1766                      struct sk_buff *skb,
1767                      __be32 daddr,
1768                      __be32 saddr)
1769 {
1770     RT_CACHE_STAT_INC(in_martian_src);
1771 #ifdef CONFIG_IP_ROUTE_VERBOSE
1772     if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1773         /*
1774          *  RFC 1812 recommendation: if the source is martian,
1775          *  the only hint we have is the MAC header.
1776          */
1777         pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1778             &daddr, &saddr, dev->name);
1779         if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1780             print_hex_dump(KERN_WARNING, "ll header: ",
1781                        DUMP_PREFIX_OFFSET, 16, 1,
1782                        skb_mac_header(skb),
1783                        dev->hard_header_len, false);
1784         }
1785     }
1786 #endif
1787 }
1788 
1789 /* called in rcu_read_lock() section */
1790 static int __mkroute_input(struct sk_buff *skb,
1791                const struct fib_result *res,
1792                struct in_device *in_dev,
1793                __be32 daddr, __be32 saddr, u32 tos)
1794 {
1795     struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1796     struct net_device *dev = nhc->nhc_dev;
1797     struct fib_nh_exception *fnhe;
1798     struct rtable *rth;
1799     int err;
1800     struct in_device *out_dev;
1801     bool do_cache;
1802     u32 itag = 0;
1803 
1804     /* get a working reference to the output device */
1805     out_dev = __in_dev_get_rcu(dev);
1806     if (!out_dev) {
1807         net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1808         return -EINVAL;
1809     }
1810 
1811     err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1812                   in_dev->dev, in_dev, &itag);
1813     if (err < 0) {
1814         ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1815                      saddr);
1816 
1817         goto cleanup;
1818     }
1819 
1820     do_cache = res->fi && !itag;
1821     if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1822         skb->protocol == htons(ETH_P_IP)) {
1823         __be32 gw;
1824 
1825         gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
1826         if (IN_DEV_SHARED_MEDIA(out_dev) ||
1827             inet_addr_onlink(out_dev, saddr, gw))
1828             IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1829     }
1830 
1831     if (skb->protocol != htons(ETH_P_IP)) {
1832         /* Not IP (i.e. ARP). Do not create a route if it is
1833          * invalid for proxy ARP. DNAT routes are always valid.
1834          *
1835          * The proxy ARP feature has been extended to allow ARP
1836          * replies back to the same interface, to support
1837          * Private VLAN switch technologies. See arp.c.
1838          */
1839         if (out_dev == in_dev &&
1840             IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1841             err = -EINVAL;
1842             goto cleanup;
1843         }
1844     }
1845 
1846     if (IN_DEV_ORCONF(in_dev, NOPOLICY))
1847         IPCB(skb)->flags |= IPSKB_NOPOLICY;
1848 
1849     fnhe = find_exception(nhc, daddr);
1850     if (do_cache) {
1851         if (fnhe)
1852             rth = rcu_dereference(fnhe->fnhe_rth_input);
1853         else
1854             rth = rcu_dereference(nhc->nhc_rth_input);
1855         if (rt_cache_valid(rth)) {
1856             skb_dst_set_noref(skb, &rth->dst);
1857             goto out;
1858         }
1859     }
1860 
1861     rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1862                IN_DEV_ORCONF(out_dev, NOXFRM));
1863     if (!rth) {
1864         err = -ENOBUFS;
1865         goto cleanup;
1866     }
1867 
1868     rth->rt_is_input = 1;
1869     RT_CACHE_STAT_INC(in_slow_tot);
1870 
1871     rth->dst.input = ip_forward;
1872 
1873     rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1874                do_cache);
1875     lwtunnel_set_redirect(&rth->dst);
1876     skb_dst_set(skb, &rth->dst);
1877 out:
1878     err = 0;
1879  cleanup:
1880     return err;
1881 }
1882 
1883 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1884 /* To make ICMP packets follow the right flow, the multipath hash is
1885  * calculated from the inner IP addresses.
1886  */
1887 static void ip_multipath_l3_keys(const struct sk_buff *skb,
1888                  struct flow_keys *hash_keys)
1889 {
1890     const struct iphdr *outer_iph = ip_hdr(skb);
1891     const struct iphdr *key_iph = outer_iph;
1892     const struct iphdr *inner_iph;
1893     const struct icmphdr *icmph;
1894     struct iphdr _inner_iph;
1895     struct icmphdr _icmph;
1896 
1897     if (likely(outer_iph->protocol != IPPROTO_ICMP))
1898         goto out;
1899 
1900     if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1901         goto out;
1902 
1903     icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1904                    &_icmph);
1905     if (!icmph)
1906         goto out;
1907 
1908     if (!icmp_is_err(icmph->type))
1909         goto out;
1910 
1911     inner_iph = skb_header_pointer(skb,
1912                        outer_iph->ihl * 4 + sizeof(_icmph),
1913                        sizeof(_inner_iph), &_inner_iph);
1914     if (!inner_iph)
1915         goto out;
1916 
1917     key_iph = inner_iph;
1918 out:
1919     hash_keys->addrs.v4addrs.src = key_iph->saddr;
1920     hash_keys->addrs.v4addrs.dst = key_iph->daddr;
1921 }
1922 
1923 static u32 fib_multipath_custom_hash_outer(const struct net *net,
1924                        const struct sk_buff *skb,
1925                        bool *p_has_inner)
1926 {
1927     u32 hash_fields = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_fields);
1928     struct flow_keys keys, hash_keys;
1929 
1930     if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK))
1931         return 0;
1932 
1933     memset(&hash_keys, 0, sizeof(hash_keys));
1934     skb_flow_dissect_flow_keys(skb, &keys, FLOW_DISSECTOR_F_STOP_AT_ENCAP);
1935 
1936     hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1937     if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP)
1938         hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
1939     if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP)
1940         hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
1941     if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO)
1942         hash_keys.basic.ip_proto = keys.basic.ip_proto;
1943     if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT)
1944         hash_keys.ports.src = keys.ports.src;
1945     if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT)
1946         hash_keys.ports.dst = keys.ports.dst;
1947 
1948     *p_has_inner = !!(keys.control.flags & FLOW_DIS_ENCAPSULATION);
1949     return flow_hash_from_keys(&hash_keys);
1950 }
1951 
1952 static u32 fib_multipath_custom_hash_inner(const struct net *net,
1953                        const struct sk_buff *skb,
1954                        bool has_inner)
1955 {
1956     u32 hash_fields = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_fields);
1957     struct flow_keys keys, hash_keys;
1958 
1959     /* We assume the packet carries an encapsulation, but if none was
1960      * encountered during dissection of the outer flow, then there is no
1961      * point in calling the flow dissector again.
1962      */
1963     if (!has_inner)
1964         return 0;
1965 
1966     if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_MASK))
1967         return 0;
1968 
1969     memset(&hash_keys, 0, sizeof(hash_keys));
1970     skb_flow_dissect_flow_keys(skb, &keys, 0);
1971 
1972     if (!(keys.control.flags & FLOW_DIS_ENCAPSULATION))
1973         return 0;
1974 
1975     if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
1976         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1977         if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP)
1978             hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
1979         if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP)
1980             hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
1981     } else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
1982         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1983         if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP)
1984             hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
1985         if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP)
1986             hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
1987         if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_FLOWLABEL)
1988             hash_keys.tags.flow_label = keys.tags.flow_label;
1989     }
1990 
1991     if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_IP_PROTO)
1992         hash_keys.basic.ip_proto = keys.basic.ip_proto;
1993     if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_PORT)
1994         hash_keys.ports.src = keys.ports.src;
1995     if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_PORT)
1996         hash_keys.ports.dst = keys.ports.dst;
1997 
1998     return flow_hash_from_keys(&hash_keys);
1999 }
2000 
2001 static u32 fib_multipath_custom_hash_skb(const struct net *net,
2002                      const struct sk_buff *skb)
2003 {
2004     u32 mhash, mhash_inner;
2005     bool has_inner = true;
2006 
2007     mhash = fib_multipath_custom_hash_outer(net, skb, &has_inner);
2008     mhash_inner = fib_multipath_custom_hash_inner(net, skb, has_inner);
2009 
2010     return jhash_2words(mhash, mhash_inner, 0);
2011 }
2012 
2013 static u32 fib_multipath_custom_hash_fl4(const struct net *net,
2014                      const struct flowi4 *fl4)
2015 {
2016     u32 hash_fields = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_fields);
2017     struct flow_keys hash_keys;
2018 
2019     if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK))
2020         return 0;
2021 
2022     memset(&hash_keys, 0, sizeof(hash_keys));
2023     hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2024     if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP)
2025         hash_keys.addrs.v4addrs.src = fl4->saddr;
2026     if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP)
2027         hash_keys.addrs.v4addrs.dst = fl4->daddr;
2028     if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO)
2029         hash_keys.basic.ip_proto = fl4->flowi4_proto;
2030     if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT)
2031         hash_keys.ports.src = fl4->fl4_sport;
2032     if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT)
2033         hash_keys.ports.dst = fl4->fl4_dport;
2034 
2035     return flow_hash_from_keys(&hash_keys);
2036 }
2037 
2038 /* If skb is set, it will be used and fl4 can be NULL. */
2039 int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
2040                const struct sk_buff *skb, struct flow_keys *flkeys)
2041 {
2042     u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
2043     struct flow_keys hash_keys;
2044     u32 mhash = 0;
2045 
2046     switch (READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_policy)) {
2047     case 0:
2048         memset(&hash_keys, 0, sizeof(hash_keys));
2049         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2050         if (skb) {
2051             ip_multipath_l3_keys(skb, &hash_keys);
2052         } else {
2053             hash_keys.addrs.v4addrs.src = fl4->saddr;
2054             hash_keys.addrs.v4addrs.dst = fl4->daddr;
2055         }
2056         mhash = flow_hash_from_keys(&hash_keys);
2057         break;
2058     case 1:
2059         /* skb is currently provided only when forwarding */
2060         if (skb) {
2061             unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
2062             struct flow_keys keys;
2063 
2064             /* short-circuit if we already have L4 hash present */
2065             if (skb->l4_hash)
2066                 return skb_get_hash_raw(skb) >> 1;
2067 
2068             memset(&hash_keys, 0, sizeof(hash_keys));
2069 
2070             if (!flkeys) {
2071                 skb_flow_dissect_flow_keys(skb, &keys, flag);
2072                 flkeys = &keys;
2073             }
2074 
2075             hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2076             hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
2077             hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
2078             hash_keys.ports.src = flkeys->ports.src;
2079             hash_keys.ports.dst = flkeys->ports.dst;
2080             hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
2081         } else {
2082             memset(&hash_keys, 0, sizeof(hash_keys));
2083             hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2084             hash_keys.addrs.v4addrs.src = fl4->saddr;
2085             hash_keys.addrs.v4addrs.dst = fl4->daddr;
2086             hash_keys.ports.src = fl4->fl4_sport;
2087             hash_keys.ports.dst = fl4->fl4_dport;
2088             hash_keys.basic.ip_proto = fl4->flowi4_proto;
2089         }
2090         mhash = flow_hash_from_keys(&hash_keys);
2091         break;
2092     case 2:
2093         memset(&hash_keys, 0, sizeof(hash_keys));
2094         /* skb is currently provided only when forwarding */
2095         if (skb) {
2096             struct flow_keys keys;
2097 
2098             skb_flow_dissect_flow_keys(skb, &keys, 0);
2099             /* Inner can be v4 or v6 */
2100             if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
2101                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2102                 hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
2103                 hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
2104             } else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
2105                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2106                 hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
2107                 hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
2108                 hash_keys.tags.flow_label = keys.tags.flow_label;
2109                 hash_keys.basic.ip_proto = keys.basic.ip_proto;
2110             } else {
2111                 /* Same as case 0 */
2112                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2113                 ip_multipath_l3_keys(skb, &hash_keys);
2114             }
2115         } else {
2116             /* Same as case 0 */
2117             hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2118             hash_keys.addrs.v4addrs.src = fl4->saddr;
2119             hash_keys.addrs.v4addrs.dst = fl4->daddr;
2120         }
2121         mhash = flow_hash_from_keys(&hash_keys);
2122         break;
2123     case 3:
2124         if (skb)
2125             mhash = fib_multipath_custom_hash_skb(net, skb);
2126         else
2127             mhash = fib_multipath_custom_hash_fl4(net, fl4);
2128         break;
2129     }
2130 
2131     if (multipath_hash)
2132         mhash = jhash_2words(mhash, multipath_hash, 0);
2133 
2134     return mhash >> 1;
2135 }
2136 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
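
#ifdef CONFIG_IP_ROUTE_MULTIPATH
/* Sketch of how the hash above is consumed (compare ip_mkroute_input()
 * just below and fib_select_path()): hash the flow, then let
 * fib_select_multipath() pick the nexthop whose hash upper bound the
 * value falls under.  example_pick_nexthop() is an invented name.
 */
static void example_pick_nexthop(struct net *net, struct fib_result *res,
                                 struct flowi4 *fl4, const struct sk_buff *skb)
{
    if (res->fi && fib_info_num_path(res->fi) > 1) {
        int h = fib_multipath_hash(net, fl4, skb, NULL);

        fib_select_multipath(res, h);
    }
}
#endif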
2137 
2138 static int ip_mkroute_input(struct sk_buff *skb,
2139                 struct fib_result *res,
2140                 struct in_device *in_dev,
2141                 __be32 daddr, __be32 saddr, u32 tos,
2142                 struct flow_keys *hkeys)
2143 {
2144 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2145     if (res->fi && fib_info_num_path(res->fi) > 1) {
2146         int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
2147 
2148         fib_select_multipath(res, h);
2149     }
2150 #endif
2151 
2152     /* create a routing cache entry */
2153     return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
2154 }
2155 
2156 /* Implements the same saddr-related checks as ip_route_input_slow(),
2157  * assuming daddr is valid and the destination is not a local broadcast
2158  * address. Uses the provided hint instead of performing a route lookup.
2159  */
2160 int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2161               u8 tos, struct net_device *dev,
2162               const struct sk_buff *hint)
2163 {
2164     struct in_device *in_dev = __in_dev_get_rcu(dev);
2165     struct rtable *rt = skb_rtable(hint);
2166     struct net *net = dev_net(dev);
2167     int err = -EINVAL;
2168     u32 tag = 0;
2169 
2170     if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2171         goto martian_source;
2172 
2173     if (ipv4_is_zeronet(saddr))
2174         goto martian_source;
2175 
2176     if (ipv4_is_loopback(saddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2177         goto martian_source;
2178 
2179     if (rt->rt_type != RTN_LOCAL)
2180         goto skip_validate_source;
2181 
2182     tos &= IPTOS_RT_MASK;
2183     err = fib_validate_source(skb, saddr, daddr, tos, 0, dev, in_dev, &tag);
2184     if (err < 0)
2185         goto martian_source;
2186 
2187 skip_validate_source:
2188     skb_dst_copy(skb, hint);
2189     return 0;
2190 
2191 martian_source:
2192     ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2193     return err;
2194 }
2195 
2196 /* get device for dst_alloc with local routes */
2197 static struct net_device *ip_rt_get_dev(struct net *net,
2198                     const struct fib_result *res)
2199 {
2200     struct fib_nh_common *nhc = res->fi ? res->nhc : NULL;
2201     struct net_device *dev = NULL;
2202 
2203     if (nhc)
2204         dev = l3mdev_master_dev_rcu(nhc->nhc_dev);
2205 
2206     return dev ? : net->loopback_dev;
2207 }
2208 
2209 /*
2210  *  NOTE. We drop all packets that have a local source
2211  *  address, because every properly looped-back packet
2212  *  must already have the correct destination attached by the
2213  *  output routine.  Changes in the enforced policies must also
2214  *  be applied to ip_route_use_hint().
2215  *
2216  *  This approach solves two big problems:
2217  *  1. Non-simplex devices are handled properly.
2218  *  2. IP spoofing attempts are filtered with a 100% guarantee.
2219  *  Called with rcu_read_lock().
2220  */
2221 
2222 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2223                    u8 tos, struct net_device *dev,
2224                    struct fib_result *res)
2225 {
2226     struct in_device *in_dev = __in_dev_get_rcu(dev);
2227     struct flow_keys *flkeys = NULL, _flkeys;
2228     struct net    *net = dev_net(dev);
2229     struct ip_tunnel_info *tun_info;
2230     int     err = -EINVAL;
2231     unsigned int    flags = 0;
2232     u32     itag = 0;
2233     struct rtable   *rth;
2234     struct flowi4   fl4;
2235     bool do_cache = true;
2236 
2237     /* IP on this device is disabled. */
2238 
2239     if (!in_dev)
2240         goto out;
2241 
2242     /* Check for the weirdest martians, which cannot be detected
2243      * by fib_lookup.
2244      */
2245 
2246     tun_info = skb_tunnel_info(skb);
2247     if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2248         fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
2249     else
2250         fl4.flowi4_tun_key.tun_id = 0;
2251     skb_dst_drop(skb);
2252 
2253     if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2254         goto martian_source;
2255 
2256     res->fi = NULL;
2257     res->table = NULL;
2258     if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2259         goto brd_input;
2260 
2261     /* Accept zero addresses only for limited broadcast;
2262      * I do not even know whether to fix it or not. Waiting for complaints :-)
2263      */
2264     if (ipv4_is_zeronet(saddr))
2265         goto martian_source;
2266 
2267     if (ipv4_is_zeronet(daddr))
2268         goto martian_destination;
2269 
2270     /* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
2271      * calling it only once if daddr and/or saddr is a loopback address.
2272      */
2273     if (ipv4_is_loopback(daddr)) {
2274         if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2275             goto martian_destination;
2276     } else if (ipv4_is_loopback(saddr)) {
2277         if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2278             goto martian_source;
2279     }
2280 
2281     /*
2282      *  Now we are ready to route packet.
2283      */
2284     fl4.flowi4_l3mdev = 0;
2285     fl4.flowi4_oif = 0;
2286     fl4.flowi4_iif = dev->ifindex;
2287     fl4.flowi4_mark = skb->mark;
2288     fl4.flowi4_tos = tos;
2289     fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2290     fl4.flowi4_flags = 0;
2291     fl4.daddr = daddr;
2292     fl4.saddr = saddr;
2293     fl4.flowi4_uid = sock_net_uid(net, NULL);
2294     fl4.flowi4_multipath_hash = 0;
2295 
2296     if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
2297         flkeys = &_flkeys;
2298     } else {
2299         fl4.flowi4_proto = 0;
2300         fl4.fl4_sport = 0;
2301         fl4.fl4_dport = 0;
2302     }
2303 
2304     err = fib_lookup(net, &fl4, res, 0);
2305     if (err != 0) {
2306         if (!IN_DEV_FORWARD(in_dev))
2307             err = -EHOSTUNREACH;
2308         goto no_route;
2309     }
2310 
2311     if (res->type == RTN_BROADCAST) {
2312         if (IN_DEV_BFORWARD(in_dev))
2313             goto make_route;
2314         /* do not cache if bc_forwarding is enabled */
2315         if (IPV4_DEVCONF_ALL(net, BC_FORWARDING))
2316             do_cache = false;
2317         goto brd_input;
2318     }
2319 
2320     if (res->type == RTN_LOCAL) {
2321         err = fib_validate_source(skb, saddr, daddr, tos,
2322                       0, dev, in_dev, &itag);
2323         if (err < 0)
2324             goto martian_source;
2325         goto local_input;
2326     }
2327 
2328     if (!IN_DEV_FORWARD(in_dev)) {
2329         err = -EHOSTUNREACH;
2330         goto no_route;
2331     }
2332     if (res->type != RTN_UNICAST)
2333         goto martian_destination;
2334 
2335 make_route:
2336     err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
2337 out:    return err;
2338 
2339 brd_input:
2340     if (skb->protocol != htons(ETH_P_IP))
2341         goto e_inval;
2342 
2343     if (!ipv4_is_zeronet(saddr)) {
2344         err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2345                       in_dev, &itag);
2346         if (err < 0)
2347             goto martian_source;
2348     }
2349     flags |= RTCF_BROADCAST;
2350     res->type = RTN_BROADCAST;
2351     RT_CACHE_STAT_INC(in_brd);
2352 
2353 local_input:
2354     if (IN_DEV_ORCONF(in_dev, NOPOLICY))
2355         IPCB(skb)->flags |= IPSKB_NOPOLICY;
2356 
2357     do_cache &= res->fi && !itag;
2358     if (do_cache) {
2359         struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2360 
2361         rth = rcu_dereference(nhc->nhc_rth_input);
2362         if (rt_cache_valid(rth)) {
2363             skb_dst_set_noref(skb, &rth->dst);
2364             err = 0;
2365             goto out;
2366         }
2367     }
2368 
2369     rth = rt_dst_alloc(ip_rt_get_dev(net, res),
2370                flags | RTCF_LOCAL, res->type, false);
2371     if (!rth)
2372         goto e_nobufs;
2373 
2374     rth->dst.output = ip_rt_bug;
2375 #ifdef CONFIG_IP_ROUTE_CLASSID
2376     rth->dst.tclassid = itag;
2377 #endif
2378     rth->rt_is_input = 1;
2379 
2380     RT_CACHE_STAT_INC(in_slow_tot);
2381     if (res->type == RTN_UNREACHABLE) {
2382         rth->dst.input = ip_error;
2383         rth->dst.error = -err;
2384         rth->rt_flags &= ~RTCF_LOCAL;
2385     }
2386 
2387     if (do_cache) {
2388         struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2389 
2390         rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
2391         if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2392             WARN_ON(rth->dst.input == lwtunnel_input);
2393             rth->dst.lwtstate->orig_input = rth->dst.input;
2394             rth->dst.input = lwtunnel_input;
2395         }
2396 
2397         if (unlikely(!rt_cache_route(nhc, rth)))
2398             rt_add_uncached_list(rth);
2399     }
2400     skb_dst_set(skb, &rth->dst);
2401     err = 0;
2402     goto out;
2403 
2404 no_route:
2405     RT_CACHE_STAT_INC(in_no_route);
2406     res->type = RTN_UNREACHABLE;
2407     res->fi = NULL;
2408     res->table = NULL;
2409     goto local_input;
2410 
2411     /*
2412      *  Do not cache martian addresses: they should be logged (RFC 1812)
2413      */
2414 martian_destination:
2415     RT_CACHE_STAT_INC(in_martian_dst);
2416 #ifdef CONFIG_IP_ROUTE_VERBOSE
2417     if (IN_DEV_LOG_MARTIANS(in_dev))
2418         net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2419                      &daddr, &saddr, dev->name);
2420 #endif
2421 
2422 e_inval:
2423     err = -EINVAL;
2424     goto out;
2425 
2426 e_nobufs:
2427     err = -ENOBUFS;
2428     goto out;
2429 
2430 martian_source:
2431     ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2432     goto out;
2433 }
2434 
2435 /* called with rcu_read_lock held */
2436 static int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2437                   u8 tos, struct net_device *dev, struct fib_result *res)
2438 {
2439     /* Multicast recognition logic was moved from the route cache to here.
2440      * The problem was that too many Ethernet cards have broken/missing
2441      * hardware multicast filters :-( As a result, a host on a multicast
2442      * network acquires a lot of useless route cache entries, e.g. for
2443      * SDR messages from all over the world. Now we try to get rid of them.
2444      * Really, provided the software IP multicast filter is organized
2445      * reasonably (at least, hashed), it does not result in a slowdown
2446      * compared with route cache reject entries.
2447      * Note that multicast routers are not affected, because
2448      * a route cache entry is created eventually.
2449      */
2450     if (ipv4_is_multicast(daddr)) {
2451         struct in_device *in_dev = __in_dev_get_rcu(dev);
2452         int our = 0;
2453         int err = -EINVAL;
2454 
2455         if (!in_dev)
2456             return err;
2457         our = ip_check_mc_rcu(in_dev, daddr, saddr,
2458                       ip_hdr(skb)->protocol);
2459 
2460         /* check l3 master if no match yet */
2461         if (!our && netif_is_l3_slave(dev)) {
2462             struct in_device *l3_in_dev;
2463 
2464             l3_in_dev = __in_dev_get_rcu(skb->dev);
2465             if (l3_in_dev)
2466                 our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2467                               ip_hdr(skb)->protocol);
2468         }
2469 
2470         if (our
2471 #ifdef CONFIG_IP_MROUTE
2472             ||
2473             (!ipv4_is_local_multicast(daddr) &&
2474              IN_DEV_MFORWARD(in_dev))
2475 #endif
2476            ) {
2477             err = ip_route_input_mc(skb, daddr, saddr,
2478                         tos, dev, our);
2479         }
2480         return err;
2481     }
2482 
2483     return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2484 }
2485 
2486 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2487              u8 tos, struct net_device *dev)
2488 {
2489     struct fib_result res;
2490     int err;
2491 
2492     tos &= IPTOS_RT_MASK;
2493     rcu_read_lock();
2494     err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2495     rcu_read_unlock();
2496 
2497     return err;
2498 }
2499 EXPORT_SYMBOL(ip_route_input_noref);
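
/* A minimal sketch of the classic call site (cf. ip_rcv_finish_core() in
 * ip_input.c): route a received packet by its own header fields, then let
 * dst.input deliver or forward it.  example_route_and_deliver() is an
 * invented name.
 */
static int example_route_and_deliver(struct sk_buff *skb,
                                     struct net_device *dev)
{
    const struct iphdr *iph = ip_hdr(skb);
    int err;

    err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
                               iph->tos, dev);
    if (err) {
        kfree_skb(skb);
        return err;
    }

    return dst_input(skb);  /* ip_local_deliver() or ip_forward() */
}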
2500 
2501 /* called with rcu_read_lock() */
2502 static struct rtable *__mkroute_output(const struct fib_result *res,
2503                        const struct flowi4 *fl4, int orig_oif,
2504                        struct net_device *dev_out,
2505                        unsigned int flags)
2506 {
2507     struct fib_info *fi = res->fi;
2508     struct fib_nh_exception *fnhe;
2509     struct in_device *in_dev;
2510     u16 type = res->type;
2511     struct rtable *rth;
2512     bool do_cache;
2513 
2514     in_dev = __in_dev_get_rcu(dev_out);
2515     if (!in_dev)
2516         return ERR_PTR(-EINVAL);
2517 
2518     if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2519         if (ipv4_is_loopback(fl4->saddr) &&
2520             !(dev_out->flags & IFF_LOOPBACK) &&
2521             !netif_is_l3_master(dev_out))
2522             return ERR_PTR(-EINVAL);
2523 
2524     if (ipv4_is_lbcast(fl4->daddr))
2525         type = RTN_BROADCAST;
2526     else if (ipv4_is_multicast(fl4->daddr))
2527         type = RTN_MULTICAST;
2528     else if (ipv4_is_zeronet(fl4->daddr))
2529         return ERR_PTR(-EINVAL);
2530 
2531     if (dev_out->flags & IFF_LOOPBACK)
2532         flags |= RTCF_LOCAL;
2533 
2534     do_cache = true;
2535     if (type == RTN_BROADCAST) {
2536         flags |= RTCF_BROADCAST | RTCF_LOCAL;
2537         fi = NULL;
2538     } else if (type == RTN_MULTICAST) {
2539         flags |= RTCF_MULTICAST | RTCF_LOCAL;
2540         if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2541                      fl4->flowi4_proto))
2542             flags &= ~RTCF_LOCAL;
2543         else
2544             do_cache = false;
2545         /* If a multicast route does not exist, use the
2546          * default one, but do not use a gateway in this case.
2547          * Yes, it is a hack.
2548          */
2549         if (fi && res->prefixlen < 4)
2550             fi = NULL;
2551     } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2552            (orig_oif != dev_out->ifindex)) {
2553         /* For local routes that require a particular output interface
2554          * we do not want to cache the result.  Caching the result
2555          * causes incorrect behaviour when there are multiple source
2556          * addresses on the interface, the end result being that if the
2557          * intended recipient is waiting on that interface for the
2558          * packet, it won't receive it because it will be delivered on
2559          * the loopback interface and the IP_PKTINFO ipi_ifindex will
2560          * be set to the loopback interface as well.
2561          */
2562         do_cache = false;
2563     }
2564 
2565     fnhe = NULL;
2566     do_cache &= fi != NULL;
2567     if (fi) {
2568         struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2569         struct rtable __rcu **prth;
2570 
2571         fnhe = find_exception(nhc, fl4->daddr);
2572         if (!do_cache)
2573             goto add;
2574         if (fnhe) {
2575             prth = &fnhe->fnhe_rth_output;
2576         } else {
2577             if (unlikely(fl4->flowi4_flags &
2578                      FLOWI_FLAG_KNOWN_NH &&
2579                      !(nhc->nhc_gw_family &&
2580                        nhc->nhc_scope == RT_SCOPE_LINK))) {
2581                 do_cache = false;
2582                 goto add;
2583             }
2584             prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
2585         }
2586         rth = rcu_dereference(*prth);
2587         if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2588             return rth;
2589     }
2590 
2591 add:
2592     rth = rt_dst_alloc(dev_out, flags, type,
2593                IN_DEV_ORCONF(in_dev, NOXFRM));
2594     if (!rth)
2595         return ERR_PTR(-ENOBUFS);
2596 
2597     rth->rt_iif = orig_oif;
2598 
2599     RT_CACHE_STAT_INC(out_slow_tot);
2600 
2601     if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2602         if (flags & RTCF_LOCAL &&
2603             !(dev_out->flags & IFF_LOOPBACK)) {
2604             rth->dst.output = ip_mc_output;
2605             RT_CACHE_STAT_INC(out_slow_mc);
2606         }
2607 #ifdef CONFIG_IP_MROUTE
2608         if (type == RTN_MULTICAST) {
2609             if (IN_DEV_MFORWARD(in_dev) &&
2610                 !ipv4_is_local_multicast(fl4->daddr)) {
2611                 rth->dst.input = ip_mr_input;
2612                 rth->dst.output = ip_mc_output;
2613             }
2614         }
2615 #endif
2616     }
2617 
2618     rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2619     lwtunnel_set_redirect(&rth->dst);
2620 
2621     return rth;
2622 }
2623 
2624 /*
2625  * Major route resolver routine.
2626  */
2627 
2628 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2629                     const struct sk_buff *skb)
2630 {
2631     struct fib_result res = {
2632         .type       = RTN_UNSPEC,
2633         .fi     = NULL,
2634         .table      = NULL,
2635         .tclassid   = 0,
2636     };
2637     struct rtable *rth;
2638 
2639     fl4->flowi4_iif = LOOPBACK_IFINDEX;
2640     ip_rt_fix_tos(fl4);
2641 
2642     rcu_read_lock();
2643     rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2644     rcu_read_unlock();
2645 
2646     return rth;
2647 }
2648 EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2649 
2650 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2651                         struct fib_result *res,
2652                         const struct sk_buff *skb)
2653 {
2654     struct net_device *dev_out = NULL;
2655     int orig_oif = fl4->flowi4_oif;
2656     unsigned int flags = 0;
2657     struct rtable *rth;
2658     int err;
2659 
2660     if (fl4->saddr) {
2661         if (ipv4_is_multicast(fl4->saddr) ||
2662             ipv4_is_lbcast(fl4->saddr) ||
2663             ipv4_is_zeronet(fl4->saddr)) {
2664             rth = ERR_PTR(-EINVAL);
2665             goto out;
2666         }
2667 
2668         rth = ERR_PTR(-ENETUNREACH);
2669 
2670         /* I removed the check for oif == dev_out->oif here.
2671          * It was wrong for two reasons:
2672          * 1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2673          *    is assigned to multiple interfaces.
2674          * 2. Moreover, we are allowed to send packets with the saddr
2675          *    of another iface. --ANK
2676          */
2677 
2678         if (fl4->flowi4_oif == 0 &&
2679             (ipv4_is_multicast(fl4->daddr) ||
2680              ipv4_is_lbcast(fl4->daddr))) {
2681             /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2682             dev_out = __ip_dev_find(net, fl4->saddr, false);
2683             if (!dev_out)
2684                 goto out;
2685 
2686             /* Special hack: the user can direct multicasts
2687              * and limited broadcast via the necessary interface
2688              * without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2689              * This hack is not just for fun, it allows
2690              * vic, vat and friends to work.
2691              * They bind a socket to loopback, set ttl to zero
2692              * and expect that it will work.
2693              * From the viewpoint of the routing cache they are broken,
2694              * because we are not allowed to build a multicast path
2695              * with a loopback source addr (look, the routing cache
2696              * cannot know that ttl is zero, so the packet
2697              * will not leave this host and the route is valid).
2698              * Luckily, this hack is a good workaround.
2699              */
2700 
2701             fl4->flowi4_oif = dev_out->ifindex;
2702             goto make_route;
2703         }
2704 
2705         if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2706             /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2707             if (!__ip_dev_find(net, fl4->saddr, false))
2708                 goto out;
2709         }
2710     }
2711 
2712 
2713     if (fl4->flowi4_oif) {
2714         dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2715         rth = ERR_PTR(-ENODEV);
2716         if (!dev_out)
2717             goto out;
2718 
2719         /* RACE: Check return value of inet_select_addr instead. */
2720         if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2721             rth = ERR_PTR(-ENETUNREACH);
2722             goto out;
2723         }
2724         if (ipv4_is_local_multicast(fl4->daddr) ||
2725             ipv4_is_lbcast(fl4->daddr) ||
2726             fl4->flowi4_proto == IPPROTO_IGMP) {
2727             if (!fl4->saddr)
2728                 fl4->saddr = inet_select_addr(dev_out, 0,
2729                                   RT_SCOPE_LINK);
2730             goto make_route;
2731         }
2732         if (!fl4->saddr) {
2733             if (ipv4_is_multicast(fl4->daddr))
2734                 fl4->saddr = inet_select_addr(dev_out, 0,
2735                                   fl4->flowi4_scope);
2736             else if (!fl4->daddr)
2737                 fl4->saddr = inet_select_addr(dev_out, 0,
2738                                   RT_SCOPE_HOST);
2739         }
2740     }
2741 
2742     if (!fl4->daddr) {
2743         fl4->daddr = fl4->saddr;
2744         if (!fl4->daddr)
2745             fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2746         dev_out = net->loopback_dev;
2747         fl4->flowi4_oif = LOOPBACK_IFINDEX;
2748         res->type = RTN_LOCAL;
2749         flags |= RTCF_LOCAL;
2750         goto make_route;
2751     }
2752 
2753     err = fib_lookup(net, fl4, res, 0);
2754     if (err) {
2755         res->fi = NULL;
2756         res->table = NULL;
2757         if (fl4->flowi4_oif &&
2758             (ipv4_is_multicast(fl4->daddr) || !fl4->flowi4_l3mdev)) {
2759             /* Apparently, the routing tables are wrong. Assume
2760              * that the destination is on-link.
2761              *
2762              * WHY? DW.
2763              * Because we are allowed to send to an iface
2764              * even if it has NO routes and NO assigned
2765              * addresses. When oif is specified, routing
2766              * tables are looked up with only one purpose:
2767              * to catch whether the destination is gatewayed rather
2768              * than direct. Moreover, if MSG_DONTROUTE is set,
2769              * we send the packet, ignoring both the routing tables
2770              * and the ifaddr state. --ANK
2771              *
2772              *
2773              * We could do this even when oif is unknown,
2774              * likely as IPv6 does, but we do not.
2775              */
2776 
2777             if (fl4->saddr == 0)
2778                 fl4->saddr = inet_select_addr(dev_out, 0,
2779                                   RT_SCOPE_LINK);
2780             res->type = RTN_UNICAST;
2781             goto make_route;
2782         }
2783         rth = ERR_PTR(err);
2784         goto out;
2785     }
2786 
2787     if (res->type == RTN_LOCAL) {
2788         if (!fl4->saddr) {
2789             if (res->fi->fib_prefsrc)
2790                 fl4->saddr = res->fi->fib_prefsrc;
2791             else
2792                 fl4->saddr = fl4->daddr;
2793         }
2794 
2795         /* L3 master device is the loopback for that domain */
2796         dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2797             net->loopback_dev;
2798 
2799         /* Make sure orig_oif points to the FIB result device even
2800          * though packet rx/tx happens over the loopback or l3mdev device.
2801          */
2802         orig_oif = FIB_RES_OIF(*res);
2803 
2804         fl4->flowi4_oif = dev_out->ifindex;
2805         flags |= RTCF_LOCAL;
2806         goto make_route;
2807     }
2808 
2809     fib_select_path(net, res, fl4, skb);
2810 
2811     dev_out = FIB_RES_DEV(*res);
2812 
2813 make_route:
2814     rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2815 
2816 out:
2817     return rth;
2818 }
2819 
2820 static struct dst_ops ipv4_dst_blackhole_ops = {
2821     .family         = AF_INET,
2822     .default_advmss     = ipv4_default_advmss,
2823     .neigh_lookup       = ipv4_neigh_lookup,
2824     .check          = dst_blackhole_check,
2825     .cow_metrics        = dst_blackhole_cow_metrics,
2826     .update_pmtu        = dst_blackhole_update_pmtu,
2827     .redirect       = dst_blackhole_redirect,
2828     .mtu            = dst_blackhole_mtu,
2829 };
2830 
2831 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2832 {
2833     struct rtable *ort = (struct rtable *) dst_orig;
2834     struct rtable *rt;
2835 
2836     rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2837     if (rt) {
2838         struct dst_entry *new = &rt->dst;
2839 
2840         new->__use = 1;
2841         new->input = dst_discard;
2842         new->output = dst_discard_out;
2843 
2844         new->dev = net->loopback_dev;
2845         netdev_hold(new->dev, &new->dev_tracker, GFP_ATOMIC);
2846 
2847         rt->rt_is_input = ort->rt_is_input;
2848         rt->rt_iif = ort->rt_iif;
2849         rt->rt_pmtu = ort->rt_pmtu;
2850         rt->rt_mtu_locked = ort->rt_mtu_locked;
2851 
2852         rt->rt_genid = rt_genid_ipv4(net);
2853         rt->rt_flags = ort->rt_flags;
2854         rt->rt_type = ort->rt_type;
2855         rt->rt_uses_gateway = ort->rt_uses_gateway;
2856         rt->rt_gw_family = ort->rt_gw_family;
2857         if (rt->rt_gw_family == AF_INET)
2858             rt->rt_gw4 = ort->rt_gw4;
2859         else if (rt->rt_gw_family == AF_INET6)
2860             rt->rt_gw6 = ort->rt_gw6;
2861 
2862         INIT_LIST_HEAD(&rt->rt_uncached);
2863     }
2864 
2865     dst_release(dst_orig);
2866 
2867     return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2868 }
2869 
2870 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2871                     const struct sock *sk)
2872 {
2873     struct rtable *rt = __ip_route_output_key(net, flp4);
2874 
2875     if (IS_ERR(rt))
2876         return rt;
2877 
2878     if (flp4->flowi4_proto) {
2879         flp4->flowi4_oif = rt->dst.dev->ifindex;
2880         rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2881                             flowi4_to_flowi(flp4),
2882                             sk, 0);
2883     }
2884 
2885     return rt;
2886 }
2887 EXPORT_SYMBOL_GPL(ip_route_output_flow);
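
/* A hypothetical output-path sketch: describe the flow, resolve a route
 * (the xfrm lookup above applies because flowi4_proto is set), then
 * release the reference when done.  example_output_route() and the
 * placeholder transmit step are invented for illustration.
 */
static int example_output_route(struct net *net, const struct sock *sk,
                                __be32 daddr)
{
    struct flowi4 fl4 = {
        .daddr          = daddr,
        .flowi4_proto   = IPPROTO_UDP,
    };
    struct rtable *rt;

    rt = ip_route_output_flow(net, &fl4, sk);
    if (IS_ERR(rt))
        return PTR_ERR(rt);

    /* ... build and transmit the packet via rt->dst ... */

    ip_rt_put(rt);      /* release the route reference */
    return 0;
}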
2888 
2889 struct rtable *ip_route_output_tunnel(struct sk_buff *skb,
2890                       struct net_device *dev,
2891                       struct net *net, __be32 *saddr,
2892                       const struct ip_tunnel_info *info,
2893                       u8 protocol, bool use_cache)
2894 {
2895 #ifdef CONFIG_DST_CACHE
2896     struct dst_cache *dst_cache;
2897 #endif
2898     struct rtable *rt = NULL;
2899     struct flowi4 fl4;
2900     __u8 tos;
2901 
2902 #ifdef CONFIG_DST_CACHE
2903     dst_cache = (struct dst_cache *)&info->dst_cache;
2904     if (use_cache) {
2905         rt = dst_cache_get_ip4(dst_cache, saddr);
2906         if (rt)
2907             return rt;
2908     }
2909 #endif
2910     memset(&fl4, 0, sizeof(fl4));
2911     fl4.flowi4_mark = skb->mark;
2912     fl4.flowi4_proto = protocol;
2913     fl4.daddr = info->key.u.ipv4.dst;
2914     fl4.saddr = info->key.u.ipv4.src;
2915     tos = info->key.tos;
2916     fl4.flowi4_tos = RT_TOS(tos);
2917 
2918     rt = ip_route_output_key(net, &fl4);
2919     if (IS_ERR(rt)) {
2920         netdev_dbg(dev, "no route to %pI4\n", &fl4.daddr);
2921         return ERR_PTR(-ENETUNREACH);
2922     }
2923     if (rt->dst.dev == dev) { /* is this necessary? */
2924         netdev_dbg(dev, "circular route to %pI4\n", &fl4.daddr);
2925         ip_rt_put(rt);
2926         return ERR_PTR(-ELOOP);
2927     }
2928 #ifdef CONFIG_DST_CACHE
2929     if (use_cache)
2930         dst_cache_set_ip4(dst_cache, &rt->dst, fl4.saddr);
2931 #endif
2932     *saddr = fl4.saddr;
2933     return rt;
2934 }
2935 EXPORT_SYMBOL_GPL(ip_route_output_tunnel);
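
/* Sketch of a tunnel driver's xmit path (a bareudp-style caller, shown
 * here as an assumption rather than a verbatim copy): resolve the underlay
 * route for the tunnel key in @info; the chosen source address comes back
 * through the saddr argument.  example_tunnel_route() is an invented name.
 */
static struct rtable *example_tunnel_route(struct sk_buff *skb,
                                           struct net_device *dev,
                                           struct net *net,
                                           const struct ip_tunnel_info *info,
                                           bool use_cache)
{
    struct rtable *rt;
    __be32 saddr;

    rt = ip_route_output_tunnel(skb, dev, net, &saddr, info,
                                IPPROTO_UDP, use_cache);
    if (IS_ERR(rt))
        return rt;  /* -ENETUNREACH or -ELOOP */

    /* saddr would be used as the outer source when building the encap */
    return rt;
}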
2936 
2937 /* called with rcu_read_lock held */
2938 static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2939             struct rtable *rt, u32 table_id, struct flowi4 *fl4,
2940             struct sk_buff *skb, u32 portid, u32 seq,
2941             unsigned int flags)
2942 {
2943     struct rtmsg *r;
2944     struct nlmsghdr *nlh;
2945     unsigned long expires = 0;
2946     u32 error;
2947     u32 metrics[RTAX_MAX];
2948 
2949     nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), flags);
2950     if (!nlh)
2951         return -EMSGSIZE;
2952 
2953     r = nlmsg_data(nlh);
2954     r->rtm_family    = AF_INET;
2955     r->rtm_dst_len  = 32;
2956     r->rtm_src_len  = 0;
2957     r->rtm_tos  = fl4 ? fl4->flowi4_tos : 0;
2958     r->rtm_table    = table_id < 256 ? table_id : RT_TABLE_COMPAT;
2959     if (nla_put_u32(skb, RTA_TABLE, table_id))
2960         goto nla_put_failure;
2961     r->rtm_type = rt->rt_type;
2962     r->rtm_scope    = RT_SCOPE_UNIVERSE;
2963     r->rtm_protocol = RTPROT_UNSPEC;
2964     r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2965     if (rt->rt_flags & RTCF_NOTIFY)
2966         r->rtm_flags |= RTM_F_NOTIFY;
2967     if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2968         r->rtm_flags |= RTCF_DOREDIRECT;
2969 
2970     if (nla_put_in_addr(skb, RTA_DST, dst))
2971         goto nla_put_failure;
2972     if (src) {
2973         r->rtm_src_len = 32;
2974         if (nla_put_in_addr(skb, RTA_SRC, src))
2975             goto nla_put_failure;
2976     }
2977     if (rt->dst.dev &&
2978         nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2979         goto nla_put_failure;
2980     if (rt->dst.lwtstate &&
2981         lwtunnel_fill_encap(skb, rt->dst.lwtstate, RTA_ENCAP, RTA_ENCAP_TYPE) < 0)
2982         goto nla_put_failure;
2983 #ifdef CONFIG_IP_ROUTE_CLASSID
2984     if (rt->dst.tclassid &&
2985         nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2986         goto nla_put_failure;
2987 #endif
2988     if (fl4 && !rt_is_input_route(rt) &&
2989         fl4->saddr != src) {
2990         if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2991             goto nla_put_failure;
2992     }
2993     if (rt->rt_uses_gateway) {
2994         if (rt->rt_gw_family == AF_INET &&
2995             nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
2996             goto nla_put_failure;
2997         } else if (rt->rt_gw_family == AF_INET6) {
2998             int alen = sizeof(struct in6_addr);
2999             struct nlattr *nla;
3000             struct rtvia *via;
3001 
3002             nla = nla_reserve(skb, RTA_VIA, alen + 2);
3003             if (!nla)
3004                 goto nla_put_failure;
3005 
3006             via = nla_data(nla);
3007             via->rtvia_family = AF_INET6;
3008             memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
3009         }
3010     }
3011 
3012     expires = rt->dst.expires;
3013     if (expires) {
3014         unsigned long now = jiffies;
3015 
3016         if (time_before(now, expires))
3017             expires -= now;
3018         else
3019             expires = 0;
3020     }
3021 
3022     memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
3023     if (rt->rt_pmtu && expires)
3024         metrics[RTAX_MTU - 1] = rt->rt_pmtu;
3025     if (rt->rt_mtu_locked && expires)
3026         metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
3027     if (rtnetlink_put_metrics(skb, metrics) < 0)
3028         goto nla_put_failure;
3029 
3030     if (fl4) {
3031         if (fl4->flowi4_mark &&
3032             nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
3033             goto nla_put_failure;
3034 
3035         if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
3036             nla_put_u32(skb, RTA_UID,
3037                 from_kuid_munged(current_user_ns(),
3038                          fl4->flowi4_uid)))
3039             goto nla_put_failure;
3040 
3041         if (rt_is_input_route(rt)) {
3042 #ifdef CONFIG_IP_MROUTE
3043             if (ipv4_is_multicast(dst) &&
3044                 !ipv4_is_local_multicast(dst) &&
3045                 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
3046                 int err = ipmr_get_route(net, skb,
3047                              fl4->saddr, fl4->daddr,
3048                              r, portid);
3049 
3050                 if (err <= 0) {
3051                     if (err == 0)
3052                         return 0;
3053                     goto nla_put_failure;
3054                 }
3055             } else
3056 #endif
3057                 if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
3058                     goto nla_put_failure;
3059         }
3060     }
3061 
3062     error = rt->dst.error;
3063 
3064     if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
3065         goto nla_put_failure;
3066 
3067     nlmsg_end(skb, nlh);
3068     return 0;
3069 
3070 nla_put_failure:
3071     nlmsg_cancel(skb, nlh);
3072     return -EMSGSIZE;
3073 }
3074 
3075 static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
3076                 struct netlink_callback *cb, u32 table_id,
3077                 struct fnhe_hash_bucket *bucket, int genid,
3078                 int *fa_index, int fa_start, unsigned int flags)
3079 {
3080     int i;
3081 
3082     for (i = 0; i < FNHE_HASH_SIZE; i++) {
3083         struct fib_nh_exception *fnhe;
3084 
3085         for (fnhe = rcu_dereference(bucket[i].chain); fnhe;
3086              fnhe = rcu_dereference(fnhe->fnhe_next)) {
3087             struct rtable *rt;
3088             int err;
3089 
3090             if (*fa_index < fa_start)
3091                 goto next;
3092 
3093             if (fnhe->fnhe_genid != genid)
3094                 goto next;
3095 
3096             if (fnhe->fnhe_expires &&
3097                 time_after(jiffies, fnhe->fnhe_expires))
3098                 goto next;
3099 
3100             rt = rcu_dereference(fnhe->fnhe_rth_input);
3101             if (!rt)
3102                 rt = rcu_dereference(fnhe->fnhe_rth_output);
3103             if (!rt)
3104                 goto next;
3105 
3106             err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt,
3107                        table_id, NULL, skb,
3108                        NETLINK_CB(cb->skb).portid,
3109                        cb->nlh->nlmsg_seq, flags);
3110             if (err)
3111                 return err;
3112 next:
3113             (*fa_index)++;
3114         }
3115     }
3116 
3117     return 0;
3118 }
3119 
3120 int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
3121                u32 table_id, struct fib_info *fi,
3122                int *fa_index, int fa_start, unsigned int flags)
3123 {
3124     struct net *net = sock_net(cb->skb->sk);
3125     int nhsel, genid = fnhe_genid(net);
3126 
3127     for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
3128         struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel);
3129         struct fnhe_hash_bucket *bucket;
3130         int err;
3131 
3132         if (nhc->nhc_flags & RTNH_F_DEAD)
3133             continue;
3134 
3135         rcu_read_lock();
3136         bucket = rcu_dereference(nhc->nhc_exceptions);
3137         err = 0;
3138         if (bucket)
3139             err = fnhe_dump_bucket(net, skb, cb, table_id, bucket,
3140                            genid, fa_index, fa_start,
3141                            flags);
3142         rcu_read_unlock();
3143         if (err)
3144             return err;
3145     }
3146 
3147     return 0;
3148 }
3149 
3150 static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
3151                            u8 ip_proto, __be16 sport,
3152                            __be16 dport)
3153 {
3154     struct sk_buff *skb;
3155     struct iphdr *iph;
3156 
3157     skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3158     if (!skb)
3159         return NULL;
3160 
3161     /* Reserve room for dummy headers; this skb can pass
3162      * through a good chunk of the routing engine.
3163      */
3164     skb_reset_mac_header(skb);
3165     skb_reset_network_header(skb);
3166     skb->protocol = htons(ETH_P_IP);
3167     iph = skb_put(skb, sizeof(struct iphdr));
3168     iph->protocol = ip_proto;
3169     iph->saddr = src;
3170     iph->daddr = dst;
3171     iph->version = 0x4;
3172     iph->frag_off = 0;
3173     iph->ihl = 0x5;
3174     skb_set_transport_header(skb, skb->len);
3175 
3176     switch (iph->protocol) {
3177     case IPPROTO_UDP: {
3178         struct udphdr *udph;
3179 
3180         udph = skb_put_zero(skb, sizeof(struct udphdr));
3181         udph->source = sport;
3182         udph->dest = dport;
3183         udph->len = htons(sizeof(struct udphdr));
3184         udph->check = 0;
3185         break;
3186     }
3187     case IPPROTO_TCP: {
3188         struct tcphdr *tcph;
3189 
3190         tcph = skb_put_zero(skb, sizeof(struct tcphdr));
3191         tcph->source    = sport;
3192         tcph->dest  = dport;
3193         tcph->doff  = sizeof(struct tcphdr) / 4;
3194         tcph->rst = 1;
3195         tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
3196                         src, dst, 0);
3197         break;
3198     }
3199     case IPPROTO_ICMP: {
3200         struct icmphdr *icmph;
3201 
3202         icmph = skb_put_zero(skb, sizeof(struct icmphdr));
3203         icmph->type = ICMP_ECHO;
3204         icmph->code = 0;
3205     }
3206     }
3207 
3208     return skb;
3209 }
3210 
3211 static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
3212                        const struct nlmsghdr *nlh,
3213                        struct nlattr **tb,
3214                        struct netlink_ext_ack *extack)
3215 {
3216     struct rtmsg *rtm;
3217     int i, err;
3218 
3219     if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
3220         NL_SET_ERR_MSG(extack,
3221                    "ipv4: Invalid header for route get request");
3222         return -EINVAL;
3223     }
3224 
3225     if (!netlink_strict_get_check(skb))
3226         return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
3227                           rtm_ipv4_policy, extack);
3228 
3229     rtm = nlmsg_data(nlh);
3230     if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
3231         (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
3232         rtm->rtm_table || rtm->rtm_protocol ||
3233         rtm->rtm_scope || rtm->rtm_type) {
3234         NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request");
3235         return -EINVAL;
3236     }
3237 
3238     if (rtm->rtm_flags & ~(RTM_F_NOTIFY |
3239                    RTM_F_LOOKUP_TABLE |
3240                    RTM_F_FIB_MATCH)) {
3241         NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request");
3242         return -EINVAL;
3243     }
3244 
3245     err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
3246                         rtm_ipv4_policy, extack);
3247     if (err)
3248         return err;
3249 
3250     if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
3251         (tb[RTA_DST] && !rtm->rtm_dst_len)) {
3252         NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
3253         return -EINVAL;
3254     }
3255 
3256     for (i = 0; i <= RTA_MAX; i++) {
3257         if (!tb[i])
3258             continue;
3259 
3260         switch (i) {
3261         case RTA_IIF:
3262         case RTA_OIF:
3263         case RTA_SRC:
3264         case RTA_DST:
3265         case RTA_IP_PROTO:
3266         case RTA_SPORT:
3267         case RTA_DPORT:
3268         case RTA_MARK:
3269         case RTA_UID:
3270             break;
3271         default:
3272             NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request");
3273             return -EINVAL;
3274         }
3275     }
3276 
3277     return 0;
3278 }
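
/* Editor's note: strict checking applies only when the requesting socket
 * has enabled NETLINK_GET_STRICT_CHK; otherwise the legacy lenient parse
 * above is used.  Under strict checking, a minimal conforming header is
 * (illustrative sketch):
 *
 *	struct rtmsg rtm = {
 *		.rtm_family  = AF_INET,
 *		.rtm_dst_len = 32,
 *	};
 *
 * with a matching RTA_DST attribute attached.  rtm_table, rtm_protocol,
 * rtm_scope and rtm_type must stay zero (rtm_tos is not policed here; it
 * feeds the lookup), and any attribute outside the switch above is
 * rejected with -EINVAL.
 */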
3279 
3280 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
3281                  struct netlink_ext_ack *extack)
3282 {
3283     struct net *net = sock_net(in_skb->sk);
3284     struct nlattr *tb[RTA_MAX+1];
3285     u32 table_id = RT_TABLE_MAIN;
3286     __be16 sport = 0, dport = 0;
3287     struct fib_result res = {};
3288     u8 ip_proto = IPPROTO_UDP;
3289     struct rtable *rt = NULL;
3290     struct sk_buff *skb;
3291     struct rtmsg *rtm;
3292     struct flowi4 fl4 = {};
3293     __be32 dst = 0;
3294     __be32 src = 0;
3295     kuid_t uid;
3296     u32 iif;
3297     int err;
3298     int mark;
3299 
3300     err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
3301     if (err < 0)
3302         return err;
3303 
3304     rtm = nlmsg_data(nlh);
3305     src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
3306     dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
3307     iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3308     mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3309     if (tb[RTA_UID])
3310         uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
3311     else
3312         uid = (iif ? INVALID_UID : current_uid());
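    /* Editor's note: for output-route queries the caller's own uid is the
     * default so that uid-range routing rules match as they would for the
     * caller's real traffic; input-route queries (iif set) carry no uid.
     */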
3313 
3314     if (tb[RTA_IP_PROTO]) {
3315         err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
3316                           &ip_proto, AF_INET, extack);
3317         if (err)
3318             return err;
3319     }
3320 
3321     if (tb[RTA_SPORT])
3322         sport = nla_get_be16(tb[RTA_SPORT]);
3323 
3324     if (tb[RTA_DPORT])
3325         dport = nla_get_be16(tb[RTA_DPORT]);
3326 
3327     skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
3328     if (!skb)
3329         return -ENOBUFS;
3330 
3331     fl4.daddr = dst;
3332     fl4.saddr = src;
3333     fl4.flowi4_tos = rtm->rtm_tos & IPTOS_RT_MASK;
3334     fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
3335     fl4.flowi4_mark = mark;
3336     fl4.flowi4_uid = uid;
3337     if (sport)
3338         fl4.fl4_sport = sport;
3339     if (dport)
3340         fl4.fl4_dport = dport;
3341     fl4.flowi4_proto = ip_proto;
3342 
3343     rcu_read_lock();
3344 
3345     if (iif) {
3346         struct net_device *dev;
3347 
3348         dev = dev_get_by_index_rcu(net, iif);
3349         if (!dev) {
3350             err = -ENODEV;
3351             goto errout_rcu;
3352         }
3353 
3354         fl4.flowi4_iif = iif; /* for rt_fill_info */
3355         skb->dev    = dev;
3356         skb->mark   = mark;
3357         err = ip_route_input_rcu(skb, dst, src,
3358                      rtm->rtm_tos & IPTOS_RT_MASK, dev,
3359                      &res);
3360 
3361         rt = skb_rtable(skb);
3362         if (err == 0 && rt->dst.error)
3363             err = -rt->dst.error;
3364     } else {
3365         fl4.flowi4_iif = LOOPBACK_IFINDEX;
3366         skb->dev = net->loopback_dev;
3367         rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
3368         err = 0;
3369         if (IS_ERR(rt))
3370             err = PTR_ERR(rt);
3371         else
3372             skb_dst_set(skb, &rt->dst);
3373     }
3374 
3375     if (err)
3376         goto errout_rcu;
3377 
3378     if (rtm->rtm_flags & RTM_F_NOTIFY)
3379         rt->rt_flags |= RTCF_NOTIFY;
3380 
3381     if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
3382         table_id = res.table ? res.table->tb_id : 0;
3383 
3384     /* reset skb for netlink reply msg */
3385     skb_trim(skb, 0);
3386     skb_reset_network_header(skb);
3387     skb_reset_transport_header(skb);
3388     skb_reset_mac_header(skb);
3389 
3390     if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
3391         struct fib_rt_info fri;
3392 
3393         if (!res.fi) {
3394             err = fib_props[res.type].error;
3395             if (!err)
3396                 err = -EHOSTUNREACH;
3397             goto errout_rcu;
3398         }
3399         fri.fi = res.fi;
3400         fri.tb_id = table_id;
3401         fri.dst = res.prefix;
3402         fri.dst_len = res.prefixlen;
3403         fri.dscp = inet_dsfield_to_dscp(fl4.flowi4_tos);
3404         fri.type = rt->rt_type;
3405         fri.offload = 0;
3406         fri.trap = 0;
3407         fri.offload_failed = 0;
3408         if (res.fa_head) {
3409             struct fib_alias *fa;
3410 
3411             hlist_for_each_entry_rcu(fa, res.fa_head, fa_list) {
3412                 u8 slen = 32 - fri.dst_len;
3413 
3414                 if (fa->fa_slen == slen &&
3415                     fa->tb_id == fri.tb_id &&
3416                     fa->fa_dscp == fri.dscp &&
3417                     fa->fa_info == res.fi &&
3418                     fa->fa_type == fri.type) {
3419                     fri.offload = READ_ONCE(fa->offload);
3420                     fri.trap = READ_ONCE(fa->trap);
3421                     break;
3422                 }
3423             }
3424         }
3425         err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
3426                     nlh->nlmsg_seq, RTM_NEWROUTE, &fri, 0);
3427     } else {
3428         err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
3429                    NETLINK_CB(in_skb).portid,
3430                    nlh->nlmsg_seq, 0);
3431     }
3432     if (err < 0)
3433         goto errout_rcu;
3434 
3435     rcu_read_unlock();
3436 
3437     err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3438 
3439 errout_free:
3440     return err;
3441 errout_rcu:
3442     rcu_read_unlock();
3443     kfree_skb(skb);
3444     goto errout_free;
3445 }
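
/* Editor's note: a minimal userspace exercise of this handler, as an
 * illustrative sketch only (error handling elided; not part of this file).
 * It sends RTM_GETROUTE with a single RTA_DST and reads back the
 * RTM_NEWROUTE reply built by rt_fill_info()/fib_dump_info() above:
 *
 *	#include <arpa/inet.h>
 *	#include <linux/rtnetlink.h>
 *	#include <string.h>
 *	#include <sys/socket.h>
 *	#include <unistd.h>
 *
 *	int main(void)
 *	{
 *		struct sockaddr_nl sa = { .nl_family = AF_NETLINK };
 *		struct {
 *			struct nlmsghdr nlh;
 *			struct rtmsg rtm;
 *			char attrs[RTA_SPACE(4)];
 *		} req;
 *		char reply[4096];
 *		struct rtattr *rta;
 *		int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
 *
 *		memset(&req, 0, sizeof(req));
 *		req.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg));
 *		req.nlh.nlmsg_type = RTM_GETROUTE;
 *		req.nlh.nlmsg_flags = NLM_F_REQUEST;
 *		req.rtm.rtm_family = AF_INET;
 *		req.rtm.rtm_dst_len = 32;
 *
 *		rta = (struct rtattr *)((char *)&req + NLMSG_ALIGN(req.nlh.nlmsg_len));
 *		rta->rta_type = RTA_DST;
 *		rta->rta_len = RTA_LENGTH(4);
 *		inet_pton(AF_INET, "8.8.8.8", RTA_DATA(rta));
 *		req.nlh.nlmsg_len = NLMSG_ALIGN(req.nlh.nlmsg_len) + rta->rta_len;
 *
 *		sendto(fd, &req, req.nlh.nlmsg_len, 0,
 *		       (struct sockaddr *)&sa, sizeof(sa));
 *		recv(fd, reply, sizeof(reply), 0);
 *		close(fd);
 *		return 0;
 *	}
 */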
3446 
3447 void ip_rt_multicast_event(struct in_device *in_dev)
3448 {
3449     rt_cache_flush(dev_net(in_dev->dev));
3450 }
3451 
3452 #ifdef CONFIG_SYSCTL
3453 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
3454 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
3455 static int ip_rt_gc_elasticity __read_mostly    = 8;
3456 static int ip_min_valid_pmtu __read_mostly  = IPV4_MIN_MTU;
3457 
3458 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
3459         void *buffer, size_t *lenp, loff_t *ppos)
3460 {
3461     struct net *net = (struct net *)__ctl->extra1;
3462 
3463     if (write) {
3464         rt_cache_flush(net);
3465         fnhe_genid_bump(net);
3466         return 0;
3467     }
3468 
3469     return -EINVAL;
3470 }
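
/* Editor's note: the flush handler is write-only (mode 0200 in the table
 * below); writing any value invalidates all cached routes and nexthop
 * exceptions at once, e.g.:
 *
 *	# echo 1 > /proc/sys/net/ipv4/route/flush
 *
 * Both generation counters are bumped, so stale entries simply fail their
 * genid checks and are re-created on the next lookup.
 */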
3471 
3472 static struct ctl_table ipv4_route_table[] = {
3473     {
3474         .procname   = "gc_thresh",
3475         .data       = &ipv4_dst_ops.gc_thresh,
3476         .maxlen     = sizeof(int),
3477         .mode       = 0644,
3478         .proc_handler   = proc_dointvec,
3479     },
3480     {
3481         .procname   = "max_size",
3482         .data       = &ip_rt_max_size,
3483         .maxlen     = sizeof(int),
3484         .mode       = 0644,
3485         .proc_handler   = proc_dointvec,
3486     },
3487     {
3488         /*  Deprecated. Use gc_min_interval_ms */
3489 
3490         .procname   = "gc_min_interval",
3491         .data       = &ip_rt_gc_min_interval,
3492         .maxlen     = sizeof(int),
3493         .mode       = 0644,
3494         .proc_handler   = proc_dointvec_jiffies,
3495     },
3496     {
3497         .procname   = "gc_min_interval_ms",
3498         .data       = &ip_rt_gc_min_interval,
3499         .maxlen     = sizeof(int),
3500         .mode       = 0644,
3501         .proc_handler   = proc_dointvec_ms_jiffies,
3502     },
3503     {
3504         .procname   = "gc_timeout",
3505         .data       = &ip_rt_gc_timeout,
3506         .maxlen     = sizeof(int),
3507         .mode       = 0644,
3508         .proc_handler   = proc_dointvec_jiffies,
3509     },
3510     {
3511         .procname   = "gc_interval",
3512         .data       = &ip_rt_gc_interval,
3513         .maxlen     = sizeof(int),
3514         .mode       = 0644,
3515         .proc_handler   = proc_dointvec_jiffies,
3516     },
3517     {
3518         .procname   = "redirect_load",
3519         .data       = &ip_rt_redirect_load,
3520         .maxlen     = sizeof(int),
3521         .mode       = 0644,
3522         .proc_handler   = proc_dointvec,
3523     },
3524     {
3525         .procname   = "redirect_number",
3526         .data       = &ip_rt_redirect_number,
3527         .maxlen     = sizeof(int),
3528         .mode       = 0644,
3529         .proc_handler   = proc_dointvec,
3530     },
3531     {
3532         .procname   = "redirect_silence",
3533         .data       = &ip_rt_redirect_silence,
3534         .maxlen     = sizeof(int),
3535         .mode       = 0644,
3536         .proc_handler   = proc_dointvec,
3537     },
3538     {
3539         .procname   = "error_cost",
3540         .data       = &ip_rt_error_cost,
3541         .maxlen     = sizeof(int),
3542         .mode       = 0644,
3543         .proc_handler   = proc_dointvec,
3544     },
3545     {
3546         .procname   = "error_burst",
3547         .data       = &ip_rt_error_burst,
3548         .maxlen     = sizeof(int),
3549         .mode       = 0644,
3550         .proc_handler   = proc_dointvec,
3551     },
3552     {
3553         .procname   = "gc_elasticity",
3554         .data       = &ip_rt_gc_elasticity,
3555         .maxlen     = sizeof(int),
3556         .mode       = 0644,
3557         .proc_handler   = proc_dointvec,
3558     },
3559     { }
3560 };
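
/* Editor's note: this table is global and registered for init_net only
 * (see ip_static_sysctl_init() at the bottom of this file).  Several of
 * the gc_* knobs date back to the pre-3.6 per-flow routing cache and have
 * limited effect today, while the redirect_* and error_* entries still
 * tune ICMP rate limiting, e.g.:
 *
 *	# sysctl -w net.ipv4.route.error_cost=1000
 */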
3561 
3562 static const char ipv4_route_flush_procname[] = "flush";
3563 
3564 static struct ctl_table ipv4_route_netns_table[] = {
3565     {
3566         .procname   = ipv4_route_flush_procname,
3567         .maxlen     = sizeof(int),
3568         .mode       = 0200,
3569         .proc_handler   = ipv4_sysctl_rtcache_flush,
3570     },
3571     {
3572         .procname       = "min_pmtu",
3573         .data           = &init_net.ipv4.ip_rt_min_pmtu,
3574         .maxlen         = sizeof(int),
3575         .mode           = 0644,
3576         .proc_handler   = proc_dointvec_minmax,
3577         .extra1         = &ip_min_valid_pmtu,
3578     },
3579     {
3580         .procname       = "mtu_expires",
3581         .data           = &init_net.ipv4.ip_rt_mtu_expires,
3582         .maxlen         = sizeof(int),
3583         .mode           = 0644,
3584         .proc_handler   = proc_dointvec_jiffies,
3585     },
3586     {
3587         .procname   = "min_adv_mss",
3588         .data       = &init_net.ipv4.ip_rt_min_advmss,
3589         .maxlen     = sizeof(int),
3590         .mode       = 0644,
3591         .proc_handler   = proc_dointvec,
3592     },
3593     { },
3594 };
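
/* Editor's note: unlike the table above, these entries are per network
 * namespace: "flush" finds its netns via ->extra1, and the remaining data
 * pointers are relocated per-netns by sysctl_route_net_init() below.
 * min_pmtu cannot be lowered below IPV4_MIN_MTU (68 bytes) thanks to the
 * proc_dointvec_minmax/extra1 pairing.
 */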
3595 
3596 static __net_init int sysctl_route_net_init(struct net *net)
3597 {
3598     struct ctl_table *tbl;
3599 
3600     tbl = ipv4_route_netns_table;
3601     if (!net_eq(net, &init_net)) {
3602         int i;
3603 
3604         tbl = kmemdup(tbl, sizeof(ipv4_route_netns_table), GFP_KERNEL);
3605         if (!tbl)
3606             goto err_dup;
3607 
3608         /* Don't export non-whitelisted sysctls to unprivileged users */
3609         if (net->user_ns != &init_user_ns) {
3610             if (tbl[0].procname != ipv4_route_flush_procname)
3611                 tbl[0].procname = NULL;
3612         }
3613 
3614     /* Update the data pointers to point into the current struct net,
3615      * except for the first element (flush), which has no data pointer.
3616      */
3617         for (i = 1; i < ARRAY_SIZE(ipv4_route_netns_table) - 1; i++)
3618             tbl[i].data += (void *)net - (void *)&init_net;
3619     }
3620     tbl[0].extra1 = net;
3621 
3622     net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3623     if (!net->ipv4.route_hdr)
3624         goto err_reg;
3625     return 0;
3626 
3627 err_reg:
3628     if (tbl != ipv4_route_netns_table)
3629         kfree(tbl);
3630 err_dup:
3631     return -ENOMEM;
3632 }
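
/* Editor's note: the pointer arithmetic above relies on every .data field
 * initially pointing into init_net; adding the byte offset between the two
 * struct net instances retargets it to the same member of this netns:
 *
 *	&init_net.ipv4.ip_rt_min_pmtu + ((void *)net - (void *)&init_net)
 *		== &net->ipv4.ip_rt_min_pmtu
 */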
3633 
3634 static __net_exit void sysctl_route_net_exit(struct net *net)
3635 {
3636     struct ctl_table *tbl;
3637 
3638     tbl = net->ipv4.route_hdr->ctl_table_arg;
3639     unregister_net_sysctl_table(net->ipv4.route_hdr);
3640     BUG_ON(tbl == ipv4_route_netns_table);
3641     kfree(tbl);
3642 }
3643 
3644 static __net_initdata struct pernet_operations sysctl_route_ops = {
3645     .init = sysctl_route_net_init,
3646     .exit = sysctl_route_net_exit,
3647 };
3648 #endif
3649 
3650 static __net_init int netns_ip_rt_init(struct net *net)
3651 {
3652     /* Set default values for the per-namespace sysctls */
3653     net->ipv4.ip_rt_min_pmtu = DEFAULT_MIN_PMTU;
3654     net->ipv4.ip_rt_mtu_expires = DEFAULT_MTU_EXPIRES;
3655     net->ipv4.ip_rt_min_advmss = DEFAULT_MIN_ADVMSS;
3656     return 0;
3657 }
3658 
3659 static struct pernet_operations __net_initdata ip_rt_ops = {
3660     .init = netns_ip_rt_init,
3661 };
3662 
3663 static __net_init int rt_genid_init(struct net *net)
3664 {
3665     atomic_set(&net->ipv4.rt_genid, 0);
3666     atomic_set(&net->fnhe_genid, 0);
3667     atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
3668     return 0;
3669 }
3670 
3671 static __net_initdata struct pernet_operations rt_genid_ops = {
3672     .init = rt_genid_init,
3673 };
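
/* Editor's note: rt_cache_flush() is merely a bump of rt_genid; cached
 * dsts are not walked and freed, they are compared against the current
 * generation on use (rt_is_expired()) and discarded lazily once stale.
 */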
3674 
3675 static int __net_init ipv4_inetpeer_init(struct net *net)
3676 {
3677     struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3678 
3679     if (!bp)
3680         return -ENOMEM;
3681     inet_peer_base_init(bp);
3682     net->ipv4.peers = bp;
3683     return 0;
3684 }
3685 
3686 static void __net_exit ipv4_inetpeer_exit(struct net *net)
3687 {
3688     struct inet_peer_base *bp = net->ipv4.peers;
3689 
3690     net->ipv4.peers = NULL;
3691     inetpeer_invalidate_tree(bp);
3692     kfree(bp);
3693 }
3694 
3695 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3696     .init   =   ipv4_inetpeer_init,
3697     .exit   =   ipv4_inetpeer_exit,
3698 };
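
/* Editor's note: the per-netns inet_peer base allocated above holds
 * long-lived per-remote-address state (for example ICMP rate-limiting
 * tokens) that survives independently of any cached route.
 */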
3699 
3700 #ifdef CONFIG_IP_ROUTE_CLASSID
3701 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3702 #endif /* CONFIG_IP_ROUTE_CLASSID */
3703 
3704 int __init ip_rt_init(void)
3705 {
3706     void *idents_hash;
3707     int cpu;
3708 
3709     /* For modern hosts, this will use 2 MB of memory */
3710     idents_hash = alloc_large_system_hash("IP idents",
3711                           sizeof(*ip_idents) + sizeof(*ip_tstamps),
3712                           0,
3713                           16, /* one bucket per 64 KB */
3714                           HASH_ZERO,
3715                           NULL,
3716                           &ip_idents_mask,
3717                           2048,
3718                           256*1024);
3719 
3720     ip_idents = idents_hash;
3721 
3722     prandom_bytes(ip_idents, (ip_idents_mask + 1) * sizeof(*ip_idents));
3723 
3724     ip_tstamps = idents_hash + (ip_idents_mask + 1) * sizeof(*ip_idents);
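    /* Editor's note: a single allocation backs both arrays: ip_idents
     * (atomic IP-ID counters, randomly seeded above) and ip_tstamps sit
     * back to back, and alloc_large_system_hash() stored the power-of-two
     * entry count minus one in ip_idents_mask for cheap index masking.
     */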
3725 
3726     for_each_possible_cpu(cpu) {
3727         struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3728 
3729         INIT_LIST_HEAD(&ul->head);
3730         INIT_LIST_HEAD(&ul->quarantine);
3731         spin_lock_init(&ul->lock);
3732     }
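    /* Editor's note: these per-CPU lists track routes kept outside the
     * FIB nexthop cache so rt_flush_dev() can still find and detach them
     * when a device is unregistered; the quarantine list (roughly) parks
     * entries whose original device has already been torn down.
     */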
3733 #ifdef CONFIG_IP_ROUTE_CLASSID
3734     ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3735     if (!ip_rt_acct)
3736         panic("IP: failed to allocate ip_rt_acct\n");
3737 #endif
3738 
3739     ipv4_dst_ops.kmem_cachep =
3740         kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3741                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3742 
3743     ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3744 
3745     if (dst_entries_init(&ipv4_dst_ops) < 0)
3746         panic("IP: failed to allocate ipv4_dst_ops counter\n");
3747 
3748     if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3749         panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3750 
3751     ipv4_dst_ops.gc_thresh = ~0;
3752     ip_rt_max_size = INT_MAX;
3753 
3754     devinet_init();
3755     ip_fib_init();
3756 
3757     if (ip_rt_proc_init())
3758         pr_err("Unable to create route proc files\n");
3759 #ifdef CONFIG_XFRM
3760     xfrm_init();
3761     xfrm4_init();
3762 #endif
3763     rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3764               RTNL_FLAG_DOIT_UNLOCKED);
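    /* Editor's note: RTNL_FLAG_DOIT_UNLOCKED is safe here because
     * inet_rtm_getroute() reads routing state under RCU only and never
     * needs the rtnl mutex.
     */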
3765 
3766 #ifdef CONFIG_SYSCTL
3767     register_pernet_subsys(&sysctl_route_ops);
3768 #endif
3769     register_pernet_subsys(&ip_rt_ops);
3770     register_pernet_subsys(&rt_genid_ops);
3771     register_pernet_subsys(&ipv4_inetpeer_ops);
3772     return 0;
3773 }
3774 
3775 #ifdef CONFIG_SYSCTL
3776 /*
3777  * We really need to sanitize the damn ipv4 init order, then all
3778  * this nonsense will go away.
3779  */
3780 void __init ip_static_sysctl_init(void)
3781 {
3782     register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3783 }
3784 #endif