Back to home page

OSCL-LXR

 
 

    


0001 // SPDX-License-Identifier: GPL-2.0-or-later
0002 /*
0003  * INET     An implementation of the TCP/IP protocol suite for the LINUX
0004  *      operating system.  INET is implemented using the  BSD Socket
0005  *      interface as the means of communication with the user level.
0006  *
0007  *      IPv4 Forwarding Information Base: FIB frontend.
0008  *
0009  * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
0010  */
0011 
0012 #include <linux/module.h>
0013 #include <linux/uaccess.h>
0014 #include <linux/bitops.h>
0015 #include <linux/capability.h>
0016 #include <linux/types.h>
0017 #include <linux/kernel.h>
0018 #include <linux/mm.h>
0019 #include <linux/string.h>
0020 #include <linux/socket.h>
0021 #include <linux/sockios.h>
0022 #include <linux/errno.h>
0023 #include <linux/in.h>
0024 #include <linux/inet.h>
0025 #include <linux/inetdevice.h>
0026 #include <linux/netdevice.h>
0027 #include <linux/if_addr.h>
0028 #include <linux/if_arp.h>
0029 #include <linux/skbuff.h>
0030 #include <linux/cache.h>
0031 #include <linux/init.h>
0032 #include <linux/list.h>
0033 #include <linux/slab.h>
0034 
0035 #include <net/inet_dscp.h>
0036 #include <net/ip.h>
0037 #include <net/protocol.h>
0038 #include <net/route.h>
0039 #include <net/tcp.h>
0040 #include <net/sock.h>
0041 #include <net/arp.h>
0042 #include <net/ip_fib.h>
0043 #include <net/nexthop.h>
0044 #include <net/rtnetlink.h>
0045 #include <net/xfrm.h>
0046 #include <net/l3mdev.h>
0047 #include <net/lwtunnel.h>
0048 #include <trace/events/fib.h>
0049 
0050 #ifndef CONFIG_IP_MULTIPLE_TABLES
0051 
0052 static int __net_init fib4_rules_init(struct net *net)
0053 {
0054     struct fib_table *local_table, *main_table;
0055 
0056     main_table  = fib_trie_table(RT_TABLE_MAIN, NULL);
0057     if (!main_table)
0058         return -ENOMEM;
0059 
0060     local_table = fib_trie_table(RT_TABLE_LOCAL, main_table);
0061     if (!local_table)
0062         goto fail;
0063 
0064     hlist_add_head_rcu(&local_table->tb_hlist,
0065                 &net->ipv4.fib_table_hash[TABLE_LOCAL_INDEX]);
0066     hlist_add_head_rcu(&main_table->tb_hlist,
0067                 &net->ipv4.fib_table_hash[TABLE_MAIN_INDEX]);
0068     return 0;
0069 
0070 fail:
0071     fib_free_table(main_table);
0072     return -ENOMEM;
0073 }
0074 #else
0075 
0076 struct fib_table *fib_new_table(struct net *net, u32 id)
0077 {
0078     struct fib_table *tb, *alias = NULL;
0079     unsigned int h;
0080 
0081     if (id == 0)
0082         id = RT_TABLE_MAIN;
0083     tb = fib_get_table(net, id);
0084     if (tb)
0085         return tb;
0086 
0087     if (id == RT_TABLE_LOCAL && !net->ipv4.fib_has_custom_rules)
0088         alias = fib_new_table(net, RT_TABLE_MAIN);
0089 
0090     tb = fib_trie_table(id, alias);
0091     if (!tb)
0092         return NULL;
0093 
0094     switch (id) {
0095     case RT_TABLE_MAIN:
0096         rcu_assign_pointer(net->ipv4.fib_main, tb);
0097         break;
0098     case RT_TABLE_DEFAULT:
0099         rcu_assign_pointer(net->ipv4.fib_default, tb);
0100         break;
0101     default:
0102         break;
0103     }
0104 
0105     h = id & (FIB_TABLE_HASHSZ - 1);
0106     hlist_add_head_rcu(&tb->tb_hlist, &net->ipv4.fib_table_hash[h]);
0107     return tb;
0108 }
0109 EXPORT_SYMBOL_GPL(fib_new_table);
0110 
0111 /* caller must hold either rtnl or rcu read lock */
0112 struct fib_table *fib_get_table(struct net *net, u32 id)
0113 {
0114     struct fib_table *tb;
0115     struct hlist_head *head;
0116     unsigned int h;
0117 
0118     if (id == 0)
0119         id = RT_TABLE_MAIN;
0120     h = id & (FIB_TABLE_HASHSZ - 1);
0121 
0122     head = &net->ipv4.fib_table_hash[h];
0123     hlist_for_each_entry_rcu(tb, head, tb_hlist,
0124                  lockdep_rtnl_is_held()) {
0125         if (tb->tb_id == id)
0126             return tb;
0127     }
0128     return NULL;
0129 }
0130 #endif /* CONFIG_IP_MULTIPLE_TABLES */
0131 
0132 static void fib_replace_table(struct net *net, struct fib_table *old,
0133                   struct fib_table *new)
0134 {
0135 #ifdef CONFIG_IP_MULTIPLE_TABLES
0136     switch (new->tb_id) {
0137     case RT_TABLE_MAIN:
0138         rcu_assign_pointer(net->ipv4.fib_main, new);
0139         break;
0140     case RT_TABLE_DEFAULT:
0141         rcu_assign_pointer(net->ipv4.fib_default, new);
0142         break;
0143     default:
0144         break;
0145     }
0146 
0147 #endif
0148     /* replace the old table in the hlist */
0149     hlist_replace_rcu(&old->tb_hlist, &new->tb_hlist);
0150 }
0151 
0152 int fib_unmerge(struct net *net)
0153 {
0154     struct fib_table *old, *new, *main_table;
0155 
0156     /* attempt to fetch local table if it has been allocated */
0157     old = fib_get_table(net, RT_TABLE_LOCAL);
0158     if (!old)
0159         return 0;
0160 
0161     new = fib_trie_unmerge(old);
0162     if (!new)
0163         return -ENOMEM;
0164 
0165     /* table is already unmerged */
0166     if (new == old)
0167         return 0;
0168 
0169     /* replace merged table with clean table */
0170     fib_replace_table(net, old, new);
0171     fib_free_table(old);
0172 
0173     /* attempt to fetch main table if it has been allocated */
0174     main_table = fib_get_table(net, RT_TABLE_MAIN);
0175     if (!main_table)
0176         return 0;
0177 
0178     /* flush local entries from main table */
0179     fib_table_flush_external(main_table);
0180 
0181     return 0;
0182 }
0183 
0184 void fib_flush(struct net *net)
0185 {
0186     int flushed = 0;
0187     unsigned int h;
0188 
0189     for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
0190         struct hlist_head *head = &net->ipv4.fib_table_hash[h];
0191         struct hlist_node *tmp;
0192         struct fib_table *tb;
0193 
0194         hlist_for_each_entry_safe(tb, tmp, head, tb_hlist)
0195             flushed += fib_table_flush(net, tb, false);
0196     }
0197 
0198     if (flushed)
0199         rt_cache_flush(net);
0200 }
0201 
0202 /*
0203  * Find address type as if only "dev" was present in the system. If
0204  * on_dev is NULL then all interfaces are taken into consideration.
0205  */
0206 static inline unsigned int __inet_dev_addr_type(struct net *net,
0207                         const struct net_device *dev,
0208                         __be32 addr, u32 tb_id)
0209 {
0210     struct flowi4       fl4 = { .daddr = addr };
0211     struct fib_result   res;
0212     unsigned int ret = RTN_BROADCAST;
0213     struct fib_table *table;
0214 
0215     if (ipv4_is_zeronet(addr) || ipv4_is_lbcast(addr))
0216         return RTN_BROADCAST;
0217     if (ipv4_is_multicast(addr))
0218         return RTN_MULTICAST;
0219 
0220     rcu_read_lock();
0221 
0222     table = fib_get_table(net, tb_id);
0223     if (table) {
0224         ret = RTN_UNICAST;
0225         if (!fib_table_lookup(table, &fl4, &res, FIB_LOOKUP_NOREF)) {
0226             struct fib_nh_common *nhc = fib_info_nhc(res.fi, 0);
0227 
0228             if (!dev || dev == nhc->nhc_dev)
0229                 ret = res.type;
0230         }
0231     }
0232 
0233     rcu_read_unlock();
0234     return ret;
0235 }
0236 
0237 unsigned int inet_addr_type_table(struct net *net, __be32 addr, u32 tb_id)
0238 {
0239     return __inet_dev_addr_type(net, NULL, addr, tb_id);
0240 }
0241 EXPORT_SYMBOL(inet_addr_type_table);
0242 
0243 unsigned int inet_addr_type(struct net *net, __be32 addr)
0244 {
0245     return __inet_dev_addr_type(net, NULL, addr, RT_TABLE_LOCAL);
0246 }
0247 EXPORT_SYMBOL(inet_addr_type);
0248 
0249 unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev,
0250                 __be32 addr)
0251 {
0252     u32 rt_table = l3mdev_fib_table(dev) ? : RT_TABLE_LOCAL;
0253 
0254     return __inet_dev_addr_type(net, dev, addr, rt_table);
0255 }
0256 EXPORT_SYMBOL(inet_dev_addr_type);
0257 
0258 /* inet_addr_type with dev == NULL but using the table from a dev
0259  * if one is associated
0260  */
0261 unsigned int inet_addr_type_dev_table(struct net *net,
0262                       const struct net_device *dev,
0263                       __be32 addr)
0264 {
0265     u32 rt_table = l3mdev_fib_table(dev) ? : RT_TABLE_LOCAL;
0266 
0267     return __inet_dev_addr_type(net, NULL, addr, rt_table);
0268 }
0269 EXPORT_SYMBOL(inet_addr_type_dev_table);
0270 
0271 __be32 fib_compute_spec_dst(struct sk_buff *skb)
0272 {
0273     struct net_device *dev = skb->dev;
0274     struct in_device *in_dev;
0275     struct fib_result res;
0276     struct rtable *rt;
0277     struct net *net;
0278     int scope;
0279 
0280     rt = skb_rtable(skb);
0281     if ((rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL)) ==
0282         RTCF_LOCAL)
0283         return ip_hdr(skb)->daddr;
0284 
0285     in_dev = __in_dev_get_rcu(dev);
0286 
0287     net = dev_net(dev);
0288 
0289     scope = RT_SCOPE_UNIVERSE;
0290     if (!ipv4_is_zeronet(ip_hdr(skb)->saddr)) {
0291         bool vmark = in_dev && IN_DEV_SRC_VMARK(in_dev);
0292         struct flowi4 fl4 = {
0293             .flowi4_iif = LOOPBACK_IFINDEX,
0294             .flowi4_l3mdev = l3mdev_master_ifindex_rcu(dev),
0295             .daddr = ip_hdr(skb)->saddr,
0296             .flowi4_tos = ip_hdr(skb)->tos & IPTOS_RT_MASK,
0297             .flowi4_scope = scope,
0298             .flowi4_mark = vmark ? skb->mark : 0,
0299         };
0300         if (!fib_lookup(net, &fl4, &res, 0))
0301             return fib_result_prefsrc(net, &res);
0302     } else {
0303         scope = RT_SCOPE_LINK;
0304     }
0305 
0306     return inet_select_addr(dev, ip_hdr(skb)->saddr, scope);
0307 }
0308 
0309 bool fib_info_nh_uses_dev(struct fib_info *fi, const struct net_device *dev)
0310 {
0311     bool dev_match = false;
0312 #ifdef CONFIG_IP_ROUTE_MULTIPATH
0313     if (unlikely(fi->nh)) {
0314         dev_match = nexthop_uses_dev(fi->nh, dev);
0315     } else {
0316         int ret;
0317 
0318         for (ret = 0; ret < fib_info_num_path(fi); ret++) {
0319             const struct fib_nh_common *nhc = fib_info_nhc(fi, ret);
0320 
0321             if (nhc_l3mdev_matches_dev(nhc, dev)) {
0322                 dev_match = true;
0323                 break;
0324             }
0325         }
0326     }
0327 #else
0328     if (fib_info_nhc(fi, 0)->nhc_dev == dev)
0329         dev_match = true;
0330 #endif
0331 
0332     return dev_match;
0333 }
0334 EXPORT_SYMBOL_GPL(fib_info_nh_uses_dev);
0335 
0336 /* Given (packet source, input interface) and optional (dst, oif, tos):
0337  * - (main) check, that source is valid i.e. not broadcast or our local
0338  *   address.
0339  * - figure out what "logical" interface this packet arrived
0340  *   and calculate "specific destination" address.
0341  * - check, that packet arrived from expected physical interface.
0342  * called with rcu_read_lock()
0343  */
0344 static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
0345                  u8 tos, int oif, struct net_device *dev,
0346                  int rpf, struct in_device *idev, u32 *itag)
0347 {
0348     struct net *net = dev_net(dev);
0349     struct flow_keys flkeys;
0350     int ret, no_addr;
0351     struct fib_result res;
0352     struct flowi4 fl4;
0353     bool dev_match;
0354 
0355     fl4.flowi4_oif = 0;
0356     fl4.flowi4_l3mdev = l3mdev_master_ifindex_rcu(dev);
0357     fl4.flowi4_iif = oif ? : LOOPBACK_IFINDEX;
0358     fl4.daddr = src;
0359     fl4.saddr = dst;
0360     fl4.flowi4_tos = tos;
0361     fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
0362     fl4.flowi4_tun_key.tun_id = 0;
0363     fl4.flowi4_flags = 0;
0364     fl4.flowi4_uid = sock_net_uid(net, NULL);
0365     fl4.flowi4_multipath_hash = 0;
0366 
0367     no_addr = idev->ifa_list == NULL;
0368 
0369     fl4.flowi4_mark = IN_DEV_SRC_VMARK(idev) ? skb->mark : 0;
0370     if (!fib4_rules_early_flow_dissect(net, skb, &fl4, &flkeys)) {
0371         fl4.flowi4_proto = 0;
0372         fl4.fl4_sport = 0;
0373         fl4.fl4_dport = 0;
0374     } else {
0375         swap(fl4.fl4_sport, fl4.fl4_dport);
0376     }
0377 
0378     if (fib_lookup(net, &fl4, &res, 0))
0379         goto last_resort;
0380     if (res.type != RTN_UNICAST &&
0381         (res.type != RTN_LOCAL || !IN_DEV_ACCEPT_LOCAL(idev)))
0382         goto e_inval;
0383     fib_combine_itag(itag, &res);
0384 
0385     dev_match = fib_info_nh_uses_dev(res.fi, dev);
0386     /* This is not common, loopback packets retain skb_dst so normally they
0387      * would not even hit this slow path.
0388      */
0389     dev_match = dev_match || (res.type == RTN_LOCAL &&
0390                   dev == net->loopback_dev);
0391     if (dev_match) {
0392         ret = FIB_RES_NHC(res)->nhc_scope >= RT_SCOPE_LINK;
0393         return ret;
0394     }
0395     if (no_addr)
0396         goto last_resort;
0397     if (rpf == 1)
0398         goto e_rpf;
0399     fl4.flowi4_oif = dev->ifindex;
0400 
0401     ret = 0;
0402     if (fib_lookup(net, &fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE) == 0) {
0403         if (res.type == RTN_UNICAST)
0404             ret = FIB_RES_NHC(res)->nhc_scope >= RT_SCOPE_LINK;
0405     }
0406     return ret;
0407 
0408 last_resort:
0409     if (rpf)
0410         goto e_rpf;
0411     *itag = 0;
0412     return 0;
0413 
0414 e_inval:
0415     return -EINVAL;
0416 e_rpf:
0417     return -EXDEV;
0418 }
0419 
0420 /* Ignore rp_filter for packets protected by IPsec. */
0421 int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
0422             u8 tos, int oif, struct net_device *dev,
0423             struct in_device *idev, u32 *itag)
0424 {
0425     int r = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(idev);
0426     struct net *net = dev_net(dev);
0427 
0428     if (!r && !fib_num_tclassid_users(net) &&
0429         (dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev))) {
0430         if (IN_DEV_ACCEPT_LOCAL(idev))
0431             goto ok;
0432         /* with custom local routes in place, checking local addresses
0433          * only will be too optimistic, with custom rules, checking
0434          * local addresses only can be too strict, e.g. due to vrf
0435          */
0436         if (net->ipv4.fib_has_custom_local_routes ||
0437             fib4_has_custom_rules(net))
0438             goto full_check;
0439         /* Within the same container, it is regarded as a martian source,
0440          * and the same host but different containers are not.
0441          */
0442         if (inet_lookup_ifaddr_rcu(net, src))
0443             return -EINVAL;
0444 
0445 ok:
0446         *itag = 0;
0447         return 0;
0448     }
0449 
0450 full_check:
0451     return __fib_validate_source(skb, src, dst, tos, oif, dev, r, idev, itag);
0452 }
0453 
0454 static inline __be32 sk_extract_addr(struct sockaddr *addr)
0455 {
0456     return ((struct sockaddr_in *) addr)->sin_addr.s_addr;
0457 }
0458 
0459 static int put_rtax(struct nlattr *mx, int len, int type, u32 value)
0460 {
0461     struct nlattr *nla;
0462 
0463     nla = (struct nlattr *) ((char *) mx + len);
0464     nla->nla_type = type;
0465     nla->nla_len = nla_attr_size(4);
0466     *(u32 *) nla_data(nla) = value;
0467 
0468     return len + nla_total_size(4);
0469 }
0470 
0471 static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt,
0472                  struct fib_config *cfg)
0473 {
0474     __be32 addr;
0475     int plen;
0476 
0477     memset(cfg, 0, sizeof(*cfg));
0478     cfg->fc_nlinfo.nl_net = net;
0479 
0480     if (rt->rt_dst.sa_family != AF_INET)
0481         return -EAFNOSUPPORT;
0482 
0483     /*
0484      * Check mask for validity:
0485      * a) it must be contiguous.
0486      * b) destination must have all host bits clear.
0487      * c) if application forgot to set correct family (AF_INET),
0488      *    reject request unless it is absolutely clear i.e.
0489      *    both family and mask are zero.
0490      */
0491     plen = 32;
0492     addr = sk_extract_addr(&rt->rt_dst);
0493     if (!(rt->rt_flags & RTF_HOST)) {
0494         __be32 mask = sk_extract_addr(&rt->rt_genmask);
0495 
0496         if (rt->rt_genmask.sa_family != AF_INET) {
0497             if (mask || rt->rt_genmask.sa_family)
0498                 return -EAFNOSUPPORT;
0499         }
0500 
0501         if (bad_mask(mask, addr))
0502             return -EINVAL;
0503 
0504         plen = inet_mask_len(mask);
0505     }
0506 
0507     cfg->fc_dst_len = plen;
0508     cfg->fc_dst = addr;
0509 
0510     if (cmd != SIOCDELRT) {
0511         cfg->fc_nlflags = NLM_F_CREATE;
0512         cfg->fc_protocol = RTPROT_BOOT;
0513     }
0514 
0515     if (rt->rt_metric)
0516         cfg->fc_priority = rt->rt_metric - 1;
0517 
0518     if (rt->rt_flags & RTF_REJECT) {
0519         cfg->fc_scope = RT_SCOPE_HOST;
0520         cfg->fc_type = RTN_UNREACHABLE;
0521         return 0;
0522     }
0523 
0524     cfg->fc_scope = RT_SCOPE_NOWHERE;
0525     cfg->fc_type = RTN_UNICAST;
0526 
0527     if (rt->rt_dev) {
0528         char *colon;
0529         struct net_device *dev;
0530         char devname[IFNAMSIZ];
0531 
0532         if (copy_from_user(devname, rt->rt_dev, IFNAMSIZ-1))
0533             return -EFAULT;
0534 
0535         devname[IFNAMSIZ-1] = 0;
0536         colon = strchr(devname, ':');
0537         if (colon)
0538             *colon = 0;
0539         dev = __dev_get_by_name(net, devname);
0540         if (!dev)
0541             return -ENODEV;
0542         cfg->fc_oif = dev->ifindex;
0543         cfg->fc_table = l3mdev_fib_table(dev);
0544         if (colon) {
0545             const struct in_ifaddr *ifa;
0546             struct in_device *in_dev;
0547 
0548             in_dev = __in_dev_get_rtnl(dev);
0549             if (!in_dev)
0550                 return -ENODEV;
0551 
0552             *colon = ':';
0553 
0554             rcu_read_lock();
0555             in_dev_for_each_ifa_rcu(ifa, in_dev) {
0556                 if (strcmp(ifa->ifa_label, devname) == 0)
0557                     break;
0558             }
0559             rcu_read_unlock();
0560 
0561             if (!ifa)
0562                 return -ENODEV;
0563             cfg->fc_prefsrc = ifa->ifa_local;
0564         }
0565     }
0566 
0567     addr = sk_extract_addr(&rt->rt_gateway);
0568     if (rt->rt_gateway.sa_family == AF_INET && addr) {
0569         unsigned int addr_type;
0570 
0571         cfg->fc_gw4 = addr;
0572         cfg->fc_gw_family = AF_INET;
0573         addr_type = inet_addr_type_table(net, addr, cfg->fc_table);
0574         if (rt->rt_flags & RTF_GATEWAY &&
0575             addr_type == RTN_UNICAST)
0576             cfg->fc_scope = RT_SCOPE_UNIVERSE;
0577     }
0578 
0579     if (cmd == SIOCDELRT)
0580         return 0;
0581 
0582     if (rt->rt_flags & RTF_GATEWAY && !cfg->fc_gw_family)
0583         return -EINVAL;
0584 
0585     if (cfg->fc_scope == RT_SCOPE_NOWHERE)
0586         cfg->fc_scope = RT_SCOPE_LINK;
0587 
0588     if (rt->rt_flags & (RTF_MTU | RTF_WINDOW | RTF_IRTT)) {
0589         struct nlattr *mx;
0590         int len = 0;
0591 
0592         mx = kcalloc(3, nla_total_size(4), GFP_KERNEL);
0593         if (!mx)
0594             return -ENOMEM;
0595 
0596         if (rt->rt_flags & RTF_MTU)
0597             len = put_rtax(mx, len, RTAX_ADVMSS, rt->rt_mtu - 40);
0598 
0599         if (rt->rt_flags & RTF_WINDOW)
0600             len = put_rtax(mx, len, RTAX_WINDOW, rt->rt_window);
0601 
0602         if (rt->rt_flags & RTF_IRTT)
0603             len = put_rtax(mx, len, RTAX_RTT, rt->rt_irtt << 3);
0604 
0605         cfg->fc_mx = mx;
0606         cfg->fc_mx_len = len;
0607     }
0608 
0609     return 0;
0610 }
0611 
0612 /*
0613  * Handle IP routing ioctl calls.
0614  * These are used to manipulate the routing tables
0615  */
0616 int ip_rt_ioctl(struct net *net, unsigned int cmd, struct rtentry *rt)
0617 {
0618     struct fib_config cfg;
0619     int err;
0620 
0621     switch (cmd) {
0622     case SIOCADDRT:     /* Add a route */
0623     case SIOCDELRT:     /* Delete a route */
0624         if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
0625             return -EPERM;
0626 
0627         rtnl_lock();
0628         err = rtentry_to_fib_config(net, cmd, rt, &cfg);
0629         if (err == 0) {
0630             struct fib_table *tb;
0631 
0632             if (cmd == SIOCDELRT) {
0633                 tb = fib_get_table(net, cfg.fc_table);
0634                 if (tb)
0635                     err = fib_table_delete(net, tb, &cfg,
0636                                    NULL);
0637                 else
0638                     err = -ESRCH;
0639             } else {
0640                 tb = fib_new_table(net, cfg.fc_table);
0641                 if (tb)
0642                     err = fib_table_insert(net, tb,
0643                                    &cfg, NULL);
0644                 else
0645                     err = -ENOBUFS;
0646             }
0647 
0648             /* allocated by rtentry_to_fib_config() */
0649             kfree(cfg.fc_mx);
0650         }
0651         rtnl_unlock();
0652         return err;
0653     }
0654     return -EINVAL;
0655 }
0656 
0657 const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = {
0658     [RTA_UNSPEC]        = { .strict_start_type = RTA_DPORT + 1 },
0659     [RTA_DST]       = { .type = NLA_U32 },
0660     [RTA_SRC]       = { .type = NLA_U32 },
0661     [RTA_IIF]       = { .type = NLA_U32 },
0662     [RTA_OIF]       = { .type = NLA_U32 },
0663     [RTA_GATEWAY]       = { .type = NLA_U32 },
0664     [RTA_PRIORITY]      = { .type = NLA_U32 },
0665     [RTA_PREFSRC]       = { .type = NLA_U32 },
0666     [RTA_METRICS]       = { .type = NLA_NESTED },
0667     [RTA_MULTIPATH]     = { .len = sizeof(struct rtnexthop) },
0668     [RTA_FLOW]      = { .type = NLA_U32 },
0669     [RTA_ENCAP_TYPE]    = { .type = NLA_U16 },
0670     [RTA_ENCAP]     = { .type = NLA_NESTED },
0671     [RTA_UID]       = { .type = NLA_U32 },
0672     [RTA_MARK]      = { .type = NLA_U32 },
0673     [RTA_TABLE]     = { .type = NLA_U32 },
0674     [RTA_IP_PROTO]      = { .type = NLA_U8 },
0675     [RTA_SPORT]     = { .type = NLA_U16 },
0676     [RTA_DPORT]     = { .type = NLA_U16 },
0677     [RTA_NH_ID]     = { .type = NLA_U32 },
0678 };
0679 
0680 int fib_gw_from_via(struct fib_config *cfg, struct nlattr *nla,
0681             struct netlink_ext_ack *extack)
0682 {
0683     struct rtvia *via;
0684     int alen;
0685 
0686     if (nla_len(nla) < offsetof(struct rtvia, rtvia_addr)) {
0687         NL_SET_ERR_MSG(extack, "Invalid attribute length for RTA_VIA");
0688         return -EINVAL;
0689     }
0690 
0691     via = nla_data(nla);
0692     alen = nla_len(nla) - offsetof(struct rtvia, rtvia_addr);
0693 
0694     switch (via->rtvia_family) {
0695     case AF_INET:
0696         if (alen != sizeof(__be32)) {
0697             NL_SET_ERR_MSG(extack, "Invalid IPv4 address in RTA_VIA");
0698             return -EINVAL;
0699         }
0700         cfg->fc_gw_family = AF_INET;
0701         cfg->fc_gw4 = *((__be32 *)via->rtvia_addr);
0702         break;
0703     case AF_INET6:
0704 #if IS_ENABLED(CONFIG_IPV6)
0705         if (alen != sizeof(struct in6_addr)) {
0706             NL_SET_ERR_MSG(extack, "Invalid IPv6 address in RTA_VIA");
0707             return -EINVAL;
0708         }
0709         cfg->fc_gw_family = AF_INET6;
0710         cfg->fc_gw6 = *((struct in6_addr *)via->rtvia_addr);
0711 #else
0712         NL_SET_ERR_MSG(extack, "IPv6 support not enabled in kernel");
0713         return -EINVAL;
0714 #endif
0715         break;
0716     default:
0717         NL_SET_ERR_MSG(extack, "Unsupported address family in RTA_VIA");
0718         return -EINVAL;
0719     }
0720 
0721     return 0;
0722 }
0723 
0724 static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
0725                  struct nlmsghdr *nlh, struct fib_config *cfg,
0726                  struct netlink_ext_ack *extack)
0727 {
0728     bool has_gw = false, has_via = false;
0729     struct nlattr *attr;
0730     int err, remaining;
0731     struct rtmsg *rtm;
0732 
0733     err = nlmsg_validate_deprecated(nlh, sizeof(*rtm), RTA_MAX,
0734                     rtm_ipv4_policy, extack);
0735     if (err < 0)
0736         goto errout;
0737 
0738     memset(cfg, 0, sizeof(*cfg));
0739 
0740     rtm = nlmsg_data(nlh);
0741 
0742     if (!inet_validate_dscp(rtm->rtm_tos)) {
0743         NL_SET_ERR_MSG(extack,
0744                    "Invalid dsfield (tos): ECN bits must be 0");
0745         err = -EINVAL;
0746         goto errout;
0747     }
0748     cfg->fc_dscp = inet_dsfield_to_dscp(rtm->rtm_tos);
0749 
0750     cfg->fc_dst_len = rtm->rtm_dst_len;
0751     cfg->fc_table = rtm->rtm_table;
0752     cfg->fc_protocol = rtm->rtm_protocol;
0753     cfg->fc_scope = rtm->rtm_scope;
0754     cfg->fc_type = rtm->rtm_type;
0755     cfg->fc_flags = rtm->rtm_flags;
0756     cfg->fc_nlflags = nlh->nlmsg_flags;
0757 
0758     cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
0759     cfg->fc_nlinfo.nlh = nlh;
0760     cfg->fc_nlinfo.nl_net = net;
0761 
0762     if (cfg->fc_type > RTN_MAX) {
0763         NL_SET_ERR_MSG(extack, "Invalid route type");
0764         err = -EINVAL;
0765         goto errout;
0766     }
0767 
0768     nlmsg_for_each_attr(attr, nlh, sizeof(struct rtmsg), remaining) {
0769         switch (nla_type(attr)) {
0770         case RTA_DST:
0771             cfg->fc_dst = nla_get_be32(attr);
0772             break;
0773         case RTA_OIF:
0774             cfg->fc_oif = nla_get_u32(attr);
0775             break;
0776         case RTA_GATEWAY:
0777             has_gw = true;
0778             cfg->fc_gw4 = nla_get_be32(attr);
0779             if (cfg->fc_gw4)
0780                 cfg->fc_gw_family = AF_INET;
0781             break;
0782         case RTA_VIA:
0783             has_via = true;
0784             err = fib_gw_from_via(cfg, attr, extack);
0785             if (err)
0786                 goto errout;
0787             break;
0788         case RTA_PRIORITY:
0789             cfg->fc_priority = nla_get_u32(attr);
0790             break;
0791         case RTA_PREFSRC:
0792             cfg->fc_prefsrc = nla_get_be32(attr);
0793             break;
0794         case RTA_METRICS:
0795             cfg->fc_mx = nla_data(attr);
0796             cfg->fc_mx_len = nla_len(attr);
0797             break;
0798         case RTA_MULTIPATH:
0799             err = lwtunnel_valid_encap_type_attr(nla_data(attr),
0800                                  nla_len(attr),
0801                                  extack);
0802             if (err < 0)
0803                 goto errout;
0804             cfg->fc_mp = nla_data(attr);
0805             cfg->fc_mp_len = nla_len(attr);
0806             break;
0807         case RTA_FLOW:
0808             cfg->fc_flow = nla_get_u32(attr);
0809             break;
0810         case RTA_TABLE:
0811             cfg->fc_table = nla_get_u32(attr);
0812             break;
0813         case RTA_ENCAP:
0814             cfg->fc_encap = attr;
0815             break;
0816         case RTA_ENCAP_TYPE:
0817             cfg->fc_encap_type = nla_get_u16(attr);
0818             err = lwtunnel_valid_encap_type(cfg->fc_encap_type,
0819                             extack);
0820             if (err < 0)
0821                 goto errout;
0822             break;
0823         case RTA_NH_ID:
0824             cfg->fc_nh_id = nla_get_u32(attr);
0825             break;
0826         }
0827     }
0828 
0829     if (cfg->fc_nh_id) {
0830         if (cfg->fc_oif || cfg->fc_gw_family ||
0831             cfg->fc_encap || cfg->fc_mp) {
0832             NL_SET_ERR_MSG(extack,
0833                        "Nexthop specification and nexthop id are mutually exclusive");
0834             return -EINVAL;
0835         }
0836     }
0837 
0838     if (has_gw && has_via) {
0839         NL_SET_ERR_MSG(extack,
0840                    "Nexthop configuration can not contain both GATEWAY and VIA");
0841         return -EINVAL;
0842     }
0843 
0844     return 0;
0845 errout:
0846     return err;
0847 }
0848 
0849 static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
0850                  struct netlink_ext_ack *extack)
0851 {
0852     struct net *net = sock_net(skb->sk);
0853     struct fib_config cfg;
0854     struct fib_table *tb;
0855     int err;
0856 
0857     err = rtm_to_fib_config(net, skb, nlh, &cfg, extack);
0858     if (err < 0)
0859         goto errout;
0860 
0861     if (cfg.fc_nh_id && !nexthop_find_by_id(net, cfg.fc_nh_id)) {
0862         NL_SET_ERR_MSG(extack, "Nexthop id does not exist");
0863         err = -EINVAL;
0864         goto errout;
0865     }
0866 
0867     tb = fib_get_table(net, cfg.fc_table);
0868     if (!tb) {
0869         NL_SET_ERR_MSG(extack, "FIB table does not exist");
0870         err = -ESRCH;
0871         goto errout;
0872     }
0873 
0874     err = fib_table_delete(net, tb, &cfg, extack);
0875 errout:
0876     return err;
0877 }
0878 
0879 static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
0880                  struct netlink_ext_ack *extack)
0881 {
0882     struct net *net = sock_net(skb->sk);
0883     struct fib_config cfg;
0884     struct fib_table *tb;
0885     int err;
0886 
0887     err = rtm_to_fib_config(net, skb, nlh, &cfg, extack);
0888     if (err < 0)
0889         goto errout;
0890 
0891     tb = fib_new_table(net, cfg.fc_table);
0892     if (!tb) {
0893         err = -ENOBUFS;
0894         goto errout;
0895     }
0896 
0897     err = fib_table_insert(net, tb, &cfg, extack);
0898     if (!err && cfg.fc_type == RTN_LOCAL)
0899         net->ipv4.fib_has_custom_local_routes = true;
0900 errout:
0901     return err;
0902 }
0903 
0904 int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh,
0905               struct fib_dump_filter *filter,
0906               struct netlink_callback *cb)
0907 {
0908     struct netlink_ext_ack *extack = cb->extack;
0909     struct nlattr *tb[RTA_MAX + 1];
0910     struct rtmsg *rtm;
0911     int err, i;
0912 
0913     ASSERT_RTNL();
0914 
0915     if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
0916         NL_SET_ERR_MSG(extack, "Invalid header for FIB dump request");
0917         return -EINVAL;
0918     }
0919 
0920     rtm = nlmsg_data(nlh);
0921     if (rtm->rtm_dst_len || rtm->rtm_src_len  || rtm->rtm_tos   ||
0922         rtm->rtm_scope) {
0923         NL_SET_ERR_MSG(extack, "Invalid values in header for FIB dump request");
0924         return -EINVAL;
0925     }
0926 
0927     if (rtm->rtm_flags & ~(RTM_F_CLONED | RTM_F_PREFIX)) {
0928         NL_SET_ERR_MSG(extack, "Invalid flags for FIB dump request");
0929         return -EINVAL;
0930     }
0931     if (rtm->rtm_flags & RTM_F_CLONED)
0932         filter->dump_routes = false;
0933     else
0934         filter->dump_exceptions = false;
0935 
0936     filter->flags    = rtm->rtm_flags;
0937     filter->protocol = rtm->rtm_protocol;
0938     filter->rt_type  = rtm->rtm_type;
0939     filter->table_id = rtm->rtm_table;
0940 
0941     err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
0942                         rtm_ipv4_policy, extack);
0943     if (err < 0)
0944         return err;
0945 
0946     for (i = 0; i <= RTA_MAX; ++i) {
0947         int ifindex;
0948 
0949         if (!tb[i])
0950             continue;
0951 
0952         switch (i) {
0953         case RTA_TABLE:
0954             filter->table_id = nla_get_u32(tb[i]);
0955             break;
0956         case RTA_OIF:
0957             ifindex = nla_get_u32(tb[i]);
0958             filter->dev = __dev_get_by_index(net, ifindex);
0959             if (!filter->dev)
0960                 return -ENODEV;
0961             break;
0962         default:
0963             NL_SET_ERR_MSG(extack, "Unsupported attribute in dump request");
0964             return -EINVAL;
0965         }
0966     }
0967 
0968     if (filter->flags || filter->protocol || filter->rt_type ||
0969         filter->table_id || filter->dev) {
0970         filter->filter_set = 1;
0971         cb->answer_flags = NLM_F_DUMP_FILTERED;
0972     }
0973 
0974     return 0;
0975 }
0976 EXPORT_SYMBOL_GPL(ip_valid_fib_dump_req);
0977 
0978 static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
0979 {
0980     struct fib_dump_filter filter = { .dump_routes = true,
0981                       .dump_exceptions = true };
0982     const struct nlmsghdr *nlh = cb->nlh;
0983     struct net *net = sock_net(skb->sk);
0984     unsigned int h, s_h;
0985     unsigned int e = 0, s_e;
0986     struct fib_table *tb;
0987     struct hlist_head *head;
0988     int dumped = 0, err;
0989 
0990     if (cb->strict_check) {
0991         err = ip_valid_fib_dump_req(net, nlh, &filter, cb);
0992         if (err < 0)
0993             return err;
0994     } else if (nlmsg_len(nlh) >= sizeof(struct rtmsg)) {
0995         struct rtmsg *rtm = nlmsg_data(nlh);
0996 
0997         filter.flags = rtm->rtm_flags & (RTM_F_PREFIX | RTM_F_CLONED);
0998     }
0999 
1000     /* ipv4 does not use prefix flag */
1001     if (filter.flags & RTM_F_PREFIX)
1002         return skb->len;
1003 
1004     if (filter.table_id) {
1005         tb = fib_get_table(net, filter.table_id);
1006         if (!tb) {
1007             if (rtnl_msg_family(cb->nlh) != PF_INET)
1008                 return skb->len;
1009 
1010             NL_SET_ERR_MSG(cb->extack, "ipv4: FIB table does not exist");
1011             return -ENOENT;
1012         }
1013 
1014         rcu_read_lock();
1015         err = fib_table_dump(tb, skb, cb, &filter);
1016         rcu_read_unlock();
1017         return skb->len ? : err;
1018     }
1019 
1020     s_h = cb->args[0];
1021     s_e = cb->args[1];
1022 
1023     rcu_read_lock();
1024 
1025     for (h = s_h; h < FIB_TABLE_HASHSZ; h++, s_e = 0) {
1026         e = 0;
1027         head = &net->ipv4.fib_table_hash[h];
1028         hlist_for_each_entry_rcu(tb, head, tb_hlist) {
1029             if (e < s_e)
1030                 goto next;
1031             if (dumped)
1032                 memset(&cb->args[2], 0, sizeof(cb->args) -
1033                          2 * sizeof(cb->args[0]));
1034             err = fib_table_dump(tb, skb, cb, &filter);
1035             if (err < 0) {
1036                 if (likely(skb->len))
1037                     goto out;
1038 
1039                 goto out_err;
1040             }
1041             dumped = 1;
1042 next:
1043             e++;
1044         }
1045     }
1046 out:
1047     err = skb->len;
1048 out_err:
1049     rcu_read_unlock();
1050 
1051     cb->args[1] = e;
1052     cb->args[0] = h;
1053 
1054     return err;
1055 }
1056 
1057 /* Prepare and feed intra-kernel routing request.
1058  * Really, it should be netlink message, but :-( netlink
1059  * can be not configured, so that we feed it directly
1060  * to fib engine. It is legal, because all events occur
1061  * only when netlink is already locked.
1062  */
1063 static void fib_magic(int cmd, int type, __be32 dst, int dst_len,
1064               struct in_ifaddr *ifa, u32 rt_priority)
1065 {
1066     struct net *net = dev_net(ifa->ifa_dev->dev);
1067     u32 tb_id = l3mdev_fib_table(ifa->ifa_dev->dev);
1068     struct fib_table *tb;
1069     struct fib_config cfg = {
1070         .fc_protocol = RTPROT_KERNEL,
1071         .fc_type = type,
1072         .fc_dst = dst,
1073         .fc_dst_len = dst_len,
1074         .fc_priority = rt_priority,
1075         .fc_prefsrc = ifa->ifa_local,
1076         .fc_oif = ifa->ifa_dev->dev->ifindex,
1077         .fc_nlflags = NLM_F_CREATE | NLM_F_APPEND,
1078         .fc_nlinfo = {
1079             .nl_net = net,
1080         },
1081     };
1082 
1083     if (!tb_id)
1084         tb_id = (type == RTN_UNICAST) ? RT_TABLE_MAIN : RT_TABLE_LOCAL;
1085 
1086     tb = fib_new_table(net, tb_id);
1087     if (!tb)
1088         return;
1089 
1090     cfg.fc_table = tb->tb_id;
1091 
1092     if (type != RTN_LOCAL)
1093         cfg.fc_scope = RT_SCOPE_LINK;
1094     else
1095         cfg.fc_scope = RT_SCOPE_HOST;
1096 
1097     if (cmd == RTM_NEWROUTE)
1098         fib_table_insert(net, tb, &cfg, NULL);
1099     else
1100         fib_table_delete(net, tb, &cfg, NULL);
1101 }
1102 
1103 void fib_add_ifaddr(struct in_ifaddr *ifa)
1104 {
1105     struct in_device *in_dev = ifa->ifa_dev;
1106     struct net_device *dev = in_dev->dev;
1107     struct in_ifaddr *prim = ifa;
1108     __be32 mask = ifa->ifa_mask;
1109     __be32 addr = ifa->ifa_local;
1110     __be32 prefix = ifa->ifa_address & mask;
1111 
1112     if (ifa->ifa_flags & IFA_F_SECONDARY) {
1113         prim = inet_ifa_byprefix(in_dev, prefix, mask);
1114         if (!prim) {
1115             pr_warn("%s: bug: prim == NULL\n", __func__);
1116             return;
1117         }
1118     }
1119 
1120     fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim, 0);
1121 
1122     if (!(dev->flags & IFF_UP))
1123         return;
1124 
1125     /* Add broadcast address, if it is explicitly assigned. */
1126     if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF)) {
1127         fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32,
1128               prim, 0);
1129         arp_invalidate(dev, ifa->ifa_broadcast, false);
1130     }
1131 
1132     if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags & IFA_F_SECONDARY) &&
1133         (prefix != addr || ifa->ifa_prefixlen < 32)) {
1134         if (!(ifa->ifa_flags & IFA_F_NOPREFIXROUTE))
1135             fib_magic(RTM_NEWROUTE,
1136                   dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
1137                   prefix, ifa->ifa_prefixlen, prim,
1138                   ifa->ifa_rt_priority);
1139 
1140         /* Add the network broadcast address, when it makes sense */
1141         if (ifa->ifa_prefixlen < 31) {
1142             fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix | ~mask,
1143                   32, prim, 0);
1144             arp_invalidate(dev, prefix | ~mask, false);
1145         }
1146     }
1147 }
1148 
1149 void fib_modify_prefix_metric(struct in_ifaddr *ifa, u32 new_metric)
1150 {
1151     __be32 prefix = ifa->ifa_address & ifa->ifa_mask;
1152     struct in_device *in_dev = ifa->ifa_dev;
1153     struct net_device *dev = in_dev->dev;
1154 
1155     if (!(dev->flags & IFF_UP) ||
1156         ifa->ifa_flags & (IFA_F_SECONDARY | IFA_F_NOPREFIXROUTE) ||
1157         ipv4_is_zeronet(prefix) ||
1158         (prefix == ifa->ifa_local && ifa->ifa_prefixlen == 32))
1159         return;
1160 
1161     /* add the new */
1162     fib_magic(RTM_NEWROUTE,
1163           dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
1164           prefix, ifa->ifa_prefixlen, ifa, new_metric);
1165 
1166     /* delete the old */
1167     fib_magic(RTM_DELROUTE,
1168           dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
1169           prefix, ifa->ifa_prefixlen, ifa, ifa->ifa_rt_priority);
1170 }
1171 
1172 /* Delete primary or secondary address.
1173  * Optionally, on secondary address promotion consider the addresses
1174  * from subnet iprim as deleted, even if they are in device list.
1175  * In this case the secondary ifa can be in device list.
1176  */
1177 void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim)
1178 {
1179     struct in_device *in_dev = ifa->ifa_dev;
1180     struct net_device *dev = in_dev->dev;
1181     struct in_ifaddr *ifa1;
1182     struct in_ifaddr *prim = ifa, *prim1 = NULL;
1183     __be32 brd = ifa->ifa_address | ~ifa->ifa_mask;
1184     __be32 any = ifa->ifa_address & ifa->ifa_mask;
1185 #define LOCAL_OK    1
1186 #define BRD_OK      2
1187 #define BRD0_OK     4
1188 #define BRD1_OK     8
1189     unsigned int ok = 0;
1190     int subnet = 0;     /* Primary network */
1191     int gone = 1;       /* Address is missing */
1192     int same_prefsrc = 0;   /* Another primary with same IP */
1193 
1194     if (ifa->ifa_flags & IFA_F_SECONDARY) {
1195         prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask);
1196         if (!prim) {
1197             /* if the device has been deleted, we don't perform
1198              * address promotion
1199              */
1200             if (!in_dev->dead)
1201                 pr_warn("%s: bug: prim == NULL\n", __func__);
1202             return;
1203         }
1204         if (iprim && iprim != prim) {
1205             pr_warn("%s: bug: iprim != prim\n", __func__);
1206             return;
1207         }
1208     } else if (!ipv4_is_zeronet(any) &&
1209            (any != ifa->ifa_local || ifa->ifa_prefixlen < 32)) {
1210         if (!(ifa->ifa_flags & IFA_F_NOPREFIXROUTE))
1211             fib_magic(RTM_DELROUTE,
1212                   dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
1213                   any, ifa->ifa_prefixlen, prim, 0);
1214         subnet = 1;
1215     }
1216 
1217     if (in_dev->dead)
1218         goto no_promotions;
1219 
1220     /* Deletion is more complicated than add.
1221      * We should take care of not to delete too much :-)
1222      *
1223      * Scan address list to be sure that addresses are really gone.
1224      */
1225     rcu_read_lock();
1226     in_dev_for_each_ifa_rcu(ifa1, in_dev) {
1227         if (ifa1 == ifa) {
1228             /* promotion, keep the IP */
1229             gone = 0;
1230             continue;
1231         }
1232         /* Ignore IFAs from our subnet */
1233         if (iprim && ifa1->ifa_mask == iprim->ifa_mask &&
1234             inet_ifa_match(ifa1->ifa_address, iprim))
1235             continue;
1236 
1237         /* Ignore ifa1 if it uses different primary IP (prefsrc) */
1238         if (ifa1->ifa_flags & IFA_F_SECONDARY) {
1239             /* Another address from our subnet? */
1240             if (ifa1->ifa_mask == prim->ifa_mask &&
1241                 inet_ifa_match(ifa1->ifa_address, prim))
1242                 prim1 = prim;
1243             else {
1244                 /* We reached the secondaries, so
1245                  * same_prefsrc should be determined.
1246                  */
1247                 if (!same_prefsrc)
1248                     continue;
1249                 /* Search new prim1 if ifa1 is not
1250                  * using the current prim1
1251                  */
1252                 if (!prim1 ||
1253                     ifa1->ifa_mask != prim1->ifa_mask ||
1254                     !inet_ifa_match(ifa1->ifa_address, prim1))
1255                     prim1 = inet_ifa_byprefix(in_dev,
1256                             ifa1->ifa_address,
1257                             ifa1->ifa_mask);
1258                 if (!prim1)
1259                     continue;
1260                 if (prim1->ifa_local != prim->ifa_local)
1261                     continue;
1262             }
1263         } else {
1264             if (prim->ifa_local != ifa1->ifa_local)
1265                 continue;
1266             prim1 = ifa1;
1267             if (prim != prim1)
1268                 same_prefsrc = 1;
1269         }
1270         if (ifa->ifa_local == ifa1->ifa_local)
1271             ok |= LOCAL_OK;
1272         if (ifa->ifa_broadcast == ifa1->ifa_broadcast)
1273             ok |= BRD_OK;
1274         if (brd == ifa1->ifa_broadcast)
1275             ok |= BRD1_OK;
1276         if (any == ifa1->ifa_broadcast)
1277             ok |= BRD0_OK;
1278         /* primary has network specific broadcasts */
1279         if (prim1 == ifa1 && ifa1->ifa_prefixlen < 31) {
1280             __be32 brd1 = ifa1->ifa_address | ~ifa1->ifa_mask;
1281             __be32 any1 = ifa1->ifa_address & ifa1->ifa_mask;
1282 
1283             if (!ipv4_is_zeronet(any1)) {
1284                 if (ifa->ifa_broadcast == brd1 ||
1285                     ifa->ifa_broadcast == any1)
1286                     ok |= BRD_OK;
1287                 if (brd == brd1 || brd == any1)
1288                     ok |= BRD1_OK;
1289                 if (any == brd1 || any == any1)
1290                     ok |= BRD0_OK;
1291             }
1292         }
1293     }
1294     rcu_read_unlock();
1295 
1296 no_promotions:
1297     if (!(ok & BRD_OK))
1298         fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32,
1299               prim, 0);
1300     if (subnet && ifa->ifa_prefixlen < 31) {
1301         if (!(ok & BRD1_OK))
1302             fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32,
1303                   prim, 0);
1304         if (!(ok & BRD0_OK))
1305             fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32,
1306                   prim, 0);
1307     }
1308     if (!(ok & LOCAL_OK)) {
1309         unsigned int addr_type;
1310 
1311         fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim, 0);
1312 
1313         /* Check, that this local address finally disappeared. */
1314         addr_type = inet_addr_type_dev_table(dev_net(dev), dev,
1315                              ifa->ifa_local);
1316         if (gone && addr_type != RTN_LOCAL) {
1317             /* And the last, but not the least thing.
1318              * We must flush stray FIB entries.
1319              *
1320              * First of all, we scan fib_info list searching
1321              * for stray nexthop entries, then ignite fib_flush.
1322              */
1323             if (fib_sync_down_addr(dev, ifa->ifa_local))
1324                 fib_flush(dev_net(dev));
1325         }
1326     }
1327 #undef LOCAL_OK
1328 #undef BRD_OK
1329 #undef BRD0_OK
1330 #undef BRD1_OK
1331 }
1332 
1333 static void nl_fib_lookup(struct net *net, struct fib_result_nl *frn)
1334 {
1335 
1336     struct fib_result       res;
1337     struct flowi4           fl4 = {
1338         .flowi4_mark = frn->fl_mark,
1339         .daddr = frn->fl_addr,
1340         .flowi4_tos = frn->fl_tos,
1341         .flowi4_scope = frn->fl_scope,
1342     };
1343     struct fib_table *tb;
1344 
1345     rcu_read_lock();
1346 
1347     tb = fib_get_table(net, frn->tb_id_in);
1348 
1349     frn->err = -ENOENT;
1350     if (tb) {
1351         local_bh_disable();
1352 
1353         frn->tb_id = tb->tb_id;
1354         frn->err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF);
1355 
1356         if (!frn->err) {
1357             frn->prefixlen = res.prefixlen;
1358             frn->nh_sel = res.nh_sel;
1359             frn->type = res.type;
1360             frn->scope = res.scope;
1361         }
1362         local_bh_enable();
1363     }
1364 
1365     rcu_read_unlock();
1366 }
1367 
1368 static void nl_fib_input(struct sk_buff *skb)
1369 {
1370     struct net *net;
1371     struct fib_result_nl *frn;
1372     struct nlmsghdr *nlh;
1373     u32 portid;
1374 
1375     net = sock_net(skb->sk);
1376     nlh = nlmsg_hdr(skb);
1377     if (skb->len < nlmsg_total_size(sizeof(*frn)) ||
1378         skb->len < nlh->nlmsg_len ||
1379         nlmsg_len(nlh) < sizeof(*frn))
1380         return;
1381 
1382     skb = netlink_skb_clone(skb, GFP_KERNEL);
1383     if (!skb)
1384         return;
1385     nlh = nlmsg_hdr(skb);
1386 
1387     frn = nlmsg_data(nlh);
1388     nl_fib_lookup(net, frn);
1389 
1390     portid = NETLINK_CB(skb).portid;      /* netlink portid */
1391     NETLINK_CB(skb).portid = 0;        /* from kernel */
1392     NETLINK_CB(skb).dst_group = 0;  /* unicast */
1393     nlmsg_unicast(net->ipv4.fibnl, skb, portid);
1394 }
1395 
1396 static int __net_init nl_fib_lookup_init(struct net *net)
1397 {
1398     struct sock *sk;
1399     struct netlink_kernel_cfg cfg = {
1400         .input  = nl_fib_input,
1401     };
1402 
1403     sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, &cfg);
1404     if (!sk)
1405         return -EAFNOSUPPORT;
1406     net->ipv4.fibnl = sk;
1407     return 0;
1408 }
1409 
1410 static void nl_fib_lookup_exit(struct net *net)
1411 {
1412     netlink_kernel_release(net->ipv4.fibnl);
1413     net->ipv4.fibnl = NULL;
1414 }
1415 
1416 static void fib_disable_ip(struct net_device *dev, unsigned long event,
1417                bool force)
1418 {
1419     if (fib_sync_down_dev(dev, event, force))
1420         fib_flush(dev_net(dev));
1421     else
1422         rt_cache_flush(dev_net(dev));
1423     arp_ifdown(dev);
1424 }
1425 
1426 static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr)
1427 {
1428     struct in_ifaddr *ifa = ptr;
1429     struct net_device *dev = ifa->ifa_dev->dev;
1430     struct net *net = dev_net(dev);
1431 
1432     switch (event) {
1433     case NETDEV_UP:
1434         fib_add_ifaddr(ifa);
1435 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1436         fib_sync_up(dev, RTNH_F_DEAD);
1437 #endif
1438         atomic_inc(&net->ipv4.dev_addr_genid);
1439         rt_cache_flush(dev_net(dev));
1440         break;
1441     case NETDEV_DOWN:
1442         fib_del_ifaddr(ifa, NULL);
1443         atomic_inc(&net->ipv4.dev_addr_genid);
1444         if (!ifa->ifa_dev->ifa_list) {
1445             /* Last address was deleted from this interface.
1446              * Disable IP.
1447              */
1448             fib_disable_ip(dev, event, true);
1449         } else {
1450             rt_cache_flush(dev_net(dev));
1451         }
1452         break;
1453     }
1454     return NOTIFY_DONE;
1455 }
1456 
1457 static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
1458 {
1459     struct net_device *dev = netdev_notifier_info_to_dev(ptr);
1460     struct netdev_notifier_changeupper_info *upper_info = ptr;
1461     struct netdev_notifier_info_ext *info_ext = ptr;
1462     struct in_device *in_dev;
1463     struct net *net = dev_net(dev);
1464     struct in_ifaddr *ifa;
1465     unsigned int flags;
1466 
1467     if (event == NETDEV_UNREGISTER) {
1468         fib_disable_ip(dev, event, true);
1469         rt_flush_dev(dev);
1470         return NOTIFY_DONE;
1471     }
1472 
1473     in_dev = __in_dev_get_rtnl(dev);
1474     if (!in_dev)
1475         return NOTIFY_DONE;
1476 
1477     switch (event) {
1478     case NETDEV_UP:
1479         in_dev_for_each_ifa_rtnl(ifa, in_dev) {
1480             fib_add_ifaddr(ifa);
1481         }
1482 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1483         fib_sync_up(dev, RTNH_F_DEAD);
1484 #endif
1485         atomic_inc(&net->ipv4.dev_addr_genid);
1486         rt_cache_flush(net);
1487         break;
1488     case NETDEV_DOWN:
1489         fib_disable_ip(dev, event, false);
1490         break;
1491     case NETDEV_CHANGE:
1492         flags = dev_get_flags(dev);
1493         if (flags & (IFF_RUNNING | IFF_LOWER_UP))
1494             fib_sync_up(dev, RTNH_F_LINKDOWN);
1495         else
1496             fib_sync_down_dev(dev, event, false);
1497         rt_cache_flush(net);
1498         break;
1499     case NETDEV_CHANGEMTU:
1500         fib_sync_mtu(dev, info_ext->ext.mtu);
1501         rt_cache_flush(net);
1502         break;
1503     case NETDEV_CHANGEUPPER:
1504         upper_info = ptr;
1505         /* flush all routes if dev is linked to or unlinked from
1506          * an L3 master device (e.g., VRF)
1507          */
1508         if (upper_info->upper_dev &&
1509             netif_is_l3_master(upper_info->upper_dev))
1510             fib_disable_ip(dev, NETDEV_DOWN, true);
1511         break;
1512     }
1513     return NOTIFY_DONE;
1514 }
1515 
1516 static struct notifier_block fib_inetaddr_notifier = {
1517     .notifier_call = fib_inetaddr_event,
1518 };
1519 
1520 static struct notifier_block fib_netdev_notifier = {
1521     .notifier_call = fib_netdev_event,
1522 };
1523 
1524 static int __net_init ip_fib_net_init(struct net *net)
1525 {
1526     int err;
1527     size_t size = sizeof(struct hlist_head) * FIB_TABLE_HASHSZ;
1528 
1529     err = fib4_notifier_init(net);
1530     if (err)
1531         return err;
1532 
1533 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1534     /* Default to 3-tuple */
1535     net->ipv4.sysctl_fib_multipath_hash_fields =
1536         FIB_MULTIPATH_HASH_FIELD_DEFAULT_MASK;
1537 #endif
1538 
1539     /* Avoid false sharing : Use at least a full cache line */
1540     size = max_t(size_t, size, L1_CACHE_BYTES);
1541 
1542     net->ipv4.fib_table_hash = kzalloc(size, GFP_KERNEL);
1543     if (!net->ipv4.fib_table_hash) {
1544         err = -ENOMEM;
1545         goto err_table_hash_alloc;
1546     }
1547 
1548     err = fib4_rules_init(net);
1549     if (err < 0)
1550         goto err_rules_init;
1551     return 0;
1552 
1553 err_rules_init:
1554     kfree(net->ipv4.fib_table_hash);
1555 err_table_hash_alloc:
1556     fib4_notifier_exit(net);
1557     return err;
1558 }
1559 
1560 static void ip_fib_net_exit(struct net *net)
1561 {
1562     int i;
1563 
1564     ASSERT_RTNL();
1565 #ifdef CONFIG_IP_MULTIPLE_TABLES
1566     RCU_INIT_POINTER(net->ipv4.fib_main, NULL);
1567     RCU_INIT_POINTER(net->ipv4.fib_default, NULL);
1568 #endif
1569     /* Destroy the tables in reverse order to guarantee that the
1570      * local table, ID 255, is destroyed before the main table, ID
1571      * 254. This is necessary as the local table may contain
1572      * references to data contained in the main table.
1573      */
1574     for (i = FIB_TABLE_HASHSZ - 1; i >= 0; i--) {
1575         struct hlist_head *head = &net->ipv4.fib_table_hash[i];
1576         struct hlist_node *tmp;
1577         struct fib_table *tb;
1578 
1579         hlist_for_each_entry_safe(tb, tmp, head, tb_hlist) {
1580             hlist_del(&tb->tb_hlist);
1581             fib_table_flush(net, tb, true);
1582             fib_free_table(tb);
1583         }
1584     }
1585 
1586 #ifdef CONFIG_IP_MULTIPLE_TABLES
1587     fib4_rules_exit(net);
1588 #endif
1589 
1590     kfree(net->ipv4.fib_table_hash);
1591     fib4_notifier_exit(net);
1592 }
1593 
1594 static int __net_init fib_net_init(struct net *net)
1595 {
1596     int error;
1597 
1598 #ifdef CONFIG_IP_ROUTE_CLASSID
1599     atomic_set(&net->ipv4.fib_num_tclassid_users, 0);
1600 #endif
1601     error = ip_fib_net_init(net);
1602     if (error < 0)
1603         goto out;
1604     error = nl_fib_lookup_init(net);
1605     if (error < 0)
1606         goto out_nlfl;
1607     error = fib_proc_init(net);
1608     if (error < 0)
1609         goto out_proc;
1610 out:
1611     return error;
1612 
1613 out_proc:
1614     nl_fib_lookup_exit(net);
1615 out_nlfl:
1616     rtnl_lock();
1617     ip_fib_net_exit(net);
1618     rtnl_unlock();
1619     goto out;
1620 }
1621 
1622 static void __net_exit fib_net_exit(struct net *net)
1623 {
1624     fib_proc_exit(net);
1625     nl_fib_lookup_exit(net);
1626 }
1627 
1628 static void __net_exit fib_net_exit_batch(struct list_head *net_list)
1629 {
1630     struct net *net;
1631 
1632     rtnl_lock();
1633     list_for_each_entry(net, net_list, exit_list)
1634         ip_fib_net_exit(net);
1635 
1636     rtnl_unlock();
1637 }
1638 
1639 static struct pernet_operations fib_net_ops = {
1640     .init = fib_net_init,
1641     .exit = fib_net_exit,
1642     .exit_batch = fib_net_exit_batch,
1643 };
1644 
1645 void __init ip_fib_init(void)
1646 {
1647     fib_trie_init();
1648 
1649     register_pernet_subsys(&fib_net_ops);
1650 
1651     register_netdevice_notifier(&fib_netdev_notifier);
1652     register_inetaddr_notifier(&fib_inetaddr_notifier);
1653 
1654     rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL, 0);
1655     rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL, 0);
1656     rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib, 0);
1657 }