// SPDX-License-Identifier: GPL-2.0
/* vrf.c: device driver to instantiate VRF (virtual routing and forwarding)
 * domains
 *
 * Authors: Shrijeet Mukherjee, David Ahern
 */
#include <linux/ethtool.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ip.h>
#include <linux/init.h>
#include <linux/moduleparam.h>
#include <linux/netfilter.h>
#include <linux/rtnetlink.h>
#include <net/rtnetlink.h>
#include <linux/u64_stats_sync.h>
#include <linux/hashtable.h>
#include <linux/jhash.h>
#include <linux/spinlock_types.h>

#include <linux/inetdevice.h>
#include <net/arp.h>
#include <net/ip.h>
#include <net/ip_fib.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/route.h>
#include <net/addrconf.h>
#include <net/l3mdev.h>
#include <net/fib_rules.h>
#include <net/sch_generic.h>
#include <net/netns/generic.h>
#include <net/netfilter/nf_conntrack.h>

#define DRV_NAME	"vrf"
#define DRV_VERSION	"1.1"

#define FIB_RULE_PREF	1000

#define HT_MAP_BITS	4
#define HASH_INITVAL	((u32)0xcafef00d)
struct vrf_map {
	DECLARE_HASHTABLE(ht, HT_MAP_BITS);
	spinlock_t vmap_lock;

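	/* shared_tables counts the tables that are currently bound to two or
	 * more VRF devices, i.e. that violate the strict-mode requirement of
	 * a 1:1 VRF-to-table association. Strict mode can only be enabled
	 * while this counter is zero.
	 */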
	u32 shared_tables;

	bool strict_mode;
};

struct vrf_map_elem {
	struct hlist_node hnode;
	struct list_head vrf_list;

	u32 table_id;
	int users;
	int ifindex;
};
static unsigned int vrf_net_id;

struct netns_vrf {
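	/* protected by the rtnl lock */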
	bool add_fib_rules;

	struct vrf_map vmap;
	struct ctl_table_header *ctl_hdr;
};

struct net_vrf {
	struct rtable __rcu *rth;
	struct rt6_info __rcu *rt6;
#if IS_ENABLED(CONFIG_IPV6)
	struct fib6_table *fib6_table;
#endif
	u32 tb_id;

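	/* this VRF device's entry in the owning vrf_map_elem's vrf_list */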
	struct list_head me_list;
	int ifindex;
};

struct pcpu_dstats {
	u64 tx_pkts;
	u64 tx_bytes;
	u64 tx_drps;
	u64 rx_pkts;
	u64 rx_bytes;
	u64 rx_drps;
	struct u64_stats_sync syncp;
};

static void vrf_rx_stats(struct net_device *dev, int len)
{
	struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats);

	u64_stats_update_begin(&dstats->syncp);
	dstats->rx_pkts++;
	dstats->rx_bytes += len;
	u64_stats_update_end(&dstats->syncp);
}

static void vrf_tx_error(struct net_device *vrf_dev, struct sk_buff *skb)
{
	vrf_dev->stats.tx_errors++;
	kfree_skb(skb);
}

static void vrf_get_stats64(struct net_device *dev,
			    struct rtnl_link_stats64 *stats)
{
	int i;

	for_each_possible_cpu(i) {
		const struct pcpu_dstats *dstats;
		u64 tbytes, tpkts, tdrops, rbytes, rpkts;
		unsigned int start;

		dstats = per_cpu_ptr(dev->dstats, i);
		do {
			start = u64_stats_fetch_begin_irq(&dstats->syncp);
			tbytes = dstats->tx_bytes;
			tpkts = dstats->tx_pkts;
			tdrops = dstats->tx_drps;
			rbytes = dstats->rx_bytes;
			rpkts = dstats->rx_pkts;
		} while (u64_stats_fetch_retry_irq(&dstats->syncp, start));
		stats->tx_bytes += tbytes;
		stats->tx_packets += tpkts;
		stats->tx_dropped += tdrops;
		stats->rx_bytes += rbytes;
		stats->rx_packets += rpkts;
	}
}

static struct vrf_map *netns_vrf_map(struct net *net)
{
	struct netns_vrf *nn_vrf = net_generic(net, vrf_net_id);

	return &nn_vrf->vmap;
}

static struct vrf_map *netns_vrf_map_by_dev(struct net_device *dev)
{
	return netns_vrf_map(dev_net(dev));
}

static int vrf_map_elem_get_vrf_ifindex(struct vrf_map_elem *me)
{
	struct list_head *me_head = &me->vrf_list;
	struct net_vrf *vrf;

	if (list_empty(me_head))
		return -ENODEV;

	vrf = list_first_entry(me_head, struct net_vrf, me_list);

	return vrf->ifindex;
}

static struct vrf_map_elem *vrf_map_elem_alloc(gfp_t flags)
{
	struct vrf_map_elem *me;

	me = kmalloc(sizeof(*me), flags);
	if (!me)
		return NULL;

	return me;
}

static void vrf_map_elem_free(struct vrf_map_elem *me)
{
	kfree(me);
}

static void vrf_map_elem_init(struct vrf_map_elem *me, int table_id,
			      int ifindex, int users)
{
	me->table_id = table_id;
	me->ifindex = ifindex;
	me->users = users;
	INIT_LIST_HEAD(&me->vrf_list);
}

static struct vrf_map_elem *vrf_map_lookup_elem(struct vrf_map *vmap,
						u32 table_id)
{
	struct vrf_map_elem *me;
	u32 key;

	key = jhash_1word(table_id, HASH_INITVAL);
	hash_for_each_possible(vmap->ht, me, hnode, key) {
		if (me->table_id == table_id)
			return me;
	}

	return NULL;
}

static void vrf_map_add_elem(struct vrf_map *vmap, struct vrf_map_elem *me)
{
	u32 table_id = me->table_id;
	u32 key;

	key = jhash_1word(table_id, HASH_INITVAL);
	hash_add(vmap->ht, &me->hnode, key);
}

static void vrf_map_del_elem(struct vrf_map_elem *me)
{
	hash_del(&me->hnode);
}

static void vrf_map_lock(struct vrf_map *vmap) __acquires(&vmap->vmap_lock)
{
	spin_lock(&vmap->vmap_lock);
}

static void vrf_map_unlock(struct vrf_map *vmap) __releases(&vmap->vmap_lock)
{
	spin_unlock(&vmap->vmap_lock);
}

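/* called with rtnl lock held */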
static int
vrf_map_register_dev(struct net_device *dev, struct netlink_ext_ack *extack)
{
	struct vrf_map *vmap = netns_vrf_map_by_dev(dev);
	struct net_vrf *vrf = netdev_priv(dev);
	struct vrf_map_elem *new_me, *me;
	u32 table_id = vrf->tb_id;
	bool free_new_me = false;
	int users;
	int res;

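	/* pre-allocate the element used in the spin-locked section below, so
	 * that the lock is held as briefly as possible
	 */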
	new_me = vrf_map_elem_alloc(GFP_KERNEL);
	if (!new_me)
		return -ENOMEM;

	vrf_map_elem_init(new_me, table_id, dev->ifindex, 0);

	vrf_map_lock(vmap);

	me = vrf_map_lookup_elem(vmap, table_id);
	if (!me) {
		me = new_me;
		vrf_map_add_elem(vmap, me);
		goto link_vrf;
	}

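	/* a map element already exists for this table id; reuse it and free
	 * the pre-allocated one once the lock is dropped
	 */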
	free_new_me = true;
	if (vmap->strict_mode) {
		NL_SET_ERR_MSG(extack, "Table is used by another VRF");
		res = -EBUSY;
		goto unlock;
	}

link_vrf:
	users = ++me->users;
	if (users == 2)
		++vmap->shared_tables;

	list_add(&vrf->me_list, &me->vrf_list);

	res = 0;

unlock:
	vrf_map_unlock(vmap);

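	/* clean-up, if needed */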
	if (free_new_me)
		vrf_map_elem_free(new_me);

	return res;
}

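/* called with rtnl lock held */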
static void vrf_map_unregister_dev(struct net_device *dev)
{
	struct vrf_map *vmap = netns_vrf_map_by_dev(dev);
	struct net_vrf *vrf = netdev_priv(dev);
	u32 table_id = vrf->tb_id;
	struct vrf_map_elem *me;
	int users;

	vrf_map_lock(vmap);

	me = vrf_map_lookup_elem(vmap, table_id);
	if (!me)
		goto unlock;

	list_del(&vrf->me_list);

	users = --me->users;
	if (users == 1) {
		--vmap->shared_tables;
	} else if (users == 0) {
		vrf_map_del_elem(me);

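		/* no one will refer to this element anymore */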
		vrf_map_elem_free(me);
	}

unlock:
	vrf_map_unlock(vmap);
}

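/* return the vrf device index associated with the table_id */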
static int vrf_ifindex_lookup_by_table_id(struct net *net, u32 table_id)
{
	struct vrf_map *vmap = netns_vrf_map(net);
	struct vrf_map_elem *me;
	int ifindex;

	vrf_map_lock(vmap);

	if (!vmap->strict_mode) {
		ifindex = -EPERM;
		goto unlock;
	}

	me = vrf_map_lookup_elem(vmap, table_id);
	if (!me) {
		ifindex = -ENODEV;
		goto unlock;
	}

	ifindex = vrf_map_elem_get_vrf_ifindex(me);

unlock:
	vrf_map_unlock(vmap);

	return ifindex;
}

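/* by default VRF devices do not have a qdisc and are expected
 * to be created with only a single queue
 */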
static bool qdisc_tx_is_default(const struct net_device *dev)
{
	struct netdev_queue *txq;
	struct Qdisc *qdisc;

	if (dev->num_tx_queues > 1)
		return false;

	txq = netdev_get_tx_queue(dev, 0);
	qdisc = rcu_access_pointer(txq->qdisc);

	return !qdisc->enqueue;
}

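/* Local traffic destined to a local address: reinsert the packet into
 * the Rx path, as it would otherwise go out via the loopback device.
 */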
static int vrf_local_xmit(struct sk_buff *skb, struct net_device *dev,
			  struct dst_entry *dst)
{
	int len = skb->len;

	skb_orphan(skb);

	skb_dst_set(skb, dst);

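	/* set pkt_type to avoid skb hitting packet taps twice -
	 * once on Tx and again in Rx processing
	 */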
	skb->pkt_type = PACKET_LOOPBACK;

	skb->protocol = eth_type_trans(skb, dev);

	if (likely(__netif_rx(skb) == NET_RX_SUCCESS))
		vrf_rx_stats(dev, len);
	else
		this_cpu_inc(dev->dstats->rx_drps);

	return NETDEV_TX_OK;
}

static void vrf_nf_set_untracked(struct sk_buff *skb)
{
	if (skb_get_nfct(skb) == 0)
		nf_ct_set(skb, NULL, IP_CT_UNTRACKED);
}

static void vrf_nf_reset_ct(struct sk_buff *skb)
{
	if (skb_get_nfct(skb) == IP_CT_UNTRACKED)
		nf_reset_ct(skb);
}

#if IS_ENABLED(CONFIG_IPV6)
static int vrf_ip6_local_out(struct net *net, struct sock *sk,
			     struct sk_buff *skb)
{
	int err;

	vrf_nf_reset_ct(skb);

	err = nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net,
		      sk, skb, NULL, skb_dst(skb)->dev, dst_output);

	if (likely(err == 1))
		err = dst_output(net, sk, skb);

	return err;
}

static netdev_tx_t vrf_process_v6_outbound(struct sk_buff *skb,
					   struct net_device *dev)
{
	const struct ipv6hdr *iph;
	struct net *net = dev_net(skb->dev);
	struct flowi6 fl6;
	int ret = NET_XMIT_DROP;
	struct dst_entry *dst;
	struct dst_entry *dst_null = &net->ipv6.ip6_null_entry->dst;

	if (!pskb_may_pull(skb, ETH_HLEN + sizeof(struct ipv6hdr)))
		goto err;

	iph = ipv6_hdr(skb);

	memset(&fl6, 0, sizeof(fl6));

	fl6.flowi6_l3mdev = dev->ifindex;
	fl6.flowi6_iif = LOOPBACK_IFINDEX;
	fl6.daddr = iph->daddr;
	fl6.saddr = iph->saddr;
	fl6.flowlabel = ip6_flowinfo(iph);
	fl6.flowi6_mark = skb->mark;
	fl6.flowi6_proto = iph->nexthdr;

	dst = ip6_dst_lookup_flow(net, NULL, &fl6, NULL);
	if (IS_ERR(dst) || dst == dst_null)
		goto err;

	skb_dst_drop(skb);

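	/* if dst.dev is the VRF device again this is locally originated
	 * traffic destined to a local address. Short circuit to Rx path.
	 */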
	if (dst->dev == dev)
		return vrf_local_xmit(skb, dev, dst);

	skb_dst_set(skb, dst);

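	/* strip the ethernet header added for pass through VRF device */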
	__skb_pull(skb, skb_network_offset(skb));

	memset(IP6CB(skb), 0, sizeof(*IP6CB(skb)));
	ret = vrf_ip6_local_out(net, skb->sk, skb);
	if (unlikely(net_xmit_eval(ret)))
		dev->stats.tx_errors++;
	else
		ret = NET_XMIT_SUCCESS;

	return ret;
err:
	vrf_tx_error(dev, skb);
	return NET_XMIT_DROP;
}
#else
static netdev_tx_t vrf_process_v6_outbound(struct sk_buff *skb,
					   struct net_device *dev)
{
	vrf_tx_error(dev, skb);
	return NET_XMIT_DROP;
}
#endif

static int vrf_ip_local_out(struct net *net, struct sock *sk,
			    struct sk_buff *skb)
{
	int err;

	vrf_nf_reset_ct(skb);

	err = nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, net, sk,
		      skb, NULL, skb_dst(skb)->dev, dst_output);
	if (likely(err == 1))
		err = dst_output(net, sk, skb);

	return err;
}

static netdev_tx_t vrf_process_v4_outbound(struct sk_buff *skb,
					   struct net_device *vrf_dev)
{
	struct iphdr *ip4h;
	int ret = NET_XMIT_DROP;
	struct flowi4 fl4;
	struct net *net = dev_net(vrf_dev);
	struct rtable *rt;

	if (!pskb_may_pull(skb, ETH_HLEN + sizeof(struct iphdr)))
		goto err;

	ip4h = ip_hdr(skb);

	memset(&fl4, 0, sizeof(fl4));

	fl4.flowi4_l3mdev = vrf_dev->ifindex;
	fl4.flowi4_iif = LOOPBACK_IFINDEX;
	fl4.flowi4_tos = RT_TOS(ip4h->tos);
	fl4.flowi4_flags = FLOWI_FLAG_ANYSRC;
	fl4.flowi4_proto = ip4h->protocol;
	fl4.daddr = ip4h->daddr;
	fl4.saddr = ip4h->saddr;

	rt = ip_route_output_flow(net, &fl4, NULL);
	if (IS_ERR(rt))
		goto err;

	skb_dst_drop(skb);

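	/* if dst.dev is the VRF device again this is locally originated
	 * traffic destined to a local address. Short circuit to Rx path.
	 */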
	if (rt->dst.dev == vrf_dev)
		return vrf_local_xmit(skb, vrf_dev, &rt->dst);

	skb_dst_set(skb, &rt->dst);

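	/* strip the ethernet header added for pass through VRF device */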
	__skb_pull(skb, skb_network_offset(skb));

	if (!ip4h->saddr) {
		ip4h->saddr = inet_select_addr(skb_dst(skb)->dev, 0,
					       RT_SCOPE_LINK);
	}

	memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
	ret = vrf_ip_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb);
	if (unlikely(net_xmit_eval(ret)))
		vrf_dev->stats.tx_errors++;
	else
		ret = NET_XMIT_SUCCESS;

out:
	return ret;
err:
	vrf_tx_error(vrf_dev, skb);
	goto out;
}

static netdev_tx_t is_ip_tx_frame(struct sk_buff *skb, struct net_device *dev)
{
	switch (skb->protocol) {
	case htons(ETH_P_IP):
		return vrf_process_v4_outbound(skb, dev);
	case htons(ETH_P_IPV6):
		return vrf_process_v6_outbound(skb, dev);
	default:
		vrf_tx_error(dev, skb);
		return NET_XMIT_DROP;
	}
}

static netdev_tx_t vrf_xmit(struct sk_buff *skb, struct net_device *dev)
{
	int len = skb->len;
	netdev_tx_t ret = is_ip_tx_frame(skb, dev);

	if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) {
		struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats);

		u64_stats_update_begin(&dstats->syncp);
		dstats->tx_pkts++;
		dstats->tx_bytes += len;
		u64_stats_update_end(&dstats->syncp);
	} else {
		this_cpu_inc(dev->dstats->tx_drps);
	}

	return ret;
}

static void vrf_finish_direct(struct sk_buff *skb)
{
	struct net_device *vrf_dev = skb->dev;

	if (!list_empty(&vrf_dev->ptype_all) &&
	    likely(skb_headroom(skb) >= ETH_HLEN)) {
		struct ethhdr *eth = skb_push(skb, ETH_HLEN);

		ether_addr_copy(eth->h_source, vrf_dev->dev_addr);
		eth_zero_addr(eth->h_dest);
		eth->h_proto = skb->protocol;

		rcu_read_lock_bh();
		dev_queue_xmit_nit(skb, vrf_dev);
		rcu_read_unlock_bh();

		skb_pull(skb, ETH_HLEN);
	}

	vrf_nf_reset_ct(skb);
}

#if IS_ENABLED(CONFIG_IPV6)
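/* modelled after ip6_finish_output2 */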
static int vrf_finish_output6(struct net *net, struct sock *sk,
			      struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	const struct in6_addr *nexthop;
	struct neighbour *neigh;
	int ret;

	vrf_nf_reset_ct(skb);

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	rcu_read_lock_bh();
	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
	if (unlikely(!neigh))
		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
	if (!IS_ERR(neigh)) {
		sock_confirm_neigh(skb, neigh);
		ret = neigh_output(neigh, skb, false);
		rcu_read_unlock_bh();
		return ret;
	}
	rcu_read_unlock_bh();

	IP6_INC_STATS(dev_net(dst->dev),
		      ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}

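/* modelled after ip6_output */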
static int vrf_output6(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
			    net, sk, skb, NULL, skb_dst(skb)->dev,
			    vrf_finish_output6,
			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}

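/* set dst on skb to send packet to us via dev_xmit path. Allows
 * packet to go through device based features such as qdisc, netfilter
 * hooks and packet sockets with skb->dev set to vrf device.
 */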
static struct sk_buff *vrf_ip6_out_redirect(struct net_device *vrf_dev,
					    struct sk_buff *skb)
{
	struct net_vrf *vrf = netdev_priv(vrf_dev);
	struct dst_entry *dst = NULL;
	struct rt6_info *rt6;

	rcu_read_lock();

	rt6 = rcu_dereference(vrf->rt6);
	if (likely(rt6)) {
		dst = &rt6->dst;
		dst_hold(dst);
	}

	rcu_read_unlock();

	if (unlikely(!dst)) {
		vrf_tx_error(vrf_dev, skb);
		return NULL;
	}

	skb_dst_drop(skb);
	skb_dst_set(skb, dst);

	return skb;
}

static int vrf_output6_direct_finish(struct net *net, struct sock *sk,
				     struct sk_buff *skb)
{
	vrf_finish_direct(skb);

	return vrf_ip6_local_out(net, sk, skb);
}

static int vrf_output6_direct(struct net *net, struct sock *sk,
			      struct sk_buff *skb)
{
	int err = 1;

	skb->protocol = htons(ETH_P_IPV6);

	if (!(IPCB(skb)->flags & IPSKB_REROUTED))
		err = nf_hook(NFPROTO_IPV6, NF_INET_POST_ROUTING, net, sk, skb,
			      NULL, skb->dev, vrf_output6_direct_finish);

	if (likely(err == 1))
		vrf_finish_direct(skb);

	return err;
}

static int vrf_ip6_out_direct_finish(struct net *net, struct sock *sk,
				     struct sk_buff *skb)
{
	int err;

	err = vrf_output6_direct(net, sk, skb);
	if (likely(err == 1))
		err = vrf_ip6_local_out(net, sk, skb);

	return err;
}

static struct sk_buff *vrf_ip6_out_direct(struct net_device *vrf_dev,
					  struct sock *sk,
					  struct sk_buff *skb)
{
	struct net *net = dev_net(vrf_dev);
	int err;

	skb->dev = vrf_dev;

	err = nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk,
		      skb, NULL, vrf_dev, vrf_ip6_out_direct_finish);

	if (likely(err == 1))
		err = vrf_output6_direct(net, sk, skb);

	if (likely(err == 1))
		return skb;

	return NULL;
}

static struct sk_buff *vrf_ip6_out(struct net_device *vrf_dev,
				   struct sock *sk,
				   struct sk_buff *skb)
{
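	/* don't divert link scope packets to the VRF device */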
	if (rt6_need_strict(&ipv6_hdr(skb)->daddr))
		return skb;

	vrf_nf_set_untracked(skb);

	if (qdisc_tx_is_default(vrf_dev) ||
	    IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED)
		return vrf_ip6_out_direct(vrf_dev, sk, skb);

	return vrf_ip6_out_redirect(vrf_dev, skb);
}

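/* holding rtnl */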
static void vrf_rt6_release(struct net_device *dev, struct net_vrf *vrf)
{
	struct rt6_info *rt6 = rtnl_dereference(vrf->rt6);
	struct net *net = dev_net(dev);
	struct dst_entry *dst;

	RCU_INIT_POINTER(vrf->rt6, NULL);
	synchronize_rcu();

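	/* move dev in dst's to loopback so this VRF device can be deleted
	 * - based on dst_ifdown
	 */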
	if (rt6) {
		dst = &rt6->dst;
		netdev_ref_replace(dst->dev, net->loopback_dev,
				   &dst->dev_tracker, GFP_KERNEL);
		dst->dev = net->loopback_dev;
		dst_release(dst);
	}
}

static int vrf_rt6_create(struct net_device *dev)
{
	int flags = DST_NOPOLICY | DST_NOXFRM;
	struct net_vrf *vrf = netdev_priv(dev);
	struct net *net = dev_net(dev);
	struct rt6_info *rt6;
	int rc = -ENOMEM;

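	/* IPv6 can be CONFIG enabled and then disabled at runtime */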
	if (!ipv6_mod_enabled())
		return 0;

	vrf->fib6_table = fib6_new_table(net, vrf->tb_id);
	if (!vrf->fib6_table)
		goto out;

	rt6 = ip6_dst_alloc(net, dev, flags);
	if (!rt6)
		goto out;

	rt6->dst.output = vrf_output6;

	rcu_assign_pointer(vrf->rt6, rt6);

	rc = 0;
out:
	return rc;
}
#else
static struct sk_buff *vrf_ip6_out(struct net_device *vrf_dev,
				   struct sock *sk,
				   struct sk_buff *skb)
{
	return skb;
}

static void vrf_rt6_release(struct net_device *dev, struct net_vrf *vrf)
{
}

static int vrf_rt6_create(struct net_device *dev)
{
	return 0;
}
#endif

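/* modelled after ip_finish_output2 */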
static int vrf_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct rtable *rt = (struct rtable *)dst;
	struct net_device *dev = dst->dev;
	unsigned int hh_len = LL_RESERVED_SPACE(dev);
	struct neighbour *neigh;
	bool is_v6gw = false;

	vrf_nf_reset_ct(skb);

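	/* Be paranoid, rather than too clever. */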
	if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
		skb = skb_expand_head(skb, hh_len);
		if (!skb) {
			dev->stats.tx_errors++;
			return -ENOMEM;
		}
	}

	rcu_read_lock_bh();

	neigh = ip_neigh_for_gw(rt, skb, &is_v6gw);
	if (!IS_ERR(neigh)) {
		int ret;

		sock_confirm_neigh(skb, neigh);

		ret = neigh_output(neigh, skb, is_v6gw);
		rcu_read_unlock_bh();
		return ret;
	}

	rcu_read_unlock_bh();
	vrf_tx_error(skb->dev, skb);
	return -EINVAL;
}

static int vrf_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;

	IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len);

	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);

	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
			    net, sk, skb, NULL, dev,
			    vrf_finish_output,
			    !(IPCB(skb)->flags & IPSKB_REROUTED));
}

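/* set dst on skb to send packet to us via dev_xmit path. Allows
 * packet to go through device based features such as qdisc, netfilter
 * hooks and packet sockets with skb->dev set to vrf device.
 */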
static struct sk_buff *vrf_ip_out_redirect(struct net_device *vrf_dev,
					   struct sk_buff *skb)
{
	struct net_vrf *vrf = netdev_priv(vrf_dev);
	struct dst_entry *dst = NULL;
	struct rtable *rth;

	rcu_read_lock();

	rth = rcu_dereference(vrf->rth);
	if (likely(rth)) {
		dst = &rth->dst;
		dst_hold(dst);
	}

	rcu_read_unlock();

	if (unlikely(!dst)) {
		vrf_tx_error(vrf_dev, skb);
		return NULL;
	}

	skb_dst_drop(skb);
	skb_dst_set(skb, dst);

	return skb;
}

static int vrf_output_direct_finish(struct net *net, struct sock *sk,
				    struct sk_buff *skb)
{
	vrf_finish_direct(skb);

	return vrf_ip_local_out(net, sk, skb);
}

static int vrf_output_direct(struct net *net, struct sock *sk,
			     struct sk_buff *skb)
{
	int err = 1;

	skb->protocol = htons(ETH_P_IP);

	if (!(IPCB(skb)->flags & IPSKB_REROUTED))
		err = nf_hook(NFPROTO_IPV4, NF_INET_POST_ROUTING, net, sk, skb,
			      NULL, skb->dev, vrf_output_direct_finish);

	if (likely(err == 1))
		vrf_finish_direct(skb);

	return err;
}

static int vrf_ip_out_direct_finish(struct net *net, struct sock *sk,
				    struct sk_buff *skb)
{
	int err;

	err = vrf_output_direct(net, sk, skb);
	if (likely(err == 1))
		err = vrf_ip_local_out(net, sk, skb);

	return err;
}

static struct sk_buff *vrf_ip_out_direct(struct net_device *vrf_dev,
					 struct sock *sk,
					 struct sk_buff *skb)
{
	struct net *net = dev_net(vrf_dev);
	int err;

	skb->dev = vrf_dev;

	err = nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, net, sk,
		      skb, NULL, vrf_dev, vrf_ip_out_direct_finish);

	if (likely(err == 1))
		err = vrf_output_direct(net, sk, skb);

	if (likely(err == 1))
		return skb;

	return NULL;
}

static struct sk_buff *vrf_ip_out(struct net_device *vrf_dev,
				  struct sock *sk,
				  struct sk_buff *skb)
{
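	/* don't divert multicast or local broadcast */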
	if (ipv4_is_multicast(ip_hdr(skb)->daddr) ||
	    ipv4_is_lbcast(ip_hdr(skb)->daddr))
		return skb;

	vrf_nf_set_untracked(skb);

	if (qdisc_tx_is_default(vrf_dev) ||
	    IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED)
		return vrf_ip_out_direct(vrf_dev, sk, skb);

	return vrf_ip_out_redirect(vrf_dev, skb);
}

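/* called with rcu lock held */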
static struct sk_buff *vrf_l3_out(struct net_device *vrf_dev,
				  struct sock *sk,
				  struct sk_buff *skb,
				  u16 proto)
{
	switch (proto) {
	case AF_INET:
		return vrf_ip_out(vrf_dev, sk, skb);
	case AF_INET6:
		return vrf_ip6_out(vrf_dev, sk, skb);
	}

	return skb;
}

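/* holding rtnl */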
static void vrf_rtable_release(struct net_device *dev, struct net_vrf *vrf)
{
	struct rtable *rth = rtnl_dereference(vrf->rth);
	struct net *net = dev_net(dev);
	struct dst_entry *dst;

	RCU_INIT_POINTER(vrf->rth, NULL);
	synchronize_rcu();

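	/* move dev in dst's to loopback so this VRF device can be deleted
	 * - based on dst_ifdown
	 */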
	if (rth) {
		dst = &rth->dst;
		netdev_ref_replace(dst->dev, net->loopback_dev,
				   &dst->dev_tracker, GFP_KERNEL);
		dst->dev = net->loopback_dev;
		dst_release(dst);
	}
}

static int vrf_rtable_create(struct net_device *dev)
{
	struct net_vrf *vrf = netdev_priv(dev);
	struct rtable *rth;

	if (!fib_new_table(dev_net(dev), vrf->tb_id))
		return -ENOMEM;

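	/* create a dst for routing packets out through a VRF device */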
	rth = rt_dst_alloc(dev, 0, RTN_UNICAST, 1);
	if (!rth)
		return -ENOMEM;

	rth->dst.output = vrf_output;

	rcu_assign_pointer(vrf->rth, rth);

	return 0;
}

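/* cycle interface to flush neighbor cache and move routes across tables */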
static void cycle_netdev(struct net_device *dev,
			 struct netlink_ext_ack *extack)
{
	unsigned int flags = dev->flags;
	int ret;

	if (!netif_running(dev))
		return;

	ret = dev_change_flags(dev, flags & ~IFF_UP, extack);
	if (ret >= 0)
		ret = dev_change_flags(dev, flags, extack);

	if (ret < 0) {
		netdev_err(dev,
			   "Failed to cycle device %s; route tables might be wrong!\n",
			   dev->name);
	}
}

static int do_vrf_add_slave(struct net_device *dev, struct net_device *port_dev,
			    struct netlink_ext_ack *extack)
{
	int ret;

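	/* do not allow loopback device to be enslaved to a VRF.
	 * The vrf device acts as the loopback for the vrf.
	 */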
	if (port_dev == dev_net(dev)->loopback_dev) {
		NL_SET_ERR_MSG(extack,
			       "Can not enslave loopback device to a VRF");
		return -EOPNOTSUPP;
	}

	port_dev->priv_flags |= IFF_L3MDEV_SLAVE;
	ret = netdev_master_upper_dev_link(port_dev, dev, NULL, NULL, extack);
	if (ret < 0)
		goto err;

	cycle_netdev(port_dev, extack);

	return 0;

err:
	port_dev->priv_flags &= ~IFF_L3MDEV_SLAVE;
	return ret;
}

static int vrf_add_slave(struct net_device *dev, struct net_device *port_dev,
			 struct netlink_ext_ack *extack)
{
	if (netif_is_l3_master(port_dev)) {
		NL_SET_ERR_MSG(extack,
			       "Can not enslave an L3 master device to a VRF");
		return -EINVAL;
	}

	if (netif_is_l3_slave(port_dev))
		return -EINVAL;

	return do_vrf_add_slave(dev, port_dev, extack);
}

static int do_vrf_del_slave(struct net_device *dev, struct net_device *port_dev)
{
	netdev_upper_dev_unlink(port_dev, dev);
	port_dev->priv_flags &= ~IFF_L3MDEV_SLAVE;

	cycle_netdev(port_dev, NULL);

	return 0;
}

static int vrf_del_slave(struct net_device *dev, struct net_device *port_dev)
{
	return do_vrf_del_slave(dev, port_dev);
}

static void vrf_dev_uninit(struct net_device *dev)
{
	struct net_vrf *vrf = netdev_priv(dev);

	vrf_rtable_release(dev, vrf);
	vrf_rt6_release(dev, vrf);

	free_percpu(dev->dstats);
	dev->dstats = NULL;
}

static int vrf_dev_init(struct net_device *dev)
{
	struct net_vrf *vrf = netdev_priv(dev);

	dev->dstats = netdev_alloc_pcpu_stats(struct pcpu_dstats);
	if (!dev->dstats)
		goto out_nomem;

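	/* create the default dst which points back to us */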
	if (vrf_rtable_create(dev) != 0)
		goto out_stats;

	if (vrf_rt6_create(dev) != 0)
		goto out_rth;

	dev->flags = IFF_MASTER | IFF_NOARP;

	dev->operstate = IF_OPER_UP;
	netdev_lockdep_set_classes(dev);
	return 0;

out_rth:
	vrf_rtable_release(dev, vrf);
out_stats:
	free_percpu(dev->dstats);
	dev->dstats = NULL;
out_nomem:
	return -ENOMEM;
}

static const struct net_device_ops vrf_netdev_ops = {
	.ndo_init		= vrf_dev_init,
	.ndo_uninit		= vrf_dev_uninit,
	.ndo_start_xmit		= vrf_xmit,
	.ndo_set_mac_address	= eth_mac_addr,
	.ndo_get_stats64	= vrf_get_stats64,
	.ndo_add_slave		= vrf_add_slave,
	.ndo_del_slave		= vrf_del_slave,
};

static u32 vrf_fib_table(const struct net_device *dev)
{
	struct net_vrf *vrf = netdev_priv(dev);

	return vrf->tb_id;
}

static int vrf_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	kfree_skb(skb);
	return 0;
}

static struct sk_buff *vrf_rcv_nfhook(u8 pf, unsigned int hook,
				      struct sk_buff *skb,
				      struct net_device *dev)
{
	struct net *net = dev_net(dev);

	if (nf_hook(pf, hook, net, NULL, skb, dev, NULL, vrf_rcv_finish) != 1)
		skb = NULL;

	return skb;
}

static int vrf_prepare_mac_header(struct sk_buff *skb,
				  struct net_device *vrf_dev, u16 proto)
{
	struct ethhdr *eth;
	int err;

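	/* in general, we do not know if there is enough space in the head of
	 * the packet for hosting the mac header
	 */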
	err = skb_cow_head(skb, LL_RESERVED_SPACE(vrf_dev));
	if (unlikely(err))
		return -ENOBUFS;

	__skb_push(skb, ETH_HLEN);
	eth = (struct ethhdr *)skb->data;

	skb_reset_mac_header(skb);
	skb_reset_mac_len(skb);

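	/* both the destination and the source Ethernet addresses are set to
	 * the address of the VRF device itself
	 */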
	ether_addr_copy(eth->h_dest, vrf_dev->dev_addr);
	ether_addr_copy(eth->h_source, vrf_dev->dev_addr);
	eth->h_proto = htons(proto);

	skb->protocol = eth->h_proto;
	skb->pkt_type = PACKET_HOST;

	skb_postpush_rcsum(skb, skb->data, ETH_HLEN);

	skb_pull_inline(skb, ETH_HLEN);

	return 0;
}

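/* prepare and add the mac header to the packet if it was not set previously,
 * in which case the packet might have been sent by an L3 device which does
 * not provide one (e.g. a tun device)
 */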
static int vrf_add_mac_header_if_unset(struct sk_buff *skb,
				       struct net_device *vrf_dev,
				       u16 proto, struct net_device *orig_dev)
{
	if (skb_mac_header_was_set(skb) && dev_has_header(orig_dev))
		return 0;

	return vrf_prepare_mac_header(skb, vrf_dev, proto);
}

#if IS_ENABLED(CONFIG_IPV6)
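/* neighbor handling is done with the actual device; do not want
 * to flip skb->dev for those ndisc packets. This really fails
 * for multiple next protocols (e.g., NEXTHDR_HOP), but it is
 * a start.
 */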
static bool ipv6_ndisc_frame(const struct sk_buff *skb)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	bool rc = false;

	if (iph->nexthdr == NEXTHDR_ICMP) {
		const struct icmp6hdr *icmph;
		struct icmp6hdr _icmph;

		icmph = skb_header_pointer(skb, sizeof(*iph),
					   sizeof(_icmph), &_icmph);
		if (!icmph)
			goto out;

		switch (icmph->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			rc = true;
			break;
		}
	}

out:
	return rc;
}

static struct rt6_info *vrf_ip6_route_lookup(struct net *net,
					     const struct net_device *dev,
					     struct flowi6 *fl6,
					     int ifindex,
					     const struct sk_buff *skb,
					     int flags)
{
	struct net_vrf *vrf = netdev_priv(dev);

	return ip6_pol_route(net, vrf->fib6_table, ifindex, fl6, skb, flags);
}

static void vrf_ip6_input_dst(struct sk_buff *skb, struct net_device *vrf_dev,
			      int ifindex)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct flowi6 fl6 = {
		.flowi6_iif	= ifindex,
		.flowi6_mark	= skb->mark,
		.flowi6_proto	= iph->nexthdr,
		.daddr		= iph->daddr,
		.saddr		= iph->saddr,
		.flowlabel	= ip6_flowinfo(iph),
	};
	struct net *net = dev_net(vrf_dev);
	struct rt6_info *rt6;

	rt6 = vrf_ip6_route_lookup(net, vrf_dev, &fl6, ifindex, skb,
				   RT6_LOOKUP_F_HAS_SADDR | RT6_LOOKUP_F_IFACE);
	if (unlikely(!rt6))
		return;

	if (unlikely(&rt6->dst == &net->ipv6.ip6_null_entry->dst))
		return;

	skb_dst_set(skb, &rt6->dst);
}

static struct sk_buff *vrf_ip6_rcv(struct net_device *vrf_dev,
				   struct sk_buff *skb)
{
	int orig_iif = skb->skb_iif;
	bool need_strict = rt6_need_strict(&ipv6_hdr(skb)->daddr);
	bool is_ndisc = ipv6_ndisc_frame(skb);

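	/* loopback and non-NDISC link-scoped traffic: do not push through
	 * packet taps again. Reset pkt_type for upper layers to process the
	 * skb; for strict packets with a link-local source, determine the
	 * dst using the original ingress ifindex.
	 */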
	if (skb->pkt_type == PACKET_LOOPBACK || (need_strict && !is_ndisc)) {
		skb->dev = vrf_dev;
		skb->skb_iif = vrf_dev->ifindex;
		IP6CB(skb)->flags |= IP6SKB_L3SLAVE;

		if (skb->pkt_type == PACKET_LOOPBACK)
			skb->pkt_type = PACKET_HOST;
		else if (ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)
			vrf_ip6_input_dst(skb, vrf_dev, orig_iif);

		goto out;
	}

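	/* if packet is NDISC then keep the ingress interface */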
	if (!is_ndisc) {
		struct net_device *orig_dev = skb->dev;

		vrf_rx_stats(vrf_dev, skb->len);
		skb->dev = vrf_dev;
		skb->skb_iif = vrf_dev->ifindex;

		if (!list_empty(&vrf_dev->ptype_all)) {
			int err;

			err = vrf_add_mac_header_if_unset(skb, vrf_dev,
							  ETH_P_IPV6,
							  orig_dev);
			if (likely(!err)) {
				skb_push(skb, skb->mac_len);
				dev_queue_xmit_nit(skb, vrf_dev);
				skb_pull(skb, skb->mac_len);
			}
		}

		IP6CB(skb)->flags |= IP6SKB_L3SLAVE;
	}

	if (need_strict)
		vrf_ip6_input_dst(skb, vrf_dev, orig_iif);

	skb = vrf_rcv_nfhook(NFPROTO_IPV6, NF_INET_PRE_ROUTING, skb, vrf_dev);
out:
	return skb;
}

#else
static struct sk_buff *vrf_ip6_rcv(struct net_device *vrf_dev,
				   struct sk_buff *skb)
{
	return skb;
}
#endif

static struct sk_buff *vrf_ip_rcv(struct net_device *vrf_dev,
				  struct sk_buff *skb)
{
	struct net_device *orig_dev = skb->dev;

	skb->dev = vrf_dev;
	skb->skb_iif = vrf_dev->ifindex;
	IPCB(skb)->flags |= IPSKB_L3SLAVE;

	if (ipv4_is_multicast(ip_hdr(skb)->daddr))
		goto out;

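	/* loopback traffic; do not push through packet taps again.
	 * Reset pkt_type for upper layers to process skb
	 */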
	if (skb->pkt_type == PACKET_LOOPBACK) {
		skb->pkt_type = PACKET_HOST;
		goto out;
	}

	vrf_rx_stats(vrf_dev, skb->len);

	if (!list_empty(&vrf_dev->ptype_all)) {
		int err;

		err = vrf_add_mac_header_if_unset(skb, vrf_dev, ETH_P_IP,
						  orig_dev);
		if (likely(!err)) {
			skb_push(skb, skb->mac_len);
			dev_queue_xmit_nit(skb, vrf_dev);
			skb_pull(skb, skb->mac_len);
		}
	}

	skb = vrf_rcv_nfhook(NFPROTO_IPV4, NF_INET_PRE_ROUTING, skb, vrf_dev);
out:
	return skb;
}

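/* called with rcu lock held */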
static struct sk_buff *vrf_l3_rcv(struct net_device *vrf_dev,
				  struct sk_buff *skb,
				  u16 proto)
{
	switch (proto) {
	case AF_INET:
		return vrf_ip_rcv(vrf_dev, skb);
	case AF_INET6:
		return vrf_ip6_rcv(vrf_dev, skb);
	}

	return skb;
}

#if IS_ENABLED(CONFIG_IPV6)
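/* send to link-local or multicast address via interface enslaved to
 * VRF device. Force lookup to VRF table without changing flow struct.
 * Note: Caller must hold rcu_read_lock(); no refcnt is taken on the
 * found dst (RT6_LOOKUP_F_DST_NOREF).
 */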
static struct dst_entry *vrf_link_scope_lookup(const struct net_device *dev,
					       struct flowi6 *fl6)
{
	struct net *net = dev_net(dev);
	int flags = RT6_LOOKUP_F_IFACE | RT6_LOOKUP_F_DST_NOREF;
	struct dst_entry *dst = NULL;
	struct rt6_info *rt;

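	/* VRF device does not have a link-local address and
	 * sending packets to link-local or mcast addresses over
	 * a VRF device does not make sense
	 */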
	if (fl6->flowi6_oif == dev->ifindex) {
		dst = &net->ipv6.ip6_null_entry->dst;
		return dst;
	}

	if (!ipv6_addr_any(&fl6->saddr))
		flags |= RT6_LOOKUP_F_HAS_SADDR;

	rt = vrf_ip6_route_lookup(net, dev, fl6, fl6->flowi6_oif, NULL, flags);
	if (rt)
		dst = &rt->dst;

	return dst;
}
#endif

static const struct l3mdev_ops vrf_l3mdev_ops = {
	.l3mdev_fib_table	= vrf_fib_table,
	.l3mdev_l3_rcv		= vrf_l3_rcv,
	.l3mdev_l3_out		= vrf_l3_out,
#if IS_ENABLED(CONFIG_IPV6)
	.l3mdev_link_scope_lookup = vrf_link_scope_lookup,
#endif
};

static void vrf_get_drvinfo(struct net_device *dev,
			    struct ethtool_drvinfo *info)
{
	strlcpy(info->driver, DRV_NAME, sizeof(info->driver));
	strlcpy(info->version, DRV_VERSION, sizeof(info->version));
}

static const struct ethtool_ops vrf_ethtool_ops = {
	.get_drvinfo	= vrf_get_drvinfo,
};

static inline size_t vrf_fib_rule_nl_size(void)
{
	size_t sz;

	sz  = NLMSG_ALIGN(sizeof(struct fib_rule_hdr));
	sz += nla_total_size(sizeof(u8));	/* FRA_L3MDEV */
	sz += nla_total_size(sizeof(u32));	/* FRA_PRIORITY */
	sz += nla_total_size(sizeof(u8));	/* FRA_PROTOCOL */

	return sz;
}

static int vrf_fib_rule(const struct net_device *dev, __u8 family, bool add_it)
{
	struct fib_rule_hdr *frh;
	struct nlmsghdr *nlh;
	struct sk_buff *skb;
	int err;

	if ((family == AF_INET6 || family == RTNL_FAMILY_IP6MR) &&
	    !ipv6_mod_enabled())
		return 0;

	skb = nlmsg_new(vrf_fib_rule_nl_size(), GFP_KERNEL);
	if (!skb)
		return -ENOMEM;

	nlh = nlmsg_put(skb, 0, 0, 0, sizeof(*frh), 0);
	if (!nlh)
		goto nla_put_failure;

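	/* rule only needs to appear once */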
	nlh->nlmsg_flags |= NLM_F_EXCL;

	frh = nlmsg_data(nlh);
	memset(frh, 0, sizeof(*frh));
	frh->family = family;
	frh->action = FR_ACT_TO_TBL;

	if (nla_put_u8(skb, FRA_PROTOCOL, RTPROT_KERNEL))
		goto nla_put_failure;

	if (nla_put_u8(skb, FRA_L3MDEV, 1))
		goto nla_put_failure;

	if (nla_put_u32(skb, FRA_PRIORITY, FIB_RULE_PREF))
		goto nla_put_failure;

	nlmsg_end(skb, nlh);

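	/* fib_nl_{new,del}rule handling looks for net from skb->sk */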
	skb->sk = dev_net(dev)->rtnl;
	if (add_it) {
		err = fib_nl_newrule(skb, nlh, NULL);
		if (err == -EEXIST)
			err = 0;
	} else {
		err = fib_nl_delrule(skb, nlh, NULL);
		if (err == -ENOENT)
			err = 0;
	}
	nlmsg_free(skb);

	return err;

nla_put_failure:
	nlmsg_free(skb);

	return -EMSGSIZE;
}

static int vrf_add_fib_rules(const struct net_device *dev)
{
	int err;

	err = vrf_fib_rule(dev, AF_INET, true);
	if (err < 0)
		goto out_err;

	err = vrf_fib_rule(dev, AF_INET6, true);
	if (err < 0)
		goto ipv6_err;

#if IS_ENABLED(CONFIG_IP_MROUTE_MULTIPLE_TABLES)
	err = vrf_fib_rule(dev, RTNL_FAMILY_IPMR, true);
	if (err < 0)
		goto ipmr_err;
#endif

#if IS_ENABLED(CONFIG_IPV6_MROUTE_MULTIPLE_TABLES)
	err = vrf_fib_rule(dev, RTNL_FAMILY_IP6MR, true);
	if (err < 0)
		goto ip6mr_err;
#endif

	return 0;

#if IS_ENABLED(CONFIG_IPV6_MROUTE_MULTIPLE_TABLES)
ip6mr_err:
	vrf_fib_rule(dev, RTNL_FAMILY_IPMR, false);
#endif

#if IS_ENABLED(CONFIG_IP_MROUTE_MULTIPLE_TABLES)
ipmr_err:
	vrf_fib_rule(dev, AF_INET6, false);
#endif

ipv6_err:
	vrf_fib_rule(dev, AF_INET, false);

out_err:
	netdev_err(dev, "Failed to add FIB rules.\n");
	return err;
}

static void vrf_setup(struct net_device *dev)
{
	ether_setup(dev);

	/* Initialize the device structure. */
	dev->netdev_ops = &vrf_netdev_ops;
	dev->l3mdev_ops = &vrf_l3mdev_ops;
	dev->ethtool_ops = &vrf_ethtool_ops;
	dev->needs_free_netdev = true;

	/* Fill in device structure with ethernet-generic values. */
	eth_hw_addr_random(dev);

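	/* don't acquire vrf device's netif_tx_lock when transmitting */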
	dev->features |= NETIF_F_LLTX;

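	/* don't allow vrf devices to change network namespaces. */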
	dev->features |= NETIF_F_NETNS_LOCAL;

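	/* does not make sense for a VLAN to be added to a vrf device */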
	dev->features |= NETIF_F_VLAN_CHALLENGED;

	/* enable offload features */
	dev->features |= NETIF_F_GSO_SOFTWARE;
	dev->features |= NETIF_F_RXCSUM | NETIF_F_HW_CSUM | NETIF_F_SCTP_CRC;
	dev->features |= NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA;

	dev->hw_features = dev->features;
	dev->hw_enc_features = dev->features;

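	/* default to no qdisc; user can add if desired */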
	dev->priv_flags |= IFF_NO_QUEUE;
	dev->priv_flags |= IFF_NO_RX_HANDLER;
	dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;

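	/* VRF devices do not care about MTU, but if the MTU is set
	 * too low then the ipv4 and ipv6 protocols are disabled
	 * which breaks networking.
	 */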
	dev->min_mtu = IPV6_MIN_MTU;
	dev->max_mtu = IP6_MAX_MTU;
	dev->mtu = dev->max_mtu;
}

static int vrf_validate(struct nlattr *tb[], struct nlattr *data[],
			struct netlink_ext_ack *extack)
{
	if (tb[IFLA_ADDRESS]) {
		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) {
			NL_SET_ERR_MSG(extack, "Invalid hardware address");
			return -EINVAL;
		}
		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) {
			NL_SET_ERR_MSG(extack, "Invalid hardware address");
			return -EADDRNOTAVAIL;
		}
	}
	return 0;
}

static void vrf_dellink(struct net_device *dev, struct list_head *head)
{
	struct net_device *port_dev;
	struct list_head *iter;

	netdev_for_each_lower_dev(dev, port_dev, iter)
		vrf_del_slave(dev, port_dev);

	vrf_map_unregister_dev(dev);

	unregister_netdevice_queue(dev, head);
}

static int vrf_newlink(struct net *src_net, struct net_device *dev,
		       struct nlattr *tb[], struct nlattr *data[],
		       struct netlink_ext_ack *extack)
{
	struct net_vrf *vrf = netdev_priv(dev);
	struct netns_vrf *nn_vrf;
	bool *add_fib_rules;
	struct net *net;
	int err;

	if (!data || !data[IFLA_VRF_TABLE]) {
		NL_SET_ERR_MSG(extack, "VRF table id is missing");
		return -EINVAL;
	}

	vrf->tb_id = nla_get_u32(data[IFLA_VRF_TABLE]);
	if (vrf->tb_id == RT_TABLE_UNSPEC) {
		NL_SET_ERR_MSG_ATTR(extack, data[IFLA_VRF_TABLE],
				    "Invalid VRF table id");
		return -EINVAL;
	}

	dev->priv_flags |= IFF_L3MDEV_MASTER;

	err = register_netdevice(dev);
	if (err)
		goto out;

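	/* mapping between table_id and vrf;
	 * note: such a binding could not be done in the ndo_init handler
	 * because the ifindex of the vrf device is not yet set at that point
	 */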
	vrf->ifindex = dev->ifindex;

	err = vrf_map_register_dev(dev, extack);
	if (err) {
		unregister_netdevice(dev);
		goto out;
	}

	net = dev_net(dev);
	nn_vrf = net_generic(net, vrf_net_id);

	add_fib_rules = &nn_vrf->add_fib_rules;
	if (*add_fib_rules) {
		err = vrf_add_fib_rules(dev);
		if (err) {
			vrf_map_unregister_dev(dev);
			unregister_netdevice(dev);
			goto out;
		}
		*add_fib_rules = false;
	}

out:
	return err;
}

static size_t vrf_nl_getsize(const struct net_device *dev)
{
	return nla_total_size(sizeof(u32));
}

static int vrf_fillinfo(struct sk_buff *skb,
			const struct net_device *dev)
{
	struct net_vrf *vrf = netdev_priv(dev);

	return nla_put_u32(skb, IFLA_VRF_TABLE, vrf->tb_id);
}

static size_t vrf_get_slave_size(const struct net_device *bond_dev,
				 const struct net_device *slave_dev)
{
	return nla_total_size(sizeof(u32));
}

static int vrf_fill_slave_info(struct sk_buff *skb,
			       const struct net_device *vrf_dev,
			       const struct net_device *slave_dev)
{
	struct net_vrf *vrf = netdev_priv(vrf_dev);

	if (nla_put_u32(skb, IFLA_VRF_PORT_TABLE, vrf->tb_id))
		return -EMSGSIZE;

	return 0;
}

static const struct nla_policy vrf_nl_policy[IFLA_VRF_MAX + 1] = {
	[IFLA_VRF_TABLE] = { .type = NLA_U32 },
};

static struct rtnl_link_ops vrf_link_ops __read_mostly = {
	.kind		= DRV_NAME,
	.priv_size	= sizeof(struct net_vrf),

	.get_size	= vrf_nl_getsize,
	.policy		= vrf_nl_policy,
	.validate	= vrf_validate,
	.fill_info	= vrf_fillinfo,

	.get_slave_size	= vrf_get_slave_size,
	.fill_slave_info = vrf_fill_slave_info,

	.newlink	= vrf_newlink,
	.dellink	= vrf_dellink,
	.setup		= vrf_setup,
	.maxtype	= IFLA_VRF_MAX,
};

static int vrf_device_event(struct notifier_block *unused,
			    unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);

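	/* only care about unregister events to drop slave references */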
	if (event == NETDEV_UNREGISTER) {
		struct net_device *vrf_dev;

		if (!netif_is_l3_slave(dev))
			goto out;

		vrf_dev = netdev_master_upper_dev_get(dev);
		vrf_del_slave(vrf_dev, dev);
	}
out:
	return NOTIFY_DONE;
}

static struct notifier_block vrf_notifier_block __read_mostly = {
	.notifier_call = vrf_device_event,
};

static int vrf_map_init(struct vrf_map *vmap)
{
	spin_lock_init(&vmap->vmap_lock);
	hash_init(vmap->ht);

	vmap->strict_mode = false;

	return 0;
}

#ifdef CONFIG_SYSCTL
static bool vrf_strict_mode(struct vrf_map *vmap)
{
	bool strict_mode;

	vrf_map_lock(vmap);
	strict_mode = vmap->strict_mode;
	vrf_map_unlock(vmap);

	return strict_mode;
}

static int vrf_strict_mode_change(struct vrf_map *vmap, bool new_mode)
{
	bool *cur_mode;
	int res = 0;

	vrf_map_lock(vmap);

	cur_mode = &vmap->strict_mode;
	if (*cur_mode == new_mode)
		goto unlock;

	if (*cur_mode) {
		/* disable strict mode */
		*cur_mode = false;
	} else {
		if (vmap->shared_tables) {
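			/* we cannot allow strict_mode because there are some
			 * vrfs that share one or more tables
			 */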
			res = -EBUSY;
			goto unlock;
		}

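		/* no tables are shared among vrfs, so we can go back
		 * to a 1:1 association between a vrf and its table
		 */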
		*cur_mode = true;
	}

unlock:
	vrf_map_unlock(vmap);

	return res;
}

static int vrf_shared_table_handler(struct ctl_table *table, int write,
				    void *buffer, size_t *lenp, loff_t *ppos)
{
	struct net *net = (struct net *)table->extra1;
	struct vrf_map *vmap = netns_vrf_map(net);
	int proc_strict_mode = 0;
	struct ctl_table tmp = {
		.procname	= table->procname,
		.data		= &proc_strict_mode,
		.maxlen		= sizeof(int),
		.mode		= table->mode,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE,
	};
	int ret;

	if (!write)
		proc_strict_mode = vrf_strict_mode(vmap);

	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);

	if (write && ret == 0)
		ret = vrf_strict_mode_change(vmap, (bool)proc_strict_mode);

	return ret;
}

static const struct ctl_table vrf_table[] = {
	{
		.procname	= "strict_mode",
		.data		= NULL,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= vrf_shared_table_handler,
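		/* set by vrf_netns_init_sysctl */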
		.extra1		= NULL,
	},
	{ },
};

static int vrf_netns_init_sysctl(struct net *net, struct netns_vrf *nn_vrf)
{
	struct ctl_table *table;

	table = kmemdup(vrf_table, sizeof(vrf_table), GFP_KERNEL);
	if (!table)
		return -ENOMEM;

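	/* init the extra1 parameter with the reference to current netns */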
	table[0].extra1 = net;

	nn_vrf->ctl_hdr = register_net_sysctl(net, "net/vrf", table);
	if (!nn_vrf->ctl_hdr) {
		kfree(table);
		return -ENOMEM;
	}

	return 0;
}

static void vrf_netns_exit_sysctl(struct net *net)
{
	struct netns_vrf *nn_vrf = net_generic(net, vrf_net_id);
	struct ctl_table *table;

	table = nn_vrf->ctl_hdr->ctl_table_arg;
	unregister_net_sysctl_table(nn_vrf->ctl_hdr);
	kfree(table);
}
#else
static int vrf_netns_init_sysctl(struct net *net, struct netns_vrf *nn_vrf)
{
	return 0;
}

static void vrf_netns_exit_sysctl(struct net *net)
{
}
#endif

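/* Initialize per network namespace state */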
static int __net_init vrf_netns_init(struct net *net)
{
	struct netns_vrf *nn_vrf = net_generic(net, vrf_net_id);

	nn_vrf->add_fib_rules = true;
	vrf_map_init(&nn_vrf->vmap);

	return vrf_netns_init_sysctl(net, nn_vrf);
}

static void __net_exit vrf_netns_exit(struct net *net)
{
	vrf_netns_exit_sysctl(net);
}

static struct pernet_operations vrf_net_ops __net_initdata = {
	.init = vrf_netns_init,
	.exit = vrf_netns_exit,
	.id   = &vrf_net_id,
	.size = sizeof(struct netns_vrf),
};

static int __init vrf_init_module(void)
{
	int rc;

	register_netdevice_notifier(&vrf_notifier_block);

	rc = register_pernet_subsys(&vrf_net_ops);
	if (rc < 0)
		goto error;

	rc = l3mdev_table_lookup_register(L3MDEV_TYPE_VRF,
					  vrf_ifindex_lookup_by_table_id);
	if (rc < 0)
		goto unreg_pernet;

	rc = rtnl_link_register(&vrf_link_ops);
	if (rc < 0)
		goto table_lookup_unreg;

	return 0;

table_lookup_unreg:
	l3mdev_table_lookup_unregister(L3MDEV_TYPE_VRF,
				       vrf_ifindex_lookup_by_table_id);

unreg_pernet:
	unregister_pernet_subsys(&vrf_net_ops);

error:
	unregister_netdevice_notifier(&vrf_notifier_block);
	return rc;
}

module_init(vrf_init_module);
MODULE_AUTHOR("Shrijeet Mukherjee, David Ahern");
MODULE_DESCRIPTION("Device driver to instantiate VRF domains");
MODULE_LICENSE("GPL");
MODULE_ALIAS_RTNL_LINK(DRV_NAME);
MODULE_VERSION(DRV_VERSION);