// SPDX-License-Identifier: GPL-2.0-only OR Linux-OpenIB
/*
 * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
 * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
 */
#include <linux/skbuff.h>
#include <linux/if_arp.h>
#include <linux/netdevice.h>
#include <linux/if.h>
#include <linux/if_vlan.h>
#include <net/udp_tunnel.h>
#include <net/sch_generic.h>
#include <linux/netfilter.h>
#include <rdma/ib_addr.h>

#include "rxe.h"
#include "rxe_net.h"
#include "rxe_loc.h"

static struct rxe_recv_sockets recv_sockets;

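/* Resolve an IPv4 route (always in init_net) for the given
 * source/destination pair. Returns the route's dst_entry or NULL.
 */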
static struct dst_entry *rxe_find_route4(struct net_device *ndev,
					 struct in_addr *saddr,
					 struct in_addr *daddr)
{
	struct rtable *rt;
	struct flowi4 fl = { { 0 } };

	fl.flowi4_oif = ndev->ifindex;
	memcpy(&fl.saddr, saddr, sizeof(*saddr));
	memcpy(&fl.daddr, daddr, sizeof(*daddr));
	fl.flowi4_proto = IPPROTO_UDP;

	rt = ip_route_output_key(&init_net, &fl);
	if (IS_ERR(rt)) {
		pr_err_ratelimited("no route to %pI4\n", &daddr->s_addr);
		return NULL;
	}

	return &rt->dst;
}

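/* Resolve an IPv6 route through ipv6_stub, using the receive socket
 * to pick the namespace. Compiled down to a NULL return when IPv6 is
 * disabled.
 */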
#if IS_ENABLED(CONFIG_IPV6)
static struct dst_entry *rxe_find_route6(struct net_device *ndev,
					 struct in6_addr *saddr,
					 struct in6_addr *daddr)
{
	struct dst_entry *ndst;
	struct flowi6 fl6 = { { 0 } };

	fl6.flowi6_oif = ndev->ifindex;
	memcpy(&fl6.saddr, saddr, sizeof(*saddr));
	memcpy(&fl6.daddr, daddr, sizeof(*daddr));
	fl6.flowi6_proto = IPPROTO_UDP;

	ndst = ipv6_stub->ipv6_dst_lookup_flow(sock_net(recv_sockets.sk6->sk),
					       recv_sockets.sk6->sk, &fl6,
					       NULL);
	if (IS_ERR(ndst)) {
		pr_err_ratelimited("no route to %pI6\n", daddr);
		return NULL;
	}

	if (unlikely(ndst->error)) {
		pr_err("no route to %pI6\n", daddr);
		goto put;
	}

	return ndst;
put:
	dst_release(ndst);
	return NULL;
}

#else

static struct dst_entry *rxe_find_route6(struct net_device *ndev,
					 struct in6_addr *saddr,
					 struct in6_addr *daddr)
{
	return NULL;
}

#endif

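/* Return a route for the av's destination. For RC QPs the route is
 * cached on the QP's socket and revalidated against the saved dst
 * cookie before it is reused.
 */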
static struct dst_entry *rxe_find_route(struct net_device *ndev,
					struct rxe_qp *qp,
					struct rxe_av *av)
{
	struct dst_entry *dst = NULL;

	if (qp_type(qp) == IB_QPT_RC)
		dst = sk_dst_get(qp->sk->sk);

	if (!dst || !dst_check(dst, qp->dst_cookie)) {
		if (dst)
			dst_release(dst);

		if (av->network_type == RXE_NETWORK_TYPE_IPV4) {
			struct in_addr *saddr;
			struct in_addr *daddr;

			saddr = &av->sgid_addr._sockaddr_in.sin_addr;
			daddr = &av->dgid_addr._sockaddr_in.sin_addr;
			dst = rxe_find_route4(ndev, saddr, daddr);
		} else if (av->network_type == RXE_NETWORK_TYPE_IPV6) {
			struct in6_addr *saddr6;
			struct in6_addr *daddr6;

			saddr6 = &av->sgid_addr._sockaddr_in6.sin6_addr;
			daddr6 = &av->dgid_addr._sockaddr_in6.sin6_addr;
			dst = rxe_find_route6(ndev, saddr6, daddr6);
#if IS_ENABLED(CONFIG_IPV6)
			if (dst)
				qp->dst_cookie =
					rt6_get_cookie((struct rt6_info *)dst);
#endif
		}

		if (dst && (qp_type(qp) == IB_QPT_RC)) {
			dst_hold(dst);
			sk_dst_set(qp->sk->sk, dst);
		}
	}
	return dst;
}

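/* Encap receive handler for the RoCEv2 UDP tunnel sockets: fill in
 * the per-skb rxe_pkt_info and hand the packet to rxe_rcv().
 */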
static int rxe_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
{
	struct udphdr *udph;
	struct rxe_dev *rxe;
	struct net_device *ndev = skb->dev;
	struct rxe_pkt_info *pkt = SKB_TO_PKT(skb);

	/* takes a reference on rxe->ib_dev;
	 * dropped when the skb is freed
	 */
	rxe = rxe_get_dev_from_net(ndev);
	if (!rxe && is_vlan_dev(ndev))
		rxe = rxe_get_dev_from_net(vlan_dev_real_dev(ndev));
	if (!rxe)
		goto drop;

	if (skb_linearize(skb)) {
		pr_err("skb_linearize failed\n");
		ib_device_put(&rxe->ib_dev);
		goto drop;
	}

	udph = udp_hdr(skb);
	pkt->rxe = rxe;
	pkt->port_num = 1;
	pkt->hdr = (u8 *)(udph + 1);
	pkt->mask = RXE_GRH_MASK;
	pkt->paylen = be16_to_cpu(udph->len) - sizeof(*udph);

	rxe_rcv(skb);

	return 0;
drop:
	kfree_skb(skb);

	return 0;
}

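/* Create a kernel UDP socket on the given port and register
 * rxe_udp_encap_recv() as its encapsulation receive handler.
 */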
static struct socket *rxe_setup_udp_tunnel(struct net *net, __be16 port,
					   bool ipv6)
{
	int err;
	struct socket *sock;
	struct udp_port_cfg udp_cfg = { };
	struct udp_tunnel_sock_cfg tnl_cfg = { };

	if (ipv6) {
		udp_cfg.family = AF_INET6;
		udp_cfg.ipv6_v6only = 1;
	} else {
		udp_cfg.family = AF_INET;
	}

	udp_cfg.local_udp_port = port;

	/* Create UDP socket */
	err = udp_sock_create(net, &udp_cfg, &sock);
	if (err < 0)
		return ERR_PTR(err);

	tnl_cfg.encap_type = 1;
	tnl_cfg.encap_rcv = rxe_udp_encap_recv;

	/* Setup UDP tunnel */
	setup_udp_tunnel_sock(net, sock, &tnl_cfg);

	return sock;
}

static void rxe_release_udp_tunnel(struct socket *sk)
{
	if (sk)
		udp_tunnel_sock_release(sk);
}

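/* Push a UDP header for the RoCEv2 tunnel; the checksum is left at
 * zero, the payload being covered by the ICRC instead.
 */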
static void prepare_udp_hdr(struct sk_buff *skb, __be16 src_port,
			    __be16 dst_port)
{
	struct udphdr *udph;

	__skb_push(skb, sizeof(*udph));
	skb_reset_transport_header(skb);
	udph = udp_hdr(skb);

	udph->dest = dst_port;
	udph->source = src_port;
	udph->len = htons(skb->len);
	udph->check = 0;
}

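/* Build the outer IPv4 header and attach the routing dst to the skb */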
static void prepare_ipv4_hdr(struct dst_entry *dst, struct sk_buff *skb,
			     __be32 saddr, __be32 daddr, __u8 proto,
			     __u8 tos, __u8 ttl, __be16 df, bool xnet)
{
	struct iphdr *iph;

	skb_scrub_packet(skb, xnet);

	skb_clear_hash(skb);
	skb_dst_set(skb, dst_clone(dst));
	memset(IPCB(skb), 0, sizeof(*IPCB(skb)));

	skb_push(skb, sizeof(struct iphdr));
	skb_reset_network_header(skb);

	iph = ip_hdr(skb);

	iph->version = IPVERSION;
	iph->ihl = sizeof(struct iphdr) >> 2;
	iph->tot_len = htons(skb->len);
	iph->frag_off = df;
	iph->protocol = proto;
	iph->tos = tos;
	iph->daddr = daddr;
	iph->saddr = saddr;
	iph->ttl = ttl;
	__ip_select_ident(dev_net(dst->dev), iph,
			  skb_shinfo(skb)->gso_segs ?: 1);
}

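/* Build the outer IPv6 header and attach the routing dst to the skb */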
static void prepare_ipv6_hdr(struct dst_entry *dst, struct sk_buff *skb,
			     struct in6_addr *saddr, struct in6_addr *daddr,
			     __u8 proto, __u8 prio, __u8 ttl)
{
	struct ipv6hdr *ip6h;

	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED
			    | IPSKB_REROUTED);
	skb_dst_set(skb, dst_clone(dst));

	__skb_push(skb, sizeof(*ip6h));
	skb_reset_network_header(skb);
	ip6h = ipv6_hdr(skb);
	ip6_flow_hdr(ip6h, prio, htonl(0));
	ip6h->nexthdr = proto;
	ip6h->hop_limit = ttl;
	ip6h->daddr = *daddr;
	ip6h->saddr = *saddr;
	ip6h->payload_len = htons(skb->len - sizeof(*ip6h));
}

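/* Resolve a route and prepend the UDP and IPv4 headers for an
 * outgoing packet.
 */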
static int prepare4(struct rxe_av *av, struct rxe_pkt_info *pkt,
		    struct sk_buff *skb)
{
	struct rxe_qp *qp = pkt->qp;
	struct dst_entry *dst;
	bool xnet = false;
	__be16 df = htons(IP_DF);
	struct in_addr *saddr = &av->sgid_addr._sockaddr_in.sin_addr;
	struct in_addr *daddr = &av->dgid_addr._sockaddr_in.sin_addr;

	dst = rxe_find_route(skb->dev, qp, av);
	if (!dst) {
		pr_err("Host not reachable\n");
		return -EHOSTUNREACH;
	}

	prepare_udp_hdr(skb, cpu_to_be16(qp->src_port),
			cpu_to_be16(ROCE_V2_UDP_DPORT));

	prepare_ipv4_hdr(dst, skb, saddr->s_addr, daddr->s_addr, IPPROTO_UDP,
			 av->grh.traffic_class, av->grh.hop_limit, df, xnet);

	dst_release(dst);
	return 0;
}

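/* Resolve a route and prepend the UDP and IPv6 headers for an
 * outgoing packet.
 */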
static int prepare6(struct rxe_av *av, struct rxe_pkt_info *pkt,
		    struct sk_buff *skb)
{
	struct rxe_qp *qp = pkt->qp;
	struct dst_entry *dst;
	struct in6_addr *saddr = &av->sgid_addr._sockaddr_in6.sin6_addr;
	struct in6_addr *daddr = &av->dgid_addr._sockaddr_in6.sin6_addr;

	dst = rxe_find_route(skb->dev, qp, av);
	if (!dst) {
		pr_err("Host not reachable\n");
		return -EHOSTUNREACH;
	}

	prepare_udp_hdr(skb, cpu_to_be16(qp->src_port),
			cpu_to_be16(ROCE_V2_UDP_DPORT));

	prepare_ipv6_hdr(dst, skb, saddr, daddr, IPPROTO_UDP,
			 av->grh.traffic_class,
			 av->grh.hop_limit);

	dst_release(dst);
	return 0;
}

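/* Prepare the L3/L4 headers for a packet and flag it for local
 * loopback when the destination MAC is the sending device's own.
 */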
int rxe_prepare(struct rxe_av *av, struct rxe_pkt_info *pkt,
		struct sk_buff *skb)
{
	int err = 0;

	if (skb->protocol == htons(ETH_P_IP))
		err = prepare4(av, pkt, skb);
	else if (skb->protocol == htons(ETH_P_IPV6))
		err = prepare6(av, pkt, skb);

	if (ether_addr_equal(skb->dev->dev_addr, av->dmac))
		pkt->mask |= RXE_LOOPBACK_MASK;

	return err;
}

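/* skb destructor for transmitted packets: account for the completed
 * send and restart the requester task if it was throttled on the
 * number of in-flight skbs.
 */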
static void rxe_skb_tx_dtor(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	struct rxe_qp *qp = sk->sk_user_data;
	int skb_out = atomic_dec_return(&qp->skb_out);

	if (unlikely(qp->need_req_skb &&
		     skb_out < RXE_INFLIGHT_SKBS_PER_QP_LOW))
		rxe_run_task(&qp->req.task, 1);

	rxe_put(qp);
}

static int rxe_send(struct sk_buff *skb, struct rxe_pkt_info *pkt)
{
	int err;

	skb->destructor = rxe_skb_tx_dtor;
	skb->sk = pkt->qp->sk->sk;

	rxe_get(pkt->qp);
	atomic_inc(&pkt->qp->skb_out);

	if (skb->protocol == htons(ETH_P_IP)) {
		err = ip_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb);
	} else if (skb->protocol == htons(ETH_P_IPV6)) {
		err = ip6_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb);
	} else {
		pr_err("Unknown layer 3 protocol: %04x\n",
		       ntohs(skb->protocol));
		atomic_dec(&pkt->qp->skb_out);
		rxe_put(pkt->qp);
		kfree_skb(skb);
		return -EINVAL;
	}

	if (unlikely(net_xmit_eval(err))) {
		pr_debug("error sending packet: %d\n", err);
		return -EAGAIN;
	}

	return 0;
}

/* fix up a send packet to match the packets
 * received from UDP before looping them back
 */
static int rxe_loopback(struct sk_buff *skb, struct rxe_pkt_info *pkt)
{
	memcpy(SKB_TO_PKT(skb), pkt, sizeof(*pkt));

	if (skb->protocol == htons(ETH_P_IP))
		skb_pull(skb, sizeof(struct iphdr));
	else
		skb_pull(skb, sizeof(struct ipv6hdr));

	if (WARN_ON(!ib_device_try_get(&pkt->rxe->ib_dev))) {
		kfree_skb(skb);
		return -EIO;
	}

	rxe_rcv(skb);

	return 0;
}

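/* Generate the ICRC and transmit the packet, either via the loopback
 * path or out through the IP stack. For non-RC QPs the WQE is
 * completed here once its last packet has been sent.
 */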
int rxe_xmit_packet(struct rxe_qp *qp, struct rxe_pkt_info *pkt,
		    struct sk_buff *skb)
{
	int err;
	int is_request = pkt->mask & RXE_REQ_MASK;
	struct rxe_dev *rxe = to_rdev(qp->ibqp.device);

	if ((is_request && (qp->req.state != QP_STATE_READY)) ||
	    (!is_request && (qp->resp.state != QP_STATE_READY))) {
		pr_info("Packet dropped. QP is not in ready state\n");
		goto drop;
	}

	rxe_icrc_generate(skb, pkt);

	if (pkt->mask & RXE_LOOPBACK_MASK)
		err = rxe_loopback(skb, pkt);
	else
		err = rxe_send(skb, pkt);
	if (err) {
		rxe_counter_inc(rxe, RXE_CNT_SEND_ERR);
		return err;
	}

	if ((qp_type(qp) != IB_QPT_RC) &&
	    (pkt->mask & RXE_END_MASK)) {
		pkt->wqe->state = wqe_state_done;
		rxe_run_task(&qp->comp.task, 1);
	}

	rxe_counter_inc(rxe, RXE_CNT_SENT_PKTS);
	goto done;

drop:
	kfree_skb(skb);
	err = 0;
done:
	return err;
}

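/* Allocate and initialize an skb sized for the payload plus the
 * Ethernet, IP and UDP headers, bound to the netdev that owns the
 * source GID.
 */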
struct sk_buff *rxe_init_packet(struct rxe_dev *rxe, struct rxe_av *av,
				int paylen, struct rxe_pkt_info *pkt)
{
	unsigned int hdr_len;
	struct sk_buff *skb = NULL;
	struct net_device *ndev;
	const struct ib_gid_attr *attr;
	const int port_num = 1;

	attr = rdma_get_gid_attr(&rxe->ib_dev, port_num, av->grh.sgid_index);
	if (IS_ERR(attr))
		return NULL;

	if (av->network_type == RXE_NETWORK_TYPE_IPV4)
		hdr_len = ETH_HLEN + sizeof(struct udphdr) +
			sizeof(struct iphdr);
	else
		hdr_len = ETH_HLEN + sizeof(struct udphdr) +
			sizeof(struct ipv6hdr);

	rcu_read_lock();
	ndev = rdma_read_gid_attr_ndev_rcu(attr);
	if (IS_ERR(ndev)) {
		rcu_read_unlock();
		goto out;
	}
	skb = alloc_skb(paylen + hdr_len + LL_RESERVED_SPACE(ndev),
			GFP_ATOMIC);

	if (unlikely(!skb)) {
		rcu_read_unlock();
		goto out;
	}

	skb_reserve(skb, hdr_len + LL_RESERVED_SPACE(ndev));

	/* FIXME: hold reference to this netdev until life of this skb. */
	skb->dev = ndev;
	rcu_read_unlock();

	if (av->network_type == RXE_NETWORK_TYPE_IPV4)
		skb->protocol = htons(ETH_P_IP);
	else
		skb->protocol = htons(ETH_P_IPV6);

	pkt->rxe = rxe;
	pkt->port_num = port_num;
	pkt->hdr = skb_put(skb, paylen);
	pkt->mask |= RXE_GRH_MASK;

out:
	rdma_put_gid_attr(attr);
	return skb;
}

/*
 * this is required by rxe_cfg to match rxe devices in
 * /sys/class/infiniband up with their underlying ethernet devices
 */
const char *rxe_parent_name(struct rxe_dev *rxe, unsigned int port_num)
{
	return rxe->ndev->name;
}

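/* Allocate an ib_device for ndev and register it as an rxe device */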
int rxe_net_add(const char *ibdev_name, struct net_device *ndev)
{
	int err;
	struct rxe_dev *rxe = NULL;

	rxe = ib_alloc_device(rxe_dev, ib_dev);
	if (!rxe)
		return -ENOMEM;

	rxe->ndev = ndev;

	err = rxe_add(rxe, ndev->mtu, ibdev_name);
	if (err) {
		ib_dealloc_device(&rxe->ib_dev);
		return err;
	}

	return 0;
}

static void rxe_port_event(struct rxe_dev *rxe,
			   enum ib_event_type event)
{
	struct ib_event ev;

	ev.device = &rxe->ib_dev;
	ev.element.port_num = 1;
	ev.event = event;

	ib_dispatch_event(&ev);
}

/* Caller must hold net_info_lock */
void rxe_port_up(struct rxe_dev *rxe)
{
	struct rxe_port *port;

	port = &rxe->port;
	port->attr.state = IB_PORT_ACTIVE;

	rxe_port_event(rxe, IB_EVENT_PORT_ACTIVE);
	dev_info(&rxe->ib_dev.dev, "set active\n");
}

/* Caller must hold net_info_lock */
void rxe_port_down(struct rxe_dev *rxe)
{
	struct rxe_port *port;

	port = &rxe->port;
	port->attr.state = IB_PORT_DOWN;

	rxe_port_event(rxe, IB_EVENT_PORT_ERR);
	rxe_counter_inc(rxe, RXE_CNT_LINK_DOWNED);
	dev_info(&rxe->ib_dev.dev, "set down\n");
}

void rxe_set_port_state(struct rxe_dev *rxe)
{
	if (netif_running(rxe->ndev) && netif_carrier_ok(rxe->ndev))
		rxe_port_up(rxe);
	else
		rxe_port_down(rxe);
}

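/* netdev notifier: mirror state changes of the underlying Ethernet
 * device into the associated rxe device.
 */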
static int rxe_notify(struct notifier_block *not_blk,
		      unsigned long event,
		      void *arg)
{
	struct net_device *ndev = netdev_notifier_info_to_dev(arg);
	struct rxe_dev *rxe = rxe_get_dev_from_net(ndev);

	if (!rxe)
		return NOTIFY_OK;

	switch (event) {
	case NETDEV_UNREGISTER:
		ib_unregister_device_queued(&rxe->ib_dev);
		break;
	case NETDEV_UP:
		rxe_port_up(rxe);
		break;
	case NETDEV_DOWN:
		rxe_port_down(rxe);
		break;
	case NETDEV_CHANGEMTU:
		pr_info("%s changed mtu to %d\n", ndev->name, ndev->mtu);
		rxe_set_mtu(rxe, ndev->mtu);
		break;
	case NETDEV_CHANGE:
		rxe_set_port_state(rxe);
		break;
	case NETDEV_REBOOT:
	case NETDEV_GOING_DOWN:
	case NETDEV_CHANGEADDR:
	case NETDEV_CHANGENAME:
	case NETDEV_FEAT_CHANGE:
	default:
		pr_info("ignoring netdev event = %ld for %s\n",
			event, ndev->name);
		break;
	}

	ib_device_put(&rxe->ib_dev);
	return NOTIFY_OK;
}

static struct notifier_block rxe_net_notifier = {
	.notifier_call = rxe_notify,
};

static int rxe_net_ipv4_init(void)
{
	recv_sockets.sk4 = rxe_setup_udp_tunnel(&init_net,
						htons(ROCE_V2_UDP_DPORT), false);
	if (IS_ERR(recv_sockets.sk4)) {
		recv_sockets.sk4 = NULL;
		pr_err("Failed to create IPv4 UDP tunnel\n");
		return -1;
	}

	return 0;
}

static int rxe_net_ipv6_init(void)
{
#if IS_ENABLED(CONFIG_IPV6)

	recv_sockets.sk6 = rxe_setup_udp_tunnel(&init_net,
						htons(ROCE_V2_UDP_DPORT), true);
	if (PTR_ERR(recv_sockets.sk6) == -EAFNOSUPPORT) {
		recv_sockets.sk6 = NULL;
		pr_warn("IPv6 is not supported, can not create a UDPv6 socket\n");
		return 0;
	}

	if (IS_ERR(recv_sockets.sk6)) {
		recv_sockets.sk6 = NULL;
		pr_err("Failed to create IPv6 UDP tunnel\n");
		return -1;
	}
#endif
	return 0;
}

void rxe_net_exit(void)
{
	rxe_release_udp_tunnel(recv_sockets.sk6);
	rxe_release_udp_tunnel(recv_sockets.sk4);
	unregister_netdevice_notifier(&rxe_net_notifier);
}

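/* Set up the RoCEv2 tunnel sockets and the netdev notifier; later
 * failures are unwound through rxe_net_exit().
 */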
int rxe_net_init(void)
{
	int err;

	recv_sockets.sk6 = NULL;

	err = rxe_net_ipv4_init();
	if (err)
		return err;
	err = rxe_net_ipv6_init();
	if (err)
		goto err_out;
	err = register_netdevice_notifier(&rxe_net_notifier);
	if (err) {
		pr_err("Failed to register netdev notifier\n");
		goto err_out;
	}
	return 0;
err_out:
	rxe_net_exit();
	return err;
}