// SPDX-License-Identifier: GPL-2.0
/*
 *  Shared Memory Communications over RDMA (SMC-R) and RoCE
 *
 *  IB infrastructure:
 *  Establish SMC-R as an InfiniBand client to be notified about added and
 *  removed IB devices of type RDMA.
 *  Determine device and port characteristics for these IB devices.
 *
 *  Copyright IBM Corp. 2016
 *
 *  Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
 */

#include <linux/etherdevice.h>
#include <linux/if_vlan.h>
#include <linux/random.h>
#include <linux/workqueue.h>
#include <linux/scatterlist.h>
#include <linux/wait.h>
#include <linux/mutex.h>
#include <linux/inetdevice.h>
#include <rdma/ib_verbs.h>
#include <rdma/ib_cache.h>

#include "smc_pnet.h"
#include "smc_ib.h"
#include "smc_core.h"
#include "smc_wr.h"
#include "smc.h"
#include "smc_netlink.h"

#define SMC_MAX_CQE 32766	/* max. # of completion queue elements */

#define SMC_QP_MIN_RNR_TIMER		5
#define SMC_QP_TIMEOUT			15 /* 4096 * 2 ** timeout usec */
#define SMC_QP_RETRY_CNT		7 /* 7: infinite */
#define SMC_QP_RNR_RETRY		7 /* 7: infinite */
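
/* A worked example of the encodings above: the InfiniBand local ack
 * timeout is 4.096 usec * 2^SMC_QP_TIMEOUT, so a value of 15 means
 *
 *	4.096 us * 2^15 ~= 134 ms
 *
 * before a send is retried; the retry counters use the value 7, which
 * the definitions above treat as "retry indefinitely".
 */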

struct smc_ib_devices smc_ib_devices = {	/* smc-registered ib devices */
	.mutex = __MUTEX_INITIALIZER(smc_ib_devices.mutex),
	.list = LIST_HEAD_INIT(smc_ib_devices.list),
};

u8 local_systemid[SMC_SYSTEMID_LEN];		/* unique system identifier */

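/* The smc_ib_modify_qp_*() helpers below walk a reliable-connected (RC)
 * queue pair through the standard IB verbs state machine:
 *
 *	RESET -> INIT -> RTR (ready to receive) -> RTS (ready to send)
 *
 * Each step sets only the attributes valid for that transition and passes
 * a matching attribute mask to ib_modify_qp().
 */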
static int smc_ib_modify_qp_init(struct smc_link *lnk)
{
	struct ib_qp_attr qp_attr;

	memset(&qp_attr, 0, sizeof(qp_attr));
	qp_attr.qp_state = IB_QPS_INIT;
	qp_attr.pkey_index = 0;
	qp_attr.port_num = lnk->ibport;
	qp_attr.qp_access_flags = IB_ACCESS_LOCAL_WRITE
				| IB_ACCESS_REMOTE_WRITE;
	return ib_modify_qp(lnk->roce_qp, &qp_attr,
			    IB_QP_STATE | IB_QP_PKEY_INDEX |
			    IB_QP_ACCESS_FLAGS | IB_QP_PORT);
}

static int smc_ib_modify_qp_rtr(struct smc_link *lnk)
{
	enum ib_qp_attr_mask qp_attr_mask =
		IB_QP_STATE | IB_QP_AV | IB_QP_PATH_MTU | IB_QP_DEST_QPN |
		IB_QP_RQ_PSN | IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_MIN_RNR_TIMER;
	struct ib_qp_attr qp_attr;
	u8 hop_lim = 1;

	memset(&qp_attr, 0, sizeof(qp_attr));
	qp_attr.qp_state = IB_QPS_RTR;
	qp_attr.path_mtu = min(lnk->path_mtu, lnk->peer_mtu);
	qp_attr.ah_attr.type = RDMA_AH_ATTR_TYPE_ROCE;
	rdma_ah_set_port_num(&qp_attr.ah_attr, lnk->ibport);
	if (lnk->lgr->smc_version == SMC_V2 && lnk->lgr->uses_gateway)
		hop_lim = IPV6_DEFAULT_HOPLIMIT;
	rdma_ah_set_grh(&qp_attr.ah_attr, NULL, 0, lnk->sgid_index, hop_lim, 0);
	rdma_ah_set_dgid_raw(&qp_attr.ah_attr, lnk->peer_gid);
	if (lnk->lgr->smc_version == SMC_V2 && lnk->lgr->uses_gateway)
		memcpy(&qp_attr.ah_attr.roce.dmac, lnk->lgr->nexthop_mac,
		       sizeof(lnk->lgr->nexthop_mac));
	else
		memcpy(&qp_attr.ah_attr.roce.dmac, lnk->peer_mac,
		       sizeof(lnk->peer_mac));
	qp_attr.dest_qp_num = lnk->peer_qpn;
	qp_attr.rq_psn = lnk->peer_psn; /* starting receive packet seq # */
	qp_attr.max_dest_rd_atomic = 1; /* max # of resources for incoming
					 * requests
					 */
	qp_attr.min_rnr_timer = SMC_QP_MIN_RNR_TIMER;

	return ib_modify_qp(lnk->roce_qp, &qp_attr, qp_attr_mask);
}

int smc_ib_modify_qp_rts(struct smc_link *lnk)
{
	struct ib_qp_attr qp_attr;

	memset(&qp_attr, 0, sizeof(qp_attr));
	qp_attr.qp_state = IB_QPS_RTS;
	qp_attr.timeout = SMC_QP_TIMEOUT;	/* local ack timeout */
	qp_attr.retry_cnt = SMC_QP_RETRY_CNT;	/* retry count */
	qp_attr.rnr_retry = SMC_QP_RNR_RETRY;	/* RNR retries, 7=infinite */
	qp_attr.sq_psn = lnk->psn_initial;	/* starting send packet seq # */
	qp_attr.max_rd_atomic = 1;	/* # of outstanding RDMA reads and
					 * atomic ops allowed
					 */
	return ib_modify_qp(lnk->roce_qp, &qp_attr,
			    IB_QP_STATE | IB_QP_TIMEOUT | IB_QP_RETRY_CNT |
			    IB_QP_SQ_PSN | IB_QP_RNR_RETRY |
			    IB_QP_MAX_QP_RD_ATOMIC);
}

int smc_ib_modify_qp_error(struct smc_link *lnk)
{
	struct ib_qp_attr qp_attr;

	memset(&qp_attr, 0, sizeof(qp_attr));
	qp_attr.qp_state = IB_QPS_ERR;
	return ib_modify_qp(lnk->roce_qp, &qp_attr, IB_QP_STATE);
}

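/* Bring a fresh link's QP to a usable state: both sides go through INIT
 * to RTR so they can receive, and the server side continues to RTS here;
 * the client side reaches RTS later, after the connection handshake, via
 * smc_ib_modify_qp_rts().
 */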
int smc_ib_ready_link(struct smc_link *lnk)
{
	struct smc_link_group *lgr = smc_get_lgr(lnk);
	int rc = 0;

	rc = smc_ib_modify_qp_init(lnk);
	if (rc)
		goto out;

	rc = smc_ib_modify_qp_rtr(lnk);
	if (rc)
		goto out;
	smc_wr_remember_qp_attr(lnk);
	rc = ib_req_notify_cq(lnk->smcibdev->roce_cq_recv,
			      IB_CQ_SOLICITED_MASK);
	if (rc)
		goto out;
	rc = smc_wr_rx_post_init(lnk);
	if (rc)
		goto out;
	smc_wr_remember_qp_attr(lnk);

	if (lgr->role == SMC_SERV) {
		rc = smc_ib_modify_qp_rts(lnk);
		if (rc)
			goto out;
		smc_wr_remember_qp_attr(lnk);
	}
out:
	return rc;
}

static int smc_ib_fill_mac(struct smc_ib_device *smcibdev, u8 ibport)
{
	const struct ib_gid_attr *attr;
	int rc;

	attr = rdma_get_gid_attr(smcibdev->ibdev, ibport, 0);
	if (IS_ERR(attr))
		return -ENODEV;

	rc = rdma_read_gid_l2_fields(attr, NULL, smcibdev->mac[ibport - 1]);
	rdma_put_gid_attr(attr);
	return rc;
}

/* Create an identifier unique for this instance of SMC-R.
 * The MAC address of the first active registered IB device
 * plus a random 2-byte number is used to create this identifier.
 * This name is delivered to the peer during connection initialization.
 */
static inline void smc_ib_define_local_systemid(struct smc_ib_device *smcibdev,
						u8 ibport)
{
	memcpy(&local_systemid[2], &smcibdev->mac[ibport - 1],
	       sizeof(smcibdev->mac[ibport - 1]));
}

bool smc_ib_is_valid_local_systemid(void)
{
	return !is_zero_ether_addr(&local_systemid[2]);
}

static void smc_ib_init_local_systemid(void)
{
	get_random_bytes(&local_systemid[0], 2);
}

bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport)
{
	return smcibdev->pattr[ibport - 1].state == IB_PORT_ACTIVE;
}

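/* Resolve the IPv4 route from saddr to daddr, returning the next-hop MAC
 * address and whether the route leaves the subnet through a gateway.
 * SMC-Rv2 stores both in the link group to build address handles for
 * routed RoCE traffic (see smc_ib_modify_qp_rtr() above).
 */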
int smc_ib_find_route(__be32 saddr, __be32 daddr,
		      u8 nexthop_mac[], u8 *uses_gateway)
{
	struct neighbour *neigh = NULL;
	struct rtable *rt = NULL;
	struct flowi4 fl4 = {
		.saddr = saddr,
		.daddr = daddr
	};

	if (daddr == cpu_to_be32(INADDR_NONE))
		goto out;
	rt = ip_route_output_flow(&init_net, &fl4, NULL);
	if (IS_ERR(rt))
		goto out;
	if (rt->rt_uses_gateway && rt->rt_gw_family != AF_INET)
		goto out;
	neigh = rt->dst.ops->neigh_lookup(&rt->dst, NULL, &fl4.daddr);
	if (neigh) {
		memcpy(nexthop_mac, neigh->ha, ETH_ALEN);
		*uses_gateway = rt->rt_uses_gateway;
		return 0;
	}
out:
	return -ENOENT;
}

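/* Check one GID table entry under the RCU read lock. SMC-Rv1 accepts any
 * RoCE v1 GID; SMC-Rv2 requires a RoCE v2 (UDP-encapsulated) GID whose
 * embedded IPv4 address lies in the same subnet as the source address,
 * plus a resolvable route when a destination address is given.
 */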
static int smc_ib_determine_gid_rcu(const struct net_device *ndev,
				    const struct ib_gid_attr *attr,
				    u8 gid[], u8 *sgid_index,
				    struct smc_init_info_smcrv2 *smcrv2)
{
	if (!smcrv2 && attr->gid_type == IB_GID_TYPE_ROCE) {
		if (gid)
			memcpy(gid, &attr->gid, SMC_GID_SIZE);
		if (sgid_index)
			*sgid_index = attr->index;
		return 0;
	}
	if (smcrv2 && attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP &&
	    smc_ib_gid_to_ipv4((u8 *)&attr->gid) != cpu_to_be32(INADDR_NONE)) {
		struct in_device *in_dev = __in_dev_get_rcu(ndev);
		const struct in_ifaddr *ifa;
		bool subnet_match = false;

		if (!in_dev)
			goto out;
		in_dev_for_each_ifa_rcu(ifa, in_dev) {
			if (!inet_ifa_match(smcrv2->saddr, ifa))
				continue;
			subnet_match = true;
			break;
		}
		if (!subnet_match)
			goto out;
		if (smcrv2->daddr && smc_ib_find_route(smcrv2->saddr,
						       smcrv2->daddr,
						       smcrv2->nexthop_mac,
						       &smcrv2->uses_gateway))
			goto out;

		if (gid)
			memcpy(gid, &attr->gid, SMC_GID_SIZE);
		if (sgid_index)
			*sgid_index = attr->index;
		return 0;
	}
out:
	return -ENODEV;
}

/* determine the gid for an ib-device port and vlan id */
int smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport,
			 unsigned short vlan_id, u8 gid[], u8 *sgid_index,
			 struct smc_init_info_smcrv2 *smcrv2)
{
	const struct ib_gid_attr *attr;
	const struct net_device *ndev;
	int i;

	for (i = 0; i < smcibdev->pattr[ibport - 1].gid_tbl_len; i++) {
		attr = rdma_get_gid_attr(smcibdev->ibdev, ibport, i);
		if (IS_ERR(attr))
			continue;

		rcu_read_lock();
		ndev = rdma_read_gid_attr_ndev_rcu(attr);
		if (!IS_ERR(ndev) &&
		    ((!vlan_id && !is_vlan_dev(ndev)) ||
		     (vlan_id && is_vlan_dev(ndev) &&
		      vlan_dev_vlan_id(ndev) == vlan_id))) {
			if (!smc_ib_determine_gid_rcu(ndev, attr, gid,
						      sgid_index, smcrv2)) {
				rcu_read_unlock();
				rdma_put_gid_attr(attr);
				return 0;
			}
		}
		rcu_read_unlock();
		rdma_put_gid_attr(attr);
	}
	return -ENODEV;
}

/* check if gid is still defined on smcibdev */
static bool smc_ib_check_link_gid(u8 gid[SMC_GID_SIZE], bool smcrv2,
				  struct smc_ib_device *smcibdev, u8 ibport)
{
	const struct ib_gid_attr *attr;
	bool rc = false;
	int i;

	for (i = 0; !rc && i < smcibdev->pattr[ibport - 1].gid_tbl_len; i++) {
		attr = rdma_get_gid_attr(smcibdev->ibdev, ibport, i);
		if (IS_ERR(attr))
			continue;

		rcu_read_lock();
		if ((!smcrv2 && attr->gid_type == IB_GID_TYPE_ROCE) ||
		    (smcrv2 && attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP &&
		     !(ipv6_addr_type((const struct in6_addr *)&attr->gid)
				     & IPV6_ADDR_LINKLOCAL)))
			if (!memcmp(gid, &attr->gid, SMC_GID_SIZE))
				rc = true;
		rcu_read_unlock();
		rdma_put_gid_attr(attr);
	}
	return rc;
}

/* check all links if the gid is still defined on smcibdev */
static void smc_ib_gid_check(struct smc_ib_device *smcibdev, u8 ibport)
{
	struct smc_link_group *lgr;
	int i;

	spin_lock_bh(&smc_lgr_list.lock);
	list_for_each_entry(lgr, &smc_lgr_list.list, list) {
		if (strncmp(smcibdev->pnetid[ibport - 1], lgr->pnet_id,
			    SMC_MAX_PNETID_LEN))
			continue; /* lgr is not affected */
		if (list_empty(&lgr->list))
			continue;
		for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
			if (lgr->lnk[i].state == SMC_LNK_UNUSED ||
			    lgr->lnk[i].smcibdev != smcibdev)
				continue;
			if (!smc_ib_check_link_gid(lgr->lnk[i].gid,
						   lgr->smc_version == SMC_V2,
						   smcibdev, ibport))
				smcr_port_err(smcibdev, ibport);
		}
	}
	spin_unlock_bh(&smc_lgr_list.lock);
}

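/* Refresh the cached attributes and MAC address of one port, and derive
 * the local system identifier from the first active port if that has not
 * happened yet. May sleep (ib_query_port()), hence the work-queue wrapper
 * below.
 */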
static int smc_ib_remember_port_attr(struct smc_ib_device *smcibdev, u8 ibport)
{
	int rc;

	memset(&smcibdev->pattr[ibport - 1], 0,
	       sizeof(smcibdev->pattr[ibport - 1]));
	rc = ib_query_port(smcibdev->ibdev, ibport,
			   &smcibdev->pattr[ibport - 1]);
	if (rc)
		goto out;
	/* the SMC protocol requires specification of the RoCE MAC address */
	rc = smc_ib_fill_mac(smcibdev, ibport);
	if (rc)
		goto out;
	if (!smc_ib_is_valid_local_systemid() &&
	    smc_ib_port_active(smcibdev, ibport))
		/* create unique system identifier */
		smc_ib_define_local_systemid(smcibdev, ibport);
out:
	return rc;
}

/* process context wrapper for might_sleep smc_ib_remember_port_attr */
static void smc_ib_port_event_work(struct work_struct *work)
{
	struct smc_ib_device *smcibdev = container_of(
		work, struct smc_ib_device, port_event_work);
	u8 port_idx;

	for_each_set_bit(port_idx, &smcibdev->port_event_mask, SMC_MAX_PORTS) {
		smc_ib_remember_port_attr(smcibdev, port_idx + 1);
		clear_bit(port_idx, &smcibdev->port_event_mask);
		if (!smc_ib_port_active(smcibdev, port_idx + 1)) {
			set_bit(port_idx, smcibdev->ports_going_away);
			smcr_port_err(smcibdev, port_idx + 1);
		} else {
			clear_bit(port_idx, smcibdev->ports_going_away);
			smcr_port_add(smcibdev, port_idx + 1);
			smc_ib_gid_check(smcibdev, port_idx + 1);
		}
	}
}

/* can be called in IRQ context */
static void smc_ib_global_event_handler(struct ib_event_handler *handler,
					struct ib_event *ibevent)
{
	struct smc_ib_device *smcibdev;
	bool schedule = false;
	u8 port_idx;

	smcibdev = container_of(handler, struct smc_ib_device, event_handler);

	switch (ibevent->event) {
	case IB_EVENT_DEVICE_FATAL:
		/* terminate all ports on device */
		for (port_idx = 0; port_idx < SMC_MAX_PORTS; port_idx++) {
			set_bit(port_idx, &smcibdev->port_event_mask);
			if (!test_and_set_bit(port_idx,
					      smcibdev->ports_going_away))
				schedule = true;
		}
		if (schedule)
			schedule_work(&smcibdev->port_event_work);
		break;
	case IB_EVENT_PORT_ACTIVE:
		port_idx = ibevent->element.port_num - 1;
		if (port_idx >= SMC_MAX_PORTS)
			break;
		set_bit(port_idx, &smcibdev->port_event_mask);
		if (test_and_clear_bit(port_idx, smcibdev->ports_going_away))
			schedule_work(&smcibdev->port_event_work);
		break;
	case IB_EVENT_PORT_ERR:
		port_idx = ibevent->element.port_num - 1;
		if (port_idx >= SMC_MAX_PORTS)
			break;
		set_bit(port_idx, &smcibdev->port_event_mask);
		if (!test_and_set_bit(port_idx, smcibdev->ports_going_away))
			schedule_work(&smcibdev->port_event_work);
		break;
	case IB_EVENT_GID_CHANGE:
		port_idx = ibevent->element.port_num - 1;
		if (port_idx >= SMC_MAX_PORTS)
			break;
		set_bit(port_idx, &smcibdev->port_event_mask);
		schedule_work(&smcibdev->port_event_work);
		break;
	default:
		break;
	}
}

void smc_ib_dealloc_protection_domain(struct smc_link *lnk)
{
	if (lnk->roce_pd)
		ib_dealloc_pd(lnk->roce_pd);
	lnk->roce_pd = NULL;
}

int smc_ib_create_protection_domain(struct smc_link *lnk)
{
	int rc;

	lnk->roce_pd = ib_alloc_pd(lnk->smcibdev->ibdev, 0);
	rc = PTR_ERR_OR_ZERO(lnk->roce_pd);
	if (IS_ERR(lnk->roce_pd))
		lnk->roce_pd = NULL;
	return rc;
}

static bool smcr_diag_is_dev_critical(struct smc_lgr_list *smc_lgr,
				      struct smc_ib_device *smcibdev)
{
	struct smc_link_group *lgr;
	bool rc = false;
	int i;

	spin_lock_bh(&smc_lgr->lock);
	list_for_each_entry(lgr, &smc_lgr->list, list) {
		if (lgr->is_smcd)
			continue;
		for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
			if (lgr->lnk[i].state == SMC_LNK_UNUSED ||
			    lgr->lnk[i].smcibdev != smcibdev)
				continue;
			if (lgr->type == SMC_LGR_SINGLE ||
			    lgr->type == SMC_LGR_ASYMMETRIC_LOCAL) {
				rc = true;
				goto out;
			}
		}
	}
out:
	spin_unlock_bh(&smc_lgr->lock);
	return rc;
}

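/* The netlink helpers below all follow the same attribute-nesting
 * pattern: open a nest, emit attributes, close the nest, and cancel the
 * whole nest if any nla_put_*() runs out of skb space. A minimal sketch
 * of that pattern (attribute names are illustrative only):
 *
 *	attrs = nla_nest_start(skb, SOME_NEST_TYPE);
 *	if (!attrs)
 *		return -EMSGSIZE;
 *	if (nla_put_u32(skb, SOME_ATTR, val))
 *		goto err;	// rolls back everything since nest_start
 *	nla_nest_end(skb, attrs);
 *	return 0;
 *	err:
 *	nla_nest_cancel(skb, attrs);
 *	return -EMSGSIZE;
 */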
static int smc_nl_handle_dev_port(struct sk_buff *skb,
				  struct ib_device *ibdev,
				  struct smc_ib_device *smcibdev,
				  int port)
{
	char smc_pnet[SMC_MAX_PNETID_LEN + 1];
	struct nlattr *port_attrs;
	unsigned char port_state;
	int lnk_count = 0;

	port_attrs = nla_nest_start(skb, SMC_NLA_DEV_PORT + port);
	if (!port_attrs)
		goto errout;

	if (nla_put_u8(skb, SMC_NLA_DEV_PORT_PNET_USR,
		       smcibdev->pnetid_by_user[port]))
		goto errattr;
	memcpy(smc_pnet, &smcibdev->pnetid[port], SMC_MAX_PNETID_LEN);
	smc_pnet[SMC_MAX_PNETID_LEN] = 0;
	if (nla_put_string(skb, SMC_NLA_DEV_PORT_PNETID, smc_pnet))
		goto errattr;
	if (nla_put_u32(skb, SMC_NLA_DEV_PORT_NETDEV,
			smcibdev->ndev_ifidx[port]))
		goto errattr;
	if (nla_put_u8(skb, SMC_NLA_DEV_PORT_VALID, 1))
		goto errattr;
	port_state = smc_ib_port_active(smcibdev, port + 1);
	if (nla_put_u8(skb, SMC_NLA_DEV_PORT_STATE, port_state))
		goto errattr;
	lnk_count = atomic_read(&smcibdev->lnk_cnt_by_port[port]);
	if (nla_put_u32(skb, SMC_NLA_DEV_PORT_LNK_CNT, lnk_count))
		goto errattr;
	nla_nest_end(skb, port_attrs);
	return 0;
errattr:
	nla_nest_cancel(skb, port_attrs);
errout:
	return -EMSGSIZE;
}

static bool smc_nl_handle_pci_values(const struct smc_pci_dev *smc_pci_dev,
				     struct sk_buff *skb)
{
	if (nla_put_u32(skb, SMC_NLA_DEV_PCI_FID, smc_pci_dev->pci_fid))
		return false;
	if (nla_put_u16(skb, SMC_NLA_DEV_PCI_CHID, smc_pci_dev->pci_pchid))
		return false;
	if (nla_put_u16(skb, SMC_NLA_DEV_PCI_VENDOR, smc_pci_dev->pci_vendor))
		return false;
	if (nla_put_u16(skb, SMC_NLA_DEV_PCI_DEVICE, smc_pci_dev->pci_device))
		return false;
	if (nla_put_string(skb, SMC_NLA_DEV_PCI_ID, smc_pci_dev->pci_id))
		return false;
	return true;
}

static int smc_nl_handle_smcr_dev(struct smc_ib_device *smcibdev,
				  struct sk_buff *skb,
				  struct netlink_callback *cb)
{
	char smc_ibname[IB_DEVICE_NAME_MAX];
	struct smc_pci_dev smc_pci_dev;
	struct pci_dev *pci_dev;
	unsigned char is_crit;
	struct nlattr *attrs;
	void *nlh;
	int i;

	nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
			  &smc_gen_nl_family, NLM_F_MULTI,
			  SMC_NETLINK_GET_DEV_SMCR);
	if (!nlh)
		goto errmsg;
	attrs = nla_nest_start(skb, SMC_GEN_DEV_SMCR);
	if (!attrs)
		goto errout;
	is_crit = smcr_diag_is_dev_critical(&smc_lgr_list, smcibdev);
	if (nla_put_u8(skb, SMC_NLA_DEV_IS_CRIT, is_crit))
		goto errattr;
	if (smcibdev->ibdev->dev.parent) {
		memset(&smc_pci_dev, 0, sizeof(smc_pci_dev));
		pci_dev = to_pci_dev(smcibdev->ibdev->dev.parent);
		smc_set_pci_values(pci_dev, &smc_pci_dev);
		if (!smc_nl_handle_pci_values(&smc_pci_dev, skb))
			goto errattr;
	}
	snprintf(smc_ibname, sizeof(smc_ibname), "%s", smcibdev->ibdev->name);
	if (nla_put_string(skb, SMC_NLA_DEV_IB_NAME, smc_ibname))
		goto errattr;
	for (i = 1; i <= SMC_MAX_PORTS; i++) {
		if (!rdma_is_port_valid(smcibdev->ibdev, i))
			continue;
		if (smc_nl_handle_dev_port(skb, smcibdev->ibdev,
					   smcibdev, i - 1))
			goto errattr;
	}

	nla_nest_end(skb, attrs);
	genlmsg_end(skb, nlh);
	return 0;

errattr:
	nla_nest_cancel(skb, attrs);
errout:
	genlmsg_cancel(skb, nlh);
errmsg:
	return -EMSGSIZE;
}

static void smc_nl_prep_smcr_dev(struct smc_ib_devices *dev_list,
				 struct sk_buff *skb,
				 struct netlink_callback *cb)
{
	struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb);
	struct smc_ib_device *smcibdev;
	int snum = cb_ctx->pos[0];
	int num = 0;

	mutex_lock(&dev_list->mutex);
	list_for_each_entry(smcibdev, &dev_list->list, list) {
		if (num < snum)
			goto next;
		if (smc_nl_handle_smcr_dev(smcibdev, skb, cb))
			goto errout;
next:
		num++;
	}
errout:
	mutex_unlock(&dev_list->mutex);
	cb_ctx->pos[0] = num;
}

int smcr_nl_get_device(struct sk_buff *skb, struct netlink_callback *cb)
{
	smc_nl_prep_smcr_dev(&smc_ib_devices, skb, cb);
	return skb->len;
}

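/* QP-level async events: fatal and access errors are funnelled through
 * the same deferred port_event_work as the device-global handler above,
 * treating them like a port error.
 */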
static void smc_ib_qp_event_handler(struct ib_event *ibevent, void *priv)
{
	struct smc_link *lnk = (struct smc_link *)priv;
	struct smc_ib_device *smcibdev = lnk->smcibdev;
	u8 port_idx;

	switch (ibevent->event) {
	case IB_EVENT_QP_FATAL:
	case IB_EVENT_QP_ACCESS_ERR:
		port_idx = ibevent->element.qp->port - 1;
		if (port_idx >= SMC_MAX_PORTS)
			break;
		set_bit(port_idx, &smcibdev->port_event_mask);
		if (!test_and_set_bit(port_idx, smcibdev->ports_going_away))
			schedule_work(&smcibdev->port_event_work);
		break;
	default:
		break;
	}
}

void smc_ib_destroy_queue_pair(struct smc_link *lnk)
{
	if (lnk->roce_qp)
		ib_destroy_qp(lnk->roce_qp);
	lnk->roce_qp = NULL;
}

/* create a queue pair within the protection domain for a link */
int smc_ib_create_queue_pair(struct smc_link *lnk)
{
	int sges_per_buf = (lnk->lgr->smc_version == SMC_V2) ? 2 : 1;
	struct ib_qp_init_attr qp_attr = {
		.event_handler = smc_ib_qp_event_handler,
		.qp_context = lnk,
		.send_cq = lnk->smcibdev->roce_cq_send,
		.recv_cq = lnk->smcibdev->roce_cq_recv,
		.srq = NULL,
		.cap = {
				/* include unsolicited rdma_writes as well,
				 * there are max. 2 RDMA_WRITE per 1 WR_SEND
				 */
			.max_send_wr = SMC_WR_BUF_CNT * 3,
			.max_recv_wr = SMC_WR_BUF_CNT * 3,
			.max_send_sge = SMC_IB_MAX_SEND_SGE,
			.max_recv_sge = sges_per_buf,
			.max_inline_data = 0,
		},
		.sq_sig_type = IB_SIGNAL_REQ_WR,
		.qp_type = IB_QPT_RC,
	};
	int rc;

	lnk->roce_qp = ib_create_qp(lnk->roce_pd, &qp_attr);
	rc = PTR_ERR_OR_ZERO(lnk->roce_qp);
	if (IS_ERR(lnk->roce_qp))
		lnk->roce_qp = NULL;
	else
		smc_wr_remember_qp_attr(lnk);
	return rc;
}

void smc_ib_put_memory_region(struct ib_mr *mr)
{
	ib_dereg_mr(mr);
}

static int smc_ib_map_mr_sg(struct smc_buf_desc *buf_slot, u8 link_idx)
{
	unsigned int offset = 0;
	int sg_num;

	/* map the largest prefix of a dma mapped SG list */
	sg_num = ib_map_mr_sg(buf_slot->mr[link_idx],
			      buf_slot->sgt[link_idx].sgl,
			      buf_slot->sgt[link_idx].orig_nents,
			      &offset, PAGE_SIZE);

	return sg_num;
}

/* Allocate a memory region and map the dma mapped SG list of buf_slot */
int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags,
			     struct smc_buf_desc *buf_slot, u8 link_idx)
{
	if (buf_slot->mr[link_idx])
		return 0; /* already done */

	buf_slot->mr[link_idx] =
		ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, 1 << buf_slot->order);
	if (IS_ERR(buf_slot->mr[link_idx])) {
		int rc;

		rc = PTR_ERR(buf_slot->mr[link_idx]);
		buf_slot->mr[link_idx] = NULL;
		return rc;
	}

	if (smc_ib_map_mr_sg(buf_slot, link_idx) !=
				 buf_slot->sgt[link_idx].orig_nents)
		return -EINVAL;

	return 0;
}

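/* Ask the DMA layer whether any address in the buffer's SG list requires
 * explicit CPU/device synchronization. The result is cached per link as a
 * bit in buf_slot->is_dma_need_sync (set by the buffer-mapping code), and
 * the two sync helpers below test that bit to return early on coherent
 * DMA setups.
 */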
bool smc_ib_is_sg_need_sync(struct smc_link *lnk,
			    struct smc_buf_desc *buf_slot)
{
	struct scatterlist *sg;
	unsigned int i;
	bool ret = false;

	/* for now there is just one DMA address */
	for_each_sg(buf_slot->sgt[lnk->link_idx].sgl, sg,
		    buf_slot->sgt[lnk->link_idx].nents, i) {
		if (!sg_dma_len(sg))
			break;
		if (dma_need_sync(lnk->smcibdev->ibdev->dma_device,
				  sg_dma_address(sg))) {
			ret = true;
			goto out;
		}
	}

out:
	return ret;
}

/* synchronize buffer usage for cpu access */
void smc_ib_sync_sg_for_cpu(struct smc_link *lnk,
			    struct smc_buf_desc *buf_slot,
			    enum dma_data_direction data_direction)
{
	struct scatterlist *sg;
	unsigned int i;

	if (!(buf_slot->is_dma_need_sync & (1U << lnk->link_idx)))
		return;

	/* for now there is just one DMA address */
	for_each_sg(buf_slot->sgt[lnk->link_idx].sgl, sg,
		    buf_slot->sgt[lnk->link_idx].nents, i) {
		if (!sg_dma_len(sg))
			break;
		ib_dma_sync_single_for_cpu(lnk->smcibdev->ibdev,
					   sg_dma_address(sg),
					   sg_dma_len(sg),
					   data_direction);
	}
}

/* synchronize buffer usage for device access */
void smc_ib_sync_sg_for_device(struct smc_link *lnk,
			       struct smc_buf_desc *buf_slot,
			       enum dma_data_direction data_direction)
{
	struct scatterlist *sg;
	unsigned int i;

	if (!(buf_slot->is_dma_need_sync & (1U << lnk->link_idx)))
		return;

	/* for now there is just one DMA address */
	for_each_sg(buf_slot->sgt[lnk->link_idx].sgl, sg,
		    buf_slot->sgt[lnk->link_idx].nents, i) {
		if (!sg_dma_len(sg))
			break;
		ib_dma_sync_single_for_device(lnk->smcibdev->ibdev,
					      sg_dma_address(sg),
					      sg_dma_len(sg),
					      data_direction);
	}
}

/* Map a new TX or RX buffer SG-table to DMA */
int smc_ib_buf_map_sg(struct smc_link *lnk,
		      struct smc_buf_desc *buf_slot,
		      enum dma_data_direction data_direction)
{
	int mapped_nents;

	mapped_nents = ib_dma_map_sg(lnk->smcibdev->ibdev,
				     buf_slot->sgt[lnk->link_idx].sgl,
				     buf_slot->sgt[lnk->link_idx].orig_nents,
				     data_direction);
	if (!mapped_nents)
		return -ENOMEM;

	return mapped_nents;
}

void smc_ib_buf_unmap_sg(struct smc_link *lnk,
			 struct smc_buf_desc *buf_slot,
			 enum dma_data_direction data_direction)
{
	if (!buf_slot->sgt[lnk->link_idx].sgl->dma_address)
		return; /* already unmapped */

	ib_dma_unmap_sg(lnk->smcibdev->ibdev,
			buf_slot->sgt[lnk->link_idx].sgl,
			buf_slot->sgt[lnk->link_idx].orig_nents,
			data_direction);
	buf_slot->sgt[lnk->link_idx].sgl->dma_address = 0;
}

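/* Create the send and receive completion queues shared by all links on
 * this device. A worked example of the CQE clamping below, assuming
 * 4 KiB pages, 64-byte cache lines (cqe_size_order 6) and MAX_ORDER 11:
 * smc_order = 11 - 6 - 1 = 4, giving a limit of (1 << 4) * 4096 - 2 =
 * 65534 entries, so the default SMC_MAX_CQE = 32766 stays unchanged;
 * 128-byte cache lines halve that limit.
 */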
long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev)
{
	struct ib_cq_init_attr cqattr = {
		.cqe = SMC_MAX_CQE, .comp_vector = 0 };
	int cqe_size_order, smc_order;
	long rc;

	mutex_lock(&smcibdev->mutex);
	rc = 0;
	if (smcibdev->initialized)
		goto out;
	/* the calculated number of cq entries fits to mlx5 cq allocation */
	cqe_size_order = cache_line_size() == 128 ? 7 : 6;
	smc_order = MAX_ORDER - cqe_size_order - 1;
	if (SMC_MAX_CQE + 2 > (0x00000001 << smc_order) * PAGE_SIZE)
		cqattr.cqe = (0x00000001 << smc_order) * PAGE_SIZE - 2;
	smcibdev->roce_cq_send = ib_create_cq(smcibdev->ibdev,
					      smc_wr_tx_cq_handler, NULL,
					      smcibdev, &cqattr);
	rc = PTR_ERR_OR_ZERO(smcibdev->roce_cq_send);
	if (IS_ERR(smcibdev->roce_cq_send)) {
		smcibdev->roce_cq_send = NULL;
		goto out;
	}
	smcibdev->roce_cq_recv = ib_create_cq(smcibdev->ibdev,
					      smc_wr_rx_cq_handler, NULL,
					      smcibdev, &cqattr);
	rc = PTR_ERR_OR_ZERO(smcibdev->roce_cq_recv);
	if (IS_ERR(smcibdev->roce_cq_recv)) {
		smcibdev->roce_cq_recv = NULL;
		goto err;
	}
	smc_wr_add_dev(smcibdev);
	smcibdev->initialized = 1;
	goto out;

err:
	ib_destroy_cq(smcibdev->roce_cq_send);
out:
	mutex_unlock(&smcibdev->mutex);
	return rc;
}

static void smc_ib_cleanup_per_ibdev(struct smc_ib_device *smcibdev)
{
	mutex_lock(&smcibdev->mutex);
	if (!smcibdev->initialized)
		goto out;
	smcibdev->initialized = 0;
	ib_destroy_cq(smcibdev->roce_cq_recv);
	ib_destroy_cq(smcibdev->roce_cq_send);
	smc_wr_remove_dev(smcibdev);
out:
	mutex_unlock(&smcibdev->mutex);
}

static struct ib_client smc_ib_client;

static void smc_copy_netdev_ifindex(struct smc_ib_device *smcibdev, int port)
{
	struct ib_device *ibdev = smcibdev->ibdev;
	struct net_device *ndev;

	if (!ibdev->ops.get_netdev)
		return;
	ndev = ibdev->ops.get_netdev(ibdev, port + 1);
	if (ndev) {
		smcibdev->ndev_ifidx[port] = ndev->ifindex;
		dev_put(ndev);
	}
}

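/* Called from SMC's netdev event handling: keep the cached ifindex of
 * each port's companion net_device in sync as interfaces register and
 * unregister.
 */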
void smc_ib_ndev_change(struct net_device *ndev, unsigned long event)
{
	struct smc_ib_device *smcibdev;
	struct ib_device *libdev;
	struct net_device *lndev;
	u8 port_cnt;
	int i;

	mutex_lock(&smc_ib_devices.mutex);
	list_for_each_entry(smcibdev, &smc_ib_devices.list, list) {
		port_cnt = smcibdev->ibdev->phys_port_cnt;
		for (i = 0; i < min_t(size_t, port_cnt, SMC_MAX_PORTS); i++) {
			libdev = smcibdev->ibdev;
			if (!libdev->ops.get_netdev)
				continue;
			lndev = libdev->ops.get_netdev(libdev, i + 1);
			dev_put(lndev);
			if (lndev != ndev)
				continue;
			if (event == NETDEV_REGISTER)
				smcibdev->ndev_ifidx[i] = ndev->ifindex;
			if (event == NETDEV_UNREGISTER)
				smcibdev->ndev_ifidx[i] = 0;
		}
	}
	mutex_unlock(&smc_ib_devices.mutex);
}

/* callback function for ib_register_client() */
static int smc_ib_add_dev(struct ib_device *ibdev)
{
	struct smc_ib_device *smcibdev;
	u8 port_cnt;
	int i;

	if (ibdev->node_type != RDMA_NODE_IB_CA)
		return -EOPNOTSUPP;

	smcibdev = kzalloc(sizeof(*smcibdev), GFP_KERNEL);
	if (!smcibdev)
		return -ENOMEM;

	smcibdev->ibdev = ibdev;
	INIT_WORK(&smcibdev->port_event_work, smc_ib_port_event_work);
	atomic_set(&smcibdev->lnk_cnt, 0);
	init_waitqueue_head(&smcibdev->lnks_deleted);
	mutex_init(&smcibdev->mutex);
	mutex_lock(&smc_ib_devices.mutex);
	list_add_tail(&smcibdev->list, &smc_ib_devices.list);
	mutex_unlock(&smc_ib_devices.mutex);
	ib_set_client_data(ibdev, &smc_ib_client, smcibdev);
	INIT_IB_EVENT_HANDLER(&smcibdev->event_handler, smcibdev->ibdev,
			      smc_ib_global_event_handler);
	ib_register_event_handler(&smcibdev->event_handler);

	/* trigger reading of the port attributes */
	port_cnt = smcibdev->ibdev->phys_port_cnt;
	pr_warn_ratelimited("smc: adding ib device %s with port count %d\n",
			    smcibdev->ibdev->name, port_cnt);
	for (i = 0;
	     i < min_t(size_t, port_cnt, SMC_MAX_PORTS);
	     i++) {
		set_bit(i, &smcibdev->port_event_mask);
		/* determine pnetids of the port */
		if (smc_pnetid_by_dev_port(ibdev->dev.parent, i,
					   smcibdev->pnetid[i]))
			smc_pnetid_by_table_ib(smcibdev, i + 1);
		smc_copy_netdev_ifindex(smcibdev, i);
		pr_warn_ratelimited("smc:    ib device %s port %d has pnetid "
				    "%.16s%s\n",
				    smcibdev->ibdev->name, i + 1,
				    smcibdev->pnetid[i],
				    smcibdev->pnetid_by_user[i] ?
				     " (user defined)" :
				     "");
	}
	schedule_work(&smcibdev->port_event_work);
	return 0;
}

/* callback function for ib_unregister_client() */
static void smc_ib_remove_dev(struct ib_device *ibdev, void *client_data)
{
	struct smc_ib_device *smcibdev = client_data;

	mutex_lock(&smc_ib_devices.mutex);
	list_del_init(&smcibdev->list); /* remove from smc_ib_devices */
	mutex_unlock(&smc_ib_devices.mutex);
	pr_warn_ratelimited("smc: removing ib device %s\n",
			    smcibdev->ibdev->name);
	smc_smcr_terminate_all(smcibdev);
	smc_ib_cleanup_per_ibdev(smcibdev);
	ib_unregister_event_handler(&smcibdev->event_handler);
	cancel_work_sync(&smcibdev->port_event_work);
	kfree(smcibdev);
}

static struct ib_client smc_ib_client = {
	.name	= "smc_ib",
	.add	= smc_ib_add_dev,
	.remove	= smc_ib_remove_dev,
};

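/* Entry points for module init/exit: seed the random two-byte prefix of
 * the local system identifier, then register with the IB core so that
 * smc_ib_add_dev()/smc_ib_remove_dev() run for every RDMA device. A
 * sketch of the expected call site (the actual caller lives in the SMC
 * module init path, outside this file):
 *
 *	rc = smc_ib_register_client();
 *	if (rc)
 *		return rc;	// IB client registration failed
 */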
int __init smc_ib_register_client(void)
{
	smc_ib_init_local_systemid();
	return ib_register_client(&smc_ib_client);
}

void smc_ib_unregister_client(void)
{
	ib_unregister_client(&smc_ib_client);
}