// SPDX-License-Identifier: GPL-2.0
/*
 * Shared Memory Communications over RDMA (SMC-R) and RoCE
 *
 * IB infrastructure:
 * Establish SMC-R as an Infiniband Client to be notified about added and
 * removed IB devices of type RDMA.
 * Determine device and port characteristics for these IB devices.
 */

#include <linux/etherdevice.h>
#include <linux/if_vlan.h>
#include <linux/random.h>
#include <linux/workqueue.h>
#include <linux/scatterlist.h>
#include <linux/wait.h>
#include <linux/mutex.h>
#include <linux/inetdevice.h>
#include <rdma/ib_verbs.h>
#include <rdma/ib_cache.h>

#include "smc_pnet.h"
#include "smc_ib.h"
#include "smc_core.h"
#include "smc_wr.h"
#include "smc.h"
#include "smc_netlink.h"

#define SMC_MAX_CQE 32766	/* max. # of completion queue elements */

#define SMC_QP_MIN_RNR_TIMER		5
#define SMC_QP_TIMEOUT			15 /* 4096 * 2 ** timeout usec */
#define SMC_QP_RETRY_CNT		7 /* 7: infinite */
#define SMC_QP_RNR_RETRY		7 /* 7: infinite */

struct smc_ib_devices smc_ib_devices = {	/* smc-registered ib devices */
	.mutex = __MUTEX_INITIALIZER(smc_ib_devices.mutex),
	.list = LIST_HEAD_INIT(smc_ib_devices.list),
};

u8 local_systemid[SMC_SYSTEMID_LEN];		/* unique system identifier */

static int smc_ib_modify_qp_init(struct smc_link *lnk)
{
	struct ib_qp_attr qp_attr;

	memset(&qp_attr, 0, sizeof(qp_attr));
	qp_attr.qp_state = IB_QPS_INIT;
	qp_attr.pkey_index = 0;
	qp_attr.port_num = lnk->ibport;
	qp_attr.qp_access_flags = IB_ACCESS_LOCAL_WRITE
				| IB_ACCESS_REMOTE_WRITE;
	return ib_modify_qp(lnk->roce_qp, &qp_attr,
			    IB_QP_STATE | IB_QP_PKEY_INDEX |
			    IB_QP_ACCESS_FLAGS | IB_QP_PORT);
}

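/* move the QP to state ready-to-receive: set path MTU, remote GID/MAC,
 * destination QP number and receive PSN as negotiated with the peer
 */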
static int smc_ib_modify_qp_rtr(struct smc_link *lnk)
{
	enum ib_qp_attr_mask qp_attr_mask =
		IB_QP_STATE | IB_QP_AV | IB_QP_PATH_MTU | IB_QP_DEST_QPN |
		IB_QP_RQ_PSN | IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_MIN_RNR_TIMER;
	struct ib_qp_attr qp_attr;
	u8 hop_lim = 1;

	memset(&qp_attr, 0, sizeof(qp_attr));
	qp_attr.qp_state = IB_QPS_RTR;
	qp_attr.path_mtu = min(lnk->path_mtu, lnk->peer_mtu);
	qp_attr.ah_attr.type = RDMA_AH_ATTR_TYPE_ROCE;
	rdma_ah_set_port_num(&qp_attr.ah_attr, lnk->ibport);
	if (lnk->lgr->smc_version == SMC_V2 && lnk->lgr->uses_gateway)
		hop_lim = IPV6_DEFAULT_HOPLIMIT;
	rdma_ah_set_grh(&qp_attr.ah_attr, NULL, 0, lnk->sgid_index, hop_lim, 0);
	rdma_ah_set_dgid_raw(&qp_attr.ah_attr, lnk->peer_gid);
	if (lnk->lgr->smc_version == SMC_V2 && lnk->lgr->uses_gateway)
		memcpy(&qp_attr.ah_attr.roce.dmac, lnk->lgr->nexthop_mac,
		       sizeof(lnk->lgr->nexthop_mac));
	else
		memcpy(&qp_attr.ah_attr.roce.dmac, lnk->peer_mac,
		       sizeof(lnk->peer_mac));
	qp_attr.dest_qp_num = lnk->peer_qpn;
	qp_attr.rq_psn = lnk->peer_psn; /* starting receive packet seq # */
	qp_attr.max_dest_rd_atomic = 1; /* max # of resources for incoming
					 * requests
					 */
	qp_attr.min_rnr_timer = SMC_QP_MIN_RNR_TIMER;

	return ib_modify_qp(lnk->roce_qp, &qp_attr, qp_attr_mask);
}


int smc_ib_modify_qp_rts(struct smc_link *lnk)
{
	struct ib_qp_attr qp_attr;

	memset(&qp_attr, 0, sizeof(qp_attr));
	qp_attr.qp_state = IB_QPS_RTS;
	qp_attr.timeout = SMC_QP_TIMEOUT;	/* local ack timeout */
	qp_attr.retry_cnt = SMC_QP_RETRY_CNT;	/* retry packet sequence err */
	qp_attr.rnr_retry = SMC_QP_RNR_RETRY;	/* RNR retries, 7=infinite */
	qp_attr.sq_psn = lnk->psn_initial;	/* starting send packet seq # */
	qp_attr.max_rd_atomic = 1;	/* # of outstanding RDMA reads and
					 * atomic ops allowed
					 */
	return ib_modify_qp(lnk->roce_qp, &qp_attr,
			    IB_QP_STATE | IB_QP_TIMEOUT | IB_QP_RETRY_CNT |
			    IB_QP_SQ_PSN | IB_QP_RNR_RETRY |
			    IB_QP_MAX_QP_RD_ATOMIC);
}

int smc_ib_modify_qp_error(struct smc_link *lnk)
{
	struct ib_qp_attr qp_attr;

	memset(&qp_attr, 0, sizeof(qp_attr));
	qp_attr.qp_state = IB_QPS_ERR;
	return ib_modify_qp(lnk->roce_qp, &qp_attr, IB_QP_STATE);
}

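/* bring the link's QP into a usable state: INIT -> RTR, arm the receive
 * completion queue and post the initial receive buffers; the server side
 * additionally moves the QP to RTS here
 */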
int smc_ib_ready_link(struct smc_link *lnk)
{
	struct smc_link_group *lgr = smc_get_lgr(lnk);
	int rc = 0;

	rc = smc_ib_modify_qp_init(lnk);
	if (rc)
		goto out;

	rc = smc_ib_modify_qp_rtr(lnk);
	if (rc)
		goto out;
	smc_wr_remember_qp_attr(lnk);
	rc = ib_req_notify_cq(lnk->smcibdev->roce_cq_recv,
			      IB_CQ_SOLICITED_MASK);
	if (rc)
		goto out;
	rc = smc_wr_rx_post_init(lnk);
	if (rc)
		goto out;
	smc_wr_remember_qp_attr(lnk);

	if (lgr->role == SMC_SERV) {
		rc = smc_ib_modify_qp_rts(lnk);
		if (rc)
			goto out;
		smc_wr_remember_qp_attr(lnk);
	}
out:
	return rc;
}

static int smc_ib_fill_mac(struct smc_ib_device *smcibdev, u8 ibport)
{
	const struct ib_gid_attr *attr;
	int rc;

	attr = rdma_get_gid_attr(smcibdev->ibdev, ibport, 0);
	if (IS_ERR(attr))
		return -ENODEV;

	rc = rdma_read_gid_l2_fields(attr, NULL, smcibdev->mac[ibport - 1]);
	rdma_put_gid_attr(attr);
	return rc;
}

/* Create an identifier that is unique for this instance of SMC-R.
 * The system identifier is built from a random 2-byte prefix plus the
 * MAC address of the first active registered RoCE port.
 */
static inline void smc_ib_define_local_systemid(struct smc_ib_device *smcibdev,
						u8 ibport)
{
	memcpy(&local_systemid[2], &smcibdev->mac[ibport - 1],
	       sizeof(smcibdev->mac[ibport - 1]));
}

bool smc_ib_is_valid_local_systemid(void)
{
	return !is_zero_ether_addr(&local_systemid[2]);
}

static void smc_ib_init_local_systemid(void)
{
	get_random_bytes(&local_systemid[0], 2);
}

bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport)
{
	return smcibdev->pattr[ibport - 1].state == IB_PORT_ACTIVE;
}

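/* determine the IPv4 route to the peer, including the next-hop MAC address
 * when a gateway sits in between (needed for SMC-Rv2 links)
 */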
int smc_ib_find_route(__be32 saddr, __be32 daddr,
		      u8 nexthop_mac[], u8 *uses_gateway)
{
	struct neighbour *neigh = NULL;
	struct rtable *rt = NULL;
	struct flowi4 fl4 = {
		.saddr = saddr,
		.daddr = daddr
	};

	if (daddr == cpu_to_be32(INADDR_NONE))
		goto out;
	rt = ip_route_output_flow(&init_net, &fl4, NULL);
	if (IS_ERR(rt))
		goto out;
	if (rt->rt_uses_gateway && rt->rt_gw_family != AF_INET)
		goto out;
	neigh = rt->dst.ops->neigh_lookup(&rt->dst, NULL, &fl4.daddr);
	if (neigh) {
		memcpy(nexthop_mac, neigh->ha, ETH_ALEN);
		*uses_gateway = rt->rt_uses_gateway;
		return 0;
	}
out:
	return -ENOENT;
}

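/* check a GID table entry under the RCU read lock: SMC-Rv1 accepts RoCEv1
 * GIDs, while SMC-Rv2 requires a RoCEv2 (UDP encap) GID whose IPv4 address
 * matches the subnet of the outgoing interface and for which a route to the
 * peer exists
 */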
static int smc_ib_determine_gid_rcu(const struct net_device *ndev,
				    const struct ib_gid_attr *attr,
				    u8 gid[], u8 *sgid_index,
				    struct smc_init_info_smcrv2 *smcrv2)
{
	if (!smcrv2 && attr->gid_type == IB_GID_TYPE_ROCE) {
		if (gid)
			memcpy(gid, &attr->gid, SMC_GID_SIZE);
		if (sgid_index)
			*sgid_index = attr->index;
		return 0;
	}
	if (smcrv2 && attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP &&
	    smc_ib_gid_to_ipv4((u8 *)&attr->gid) != cpu_to_be32(INADDR_NONE)) {
		struct in_device *in_dev = __in_dev_get_rcu(ndev);
		const struct in_ifaddr *ifa;
		bool subnet_match = false;

		if (!in_dev)
			goto out;
		in_dev_for_each_ifa_rcu(ifa, in_dev) {
			if (!inet_ifa_match(smcrv2->saddr, ifa))
				continue;
			subnet_match = true;
			break;
		}
		if (!subnet_match)
			goto out;
		if (smcrv2->daddr && smc_ib_find_route(smcrv2->saddr,
						       smcrv2->daddr,
						       smcrv2->nexthop_mac,
						       &smcrv2->uses_gateway))
			goto out;

		if (gid)
			memcpy(gid, &attr->gid, SMC_GID_SIZE);
		if (sgid_index)
			*sgid_index = attr->index;
		return 0;
	}
out:
	return -ENODEV;
}

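/* determine the GID (and its table index) of an ib device port that matches
 * the given VLAN id; for SMC-Rv2 also fill in the route information
 */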
int smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport,
			 unsigned short vlan_id, u8 gid[], u8 *sgid_index,
			 struct smc_init_info_smcrv2 *smcrv2)
{
	const struct ib_gid_attr *attr;
	const struct net_device *ndev;
	int i;

	for (i = 0; i < smcibdev->pattr[ibport - 1].gid_tbl_len; i++) {
		attr = rdma_get_gid_attr(smcibdev->ibdev, ibport, i);
		if (IS_ERR(attr))
			continue;

		rcu_read_lock();
		ndev = rdma_read_gid_attr_ndev_rcu(attr);
		if (!IS_ERR(ndev) &&
		    ((!vlan_id && !is_vlan_dev(ndev)) ||
		     (vlan_id && is_vlan_dev(ndev) &&
		      vlan_dev_vlan_id(ndev) == vlan_id))) {
			if (!smc_ib_determine_gid_rcu(ndev, attr, gid,
						      sgid_index, smcrv2)) {
				rcu_read_unlock();
				rdma_put_gid_attr(attr);
				return 0;
			}
		}
		rcu_read_unlock();
		rdma_put_gid_attr(attr);
	}
	return -ENODEV;
}

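/* check if the given GID is still defined on the ib device port */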
static bool smc_ib_check_link_gid(u8 gid[SMC_GID_SIZE], bool smcrv2,
				  struct smc_ib_device *smcibdev, u8 ibport)
{
	const struct ib_gid_attr *attr;
	bool rc = false;
	int i;

	for (i = 0; !rc && i < smcibdev->pattr[ibport - 1].gid_tbl_len; i++) {
		attr = rdma_get_gid_attr(smcibdev->ibdev, ibport, i);
		if (IS_ERR(attr))
			continue;

		rcu_read_lock();
		if ((!smcrv2 && attr->gid_type == IB_GID_TYPE_ROCE) ||
		    (smcrv2 && attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP &&
		     !(ipv6_addr_type((const struct in6_addr *)&attr->gid)
		       & IPV6_ADDR_LINKLOCAL)))
			if (!memcmp(gid, &attr->gid, SMC_GID_SIZE))
				rc = true;
		rcu_read_unlock();
		rdma_put_gid_attr(attr);
	}
	return rc;
}

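/* check all links of all link groups using this port: if a link's GID is no
 * longer defined on the device, take the port down for that link group
 */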
static void smc_ib_gid_check(struct smc_ib_device *smcibdev, u8 ibport)
{
	struct smc_link_group *lgr;
	int i;

	spin_lock_bh(&smc_lgr_list.lock);
	list_for_each_entry(lgr, &smc_lgr_list.list, list) {
		if (strncmp(smcibdev->pnetid[ibport - 1], lgr->pnet_id,
			    SMC_MAX_PNETID_LEN))
			continue; /* lgr is not affected by this port */
		if (list_empty(&lgr->list))
			continue;
		for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
			if (lgr->lnk[i].state == SMC_LNK_UNUSED ||
			    lgr->lnk[i].smcibdev != smcibdev)
				continue;
			if (!smc_ib_check_link_gid(lgr->lnk[i].gid,
						   lgr->smc_version == SMC_V2,
						   smcibdev, ibport))
				smcr_port_err(smcibdev, ibport);
		}
	}
	spin_unlock_bh(&smc_lgr_list.lock);
}

static int smc_ib_remember_port_attr(struct smc_ib_device *smcibdev, u8 ibport)
{
	int rc;

	memset(&smcibdev->pattr[ibport - 1], 0,
	       sizeof(smcibdev->pattr[ibport - 1]));
	rc = ib_query_port(smcibdev->ibdev, ibport,
			   &smcibdev->pattr[ibport - 1]);
	if (rc)
		goto out;
	/* the SMC protocol requires specification of the RoCE MAC address */
	rc = smc_ib_fill_mac(smcibdev, ibport);
	if (rc)
		goto out;
	if (!smc_ib_is_valid_local_systemid() &&
	    smc_ib_port_active(smcibdev, ibport))
		/* create unique system identifier */
		smc_ib_define_local_systemid(smcibdev, ibport);
out:
	return rc;
}

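/* process the port events collected in port_event_mask: refresh the port
 * attributes and notify the SMC-R layer about ports that went down or came up
 */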
static void smc_ib_port_event_work(struct work_struct *work)
{
	struct smc_ib_device *smcibdev = container_of(
		work, struct smc_ib_device, port_event_work);
	u8 port_idx;

	for_each_set_bit(port_idx, &smcibdev->port_event_mask, SMC_MAX_PORTS) {
		smc_ib_remember_port_attr(smcibdev, port_idx + 1);
		clear_bit(port_idx, &smcibdev->port_event_mask);
		if (!smc_ib_port_active(smcibdev, port_idx + 1)) {
			set_bit(port_idx, smcibdev->ports_going_away);
			smcr_port_err(smcibdev, port_idx + 1);
		} else {
			clear_bit(port_idx, smcibdev->ports_going_away);
			smcr_port_add(smcibdev, port_idx + 1);
			smc_ib_gid_check(smcibdev, port_idx + 1);
		}
	}
}

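/* can be called in IRQ context */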
static void smc_ib_global_event_handler(struct ib_event_handler *handler,
					struct ib_event *ibevent)
{
	struct smc_ib_device *smcibdev;
	bool schedule = false;
	u8 port_idx;

	smcibdev = container_of(handler, struct smc_ib_device, event_handler);

	switch (ibevent->event) {
	case IB_EVENT_DEVICE_FATAL:
		/* terminate all ports on device */
		for (port_idx = 0; port_idx < SMC_MAX_PORTS; port_idx++) {
			set_bit(port_idx, &smcibdev->port_event_mask);
			if (!test_and_set_bit(port_idx,
					      smcibdev->ports_going_away))
				schedule = true;
		}
		if (schedule)
			schedule_work(&smcibdev->port_event_work);
		break;
	case IB_EVENT_PORT_ACTIVE:
		port_idx = ibevent->element.port_num - 1;
		if (port_idx >= SMC_MAX_PORTS)
			break;
		set_bit(port_idx, &smcibdev->port_event_mask);
		if (test_and_clear_bit(port_idx, smcibdev->ports_going_away))
			schedule_work(&smcibdev->port_event_work);
		break;
	case IB_EVENT_PORT_ERR:
		port_idx = ibevent->element.port_num - 1;
		if (port_idx >= SMC_MAX_PORTS)
			break;
		set_bit(port_idx, &smcibdev->port_event_mask);
		if (!test_and_set_bit(port_idx, smcibdev->ports_going_away))
			schedule_work(&smcibdev->port_event_work);
		break;
	case IB_EVENT_GID_CHANGE:
		port_idx = ibevent->element.port_num - 1;
		if (port_idx >= SMC_MAX_PORTS)
			break;
		set_bit(port_idx, &smcibdev->port_event_mask);
		schedule_work(&smcibdev->port_event_work);
		break;
	default:
		break;
	}
}

void smc_ib_dealloc_protection_domain(struct smc_link *lnk)
{
	if (lnk->roce_pd)
		ib_dealloc_pd(lnk->roce_pd);
	lnk->roce_pd = NULL;
}

int smc_ib_create_protection_domain(struct smc_link *lnk)
{
	int rc;

	lnk->roce_pd = ib_alloc_pd(lnk->smcibdev->ibdev, 0);
	rc = PTR_ERR_OR_ZERO(lnk->roce_pd);
	if (IS_ERR(lnk->roce_pd))
		lnk->roce_pd = NULL;
	return rc;
}

static bool smcr_diag_is_dev_critical(struct smc_lgr_list *smc_lgr,
				      struct smc_ib_device *smcibdev)
{
	struct smc_link_group *lgr;
	bool rc = false;
	int i;

	spin_lock_bh(&smc_lgr->lock);
	list_for_each_entry(lgr, &smc_lgr->list, list) {
		if (lgr->is_smcd)
			continue;
		for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
			if (lgr->lnk[i].state == SMC_LNK_UNUSED ||
			    lgr->lnk[i].smcibdev != smcibdev)
				continue;
			if (lgr->type == SMC_LGR_SINGLE ||
			    lgr->type == SMC_LGR_ASYMMETRIC_LOCAL) {
				rc = true;
				goto out;
			}
		}
	}
out:
	spin_unlock_bh(&smc_lgr->lock);
	return rc;
}

static int smc_nl_handle_dev_port(struct sk_buff *skb,
				  struct ib_device *ibdev,
				  struct smc_ib_device *smcibdev,
				  int port)
{
	char smc_pnet[SMC_MAX_PNETID_LEN + 1];
	struct nlattr *port_attrs;
	unsigned char port_state;
	int lnk_count = 0;

	port_attrs = nla_nest_start(skb, SMC_NLA_DEV_PORT + port);
	if (!port_attrs)
		goto errout;

	if (nla_put_u8(skb, SMC_NLA_DEV_PORT_PNET_USR,
		       smcibdev->pnetid_by_user[port]))
		goto errattr;
	memcpy(smc_pnet, &smcibdev->pnetid[port], SMC_MAX_PNETID_LEN);
	smc_pnet[SMC_MAX_PNETID_LEN] = 0;
	if (nla_put_string(skb, SMC_NLA_DEV_PORT_PNETID, smc_pnet))
		goto errattr;
	if (nla_put_u32(skb, SMC_NLA_DEV_PORT_NETDEV,
			smcibdev->ndev_ifidx[port]))
		goto errattr;
	if (nla_put_u8(skb, SMC_NLA_DEV_PORT_VALID, 1))
		goto errattr;
	port_state = smc_ib_port_active(smcibdev, port + 1);
	if (nla_put_u8(skb, SMC_NLA_DEV_PORT_STATE, port_state))
		goto errattr;
	lnk_count = atomic_read(&smcibdev->lnk_cnt_by_port[port]);
	if (nla_put_u32(skb, SMC_NLA_DEV_PORT_LNK_CNT, lnk_count))
		goto errattr;
	nla_nest_end(skb, port_attrs);
	return 0;
errattr:
	nla_nest_cancel(skb, port_attrs);
errout:
	return -EMSGSIZE;
}

static bool smc_nl_handle_pci_values(const struct smc_pci_dev *smc_pci_dev,
				     struct sk_buff *skb)
{
	if (nla_put_u32(skb, SMC_NLA_DEV_PCI_FID, smc_pci_dev->pci_fid))
		return false;
	if (nla_put_u16(skb, SMC_NLA_DEV_PCI_CHID, smc_pci_dev->pci_pchid))
		return false;
	if (nla_put_u16(skb, SMC_NLA_DEV_PCI_VENDOR, smc_pci_dev->pci_vendor))
		return false;
	if (nla_put_u16(skb, SMC_NLA_DEV_PCI_DEVICE, smc_pci_dev->pci_device))
		return false;
	if (nla_put_string(skb, SMC_NLA_DEV_PCI_ID, smc_pci_dev->pci_id))
		return false;
	return true;
}

static int smc_nl_handle_smcr_dev(struct smc_ib_device *smcibdev,
				  struct sk_buff *skb,
				  struct netlink_callback *cb)
{
	char smc_ibname[IB_DEVICE_NAME_MAX];
	struct smc_pci_dev smc_pci_dev;
	struct pci_dev *pci_dev;
	unsigned char is_crit;
	struct nlattr *attrs;
	void *nlh;
	int i;

	nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
			  &smc_gen_nl_family, NLM_F_MULTI,
			  SMC_NETLINK_GET_DEV_SMCR);
	if (!nlh)
		goto errmsg;
	attrs = nla_nest_start(skb, SMC_GEN_DEV_SMCR);
	if (!attrs)
		goto errout;
	is_crit = smcr_diag_is_dev_critical(&smc_lgr_list, smcibdev);
	if (nla_put_u8(skb, SMC_NLA_DEV_IS_CRIT, is_crit))
		goto errattr;
	if (smcibdev->ibdev->dev.parent) {
		memset(&smc_pci_dev, 0, sizeof(smc_pci_dev));
		pci_dev = to_pci_dev(smcibdev->ibdev->dev.parent);
		smc_set_pci_values(pci_dev, &smc_pci_dev);
		if (!smc_nl_handle_pci_values(&smc_pci_dev, skb))
			goto errattr;
	}
	snprintf(smc_ibname, sizeof(smc_ibname), "%s", smcibdev->ibdev->name);
	if (nla_put_string(skb, SMC_NLA_DEV_IB_NAME, smc_ibname))
		goto errattr;
	for (i = 1; i <= SMC_MAX_PORTS; i++) {
		if (!rdma_is_port_valid(smcibdev->ibdev, i))
			continue;
		if (smc_nl_handle_dev_port(skb, smcibdev->ibdev,
					   smcibdev, i - 1))
			goto errattr;
	}

	nla_nest_end(skb, attrs);
	genlmsg_end(skb, nlh);
	return 0;

errattr:
	nla_nest_cancel(skb, attrs);
errout:
	genlmsg_cancel(skb, nlh);
errmsg:
	return -EMSGSIZE;
}

static void smc_nl_prep_smcr_dev(struct smc_ib_devices *dev_list,
				 struct sk_buff *skb,
				 struct netlink_callback *cb)
{
	struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb);
	struct smc_ib_device *smcibdev;
	int snum = cb_ctx->pos[0];
	int num = 0;

	mutex_lock(&dev_list->mutex);
	list_for_each_entry(smcibdev, &dev_list->list, list) {
		if (num < snum)
			goto next;
		if (smc_nl_handle_smcr_dev(smcibdev, skb, cb))
			goto errout;
next:
		num++;
	}
errout:
	mutex_unlock(&dev_list->mutex);
	cb_ctx->pos[0] = num;
}

int smcr_nl_get_device(struct sk_buff *skb, struct netlink_callback *cb)
{
	smc_nl_prep_smcr_dev(&smc_ib_devices, skb, cb);
	return skb->len;
}

static void smc_ib_qp_event_handler(struct ib_event *ibevent, void *priv)
{
	struct smc_link *lnk = (struct smc_link *)priv;
	struct smc_ib_device *smcibdev = lnk->smcibdev;
	u8 port_idx;

	switch (ibevent->event) {
	case IB_EVENT_QP_FATAL:
	case IB_EVENT_QP_ACCESS_ERR:
		port_idx = ibevent->element.qp->port - 1;
		if (port_idx >= SMC_MAX_PORTS)
			break;
		set_bit(port_idx, &smcibdev->port_event_mask);
		if (!test_and_set_bit(port_idx, smcibdev->ports_going_away))
			schedule_work(&smcibdev->port_event_work);
		break;
	default:
		break;
	}
}

void smc_ib_destroy_queue_pair(struct smc_link *lnk)
{
	if (lnk->roce_qp)
		ib_destroy_qp(lnk->roce_qp);
	lnk->roce_qp = NULL;
}

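/* create a queue pair within the protection domain for a link */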
int smc_ib_create_queue_pair(struct smc_link *lnk)
{
	int sges_per_buf = (lnk->lgr->smc_version == SMC_V2) ? 2 : 1;
	struct ib_qp_init_attr qp_attr = {
		.event_handler = smc_ib_qp_event_handler,
		.qp_context = lnk,
		.send_cq = lnk->smcibdev->roce_cq_send,
		.recv_cq = lnk->smcibdev->roce_cq_recv,
		.srq = NULL,
		.cap = {
				/* include unsolicited rdma_writes as well,
				 * there are max. 2 RDMA_WRITE per 1 WR_SEND
				 */
			.max_send_wr = SMC_WR_BUF_CNT * 3,
			.max_recv_wr = SMC_WR_BUF_CNT * 3,
			.max_send_sge = SMC_IB_MAX_SEND_SGE,
			.max_recv_sge = sges_per_buf,
			.max_inline_data = 0,
		},
		.sq_sig_type = IB_SIGNAL_REQ_WR,
		.qp_type = IB_QPT_RC,
	};
	int rc;

	lnk->roce_qp = ib_create_qp(lnk->roce_pd, &qp_attr);
	rc = PTR_ERR_OR_ZERO(lnk->roce_qp);
	if (IS_ERR(lnk->roce_qp))
		lnk->roce_qp = NULL;
	else
		smc_wr_remember_qp_attr(lnk);
	return rc;
}

void smc_ib_put_memory_region(struct ib_mr *mr)
{
	ib_dereg_mr(mr);
}

static int smc_ib_map_mr_sg(struct smc_buf_desc *buf_slot, u8 link_idx)
{
	unsigned int offset = 0;
	int sg_num;

	/* map the largest prefix of a dma mapped SG list */
	sg_num = ib_map_mr_sg(buf_slot->mr[link_idx],
			      buf_slot->sgt[link_idx].sgl,
			      buf_slot->sgt[link_idx].orig_nents,
			      &offset, PAGE_SIZE);

	return sg_num;
}

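/* Allocate a memory region and map the DMA-mapped SG table of buf_slot to it */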
int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags,
			     struct smc_buf_desc *buf_slot, u8 link_idx)
{
	if (buf_slot->mr[link_idx])
		return 0; /* already done */

	buf_slot->mr[link_idx] =
		ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, 1 << buf_slot->order);
	if (IS_ERR(buf_slot->mr[link_idx])) {
		int rc;

		rc = PTR_ERR(buf_slot->mr[link_idx]);
		buf_slot->mr[link_idx] = NULL;
		return rc;
	}

	if (smc_ib_map_mr_sg(buf_slot, link_idx) !=
	    buf_slot->sgt[link_idx].orig_nents)
		return -EINVAL;

	return 0;
}

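/* check whether any DMA address of the buffer's SG list requires explicit
 * cache synchronization between CPU and device access
 */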
bool smc_ib_is_sg_need_sync(struct smc_link *lnk,
			    struct smc_buf_desc *buf_slot)
{
	struct scatterlist *sg;
	unsigned int i;
	bool ret = false;

	for_each_sg(buf_slot->sgt[lnk->link_idx].sgl, sg,
		    buf_slot->sgt[lnk->link_idx].nents, i) {
		if (!sg_dma_len(sg))
			break;
		if (dma_need_sync(lnk->smcibdev->ibdev->dma_device,
				  sg_dma_address(sg))) {
			ret = true;
			goto out;
		}
	}

out:
	return ret;
}

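/* synchronize buffer usage for cpu access */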
void smc_ib_sync_sg_for_cpu(struct smc_link *lnk,
			    struct smc_buf_desc *buf_slot,
			    enum dma_data_direction data_direction)
{
	struct scatterlist *sg;
	unsigned int i;

	if (!(buf_slot->is_dma_need_sync & (1U << lnk->link_idx)))
		return;

	for_each_sg(buf_slot->sgt[lnk->link_idx].sgl, sg,
		    buf_slot->sgt[lnk->link_idx].nents, i) {
		if (!sg_dma_len(sg))
			break;
		ib_dma_sync_single_for_cpu(lnk->smcibdev->ibdev,
					   sg_dma_address(sg),
					   sg_dma_len(sg),
					   data_direction);
	}
}

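/* synchronize buffer usage for device access */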
void smc_ib_sync_sg_for_device(struct smc_link *lnk,
			       struct smc_buf_desc *buf_slot,
			       enum dma_data_direction data_direction)
{
	struct scatterlist *sg;
	unsigned int i;

	if (!(buf_slot->is_dma_need_sync & (1U << lnk->link_idx)))
		return;

	for_each_sg(buf_slot->sgt[lnk->link_idx].sgl, sg,
		    buf_slot->sgt[lnk->link_idx].nents, i) {
		if (!sg_dma_len(sg))
			break;
		ib_dma_sync_single_for_device(lnk->smcibdev->ibdev,
					      sg_dma_address(sg),
					      sg_dma_len(sg),
					      data_direction);
	}
}

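/* Map a new TX or RX buffer SG table to DMA addresses */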
int smc_ib_buf_map_sg(struct smc_link *lnk,
		      struct smc_buf_desc *buf_slot,
		      enum dma_data_direction data_direction)
{
	int mapped_nents;

	mapped_nents = ib_dma_map_sg(lnk->smcibdev->ibdev,
				     buf_slot->sgt[lnk->link_idx].sgl,
				     buf_slot->sgt[lnk->link_idx].orig_nents,
				     data_direction);
	if (!mapped_nents)
		return -ENOMEM;

	return mapped_nents;
}

void smc_ib_buf_unmap_sg(struct smc_link *lnk,
			 struct smc_buf_desc *buf_slot,
			 enum dma_data_direction data_direction)
{
	if (!buf_slot->sgt[lnk->link_idx].sgl->dma_address)
		return; /* already unmapped */

	ib_dma_unmap_sg(lnk->smcibdev->ibdev,
			buf_slot->sgt[lnk->link_idx].sgl,
			buf_slot->sgt[lnk->link_idx].orig_nents,
			data_direction);
	buf_slot->sgt[lnk->link_idx].sgl->dma_address = 0;
}

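/* one-time setup per ib device: create the send and receive completion queues
 * and initialize the work request layer for this device
 */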
long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev)
{
	struct ib_cq_init_attr cqattr =	{
		.cqe = SMC_MAX_CQE, .comp_vector = 0 };
	int cqe_size_order, smc_order;
	long rc;

	mutex_lock(&smcibdev->mutex);
	rc = 0;
	if (smcibdev->initialized)
		goto out;
	/* the calculated number of cq entries fits to mlx5 cq allocation */
	cqe_size_order = cache_line_size() == 128 ? 7 : 6;
	smc_order = MAX_ORDER - cqe_size_order - 1;
	if (SMC_MAX_CQE + 2 > (0x00000001 << smc_order) * PAGE_SIZE)
		cqattr.cqe = (0x00000001 << smc_order) * PAGE_SIZE - 2;
	smcibdev->roce_cq_send = ib_create_cq(smcibdev->ibdev,
					      smc_wr_tx_cq_handler, NULL,
					      smcibdev, &cqattr);
	rc = PTR_ERR_OR_ZERO(smcibdev->roce_cq_send);
	if (IS_ERR(smcibdev->roce_cq_send)) {
		smcibdev->roce_cq_send = NULL;
		goto out;
	}
	smcibdev->roce_cq_recv = ib_create_cq(smcibdev->ibdev,
					      smc_wr_rx_cq_handler, NULL,
					      smcibdev, &cqattr);
	rc = PTR_ERR_OR_ZERO(smcibdev->roce_cq_recv);
	if (IS_ERR(smcibdev->roce_cq_recv)) {
		smcibdev->roce_cq_recv = NULL;
		goto err;
	}
	smc_wr_add_dev(smcibdev);
	smcibdev->initialized = 1;
	goto out;

err:
	ib_destroy_cq(smcibdev->roce_cq_send);
out:
	mutex_unlock(&smcibdev->mutex);
	return rc;
}

static void smc_ib_cleanup_per_ibdev(struct smc_ib_device *smcibdev)
{
	mutex_lock(&smcibdev->mutex);
	if (!smcibdev->initialized)
		goto out;
	smcibdev->initialized = 0;
	ib_destroy_cq(smcibdev->roce_cq_recv);
	ib_destroy_cq(smcibdev->roce_cq_send);
	smc_wr_remove_dev(smcibdev);
out:
	mutex_unlock(&smcibdev->mutex);
}

static struct ib_client smc_ib_client;

static void smc_copy_netdev_ifindex(struct smc_ib_device *smcibdev, int port)
{
	struct ib_device *ibdev = smcibdev->ibdev;
	struct net_device *ndev;

	if (!ibdev->ops.get_netdev)
		return;
	ndev = ibdev->ops.get_netdev(ibdev, port + 1);
	if (ndev) {
		smcibdev->ndev_ifidx[port] = ndev->ifindex;
		dev_put(ndev);
	}
}

void smc_ib_ndev_change(struct net_device *ndev, unsigned long event)
{
	struct smc_ib_device *smcibdev;
	struct ib_device *libdev;
	struct net_device *lndev;
	u8 port_cnt;
	int i;

	mutex_lock(&smc_ib_devices.mutex);
	list_for_each_entry(smcibdev, &smc_ib_devices.list, list) {
		port_cnt = smcibdev->ibdev->phys_port_cnt;
		for (i = 0; i < min_t(size_t, port_cnt, SMC_MAX_PORTS); i++) {
			libdev = smcibdev->ibdev;
			if (!libdev->ops.get_netdev)
				continue;
			lndev = libdev->ops.get_netdev(libdev, i + 1);
			dev_put(lndev);
			if (lndev != ndev)
				continue;
			if (event == NETDEV_REGISTER)
				smcibdev->ndev_ifidx[i] = ndev->ifindex;
			if (event == NETDEV_UNREGISTER)
				smcibdev->ndev_ifidx[i] = 0;
		}
	}
	mutex_unlock(&smc_ib_devices.mutex);
}

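/* callback function for ib_register_client() */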
static int smc_ib_add_dev(struct ib_device *ibdev)
{
	struct smc_ib_device *smcibdev;
	u8 port_cnt;
	int i;

	if (ibdev->node_type != RDMA_NODE_IB_CA)
		return -EOPNOTSUPP;

	smcibdev = kzalloc(sizeof(*smcibdev), GFP_KERNEL);
	if (!smcibdev)
		return -ENOMEM;

	smcibdev->ibdev = ibdev;
	INIT_WORK(&smcibdev->port_event_work, smc_ib_port_event_work);
	atomic_set(&smcibdev->lnk_cnt, 0);
	init_waitqueue_head(&smcibdev->lnks_deleted);
	mutex_init(&smcibdev->mutex);
	mutex_lock(&smc_ib_devices.mutex);
	list_add_tail(&smcibdev->list, &smc_ib_devices.list);
	mutex_unlock(&smc_ib_devices.mutex);
	ib_set_client_data(ibdev, &smc_ib_client, smcibdev);
	INIT_IB_EVENT_HANDLER(&smcibdev->event_handler, smcibdev->ibdev,
			      smc_ib_global_event_handler);
	ib_register_event_handler(&smcibdev->event_handler);

	/* trigger reading of the port attributes */
	port_cnt = smcibdev->ibdev->phys_port_cnt;
	pr_warn_ratelimited("smc: adding ib device %s with port count %d\n",
			    smcibdev->ibdev->name, port_cnt);
	for (i = 0;
	     i < min_t(size_t, port_cnt, SMC_MAX_PORTS);
	     i++) {
		set_bit(i, &smcibdev->port_event_mask);
		/* determine pnetids of the port */
		if (smc_pnetid_by_dev_port(ibdev->dev.parent, i,
					   smcibdev->pnetid[i]))
			smc_pnetid_by_table_ib(smcibdev, i + 1);
		smc_copy_netdev_ifindex(smcibdev, i);
		pr_warn_ratelimited("smc: ib device %s port %d has pnetid "
				    "%.16s%s\n",
				    smcibdev->ibdev->name, i + 1,
				    smcibdev->pnetid[i],
				    smcibdev->pnetid_by_user[i] ?
					" (user defined)" :
					"");
	}
	schedule_work(&smcibdev->port_event_work);
	return 0;
}

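/* callback function for ib_unregister_client() */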
static void smc_ib_remove_dev(struct ib_device *ibdev, void *client_data)
{
	struct smc_ib_device *smcibdev = client_data;

	mutex_lock(&smc_ib_devices.mutex);
	list_del_init(&smcibdev->list);
	mutex_unlock(&smc_ib_devices.mutex);
	pr_warn_ratelimited("smc: removing ib device %s\n",
			    smcibdev->ibdev->name);
	smc_smcr_terminate_all(smcibdev);
	smc_ib_cleanup_per_ibdev(smcibdev);
	ib_unregister_event_handler(&smcibdev->event_handler);
	cancel_work_sync(&smcibdev->port_event_work);
	kfree(smcibdev);
}

static struct ib_client smc_ib_client = {
	.name	= "smc_ib",
	.add	= smc_ib_add_dev,
	.remove = smc_ib_remove_dev,
};

int __init smc_ib_register_client(void)
{
	smc_ib_init_local_systemid();
	return ib_register_client(&smc_ib_client);
}

void smc_ib_unregister_client(void)
{
	ib_unregister_client(&smc_ib_client);
}