0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018
0019
0020
0021
0022
0023
0024
0025
0026
0027
0028
0029
0030
0031
0032
0033 #include <linux/kernel.h>
0034 #include <linux/in.h>
0035 #include <linux/if.h>
0036 #include <linux/netdevice.h>
0037 #include <linux/inetdevice.h>
0038 #include <linux/if_arp.h>
0039 #include <linux/delay.h>
0040 #include <linux/slab.h>
0041 #include <linux/module.h>
0042 #include <net/addrconf.h>
0043
0044 #include "rds_single_path.h"
0045 #include "rds.h"
0046 #include "ib.h"
0047 #include "ib_mr.h"
0048
0049 static unsigned int rds_ib_mr_1m_pool_size = RDS_MR_1M_POOL_SIZE;
0050 static unsigned int rds_ib_mr_8k_pool_size = RDS_MR_8K_POOL_SIZE;
0051 unsigned int rds_ib_retry_count = RDS_IB_DEFAULT_RETRY_COUNT;
0052 static atomic_t rds_ib_unloading;
0053
0054 module_param(rds_ib_mr_1m_pool_size, int, 0444);
0055 MODULE_PARM_DESC(rds_ib_mr_1m_pool_size, " Max number of 1M mr per HCA");
0056 module_param(rds_ib_mr_8k_pool_size, int, 0444);
0057 MODULE_PARM_DESC(rds_ib_mr_8k_pool_size, " Max number of 8K mr per HCA");
0058 module_param(rds_ib_retry_count, int, 0444);
0059 MODULE_PARM_DESC(rds_ib_retry_count, " Number of hw retries before reporting an error");
0060
0061
0062
0063
0064
0065
0066 DECLARE_RWSEM(rds_ib_devices_lock);
0067 struct list_head rds_ib_devices;
0068
0069
0070 DEFINE_SPINLOCK(ib_nodev_conns_lock);
0071 LIST_HEAD(ib_nodev_conns);
0072
0073 static void rds_ib_nodev_connect(void)
0074 {
0075 struct rds_ib_connection *ic;
0076
0077 spin_lock(&ib_nodev_conns_lock);
0078 list_for_each_entry(ic, &ib_nodev_conns, ib_node)
0079 rds_conn_connect_if_down(ic->conn);
0080 spin_unlock(&ib_nodev_conns_lock);
0081 }
0082
0083 static void rds_ib_dev_shutdown(struct rds_ib_device *rds_ibdev)
0084 {
0085 struct rds_ib_connection *ic;
0086 unsigned long flags;
0087
0088 spin_lock_irqsave(&rds_ibdev->spinlock, flags);
0089 list_for_each_entry(ic, &rds_ibdev->conn_list, ib_node)
0090 rds_conn_path_drop(&ic->conn->c_path[0], true);
0091 spin_unlock_irqrestore(&rds_ibdev->spinlock, flags);
0092 }
0093
0094
0095
0096
0097
0098 static void rds_ib_dev_free(struct work_struct *work)
0099 {
0100 struct rds_ib_ipaddr *i_ipaddr, *i_next;
0101 struct rds_ib_device *rds_ibdev = container_of(work,
0102 struct rds_ib_device, free_work);
0103
0104 if (rds_ibdev->mr_8k_pool)
0105 rds_ib_destroy_mr_pool(rds_ibdev->mr_8k_pool);
0106 if (rds_ibdev->mr_1m_pool)
0107 rds_ib_destroy_mr_pool(rds_ibdev->mr_1m_pool);
0108 if (rds_ibdev->pd)
0109 ib_dealloc_pd(rds_ibdev->pd);
0110
0111 list_for_each_entry_safe(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, list) {
0112 list_del(&i_ipaddr->list);
0113 kfree(i_ipaddr);
0114 }
0115
0116 kfree(rds_ibdev->vector_load);
0117
0118 kfree(rds_ibdev);
0119 }
0120
0121 void rds_ib_dev_put(struct rds_ib_device *rds_ibdev)
0122 {
0123 BUG_ON(refcount_read(&rds_ibdev->refcount) == 0);
0124 if (refcount_dec_and_test(&rds_ibdev->refcount))
0125 queue_work(rds_wq, &rds_ibdev->free_work);
0126 }
0127
0128 static int rds_ib_add_one(struct ib_device *device)
0129 {
0130 struct rds_ib_device *rds_ibdev;
0131 int ret;
0132
0133
0134 if (device->node_type != RDMA_NODE_IB_CA)
0135 return -EOPNOTSUPP;
0136
0137
0138 if (!(device->attrs.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS))
0139 return -EOPNOTSUPP;
0140
0141 rds_ibdev = kzalloc_node(sizeof(struct rds_ib_device), GFP_KERNEL,
0142 ibdev_to_node(device));
0143 if (!rds_ibdev)
0144 return -ENOMEM;
0145
0146 spin_lock_init(&rds_ibdev->spinlock);
0147 refcount_set(&rds_ibdev->refcount, 1);
0148 INIT_WORK(&rds_ibdev->free_work, rds_ib_dev_free);
0149
0150 INIT_LIST_HEAD(&rds_ibdev->ipaddr_list);
0151 INIT_LIST_HEAD(&rds_ibdev->conn_list);
0152
0153 rds_ibdev->max_wrs = device->attrs.max_qp_wr;
0154 rds_ibdev->max_sge = min(device->attrs.max_send_sge, RDS_IB_MAX_SGE);
0155
0156 rds_ibdev->odp_capable =
0157 !!(device->attrs.kernel_cap_flags &
0158 IBK_ON_DEMAND_PAGING) &&
0159 !!(device->attrs.odp_caps.per_transport_caps.rc_odp_caps &
0160 IB_ODP_SUPPORT_WRITE) &&
0161 !!(device->attrs.odp_caps.per_transport_caps.rc_odp_caps &
0162 IB_ODP_SUPPORT_READ);
0163
0164 rds_ibdev->max_1m_mrs = device->attrs.max_mr ?
0165 min_t(unsigned int, (device->attrs.max_mr / 2),
0166 rds_ib_mr_1m_pool_size) : rds_ib_mr_1m_pool_size;
0167
0168 rds_ibdev->max_8k_mrs = device->attrs.max_mr ?
0169 min_t(unsigned int, ((device->attrs.max_mr / 2) * RDS_MR_8K_SCALE),
0170 rds_ib_mr_8k_pool_size) : rds_ib_mr_8k_pool_size;
0171
0172 rds_ibdev->max_initiator_depth = device->attrs.max_qp_init_rd_atom;
0173 rds_ibdev->max_responder_resources = device->attrs.max_qp_rd_atom;
0174
0175 rds_ibdev->vector_load = kcalloc(device->num_comp_vectors,
0176 sizeof(int),
0177 GFP_KERNEL);
0178 if (!rds_ibdev->vector_load) {
0179 pr_err("RDS/IB: %s failed to allocate vector memory\n",
0180 __func__);
0181 ret = -ENOMEM;
0182 goto put_dev;
0183 }
0184
0185 rds_ibdev->dev = device;
0186 rds_ibdev->pd = ib_alloc_pd(device, 0);
0187 if (IS_ERR(rds_ibdev->pd)) {
0188 ret = PTR_ERR(rds_ibdev->pd);
0189 rds_ibdev->pd = NULL;
0190 goto put_dev;
0191 }
0192
0193 rds_ibdev->mr_1m_pool =
0194 rds_ib_create_mr_pool(rds_ibdev, RDS_IB_MR_1M_POOL);
0195 if (IS_ERR(rds_ibdev->mr_1m_pool)) {
0196 ret = PTR_ERR(rds_ibdev->mr_1m_pool);
0197 rds_ibdev->mr_1m_pool = NULL;
0198 goto put_dev;
0199 }
0200
0201 rds_ibdev->mr_8k_pool =
0202 rds_ib_create_mr_pool(rds_ibdev, RDS_IB_MR_8K_POOL);
0203 if (IS_ERR(rds_ibdev->mr_8k_pool)) {
0204 ret = PTR_ERR(rds_ibdev->mr_8k_pool);
0205 rds_ibdev->mr_8k_pool = NULL;
0206 goto put_dev;
0207 }
0208
0209 rdsdebug("RDS/IB: max_mr = %d, max_wrs = %d, max_sge = %d, max_1m_mrs = %d, max_8k_mrs = %d\n",
0210 device->attrs.max_mr, rds_ibdev->max_wrs, rds_ibdev->max_sge,
0211 rds_ibdev->max_1m_mrs, rds_ibdev->max_8k_mrs);
0212
0213 pr_info("RDS/IB: %s: added\n", device->name);
0214
0215 down_write(&rds_ib_devices_lock);
0216 list_add_tail_rcu(&rds_ibdev->list, &rds_ib_devices);
0217 up_write(&rds_ib_devices_lock);
0218 refcount_inc(&rds_ibdev->refcount);
0219
0220 ib_set_client_data(device, &rds_ib_client, rds_ibdev);
0221
0222 rds_ib_nodev_connect();
0223 return 0;
0224
0225 put_dev:
0226 rds_ib_dev_put(rds_ibdev);
0227 return ret;
0228 }
0229
0230
0231
0232
0233
0234
0235
0236
0237
0238
0239
0240
0241
0242
0243
0244
0245
0246 struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device)
0247 {
0248 struct rds_ib_device *rds_ibdev;
0249
0250 rcu_read_lock();
0251 rds_ibdev = ib_get_client_data(device, &rds_ib_client);
0252 if (rds_ibdev)
0253 refcount_inc(&rds_ibdev->refcount);
0254 rcu_read_unlock();
0255 return rds_ibdev;
0256 }
0257
0258
0259
0260
0261
0262
0263
0264
0265 static void rds_ib_remove_one(struct ib_device *device, void *client_data)
0266 {
0267 struct rds_ib_device *rds_ibdev = client_data;
0268
0269 rds_ib_dev_shutdown(rds_ibdev);
0270
0271
0272 ib_set_client_data(device, &rds_ib_client, NULL);
0273
0274 down_write(&rds_ib_devices_lock);
0275 list_del_rcu(&rds_ibdev->list);
0276 up_write(&rds_ib_devices_lock);
0277
0278
0279
0280
0281
0282
0283 synchronize_rcu();
0284 rds_ib_dev_put(rds_ibdev);
0285 rds_ib_dev_put(rds_ibdev);
0286 }
0287
0288 struct ib_client rds_ib_client = {
0289 .name = "rds_ib",
0290 .add = rds_ib_add_one,
0291 .remove = rds_ib_remove_one
0292 };
0293
0294 static int rds_ib_conn_info_visitor(struct rds_connection *conn,
0295 void *buffer)
0296 {
0297 struct rds_info_rdma_connection *iinfo = buffer;
0298 struct rds_ib_connection *ic = conn->c_transport_data;
0299
0300
0301 if (conn->c_trans != &rds_ib_transport)
0302 return 0;
0303 if (conn->c_isv6)
0304 return 0;
0305
0306 iinfo->src_addr = conn->c_laddr.s6_addr32[3];
0307 iinfo->dst_addr = conn->c_faddr.s6_addr32[3];
0308 if (ic) {
0309 iinfo->tos = conn->c_tos;
0310 iinfo->sl = ic->i_sl;
0311 }
0312
0313 memset(&iinfo->src_gid, 0, sizeof(iinfo->src_gid));
0314 memset(&iinfo->dst_gid, 0, sizeof(iinfo->dst_gid));
0315 if (rds_conn_state(conn) == RDS_CONN_UP) {
0316 struct rds_ib_device *rds_ibdev;
0317
0318 rdma_read_gids(ic->i_cm_id, (union ib_gid *)&iinfo->src_gid,
0319 (union ib_gid *)&iinfo->dst_gid);
0320
0321 rds_ibdev = ic->rds_ibdev;
0322 iinfo->max_send_wr = ic->i_send_ring.w_nr;
0323 iinfo->max_recv_wr = ic->i_recv_ring.w_nr;
0324 iinfo->max_send_sge = rds_ibdev->max_sge;
0325 rds_ib_get_mr_info(rds_ibdev, iinfo);
0326 iinfo->cache_allocs = atomic_read(&ic->i_cache_allocs);
0327 }
0328 return 1;
0329 }
0330
0331 #if IS_ENABLED(CONFIG_IPV6)
0332
0333 static int rds6_ib_conn_info_visitor(struct rds_connection *conn,
0334 void *buffer)
0335 {
0336 struct rds6_info_rdma_connection *iinfo6 = buffer;
0337 struct rds_ib_connection *ic = conn->c_transport_data;
0338
0339
0340 if (conn->c_trans != &rds_ib_transport)
0341 return 0;
0342
0343 iinfo6->src_addr = conn->c_laddr;
0344 iinfo6->dst_addr = conn->c_faddr;
0345 if (ic) {
0346 iinfo6->tos = conn->c_tos;
0347 iinfo6->sl = ic->i_sl;
0348 }
0349
0350 memset(&iinfo6->src_gid, 0, sizeof(iinfo6->src_gid));
0351 memset(&iinfo6->dst_gid, 0, sizeof(iinfo6->dst_gid));
0352
0353 if (rds_conn_state(conn) == RDS_CONN_UP) {
0354 struct rds_ib_device *rds_ibdev;
0355
0356 rdma_read_gids(ic->i_cm_id, (union ib_gid *)&iinfo6->src_gid,
0357 (union ib_gid *)&iinfo6->dst_gid);
0358 rds_ibdev = ic->rds_ibdev;
0359 iinfo6->max_send_wr = ic->i_send_ring.w_nr;
0360 iinfo6->max_recv_wr = ic->i_recv_ring.w_nr;
0361 iinfo6->max_send_sge = rds_ibdev->max_sge;
0362 rds6_ib_get_mr_info(rds_ibdev, iinfo6);
0363 iinfo6->cache_allocs = atomic_read(&ic->i_cache_allocs);
0364 }
0365 return 1;
0366 }
0367 #endif
0368
0369 static void rds_ib_ic_info(struct socket *sock, unsigned int len,
0370 struct rds_info_iterator *iter,
0371 struct rds_info_lengths *lens)
0372 {
0373 u64 buffer[(sizeof(struct rds_info_rdma_connection) + 7) / 8];
0374
0375 rds_for_each_conn_info(sock, len, iter, lens,
0376 rds_ib_conn_info_visitor,
0377 buffer,
0378 sizeof(struct rds_info_rdma_connection));
0379 }
0380
0381 #if IS_ENABLED(CONFIG_IPV6)
0382
0383 static void rds6_ib_ic_info(struct socket *sock, unsigned int len,
0384 struct rds_info_iterator *iter,
0385 struct rds_info_lengths *lens)
0386 {
0387 u64 buffer[(sizeof(struct rds6_info_rdma_connection) + 7) / 8];
0388
0389 rds_for_each_conn_info(sock, len, iter, lens,
0390 rds6_ib_conn_info_visitor,
0391 buffer,
0392 sizeof(struct rds6_info_rdma_connection));
0393 }
0394 #endif
0395
0396
0397
0398
0399
0400
0401
0402
0403
0404
0405
0406 static int rds_ib_laddr_check(struct net *net, const struct in6_addr *addr,
0407 __u32 scope_id)
0408 {
0409 int ret;
0410 struct rdma_cm_id *cm_id;
0411 #if IS_ENABLED(CONFIG_IPV6)
0412 struct sockaddr_in6 sin6;
0413 #endif
0414 struct sockaddr_in sin;
0415 struct sockaddr *sa;
0416 bool isv4;
0417
0418 isv4 = ipv6_addr_v4mapped(addr);
0419
0420
0421
0422 cm_id = rdma_create_id(&init_net, rds_rdma_cm_event_handler,
0423 NULL, RDMA_PS_TCP, IB_QPT_RC);
0424 if (IS_ERR(cm_id))
0425 return PTR_ERR(cm_id);
0426
0427 if (isv4) {
0428 memset(&sin, 0, sizeof(sin));
0429 sin.sin_family = AF_INET;
0430 sin.sin_addr.s_addr = addr->s6_addr32[3];
0431 sa = (struct sockaddr *)&sin;
0432 } else {
0433 #if IS_ENABLED(CONFIG_IPV6)
0434 memset(&sin6, 0, sizeof(sin6));
0435 sin6.sin6_family = AF_INET6;
0436 sin6.sin6_addr = *addr;
0437 sin6.sin6_scope_id = scope_id;
0438 sa = (struct sockaddr *)&sin6;
0439
0440
0441
0442
0443
0444
0445 if (ipv6_addr_type(addr) & IPV6_ADDR_LINKLOCAL) {
0446 struct net_device *dev;
0447
0448 if (scope_id == 0) {
0449 ret = -EADDRNOTAVAIL;
0450 goto out;
0451 }
0452
0453
0454
0455
0456 dev = dev_get_by_index(&init_net, scope_id);
0457 if (!dev) {
0458 ret = -EADDRNOTAVAIL;
0459 goto out;
0460 }
0461 if (!ipv6_chk_addr(&init_net, addr, dev, 1)) {
0462 dev_put(dev);
0463 ret = -EADDRNOTAVAIL;
0464 goto out;
0465 }
0466 dev_put(dev);
0467 }
0468 #else
0469 ret = -EADDRNOTAVAIL;
0470 goto out;
0471 #endif
0472 }
0473
0474
0475 ret = rdma_bind_addr(cm_id, sa);
0476
0477
0478 if (ret || !cm_id->device ||
0479 cm_id->device->node_type != RDMA_NODE_IB_CA)
0480 ret = -EADDRNOTAVAIL;
0481
0482 rdsdebug("addr %pI6c%%%u ret %d node type %d\n",
0483 addr, scope_id, ret,
0484 cm_id->device ? cm_id->device->node_type : -1);
0485
0486 out:
0487 rdma_destroy_id(cm_id);
0488
0489 return ret;
0490 }
0491
0492 static void rds_ib_unregister_client(void)
0493 {
0494 ib_unregister_client(&rds_ib_client);
0495
0496 flush_workqueue(rds_wq);
0497 }
0498
0499 static void rds_ib_set_unloading(void)
0500 {
0501 atomic_set(&rds_ib_unloading, 1);
0502 }
0503
0504 static bool rds_ib_is_unloading(struct rds_connection *conn)
0505 {
0506 struct rds_conn_path *cp = &conn->c_path[0];
0507
0508 return (test_bit(RDS_DESTROY_PENDING, &cp->cp_flags) ||
0509 atomic_read(&rds_ib_unloading) != 0);
0510 }
0511
0512 void rds_ib_exit(void)
0513 {
0514 rds_ib_set_unloading();
0515 synchronize_rcu();
0516 rds_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
0517 #if IS_ENABLED(CONFIG_IPV6)
0518 rds_info_deregister_func(RDS6_INFO_IB_CONNECTIONS, rds6_ib_ic_info);
0519 #endif
0520 rds_ib_unregister_client();
0521 rds_ib_destroy_nodev_conns();
0522 rds_ib_sysctl_exit();
0523 rds_ib_recv_exit();
0524 rds_trans_unregister(&rds_ib_transport);
0525 rds_ib_mr_exit();
0526 }
0527
0528 static u8 rds_ib_get_tos_map(u8 tos)
0529 {
0530
0531
0532
0533
0534 return tos;
0535 }
0536
0537 struct rds_transport rds_ib_transport = {
0538 .laddr_check = rds_ib_laddr_check,
0539 .xmit_path_complete = rds_ib_xmit_path_complete,
0540 .xmit = rds_ib_xmit,
0541 .xmit_rdma = rds_ib_xmit_rdma,
0542 .xmit_atomic = rds_ib_xmit_atomic,
0543 .recv_path = rds_ib_recv_path,
0544 .conn_alloc = rds_ib_conn_alloc,
0545 .conn_free = rds_ib_conn_free,
0546 .conn_path_connect = rds_ib_conn_path_connect,
0547 .conn_path_shutdown = rds_ib_conn_path_shutdown,
0548 .inc_copy_to_user = rds_ib_inc_copy_to_user,
0549 .inc_free = rds_ib_inc_free,
0550 .cm_initiate_connect = rds_ib_cm_initiate_connect,
0551 .cm_handle_connect = rds_ib_cm_handle_connect,
0552 .cm_connect_complete = rds_ib_cm_connect_complete,
0553 .stats_info_copy = rds_ib_stats_info_copy,
0554 .exit = rds_ib_exit,
0555 .get_mr = rds_ib_get_mr,
0556 .sync_mr = rds_ib_sync_mr,
0557 .free_mr = rds_ib_free_mr,
0558 .flush_mrs = rds_ib_flush_mrs,
0559 .get_tos_map = rds_ib_get_tos_map,
0560 .t_owner = THIS_MODULE,
0561 .t_name = "infiniband",
0562 .t_unloading = rds_ib_is_unloading,
0563 .t_type = RDS_TRANS_IB
0564 };
0565
0566 int rds_ib_init(void)
0567 {
0568 int ret;
0569
0570 INIT_LIST_HEAD(&rds_ib_devices);
0571
0572 ret = rds_ib_mr_init();
0573 if (ret)
0574 goto out;
0575
0576 ret = ib_register_client(&rds_ib_client);
0577 if (ret)
0578 goto out_mr_exit;
0579
0580 ret = rds_ib_sysctl_init();
0581 if (ret)
0582 goto out_ibreg;
0583
0584 ret = rds_ib_recv_init();
0585 if (ret)
0586 goto out_sysctl;
0587
0588 rds_trans_register(&rds_ib_transport);
0589
0590 rds_info_register_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
0591 #if IS_ENABLED(CONFIG_IPV6)
0592 rds_info_register_func(RDS6_INFO_IB_CONNECTIONS, rds6_ib_ic_info);
0593 #endif
0594
0595 goto out;
0596
0597 out_sysctl:
0598 rds_ib_sysctl_exit();
0599 out_ibreg:
0600 rds_ib_unregister_client();
0601 out_mr_exit:
0602 rds_ib_mr_exit();
0603 out:
0604 return ret;
0605 }
0606
0607 MODULE_LICENSE("GPL");