/*
 * RDS/IB connection management: RDMA CM event handling, protocol version
 * negotiation, and queue pair / completion queue setup and teardown for
 * RDS over InfiniBand.
 */
#include <linux/kernel.h>
#include <linux/in.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/ratelimit.h>
#include <net/addrconf.h>
#include <rdma/ib_cm.h>

#include "rds_single_path.h"
#include "rds.h"
#include "ib.h"
#include "ib_mr.h"

/*
 * Set the selected protocol version
 */
static void rds_ib_set_protocol(struct rds_connection *conn, unsigned int version)
{
	conn->c_version = version;
}

/*
 * Set up flow control
 */
static void rds_ib_set_flow_control(struct rds_connection *conn, u32 credits)
{
	struct rds_ib_connection *ic = conn->c_transport_data;

	if (rds_ib_sysctl_flow_control && credits != 0) {
		/* We're doing flow control */
		ic->i_flowctl = 1;
		rds_ib_send_add_credits(conn, credits);
	} else {
		ic->i_flowctl = 0;
	}
}
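
/* A note on credit accounting (a summary of the scheme in ib.h, not a new
 * mechanism): ic->i_credits packs two 16-bit counters into one atomic_t.
 * The low half counts send credits we may consume against the peer's
 * posted receive buffers; the high half counts freshly posted local
 * receive buffers not yet advertised to the peer.  The
 * IB_GET_/IB_SET_{SEND,POST}_CREDITS() macros encode that split, e.g.
 *
 *	posted = IB_GET_POST_CREDITS(atomic_read(&ic->i_credits));
 *
 * which is exactly how rds_ib_cm_fill_conn_param() below computes the
 * credit value to advertise in the connection private data.
 */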

/*
 * Connection established.
 * We get here for both outgoing and incoming connection.
 */
void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_event *event)
{
	struct rds_ib_connection *ic = conn->c_transport_data;
	const union rds_ib_conn_priv *dp = NULL;
	__be64 ack_seq = 0;
	__be32 credit = 0;
	u8 major = 0;
	u8 minor = 0;
	int err;

	dp = event->param.conn.private_data;
	if (conn->c_isv6) {
		if (event->param.conn.private_data_len >=
		    sizeof(struct rds6_ib_connect_private)) {
			major = dp->ricp_v6.dp_protocol_major;
			minor = dp->ricp_v6.dp_protocol_minor;
			credit = dp->ricp_v6.dp_credit;
			/* dp structure start is not guaranteed to be 8 bytes
			 * aligned.  Since dp_ack_seq is 64-bit extended load
			 * operations can be used so go through get_unaligned
			 * to avoid unaligned errors.
			 */
			ack_seq = get_unaligned(&dp->ricp_v6.dp_ack_seq);
		}
	} else if (event->param.conn.private_data_len >=
		   sizeof(struct rds_ib_connect_private)) {
		major = dp->ricp_v4.dp_protocol_major;
		minor = dp->ricp_v4.dp_protocol_minor;
		credit = dp->ricp_v4.dp_credit;
		ack_seq = get_unaligned(&dp->ricp_v4.dp_ack_seq);
	}

	/* make sure it isn't empty data */
	if (major) {
		rds_ib_set_protocol(conn, RDS_PROTOCOL(major, minor));
		rds_ib_set_flow_control(conn, be32_to_cpu(credit));
	}

	if (conn->c_version < RDS_PROTOCOL_VERSION) {
		if (conn->c_version != RDS_PROTOCOL_COMPAT_VERSION) {
			pr_notice("RDS/IB: Connection <%pI6c,%pI6c> version %u.%u no longer supported\n",
				  &conn->c_laddr, &conn->c_faddr,
				  RDS_PROTOCOL_MAJOR(conn->c_version),
				  RDS_PROTOCOL_MINOR(conn->c_version));
			rds_conn_destroy(conn);
			return;
		}
	}

	pr_notice("RDS/IB: %s conn connected <%pI6c,%pI6c,%d> version %u.%u%s\n",
		  ic->i_active_side ? "Active" : "Passive",
		  &conn->c_laddr, &conn->c_faddr, conn->c_tos,
		  RDS_PROTOCOL_MAJOR(conn->c_version),
		  RDS_PROTOCOL_MINOR(conn->c_version),
		  ic->i_flowctl ? ", flow control" : "");

	/* receive sl from the peer */
	ic->i_sl = ic->i_cm_id->route.path_rec->sl;

	atomic_set(&ic->i_cq_quiesce, 0);

	/* Init rings and fill recv. this needs to wait until protocol
	 * negotiation is complete, since ring layout is different
	 * from 3.1 to 4.1.
	 */
	rds_ib_send_init_ring(ic);
	rds_ib_recv_init_ring(ic);

	/* Post receive buffers - as a side effect, this will update
	 * the posted credit count. */
	rds_ib_recv_refill(conn, 1, GFP_KERNEL);

	/* update ib_device with this local ipaddr */
	err = rds_ib_update_ipaddr(ic->rds_ibdev, &conn->c_laddr);
	if (err)
		printk(KERN_ERR "rds_ib_update_ipaddr failed (%d)\n",
		       err);

	/* If the peer gave us the last packet it saw, process this as if
	 * we had received a regular ACK. */
	if (dp) {
		if (ack_seq)
			rds_send_drop_acked(conn, be64_to_cpu(ack_seq),
					    NULL);
	}

	conn->c_proposed_version = conn->c_version;
	rds_connect_complete(conn);
}

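/* Fill in the rdma_conn_param and the RDS private data carried on the CM
 * REQ/REP.  The IPv4 (ricp_v4) and IPv6 (ricp_v6) layouts differ only in
 * the size of their address fields; both end in struct
 * rds_ib_conn_priv_cmn, which holds the protocol version and minor-version
 * mask, the TOS, the piggybacked ACK sequence and the initial flow-control
 * credit.
 */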
static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
				      struct rdma_conn_param *conn_param,
				      union rds_ib_conn_priv *dp,
				      u32 protocol_version,
				      u32 max_responder_resources,
				      u32 max_initiator_depth,
				      bool isv6)
{
	struct rds_ib_connection *ic = conn->c_transport_data;
	struct rds_ib_device *rds_ibdev = ic->rds_ibdev;

	memset(conn_param, 0, sizeof(struct rdma_conn_param));

	conn_param->responder_resources =
		min_t(u32, rds_ibdev->max_responder_resources, max_responder_resources);
	conn_param->initiator_depth =
		min_t(u32, rds_ibdev->max_initiator_depth, max_initiator_depth);
	conn_param->retry_count = min_t(unsigned int, rds_ib_retry_count, 7);
	conn_param->rnr_retry_count = 7;

	if (dp) {
		memset(dp, 0, sizeof(*dp));
		if (isv6) {
			dp->ricp_v6.dp_saddr = conn->c_laddr;
			dp->ricp_v6.dp_daddr = conn->c_faddr;
			dp->ricp_v6.dp_protocol_major =
				RDS_PROTOCOL_MAJOR(protocol_version);
			dp->ricp_v6.dp_protocol_minor =
				RDS_PROTOCOL_MINOR(protocol_version);
			dp->ricp_v6.dp_protocol_minor_mask =
				cpu_to_be16(RDS_IB_SUPPORTED_PROTOCOLS);
			dp->ricp_v6.dp_ack_seq =
				cpu_to_be64(rds_ib_piggyb_ack(ic));
			dp->ricp_v6.dp_cmn.ricpc_dp_toss = conn->c_tos;

			conn_param->private_data = &dp->ricp_v6;
			conn_param->private_data_len = sizeof(dp->ricp_v6);
		} else {
			dp->ricp_v4.dp_saddr = conn->c_laddr.s6_addr32[3];
			dp->ricp_v4.dp_daddr = conn->c_faddr.s6_addr32[3];
			dp->ricp_v4.dp_protocol_major =
				RDS_PROTOCOL_MAJOR(protocol_version);
			dp->ricp_v4.dp_protocol_minor =
				RDS_PROTOCOL_MINOR(protocol_version);
			dp->ricp_v4.dp_protocol_minor_mask =
				cpu_to_be16(RDS_IB_SUPPORTED_PROTOCOLS);
			dp->ricp_v4.dp_ack_seq =
				cpu_to_be64(rds_ib_piggyb_ack(ic));
			dp->ricp_v4.dp_cmn.ricpc_dp_toss = conn->c_tos;

			conn_param->private_data = &dp->ricp_v4;
			conn_param->private_data_len = sizeof(dp->ricp_v4);
		}

		/* Advertise flow control */
		if (ic->i_flowctl) {
			unsigned int credits;

			credits = IB_GET_POST_CREDITS
				(atomic_read(&ic->i_credits));
			if (isv6)
				dp->ricp_v6.dp_credit = cpu_to_be32(credits);
			else
				dp->ricp_v4.dp_credit = cpu_to_be32(credits);
			atomic_sub(IB_SET_POST_CREDITS(credits),
				   &ic->i_credits);
		}
	}
}

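/* This handler receives asynchronous CQ error events (e.g. IB_EVENT_CQ_ERR),
 * not completions; completions arrive through the comp handlers below,
 * which defer the actual work to per-connection tasklets.
 */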
static void rds_ib_cq_event_handler(struct ib_event *event, void *data)
{
	rdsdebug("event %u (%s) data %p\n",
		 event->event, ib_event_msg(event->event), data);
}

/* Plucking the oldest entry from the ring can be done concurrently with
 * the thread refilling the ring.  Each ring operation is protected by
 * spinlocks and the transient state of refilling doesn't change the
 * recording of which entry is oldest.
 *
 * This relies on IB only calling one cq comp_handler for each cq so that
 * there will only be one caller of rds_recv_incoming() per RDS connection.
 */
static void rds_ib_cq_comp_handler_recv(struct ib_cq *cq, void *context)
{
	struct rds_connection *conn = context;
	struct rds_ib_connection *ic = conn->c_transport_data;

	rdsdebug("conn %p cq %p\n", conn, cq);

	rds_ib_stats_inc(s_ib_evt_handler_call);

	tasklet_schedule(&ic->i_recv_tasklet);
}

static void poll_scq(struct rds_ib_connection *ic, struct ib_cq *cq,
		     struct ib_wc *wcs)
{
	int nr, i;
	struct ib_wc *wc;

	while ((nr = ib_poll_cq(cq, RDS_IB_WC_MAX, wcs)) > 0) {
		for (i = 0; i < nr; i++) {
			wc = wcs + i;
			rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n",
				 (unsigned long long)wc->wr_id, wc->status,
				 wc->byte_len, be32_to_cpu(wc->ex.imm_data));

			/* Completions tagged with a send-ring index or the
			 * dedicated ACK wr_id belong to the send path; any
			 * other wr_id is a memory-registration completion.
			 */
			if (wc->wr_id <= ic->i_send_ring.w_nr ||
			    wc->wr_id == RDS_IB_ACK_WR_ID)
				rds_ib_send_cqe_handler(ic, wc);
			else
				rds_ib_mr_cqe_handler(ic, wc);
		}
	}
}

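/* Both tasklets drain their CQ with the same pattern: poll until empty,
 * re-arm with ib_req_notify_cq(), then poll once more.  The final poll
 * closes the race where a completion arrives between the last empty poll
 * and the re-arm, which would otherwise never raise another CQ event.
 */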
static void rds_ib_tasklet_fn_send(unsigned long data)
{
	struct rds_ib_connection *ic = (struct rds_ib_connection *)data;
	struct rds_connection *conn = ic->conn;

	rds_ib_stats_inc(s_ib_tasklet_call);

	/* if cq has been already reaped, ignore incoming cq event */
	if (atomic_read(&ic->i_cq_quiesce))
		return;

	poll_scq(ic, ic->i_send_cq, ic->i_send_wc);
	ib_req_notify_cq(ic->i_send_cq, IB_CQ_NEXT_COMP);
	poll_scq(ic, ic->i_send_cq, ic->i_send_wc);

	if (rds_conn_up(conn) &&
	    (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags) ||
	     test_bit(0, &conn->c_map_queued)))
		rds_send_xmit(&ic->conn->c_path[0]);
}

static void poll_rcq(struct rds_ib_connection *ic, struct ib_cq *cq,
		     struct ib_wc *wcs,
		     struct rds_ib_ack_state *ack_state)
{
	int nr, i;
	struct ib_wc *wc;

	while ((nr = ib_poll_cq(cq, RDS_IB_WC_MAX, wcs)) > 0) {
		for (i = 0; i < nr; i++) {
			wc = wcs + i;
			rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n",
				 (unsigned long long)wc->wr_id, wc->status,
				 wc->byte_len, be32_to_cpu(wc->ex.imm_data));

			rds_ib_recv_cqe_handler(ic, wc, ack_state);
		}
	}
}

static void rds_ib_tasklet_fn_recv(unsigned long data)
{
	struct rds_ib_connection *ic = (struct rds_ib_connection *)data;
	struct rds_connection *conn = ic->conn;
	struct rds_ib_device *rds_ibdev = ic->rds_ibdev;
	struct rds_ib_ack_state state;

	/* The device may have been removed under us; drop the
	 * connection if so. */
	if (!rds_ibdev)
		rds_conn_drop(conn);

	rds_ib_stats_inc(s_ib_tasklet_call);

	/* if cq has been already reaped, ignore incoming cq event */
	if (atomic_read(&ic->i_cq_quiesce))
		return;

	memset(&state, 0, sizeof(state));
	poll_rcq(ic, ic->i_recv_cq, ic->i_recv_wc, &state);
	ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED);
	poll_rcq(ic, ic->i_recv_cq, ic->i_recv_wc, &state);

	if (state.ack_next_valid)
		rds_ib_set_ack(ic, state.ack_next, state.ack_required);
	if (state.ack_recv_valid && state.ack_recv > ic->i_ack_recv) {
		rds_send_drop_acked(conn, state.ack_recv, NULL);
		ic->i_ack_recv = state.ack_recv;
	}

	if (rds_conn_up(conn))
		rds_ib_attempt_ack(ic);
}

static void rds_ib_qp_event_handler(struct ib_event *event, void *data)
{
	struct rds_connection *conn = data;
	struct rds_ib_connection *ic = conn->c_transport_data;

	rdsdebug("conn %p ic %p event %u (%s)\n", conn, ic, event->event,
		 ib_event_msg(event->event));

	switch (event->event) {
	case IB_EVENT_COMM_EST:
		rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST);
		break;
	default:
		rdsdebug("Fatal QP Event %u (%s) - connection %pI6c->%pI6c, reconnecting\n",
			 event->event, ib_event_msg(event->event),
			 &conn->c_laddr, &conn->c_faddr);
		rds_conn_drop(conn);
		break;
	}
}

static void rds_ib_cq_comp_handler_send(struct ib_cq *cq, void *context)
{
	struct rds_connection *conn = context;
	struct rds_ib_connection *ic = conn->c_transport_data;

	rdsdebug("conn %p cq %p\n", conn, cq);

	rds_ib_stats_inc(s_ib_evt_handler_call);

	tasklet_schedule(&ic->i_send_tasklet);
}

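/* Completion-vector load balancing: pick the least-loaded completion
 * vector of the device so that CQ work is spread across vectors (and hence
 * interrupts/CPUs).  vector_load[] counts how many CQs currently use each
 * vector; it is incremented here and decremented in ibdev_put_vector().
 */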
static inline int ibdev_get_unused_vector(struct rds_ib_device *rds_ibdev)
{
	int min = rds_ibdev->vector_load[rds_ibdev->dev->num_comp_vectors - 1];
	int index = rds_ibdev->dev->num_comp_vectors - 1;
	int i;

	for (i = rds_ibdev->dev->num_comp_vectors - 1; i >= 0; i--) {
		if (rds_ibdev->vector_load[i] < min) {
			index = i;
			min = rds_ibdev->vector_load[i];
		}
	}

	rds_ibdev->vector_load[index]++;
	return index;
}

static inline void ibdev_put_vector(struct rds_ib_device *rds_ibdev, int index)
{
	rds_ibdev->vector_load[index]--;
}

static void rds_dma_hdr_free(struct ib_device *dev, struct rds_header *hdr,
			     dma_addr_t dma_addr, enum dma_data_direction dir)
{
	ib_dma_unmap_single(dev, dma_addr, sizeof(*hdr), dir);
	kfree(hdr);
}

static struct rds_header *rds_dma_hdr_alloc(struct ib_device *dev,
					    dma_addr_t *dma_addr, enum dma_data_direction dir)
{
	struct rds_header *hdr;

	hdr = kzalloc_node(sizeof(*hdr), GFP_KERNEL, ibdev_to_node(dev));
	if (!hdr)
		return NULL;

	/* Map with the caller's direction so that it matches the unmap in
	 * rds_dma_hdr_free(); a mismatched direction trips DMA API debug.
	 */
	*dma_addr = ib_dma_map_single(dev, hdr, sizeof(*hdr), dir);
	if (ib_dma_mapping_error(dev, *dma_addr)) {
		kfree(hdr);
		return NULL;
	}

	return hdr;
}

/* Free the DMA memory used to store struct rds_header.
 *
 * @dev: the RDS IB device
 * @hdrs: pointer to the array storing DMA memory pointers
 * @dma_addrs: pointer to the array storing DMA addresses
 * @num_hdrs: number of headers to free
 * @dir: direction used when the headers were mapped
 */
static void rds_dma_hdrs_free(struct rds_ib_device *dev,
			      struct rds_header **hdrs, dma_addr_t *dma_addrs, u32 num_hdrs,
			      enum dma_data_direction dir)
{
	u32 i;

	for (i = 0; i < num_hdrs; i++)
		rds_dma_hdr_free(dev->dev, hdrs[i], dma_addrs[i], dir);
	kvfree(hdrs);
	kvfree(dma_addrs);
}

/* Allocate DMA coherent memory to be used to store struct rds_header for
 * sending/receiving packets.  The pointers to the DMA memory and the
 * associated DMA addresses are stored in two arrays.
 *
 * @dev: the RDS IB device
 * @dma_addrs: pointer to the array for storing DMA addresses
 * @num_hdrs: number of headers to allocate
 * @dir: direction to use for the DMA mapping
 *
 * It returns the pointer to the array storing the DMA memory pointers.  On
 * failure, NULL pointer is returned.
 */
static struct rds_header **rds_dma_hdrs_alloc(struct rds_ib_device *dev,
					      dma_addr_t **dma_addrs, u32 num_hdrs,
					      enum dma_data_direction dir)
{
	struct rds_header **hdrs;
	dma_addr_t *hdr_daddrs;
	u32 i;

	hdrs = kvmalloc_node(sizeof(*hdrs) * num_hdrs, GFP_KERNEL,
			     ibdev_to_node(dev->dev));
	if (!hdrs)
		return NULL;

	hdr_daddrs = kvmalloc_node(sizeof(*hdr_daddrs) * num_hdrs, GFP_KERNEL,
				   ibdev_to_node(dev->dev));
	if (!hdr_daddrs) {
		kvfree(hdrs);
		return NULL;
	}

	for (i = 0; i < num_hdrs; i++) {
		hdrs[i] = rds_dma_hdr_alloc(dev->dev, &hdr_daddrs[i], dir);
		if (!hdrs[i]) {
			rds_dma_hdrs_free(dev, hdrs, hdr_daddrs, i, dir);
			return NULL;
		}
	}

	*dma_addrs = hdr_daddrs;
	return hdrs;
}

/*
 * This needs to be very careful to not leave IS_ERR pointers around for
 * cleanup to trip over.
 */
static int rds_ib_setup_qp(struct rds_connection *conn)
{
	struct rds_ib_connection *ic = conn->c_transport_data;
	struct ib_device *dev = ic->i_cm_id->device;
	struct ib_qp_init_attr attr;
	struct ib_cq_init_attr cq_attr = {};
	struct rds_ib_device *rds_ibdev;
	unsigned long max_wrs;
	int ret, fr_queue_space;

	/*
	 * It's normal to see a null device if an incoming connection races
	 * with device removal, so we don't print a warning.
	 */
	rds_ibdev = rds_ib_get_client_data(dev);
	if (!rds_ibdev)
		return -EOPNOTSUPP;

	/* Reserve extra room on the completion and send queues for FRWR
	 * registration and invalidation work requests.
	 */
	fr_queue_space = RDS_IB_DEFAULT_FR_WR;

	/* add the conn now so that connection establishment has the dev */
	rds_ib_add_conn(rds_ibdev, conn);

	max_wrs = rds_ibdev->max_wrs < rds_ib_sysctl_max_send_wr + 1 ?
		rds_ibdev->max_wrs - 1 : rds_ib_sysctl_max_send_wr;
	if (ic->i_send_ring.w_nr != max_wrs)
		rds_ib_ring_resize(&ic->i_send_ring, max_wrs);

	max_wrs = rds_ibdev->max_wrs < rds_ib_sysctl_max_recv_wr + 1 ?
		rds_ibdev->max_wrs - 1 : rds_ib_sysctl_max_recv_wr;
	if (ic->i_recv_ring.w_nr != max_wrs)
		rds_ib_ring_resize(&ic->i_recv_ring, max_wrs);

	/* Protection domain and memory range */
	ic->i_pd = rds_ibdev->pd;

	ic->i_scq_vector = ibdev_get_unused_vector(rds_ibdev);
	/* + 1 for the dedicated ACK work request */
	cq_attr.cqe = ic->i_send_ring.w_nr + fr_queue_space + 1;
	cq_attr.comp_vector = ic->i_scq_vector;
	ic->i_send_cq = ib_create_cq(dev, rds_ib_cq_comp_handler_send,
				     rds_ib_cq_event_handler, conn,
				     &cq_attr);
	if (IS_ERR(ic->i_send_cq)) {
		ret = PTR_ERR(ic->i_send_cq);
		ic->i_send_cq = NULL;
		ibdev_put_vector(rds_ibdev, ic->i_scq_vector);
		rdsdebug("ib_create_cq send failed: %d\n", ret);
		goto rds_ibdev_out;
	}

	ic->i_rcq_vector = ibdev_get_unused_vector(rds_ibdev);
	cq_attr.cqe = ic->i_recv_ring.w_nr;
	cq_attr.comp_vector = ic->i_rcq_vector;
	ic->i_recv_cq = ib_create_cq(dev, rds_ib_cq_comp_handler_recv,
				     rds_ib_cq_event_handler, conn,
				     &cq_attr);
	if (IS_ERR(ic->i_recv_cq)) {
		ret = PTR_ERR(ic->i_recv_cq);
		ic->i_recv_cq = NULL;
		ibdev_put_vector(rds_ibdev, ic->i_rcq_vector);
		rdsdebug("ib_create_cq recv failed: %d\n", ret);
		goto send_cq_out;
	}

	ret = ib_req_notify_cq(ic->i_send_cq, IB_CQ_NEXT_COMP);
	if (ret) {
		rdsdebug("ib_req_notify_cq send failed: %d\n", ret);
		goto recv_cq_out;
	}

	ret = ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED);
	if (ret) {
		rdsdebug("ib_req_notify_cq recv failed: %d\n", ret);
		goto recv_cq_out;
	}

	/* XXX negotiate max send/recv with remote? */
	memset(&attr, 0, sizeof(attr));
	attr.event_handler = rds_ib_qp_event_handler;
	attr.qp_context = conn;
	/* + 1 to allow for the single ack message */
	attr.cap.max_send_wr = ic->i_send_ring.w_nr + fr_queue_space + 1;
	attr.cap.max_recv_wr = ic->i_recv_ring.w_nr + 1;
	attr.cap.max_send_sge = rds_ibdev->max_sge;
	attr.cap.max_recv_sge = RDS_IB_RECV_SGE;
	attr.sq_sig_type = IB_SIGNAL_REQ_WR;
	attr.qp_type = IB_QPT_RC;
	attr.send_cq = ic->i_send_cq;
	attr.recv_cq = ic->i_recv_cq;

	/*
	 * XXX this can fail if max_*_wr is too large?  Are we supposed
	 * to back off until we get a value that the hardware can support?
	 */
	ret = rdma_create_qp(ic->i_cm_id, ic->i_pd, &attr);
	if (ret) {
		rdsdebug("rdma_create_qp failed: %d\n", ret);
		goto recv_cq_out;
	}

	ic->i_send_hdrs = rds_dma_hdrs_alloc(rds_ibdev, &ic->i_send_hdrs_dma,
					     ic->i_send_ring.w_nr,
					     DMA_TO_DEVICE);
	if (!ic->i_send_hdrs) {
		ret = -ENOMEM;
		rdsdebug("DMA send hdrs alloc failed\n");
		goto qp_out;
	}

	ic->i_recv_hdrs = rds_dma_hdrs_alloc(rds_ibdev, &ic->i_recv_hdrs_dma,
					     ic->i_recv_ring.w_nr,
					     DMA_FROM_DEVICE);
	if (!ic->i_recv_hdrs) {
		ret = -ENOMEM;
		rdsdebug("DMA recv hdrs alloc failed\n");
		goto send_hdrs_dma_out;
	}

	ic->i_ack = rds_dma_hdr_alloc(rds_ibdev->dev, &ic->i_ack_dma,
				      DMA_TO_DEVICE);
	if (!ic->i_ack) {
		ret = -ENOMEM;
		rdsdebug("DMA ack header alloc failed\n");
		goto recv_hdrs_dma_out;
	}

	ic->i_sends = vzalloc_node(array_size(sizeof(struct rds_ib_send_work),
					      ic->i_send_ring.w_nr),
				   ibdev_to_node(dev));
	if (!ic->i_sends) {
		ret = -ENOMEM;
		rdsdebug("send allocation failed\n");
		goto ack_dma_out;
	}

	ic->i_recvs = vzalloc_node(array_size(sizeof(struct rds_ib_recv_work),
					      ic->i_recv_ring.w_nr),
				   ibdev_to_node(dev));
	if (!ic->i_recvs) {
		ret = -ENOMEM;
		rdsdebug("recv allocation failed\n");
		goto sends_out;
	}

	rds_ib_recv_init_ack(ic);

	rdsdebug("conn %p pd %p cq %p %p\n", conn, ic->i_pd,
		 ic->i_send_cq, ic->i_recv_cq);

	goto out;

sends_out:
	vfree(ic->i_sends);

ack_dma_out:
	rds_dma_hdr_free(rds_ibdev->dev, ic->i_ack, ic->i_ack_dma,
			 DMA_TO_DEVICE);
	ic->i_ack = NULL;

recv_hdrs_dma_out:
	rds_dma_hdrs_free(rds_ibdev, ic->i_recv_hdrs, ic->i_recv_hdrs_dma,
			  ic->i_recv_ring.w_nr, DMA_FROM_DEVICE);
	ic->i_recv_hdrs = NULL;
	ic->i_recv_hdrs_dma = NULL;

send_hdrs_dma_out:
	rds_dma_hdrs_free(rds_ibdev, ic->i_send_hdrs, ic->i_send_hdrs_dma,
			  ic->i_send_ring.w_nr, DMA_TO_DEVICE);
	ic->i_send_hdrs = NULL;
	ic->i_send_hdrs_dma = NULL;

qp_out:
	rdma_destroy_qp(ic->i_cm_id);
recv_cq_out:
	ib_destroy_cq(ic->i_recv_cq);
	ic->i_recv_cq = NULL;
send_cq_out:
	ib_destroy_cq(ic->i_send_cq);
	ic->i_send_cq = NULL;
rds_ibdev_out:
	rds_ib_remove_conn(rds_ibdev, conn);
out:
	rds_ib_dev_put(rds_ibdev);

	return ret;
}

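/* Protocol negotiation: each side sends its highest version plus a bitmask
 * of the minor versions it supports.  The usable set is the peer's mask
 * ANDed with RDS_IB_SUPPORTED_PROTOCOLS, and the highest common minor
 * wins.  For example, if both ends support minors 0 and 1, the common
 * mask is 0x3 and the loop below steps the version from 4.0 up to 4.1.
 */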
static u32 rds_ib_protocol_compatible(struct rdma_cm_event *event, bool isv6)
{
	const union rds_ib_conn_priv *dp = event->param.conn.private_data;
	u8 data_len, major, minor;
	u32 version = 0;
	__be16 mask;
	u16 common;

	/*
	 * rdma_cm private data is odd - when there is any private data in the
	 * request, we will be given a pretty large buffer without telling us the
	 * original size. The only way to tell the difference is by looking at
	 * the contents, which are initialized to zero.
	 * If the protocol version fields aren't set, this is a connection attempt
	 * from an older version. This could be 3.0 or 2.0 - we can't tell.
	 * We really should have changed this for OFED 1.3 :-(
	 */

	/* Be paranoid. RDS always has privdata */
	if (!event->param.conn.private_data_len) {
		printk(KERN_NOTICE "RDS incoming connection has no private data, rejecting\n");
		return 0;
	}

	if (isv6) {
		data_len = sizeof(struct rds6_ib_connect_private);
		major = dp->ricp_v6.dp_protocol_major;
		minor = dp->ricp_v6.dp_protocol_minor;
		mask = dp->ricp_v6.dp_protocol_minor_mask;
	} else {
		data_len = sizeof(struct rds_ib_connect_private);
		major = dp->ricp_v4.dp_protocol_major;
		minor = dp->ricp_v4.dp_protocol_minor;
		mask = dp->ricp_v4.dp_protocol_minor_mask;
	}

	/* Even if len is crap *now* I still want to check it. -ASG */
	if (event->param.conn.private_data_len < data_len || major == 0)
		return RDS_PROTOCOL_4_0;

	common = be16_to_cpu(mask) & RDS_IB_SUPPORTED_PROTOCOLS;
	if (major == 4 && common) {
		version = RDS_PROTOCOL_4_0;
		while ((common >>= 1) != 0)
			version++;
	} else if (RDS_PROTOCOL_COMPAT_VERSION ==
		   RDS_PROTOCOL(major, minor)) {
		version = RDS_PROTOCOL_COMPAT_VERSION;
	} else {
		if (isv6)
			printk_ratelimited(KERN_NOTICE "RDS: Connection from %pI6c using incompatible protocol version %u.%u\n",
					   &dp->ricp_v6.dp_saddr, major, minor);
		else
			printk_ratelimited(KERN_NOTICE "RDS: Connection from %pI4 using incompatible protocol version %u.%u\n",
					   &dp->ricp_v4.dp_saddr, major, minor);
	}
	return version;
}

#if IS_ENABLED(CONFIG_IPV6)
/* Given an IPv6 address, find the net_device which hosts that address and
 * return its interface index.  This is needed when an incoming address is
 * link local, as an interface index is then required to set up the RDS
 * connection.
 *
 * Returns 0 if no matching net_device is found.
 */
static u32 __rds_find_ifindex(struct net *net, const struct in6_addr *addr)
{
	struct net_device *dev;
	int idx = 0;

	rcu_read_lock();
	for_each_netdev_rcu(net, dev) {
		if (ipv6_chk_addr(net, addr, dev, 1)) {
			idx = dev->ifindex;
			break;
		}
	}
	rcu_read_unlock();

	return idx;
}
#endif

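/* Handle an incoming CM REQ.  The return value is the "destroy" flag handed
 * back to the rdma_cm core: nonzero asks the core to tear down the cm_id.
 * Once the id has been adopted into ic->i_cm_id we own its lifetime, so
 * from that point on we must return 0.
 */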
int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
			     struct rdma_cm_event *event, bool isv6)
{
	__be64 lguid = cm_id->route.path_rec->sgid.global.interface_id;
	__be64 fguid = cm_id->route.path_rec->dgid.global.interface_id;
	const struct rds_ib_conn_priv_cmn *dp_cmn;
	struct rds_connection *conn = NULL;
	struct rds_ib_connection *ic = NULL;
	struct rdma_conn_param conn_param;
	const union rds_ib_conn_priv *dp;
	union rds_ib_conn_priv dp_rep;
	struct in6_addr s_mapped_addr;
	struct in6_addr d_mapped_addr;
	const struct in6_addr *saddr6;
	const struct in6_addr *daddr6;
	int destroy = 1;
	u32 ifindex = 0;
	u32 version;
	int err = 1;

	/* Check whether the remote protocol version matches ours. */
	version = rds_ib_protocol_compatible(event, isv6);
	if (!version) {
		err = RDS_RDMA_REJ_INCOMPAT;
		goto out;
	}

	dp = event->param.conn.private_data;
	if (isv6) {
#if IS_ENABLED(CONFIG_IPV6)
		dp_cmn = &dp->ricp_v6.dp_cmn;
		saddr6 = &dp->ricp_v6.dp_saddr;
		daddr6 = &dp->ricp_v6.dp_daddr;

		/* If either address is link local, need to find the
		 * interface index in order to create a proper RDS
		 * connection.
		 */
		if (ipv6_addr_type(daddr6) & IPV6_ADDR_LINKLOCAL) {
			/* Using init_net for now ..  */
			ifindex = __rds_find_ifindex(&init_net, daddr6);
			/* No index found...  Need to bail out. */
			if (ifindex == 0) {
				err = -EOPNOTSUPP;
				goto out;
			}
		} else if (ipv6_addr_type(saddr6) & IPV6_ADDR_LINKLOCAL) {
			/* Use our address to find the correct index. */
			ifindex = __rds_find_ifindex(&init_net, daddr6);
			/* No index found...  Need to bail out. */
			if (ifindex == 0) {
				err = -EOPNOTSUPP;
				goto out;
			}
		}
#else
		err = -EOPNOTSUPP;
		goto out;
#endif
	} else {
		dp_cmn = &dp->ricp_v4.dp_cmn;
		ipv6_addr_set_v4mapped(dp->ricp_v4.dp_saddr, &s_mapped_addr);
		ipv6_addr_set_v4mapped(dp->ricp_v4.dp_daddr, &d_mapped_addr);
		saddr6 = &s_mapped_addr;
		daddr6 = &d_mapped_addr;
	}

	rdsdebug("saddr %pI6c daddr %pI6c RDSv%u.%u lguid 0x%llx fguid 0x%llx, tos:%d\n",
		 saddr6, daddr6, RDS_PROTOCOL_MAJOR(version),
		 RDS_PROTOCOL_MINOR(version),
		 (unsigned long long)be64_to_cpu(lguid),
		 (unsigned long long)be64_to_cpu(fguid), dp_cmn->ricpc_dp_toss);

	/* RDS/IB is not currently netns aware, thus init_net. */
	conn = rds_conn_create(&init_net, daddr6, saddr6,
			       &rds_ib_transport, dp_cmn->ricpc_dp_toss,
			       GFP_KERNEL, ifindex);
	if (IS_ERR(conn)) {
		rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn));
		conn = NULL;
		goto out;
	}

	/*
	 * The connection request may occur while the
	 * previous connection exist, e.g. in case of failover.
	 * But as connections may be initiated simultaneously
	 * by both hosts, we have a random backoff mechanism -
	 * see the comment above rds_queue_reconnect()
	 */
	mutex_lock(&conn->c_cm_lock);
	if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) {
		if (rds_conn_state(conn) == RDS_CONN_UP) {
			rdsdebug("incoming connect while connecting\n");
			rds_conn_drop(conn);
			rds_ib_stats_inc(s_ib_listen_closed_stale);
		} else if (rds_conn_state(conn) == RDS_CONN_CONNECTING) {
			/* Wait and see - our connect may still be succeeding */
			rds_ib_stats_inc(s_ib_connect_raced);
		}
		goto out;
	}

	ic = conn->c_transport_data;

	rds_ib_set_protocol(conn, version);
	rds_ib_set_flow_control(conn, be32_to_cpu(dp_cmn->ricpc_credit));

	/* If the peer gave us the last packet it saw, process this as if
	 * we had received a regular ACK. */
	if (dp_cmn->ricpc_ack_seq)
		rds_send_drop_acked(conn, be64_to_cpu(dp_cmn->ricpc_ack_seq),
				    NULL);

	BUG_ON(cm_id->context);
	BUG_ON(ic->i_cm_id);

	ic->i_cm_id = cm_id;
	cm_id->context = conn;

	/* We got halfway through setting up the ib_connection, if we
	 * fail now, we have to take the long route out of this mess. */
	destroy = 0;

	err = rds_ib_setup_qp(conn);
	if (err) {
		rds_ib_conn_error(conn, "rds_ib_setup_qp failed (%d)\n", err);
		goto out;
	}

	rds_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version,
				  event->param.conn.responder_resources,
				  event->param.conn.initiator_depth, isv6);

	rdma_set_min_rnr_timer(cm_id, IB_RNR_TIMER_000_32);
	/* rdma_accept() calls rdma_reject() internally if it fails */
	if (rdma_accept(cm_id, &conn_param))
		rds_ib_conn_error(conn, "rdma_accept failed\n");

out:
	if (conn)
		mutex_unlock(&conn->c_cm_lock);
	if (err)
		rdma_reject(cm_id, &err, sizeof(int),
			    IB_CM_REJ_CONSUMER_DEFINED);
	return destroy;
}

int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id, bool isv6)
{
	struct rds_connection *conn = cm_id->context;
	struct rds_ib_connection *ic = conn->c_transport_data;
	struct rdma_conn_param conn_param;
	union rds_ib_conn_priv dp;
	int ret;

	/* Assume the newest protocol we speak until the peer's reply says
	 * otherwise; rds_ib_cm_connect_complete() records the negotiated
	 * result. */
	rds_ib_set_protocol(conn, RDS_PROTOCOL_4_1);
	ic->i_flowctl = rds_ib_sysctl_flow_control;	/* advertise flow control */

	ret = rds_ib_setup_qp(conn);
	if (ret) {
		rds_ib_conn_error(conn, "rds_ib_setup_qp failed (%d)\n", ret);
		goto out;
	}

	rds_ib_cm_fill_conn_param(conn, &conn_param, &dp,
				  conn->c_proposed_version,
				  UINT_MAX, UINT_MAX, isv6);
	ret = rdma_connect_locked(cm_id, &conn_param);
	if (ret)
		rds_ib_conn_error(conn, "rdma_connect_locked failed (%d)\n",
				  ret);

out:
	/* Beware - returning non-zero tells the rdma_cm to destroy
	 * the cm_id. We should certainly not do it as long as we still
	 * "own" the cm_id. */
	if (ret) {
		if (ic->i_cm_id == cm_id)
			ret = 0;
	}
	ic->i_active_side = true;
	return ret;
}

int rds_ib_conn_path_connect(struct rds_conn_path *cp)
{
	struct rds_connection *conn = cp->cp_conn;
	struct sockaddr_storage src, dest;
	rdma_cm_event_handler handler;
	struct rds_ib_connection *ic;
	int ret;

	ic = conn->c_transport_data;

	/* XXX I wonder what affect the port space has */
	/* delegate cm event handler to rdma_transport */
#if IS_ENABLED(CONFIG_IPV6)
	if (conn->c_isv6)
		handler = rds6_rdma_cm_event_handler;
	else
#endif
		handler = rds_rdma_cm_event_handler;
	ic->i_cm_id = rdma_create_id(&init_net, handler, conn,
				     RDMA_PS_TCP, IB_QPT_RC);
	if (IS_ERR(ic->i_cm_id)) {
		ret = PTR_ERR(ic->i_cm_id);
		ic->i_cm_id = NULL;
		rdsdebug("rdma_create_id() failed: %d\n", ret);
		goto out;
	}

	rdsdebug("created cm id %p for conn %p\n", ic->i_cm_id, conn);

	/* Note the destination port differs by family: RDS_PORT for IPv4,
	 * RDS_CM_PORT for IPv6. */
	if (ipv6_addr_v4mapped(&conn->c_faddr)) {
		struct sockaddr_in *sin;

		sin = (struct sockaddr_in *)&src;
		sin->sin_family = AF_INET;
		sin->sin_addr.s_addr = conn->c_laddr.s6_addr32[3];
		sin->sin_port = 0;

		sin = (struct sockaddr_in *)&dest;
		sin->sin_family = AF_INET;
		sin->sin_addr.s_addr = conn->c_faddr.s6_addr32[3];
		sin->sin_port = htons(RDS_PORT);
	} else {
		struct sockaddr_in6 *sin6;

		sin6 = (struct sockaddr_in6 *)&src;
		sin6->sin6_family = AF_INET6;
		sin6->sin6_addr = conn->c_laddr;
		sin6->sin6_port = 0;
		sin6->sin6_scope_id = conn->c_dev_if;

		sin6 = (struct sockaddr_in6 *)&dest;
		sin6->sin6_family = AF_INET6;
		sin6->sin6_addr = conn->c_faddr;
		sin6->sin6_port = htons(RDS_CM_PORT);
		sin6->sin6_scope_id = conn->c_dev_if;
	}

	ret = rdma_resolve_addr(ic->i_cm_id, (struct sockaddr *)&src,
				(struct sockaddr *)&dest,
				RDS_RDMA_RESOLVE_TIMEOUT_MS);
	if (ret) {
		rdsdebug("addr resolve failed for cm id %p: %d\n", ic->i_cm_id,
			 ret);
		rdma_destroy_id(ic->i_cm_id);
		ic->i_cm_id = NULL;
	}

out:
	return ret;
}
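
/* Teardown ordering below matters: disconnect the cm_id, flush MRs, wait
 * for the rings and signaled/fast-reg work to drain, kill the tasklets and
 * quiesce CQ handling, and only then destroy the QP and CQs and free the
 * DMA headers that completions may still reference.
 */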

/*
 * This is so careful about only cleaning up resources that were built up
 * so that it can be called at any point during startup.  In fact it
 * can be called multiple times for a given connection.
 */
void rds_ib_conn_path_shutdown(struct rds_conn_path *cp)
{
	struct rds_connection *conn = cp->cp_conn;
	struct rds_ib_connection *ic = conn->c_transport_data;
	int err = 0;

	rdsdebug("cm %p pd %p cq %p %p qp %p\n", ic->i_cm_id,
		 ic->i_pd, ic->i_send_cq, ic->i_recv_cq,
		 ic->i_cm_id ? ic->i_cm_id->qp : NULL);

	if (ic->i_cm_id) {
		rdsdebug("disconnecting cm %p\n", ic->i_cm_id);
		err = rdma_disconnect(ic->i_cm_id);
		if (err) {
			/* Actually it may happen quite frequently, when
			 * an outgoing connect raced with an incoming connect.
			 */
			rdsdebug("failed to disconnect, cm: %p err %d\n",
				 ic->i_cm_id, err);
		}

		/* kick off "flush_worker" for all pools in order to reap
		 * all FRMR registrations that are still marked "FRMR_IS_INUSE"
		 */
		rds_ib_flush_mrs();

		/*
		 * We want to wait for tx and rx completion to finish
		 * before we tear down the connection, but we have to be
		 * careful not to get stuck waiting on a send ring that
		 * only has unsignaled sends in it.  We've shutdown new
		 * sends before getting here so by waiting for signaled
		 * sends to complete we're ensured that there will be no
		 * more tx processing.
		 */
		wait_event(rds_ib_ring_empty_wait,
			   rds_ib_ring_empty(&ic->i_recv_ring) &&
			   (atomic_read(&ic->i_signaled_sends) == 0) &&
			   (atomic_read(&ic->i_fastreg_inuse_count) == 0) &&
			   (atomic_read(&ic->i_fastreg_wrs) == RDS_IB_DEFAULT_FR_WR));
		tasklet_kill(&ic->i_send_tasklet);
		tasklet_kill(&ic->i_recv_tasklet);

		atomic_set(&ic->i_cq_quiesce, 1);

		/* first destroy the ib state that generates callbacks */
		if (ic->i_cm_id->qp)
			rdma_destroy_qp(ic->i_cm_id);
		if (ic->i_send_cq) {
			if (ic->rds_ibdev)
				ibdev_put_vector(ic->rds_ibdev, ic->i_scq_vector);
			ib_destroy_cq(ic->i_send_cq);
		}

		if (ic->i_recv_cq) {
			if (ic->rds_ibdev)
				ibdev_put_vector(ic->rds_ibdev, ic->i_rcq_vector);
			ib_destroy_cq(ic->i_recv_cq);
		}

		if (ic->rds_ibdev) {
			/* then free the resources that ib callbacks use */
			if (ic->i_send_hdrs) {
				rds_dma_hdrs_free(ic->rds_ibdev,
						  ic->i_send_hdrs,
						  ic->i_send_hdrs_dma,
						  ic->i_send_ring.w_nr,
						  DMA_TO_DEVICE);
				ic->i_send_hdrs = NULL;
				ic->i_send_hdrs_dma = NULL;
			}

			if (ic->i_recv_hdrs) {
				rds_dma_hdrs_free(ic->rds_ibdev,
						  ic->i_recv_hdrs,
						  ic->i_recv_hdrs_dma,
						  ic->i_recv_ring.w_nr,
						  DMA_FROM_DEVICE);
				ic->i_recv_hdrs = NULL;
				ic->i_recv_hdrs_dma = NULL;
			}

			if (ic->i_ack) {
				rds_dma_hdr_free(ic->rds_ibdev->dev, ic->i_ack,
						 ic->i_ack_dma, DMA_TO_DEVICE);
				ic->i_ack = NULL;
			}
		} else {
			WARN_ON(ic->i_send_hdrs);
			WARN_ON(ic->i_send_hdrs_dma);
			WARN_ON(ic->i_recv_hdrs);
			WARN_ON(ic->i_recv_hdrs_dma);
			WARN_ON(ic->i_ack);
		}

		if (ic->i_sends)
			rds_ib_send_clear_ring(ic);
		if (ic->i_recvs)
			rds_ib_recv_clear_ring(ic);

		rdma_destroy_id(ic->i_cm_id);

		/*
		 * Move connection back to the nodev list.
		 */
		if (ic->rds_ibdev)
			rds_ib_remove_conn(ic->rds_ibdev, conn);

		ic->i_cm_id = NULL;
		ic->i_pd = NULL;
		ic->i_send_cq = NULL;
		ic->i_recv_cq = NULL;
	}
	BUG_ON(ic->rds_ibdev);

	/* Clear pending transmit */
	if (ic->i_data_op) {
		struct rds_message *rm;

		rm = container_of(ic->i_data_op, struct rds_message, data);
		rds_message_put(rm);
		ic->i_data_op = NULL;
	}

	/* Clear the ACK state */
	clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
#ifdef KERNEL_HAS_ATOMIC64
	atomic64_set(&ic->i_ack_next, 0);
#else
	ic->i_ack_next = 0;
#endif
	ic->i_ack_recv = 0;

	/* Clear flow control state */
	ic->i_flowctl = 0;
	atomic_set(&ic->i_credits, 0);

	/* Re-init rings, but retain sizes. */
	rds_ib_ring_init(&ic->i_send_ring, ic->i_send_ring.w_nr);
	rds_ib_ring_init(&ic->i_recv_ring, ic->i_recv_ring.w_nr);

	if (ic->i_ibinc) {
		rds_inc_put(&ic->i_ibinc->ii_inc);
		ic->i_ibinc = NULL;
	}

	vfree(ic->i_sends);
	ic->i_sends = NULL;
	vfree(ic->i_recvs);
	ic->i_recvs = NULL;
	ic->i_active_side = false;
}

int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp)
{
	struct rds_ib_connection *ic;
	unsigned long flags;
	int ret;

	/* XXX too lazy? */
	ic = kzalloc(sizeof(struct rds_ib_connection), gfp);
	if (!ic)
		return -ENOMEM;

	ret = rds_ib_recv_alloc_caches(ic, gfp);
	if (ret) {
		kfree(ic);
		return ret;
	}

	INIT_LIST_HEAD(&ic->ib_node);
	tasklet_init(&ic->i_send_tasklet, rds_ib_tasklet_fn_send,
		     (unsigned long)ic);
	tasklet_init(&ic->i_recv_tasklet, rds_ib_tasklet_fn_recv,
		     (unsigned long)ic);
	mutex_init(&ic->i_recv_mutex);
#ifndef KERNEL_HAS_ATOMIC64
	spin_lock_init(&ic->i_ack_lock);
#endif
	atomic_set(&ic->i_signaled_sends, 0);
	atomic_set(&ic->i_fastreg_wrs, RDS_IB_DEFAULT_FR_WR);

	/*
	 * rds_ib_conn_shutdown() waits for these to be emptied so they
	 * must be initialized before it can be called.
	 */
	rds_ib_ring_init(&ic->i_send_ring, 0);
	rds_ib_ring_init(&ic->i_recv_ring, 0);

	ic->conn = conn;
	conn->c_transport_data = ic;

	spin_lock_irqsave(&ib_nodev_conns_lock, flags);
	list_add_tail(&ic->ib_node, &ib_nodev_conns);
	spin_unlock_irqrestore(&ib_nodev_conns_lock, flags);

	rdsdebug("conn %p conn ic %p\n", conn, conn->c_transport_data);
	return 0;
}

/*
 * Free a connection. Connection must be shut down and not set for reconnect.
 */
void rds_ib_conn_free(void *arg)
{
	struct rds_ib_connection *ic = arg;
	spinlock_t *lock_ptr;

	rdsdebug("ic %p\n", ic);

	/*
	 * Conn is either on a dev's list or on the nodev list.
	 * A race with shutdown() or connect() would cause problems
	 * (since rds_ibdev would change) but that should never happen.
	 */
	lock_ptr = ic->rds_ibdev ? &ic->rds_ibdev->spinlock : &ib_nodev_conns_lock;

	spin_lock_irq(lock_ptr);
	list_del(&ic->ib_node);
	spin_unlock_irq(lock_ptr);

	rds_ib_recv_free_caches(ic);

	kfree(ic);
}

/*
 * An error occurred on the connection
 */
void
__rds_ib_conn_error(struct rds_connection *conn, const char *fmt, ...)
{
	va_list ap;

	rds_conn_drop(conn);

	va_start(ap, fmt);
	vprintk(fmt, ap);
	va_end(ap);
}