// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * RDMA Transport Layer - client side.
 */
0010 #undef pr_fmt
0011 #define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt
0012
0013 #include <linux/module.h>
0014 #include <linux/rculist.h>
0015 #include <linux/random.h>
0016
0017 #include "rtrs-clt.h"
0018 #include "rtrs-log.h"
0019
0020 #define RTRS_CONNECT_TIMEOUT_MS 30000
0021
/*
 * Wait a bit before trying to reconnect after a failure
 * in order to give the server time to finish clean up,
 * which otherwise leads to "false positive" failed reconnect attempts.
 */
0026 #define RTRS_RECONNECT_BACKOFF 1000
/*
 * Add a random delay derived from RTRS_RECONNECT_SEED on top of the
 * reconnect delay, so that clients do not all try to reconnect at the
 * very same moment after a major outage.
 */
0032 #define RTRS_RECONNECT_SEED 8
0033
0034 #define FIRST_CONN 0x01
/* limit to 128 segments * 4k = 512k max IO */
0036 #define RTRS_MAX_SEGMENTS 128
0037
0038 MODULE_DESCRIPTION("RDMA Transport Client");
0039 MODULE_LICENSE("GPL");
0040
0041 static const struct rtrs_rdma_dev_pd_ops dev_pd_ops;
0042 static struct rtrs_rdma_dev_pd dev_pd = {
0043 .ops = &dev_pd_ops
0044 };
0045
0046 static struct workqueue_struct *rtrs_wq;
0047 static struct class *rtrs_clt_dev_class;
0048
0049 static inline bool rtrs_clt_is_connected(const struct rtrs_clt_sess *clt)
0050 {
0051 struct rtrs_clt_path *clt_path;
0052 bool connected = false;
0053
0054 rcu_read_lock();
0055 list_for_each_entry_rcu(clt_path, &clt->paths_list, s.entry)
0056 connected |= READ_ONCE(clt_path->state) == RTRS_CLT_CONNECTED;
0057 rcu_read_unlock();
0058
0059 return connected;
0060 }
0061
0062 static struct rtrs_permit *
0063 __rtrs_get_permit(struct rtrs_clt_sess *clt, enum rtrs_clt_con_type con_type)
0064 {
0065 size_t max_depth = clt->queue_depth;
0066 struct rtrs_permit *permit;
0067 int bit;
0068
/*
 * Adapted from null_blk get_tag(): find_first_zero_bit() is not atomic,
 * so callers on different CPUs may grab the same bit, but then
 * test_and_set_bit_lock() fails for all of them except one and the
 * losers simply loop again.  This way no explicit spinlock is required.
 */
0076 do {
0077 bit = find_first_zero_bit(clt->permits_map, max_depth);
0078 if (bit >= max_depth)
0079 return NULL;
0080 } while (test_and_set_bit_lock(bit, clt->permits_map));
0081
0082 permit = get_permit(clt, bit);
0083 WARN_ON(permit->mem_id != bit);
0084 permit->cpu_id = raw_smp_processor_id();
0085 permit->con_type = con_type;
0086
0087 return permit;
0088 }
0089
0090 static inline void __rtrs_put_permit(struct rtrs_clt_sess *clt,
0091 struct rtrs_permit *permit)
0092 {
0093 clear_bit_unlock(permit->mem_id, clt->permits_map);
0094 }
0095
/**
 * rtrs_clt_get_permit() - allocate a permit for a future RDMA operation
 * @clt:	Current session
 * @con_type:	Type of connection to use with the permit
 * @can_wait:	If no free permit is available, sleep until one is released
 *
 * A permit preallocates all per-request resources and propagates memory
 * pressure back to the originator when the queue is full.
 *
 * Context: can sleep if @can_wait == RTRS_PERMIT_WAIT.
 */
0110 struct rtrs_permit *rtrs_clt_get_permit(struct rtrs_clt_sess *clt,
0111 enum rtrs_clt_con_type con_type,
0112 enum wait_type can_wait)
0113 {
0114 struct rtrs_permit *permit;
0115 DEFINE_WAIT(wait);
0116
0117 permit = __rtrs_get_permit(clt, con_type);
0118 if (permit || !can_wait)
0119 return permit;
0120
0121 do {
0122 prepare_to_wait(&clt->permits_wait, &wait,
0123 TASK_UNINTERRUPTIBLE);
0124 permit = __rtrs_get_permit(clt, con_type);
0125 if (permit)
0126 break;
0127
0128 io_schedule();
0129 } while (1);
0130
0131 finish_wait(&clt->permits_wait, &wait);
0132
0133 return permit;
0134 }
0135 EXPORT_SYMBOL(rtrs_clt_get_permit);
0136
/**
 * rtrs_clt_put_permit() - put back an allocated permit
 * @clt:	Current session
 * @permit:	Permit to be freed
 *
 * Context: any.
 */
0145 void rtrs_clt_put_permit(struct rtrs_clt_sess *clt,
0146 struct rtrs_permit *permit)
0147 {
0148 if (WARN_ON(!test_bit(permit->mem_id, clt->permits_map)))
0149 return;
0150
0151 __rtrs_put_permit(clt, permit);
0152
/*
 * rtrs_clt_get_permit() adds itself to the &clt->permits_wait list
 * before calling schedule(), so if it is sleeping it must have added
 * itself to &clt->permits_wait before __rtrs_put_permit() finished.
 * Hence it is safe to guard wake_up() with a waitqueue_active() test.
 */
0160 if (waitqueue_active(&clt->permits_wait))
0161 wake_up(&clt->permits_wait);
0162 }
0163 EXPORT_SYMBOL(rtrs_clt_put_permit);
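/*
 * Typical pairing of the two exported calls above, as an illustrative
 * sketch only (the request call and its parameters live in rtrs.h, not
 * in this file):
 *
 *	permit = rtrs_clt_get_permit(clt, RTRS_IO_CON, RTRS_PERMIT_WAIT);
 *	if (!permit)
 *		return -EBUSY;	// possible only with RTRS_PERMIT_NOWAIT
 *	// ... issue the IO request that uses this permit ...
 *	// on completion (or on submission failure):
 *	rtrs_clt_put_permit(clt, permit);
 */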
0164
/**
 * rtrs_permit_to_clt_con() - return the connection to use for a permit
 * @clt_path:	client path
 * @permit:	permit for the allocation of the RDMA buffer
 *
 * Note: IO connections start from 1, connection 0 is reserved for
 * user (service) messages.
 */
0173 static
0174 struct rtrs_clt_con *rtrs_permit_to_clt_con(struct rtrs_clt_path *clt_path,
0175 struct rtrs_permit *permit)
0176 {
0177 int id = 0;
0178
0179 if (permit->con_type == RTRS_IO_CON)
0180 id = (permit->cpu_id % (clt_path->s.irq_con_num - 1)) + 1;
0181
0182 return to_clt_con(clt_path->s.con[id]);
0183 }
0184
/**
 * rtrs_clt_change_state() - change the path state through the state machine
 * @clt_path:	client path to change the state of
 * @new_state:	state to change to
 *
 * Returns true if the state was changed to @new_state, false otherwise.
 *
 * Locks: state_wq lock must be held.
 */
0197 static bool rtrs_clt_change_state(struct rtrs_clt_path *clt_path,
0198 enum rtrs_clt_state new_state)
0199 {
0200 enum rtrs_clt_state old_state;
0201 bool changed = false;
0202
0203 lockdep_assert_held(&clt_path->state_wq.lock);
0204
0205 old_state = clt_path->state;
0206 switch (new_state) {
0207 case RTRS_CLT_CONNECTING:
0208 switch (old_state) {
0209 case RTRS_CLT_RECONNECTING:
0210 changed = true;
0211 fallthrough;
0212 default:
0213 break;
0214 }
0215 break;
0216 case RTRS_CLT_RECONNECTING:
0217 switch (old_state) {
0218 case RTRS_CLT_CONNECTED:
0219 case RTRS_CLT_CONNECTING_ERR:
0220 case RTRS_CLT_CLOSED:
0221 changed = true;
0222 fallthrough;
0223 default:
0224 break;
0225 }
0226 break;
0227 case RTRS_CLT_CONNECTED:
0228 switch (old_state) {
0229 case RTRS_CLT_CONNECTING:
0230 changed = true;
0231 fallthrough;
0232 default:
0233 break;
0234 }
0235 break;
0236 case RTRS_CLT_CONNECTING_ERR:
0237 switch (old_state) {
0238 case RTRS_CLT_CONNECTING:
0239 changed = true;
0240 fallthrough;
0241 default:
0242 break;
0243 }
0244 break;
0245 case RTRS_CLT_CLOSING:
0246 switch (old_state) {
0247 case RTRS_CLT_CONNECTING:
0248 case RTRS_CLT_CONNECTING_ERR:
0249 case RTRS_CLT_RECONNECTING:
0250 case RTRS_CLT_CONNECTED:
0251 changed = true;
0252 fallthrough;
0253 default:
0254 break;
0255 }
0256 break;
0257 case RTRS_CLT_CLOSED:
0258 switch (old_state) {
0259 case RTRS_CLT_CLOSING:
0260 changed = true;
0261 fallthrough;
0262 default:
0263 break;
0264 }
0265 break;
0266 case RTRS_CLT_DEAD:
0267 switch (old_state) {
0268 case RTRS_CLT_CLOSED:
0269 changed = true;
0270 fallthrough;
0271 default:
0272 break;
0273 }
0274 break;
0275 default:
0276 break;
0277 }
0278 if (changed) {
0279 clt_path->state = new_state;
0280 wake_up_locked(&clt_path->state_wq);
0281 }
0282
0283 return changed;
0284 }
0285
0286 static bool rtrs_clt_change_state_from_to(struct rtrs_clt_path *clt_path,
0287 enum rtrs_clt_state old_state,
0288 enum rtrs_clt_state new_state)
0289 {
0290 bool changed = false;
0291
0292 spin_lock_irq(&clt_path->state_wq.lock);
0293 if (clt_path->state == old_state)
0294 changed = rtrs_clt_change_state(clt_path, new_state);
0295 spin_unlock_irq(&clt_path->state_wq.lock);
0296
0297 return changed;
0298 }
0299
0300 static void rtrs_clt_stop_and_destroy_conns(struct rtrs_clt_path *clt_path);
0301 static void rtrs_rdma_error_recovery(struct rtrs_clt_con *con)
0302 {
0303 struct rtrs_clt_path *clt_path = to_clt_path(con->c.path);
0304
0305 if (rtrs_clt_change_state_from_to(clt_path,
0306 RTRS_CLT_CONNECTED,
0307 RTRS_CLT_RECONNECTING)) {
0308 queue_work(rtrs_wq, &clt_path->err_recovery_work);
0309 } else {
/*
 * Errors can also happen while establishing a new connection, so
 * notify the waiter with an error state; the waiter is responsible
 * for cleaning up the rest and reconnecting if needed.
 */
0315 rtrs_clt_change_state_from_to(clt_path,
0316 RTRS_CLT_CONNECTING,
0317 RTRS_CLT_CONNECTING_ERR);
0318 }
0319 }
0320
0321 static void rtrs_clt_fast_reg_done(struct ib_cq *cq, struct ib_wc *wc)
0322 {
0323 struct rtrs_clt_con *con = to_clt_con(wc->qp->qp_context);
0324
0325 if (wc->status != IB_WC_SUCCESS) {
0326 rtrs_err(con->c.path, "Failed IB_WR_REG_MR: %s\n",
0327 ib_wc_status_msg(wc->status));
0328 rtrs_rdma_error_recovery(con);
0329 }
0330 }
0331
0332 static struct ib_cqe fast_reg_cqe = {
0333 .done = rtrs_clt_fast_reg_done
0334 };
0335
0336 static void complete_rdma_req(struct rtrs_clt_io_req *req, int errno,
0337 bool notify, bool can_wait);
0338
0339 static void rtrs_clt_inv_rkey_done(struct ib_cq *cq, struct ib_wc *wc)
0340 {
0341 struct rtrs_clt_io_req *req =
0342 container_of(wc->wr_cqe, typeof(*req), inv_cqe);
0343 struct rtrs_clt_con *con = to_clt_con(wc->qp->qp_context);
0344
0345 if (wc->status != IB_WC_SUCCESS) {
0346 rtrs_err(con->c.path, "Failed IB_WR_LOCAL_INV: %s\n",
0347 ib_wc_status_msg(wc->status));
0348 rtrs_rdma_error_recovery(con);
0349 }
0350 req->need_inv = false;
0351 if (req->need_inv_comp)
0352 complete(&req->inv_comp);
0353 else
/* Complete the request from the INV callback */
0355 complete_rdma_req(req, req->inv_errno, true, false);
0356 }
0357
0358 static int rtrs_inv_rkey(struct rtrs_clt_io_req *req)
0359 {
0360 struct rtrs_clt_con *con = req->con;
0361 struct ib_send_wr wr = {
0362 .opcode = IB_WR_LOCAL_INV,
0363 .wr_cqe = &req->inv_cqe,
0364 .send_flags = IB_SEND_SIGNALED,
0365 .ex.invalidate_rkey = req->mr->rkey,
0366 };
0367 req->inv_cqe.done = rtrs_clt_inv_rkey_done;
0368
0369 return ib_post_send(con->c.qp, &wr, NULL);
0370 }
0371
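/*
 * complete_rdma_req() - finalize an IO request on this path.
 *
 * Unmaps the data scatterlist, optionally issues a local invalidate of the
 * fast-registration rkey for a read request (waiting for it only if
 * @can_wait is set, e.g. when called from a workqueue during recovery),
 * updates the inflight counter for the MIN_INFLIGHT policy and, if @notify
 * is set, calls the user confirmation callback with @errno.
 */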
0372 static void complete_rdma_req(struct rtrs_clt_io_req *req, int errno,
0373 bool notify, bool can_wait)
0374 {
0375 struct rtrs_clt_con *con = req->con;
0376 struct rtrs_clt_path *clt_path;
0377 int err;
0378
0379 if (WARN_ON(!req->in_use))
0380 return;
0381 if (WARN_ON(!req->con))
0382 return;
0383 clt_path = to_clt_path(con->c.path);
0384
0385 if (req->sg_cnt) {
0386 if (req->dir == DMA_FROM_DEVICE && req->need_inv) {
/*
 * We are here to invalidate read requests ourselves.  In the normal
 * scenario the server sends an INV for every read request, so if we
 * got here one of two things happened:
 *
 *    1. this is failover, when errno != 0 and can_wait == 1,
 *
 *    2. something went badly wrong and the server forgot to send
 *       the INV, so we have to do it ourselves.
 */
0401 if (can_wait) {
0402 req->need_inv_comp = true;
0403 } else {
/* This should be the IO path, so always notify */
0405 WARN_ON(!notify);
/* Save errno for the INV callback */
0407 req->inv_errno = errno;
0408 }
0409
0410 refcount_inc(&req->ref);
0411 err = rtrs_inv_rkey(req);
0412 if (err) {
0413 rtrs_err(con->c.path, "Send INV WR key=%#x: %d\n",
0414 req->mr->rkey, err);
0415 } else if (can_wait) {
0416 wait_for_completion(&req->inv_comp);
0417 } else {
/*
 * Something went wrong, so the request will be completed
 * from the INV callback instead.
 */
0422 WARN_ON_ONCE(1);
0423
0424 return;
0425 }
0426 if (!refcount_dec_and_test(&req->ref))
0427 return;
0428 }
0429 ib_dma_unmap_sg(clt_path->s.dev->ib_dev, req->sglist,
0430 req->sg_cnt, req->dir);
0431 }
0432 if (!refcount_dec_and_test(&req->ref))
0433 return;
0434 if (req->mp_policy == MP_POLICY_MIN_INFLIGHT)
0435 atomic_dec(&clt_path->stats->inflight);
0436
0437 req->in_use = false;
0438 req->con = NULL;
0439
0440 if (errno) {
0441 rtrs_err_rl(con->c.path, "IO request failed: error=%d path=%s [%s:%u] notify=%d\n",
0442 errno, kobject_name(&clt_path->kobj), clt_path->hca_name,
0443 clt_path->hca_port, notify);
0444 }
0445
0446 if (notify)
0447 req->conf(req->priv, errno);
0448 }
0449
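/*
 * rtrs_post_send_rdma() - post an RDMA-write-with-immediate that carries
 * only the rtrs header and the user message (no data scatterlist); used by
 * the read path.  The immediate value encodes the remote buffer id and the
 * offset inside the chunk, so the server can locate the request without an
 * extra message.
 */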
0450 static int rtrs_post_send_rdma(struct rtrs_clt_con *con,
0451 struct rtrs_clt_io_req *req,
0452 struct rtrs_rbuf *rbuf, u32 off,
0453 u32 imm, struct ib_send_wr *wr)
0454 {
0455 struct rtrs_clt_path *clt_path = to_clt_path(con->c.path);
0456 enum ib_send_flags flags;
0457 struct ib_sge sge;
0458
0459 if (!req->sg_size) {
0460 rtrs_wrn(con->c.path,
0461 "Doing RDMA Write failed, no data supplied\n");
0462 return -EINVAL;
0463 }
0464
/* user data and user message are in the first list element */
0466 sge.addr = req->iu->dma_addr;
0467 sge.length = req->sg_size;
0468 sge.lkey = clt_path->s.dev->ib_pd->local_dma_lkey;
0469
/*
 * From time to time we have to post a signalled send,
 * otherwise the send queue fills up and only a QP reset can help.
 */
0474 flags = atomic_inc_return(&con->c.wr_cnt) % clt_path->s.signal_interval ?
0475 0 : IB_SEND_SIGNALED;
0476
0477 ib_dma_sync_single_for_device(clt_path->s.dev->ib_dev,
0478 req->iu->dma_addr,
0479 req->sg_size, DMA_TO_DEVICE);
0480
0481 return rtrs_iu_post_rdma_write_imm(&con->c, req->iu, &sge, 1,
0482 rbuf->rkey, rbuf->addr + off,
0483 imm, flags, wr, NULL);
0484 }
0485
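/*
 * process_io_rsp() - handle an IO response from the server.  @msg_id selects
 * the request slot, @errno carries the server-side status and @w_inval tells
 * whether the response already invalidated the rkey (RTRS_IO_RSP_W_INV_IMM),
 * in which case no local invalidate is needed.
 */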
0486 static void process_io_rsp(struct rtrs_clt_path *clt_path, u32 msg_id,
0487 s16 errno, bool w_inval)
0488 {
0489 struct rtrs_clt_io_req *req;
0490
0491 if (WARN_ON(msg_id >= clt_path->queue_depth))
0492 return;
0493
0494 req = &clt_path->reqs[msg_id];
0495
0496 req->need_inv &= !w_inval;
0497 complete_rdma_req(req, errno, true, false);
0498 }
0499
0500 static void rtrs_clt_recv_done(struct rtrs_clt_con *con, struct ib_wc *wc)
0501 {
0502 struct rtrs_iu *iu;
0503 int err;
0504 struct rtrs_clt_path *clt_path = to_clt_path(con->c.path);
0505
0506 WARN_ON((clt_path->flags & RTRS_MSG_NEW_RKEY_F) == 0);
0507 iu = container_of(wc->wr_cqe, struct rtrs_iu,
0508 cqe);
0509 err = rtrs_iu_post_recv(&con->c, iu);
0510 if (err) {
0511 rtrs_err(con->c.path, "post iu failed %d\n", err);
0512 rtrs_rdma_error_recovery(con);
0513 }
0514 }
0515
0516 static void rtrs_clt_rkey_rsp_done(struct rtrs_clt_con *con, struct ib_wc *wc)
0517 {
0518 struct rtrs_clt_path *clt_path = to_clt_path(con->c.path);
0519 struct rtrs_msg_rkey_rsp *msg;
0520 u32 imm_type, imm_payload;
0521 bool w_inval = false;
0522 struct rtrs_iu *iu;
0523 u32 buf_id;
0524 int err;
0525
0526 WARN_ON((clt_path->flags & RTRS_MSG_NEW_RKEY_F) == 0);
0527
0528 iu = container_of(wc->wr_cqe, struct rtrs_iu, cqe);
0529
0530 if (wc->byte_len < sizeof(*msg)) {
0531 rtrs_err(con->c.path, "rkey response is malformed: size %d\n",
0532 wc->byte_len);
0533 goto out;
0534 }
0535 ib_dma_sync_single_for_cpu(clt_path->s.dev->ib_dev, iu->dma_addr,
0536 iu->size, DMA_FROM_DEVICE);
0537 msg = iu->buf;
0538 if (le16_to_cpu(msg->type) != RTRS_MSG_RKEY_RSP) {
0539 rtrs_err(clt_path->clt,
0540 "rkey response is malformed: type %d\n",
0541 le16_to_cpu(msg->type));
0542 goto out;
0543 }
0544 buf_id = le16_to_cpu(msg->buf_id);
0545 if (WARN_ON(buf_id >= clt_path->queue_depth))
0546 goto out;
0547
0548 rtrs_from_imm(be32_to_cpu(wc->ex.imm_data), &imm_type, &imm_payload);
0549 if (imm_type == RTRS_IO_RSP_IMM ||
0550 imm_type == RTRS_IO_RSP_W_INV_IMM) {
0551 u32 msg_id;
0552
0553 w_inval = (imm_type == RTRS_IO_RSP_W_INV_IMM);
0554 rtrs_from_io_rsp_imm(imm_payload, &msg_id, &err);
0555
0556 if (WARN_ON(buf_id != msg_id))
0557 goto out;
0558 clt_path->rbufs[buf_id].rkey = le32_to_cpu(msg->rkey);
0559 process_io_rsp(clt_path, msg_id, err, w_inval);
0560 }
0561 ib_dma_sync_single_for_device(clt_path->s.dev->ib_dev, iu->dma_addr,
0562 iu->size, DMA_FROM_DEVICE);
0563 return rtrs_clt_recv_done(con, wc);
0564 out:
0565 rtrs_rdma_error_recovery(con);
0566 }
0567
0568 static void rtrs_clt_rdma_done(struct ib_cq *cq, struct ib_wc *wc);
0569
0570 static struct ib_cqe io_comp_cqe = {
0571 .done = rtrs_clt_rdma_done
0572 };
0573
/*
 * Post two empty receive WRs at once: one for the RDMA with immediate
 * that just completed and one for the RECV with invalidate that
 * preceded it.
 */
0578 static int rtrs_post_recv_empty_x2(struct rtrs_con *con, struct ib_cqe *cqe)
0579 {
0580 struct ib_recv_wr wr_arr[2], *wr;
0581 int i;
0582
0583 memset(wr_arr, 0, sizeof(wr_arr));
0584 for (i = 0; i < ARRAY_SIZE(wr_arr); i++) {
0585 wr = &wr_arr[i];
0586 wr->wr_cqe = cqe;
0587 if (i)
/* chain the WRs so both are posted with a single ib_post_recv() */
0589 wr->next = &wr_arr[i - 1];
0590 }
0591
0592 return ib_post_recv(con->qp, wr, NULL);
0593 }
0594
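/*
 * rtrs_clt_rdma_done() - main completion handler of the IO connections.
 * Dispatches on the work completion opcode: RDMA-write-with-immediate
 * receives carry IO responses and heartbeats (decoded from the immediate
 * value), plain receives are rkey responses or server-side invalidations,
 * and RDMA write send completions only matter for the periodic signalling.
 * Any error pushes the path into error recovery.
 */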
0595 static void rtrs_clt_rdma_done(struct ib_cq *cq, struct ib_wc *wc)
0596 {
0597 struct rtrs_clt_con *con = to_clt_con(wc->qp->qp_context);
0598 struct rtrs_clt_path *clt_path = to_clt_path(con->c.path);
0599 u32 imm_type, imm_payload;
0600 bool w_inval = false;
0601 int err;
0602
0603 if (wc->status != IB_WC_SUCCESS) {
0604 if (wc->status != IB_WC_WR_FLUSH_ERR) {
0605 rtrs_err(clt_path->clt, "RDMA failed: %s\n",
0606 ib_wc_status_msg(wc->status));
0607 rtrs_rdma_error_recovery(con);
0608 }
0609 return;
0610 }
0611 rtrs_clt_update_wc_stats(con);
0612
0613 switch (wc->opcode) {
0614 case IB_WC_RECV_RDMA_WITH_IMM:
/*
 * post_recv() completions of RDMA writes with immediate:
 * IO responses (read/write) and heartbeats.
 */
0619 if (WARN_ON(wc->wr_cqe->done != rtrs_clt_rdma_done))
0620 return;
0621 rtrs_from_imm(be32_to_cpu(wc->ex.imm_data),
0622 &imm_type, &imm_payload);
0623 if (imm_type == RTRS_IO_RSP_IMM ||
0624 imm_type == RTRS_IO_RSP_W_INV_IMM) {
0625 u32 msg_id;
0626
0627 w_inval = (imm_type == RTRS_IO_RSP_W_INV_IMM);
0628 rtrs_from_io_rsp_imm(imm_payload, &msg_id, &err);
0629
0630 process_io_rsp(clt_path, msg_id, err, w_inval);
0631 } else if (imm_type == RTRS_HB_MSG_IMM) {
0632 WARN_ON(con->c.cid);
0633 rtrs_send_hb_ack(&clt_path->s);
0634 if (clt_path->flags & RTRS_MSG_NEW_RKEY_F)
0635 return rtrs_clt_recv_done(con, wc);
0636 } else if (imm_type == RTRS_HB_ACK_IMM) {
0637 WARN_ON(con->c.cid);
0638 clt_path->s.hb_missed_cnt = 0;
0639 clt_path->s.hb_cur_latency =
0640 ktime_sub(ktime_get(), clt_path->s.hb_last_sent);
0641 if (clt_path->flags & RTRS_MSG_NEW_RKEY_F)
0642 return rtrs_clt_recv_done(con, wc);
0643 } else {
0644 rtrs_wrn(con->c.path, "Unknown IMM type %u\n",
0645 imm_type);
0646 }
0647 if (w_inval)
/*
 * Post two empty receive WRs: one for this RDMA with immediate,
 * one for the RECV with invalidate which happened earlier.
 */
0652 err = rtrs_post_recv_empty_x2(&con->c, &io_comp_cqe);
0653 else
0654 err = rtrs_post_recv_empty(&con->c, &io_comp_cqe);
0655 if (err) {
0656 rtrs_err(con->c.path, "rtrs_post_recv_empty(): %d\n",
0657 err);
0658 rtrs_rdma_error_recovery(con);
0659 }
0660 break;
0661 case IB_WC_RECV:
/*
 * Key invalidations from the server side.
 */
0665 WARN_ON(!(wc->wc_flags & IB_WC_WITH_INVALIDATE ||
0666 wc->wc_flags & IB_WC_WITH_IMM));
0667 WARN_ON(wc->wr_cqe->done != rtrs_clt_rdma_done);
0668 if (clt_path->flags & RTRS_MSG_NEW_RKEY_F) {
0669 if (wc->wc_flags & IB_WC_WITH_INVALIDATE)
0670 return rtrs_clt_recv_done(con, wc);
0671
0672 return rtrs_clt_rkey_rsp_done(con, wc);
0673 }
0674 break;
0675 case IB_WC_RDMA_WRITE:
/*
 * post_send() completions of RDMA writes (IO requests and heartbeats),
 * nothing to do here.
 */
0680 break;
0681
0682 default:
0683 rtrs_wrn(clt_path->clt, "Unexpected WC type: %d\n", wc->opcode);
0684 return;
0685 }
0686 }
0687
0688 static int post_recv_io(struct rtrs_clt_con *con, size_t q_size)
0689 {
0690 int err, i;
0691 struct rtrs_clt_path *clt_path = to_clt_path(con->c.path);
0692
0693 for (i = 0; i < q_size; i++) {
0694 if (clt_path->flags & RTRS_MSG_NEW_RKEY_F) {
0695 struct rtrs_iu *iu = &con->rsp_ius[i];
0696
0697 err = rtrs_iu_post_recv(&con->c, iu);
0698 } else {
0699 err = rtrs_post_recv_empty(&con->c, &io_comp_cqe);
0700 }
0701 if (err)
0702 return err;
0703 }
0704
0705 return 0;
0706 }
0707
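/*
 * post_recv_path() - pre-post receive buffers on every connection of the
 * path: SERVICE_CON_QUEUE_DEPTH for the user (cid 0) connection and
 * queue_depth for each IO connection, doubled to leave room for both the
 * RDMA read responses and the FR key invalidations.
 */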
0708 static int post_recv_path(struct rtrs_clt_path *clt_path)
0709 {
0710 size_t q_size = 0;
0711 int err, cid;
0712
0713 for (cid = 0; cid < clt_path->s.con_num; cid++) {
0714 if (cid == 0)
0715 q_size = SERVICE_CON_QUEUE_DEPTH;
0716 else
0717 q_size = clt_path->queue_depth;
0718
/*
 * x2 for the RDMA read responses plus the FR key invalidations;
 * RDMA writes do not require any FR registrations.
 */
0723 q_size *= 2;
0724
0725 err = post_recv_io(to_clt_con(clt_path->s.con[cid]), q_size);
0726 if (err) {
0727 rtrs_err(clt_path->clt, "post_recv_io(), err: %d\n",
0728 err);
0729 return err;
0730 }
0731 }
0732
0733 return 0;
0734 }
0735
0736 struct path_it {
0737 int i;
0738 struct list_head skip_list;
0739 struct rtrs_clt_sess *clt;
0740 struct rtrs_clt_path *(*next_path)(struct path_it *it);
0741 };
0742
/*
 * rtrs_clt_get_next_path_or_null() - get the next path from the list or NULL
 * @head:	head of the paths list
 * @clt_path:	the element to take the next path from
 *
 * The next path is returned in round-robin fashion, i.e. @head itself is
 * skipped, but if the list is observed as empty, NULL is returned.
 *
 * This may safely run concurrently with the _rcu list-mutation primitives
 * such as list_add_rcu() as long as it is guarded by rcu_read_lock().
 */
0754 static inline struct rtrs_clt_path *
0755 rtrs_clt_get_next_path_or_null(struct list_head *head, struct rtrs_clt_path *clt_path)
0756 {
0757 return list_next_or_null_rcu(head, &clt_path->s.entry, typeof(*clt_path), s.entry) ?:
0758 list_next_or_null_rcu(head,
0759 READ_ONCE((&clt_path->s.entry)->next),
0760 typeof(*clt_path), s.entry);
0761 }
0762
/**
 * get_next_path_rr() - return a path in round-robin fashion
 * @it:	the path iterator
 *
 * Related to @MP_POLICY_RR.
 *
 * Locks: rcu_read_lock() must be held.
 */
0772 static struct rtrs_clt_path *get_next_path_rr(struct path_it *it)
0773 {
0774 struct rtrs_clt_path __rcu **ppcpu_path;
0775 struct rtrs_clt_path *path;
0776 struct rtrs_clt_sess *clt;
0777
0778 clt = it->clt;
0779
/*
 * Two RCU-protected objects are used here: @paths_list and the per-CPU
 * @pcpu_path pointer.  See rtrs_clt_remove_path_from_arr() for details
 * on how their updates are ordered.
 */
0785
0786 ppcpu_path = this_cpu_ptr(clt->pcpu_path);
0787 path = rcu_dereference(*ppcpu_path);
0788 if (!path)
0789 path = list_first_or_null_rcu(&clt->paths_list,
0790 typeof(*path), s.entry);
0791 else
0792 path = rtrs_clt_get_next_path_or_null(&clt->paths_list, path);
0793
0794 rcu_assign_pointer(*ppcpu_path, path);
0795
0796 return path;
0797 }
0798
/**
 * get_next_path_min_inflight() - return the path with the fewest inflight IOs
 * @it:	the path iterator
 *
 * Related to @MP_POLICY_MIN_INFLIGHT.
 *
 * Locks: rcu_read_lock() must be held.
 */
0808 static struct rtrs_clt_path *get_next_path_min_inflight(struct path_it *it)
0809 {
0810 struct rtrs_clt_path *min_path = NULL;
0811 struct rtrs_clt_sess *clt = it->clt;
0812 struct rtrs_clt_path *clt_path;
0813 int min_inflight = INT_MAX;
0814 int inflight;
0815
0816 list_for_each_entry_rcu(clt_path, &clt->paths_list, s.entry) {
0817 if (READ_ONCE(clt_path->state) != RTRS_CLT_CONNECTED)
0818 continue;
0819
0820 if (!list_empty(raw_cpu_ptr(clt_path->mp_skip_entry)))
0821 continue;
0822
0823 inflight = atomic_read(&clt_path->stats->inflight);
0824
0825 if (inflight < min_inflight) {
0826 min_inflight = inflight;
0827 min_path = clt_path;
0828 }
0829 }
0830
/*
 * Add the path to the skip list, so that next time we can get
 * a different one.
 */
0835 if (min_path)
0836 list_add(raw_cpu_ptr(min_path->mp_skip_entry), &it->skip_list);
0837
0838 return min_path;
0839 }
0840
/**
 * get_next_path_min_latency() - return the path with the lowest latency
 * @it:	the path iterator
 *
 * Return: the path with the lowest heartbeat latency, or NULL if all
 * paths have already been tried.
 *
 * Related to @MP_POLICY_MIN_LATENCY.
 *
 * Locks: rcu_read_lock() must be held.
 *
 * Unlike round-robin this DOES skip already-tried paths: a skip list
 * records paths that were tried but failed, so the minimum latency path
 * is tried first and then the next lowest one, regardless of the number
 * of tries.
 */
0859 static struct rtrs_clt_path *get_next_path_min_latency(struct path_it *it)
0860 {
0861 struct rtrs_clt_path *min_path = NULL;
0862 struct rtrs_clt_sess *clt = it->clt;
0863 struct rtrs_clt_path *clt_path;
0864 ktime_t min_latency = KTIME_MAX;
0865 ktime_t latency;
0866
0867 list_for_each_entry_rcu(clt_path, &clt->paths_list, s.entry) {
0868 if (READ_ONCE(clt_path->state) != RTRS_CLT_CONNECTED)
0869 continue;
0870
0871 if (!list_empty(raw_cpu_ptr(clt_path->mp_skip_entry)))
0872 continue;
0873
0874 latency = clt_path->s.hb_cur_latency;
0875
0876 if (latency < min_latency) {
0877 min_latency = latency;
0878 min_path = clt_path;
0879 }
0880 }
0881
/*
 * Add the path to the skip list, so that next time we can get
 * a different one.
 */
0886 if (min_path)
0887 list_add(raw_cpu_ptr(min_path->mp_skip_entry), &it->skip_list);
0888
0889 return min_path;
0890 }
0891
0892 static inline void path_it_init(struct path_it *it, struct rtrs_clt_sess *clt)
0893 {
0894 INIT_LIST_HEAD(&it->skip_list);
0895 it->clt = clt;
0896 it->i = 0;
0897
0898 if (clt->mp_policy == MP_POLICY_RR)
0899 it->next_path = get_next_path_rr;
0900 else if (clt->mp_policy == MP_POLICY_MIN_INFLIGHT)
0901 it->next_path = get_next_path_min_inflight;
0902 else
0903 it->next_path = get_next_path_min_latency;
0904 }
0905
0906 static inline void path_it_deinit(struct path_it *it)
0907 {
0908 struct list_head *skip, *tmp;
0909
/*
 * The skip list is used only by the MIN_INFLIGHT and MIN_LATENCY policies.
 * Paths must be removed from it here so that the next IO can link its
 * ->mp_skip_entry into a skip list again.
 */
0914 list_for_each_safe(skip, tmp, &it->skip_list)
0915 list_del_init(skip);
0916 }
0917
/**
 * rtrs_clt_init_req() - initialize an rtrs_clt_io_req describing an inflight IO
 * @req:	the io request to initialize
 * @clt_path:	client path
 * @conf:	confirmation callback used to notify the upper layer
 * @permit:	permit for the allocation of the remote RDMA buffer
 * @priv:	private pointer passed back to @conf
 * @vec:	kernel vector containing the user control message
 * @usr_len:	length of the user message
 * @sg:		scatterlist containing the data
 * @sg_cnt:	number of scatterlist entries
 * @data_len:	length of the data
 * @dir:	direction of the IO
 *
 * The user buffer holding the control message (not the data) is copied into
 * the corresponding rtrs_iu buffer (req->iu->buf), which later also carries
 * the rtrs header.
 */
0936 static void rtrs_clt_init_req(struct rtrs_clt_io_req *req,
0937 struct rtrs_clt_path *clt_path,
0938 void (*conf)(void *priv, int errno),
0939 struct rtrs_permit *permit, void *priv,
0940 const struct kvec *vec, size_t usr_len,
0941 struct scatterlist *sg, size_t sg_cnt,
0942 size_t data_len, int dir)
0943 {
0944 struct iov_iter iter;
0945 size_t len;
0946
0947 req->permit = permit;
0948 req->in_use = true;
0949 req->usr_len = usr_len;
0950 req->data_len = data_len;
0951 req->sglist = sg;
0952 req->sg_cnt = sg_cnt;
0953 req->priv = priv;
0954 req->dir = dir;
0955 req->con = rtrs_permit_to_clt_con(clt_path, permit);
0956 req->conf = conf;
0957 req->need_inv = false;
0958 req->need_inv_comp = false;
0959 req->inv_errno = 0;
0960 refcount_set(&req->ref, 1);
0961 req->mp_policy = clt_path->clt->mp_policy;
0962
0963 iov_iter_kvec(&iter, READ, vec, 1, usr_len);
0964 len = _copy_from_iter(req->iu->buf, usr_len, &iter);
0965 WARN_ON(len != usr_len);
0966
0967 reinit_completion(&req->inv_comp);
0968 }
0969
0970 static struct rtrs_clt_io_req *
0971 rtrs_clt_get_req(struct rtrs_clt_path *clt_path,
0972 void (*conf)(void *priv, int errno),
0973 struct rtrs_permit *permit, void *priv,
0974 const struct kvec *vec, size_t usr_len,
0975 struct scatterlist *sg, size_t sg_cnt,
0976 size_t data_len, int dir)
0977 {
0978 struct rtrs_clt_io_req *req;
0979
0980 req = &clt_path->reqs[permit->mem_id];
0981 rtrs_clt_init_req(req, clt_path, conf, permit, priv, vec, usr_len,
0982 sg, sg_cnt, data_len, dir);
0983 return req;
0984 }
0985
0986 static struct rtrs_clt_io_req *
0987 rtrs_clt_get_copy_req(struct rtrs_clt_path *alive_path,
0988 struct rtrs_clt_io_req *fail_req)
0989 {
0990 struct rtrs_clt_io_req *req;
0991 struct kvec vec = {
0992 .iov_base = fail_req->iu->buf,
0993 .iov_len = fail_req->usr_len
0994 };
0995
0996 req = &alive_path->reqs[fail_req->permit->mem_id];
0997 rtrs_clt_init_req(req, alive_path, fail_req->conf, fail_req->permit,
0998 fail_req->priv, &vec, fail_req->usr_len,
0999 fail_req->sglist, fail_req->sg_cnt,
1000 fail_req->data_len, fail_req->dir);
1001 return req;
1002 }
1003
1004 static int rtrs_post_rdma_write_sg(struct rtrs_clt_con *con,
1005 struct rtrs_clt_io_req *req,
1006 struct rtrs_rbuf *rbuf, bool fr_en,
1007 u32 count, u32 size, u32 imm,
1008 struct ib_send_wr *wr,
1009 struct ib_send_wr *tail)
1010 {
1011 struct rtrs_clt_path *clt_path = to_clt_path(con->c.path);
1012 struct ib_sge *sge = req->sge;
1013 enum ib_send_flags flags;
1014 struct scatterlist *sg;
1015 size_t num_sge;
1016 int i;
1017 struct ib_send_wr *ptail = NULL;
1018
1019 if (fr_en) {
1020 i = 0;
1021 sge[i].addr = req->mr->iova;
1022 sge[i].length = req->mr->length;
1023 sge[i].lkey = req->mr->lkey;
1024 i++;
1025 num_sge = 2;
1026 ptail = tail;
1027 } else {
1028 for_each_sg(req->sglist, sg, count, i) {
1029 sge[i].addr = sg_dma_address(sg);
1030 sge[i].length = sg_dma_len(sg);
1031 sge[i].lkey = clt_path->s.dev->ib_pd->local_dma_lkey;
1032 }
1033 num_sge = 1 + count;
1034 }
1035 sge[i].addr = req->iu->dma_addr;
1036 sge[i].length = size;
1037 sge[i].lkey = clt_path->s.dev->ib_pd->local_dma_lkey;
1038
/*
 * From time to time we have to post a signalled send,
 * otherwise the send queue fills up and only a QP reset can help.
 */
1043 flags = atomic_inc_return(&con->c.wr_cnt) % clt_path->s.signal_interval ?
1044 0 : IB_SEND_SIGNALED;
1045
1046 ib_dma_sync_single_for_device(clt_path->s.dev->ib_dev,
1047 req->iu->dma_addr,
1048 size, DMA_TO_DEVICE);
1049
1050 return rtrs_iu_post_rdma_write_imm(&con->c, req->iu, sge, num_sge,
1051 rbuf->rkey, rbuf->addr, imm,
1052 flags, wr, ptail);
1053 }
1054
1055 static int rtrs_map_sg_fr(struct rtrs_clt_io_req *req, size_t count)
1056 {
1057 int nr;
1058
/* Align the MR to a 4K page size to match the block virt boundary */
1060 nr = ib_map_mr_sg(req->mr, req->sglist, count, NULL, SZ_4K);
1061 if (nr < 0)
1062 return nr;
1063 if (nr < req->sg_cnt)
1064 return -EINVAL;
1065 ib_update_fast_reg_key(req->mr, ib_inc_rkey(req->mr->rkey));
1066
1067 return nr;
1068 }
1069
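/*
 * rtrs_clt_write_req() - send a write request.  The user message and the
 * rtrs_msg_rdma_write header travel in the last SGE of an RDMA write with
 * immediate; when a data scatterlist is present it is fast-registered first
 * (the IB_WR_REG_MR WR is chained in front, a local invalidate chained
 * behind), and the immediate value encodes the chosen remote buffer so the
 * server can place the data without an extra round trip.
 */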
1070 static int rtrs_clt_write_req(struct rtrs_clt_io_req *req)
1071 {
1072 struct rtrs_clt_con *con = req->con;
1073 struct rtrs_path *s = con->c.path;
1074 struct rtrs_clt_path *clt_path = to_clt_path(s);
1075 struct rtrs_msg_rdma_write *msg;
1076
1077 struct rtrs_rbuf *rbuf;
1078 int ret, count = 0;
1079 u32 imm, buf_id;
1080 struct ib_reg_wr rwr;
1081 struct ib_send_wr inv_wr;
1082 struct ib_send_wr *wr = NULL;
1083 bool fr_en = false;
1084
1085 const size_t tsize = sizeof(*msg) + req->data_len + req->usr_len;
1086
1087 if (tsize > clt_path->chunk_size) {
1088 rtrs_wrn(s, "Write request failed, size too big %zu > %d\n",
1089 tsize, clt_path->chunk_size);
1090 return -EMSGSIZE;
1091 }
1092 if (req->sg_cnt) {
1093 count = ib_dma_map_sg(clt_path->s.dev->ib_dev, req->sglist,
1094 req->sg_cnt, req->dir);
1095 if (!count) {
1096 rtrs_wrn(s, "Write request failed, map failed\n");
1097 return -EINVAL;
1098 }
1099 }
1100
1101 msg = req->iu->buf + req->usr_len;
1102 msg->type = cpu_to_le16(RTRS_MSG_WRITE);
1103 msg->usr_len = cpu_to_le16(req->usr_len);
1104
/* the rtrs message on the server side sits after the user data and message */
1106 imm = req->permit->mem_off + req->data_len + req->usr_len;
1107 imm = rtrs_to_io_req_imm(imm);
1108 buf_id = req->permit->mem_id;
1109 req->sg_size = tsize;
1110 rbuf = &clt_path->rbufs[buf_id];
1111
1112 if (count) {
1113 ret = rtrs_map_sg_fr(req, count);
1114 if (ret < 0) {
1115 rtrs_err_rl(s,
1116 "Write request failed, failed to map fast reg. data, err: %d\n",
1117 ret);
1118 ib_dma_unmap_sg(clt_path->s.dev->ib_dev, req->sglist,
1119 req->sg_cnt, req->dir);
1120 return ret;
1121 }
1122 inv_wr = (struct ib_send_wr) {
1123 .opcode = IB_WR_LOCAL_INV,
1124 .wr_cqe = &req->inv_cqe,
1125 .send_flags = IB_SEND_SIGNALED,
1126 .ex.invalidate_rkey = req->mr->rkey,
1127 };
1128 req->inv_cqe.done = rtrs_clt_inv_rkey_done;
1129 rwr = (struct ib_reg_wr) {
1130 .wr.opcode = IB_WR_REG_MR,
1131 .wr.wr_cqe = &fast_reg_cqe,
1132 .mr = req->mr,
1133 .key = req->mr->rkey,
1134 .access = (IB_ACCESS_LOCAL_WRITE),
1135 };
1136 wr = &rwr.wr;
1137 fr_en = true;
1138 refcount_inc(&req->ref);
1139 }
1140
/*
 * Update the stats now; after the request is successfully sent it is
 * no longer safe to touch it.
 */
1144 rtrs_clt_update_all_stats(req, WRITE);
1145
1146 ret = rtrs_post_rdma_write_sg(req->con, req, rbuf, fr_en, count,
1147 req->usr_len + sizeof(*msg),
1148 imm, wr, &inv_wr);
1149 if (ret) {
1150 rtrs_err_rl(s,
1151 "Write request failed: error=%d path=%s [%s:%u]\n",
1152 ret, kobject_name(&clt_path->kobj), clt_path->hca_name,
1153 clt_path->hca_port);
1154 if (req->mp_policy == MP_POLICY_MIN_INFLIGHT)
1155 atomic_dec(&clt_path->stats->inflight);
1156 if (req->sg_cnt)
1157 ib_dma_unmap_sg(clt_path->s.dev->ib_dev, req->sglist,
1158 req->sg_cnt, req->dir);
1159 }
1160
1161 return ret;
1162 }
1163
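/*
 * rtrs_clt_read_req() - send a read request.  The data pages are
 * fast-registered and described to the server in rtrs_msg_rdma_read
 * (addr/rkey/len of the MR), so the server can RDMA-write the data straight
 * into the client buffers; RTRS_MSG_NEED_INVAL_F asks the server to send
 * the response with invalidate, otherwise the client invalidates the rkey
 * itself in complete_rdma_req().
 */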
1164 static int rtrs_clt_read_req(struct rtrs_clt_io_req *req)
1165 {
1166 struct rtrs_clt_con *con = req->con;
1167 struct rtrs_path *s = con->c.path;
1168 struct rtrs_clt_path *clt_path = to_clt_path(s);
1169 struct rtrs_msg_rdma_read *msg;
1170 struct rtrs_ib_dev *dev = clt_path->s.dev;
1171
1172 struct ib_reg_wr rwr;
1173 struct ib_send_wr *wr = NULL;
1174
1175 int ret, count = 0;
1176 u32 imm, buf_id;
1177
1178 const size_t tsize = sizeof(*msg) + req->data_len + req->usr_len;
1179
1180 if (tsize > clt_path->chunk_size) {
1181 rtrs_wrn(s,
1182 "Read request failed, message size is %zu, bigger than CHUNK_SIZE %d\n",
1183 tsize, clt_path->chunk_size);
1184 return -EMSGSIZE;
1185 }
1186
1187 if (req->sg_cnt) {
1188 count = ib_dma_map_sg(dev->ib_dev, req->sglist, req->sg_cnt,
1189 req->dir);
1190 if (!count) {
1191 rtrs_wrn(s,
1192 "Read request failed, dma map failed\n");
1193 return -EINVAL;
1194 }
1195 }
1196
1197 msg = req->iu->buf + req->usr_len;
1198 msg->type = cpu_to_le16(RTRS_MSG_READ);
1199 msg->usr_len = cpu_to_le16(req->usr_len);
1200
1201 if (count) {
1202 ret = rtrs_map_sg_fr(req, count);
1203 if (ret < 0) {
1204 rtrs_err_rl(s,
1205 "Read request failed, failed to map fast reg. data, err: %d\n",
1206 ret);
1207 ib_dma_unmap_sg(dev->ib_dev, req->sglist, req->sg_cnt,
1208 req->dir);
1209 return ret;
1210 }
1211 rwr = (struct ib_reg_wr) {
1212 .wr.opcode = IB_WR_REG_MR,
1213 .wr.wr_cqe = &fast_reg_cqe,
1214 .mr = req->mr,
1215 .key = req->mr->rkey,
1216 .access = (IB_ACCESS_LOCAL_WRITE |
1217 IB_ACCESS_REMOTE_WRITE),
1218 };
1219 wr = &rwr.wr;
1220
1221 msg->sg_cnt = cpu_to_le16(1);
1222 msg->flags = cpu_to_le16(RTRS_MSG_NEED_INVAL_F);
1223
1224 msg->desc[0].addr = cpu_to_le64(req->mr->iova);
1225 msg->desc[0].key = cpu_to_le32(req->mr->rkey);
1226 msg->desc[0].len = cpu_to_le32(req->mr->length);
1227
/* Further invalidation is required */
1229 req->need_inv = !!RTRS_MSG_NEED_INVAL_F;
1230
1231 } else {
1232 msg->sg_cnt = 0;
1233 msg->flags = 0;
1234 }
1235
/*
 * The rtrs message will be placed after the space reserved for the
 * disk data and the user message.
 */
1239 imm = req->permit->mem_off + req->data_len + req->usr_len;
1240 imm = rtrs_to_io_req_imm(imm);
1241 buf_id = req->permit->mem_id;
1242
1243 req->sg_size = sizeof(*msg);
1244 req->sg_size += le16_to_cpu(msg->sg_cnt) * sizeof(struct rtrs_sg_desc);
1245 req->sg_size += req->usr_len;
1246
/*
 * Update the stats now; after the request is successfully sent it is
 * no longer safe to touch it.
 */
1251 rtrs_clt_update_all_stats(req, READ);
1252
1253 ret = rtrs_post_send_rdma(req->con, req, &clt_path->rbufs[buf_id],
1254 req->data_len, imm, wr);
1255 if (ret) {
1256 rtrs_err_rl(s,
1257 "Read request failed: error=%d path=%s [%s:%u]\n",
1258 ret, kobject_name(&clt_path->kobj), clt_path->hca_name,
1259 clt_path->hca_port);
1260 if (req->mp_policy == MP_POLICY_MIN_INFLIGHT)
1261 atomic_dec(&clt_path->stats->inflight);
1262 req->need_inv = false;
1263 if (req->sg_cnt)
1264 ib_dma_unmap_sg(dev->ib_dev, req->sglist,
1265 req->sg_cnt, req->dir);
1266 }
1267
1268 return ret;
1269 }
1270
/**
 * rtrs_clt_failover_req() - retry a failed request on another connected path
 * @clt:	clt context
 * @fail_req:	the failed io request
 */
1276 static int rtrs_clt_failover_req(struct rtrs_clt_sess *clt,
1277 struct rtrs_clt_io_req *fail_req)
1278 {
1279 struct rtrs_clt_path *alive_path;
1280 struct rtrs_clt_io_req *req;
1281 int err = -ECONNABORTED;
1282 struct path_it it;
1283
1284 rcu_read_lock();
1285 for (path_it_init(&it, clt);
1286 (alive_path = it.next_path(&it)) && it.i < it.clt->paths_num;
1287 it.i++) {
1288 if (READ_ONCE(alive_path->state) != RTRS_CLT_CONNECTED)
1289 continue;
1290 req = rtrs_clt_get_copy_req(alive_path, fail_req);
1291 if (req->dir == DMA_TO_DEVICE)
1292 err = rtrs_clt_write_req(req);
1293 else
1294 err = rtrs_clt_read_req(req);
1295 if (err) {
1296 req->in_use = false;
1297 continue;
1298 }
1299
1300 rtrs_clt_inc_failover_cnt(alive_path->stats);
1301 break;
1302 }
1303 path_it_deinit(&it);
1304 rcu_read_unlock();
1305
1306 return err;
1307 }
1308
1309 static void fail_all_outstanding_reqs(struct rtrs_clt_path *clt_path)
1310 {
1311 struct rtrs_clt_sess *clt = clt_path->clt;
1312 struct rtrs_clt_io_req *req;
1313 int i, err;
1314
1315 if (!clt_path->reqs)
1316 return;
1317 for (i = 0; i < clt_path->queue_depth; ++i) {
1318 req = &clt_path->reqs[i];
1319 if (!req->in_use)
1320 continue;
1321
/*
 * Safely (without notification) complete the failed request;
 * the failover below requeues it on another path.
 */
1327 complete_rdma_req(req, -ECONNABORTED, false, true);
1328
1329 err = rtrs_clt_failover_req(clt, req);
1330 if (err)
/* Failover failed, notify anyway */
1332 req->conf(req->priv, err);
1333 }
1334 }
1335
1336 static void free_path_reqs(struct rtrs_clt_path *clt_path)
1337 {
1338 struct rtrs_clt_io_req *req;
1339 int i;
1340
1341 if (!clt_path->reqs)
1342 return;
1343 for (i = 0; i < clt_path->queue_depth; ++i) {
1344 req = &clt_path->reqs[i];
1345 if (req->mr)
1346 ib_dereg_mr(req->mr);
1347 kfree(req->sge);
1348 rtrs_iu_free(req->iu, clt_path->s.dev->ib_dev, 1);
1349 }
1350 kfree(clt_path->reqs);
1351 clt_path->reqs = NULL;
1352 }
1353
1354 static int alloc_path_reqs(struct rtrs_clt_path *clt_path)
1355 {
1356 struct rtrs_clt_io_req *req;
1357 int i, err = -ENOMEM;
1358
1359 clt_path->reqs = kcalloc(clt_path->queue_depth,
1360 sizeof(*clt_path->reqs),
1361 GFP_KERNEL);
1362 if (!clt_path->reqs)
1363 return -ENOMEM;
1364
1365 for (i = 0; i < clt_path->queue_depth; ++i) {
1366 req = &clt_path->reqs[i];
1367 req->iu = rtrs_iu_alloc(1, clt_path->max_hdr_size, GFP_KERNEL,
1368 clt_path->s.dev->ib_dev,
1369 DMA_TO_DEVICE,
1370 rtrs_clt_rdma_done);
1371 if (!req->iu)
1372 goto out;
1373
1374 req->sge = kcalloc(2, sizeof(*req->sge), GFP_KERNEL);
1375 if (!req->sge)
1376 goto out;
1377
1378 req->mr = ib_alloc_mr(clt_path->s.dev->ib_pd,
1379 IB_MR_TYPE_MEM_REG,
1380 clt_path->max_pages_per_mr);
1381 if (IS_ERR(req->mr)) {
1382 err = PTR_ERR(req->mr);
1383 req->mr = NULL;
1384 pr_err("Failed to alloc clt_path->max_pages_per_mr %d\n",
1385 clt_path->max_pages_per_mr);
1386 goto out;
1387 }
1388
1389 init_completion(&req->inv_comp);
1390 }
1391
1392 return 0;
1393
1394 out:
1395 free_path_reqs(clt_path);
1396
1397 return err;
1398 }
1399
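/*
 * alloc_permits() - allocate the permit array and its free-bit map.
 * Each permit owns a fixed slice of the remote buffer space: mem_off is the
 * chunk index pre-shifted so that "mem_off + offset inside the chunk" fits
 * into the immediate payload.  For example (illustrative numbers only), with
 * queue_depth = 128, chunk_bits = ilog2(127) + 1 = 7, so permit i gets
 * mem_off = i << (MAX_IMM_PAYL_BITS - 7).
 */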
1400 static int alloc_permits(struct rtrs_clt_sess *clt)
1401 {
1402 unsigned int chunk_bits;
1403 int err, i;
1404
1405 clt->permits_map = bitmap_zalloc(clt->queue_depth, GFP_KERNEL);
1406 if (!clt->permits_map) {
1407 err = -ENOMEM;
1408 goto out_err;
1409 }
1410 clt->permits = kcalloc(clt->queue_depth, permit_size(clt), GFP_KERNEL);
1411 if (!clt->permits) {
1412 err = -ENOMEM;
1413 goto err_map;
1414 }
1415 chunk_bits = ilog2(clt->queue_depth - 1) + 1;
1416 for (i = 0; i < clt->queue_depth; i++) {
1417 struct rtrs_permit *permit;
1418
1419 permit = get_permit(clt, i);
1420 permit->mem_id = i;
1421 permit->mem_off = i << (MAX_IMM_PAYL_BITS - chunk_bits);
1422 }
1423
1424 return 0;
1425
1426 err_map:
1427 bitmap_free(clt->permits_map);
1428 clt->permits_map = NULL;
1429 out_err:
1430 return err;
1431 }
1432
1433 static void free_permits(struct rtrs_clt_sess *clt)
1434 {
1435 if (clt->permits_map)
1436 wait_event(clt->permits_wait,
1437 bitmap_empty(clt->permits_map, clt->queue_depth));
1438
1439 bitmap_free(clt->permits_map);
1440 clt->permits_map = NULL;
1441 kfree(clt->permits);
1442 clt->permits = NULL;
1443 }
1444
1445 static void query_fast_reg_mode(struct rtrs_clt_path *clt_path)
1446 {
1447 struct ib_device *ib_dev;
1448 u64 max_pages_per_mr;
1449 int mr_page_shift;
1450
1451 ib_dev = clt_path->s.dev->ib_dev;
1452
/*
 * Use the smallest page size supported by the HCA, down to a minimum
 * of 4096 bytes.  We are unlikely to build large sglists out of
 * smaller entries.
 */
1458 mr_page_shift = max(12, ffs(ib_dev->attrs.page_size_cap) - 1);
1459 max_pages_per_mr = ib_dev->attrs.max_mr_size;
1460 do_div(max_pages_per_mr, (1ull << mr_page_shift));
1461 clt_path->max_pages_per_mr =
1462 min3(clt_path->max_pages_per_mr, (u32)max_pages_per_mr,
1463 ib_dev->attrs.max_fast_reg_page_list_len);
1464 clt_path->clt->max_segments =
1465 min(clt_path->max_pages_per_mr, clt_path->clt->max_segments);
1466 }
1467
1468 static bool rtrs_clt_change_state_get_old(struct rtrs_clt_path *clt_path,
1469 enum rtrs_clt_state new_state,
1470 enum rtrs_clt_state *old_state)
1471 {
1472 bool changed;
1473
1474 spin_lock_irq(&clt_path->state_wq.lock);
1475 if (old_state)
1476 *old_state = clt_path->state;
1477 changed = rtrs_clt_change_state(clt_path, new_state);
1478 spin_unlock_irq(&clt_path->state_wq.lock);
1479
1480 return changed;
1481 }
1482
1483 static void rtrs_clt_hb_err_handler(struct rtrs_con *c)
1484 {
1485 struct rtrs_clt_con *con = container_of(c, typeof(*con), c);
1486
1487 rtrs_rdma_error_recovery(con);
1488 }
1489
1490 static void rtrs_clt_init_hb(struct rtrs_clt_path *clt_path)
1491 {
1492 rtrs_init_hb(&clt_path->s, &io_comp_cqe,
1493 RTRS_HB_INTERVAL_MS,
1494 RTRS_HB_MISSED_MAX,
1495 rtrs_clt_hb_err_handler,
1496 rtrs_wq);
1497 }
1498
1499 static void rtrs_clt_reconnect_work(struct work_struct *work);
1500 static void rtrs_clt_close_work(struct work_struct *work);
1501
1502 static void rtrs_clt_err_recovery_work(struct work_struct *work)
1503 {
1504 struct rtrs_clt_path *clt_path;
1505 struct rtrs_clt_sess *clt;
1506 int delay_ms;
1507
1508 clt_path = container_of(work, struct rtrs_clt_path, err_recovery_work);
1509 clt = clt_path->clt;
1510 delay_ms = clt->reconnect_delay_sec * 1000;
1511 rtrs_clt_stop_and_destroy_conns(clt_path);
1512 queue_delayed_work(rtrs_wq, &clt_path->reconnect_dwork,
1513 msecs_to_jiffies(delay_ms +
1514 prandom_u32() %
1515 RTRS_RECONNECT_SEED));
1516 }
1517
1518 static struct rtrs_clt_path *alloc_path(struct rtrs_clt_sess *clt,
1519 const struct rtrs_addr *path,
1520 size_t con_num, u32 nr_poll_queues)
1521 {
1522 struct rtrs_clt_path *clt_path;
1523 int err = -ENOMEM;
1524 int cpu;
1525 size_t total_con;
1526
1527 clt_path = kzalloc(sizeof(*clt_path), GFP_KERNEL);
1528 if (!clt_path)
1529 goto err;
1530
/*
 * con_num irq-mode connections plus nr_poll_queues polling connections,
 * plus one extra connection for user (service) messages.
 */
1535 total_con = con_num + nr_poll_queues + 1;
1536 clt_path->s.con = kcalloc(total_con, sizeof(*clt_path->s.con),
1537 GFP_KERNEL);
1538 if (!clt_path->s.con)
1539 goto err_free_path;
1540
1541 clt_path->s.con_num = total_con;
1542 clt_path->s.irq_con_num = con_num + 1;
1543
1544 clt_path->stats = kzalloc(sizeof(*clt_path->stats), GFP_KERNEL);
1545 if (!clt_path->stats)
1546 goto err_free_con;
1547
1548 mutex_init(&clt_path->init_mutex);
1549 uuid_gen(&clt_path->s.uuid);
1550 memcpy(&clt_path->s.dst_addr, path->dst,
1551 rdma_addr_size((struct sockaddr *)path->dst));
1552
/*
 * rdma_resolve_addr() passes src_addr to cma_bind_addr, which checks
 * that the sa_family is non-zero.  If the user passed src_addr == NULL,
 * clt_path->s.src_addr contains only zeros, which is then fine.
 */
1558 if (path->src)
1559 memcpy(&clt_path->s.src_addr, path->src,
1560 rdma_addr_size((struct sockaddr *)path->src));
1561 strscpy(clt_path->s.sessname, clt->sessname,
1562 sizeof(clt_path->s.sessname));
1563 clt_path->clt = clt;
1564 clt_path->max_pages_per_mr = RTRS_MAX_SEGMENTS;
1565 init_waitqueue_head(&clt_path->state_wq);
1566 clt_path->state = RTRS_CLT_CONNECTING;
1567 atomic_set(&clt_path->connected_cnt, 0);
1568 INIT_WORK(&clt_path->close_work, rtrs_clt_close_work);
1569 INIT_WORK(&clt_path->err_recovery_work, rtrs_clt_err_recovery_work);
1570 INIT_DELAYED_WORK(&clt_path->reconnect_dwork, rtrs_clt_reconnect_work);
1571 rtrs_clt_init_hb(clt_path);
1572
1573 clt_path->mp_skip_entry = alloc_percpu(typeof(*clt_path->mp_skip_entry));
1574 if (!clt_path->mp_skip_entry)
1575 goto err_free_stats;
1576
1577 for_each_possible_cpu(cpu)
1578 INIT_LIST_HEAD(per_cpu_ptr(clt_path->mp_skip_entry, cpu));
1579
1580 err = rtrs_clt_init_stats(clt_path->stats);
1581 if (err)
1582 goto err_free_percpu;
1583
1584 return clt_path;
1585
1586 err_free_percpu:
1587 free_percpu(clt_path->mp_skip_entry);
1588 err_free_stats:
1589 kfree(clt_path->stats);
1590 err_free_con:
1591 kfree(clt_path->s.con);
1592 err_free_path:
1593 kfree(clt_path);
1594 err:
1595 return ERR_PTR(err);
1596 }
1597
1598 void free_path(struct rtrs_clt_path *clt_path)
1599 {
1600 free_percpu(clt_path->mp_skip_entry);
1601 mutex_destroy(&clt_path->init_mutex);
1602 kfree(clt_path->s.con);
1603 kfree(clt_path->rbufs);
1604 kfree(clt_path);
1605 }
1606
1607 static int create_con(struct rtrs_clt_path *clt_path, unsigned int cid)
1608 {
1609 struct rtrs_clt_con *con;
1610
1611 con = kzalloc(sizeof(*con), GFP_KERNEL);
1612 if (!con)
1613 return -ENOMEM;
1614
/* Map the first two connections to the first CPU */
1616 con->cpu = (cid ? cid - 1 : 0) % nr_cpu_ids;
1617 con->c.cid = cid;
1618 con->c.path = &clt_path->s;
1619
1620 atomic_set(&con->c.wr_cnt, 1);
1621 mutex_init(&con->con_mutex);
1622
1623 clt_path->s.con[cid] = &con->c;
1624
1625 return 0;
1626 }
1627
1628 static void destroy_con(struct rtrs_clt_con *con)
1629 {
1630 struct rtrs_clt_path *clt_path = to_clt_path(con->c.path);
1631
1632 clt_path->s.con[con->c.cid] = NULL;
1633 mutex_destroy(&con->con_mutex);
1634 kfree(con);
1635 }
1636
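/*
 * create_con_cq_qp() - create the CQ and QP for one connection.  The user
 * connection (cid 0) also resolves the shared ib device and uses a small
 * queue sized for the info exchange; IO connections size their send/recv
 * queues from queue_depth (roughly request + response + FR registration or
 * invalidation per slot).  Connections beyond irq_con_num use
 * IB_POLL_DIRECT for client-side polling.
 */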
1637 static int create_con_cq_qp(struct rtrs_clt_con *con)
1638 {
1639 struct rtrs_clt_path *clt_path = to_clt_path(con->c.path);
1640 u32 max_send_wr, max_recv_wr, cq_num, max_send_sge, wr_limit;
1641 int err, cq_vector;
1642 struct rtrs_msg_rkey_rsp *rsp;
1643
1644 lockdep_assert_held(&con->con_mutex);
1645 if (con->c.cid == 0) {
1646 max_send_sge = 1;
/* We must be the first here */
1648 if (WARN_ON(clt_path->s.dev))
1649 return -EINVAL;
1650
/*
 * The whole session uses the device of the user connection.
 * Be careful not to close the user connection before the ib dev
 * is gracefully put.
 */
1656 clt_path->s.dev = rtrs_ib_dev_find_or_add(con->c.cm_id->device,
1657 &dev_pd);
1658 if (!clt_path->s.dev) {
1659 rtrs_wrn(clt_path->clt,
1660 "rtrs_ib_dev_find_get_or_add(): no memory\n");
1661 return -ENOMEM;
1662 }
1663 clt_path->s.dev_ref = 1;
1664 query_fast_reg_mode(clt_path);
1665 wr_limit = clt_path->s.dev->ib_dev->attrs.max_qp_wr;
/*
 * The service connection only carries the info exchange and heartbeats,
 * so a small queue is enough: SERVICE_CON_QUEUE_DEPTH each way for
 * requests and responses, plus two extra WRs for drain and heartbeat.
 */
1673 max_send_wr =
1674 min_t(int, wr_limit, SERVICE_CON_QUEUE_DEPTH * 2 + 2);
1675 max_recv_wr = max_send_wr;
1676 } else {
/*
 * Here we assume that the session members are already set up correctly.
 * This is always true if the user connection (cid == 0) is established
 * first.
 */
1682 if (WARN_ON(!clt_path->s.dev))
1683 return -EINVAL;
1684 if (WARN_ON(!clt_path->queue_depth))
1685 return -EINVAL;
1686
1687 wr_limit = clt_path->s.dev->ib_dev->attrs.max_qp_wr;
1688
1689 clt_path->s.dev_ref++;
1690 max_send_wr = min_t(int, wr_limit,
/* QD * (REQ + RSP + FR REGS or INVS) + drain */
1692 clt_path->queue_depth * 3 + 1);
1693 max_recv_wr = min_t(int, wr_limit,
1694 clt_path->queue_depth * 3 + 1);
1695 max_send_sge = 2;
1696 }
1697 atomic_set(&con->c.sq_wr_avail, max_send_wr);
1698 cq_num = max_send_wr + max_recv_wr;
1699
1700 if (clt_path->flags & RTRS_MSG_NEW_RKEY_F || con->c.cid == 0) {
1701 con->rsp_ius = rtrs_iu_alloc(cq_num, sizeof(*rsp),
1702 GFP_KERNEL,
1703 clt_path->s.dev->ib_dev,
1704 DMA_FROM_DEVICE,
1705 rtrs_clt_rdma_done);
1706 if (!con->rsp_ius)
1707 return -ENOMEM;
1708 con->queue_num = cq_num;
1709 }
1710 cq_num = max_send_wr + max_recv_wr;
1711 cq_vector = con->cpu % clt_path->s.dev->ib_dev->num_comp_vectors;
1712 if (con->c.cid >= clt_path->s.irq_con_num)
1713 err = rtrs_cq_qp_create(&clt_path->s, &con->c, max_send_sge,
1714 cq_vector, cq_num, max_send_wr,
1715 max_recv_wr, IB_POLL_DIRECT);
1716 else
1717 err = rtrs_cq_qp_create(&clt_path->s, &con->c, max_send_sge,
1718 cq_vector, cq_num, max_send_wr,
1719 max_recv_wr, IB_POLL_SOFTIRQ);
/*
 * In case of error we do not bother to clean up previous allocations,
 * since destroy_con_cq_qp() must be called anyway.
 */
1724 return err;
1725 }
1726
1727 static void destroy_con_cq_qp(struct rtrs_clt_con *con)
1728 {
1729 struct rtrs_clt_path *clt_path = to_clt_path(con->c.path);
1730
/*
 * Be careful here: destroy_con_cq_qp() can be called even if
 * create_con_cq_qp() failed, see the comments there.
 */
1735 lockdep_assert_held(&con->con_mutex);
1736 rtrs_cq_qp_destroy(&con->c);
1737 if (con->rsp_ius) {
1738 rtrs_iu_free(con->rsp_ius, clt_path->s.dev->ib_dev,
1739 con->queue_num);
1740 con->rsp_ius = NULL;
1741 con->queue_num = 0;
1742 }
1743 if (clt_path->s.dev_ref && !--clt_path->s.dev_ref) {
1744 rtrs_ib_dev_put(clt_path->s.dev);
1745 clt_path->s.dev = NULL;
1746 }
1747 }
1748
1749 static void stop_cm(struct rtrs_clt_con *con)
1750 {
1751 rdma_disconnect(con->c.cm_id);
1752 if (con->c.qp)
1753 ib_drain_qp(con->c.qp);
1754 }
1755
1756 static void destroy_cm(struct rtrs_clt_con *con)
1757 {
1758 rdma_destroy_id(con->c.cm_id);
1759 con->c.cm_id = NULL;
1760 }
1761
1762 static int rtrs_rdma_addr_resolved(struct rtrs_clt_con *con)
1763 {
1764 struct rtrs_path *s = con->c.path;
1765 int err;
1766
1767 mutex_lock(&con->con_mutex);
1768 err = create_con_cq_qp(con);
1769 mutex_unlock(&con->con_mutex);
1770 if (err) {
1771 rtrs_err(s, "create_con_cq_qp(), err: %d\n", err);
1772 return err;
1773 }
1774 err = rdma_resolve_route(con->c.cm_id, RTRS_CONNECT_TIMEOUT_MS);
1775 if (err)
1776 rtrs_err(s, "Resolving route failed, err: %d\n", err);
1777
1778 return err;
1779 }
1780
1781 static int rtrs_rdma_route_resolved(struct rtrs_clt_con *con)
1782 {
1783 struct rtrs_clt_path *clt_path = to_clt_path(con->c.path);
1784 struct rtrs_clt_sess *clt = clt_path->clt;
1785 struct rtrs_msg_conn_req msg;
1786 struct rdma_conn_param param;
1787
1788 int err;
1789
1790 param = (struct rdma_conn_param) {
1791 .retry_count = 7,
1792 .rnr_retry_count = 7,
1793 .private_data = &msg,
1794 .private_data_len = sizeof(msg),
1795 };
1796
1797 msg = (struct rtrs_msg_conn_req) {
1798 .magic = cpu_to_le16(RTRS_MAGIC),
1799 .version = cpu_to_le16(RTRS_PROTO_VER),
1800 .cid = cpu_to_le16(con->c.cid),
1801 .cid_num = cpu_to_le16(clt_path->s.con_num),
1802 .recon_cnt = cpu_to_le16(clt_path->s.recon_cnt),
1803 };
1804 msg.first_conn = clt_path->for_new_clt ? FIRST_CONN : 0;
1805 uuid_copy(&msg.sess_uuid, &clt_path->s.uuid);
1806 uuid_copy(&msg.paths_uuid, &clt->paths_uuid);
1807
err = rdma_connect_locked(con->c.cm_id, &param);
1809 if (err)
1810 rtrs_err(clt, "rdma_connect_locked(): %d\n", err);
1811
1812 return err;
1813 }
1814
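/*
 * rtrs_rdma_conn_established() - validate the server's connection response
 * (magic, protocol version, errno) and, on the user connection, adopt the
 * negotiated parameters: queue depth, max header/IO sizes and flags, which
 * together define chunk_size and the session-wide limits cached in the clt
 * object under paths_mutex.
 */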
1815 static int rtrs_rdma_conn_established(struct rtrs_clt_con *con,
1816 struct rdma_cm_event *ev)
1817 {
1818 struct rtrs_clt_path *clt_path = to_clt_path(con->c.path);
1819 struct rtrs_clt_sess *clt = clt_path->clt;
1820 const struct rtrs_msg_conn_rsp *msg;
1821 u16 version, queue_depth;
1822 int errno;
1823 u8 len;
1824
1825 msg = ev->param.conn.private_data;
1826 len = ev->param.conn.private_data_len;
1827 if (len < sizeof(*msg)) {
1828 rtrs_err(clt, "Invalid RTRS connection response\n");
1829 return -ECONNRESET;
1830 }
1831 if (le16_to_cpu(msg->magic) != RTRS_MAGIC) {
1832 rtrs_err(clt, "Invalid RTRS magic\n");
1833 return -ECONNRESET;
1834 }
1835 version = le16_to_cpu(msg->version);
1836 if (version >> 8 != RTRS_PROTO_VER_MAJOR) {
1837 rtrs_err(clt, "Unsupported major RTRS version: %d, expected %d\n",
1838 version >> 8, RTRS_PROTO_VER_MAJOR);
1839 return -ECONNRESET;
1840 }
1841 errno = le16_to_cpu(msg->errno);
1842 if (errno) {
1843 rtrs_err(clt, "Invalid RTRS message: errno %d\n",
1844 errno);
1845 return -ECONNRESET;
1846 }
1847 if (con->c.cid == 0) {
1848 queue_depth = le16_to_cpu(msg->queue_depth);
1849
1850 if (clt_path->queue_depth > 0 && queue_depth != clt_path->queue_depth) {
1851 rtrs_err(clt, "Error: queue depth changed\n");
1852
/*
 * Stop any further reconnection attempts.
 */
1856 clt_path->reconnect_attempts = -1;
1857 rtrs_err(clt,
1858 "Disabling auto-reconnect. Trigger a manual reconnect after issue is resolved\n");
1859 return -ECONNRESET;
1860 }
1861
1862 if (!clt_path->rbufs) {
1863 clt_path->rbufs = kcalloc(queue_depth,
1864 sizeof(*clt_path->rbufs),
1865 GFP_KERNEL);
1866 if (!clt_path->rbufs)
1867 return -ENOMEM;
1868 }
1869 clt_path->queue_depth = queue_depth;
1870 clt_path->s.signal_interval = min_not_zero(queue_depth,
1871 (unsigned short) SERVICE_CON_QUEUE_DEPTH);
1872 clt_path->max_hdr_size = le32_to_cpu(msg->max_hdr_size);
1873 clt_path->max_io_size = le32_to_cpu(msg->max_io_size);
1874 clt_path->flags = le32_to_cpu(msg->flags);
1875 clt_path->chunk_size = clt_path->max_io_size + clt_path->max_hdr_size;
1876
/*
 * The global IO size is always the minimum across paths: if during a
 * reconnect the server sends a slightly larger value, the client keeps
 * using the cached minimum.  Several paths may be re-establishing
 * connections in parallel, hence the lock.
 */
1885 mutex_lock(&clt->paths_mutex);
1886 clt->queue_depth = clt_path->queue_depth;
1887 clt->max_io_size = min_not_zero(clt_path->max_io_size,
1888 clt->max_io_size);
1889 mutex_unlock(&clt->paths_mutex);
1890
/*
 * Cache the hca_port and hca_name for sysfs.
 */
1894 clt_path->hca_port = con->c.cm_id->port_num;
scnprintf(clt_path->hca_name, sizeof(clt_path->hca_name),
"%s", clt_path->s.dev->ib_dev->name);
1897 clt_path->s.src_addr = con->c.cm_id->route.addr.src_addr;
1898
1899 clt_path->for_new_clt = 1;
1900 }
1901
1902 return 0;
1903 }
1904
1905 static inline void flag_success_on_conn(struct rtrs_clt_con *con)
1906 {
1907 struct rtrs_clt_path *clt_path = to_clt_path(con->c.path);
1908
1909 atomic_inc(&clt_path->connected_cnt);
1910 con->cm_err = 1;
1911 }
1912
1913 static int rtrs_rdma_conn_rejected(struct rtrs_clt_con *con,
1914 struct rdma_cm_event *ev)
1915 {
1916 struct rtrs_path *s = con->c.path;
1917 const struct rtrs_msg_conn_rsp *msg;
1918 const char *rej_msg;
1919 int status, errno;
1920 u8 data_len;
1921
1922 status = ev->status;
1923 rej_msg = rdma_reject_msg(con->c.cm_id, status);
1924 msg = rdma_consumer_reject_data(con->c.cm_id, ev, &data_len);
1925
1926 if (msg && data_len >= sizeof(*msg)) {
1927 errno = (int16_t)le16_to_cpu(msg->errno);
1928 if (errno == -EBUSY)
1929 rtrs_err(s,
1930 "Previous session is still exists on the server, please reconnect later\n");
1931 else
1932 rtrs_err(s,
1933 "Connect rejected: status %d (%s), rtrs errno %d\n",
1934 status, rej_msg, errno);
1935 } else {
1936 rtrs_err(s,
1937 "Connect rejected but with malformed message: status %d (%s)\n",
1938 status, rej_msg);
1939 }
1940
1941 return -ECONNRESET;
1942 }
1943
1944 void rtrs_clt_close_conns(struct rtrs_clt_path *clt_path, bool wait)
1945 {
1946 if (rtrs_clt_change_state_get_old(clt_path, RTRS_CLT_CLOSING, NULL))
1947 queue_work(rtrs_wq, &clt_path->close_work);
1948 if (wait)
1949 flush_work(&clt_path->close_work);
1950 }
1951
1952 static inline void flag_error_on_conn(struct rtrs_clt_con *con, int cm_err)
1953 {
1954 if (con->cm_err == 1) {
1955 struct rtrs_clt_path *clt_path;
1956
1957 clt_path = to_clt_path(con->c.path);
1958 if (atomic_dec_and_test(&clt_path->connected_cnt))
1959
1960 wake_up(&clt_path->state_wq);
1961 }
1962 con->cm_err = cm_err;
1963 }
1964
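/*
 * rtrs_clt_rdma_cm_handler() - RDMA CM event handler for a single
 * connection.  Successful address/route resolution and establishment drive
 * the connect sequence forward; any error is recorded in con->cm_err, the
 * waiter in create_cm() is woken up and the path is pushed into error
 * recovery.  Device removal closes all connections of the path.
 */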
1965 static int rtrs_clt_rdma_cm_handler(struct rdma_cm_id *cm_id,
1966 struct rdma_cm_event *ev)
1967 {
1968 struct rtrs_clt_con *con = cm_id->context;
1969 struct rtrs_path *s = con->c.path;
1970 struct rtrs_clt_path *clt_path = to_clt_path(s);
1971 int cm_err = 0;
1972
1973 switch (ev->event) {
1974 case RDMA_CM_EVENT_ADDR_RESOLVED:
1975 cm_err = rtrs_rdma_addr_resolved(con);
1976 break;
1977 case RDMA_CM_EVENT_ROUTE_RESOLVED:
1978 cm_err = rtrs_rdma_route_resolved(con);
1979 break;
1980 case RDMA_CM_EVENT_ESTABLISHED:
1981 cm_err = rtrs_rdma_conn_established(con, ev);
1982 if (!cm_err) {
/*
 * Report success and wake up.  Here we abuse state_wq, i.e. wake up
 * without a state change, but we do set cm_err.
 */
1987 flag_success_on_conn(con);
1988 wake_up(&clt_path->state_wq);
1989 return 0;
1990 }
1991 break;
1992 case RDMA_CM_EVENT_REJECTED:
1993 cm_err = rtrs_rdma_conn_rejected(con, ev);
1994 break;
1995 case RDMA_CM_EVENT_DISCONNECTED:
/* No message for disconnecting */
1997 cm_err = -ECONNRESET;
1998 break;
1999 case RDMA_CM_EVENT_CONNECT_ERROR:
2000 case RDMA_CM_EVENT_UNREACHABLE:
2001 case RDMA_CM_EVENT_ADDR_CHANGE:
2002 case RDMA_CM_EVENT_TIMEWAIT_EXIT:
2003 rtrs_wrn(s, "CM error (CM event: %s, err: %d)\n",
2004 rdma_event_msg(ev->event), ev->status);
2005 cm_err = -ECONNRESET;
2006 break;
2007 case RDMA_CM_EVENT_ADDR_ERROR:
2008 case RDMA_CM_EVENT_ROUTE_ERROR:
2009 rtrs_wrn(s, "CM error (CM event: %s, err: %d)\n",
2010 rdma_event_msg(ev->event), ev->status);
2011 cm_err = -EHOSTUNREACH;
2012 break;
2013 case RDMA_CM_EVENT_DEVICE_REMOVAL:
/*
 * Device removal is a special case.  Queue close and return 0.
 */
2017 rtrs_clt_close_conns(clt_path, false);
2018 return 0;
2019 default:
2020 rtrs_err(s, "Unexpected RDMA CM error (CM event: %s, err: %d)\n",
2021 rdma_event_msg(ev->event), ev->status);
2022 cm_err = -ECONNRESET;
2023 break;
2024 }
2025
2026 if (cm_err) {
/*
 * A CM error makes sense only while establishing the connection;
 * in all other cases we rely on the normal reconnect procedure.
 */
2031 flag_error_on_conn(con, cm_err);
2032 rtrs_rdma_error_recovery(con);
2033 }
2034
2035 return 0;
2036 }
2037
2038 static int create_cm(struct rtrs_clt_con *con)
2039 {
2040 struct rtrs_path *s = con->c.path;
2041 struct rtrs_clt_path *clt_path = to_clt_path(s);
2042 struct rdma_cm_id *cm_id;
2043 int err;
2044
2045 cm_id = rdma_create_id(&init_net, rtrs_clt_rdma_cm_handler, con,
2046 clt_path->s.dst_addr.ss_family == AF_IB ?
2047 RDMA_PS_IB : RDMA_PS_TCP, IB_QPT_RC);
2048 if (IS_ERR(cm_id)) {
2049 err = PTR_ERR(cm_id);
2050 rtrs_err(s, "Failed to create CM ID, err: %d\n", err);
2051
2052 return err;
2053 }
2054 con->c.cm_id = cm_id;
2055 con->cm_err = 0;
2056
2057 err = rdma_set_reuseaddr(cm_id, 1);
2058 if (err != 0) {
2059 rtrs_err(s, "Set address reuse failed, err: %d\n", err);
2060 goto destroy_cm;
2061 }
2062 err = rdma_resolve_addr(cm_id, (struct sockaddr *)&clt_path->s.src_addr,
2063 (struct sockaddr *)&clt_path->s.dst_addr,
2064 RTRS_CONNECT_TIMEOUT_MS);
2065 if (err) {
2066 rtrs_err(s, "Failed to resolve address, err: %d\n", err);
2067 goto destroy_cm;
2068 }
2069
/*
 * Combine connection status and session events.  Two cases are waited
 * for here: cm_err carries something meaningful, or the session state
 * was changed to an error by device removal.
 */
2074 err = wait_event_interruptible_timeout(
2075 clt_path->state_wq,
2076 con->cm_err || clt_path->state != RTRS_CLT_CONNECTING,
2077 msecs_to_jiffies(RTRS_CONNECT_TIMEOUT_MS));
2078 if (err == 0 || err == -ERESTARTSYS) {
2079 if (err == 0)
2080 err = -ETIMEDOUT;
/* Timed out or interrupted */
2082 goto errr;
2083 }
2084 if (con->cm_err < 0) {
2085 err = con->cm_err;
2086 goto errr;
2087 }
2088 if (READ_ONCE(clt_path->state) != RTRS_CLT_CONNECTING) {
/* Device removal */
2090 err = -ECONNABORTED;
2091 goto errr;
2092 }
2093
2094 return 0;
2095
2096 errr:
2097 stop_cm(con);
2098 mutex_lock(&con->con_mutex);
2099 destroy_con_cq_qp(con);
2100 mutex_unlock(&con->con_mutex);
2101 destroy_cm:
2102 destroy_cm(con);
2103
2104 return err;
2105 }
2106
2107 static void rtrs_clt_path_up(struct rtrs_clt_path *clt_path)
2108 {
2109 struct rtrs_clt_sess *clt = clt_path->clt;
2110 int up;
2111
/*
 * The RECONNECTED event may only be fired when all paths were connected
 * on rtrs_clt_open(), then each was disconnected and the first one
 * connected again.  The up/down accounting below is therefore serialized
 * by paths_ev_mutex rather than paths_mutex.
 */
2119 mutex_lock(&clt->paths_ev_mutex);
2120 up = ++clt->paths_up;
2121
/*
 * Here it is safe to access paths_num directly: the up counter can
 * exceed MAX_PATHS_NUM only while paths_num > 0, so the clamp below
 * cannot race harmfully with path removal.
 */
2126 if (up > MAX_PATHS_NUM && up == MAX_PATHS_NUM + clt->paths_num)
2127 clt->paths_up = clt->paths_num;
2128 else if (up == 1)
2129 clt->link_ev(clt->priv, RTRS_CLT_LINK_EV_RECONNECTED);
2130 mutex_unlock(&clt->paths_ev_mutex);
2131
2132
2133 clt_path->established = true;
2134 clt_path->reconnect_attempts = 0;
2135 clt_path->stats->reconnects.successful_cnt++;
2136 }
2137
2138 static void rtrs_clt_path_down(struct rtrs_clt_path *clt_path)
2139 {
2140 struct rtrs_clt_sess *clt = clt_path->clt;
2141
2142 if (!clt_path->established)
2143 return;
2144
2145 clt_path->established = false;
2146 mutex_lock(&clt->paths_ev_mutex);
2147 WARN_ON(!clt->paths_up);
2148 if (--clt->paths_up == 0)
2149 clt->link_ev(clt->priv, RTRS_CLT_LINK_EV_DISCONNECTED);
2150 mutex_unlock(&clt->paths_ev_mutex);
2151 }
2152
2153 static void rtrs_clt_stop_and_destroy_conns(struct rtrs_clt_path *clt_path)
2154 {
2155 struct rtrs_clt_con *con;
2156 unsigned int cid;
2157
2158 WARN_ON(READ_ONCE(clt_path->state) == RTRS_CLT_CONNECTED);
2159
/*
 * Possible race with rtrs_clt_open(), when DEVICE_REMOVAL comes
 * exactly in between.  Start destroying only after init finishes.
 */
2164 mutex_lock(&clt_path->init_mutex);
2165 mutex_unlock(&clt_path->init_mutex);
2166
/*
 * All IO paths must observe the !CONNECTED state before we free
 * everything.
 */
2171 synchronize_rcu();
2172
2173 rtrs_stop_hb(&clt_path->s);
2174
/*
 * The order is utterly crucial: first disconnect and complete all RDMA
 * requests with an error (thus setting in_use to false for each request),
 * then fail outstanding requests checking in_use for each of them, and
 * only then notify the upper layer about the path going down.
 */
2181
2182 for (cid = 0; cid < clt_path->s.con_num; cid++) {
2183 if (!clt_path->s.con[cid])
2184 break;
2185 con = to_clt_con(clt_path->s.con[cid]);
2186 stop_cm(con);
2187 }
2188 fail_all_outstanding_reqs(clt_path);
2189 free_path_reqs(clt_path);
2190 rtrs_clt_path_down(clt_path);
2191
/*
 * Wait for a graceful shutdown, namely until the peer side invokes
 * rdma_disconnect().  'connected_cnt' is decremented only on CM events,
 * so if the other side avoids generating events for some reason, this
 * wait simply times out.
 */
2200 wait_event_timeout(clt_path->state_wq,
2201 !atomic_read(&clt_path->connected_cnt),
2202 msecs_to_jiffies(RTRS_CONNECT_TIMEOUT_MS));
2203
2204 for (cid = 0; cid < clt_path->s.con_num; cid++) {
2205 if (!clt_path->s.con[cid])
2206 break;
2207 con = to_clt_con(clt_path->s.con[cid]);
2208 mutex_lock(&con->con_mutex);
2209 destroy_con_cq_qp(con);
2210 mutex_unlock(&con->con_mutex);
2211 destroy_cm(con);
2212 destroy_con(con);
2213 }
2214 }
2215
2216 static inline bool xchg_paths(struct rtrs_clt_path __rcu **rcu_ppcpu_path,
2217 struct rtrs_clt_path *clt_path,
2218 struct rtrs_clt_path *next)
2219 {
2220 struct rtrs_clt_path **ppcpu_path;
2221
/* Call cmpxchg() without sparse warnings */
2223 ppcpu_path = (typeof(ppcpu_path))rcu_ppcpu_path;
2224 return clt_path == cmpxchg(ppcpu_path, clt_path, next);
2225 }
2226
2227 static void rtrs_clt_remove_path_from_arr(struct rtrs_clt_path *clt_path)
2228 {
2229 struct rtrs_clt_sess *clt = clt_path->clt;
2230 struct rtrs_clt_path *next;
2231 bool wait_for_grace = false;
2232 int cpu;
2233
2234 mutex_lock(&clt->paths_mutex);
2235 list_del_rcu(&clt_path->s.entry);
2236
/* Make sure everybody observes the path removal. */
2238 synchronize_rcu();
2239
/*
 * At this point nobody sees the path in the list any more, but the
 * per-CPU @pcpu_path pointers may still point to it.  Since the path is
 * no longer observable in the list, the IO path will never assign it to
 * @pcpu_path again; it can only still be equal to it.
 *
 * paths_num is decremented only after the grace period, because a
 * caller of do_each_path() must first observe the list without the path
 * and only then the decremented paths_num.  Otherwise a racing IO on
 * another CPU could see the path as !CONNECTED, yet leave its retry
 * loop early because it already loaded the smaller paths_num.
 */
2269 clt->paths_num--;
2270
/*
 * Get the @next path from the one being removed.  If the removed path
 * was the last element, @next is NULL.
 */
2275 rcu_read_lock();
2276 next = rtrs_clt_get_next_path_or_null(&clt->paths_list, clt_path);
2277 rcu_read_unlock();
2278
/*
 * The per-CPU pointers can still point to the path which is being
 * removed, so change them manually.
 */
2283 for_each_possible_cpu(cpu) {
2284 struct rtrs_clt_path __rcu **ppcpu_path;
2285
2286 ppcpu_path = per_cpu_ptr(clt->pcpu_path, cpu);
2287 if (rcu_dereference_protected(*ppcpu_path,
2288 lockdep_is_held(&clt->paths_mutex)) != clt_path)
/*
 * synchronize_rcu() was called just after deleting the entry from
 * the list, thus the IO code path cannot change the pointer back to
 * the one being removed; we are safe here.
 */
2295 continue;
2296
/*
 * We race with the IO code path, which also changes the pointer,
 * so we have to be careful not to overwrite it.
 */
2301 if (xchg_paths(ppcpu_path, clt_path, next))
/*
 * @ppcpu_path was successfully replaced with @next, which means
 * somebody might have picked up the old path and be dereferencing
 * it right now, so waiting for a grace period is required.
 */
2308 wait_for_grace = true;
2309 }
2310 if (wait_for_grace)
2311 synchronize_rcu();
2312
2313 mutex_unlock(&clt->paths_mutex);
2314 }
2315
2316 static void rtrs_clt_add_path_to_arr(struct rtrs_clt_path *clt_path)
2317 {
2318 struct rtrs_clt_sess *clt = clt_path->clt;
2319
2320 mutex_lock(&clt->paths_mutex);
2321 clt->paths_num++;
2322
2323 list_add_tail_rcu(&clt_path->s.entry, &clt->paths_list);
2324 mutex_unlock(&clt->paths_mutex);
2325 }
2326
2327 static void rtrs_clt_close_work(struct work_struct *work)
2328 {
2329 struct rtrs_clt_path *clt_path;
2330
2331 clt_path = container_of(work, struct rtrs_clt_path, close_work);
2332
2333 cancel_work_sync(&clt_path->err_recovery_work);
2334 cancel_delayed_work_sync(&clt_path->reconnect_dwork);
2335 rtrs_clt_stop_and_destroy_conns(clt_path);
2336 rtrs_clt_change_state_get_old(clt_path, RTRS_CLT_CLOSED, NULL);
2337 }
2338
2339 static int init_conns(struct rtrs_clt_path *clt_path)
2340 {
2341 unsigned int cid;
2342 int err;
2343
2344 /*
2345  * Bump the reconnect counter for every new set of connections so the
2346  * server can tell them apart from connections of a previous, not yet
2347  * fully closed, incarnation of this path.
2348  */
2349 clt_path->s.recon_cnt++;
2350
2351 /* Establish all RDMA connections of this path. */
2352 for (cid = 0; cid < clt_path->s.con_num; cid++) {
2353 err = create_con(clt_path, cid);
2354 if (err)
2355 goto destroy;
2356
2357 err = create_cm(to_clt_con(clt_path->s.con[cid]));
2358 if (err) {
2359 destroy_con(to_clt_con(clt_path->s.con[cid]));
2360 goto destroy;
2361 }
2362 }
2363 err = alloc_path_reqs(clt_path);
2364 if (err)
2365 goto destroy;
2366
2367 rtrs_start_hb(&clt_path->s);
2368
2369 return 0;
2370
2371 destroy:
2372 while (cid--) {
2373 struct rtrs_clt_con *con = to_clt_con(clt_path->s.con[cid]);
2374
2375 stop_cm(con);
2376
2377 mutex_lock(&con->con_mutex);
2378 destroy_con_cq_qp(con);
2379 mutex_unlock(&con->con_mutex);
2380 destroy_cm(con);
2381 destroy_con(con);
2382 }
2383
2384 /*
2385  * The path failed before it ever got connected: put it into
2386  * CONNECTING_ERR so error handling and close observe a sane state.
2387  */
2388 rtrs_clt_change_state_get_old(clt_path, RTRS_CLT_CONNECTING_ERR, NULL);
2389
2390 return err;
2391 }
2392
2393 static void rtrs_clt_info_req_done(struct ib_cq *cq, struct ib_wc *wc)
2394 {
2395 struct rtrs_clt_con *con = to_clt_con(wc->qp->qp_context);
2396 struct rtrs_clt_path *clt_path = to_clt_path(con->c.path);
2397 struct rtrs_iu *iu;
2398
2399 iu = container_of(wc->wr_cqe, struct rtrs_iu, cqe);
2400 rtrs_iu_free(iu, clt_path->s.dev->ib_dev, 1);
2401
2402 if (wc->status != IB_WC_SUCCESS) {
2403 rtrs_err(clt_path->clt, "Path info request send failed: %s\n",
2404 ib_wc_status_msg(wc->status));
2405 rtrs_clt_change_state_get_old(clt_path, RTRS_CLT_CONNECTING_ERR, NULL);
2406 return;
2407 }
2408
2409 rtrs_clt_update_wc_stats(con);
2410 }
2411
2412 static int process_info_rsp(struct rtrs_clt_path *clt_path,
2413 const struct rtrs_msg_info_rsp *msg)
2414 {
2415 unsigned int sg_cnt, total_len;
2416 int i, sgi;
2417
2418 sg_cnt = le16_to_cpu(msg->sg_cnt);
2419 if (!sg_cnt || (clt_path->queue_depth % sg_cnt)) {
2420 rtrs_err(clt_path->clt,
2421 "Incorrect sg_cnt %d, is not multiple\n",
2422 sg_cnt);
2423 return -EINVAL;
2424 }
2425
2426 /*
2427  * Check that the RDMA immediate payload is wide enough to encode a
2428  * buffer id together with an offset inside a memory chunk.
2429  */
2430 if ((ilog2(sg_cnt - 1) + 1) + (ilog2(clt_path->chunk_size - 1) + 1) >
2431 MAX_IMM_PAYL_BITS) {
2432 rtrs_err(clt_path->clt,
2433 "RDMA immediate size (%db) not enough to encode %d buffers of size %dB\n",
2434 MAX_IMM_PAYL_BITS, sg_cnt, clt_path->chunk_size);
2435 return -EINVAL;
2436 }
2437 total_len = 0;
2438 for (sgi = 0, i = 0; sgi < sg_cnt && i < clt_path->queue_depth; sgi++) {
2439 const struct rtrs_sg_desc *desc = &msg->desc[sgi];
2440 u32 len, rkey;
2441 u64 addr;
2442
2443 addr = le64_to_cpu(desc->addr);
2444 rkey = le32_to_cpu(desc->key);
2445 len = le32_to_cpu(desc->len);
2446
2447 total_len += len;
2448
2449 if (!len || (len % clt_path->chunk_size)) {
2450 rtrs_err(clt_path->clt, "Incorrect [%d].len %d\n",
2451 sgi,
2452 len);
2453 return -EINVAL;
2454 }
2455 for ( ; len && i < clt_path->queue_depth; i++) {
2456 clt_path->rbufs[i].addr = addr;
2457 clt_path->rbufs[i].rkey = rkey;
2458
2459 len -= clt_path->chunk_size;
2460 addr += clt_path->chunk_size;
2461 }
2462 }
2463
2464 if (sgi != sg_cnt || i != clt_path->queue_depth) {
2465 rtrs_err(clt_path->clt,
2466 "Incorrect sg vector, not fully mapped\n");
2467 return -EINVAL;
2468 }
2469 if (total_len != clt_path->chunk_size * clt_path->queue_depth) {
2470 rtrs_err(clt_path->clt, "Incorrect total_len %d\n", total_len);
2471 return -EINVAL;
2472 }
2473
2474 return 0;
2475 }
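/*
 * Illustrative example of the mapping above (numbers are made up, not taken
 * from any particular server): with queue_depth = 64, chunk_size = 64K and
 * sg_cnt = 2, an even split gives each descriptor len = 32 * 64K = 2M.
 * Descriptor 0 then fills rbufs[0..31] with addr, addr + 64K, ...,
 * descriptor 1 fills rbufs[32..63], and total_len ends up as 64 * 64K,
 * which is exactly what the final sanity checks require.
 */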
2476
2477 static void rtrs_clt_info_rsp_done(struct ib_cq *cq, struct ib_wc *wc)
2478 {
2479 struct rtrs_clt_con *con = to_clt_con(wc->qp->qp_context);
2480 struct rtrs_clt_path *clt_path = to_clt_path(con->c.path);
2481 struct rtrs_msg_info_rsp *msg;
2482 enum rtrs_clt_state state;
2483 struct rtrs_iu *iu;
2484 size_t rx_sz;
2485 int err;
2486
2487 state = RTRS_CLT_CONNECTING_ERR;
2488
2489 WARN_ON(con->c.cid);
2490 iu = container_of(wc->wr_cqe, struct rtrs_iu, cqe);
2491 if (wc->status != IB_WC_SUCCESS) {
2492 rtrs_err(clt_path->clt, "Path info response recv failed: %s\n",
2493 ib_wc_status_msg(wc->status));
2494 goto out;
2495 }
2496 WARN_ON(wc->opcode != IB_WC_RECV);
2497
2498 if (wc->byte_len < sizeof(*msg)) {
2499 rtrs_err(clt_path->clt, "Path info response is malformed: size %d\n",
2500 wc->byte_len);
2501 goto out;
2502 }
2503 ib_dma_sync_single_for_cpu(clt_path->s.dev->ib_dev, iu->dma_addr,
2504 iu->size, DMA_FROM_DEVICE);
2505 msg = iu->buf;
2506 if (le16_to_cpu(msg->type) != RTRS_MSG_INFO_RSP) {
2507 rtrs_err(clt_path->clt, "Path info response is malformed: type %d\n",
2508 le16_to_cpu(msg->type));
2509 goto out;
2510 }
2511 rx_sz = sizeof(*msg);
2512 rx_sz += sizeof(msg->desc[0]) * le16_to_cpu(msg->sg_cnt);
2513 if (wc->byte_len < rx_sz) {
2514 rtrs_err(clt_path->clt, "Path info response is malformed: size %d\n",
2515 wc->byte_len);
2516 goto out;
2517 }
2518 err = process_info_rsp(clt_path, msg);
2519 if (err)
2520 goto out;
2521
2522 err = post_recv_path(clt_path);
2523 if (err)
2524 goto out;
2525
2526 state = RTRS_CLT_CONNECTED;
2527
2528 out:
2529 rtrs_clt_update_wc_stats(con);
2530 rtrs_iu_free(iu, clt_path->s.dev->ib_dev, 1);
2531 rtrs_clt_change_state_get_old(clt_path, state, NULL);
2532 }
2533
2534 static int rtrs_send_path_info(struct rtrs_clt_path *clt_path)
2535 {
2536 struct rtrs_clt_con *usr_con = to_clt_con(clt_path->s.con[0]);
2537 struct rtrs_msg_info_req *msg;
2538 struct rtrs_iu *tx_iu, *rx_iu;
2539 size_t rx_sz;
2540 int err;
2541
2542 rx_sz = sizeof(struct rtrs_msg_info_rsp);
2543 rx_sz += sizeof(struct rtrs_sg_desc) * clt_path->queue_depth;
2544
2545 tx_iu = rtrs_iu_alloc(1, sizeof(struct rtrs_msg_info_req), GFP_KERNEL,
2546 clt_path->s.dev->ib_dev, DMA_TO_DEVICE,
2547 rtrs_clt_info_req_done);
2548 rx_iu = rtrs_iu_alloc(1, rx_sz, GFP_KERNEL, clt_path->s.dev->ib_dev,
2549 DMA_FROM_DEVICE, rtrs_clt_info_rsp_done);
2550 if (!tx_iu || !rx_iu) {
2551 err = -ENOMEM;
2552 goto out;
2553 }
2554
2555 err = rtrs_iu_post_recv(&usr_con->c, rx_iu);
2556 if (err) {
2557 rtrs_err(clt_path->clt, "rtrs_iu_post_recv(), err: %d\n", err);
2558 goto out;
2559 }
2560 rx_iu = NULL;
2561
2562 msg = tx_iu->buf;
2563 msg->type = cpu_to_le16(RTRS_MSG_INFO_REQ);
2564 memcpy(msg->pathname, clt_path->s.sessname, sizeof(msg->pathname));
2565
2566 ib_dma_sync_single_for_device(clt_path->s.dev->ib_dev,
2567 tx_iu->dma_addr,
2568 tx_iu->size, DMA_TO_DEVICE);
2569
2570 /* Send the info request; on success tx_iu is freed by the completion handler. */
2571 err = rtrs_iu_post_send(&usr_con->c, tx_iu, sizeof(*msg), NULL);
2572 if (err) {
2573 rtrs_err(clt_path->clt, "rtrs_iu_post_send(), err: %d\n", err);
2574 goto out;
2575 }
2576 tx_iu = NULL;
2577
2578 /* Wait until the info response (or an error) changes the path state. */
2579 wait_event_interruptible_timeout(clt_path->state_wq,
2580 clt_path->state != RTRS_CLT_CONNECTING,
2581 msecs_to_jiffies(
2582 RTRS_CONNECT_TIMEOUT_MS));
2583 if (READ_ONCE(clt_path->state) != RTRS_CLT_CONNECTED) {
2584 if (READ_ONCE(clt_path->state) == RTRS_CLT_CONNECTING_ERR)
2585 err = -ECONNRESET;
2586 else
2587 err = -ETIMEDOUT;
2588 }
2589
2590 out:
2591 if (tx_iu)
2592 rtrs_iu_free(tx_iu, clt_path->s.dev->ib_dev, 1);
2593 if (rx_iu)
2594 rtrs_iu_free(rx_iu, clt_path->s.dev->ib_dev, 1);
2595 if (err)
2596 /* On failure move the path to CONNECTING_ERR so the caller tears it down. */
2597 rtrs_clt_change_state_get_old(clt_path,
2598 RTRS_CLT_CONNECTING_ERR, NULL);
2599
2600 return err;
2601 }
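/*
 * Info handshake overview (summarizing the helpers above): the client posts
 * a receive for an RTRS_MSG_INFO_RSP, sends an RTRS_MSG_INFO_REQ carrying
 * the session name on the user connection (cid 0) and waits for the path
 * state to change.  rtrs_clt_info_rsp_done() validates the response, lets
 * process_info_rsp() fill the rbufs table from the server's sg list, posts
 * the regular receives and finally moves the path to RTRS_CLT_CONNECTED,
 * or to RTRS_CLT_CONNECTING_ERR on any error.
 */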
2602
2603 /**
2604  * init_path() - establish all connections of a path and do the info handshake
2605  * @clt_path: client path to initialize
2606  *
2607  * Serialized by init_mutex.  On error the caller must close or reconnect.
2608  */
2609 static int init_path(struct rtrs_clt_path *clt_path)
2610 {
2611 int err;
2612 char str[NAME_MAX];
2613 struct rtrs_addr path = {
2614 .src = &clt_path->s.src_addr,
2615 .dst = &clt_path->s.dst_addr,
2616 };
2617
2618 rtrs_addr_to_str(&path, str, sizeof(str));
2619
2620 mutex_lock(&clt_path->init_mutex);
2621 err = init_conns(clt_path);
2622 if (err) {
2623 rtrs_err(clt_path->clt,
2624 "init_conns() failed: err=%d path=%s [%s:%u]\n", err,
2625 str, clt_path->hca_name, clt_path->hca_port);
2626 goto out;
2627 }
2628 err = rtrs_send_path_info(clt_path);
2629 if (err) {
2630 rtrs_err(clt_path->clt,
2631 "rtrs_send_path_info() failed: err=%d path=%s [%s:%u]\n",
2632 err, str, clt_path->hca_name, clt_path->hca_port);
2633 goto out;
2634 }
2635 rtrs_clt_path_up(clt_path);
2636 out:
2637 mutex_unlock(&clt_path->init_mutex);
2638
2639 return err;
2640 }
2641
2642 static void rtrs_clt_reconnect_work(struct work_struct *work)
2643 {
2644 struct rtrs_clt_path *clt_path;
2645 struct rtrs_clt_sess *clt;
2646 int err;
2647
2648 clt_path = container_of(to_delayed_work(work), struct rtrs_clt_path,
2649 reconnect_dwork);
2650 clt = clt_path->clt;
2651
2652 if (READ_ONCE(clt_path->state) != RTRS_CLT_RECONNECTING)
2653 return;
2654
2655 if (clt_path->reconnect_attempts >= clt->max_reconnect_attempts) {
2656 /* Give up and close the path completely once the limit is reached. */
2657 rtrs_clt_close_conns(clt_path, false);
2658 return;
2659 }
2660 clt_path->reconnect_attempts++;
2661
2662 msleep(RTRS_RECONNECT_BACKOFF);
2663 if (rtrs_clt_change_state_get_old(clt_path, RTRS_CLT_CONNECTING, NULL)) {
2664 err = init_path(clt_path);
2665 if (err)
2666 goto reconnect_again;
2667 }
2668
2669 return;
2670
2671 reconnect_again:
2672 if (rtrs_clt_change_state_get_old(clt_path, RTRS_CLT_RECONNECTING, NULL)) {
2673 clt_path->stats->reconnects.fail_cnt++;
2674 queue_work(rtrs_wq, &clt_path->err_recovery_work);
2675 }
2676 }
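/*
 * Reconnect flow as seen from this worker: while the path stays in
 * RTRS_CLT_RECONNECTING, each attempt waits RTRS_RECONNECT_BACKOFF ms,
 * moves the path to RTRS_CLT_CONNECTING and re-runs init_path().  On
 * failure the path is put back into RTRS_CLT_RECONNECTING, the failure is
 * counted in the stats and err_recovery_work is queued to schedule the
 * next attempt; once max_reconnect_attempts is exceeded the path is
 * closed for good.
 */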
2677
2678 static void rtrs_clt_dev_release(struct device *dev)
2679 {
2680 struct rtrs_clt_sess *clt = container_of(dev, struct rtrs_clt_sess,
2681 dev);
2682
2683 mutex_destroy(&clt->paths_ev_mutex);
2684 mutex_destroy(&clt->paths_mutex);
2685 kfree(clt);
2686 }
2687
2688 static struct rtrs_clt_sess *alloc_clt(const char *sessname, size_t paths_num,
2689 u16 port, size_t pdu_sz, void *priv,
2690 void (*link_ev)(void *priv,
2691 enum rtrs_clt_link_ev ev),
2692 unsigned int reconnect_delay_sec,
2693 unsigned int max_reconnect_attempts)
2694 {
2695 struct rtrs_clt_sess *clt;
2696 int err;
2697
2698 if (!paths_num || paths_num > MAX_PATHS_NUM)
2699 return ERR_PTR(-EINVAL);
2700
2701 if (strlen(sessname) >= sizeof(clt->sessname))
2702 return ERR_PTR(-EINVAL);
2703
2704 clt = kzalloc(sizeof(*clt), GFP_KERNEL);
2705 if (!clt)
2706 return ERR_PTR(-ENOMEM);
2707
2708 clt->pcpu_path = alloc_percpu(typeof(*clt->pcpu_path));
2709 if (!clt->pcpu_path) {
2710 kfree(clt);
2711 return ERR_PTR(-ENOMEM);
2712 }
2713
2714 clt->dev.class = rtrs_clt_dev_class;
2715 clt->dev.release = rtrs_clt_dev_release;
2716 uuid_gen(&clt->paths_uuid);
2717 INIT_LIST_HEAD_RCU(&clt->paths_list);
2718 clt->paths_num = paths_num;
2719 clt->paths_up = MAX_PATHS_NUM;
2720 clt->port = port;
2721 clt->pdu_sz = pdu_sz;
2722 clt->max_segments = RTRS_MAX_SEGMENTS;
2723 clt->reconnect_delay_sec = reconnect_delay_sec;
2724 clt->max_reconnect_attempts = max_reconnect_attempts;
2725 clt->priv = priv;
2726 clt->link_ev = link_ev;
2727 clt->mp_policy = MP_POLICY_MIN_INFLIGHT;
2728 strscpy(clt->sessname, sessname, sizeof(clt->sessname));
2729 init_waitqueue_head(&clt->permits_wait);
2730 mutex_init(&clt->paths_ev_mutex);
2731 mutex_init(&clt->paths_mutex);
2732 device_initialize(&clt->dev);
2733
2734 err = dev_set_name(&clt->dev, "%s", sessname);
2735 if (err)
2736 goto err_put;
2737
2738 /*
2739  * Suppress uevents until the sysfs entries below are created, then
2740  * announce the device to user space in one go.
2741  */
2742 dev_set_uevent_suppress(&clt->dev, true);
2743 err = device_add(&clt->dev);
2744 if (err)
2745 goto err_put;
2746
2747 clt->kobj_paths = kobject_create_and_add("paths", &clt->dev.kobj);
2748 if (!clt->kobj_paths) {
2749 err = -ENOMEM;
2750 goto err_del;
2751 }
2752 err = rtrs_clt_create_sysfs_root_files(clt);
2753 if (err) {
2754 kobject_del(clt->kobj_paths);
2755 kobject_put(clt->kobj_paths);
2756 goto err_del;
2757 }
2758 dev_set_uevent_suppress(&clt->dev, false);
2759 kobject_uevent(&clt->dev.kobj, KOBJ_ADD);
2760
2761 return clt;
2762 err_del:
2763 device_del(&clt->dev);
2764 err_put:
2765 free_percpu(clt->pcpu_path);
2766 put_device(&clt->dev);
2767 return ERR_PTR(err);
2768 }
2769
2770 static void free_clt(struct rtrs_clt_sess *clt)
2771 {
2772 free_percpu(clt->pcpu_path);
2773
2774 /*
2775  * The final put ends up in rtrs_clt_dev_release(), which frees clt.
2776  */
2777 device_unregister(&clt->dev);
2778 }
2779
2780 /**
2781  * rtrs_clt_open() - Open a session to an RTRS server
2782  * @ops: link event callback and private pointer of the upper layer
2783  * @pathname: name of the session, must not contain '/' or '.'
2784  * @paths: array of source/destination address pairs, one entry per path
2785  * @paths_num: number of elements in @paths
2786  * @port: server port to connect to
2787  * @pdu_sz: size of the per-permit payload area requested by the caller
2788  * @reconnect_delay_sec: delay between automatic reconnect attempts
2789  * @max_reconnect_attempts: number of times a failed path is retried before
2790  *                          it is closed for good
2791  * @nr_poll_queues: number of extra connections reserved for direct CQ polling
2792  *
2793  * Establishes all paths and performs the info handshake on each of them;
2794  * the call blocks until every path is either established or has failed.
2795  *
2796  * Return: session handle on success, ERR_PTR() on error.
2797  */
2798 struct rtrs_clt_sess *rtrs_clt_open(struct rtrs_clt_ops *ops,
2799 const char *pathname,
2800 const struct rtrs_addr *paths,
2801 size_t paths_num, u16 port,
2802 size_t pdu_sz, u8 reconnect_delay_sec,
2803 s16 max_reconnect_attempts, u32 nr_poll_queues)
2804 {
2805 struct rtrs_clt_path *clt_path, *tmp;
2806 struct rtrs_clt_sess *clt;
2807 int err, i;
2808
2809 if (strchr(pathname, '/') || strchr(pathname, '.')) {
2810 pr_err("pathname cannot contain / and .\n");
2811 err = -EINVAL;
2812 goto out;
2813 }
2814
2815 clt = alloc_clt(pathname, paths_num, port, pdu_sz, ops->priv,
2816 ops->link_ev,
2817 reconnect_delay_sec,
2818 max_reconnect_attempts);
2819 if (IS_ERR(clt)) {
2820 err = PTR_ERR(clt);
2821 goto out;
2822 }
2823 for (i = 0; i < paths_num; i++) {
2824 struct rtrs_clt_path *clt_path;
2825
2826 clt_path = alloc_path(clt, &paths[i], nr_cpu_ids,
2827 nr_poll_queues);
2828 if (IS_ERR(clt_path)) {
2829 err = PTR_ERR(clt_path);
2830 goto close_all_path;
2831 }
2832 if (!i)
2833 clt_path->for_new_clt = 1;
2834 list_add_tail_rcu(&clt_path->s.entry, &clt->paths_list);
2835
2836 err = init_path(clt_path);
2837 if (err) {
2838 list_del_rcu(&clt_path->s.entry);
2839 rtrs_clt_close_conns(clt_path, true);
2840 free_percpu(clt_path->stats->pcpu_stats);
2841 kfree(clt_path->stats);
2842 free_path(clt_path);
2843 goto close_all_path;
2844 }
2845
2846 err = rtrs_clt_create_path_files(clt_path);
2847 if (err) {
2848 list_del_rcu(&clt_path->s.entry);
2849 rtrs_clt_close_conns(clt_path, true);
2850 free_percpu(clt_path->stats->pcpu_stats);
2851 kfree(clt_path->stats);
2852 free_path(clt_path);
2853 goto close_all_path;
2854 }
2855 }
2856 err = alloc_permits(clt);
2857 if (err)
2858 goto close_all_path;
2859
2860 return clt;
2861
2862 close_all_path:
2863 list_for_each_entry_safe(clt_path, tmp, &clt->paths_list, s.entry) {
2864 rtrs_clt_destroy_path_files(clt_path, NULL);
2865 rtrs_clt_close_conns(clt_path, true);
2866 kobject_put(&clt_path->kobj);
2867 }
2868 rtrs_clt_destroy_sysfs_root(clt);
2869 free_clt(clt);
2870
2871 out:
2872 return ERR_PTR(err);
2873 }
2874 EXPORT_SYMBOL(rtrs_clt_open);
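/*
 * Usage sketch (illustrative only; my_dev, my_link_ev, my_iu, MY_PORT,
 * paths and path_cnt are hypothetical caller-side names, not part of this
 * driver):
 *
 *	struct rtrs_clt_ops ops = {
 *		.priv	 = my_dev,
 *		.link_ev = my_link_ev,
 *	};
 *	struct rtrs_clt_sess *sess;
 *
 *	sess = rtrs_clt_open(&ops, "my_session", paths, path_cnt, MY_PORT,
 *			     sizeof(struct my_iu), 30, 10, 0);
 *	if (IS_ERR(sess))
 *		return PTR_ERR(sess);
 *	...
 *	rtrs_clt_close(sess);
 */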
2875
2876 /**
2877  * rtrs_clt_close() - Close a session and free all of its paths
2878  * @clt: session handle, must not be used after this call
2879  */
2880 void rtrs_clt_close(struct rtrs_clt_sess *clt)
2881 {
2882 struct rtrs_clt_path *clt_path, *tmp;
2883
2884 /* Tear down sysfs first so user space can no longer poke the session. */
2885 rtrs_clt_destroy_sysfs_root(clt);
2886
2887 /* Now it is safe to iterate over all paths without locks. */
2888 list_for_each_entry_safe(clt_path, tmp, &clt->paths_list, s.entry) {
2889 rtrs_clt_close_conns(clt_path, true);
2890 rtrs_clt_destroy_path_files(clt_path, NULL);
2891 kobject_put(&clt_path->kobj);
2892 }
2893 free_permits(clt);
2894 free_clt(clt);
2895 }
2896 EXPORT_SYMBOL(rtrs_clt_close);
2897
2898 int rtrs_clt_reconnect_from_sysfs(struct rtrs_clt_path *clt_path)
2899 {
2900 enum rtrs_clt_state old_state;
2901 int err = -EBUSY;
2902 bool changed;
2903
2904 changed = rtrs_clt_change_state_get_old(clt_path,
2905 RTRS_CLT_RECONNECTING,
2906 &old_state);
2907 if (changed) {
2908 clt_path->reconnect_attempts = 0;
2909 rtrs_clt_stop_and_destroy_conns(clt_path);
2910 queue_delayed_work(rtrs_wq, &clt_path->reconnect_dwork, 0);
2911 }
2912 if (changed || old_state == RTRS_CLT_RECONNECTING) {
2913 /*
2914  * flush_delayed_work() runs any pending reconnect work right away,
2915  * so after the flush the state check below reflects the outcome of
2916  * the reconnect that was just triggered or was already in flight.
2917  */
2918 flush_delayed_work(&clt_path->reconnect_dwork);
2919 err = (READ_ONCE(clt_path->state) ==
2920 RTRS_CLT_CONNECTED ? 0 : -ENOTCONN);
2921 }
2922
2923 return err;
2924 }
2925
2926 int rtrs_clt_remove_path_from_sysfs(struct rtrs_clt_path *clt_path,
2927 const struct attribute *sysfs_self)
2928 {
2929 enum rtrs_clt_state old_state;
2930 bool changed;
2931
2932 /*
2933  * Keep closing the connections until the state is either changed to
2934  * DEAD here or observed as DEAD:
2935  *  1. changed to DEAD - nobody slipped in a reconnect in between,
2936  *     which could have started connecting again;
2937  *  2. observed as DEAD - somebody else is removing this path in
2938  *     parallel, let them finish.
2939  */
2941 do {
2942 rtrs_clt_close_conns(clt_path, true);
2943 changed = rtrs_clt_change_state_get_old(clt_path,
2944 RTRS_CLT_DEAD,
2945 &old_state);
2946 } while (!changed && old_state != RTRS_CLT_DEAD);
2947
2948 if (changed) {
2949 rtrs_clt_remove_path_from_arr(clt_path);
2950 rtrs_clt_destroy_path_files(clt_path, sysfs_self);
2951 kobject_put(&clt_path->kobj);
2952 }
2953
2954 return 0;
2955 }
2956
2957 void rtrs_clt_set_max_reconnect_attempts(struct rtrs_clt_sess *clt, int value)
2958 {
2959 clt->max_reconnect_attempts = (unsigned int)value;
2960 }
2961
2962 int rtrs_clt_get_max_reconnect_attempts(const struct rtrs_clt_sess *clt)
2963 {
2964 return (int)clt->max_reconnect_attempts;
2965 }
2966
2967 /**
2968  * rtrs_clt_request() - Request a data transfer to/from the server via RDMA
2969  * @dir: READ or WRITE
2970  * @ops: confirmation callback (conf_fn) and its private pointer
2971  * @clt: session handle
2972  * @permit: permit previously obtained with rtrs_clt_get_permit()
2973  * @vec: small user message that is sent to the server with the request
2974  * @nr: number of elements in @vec
2975  * @data_len: length of the data described by @sg
2976  * @sg: scatterlist with the data pages to be sent or received
2977  * @sg_cnt: number of elements in @sg
2978  *
2979  * Picks a connected path according to the multipath policy and posts the
2980  * request on it; if posting fails on that path, the next one is tried.
2981  * The transfer completes asynchronously and ops->conf_fn() is called with
2982  * ops->priv and the result once it is done.
2983  *
2984  * Return: 0 if the request was posted, -ECONNABORTED if no connected path
2985  * is available, -EMSGSIZE if the user message plus header does not fit,
2986  * otherwise the error of the last posting attempt.
2987  */
2991 int rtrs_clt_request(int dir, struct rtrs_clt_req_ops *ops,
2992 struct rtrs_clt_sess *clt, struct rtrs_permit *permit,
2993 const struct kvec *vec, size_t nr, size_t data_len,
2994 struct scatterlist *sg, unsigned int sg_cnt)
2995 {
2996 struct rtrs_clt_io_req *req;
2997 struct rtrs_clt_path *clt_path;
2998
2999 enum dma_data_direction dma_dir;
3000 int err = -ECONNABORTED, i;
3001 size_t usr_len, hdr_len;
3002 struct path_it it;
3003
3004 /* Sum up the total length of the user message in @vec. */
3005 for (i = 0, usr_len = 0; i < nr; i++)
3006 usr_len += vec[i].iov_len;
3007
3008 if (dir == READ) {
3009 hdr_len = sizeof(struct rtrs_msg_rdma_read) +
3010 sg_cnt * sizeof(struct rtrs_sg_desc);
3011 dma_dir = DMA_FROM_DEVICE;
3012 } else {
3013 hdr_len = sizeof(struct rtrs_msg_rdma_write);
3014 dma_dir = DMA_TO_DEVICE;
3015 }
3016
3017 rcu_read_lock();
3018 for (path_it_init(&it, clt);
3019 (clt_path = it.next_path(&it)) && it.i < it.clt->paths_num; it.i++) {
3020 if (READ_ONCE(clt_path->state) != RTRS_CLT_CONNECTED)
3021 continue;
3022
3023 if (usr_len + hdr_len > clt_path->max_hdr_size) {
3024 rtrs_wrn_rl(clt_path->clt,
3025 "%s request failed, user message size is %zu and header length %zu, but max size is %u\n",
3026 dir == READ ? "Read" : "Write",
3027 usr_len, hdr_len, clt_path->max_hdr_size);
3028 err = -EMSGSIZE;
3029 break;
3030 }
3031 req = rtrs_clt_get_req(clt_path, ops->conf_fn, permit, ops->priv,
3032 vec, usr_len, sg, sg_cnt, data_len,
3033 dma_dir);
3034 if (dir == READ)
3035 err = rtrs_clt_read_req(req);
3036 else
3037 err = rtrs_clt_write_req(req);
3038 if (err) {
3039 req->in_use = false;
3040 continue;
3041 }
3042 /* Successfully posted on this path, stop iterating. */
3043 break;
3044 }
3045 path_it_deinit(&it);
3046 rcu_read_unlock();
3047
3048 return err;
3049 }
3050 EXPORT_SYMBOL(rtrs_clt_request);
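/*
 * Usage sketch (illustrative only; sess, my_req, my_done, data_len, sgl and
 * sg_cnt are hypothetical caller-side names).  my_done(priv, errno) is
 * invoked on completion; on success the permit is put from there.
 *
 *	struct rtrs_clt_req_ops req_ops = {
 *		.priv	 = my_req,
 *		.conf_fn = my_done,
 *	};
 *	struct kvec vec = {
 *		.iov_base = &my_req->msg,
 *		.iov_len  = sizeof(my_req->msg),
 *	};
 *	struct rtrs_permit *permit;
 *	int err;
 *
 *	permit = rtrs_clt_get_permit(sess, RTRS_IO_CON, RTRS_PERMIT_WAIT);
 *	err = rtrs_clt_request(WRITE, &req_ops, sess, permit, &vec, 1,
 *			       data_len, sgl, sg_cnt);
 *	if (err)
 *		rtrs_clt_put_permit(sess, permit);
 */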
3051
3052 int rtrs_clt_rdma_cq_direct(struct rtrs_clt_sess *clt, unsigned int index)
3053 {
3054 /* Default to -1 when no connected path is found. */
3055 int cnt = -1;
3056 struct rtrs_con *con;
3057 struct rtrs_clt_path *clt_path;
3058 struct path_it it;
3059
3060 rcu_read_lock();
3061 for (path_it_init(&it, clt);
3062 (clt_path = it.next_path(&it)) && it.i < it.clt->paths_num; it.i++) {
3063 if (READ_ONCE(clt_path->state) != RTRS_CLT_CONNECTED)
3064 continue;
3065
3066 con = clt_path->s.con[index + 1];
3067 cnt = ib_process_cq_direct(con->cq, -1);
3068 if (cnt)
3069 break;
3070 }
3071 path_it_deinit(&it);
3072 rcu_read_unlock();
3073
3074 return cnt;
3075 }
3076 EXPORT_SYMBOL(rtrs_clt_rdma_cq_direct);
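/*
 * Note: the connection at cid 0 is the user connection, hence index + 1
 * above.  The caller is expected to pass an index of one of the
 * connections it reserved for polling via nr_poll_queues in
 * rtrs_clt_open().
 */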
3077
3078 /**
3079  * rtrs_clt_query() - Query session attributes
3080  * @clt: session handle
3081  * @attr: filled with queue depth, max segments and max IO size
3082  *
3083  * Return: 0 on success, -ECOMM if no path of the session is currently
3084  * connected.
3085  */
3086 int rtrs_clt_query(struct rtrs_clt_sess *clt, struct rtrs_attrs *attr)
3087 {
3088 if (!rtrs_clt_is_connected(clt))
3089 return -ECOMM;
3090
3091 attr->queue_depth = clt->queue_depth;
3092 attr->max_segments = clt->max_segments;
3093
3094 attr->max_io_size = min_t(int, clt->max_io_size,
3095 clt->max_segments * SZ_4K);
3096
3097 return 0;
3098 }
3099 EXPORT_SYMBOL(rtrs_clt_query);
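/*
 * A typical caller (an assumption, not mandated by this file) queries the
 * attributes right after rtrs_clt_open() succeeds and sizes its request
 * queues from attr.queue_depth and attr.max_io_size.
 */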
3100
3101 int rtrs_clt_create_path_from_sysfs(struct rtrs_clt_sess *clt,
3102 struct rtrs_addr *addr)
3103 {
3104 struct rtrs_clt_path *clt_path;
3105 int err;
3106
3107 clt_path = alloc_path(clt, addr, nr_cpu_ids, 0);
3108 if (IS_ERR(clt_path))
3109 return PTR_ERR(clt_path);
3110
3111 mutex_lock(&clt->paths_mutex);
3112 if (clt->paths_num == 0) {
3113 /*
3114  * All paths of this session were removed earlier, so the first path
3115  * added again is announced to the server as if it belonged to a new
3116  * session.
3117  */
3118 clt_path->for_new_clt = 1;
3119 }
3120
3121 mutex_unlock(&clt->paths_mutex);
3122
3123 /*
3124  * It is safe to add the path while it is still CONNECTING: IO only
3125  * ever picks CONNECTED paths.  It must be added before init_path(),
3126  * which raises the link event once the path comes up.
3127  */
3128 rtrs_clt_add_path_to_arr(clt_path);
3129
3130 err = init_path(clt_path);
3131 if (err)
3132 goto close_path;
3133
3134 err = rtrs_clt_create_path_files(clt_path);
3135 if (err)
3136 goto close_path;
3137
3138 return 0;
3139
3140 close_path:
3141 rtrs_clt_remove_path_from_arr(clt_path);
3142 rtrs_clt_close_conns(clt_path, true);
3143 free_percpu(clt_path->stats->pcpu_stats);
3144 kfree(clt_path->stats);
3145 free_path(clt_path);
3146
3147 return err;
3148 }
3149
3150 static int rtrs_clt_ib_dev_init(struct rtrs_ib_dev *dev)
3151 {
3152 if (!(dev->ib_dev->attrs.device_cap_flags &
3153 IB_DEVICE_MEM_MGT_EXTENSIONS)) {
3154 pr_err("Memory registrations not supported.\n");
3155 return -ENOTSUPP;
3156 }
3157
3158 return 0;
3159 }
3160
3161 static const struct rtrs_rdma_dev_pd_ops dev_pd_ops = {
3162 .init = rtrs_clt_ib_dev_init
3163 };
3164
3165 static int __init rtrs_client_init(void)
3166 {
3167 rtrs_rdma_dev_pd_init(0, &dev_pd);
3168
3169 rtrs_clt_dev_class = class_create(THIS_MODULE, "rtrs-client");
3170 if (IS_ERR(rtrs_clt_dev_class)) {
3171 pr_err("Failed to create rtrs-client dev class\n");
3172 return PTR_ERR(rtrs_clt_dev_class);
3173 }
3174 rtrs_wq = alloc_workqueue("rtrs_client_wq", 0, 0);
3175 if (!rtrs_wq) {
3176 class_destroy(rtrs_clt_dev_class);
3177 return -ENOMEM;
3178 }
3179
3180 return 0;
3181 }
3182
3183 static void __exit rtrs_client_exit(void)
3184 {
3185 destroy_workqueue(rtrs_wq);
3186 class_destroy(rtrs_clt_dev_class);
3187 rtrs_rdma_dev_pd_deinit(&dev_pd);
3188 }
3189
3190 module_init(rtrs_client_init);
3191 module_exit(rtrs_client_exit);