0033 #include <rdma/ib_umem.h>
0034 #include <rdma/ib_umem_odp.h>
0035 #include <linux/kernel.h>
0036 #include <linux/dma-buf.h>
0037 #include <linux/dma-resv.h>
0038
0039 #include "mlx5_ib.h"
0040 #include "cmd.h"
0041 #include "umr.h"
0042 #include "qp.h"
0043
0044 #include <linux/mlx5/eq.h>
0045
0046
0047 struct mlx5_pagefault {
0048 u32 bytes_committed;
0049 u32 token;
0050 u8 event_subtype;
0051 u8 type;
0052 union {
/* Initiator or send message responder pagefault details. */
0054 struct {
/* Received packet size, only valid for responders. */
0056 u32 packet_size;
/*
 * Number of resource holding WQE, depends on type.
 */
0060 u32 wq_num;
/*
 * WQE index. Refers to either the send queue or
 * receive queue, according to event_subtype.
 */
0065 u16 wqe_index;
0066 } wqe;
/* RDMA responder pagefault details */
0068 struct {
0069 u32 r_key;
/*
 * Received packet size, minimal size page fault
 * resolution required for forward progress.
 */
0074 u32 packet_size;
0075 u32 rdma_op_len;
0076 u64 rdma_va;
0077 } rdma;
0078 };
0079
0080 struct mlx5_ib_pf_eq *eq;
0081 struct work_struct work;
0082 };
0083
0084 #define MAX_PREFETCH_LEN (4*1024*1024U)
0085
/* Timeout in ms to wait for an active mmu notifier to complete when handling
 * a pagefault. */
0088 #define MMU_NOTIFIER_TIMEOUT 1000
0089
0090 #define MLX5_IMR_MTT_BITS (30 - PAGE_SHIFT)
0091 #define MLX5_IMR_MTT_SHIFT (MLX5_IMR_MTT_BITS + PAGE_SHIFT)
0092 #define MLX5_IMR_MTT_ENTRIES BIT_ULL(MLX5_IMR_MTT_BITS)
0093 #define MLX5_IMR_MTT_SIZE BIT_ULL(MLX5_IMR_MTT_SHIFT)
0094 #define MLX5_IMR_MTT_MASK (~(MLX5_IMR_MTT_SIZE - 1))
0095
0096 #define MLX5_KSM_PAGE_SHIFT MLX5_IMR_MTT_SHIFT
0097
0098 static u64 mlx5_imr_ksm_entries;
0099
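/*
 * Fill an XLT chunk with KLM entries for an implicit (parent) MR: slots with
 * a populated child MR point at that child's mkey, empty slots point at the
 * device's null_mkey.
 */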
0100 static void populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries,
0101 struct mlx5_ib_mr *imr, int flags)
0102 {
0103 struct mlx5_klm *end = pklm + nentries;
0104
0105 if (flags & MLX5_IB_UPD_XLT_ZAP) {
0106 for (; pklm != end; pklm++, idx++) {
0107 pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE);
0108 pklm->key = cpu_to_be32(mr_to_mdev(imr)->null_mkey);
0109 pklm->va = 0;
0110 }
0111 return;
0112 }
0113
/*
 * The locking here is pretty subtle. Ideally the implicit_children
 * xarray would be protected by the umem_mutex, however that is not
 * possible. Instead this uses a weaker update-then-lock pattern:
 *
 *    xa_store()
 *    mutex_lock(umem_mutex)
 *     mlx5r_umr_update_xlt()
 *    mutex_unlock(umem_mutex)
 *    destroy lkey
 *
 * ie any change to the xarray must be followed by the locked update_xlt
 * before destroying.
 *
 * The umem_mutex provides the acquire/release semantic needed to make
 * the xa_store() visible to a racing thread.
 */
0131 lockdep_assert_held(&to_ib_umem_odp(imr->umem)->umem_mutex);
0132
0133 for (; pklm != end; pklm++, idx++) {
0134 struct mlx5_ib_mr *mtt = xa_load(&imr->implicit_children, idx);
0135
0136 pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE);
0137 if (mtt) {
0138 pklm->key = cpu_to_be32(mtt->ibmr.lkey);
0139 pklm->va = cpu_to_be64(idx * MLX5_IMR_MTT_SIZE);
0140 } else {
0141 pklm->key = cpu_to_be32(mr_to_mdev(imr)->null_mkey);
0142 pklm->va = 0;
0143 }
0144 }
0145 }
0146
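/*
 * Translate an ODP umem DMA address, including its read/write permission
 * bits, into the corresponding MTT entry format.
 */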
0147 static u64 umem_dma_to_mtt(dma_addr_t umem_dma)
0148 {
0149 u64 mtt_entry = umem_dma & ODP_DMA_ADDR_MASK;
0150
0151 if (umem_dma & ODP_READ_ALLOWED_BIT)
0152 mtt_entry |= MLX5_IB_MTT_READ;
0153 if (umem_dma & ODP_WRITE_ALLOWED_BIT)
0154 mtt_entry |= MLX5_IB_MTT_WRITE;
0155
0156 return mtt_entry;
0157 }
0158
0159 static void populate_mtt(__be64 *pas, size_t idx, size_t nentries,
0160 struct mlx5_ib_mr *mr, int flags)
0161 {
0162 struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
0163 dma_addr_t pa;
0164 size_t i;
0165
0166 if (flags & MLX5_IB_UPD_XLT_ZAP)
0167 return;
0168
0169 for (i = 0; i < nentries; i++) {
0170 pa = odp->dma_list[idx + i];
0171 pas[i] = cpu_to_be64(umem_dma_to_mtt(pa));
0172 }
0173 }
0174
0175 void mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries,
0176 struct mlx5_ib_mr *mr, int flags)
0177 {
0178 if (flags & MLX5_IB_UPD_XLT_INDIRECT) {
0179 populate_klm(xlt, idx, nentries, mr, flags);
0180 } else {
0181 populate_mtt(xlt, idx, nentries, mr, flags);
0182 }
0183 }
0184
/*
 * This must be called after the mr has been removed from implicit_children.
 * NOTE: The MR does not necessarily have to be
 * empty here, parallel page faults could have raced with the free process and
 * added pages to it.
 */
0191 static void free_implicit_child_mr_work(struct work_struct *work)
0192 {
0193 struct mlx5_ib_mr *mr =
0194 container_of(work, struct mlx5_ib_mr, odp_destroy.work);
0195 struct mlx5_ib_mr *imr = mr->parent;
0196 struct ib_umem_odp *odp_imr = to_ib_umem_odp(imr->umem);
0197 struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
0198
0199 mlx5r_deref_wait_odp_mkey(&mr->mmkey);
0200
0201 mutex_lock(&odp_imr->umem_mutex);
0202 mlx5r_umr_update_xlt(mr->parent,
0203 ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT, 1, 0,
0204 MLX5_IB_UPD_XLT_INDIRECT | MLX5_IB_UPD_XLT_ATOMIC);
0205 mutex_unlock(&odp_imr->umem_mutex);
0206 mlx5_ib_dereg_mr(&mr->ibmr, NULL);
0207
0208 mlx5r_deref_odp_mkey(&imr->mmkey);
0209 }
0210
0211 static void destroy_unused_implicit_child_mr(struct mlx5_ib_mr *mr)
0212 {
0213 struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
0214 unsigned long idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT;
0215 struct mlx5_ib_mr *imr = mr->parent;
0216
0217 if (!refcount_inc_not_zero(&imr->mmkey.usecount))
0218 return;
0219
0220 xa_erase(&imr->implicit_children, idx);
0221
/* Freeing a MR is a sleeping operation, so bounce to a work queue */
0223 INIT_WORK(&mr->odp_destroy.work, free_implicit_child_mr_work);
0224 queue_work(system_unbound_wq, &mr->odp_destroy.work);
0225 }
0226
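/*
 * mmu interval notifier callback: zap the HW MTTs covering the invalidated
 * range and unmap the DMA pages, so the device page-faults again on the next
 * access to this range.
 */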
0227 static bool mlx5_ib_invalidate_range(struct mmu_interval_notifier *mni,
0228 const struct mmu_notifier_range *range,
0229 unsigned long cur_seq)
0230 {
0231 struct ib_umem_odp *umem_odp =
0232 container_of(mni, struct ib_umem_odp, notifier);
0233 struct mlx5_ib_mr *mr;
0234 const u64 umr_block_mask = (MLX5_UMR_MTT_ALIGNMENT /
0235 sizeof(struct mlx5_mtt)) - 1;
0236 u64 idx = 0, blk_start_idx = 0;
0237 u64 invalidations = 0;
0238 unsigned long start;
0239 unsigned long end;
0240 int in_block = 0;
0241 u64 addr;
0242
0243 if (!mmu_notifier_range_blockable(range))
0244 return false;
0245
0246 mutex_lock(&umem_odp->umem_mutex);
0247 mmu_interval_set_seq(mni, cur_seq);
0248
/*
 * If npages is zero then umem_odp->private may not be setup yet. This
 * does not complete until after the first page fault.
 */
0252 if (!umem_odp->npages)
0253 goto out;
0254 mr = umem_odp->private;
0255
0256 start = max_t(u64, ib_umem_start(umem_odp), range->start);
0257 end = min_t(u64, ib_umem_end(umem_odp), range->end);
0258
/*
 * Iteration one - zap the HW's MTTs. The notifiers_count ensures that
 * while we are doing the invalidation, no page fault will attempt to
 * overwrite the same MTTs. Concurrent invalidations might race us,
 * but they will write 0s as well, so no difference in the end result.
 */
0265 for (addr = start; addr < end; addr += BIT(umem_odp->page_shift)) {
0266 idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift;
/*
 * Strive to write the MTTs in chunks, but avoid overwriting
 * non-existing MTTs. The heuristic here can be improved to
 * estimate the cost of another UMR vs. the cost of a bigger
 * UMR.
 */
0273 if (umem_odp->dma_list[idx] &
0274 (ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) {
0275 if (!in_block) {
0276 blk_start_idx = idx;
0277 in_block = 1;
0278 }
0279
/* Count page invalidations */
0281 invalidations += idx - blk_start_idx + 1;
0282 } else {
0283 u64 umr_offset = idx & umr_block_mask;
0284
0285 if (in_block && umr_offset == 0) {
0286 mlx5r_umr_update_xlt(mr, blk_start_idx,
0287 idx - blk_start_idx, 0,
0288 MLX5_IB_UPD_XLT_ZAP |
0289 MLX5_IB_UPD_XLT_ATOMIC);
0290 in_block = 0;
0291 }
0292 }
0293 }
0294 if (in_block)
0295 mlx5r_umr_update_xlt(mr, blk_start_idx,
0296 idx - blk_start_idx + 1, 0,
0297 MLX5_IB_UPD_XLT_ZAP |
0298 MLX5_IB_UPD_XLT_ATOMIC);
0299
0300 mlx5_update_odp_stats(mr, invalidations, invalidations);
0301
/*
 * We are now sure that the device will not access the
 * memory. We can safely unmap it, and mark it as dirty if
 * needed.
 */
0307
0308 ib_umem_odp_unmap_dma_pages(umem_odp, start, end);
0309
0310 if (unlikely(!umem_odp->npages && mr->parent))
0311 destroy_unused_implicit_child_mr(mr);
0312 out:
0313 mutex_unlock(&umem_odp->umem_mutex);
0314 return true;
0315 }
0316
0317 const struct mmu_interval_notifier_ops mlx5_mn_ops = {
0318 .invalidate = mlx5_ib_invalidate_range,
0319 };
0320
0321 static void internal_fill_odp_caps(struct mlx5_ib_dev *dev)
0322 {
0323 struct ib_odp_caps *caps = &dev->odp_caps;
0324
0325 memset(caps, 0, sizeof(*caps));
0326
0327 if (!MLX5_CAP_GEN(dev->mdev, pg) || !mlx5r_umr_can_load_pas(dev, 0))
0328 return;
0329
0330 caps->general_caps = IB_ODP_SUPPORT;
0331
0332 if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
0333 dev->odp_max_size = U64_MAX;
0334 else
0335 dev->odp_max_size = BIT_ULL(MLX5_MAX_UMR_SHIFT + PAGE_SHIFT);
0336
0337 if (MLX5_CAP_ODP(dev->mdev, ud_odp_caps.send))
0338 caps->per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SEND;
0339
0340 if (MLX5_CAP_ODP(dev->mdev, ud_odp_caps.srq_receive))
0341 caps->per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV;
0342
0343 if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.send))
0344 caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SEND;
0345
0346 if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.receive))
0347 caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_RECV;
0348
0349 if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.write))
0350 caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_WRITE;
0351
0352 if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.read))
0353 caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_READ;
0354
0355 if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.atomic))
0356 caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_ATOMIC;
0357
0358 if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.srq_receive))
0359 caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV;
0360
0361 if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.send))
0362 caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_SEND;
0363
0364 if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.receive))
0365 caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_RECV;
0366
0367 if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.write))
0368 caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_WRITE;
0369
0370 if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.read))
0371 caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_READ;
0372
0373 if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.atomic))
0374 caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_ATOMIC;
0375
0376 if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.srq_receive))
0377 caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV;
0378
0379 if (MLX5_CAP_GEN(dev->mdev, fixed_buffer_size) &&
0380 MLX5_CAP_GEN(dev->mdev, null_mkey) &&
0381 MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset) &&
0382 !MLX5_CAP_GEN(dev->mdev, umr_indirect_mkey_disabled))
0383 caps->general_caps |= IB_ODP_SUPPORT_IMPLICIT;
0384 }
0385
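/*
 * Tell the device to resume processing of the WQ that triggered the page
 * fault, or to complete it with an error if resolution failed.
 */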
0386 static void mlx5_ib_page_fault_resume(struct mlx5_ib_dev *dev,
0387 struct mlx5_pagefault *pfault,
0388 int error)
0389 {
0390 int wq_num = pfault->event_subtype == MLX5_PFAULT_SUBTYPE_WQE ?
0391 pfault->wqe.wq_num : pfault->token;
0392 u32 in[MLX5_ST_SZ_DW(page_fault_resume_in)] = {};
0393 int err;
0394
0395 MLX5_SET(page_fault_resume_in, in, opcode, MLX5_CMD_OP_PAGE_FAULT_RESUME);
0396 MLX5_SET(page_fault_resume_in, in, page_fault_type, pfault->type);
0397 MLX5_SET(page_fault_resume_in, in, token, pfault->token);
0398 MLX5_SET(page_fault_resume_in, in, wq_number, wq_num);
0399 MLX5_SET(page_fault_resume_in, in, error, !!error);
0400
0401 err = mlx5_cmd_exec_in(dev->mdev, page_fault_resume, in);
0402 if (err)
0403 mlx5_ib_err(dev, "Failed to resolve the page fault on WQ 0x%x err %d\n",
0404 wq_num, err);
0405 }
0406
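/*
 * Create the child MTT MR backing one MLX5_IMR_MTT_SIZE slot of an implicit
 * ODP MR and install it in the implicit_children xarray, or return the MR
 * another thread already installed. The returned MR has an elevated
 * mkey usecount that the caller must drop.
 */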
0407 static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
0408 unsigned long idx)
0409 {
0410 struct mlx5_ib_dev *dev = mr_to_mdev(imr);
0411 struct ib_umem_odp *odp;
0412 struct mlx5_ib_mr *mr;
0413 struct mlx5_ib_mr *ret;
0414 int err;
0415
0416 odp = ib_umem_odp_alloc_child(to_ib_umem_odp(imr->umem),
0417 idx * MLX5_IMR_MTT_SIZE,
0418 MLX5_IMR_MTT_SIZE, &mlx5_mn_ops);
0419 if (IS_ERR(odp))
0420 return ERR_CAST(odp);
0421
0422 mr = mlx5_mr_cache_alloc(dev, &dev->cache.ent[MLX5_IMR_MTT_CACHE_ENTRY],
0423 imr->access_flags);
0424 if (IS_ERR(mr)) {
0425 ib_umem_odp_release(odp);
0426 return mr;
0427 }
0428
0429 mr->access_flags = imr->access_flags;
0430 mr->ibmr.pd = imr->ibmr.pd;
0431 mr->ibmr.device = &mr_to_mdev(imr)->ib_dev;
0432 mr->umem = &odp->umem;
0433 mr->ibmr.lkey = mr->mmkey.key;
0434 mr->ibmr.rkey = mr->mmkey.key;
0435 mr->ibmr.iova = idx * MLX5_IMR_MTT_SIZE;
0436 mr->parent = imr;
0437 odp->private = mr;
0438
/*
 * The first refcount is owned by the xarray and the second refcount
 * is returned to the caller.
 */
0443 refcount_set(&mr->mmkey.usecount, 2);
0444
0445 err = mlx5r_umr_update_xlt(mr, 0,
0446 MLX5_IMR_MTT_ENTRIES,
0447 PAGE_SHIFT,
0448 MLX5_IB_UPD_XLT_ZAP |
0449 MLX5_IB_UPD_XLT_ENABLE);
0450 if (err) {
0451 ret = ERR_PTR(err);
0452 goto out_mr;
0453 }
0454
0455 xa_lock(&imr->implicit_children);
0456 ret = __xa_cmpxchg(&imr->implicit_children, idx, NULL, mr,
0457 GFP_KERNEL);
0458 if (unlikely(ret)) {
0459 if (xa_is_err(ret)) {
0460 ret = ERR_PTR(xa_err(ret));
0461 goto out_lock;
0462 }
/*
 * Another thread beat us to creating the child mr, use
 * theirs.
 */
0467 refcount_inc(&ret->mmkey.usecount);
0468 goto out_lock;
0469 }
0470 xa_unlock(&imr->implicit_children);
0471
0472 mlx5_ib_dbg(mr_to_mdev(imr), "key %x mr %p\n", mr->mmkey.key, mr);
0473 return mr;
0474
0475 out_lock:
0476 xa_unlock(&imr->implicit_children);
0477 out_mr:
0478 mlx5_ib_dereg_mr(&mr->ibmr, NULL);
0479 return ret;
0480 }
0481
0482 struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
0483 int access_flags)
0484 {
0485 struct mlx5_ib_dev *dev = to_mdev(pd->ibpd.device);
0486 struct ib_umem_odp *umem_odp;
0487 struct mlx5_ib_mr *imr;
0488 int err;
0489
0490 if (!mlx5r_umr_can_load_pas(dev, MLX5_IMR_MTT_ENTRIES * PAGE_SIZE))
0491 return ERR_PTR(-EOPNOTSUPP);
0492
0493 umem_odp = ib_umem_odp_alloc_implicit(&dev->ib_dev, access_flags);
0494 if (IS_ERR(umem_odp))
0495 return ERR_CAST(umem_odp);
0496
0497 imr = mlx5_mr_cache_alloc(dev,
0498 &dev->cache.ent[MLX5_IMR_KSM_CACHE_ENTRY],
0499 access_flags);
0500 if (IS_ERR(imr)) {
0501 ib_umem_odp_release(umem_odp);
0502 return imr;
0503 }
0504
0505 imr->access_flags = access_flags;
0506 imr->ibmr.pd = &pd->ibpd;
0507 imr->ibmr.iova = 0;
0508 imr->umem = &umem_odp->umem;
0509 imr->ibmr.lkey = imr->mmkey.key;
0510 imr->ibmr.rkey = imr->mmkey.key;
0511 imr->ibmr.device = &dev->ib_dev;
0512 imr->is_odp_implicit = true;
0513 xa_init(&imr->implicit_children);
0514
0515 err = mlx5r_umr_update_xlt(imr, 0,
0516 mlx5_imr_ksm_entries,
0517 MLX5_KSM_PAGE_SHIFT,
0518 MLX5_IB_UPD_XLT_INDIRECT |
0519 MLX5_IB_UPD_XLT_ZAP |
0520 MLX5_IB_UPD_XLT_ENABLE);
0521 if (err)
0522 goto out_mr;
0523
0524 err = mlx5r_store_odp_mkey(dev, &imr->mmkey);
0525 if (err)
0526 goto out_mr;
0527
0528 mlx5_ib_dbg(dev, "key %x mr %p\n", imr->mmkey.key, imr);
0529 return imr;
0530 out_mr:
0531 mlx5_ib_err(dev, "Failed to register MKEY %d\n", err);
0532 mlx5_ib_dereg_mr(&imr->ibmr, NULL);
0533 return ERR_PTR(err);
0534 }
0535
0536 void mlx5_ib_free_odp_mr(struct mlx5_ib_mr *mr)
0537 {
0538 struct mlx5_ib_mr *mtt;
0539 unsigned long idx;
0540
/*
 * If this is an implicit MR it is already invalidated so we can just
 * delete the children mkeys.
 */
0545 xa_for_each(&mr->implicit_children, idx, mtt) {
0546 xa_erase(&mr->implicit_children, idx);
0547 mlx5_ib_dereg_mr(&mtt->ibmr, NULL);
0548 }
0549 }
0550
0551 #define MLX5_PF_FLAGS_DOWNGRADE BIT(1)
0552 #define MLX5_PF_FLAGS_SNAPSHOT BIT(2)
0553 #define MLX5_PF_FLAGS_ENABLE BIT(3)
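/*
 * Fault in and map the pages of a real (non-implicit) ODP MR covering
 * [user_va, user_va + bcnt) and push the updated MTTs to the device.
 * Returns the number of system pages mapped or a negative error code.
 */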
0554 static int pagefault_real_mr(struct mlx5_ib_mr *mr, struct ib_umem_odp *odp,
0555 u64 user_va, size_t bcnt, u32 *bytes_mapped,
0556 u32 flags)
0557 {
0558 int page_shift, ret, np;
0559 bool downgrade = flags & MLX5_PF_FLAGS_DOWNGRADE;
0560 u64 access_mask;
0561 u64 start_idx;
0562 bool fault = !(flags & MLX5_PF_FLAGS_SNAPSHOT);
0563 u32 xlt_flags = MLX5_IB_UPD_XLT_ATOMIC;
0564
0565 if (flags & MLX5_PF_FLAGS_ENABLE)
0566 xlt_flags |= MLX5_IB_UPD_XLT_ENABLE;
0567
0568 page_shift = odp->page_shift;
0569 start_idx = (user_va - ib_umem_start(odp)) >> page_shift;
0570 access_mask = ODP_READ_ALLOWED_BIT;
0571
0572 if (odp->umem.writable && !downgrade)
0573 access_mask |= ODP_WRITE_ALLOWED_BIT;
0574
0575 np = ib_umem_odp_map_dma_and_lock(odp, user_va, bcnt, access_mask, fault);
0576 if (np < 0)
0577 return np;
0578
/*
 * No need to check whether the MTTs really belong to this MR, since
 * ib_umem_odp_map_dma_and_lock already checks this.
 */
0583 ret = mlx5r_umr_update_xlt(mr, start_idx, np, page_shift, xlt_flags);
0584 mutex_unlock(&odp->umem_mutex);
0585
0586 if (ret < 0) {
0587 if (ret != -EAGAIN)
0588 mlx5_ib_err(mr_to_mdev(mr),
0589 "Failed to update mkey page tables\n");
0590 goto out;
0591 }
0592
0593 if (bytes_mapped) {
0594 u32 new_mappings = (np << page_shift) -
0595 (user_va - round_down(user_va, 1 << page_shift));
0596
0597 *bytes_mapped += min_t(u32, new_mappings, bcnt);
0598 }
0599
0600 return np << (page_shift - PAGE_SHIFT);
0601
0602 out:
0603 return ret;
0604 }
0605
0606 static int pagefault_implicit_mr(struct mlx5_ib_mr *imr,
0607 struct ib_umem_odp *odp_imr, u64 user_va,
0608 size_t bcnt, u32 *bytes_mapped, u32 flags)
0609 {
0610 unsigned long end_idx = (user_va + bcnt - 1) >> MLX5_IMR_MTT_SHIFT;
0611 unsigned long upd_start_idx = end_idx + 1;
0612 unsigned long upd_len = 0;
0613 unsigned long npages = 0;
0614 int err;
0615 int ret;
0616
0617 if (unlikely(user_va >= mlx5_imr_ksm_entries * MLX5_IMR_MTT_SIZE ||
0618 mlx5_imr_ksm_entries * MLX5_IMR_MTT_SIZE - user_va < bcnt))
0619 return -EFAULT;
0620
/* Fault each child mr that intersects with our interval. */
0622 while (bcnt) {
0623 unsigned long idx = user_va >> MLX5_IMR_MTT_SHIFT;
0624 struct ib_umem_odp *umem_odp;
0625 struct mlx5_ib_mr *mtt;
0626 u64 len;
0627
0628 xa_lock(&imr->implicit_children);
0629 mtt = xa_load(&imr->implicit_children, idx);
0630 if (unlikely(!mtt)) {
0631 xa_unlock(&imr->implicit_children);
0632 mtt = implicit_get_child_mr(imr, idx);
0633 if (IS_ERR(mtt)) {
0634 ret = PTR_ERR(mtt);
0635 goto out;
0636 }
0637 upd_start_idx = min(upd_start_idx, idx);
0638 upd_len = idx - upd_start_idx + 1;
0639 } else {
0640 refcount_inc(&mtt->mmkey.usecount);
0641 xa_unlock(&imr->implicit_children);
0642 }
0643
0644 umem_odp = to_ib_umem_odp(mtt->umem);
0645 len = min_t(u64, user_va + bcnt, ib_umem_end(umem_odp)) -
0646 user_va;
0647
0648 ret = pagefault_real_mr(mtt, umem_odp, user_va, len,
0649 bytes_mapped, flags);
0650
0651 mlx5r_deref_odp_mkey(&mtt->mmkey);
0652
0653 if (ret < 0)
0654 goto out;
0655 user_va += len;
0656 bcnt -= len;
0657 npages += ret;
0658 }
0659
0660 ret = npages;
0661
/*
 * Any time the implicit_children are changed we must perform an
 * update of the xlt before exiting to ensure the HW and the
 * implicit_children remains synchronized.
 */
0667 out:
0668 if (likely(!upd_len))
0669 return ret;
0670
/*
 * Notice this is not strictly ordered right, the KSM is updated after
 * the implicit_children is updated, so a parallel page fault could
 * see a MR that is not yet visible in the KSM. This is similar to a
 * parallel page fault seeing a MR that is being concurrently removed
 * from the KSM.
 */
0680 mutex_lock(&odp_imr->umem_mutex);
0681 err = mlx5r_umr_update_xlt(imr, upd_start_idx, upd_len, 0,
0682 MLX5_IB_UPD_XLT_INDIRECT |
0683 MLX5_IB_UPD_XLT_ATOMIC);
0684 mutex_unlock(&odp_imr->umem_mutex);
0685 if (err) {
0686 mlx5_ib_err(mr_to_mdev(imr), "Failed to update PAS\n");
0687 return err;
0688 }
0689 return ret;
0690 }
0691
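/*
 * Map a dma-buf backed MR under the dma_resv lock and update the device page
 * tables. dma-buf MRs are always mapped in full, so bcnt is only used to
 * report bytes_mapped.
 */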
0692 static int pagefault_dmabuf_mr(struct mlx5_ib_mr *mr, size_t bcnt,
0693 u32 *bytes_mapped, u32 flags)
0694 {
0695 struct ib_umem_dmabuf *umem_dmabuf = to_ib_umem_dmabuf(mr->umem);
0696 u32 xlt_flags = 0;
0697 int err;
0698 unsigned int page_size;
0699
0700 if (flags & MLX5_PF_FLAGS_ENABLE)
0701 xlt_flags |= MLX5_IB_UPD_XLT_ENABLE;
0702
0703 dma_resv_lock(umem_dmabuf->attach->dmabuf->resv, NULL);
0704 err = ib_umem_dmabuf_map_pages(umem_dmabuf);
0705 if (err) {
0706 dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv);
0707 return err;
0708 }
0709
0710 page_size = mlx5_umem_find_best_pgsz(&umem_dmabuf->umem, mkc,
0711 log_page_size, 0,
0712 umem_dmabuf->umem.iova);
0713 if (unlikely(page_size < PAGE_SIZE)) {
0714 ib_umem_dmabuf_unmap_pages(umem_dmabuf);
0715 err = -EINVAL;
0716 } else {
0717 err = mlx5r_umr_update_mr_pas(mr, xlt_flags);
0718 }
0719 dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv);
0720
0721 if (err)
0722 return err;
0723
0724 if (bytes_mapped)
0725 *bytes_mapped += bcnt;
0726
0727 return ib_umem_num_pages(mr->umem);
0728 }
0729
/*
 * Returns:
 *  -EFAULT: The io_virt->bcnt is not within the MR, it covers pages that are
 *           not accessible, or the MR is no longer valid.
 *  -EAGAIN/-ENOMEM: The operation should be retried
 *
 *  -EINVAL/others: General internal malfunction
 *  >0: Number of pages mapped
 */
0739 static int pagefault_mr(struct mlx5_ib_mr *mr, u64 io_virt, size_t bcnt,
0740 u32 *bytes_mapped, u32 flags)
0741 {
0742 struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
0743
0744 if (unlikely(io_virt < mr->ibmr.iova))
0745 return -EFAULT;
0746
0747 if (mr->umem->is_dmabuf)
0748 return pagefault_dmabuf_mr(mr, bcnt, bytes_mapped, flags);
0749
0750 if (!odp->is_implicit_odp) {
0751 u64 user_va;
0752
0753 if (check_add_overflow(io_virt - mr->ibmr.iova,
0754 (u64)odp->umem.address, &user_va))
0755 return -EFAULT;
0756 if (unlikely(user_va >= ib_umem_end(odp) ||
0757 ib_umem_end(odp) - user_va < bcnt))
0758 return -EFAULT;
0759 return pagefault_real_mr(mr, odp, user_va, bcnt, bytes_mapped,
0760 flags);
0761 }
0762 return pagefault_implicit_mr(mr, odp, io_virt, bcnt, bytes_mapped,
0763 flags);
0764 }
0765
0766 int mlx5_ib_init_odp_mr(struct mlx5_ib_mr *mr)
0767 {
0768 int ret;
0769
0770 ret = pagefault_real_mr(mr, to_ib_umem_odp(mr->umem), mr->umem->address,
0771 mr->umem->length, NULL,
0772 MLX5_PF_FLAGS_SNAPSHOT | MLX5_PF_FLAGS_ENABLE);
0773 return ret >= 0 ? 0 : ret;
0774 }
0775
0776 int mlx5_ib_init_dmabuf_mr(struct mlx5_ib_mr *mr)
0777 {
0778 int ret;
0779
0780 ret = pagefault_dmabuf_mr(mr, mr->umem->length, NULL,
0781 MLX5_PF_FLAGS_ENABLE);
0782
0783 return ret >= 0 ? 0 : ret;
0784 }
0785
0786 struct pf_frame {
0787 struct pf_frame *next;
0788 u32 key;
0789 u64 io_virt;
0790 size_t bcnt;
0791 int depth;
0792 };
0793
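/*
 * A memory window is matched on its base mkey only; all other mkey types
 * must match the full key, including the variant bits.
 */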
0794 static bool mkey_is_eq(struct mlx5_ib_mkey *mmkey, u32 key)
0795 {
0796 if (!mmkey)
0797 return false;
0798 if (mmkey->type == MLX5_MKEY_MW)
0799 return mlx5_base_mkey(mmkey->key) == mlx5_base_mkey(key);
0800 return mmkey->key == key;
0801 }
0802
/*
 * Handle a single data segment in a page fault WQE or RDMA region.
 *
 * Returns number of OS pages retrieved on success. The caller may continue to
 * the next data segment.
 * Can return the following error codes:
 * -EAGAIN to designate a temporary error. The caller will abort handling the
 *  page fault and resolve it.
 * -EFAULT when there's an error mapping the requested pages. The caller will
 *  abort the page fault handling.
 */
0814 static int pagefault_single_data_segment(struct mlx5_ib_dev *dev,
0815 struct ib_pd *pd, u32 key,
0816 u64 io_virt, size_t bcnt,
0817 u32 *bytes_committed,
0818 u32 *bytes_mapped)
0819 {
0820 int npages = 0, ret, i, outlen, cur_outlen = 0, depth = 0;
0821 struct pf_frame *head = NULL, *frame;
0822 struct mlx5_ib_mkey *mmkey;
0823 struct mlx5_ib_mr *mr;
0824 struct mlx5_klm *pklm;
0825 u32 *out = NULL;
0826 size_t offset;
0827
0828 io_virt += *bytes_committed;
0829 bcnt -= *bytes_committed;
0830
0831 next_mr:
0832 xa_lock(&dev->odp_mkeys);
0833 mmkey = xa_load(&dev->odp_mkeys, mlx5_base_mkey(key));
0834 if (!mmkey) {
0835 xa_unlock(&dev->odp_mkeys);
0836 mlx5_ib_dbg(
0837 dev,
0838 "skipping non ODP MR (lkey=0x%06x) in page fault handler.\n",
0839 key);
0840 if (bytes_mapped)
0841 *bytes_mapped += bcnt;
/*
 * The user could specify a SGL with multiple lkeys and only
 * some of them are ODP. Treat the non-ODP ones as fully
 * faulted.
 */
0847 ret = 0;
0848 goto end;
0849 }
0850 refcount_inc(&mmkey->usecount);
0851 xa_unlock(&dev->odp_mkeys);
0852
0853 if (!mkey_is_eq(mmkey, key)) {
0854 mlx5_ib_dbg(dev, "failed to find mkey %x\n", key);
0855 ret = -EFAULT;
0856 goto end;
0857 }
0858
0859 switch (mmkey->type) {
0860 case MLX5_MKEY_MR:
0861 mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
0862
0863 ret = pagefault_mr(mr, io_virt, bcnt, bytes_mapped, 0);
0864 if (ret < 0)
0865 goto end;
0866
0867 mlx5_update_odp_stats(mr, faults, ret);
0868
0869 npages += ret;
0870 ret = 0;
0871 break;
0872
0873 case MLX5_MKEY_MW:
0874 case MLX5_MKEY_INDIRECT_DEVX:
0875 if (depth >= MLX5_CAP_GEN(dev->mdev, max_indirection)) {
0876 mlx5_ib_dbg(dev, "indirection level exceeded\n");
0877 ret = -EFAULT;
0878 goto end;
0879 }
0880
0881 outlen = MLX5_ST_SZ_BYTES(query_mkey_out) +
0882 sizeof(*pklm) * (mmkey->ndescs - 2);
0883
0884 if (outlen > cur_outlen) {
0885 kfree(out);
0886 out = kzalloc(outlen, GFP_KERNEL);
0887 if (!out) {
0888 ret = -ENOMEM;
0889 goto end;
0890 }
0891 cur_outlen = outlen;
0892 }
0893
0894 pklm = (struct mlx5_klm *)MLX5_ADDR_OF(query_mkey_out, out,
0895 bsf0_klm0_pas_mtt0_1);
0896
0897 ret = mlx5_core_query_mkey(dev->mdev, mmkey->key, out, outlen);
0898 if (ret)
0899 goto end;
0900
0901 offset = io_virt - MLX5_GET64(query_mkey_out, out,
0902 memory_key_mkey_entry.start_addr);
0903
0904 for (i = 0; bcnt && i < mmkey->ndescs; i++, pklm++) {
0905 if (offset >= be32_to_cpu(pklm->bcount)) {
0906 offset -= be32_to_cpu(pklm->bcount);
0907 continue;
0908 }
0909
0910 frame = kzalloc(sizeof(*frame), GFP_KERNEL);
0911 if (!frame) {
0912 ret = -ENOMEM;
0913 goto end;
0914 }
0915
0916 frame->key = be32_to_cpu(pklm->key);
0917 frame->io_virt = be64_to_cpu(pklm->va) + offset;
0918 frame->bcnt = min_t(size_t, bcnt,
0919 be32_to_cpu(pklm->bcount) - offset);
0920 frame->depth = depth + 1;
0921 frame->next = head;
0922 head = frame;
0923
0924 bcnt -= frame->bcnt;
0925 offset = 0;
0926 }
0927 break;
0928
0929 default:
0930 mlx5_ib_dbg(dev, "wrong mkey type %d\n", mmkey->type);
0931 ret = -EFAULT;
0932 goto end;
0933 }
0934
0935 if (head) {
0936 frame = head;
0937 head = frame->next;
0938
0939 key = frame->key;
0940 io_virt = frame->io_virt;
0941 bcnt = frame->bcnt;
0942 depth = frame->depth;
0943 kfree(frame);
0944
0945 mlx5r_deref_odp_mkey(mmkey);
0946 goto next_mr;
0947 }
0948
0949 end:
0950 if (mmkey)
0951 mlx5r_deref_odp_mkey(mmkey);
0952 while (head) {
0953 frame = head;
0954 head = frame->next;
0955 kfree(frame);
0956 }
0957 kfree(out);
0958
0959 *bytes_committed = 0;
0960 return ret ? ret : npages;
0961 }
0962
/*
 * pagefault_data_segments() - Fault and map the data segments of a WQE.
 * @dev: Pointer to the mlx5 IB device.
 * @pfault: Contains the page fault information.
 * @wqe: Points at the first data segment in the WQE.
 * @wqe_end: Points after the end of the WQE.
 * @bytes_mapped: Receives the number of bytes that the function was able to
 *                map. This allows the caller to decide intelligently whether
 *                enough memory was mapped to resolve the page fault
 *                successfully (e.g. enough for the next MTU, or the entire
 *                WQE).
 * @total_wqe_bytes: Receives the total data size of this WQE in bytes (minus
 *                   the committed bytes).
 * @receive_queue: True if this WQE belongs to a receive queue.
 *
 * Returns the number of pages loaded if positive, zero for an empty WQE, or a
 * negative error code.
 */
0982 static int pagefault_data_segments(struct mlx5_ib_dev *dev,
0983 struct mlx5_pagefault *pfault,
0984 void *wqe,
0985 void *wqe_end, u32 *bytes_mapped,
0986 u32 *total_wqe_bytes, bool receive_queue)
0987 {
0988 int ret = 0, npages = 0;
0989 u64 io_virt;
0990 u32 key;
0991 u32 byte_count;
0992 size_t bcnt;
0993 int inline_segment;
0994
0995 if (bytes_mapped)
0996 *bytes_mapped = 0;
0997 if (total_wqe_bytes)
0998 *total_wqe_bytes = 0;
0999
1000 while (wqe < wqe_end) {
1001 struct mlx5_wqe_data_seg *dseg = wqe;
1002
1003 io_virt = be64_to_cpu(dseg->addr);
1004 key = be32_to_cpu(dseg->lkey);
1005 byte_count = be32_to_cpu(dseg->byte_count);
1006 inline_segment = !!(byte_count & MLX5_INLINE_SEG);
1007 bcnt = byte_count & ~MLX5_INLINE_SEG;
1008
1009 if (inline_segment) {
1010 bcnt = bcnt & MLX5_WQE_INLINE_SEG_BYTE_COUNT_MASK;
1011 wqe += ALIGN(sizeof(struct mlx5_wqe_inline_seg) + bcnt,
1012 16);
1013 } else {
1014 wqe += sizeof(*dseg);
1015 }
1016
/* receive WQE end of sg list. */
1018 if (receive_queue && bcnt == 0 && key == MLX5_INVALID_LKEY &&
1019 io_virt == 0)
1020 break;
1021
1022 if (!inline_segment && total_wqe_bytes) {
1023 *total_wqe_bytes += bcnt - min_t(size_t, bcnt,
1024 pfault->bytes_committed);
1025 }
1026
/* A zero length data segment designates a length of 2GB. */
1028 if (bcnt == 0)
1029 bcnt = 1U << 31;
1030
1031 if (inline_segment || bcnt <= pfault->bytes_committed) {
1032 pfault->bytes_committed -=
1033 min_t(size_t, bcnt,
1034 pfault->bytes_committed);
1035 continue;
1036 }
1037
1038 ret = pagefault_single_data_segment(dev, NULL, key,
1039 io_virt, bcnt,
1040 &pfault->bytes_committed,
1041 bytes_mapped);
1042 if (ret < 0)
1043 break;
1044 npages += ret;
1045 }
1046
1047 return ret < 0 ? ret : npages;
1048 }
1049
/*
 * Parse initiator WQE. Advances the wqe pointer to point at the
 * scatter-gather list, and set wqe_end to the end of the WQE.
 */
1054 static int mlx5_ib_mr_initiator_pfault_handler(
1055 struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfault,
1056 struct mlx5_ib_qp *qp, void **wqe, void **wqe_end, int wqe_length)
1057 {
1058 struct mlx5_wqe_ctrl_seg *ctrl = *wqe;
1059 u16 wqe_index = pfault->wqe.wqe_index;
1060 struct mlx5_base_av *av;
1061 unsigned ds, opcode;
1062 u32 qpn = qp->trans_qp.base.mqp.qpn;
1063
1064 ds = be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_DS_MASK;
1065 if (ds * MLX5_WQE_DS_UNITS > wqe_length) {
1066 mlx5_ib_err(dev, "Unable to read the complete WQE. ds = 0x%x, ret = 0x%x\n",
1067 ds, wqe_length);
1068 return -EFAULT;
1069 }
1070
1071 if (ds == 0) {
1072 mlx5_ib_err(dev, "Got WQE with zero DS. wqe_index=%x, qpn=%x\n",
1073 wqe_index, qpn);
1074 return -EFAULT;
1075 }
1076
1077 *wqe_end = *wqe + ds * MLX5_WQE_DS_UNITS;
1078 *wqe += sizeof(*ctrl);
1079
1080 opcode = be32_to_cpu(ctrl->opmod_idx_opcode) &
1081 MLX5_WQE_CTRL_OPCODE_MASK;
1082
1083 if (qp->type == IB_QPT_XRC_INI)
1084 *wqe += sizeof(struct mlx5_wqe_xrc_seg);
1085
1086 if (qp->type == IB_QPT_UD || qp->type == MLX5_IB_QPT_DCI) {
1087 av = *wqe;
1088 if (av->dqp_dct & cpu_to_be32(MLX5_EXTENDED_UD_AV))
1089 *wqe += sizeof(struct mlx5_av);
1090 else
1091 *wqe += sizeof(struct mlx5_base_av);
1092 }
1093
1094 switch (opcode) {
1095 case MLX5_OPCODE_RDMA_WRITE:
1096 case MLX5_OPCODE_RDMA_WRITE_IMM:
1097 case MLX5_OPCODE_RDMA_READ:
1098 *wqe += sizeof(struct mlx5_wqe_raddr_seg);
1099 break;
1100 case MLX5_OPCODE_ATOMIC_CS:
1101 case MLX5_OPCODE_ATOMIC_FA:
1102 *wqe += sizeof(struct mlx5_wqe_raddr_seg);
1103 *wqe += sizeof(struct mlx5_wqe_atomic_seg);
1104 break;
1105 }
1106
1107 return 0;
1108 }
1109
/*
 * Parse responder WQE and set wqe_end to the end of the WQE.
 */
1113 static int mlx5_ib_mr_responder_pfault_handler_srq(struct mlx5_ib_dev *dev,
1114 struct mlx5_ib_srq *srq,
1115 void **wqe, void **wqe_end,
1116 int wqe_length)
1117 {
1118 int wqe_size = 1 << srq->msrq.wqe_shift;
1119
1120 if (wqe_size > wqe_length) {
1121 mlx5_ib_err(dev, "Couldn't read all of the receive WQE's content\n");
1122 return -EFAULT;
1123 }
1124
1125 *wqe_end = *wqe + wqe_size;
1126 *wqe += sizeof(struct mlx5_wqe_srq_next_seg);
1127
1128 return 0;
1129 }
1130
1131 static int mlx5_ib_mr_responder_pfault_handler_rq(struct mlx5_ib_dev *dev,
1132 struct mlx5_ib_qp *qp,
1133 void *wqe, void **wqe_end,
1134 int wqe_length)
1135 {
1136 struct mlx5_ib_wq *wq = &qp->rq;
1137 int wqe_size = 1 << wq->wqe_shift;
1138
1139 if (qp->flags_en & MLX5_QP_FLAG_SIGNATURE) {
1140 mlx5_ib_err(dev, "ODP fault with WQE signatures is not supported\n");
1141 return -EFAULT;
1142 }
1143
1144 if (wqe_size > wqe_length) {
1145 mlx5_ib_err(dev, "Couldn't read all of the receive WQE's content\n");
1146 return -EFAULT;
1147 }
1148
1149 *wqe_end = wqe + wqe_size;
1150
1151 return 0;
1152 }
1153
1154 static inline struct mlx5_core_rsc_common *odp_get_rsc(struct mlx5_ib_dev *dev,
1155 u32 wq_num, int pf_type)
1156 {
1157 struct mlx5_core_rsc_common *common = NULL;
1158 struct mlx5_core_srq *srq;
1159
1160 switch (pf_type) {
1161 case MLX5_WQE_PF_TYPE_RMP:
1162 srq = mlx5_cmd_get_srq(dev, wq_num);
1163 if (srq)
1164 common = &srq->common;
1165 break;
1166 case MLX5_WQE_PF_TYPE_REQ_SEND_OR_WRITE:
1167 case MLX5_WQE_PF_TYPE_RESP:
1168 case MLX5_WQE_PF_TYPE_REQ_READ_OR_ATOMIC:
1169 common = mlx5_core_res_hold(dev, wq_num, MLX5_RES_QP);
1170 break;
1171 default:
1172 break;
1173 }
1174
1175 return common;
1176 }
1177
1178 static inline struct mlx5_ib_qp *res_to_qp(struct mlx5_core_rsc_common *res)
1179 {
1180 struct mlx5_core_qp *mqp = (struct mlx5_core_qp *)res;
1181
1182 return to_mibqp(mqp);
1183 }
1184
1185 static inline struct mlx5_ib_srq *res_to_srq(struct mlx5_core_rsc_common *res)
1186 {
1187 struct mlx5_core_srq *msrq =
1188 container_of(res, struct mlx5_core_srq, common);
1189
1190 return to_mibsrq(msrq);
1191 }
1192
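/*
 * Handle a WQE-triggered page fault: read the faulting WQE out of the QP or
 * SRQ buffer, walk its data segments, fault in the referenced MRs, and then
 * resume the hardware, flagging an error if resolution failed.
 */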
1193 static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_dev *dev,
1194 struct mlx5_pagefault *pfault)
1195 {
1196 bool sq = pfault->type & MLX5_PFAULT_REQUESTOR;
1197 u16 wqe_index = pfault->wqe.wqe_index;
1198 void *wqe, *wqe_start = NULL, *wqe_end = NULL;
1199 u32 bytes_mapped, total_wqe_bytes;
1200 struct mlx5_core_rsc_common *res;
1201 int resume_with_error = 1;
1202 struct mlx5_ib_qp *qp;
1203 size_t bytes_copied;
1204 int ret = 0;
1205
1206 res = odp_get_rsc(dev, pfault->wqe.wq_num, pfault->type);
1207 if (!res) {
1208 mlx5_ib_dbg(dev, "wqe page fault for missing resource %d\n", pfault->wqe.wq_num);
1209 return;
1210 }
1211
1212 if (res->res != MLX5_RES_QP && res->res != MLX5_RES_SRQ &&
1213 res->res != MLX5_RES_XSRQ) {
1214 mlx5_ib_err(dev, "wqe page fault for unsupported type %d\n",
1215 pfault->type);
1216 goto resolve_page_fault;
1217 }
1218
1219 wqe_start = (void *)__get_free_page(GFP_KERNEL);
1220 if (!wqe_start) {
1221 mlx5_ib_err(dev, "Error allocating memory for IO page fault handling.\n");
1222 goto resolve_page_fault;
1223 }
1224
1225 wqe = wqe_start;
1226 qp = (res->res == MLX5_RES_QP) ? res_to_qp(res) : NULL;
1227 if (qp && sq) {
1228 ret = mlx5_ib_read_wqe_sq(qp, wqe_index, wqe, PAGE_SIZE,
1229 &bytes_copied);
1230 if (ret)
1231 goto read_user;
1232 ret = mlx5_ib_mr_initiator_pfault_handler(
1233 dev, pfault, qp, &wqe, &wqe_end, bytes_copied);
1234 } else if (qp && !sq) {
1235 ret = mlx5_ib_read_wqe_rq(qp, wqe_index, wqe, PAGE_SIZE,
1236 &bytes_copied);
1237 if (ret)
1238 goto read_user;
1239 ret = mlx5_ib_mr_responder_pfault_handler_rq(
1240 dev, qp, wqe, &wqe_end, bytes_copied);
1241 } else if (!qp) {
1242 struct mlx5_ib_srq *srq = res_to_srq(res);
1243
1244 ret = mlx5_ib_read_wqe_srq(srq, wqe_index, wqe, PAGE_SIZE,
1245 &bytes_copied);
1246 if (ret)
1247 goto read_user;
1248 ret = mlx5_ib_mr_responder_pfault_handler_srq(
1249 dev, srq, &wqe, &wqe_end, bytes_copied);
1250 }
1251
1252 if (ret < 0 || wqe >= wqe_end)
1253 goto resolve_page_fault;
1254
1255 ret = pagefault_data_segments(dev, pfault, wqe, wqe_end, &bytes_mapped,
1256 &total_wqe_bytes, !sq);
1257 if (ret == -EAGAIN)
1258 goto out;
1259
1260 if (ret < 0 || total_wqe_bytes > bytes_mapped)
1261 goto resolve_page_fault;
1262
1263 out:
1264 ret = 0;
1265 resume_with_error = 0;
1266
1267 read_user:
1268 if (ret)
1269 mlx5_ib_err(
1270 dev,
1271 "Failed reading a WQE following page fault, error %d, wqe_index %x, qpn %x\n",
1272 ret, wqe_index, pfault->token);
1273
1274 resolve_page_fault:
1275 mlx5_ib_page_fault_resume(dev, pfault, resume_with_error);
1276 mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x resume_with_error=%d, type: 0x%x\n",
1277 pfault->wqe.wq_num, resume_with_error,
1278 pfault->type);
1279 mlx5_core_res_put(res);
1280 free_page((unsigned long)wqe_start);
1281 }
1282
1283 static int pages_in_range(u64 address, u32 length)
1284 {
1285 return (ALIGN(address + length, PAGE_SIZE) -
1286 (address & PAGE_MASK)) >> PAGE_SHIFT;
1287 }
1288
1289 static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev,
1290 struct mlx5_pagefault *pfault)
1291 {
1292 u64 address;
1293 u32 length;
1294 u32 prefetch_len = pfault->bytes_committed;
1295 int prefetch_activated = 0;
1296 u32 rkey = pfault->rdma.r_key;
1297 int ret;
1298
/* The RDMA responder handler handles the page fault in two parts.
 * First it brings the necessary pages for the current packet
 * (and uses the pfault context), and then (after resuming the QP)
 * prefetches more pages. The second operation cannot use the pfault
 * context and therefore runs with its own zeroed bytes_committed.
 */
1305 pfault->rdma.rdma_va += pfault->bytes_committed;
1306 pfault->rdma.rdma_op_len -= min(pfault->bytes_committed,
1307 pfault->rdma.rdma_op_len);
1308 pfault->bytes_committed = 0;
1309
1310 address = pfault->rdma.rdma_va;
1311 length = pfault->rdma.rdma_op_len;
1312
/* For some operations, the hardware cannot tell the exact message
 * length, and in those cases it reports zero. Use prefetch
 * logic. */
1316 if (length == 0) {
1317 prefetch_activated = 1;
1318 length = pfault->rdma.packet_size;
1319 prefetch_len = min(MAX_PREFETCH_LEN, prefetch_len);
1320 }
1321
1322 ret = pagefault_single_data_segment(dev, NULL, rkey, address, length,
1323 &pfault->bytes_committed, NULL);
1324 if (ret == -EAGAIN) {
/* We're racing with an invalidation, don't prefetch */
1326 prefetch_activated = 0;
1327 } else if (ret < 0 || pages_in_range(address, length) > ret) {
1328 mlx5_ib_page_fault_resume(dev, pfault, 1);
1329 if (ret != -ENOENT)
1330 mlx5_ib_dbg(dev, "PAGE FAULT error %d. QP 0x%x, type: 0x%x\n",
1331 ret, pfault->token, pfault->type);
1332 return;
1333 }
1334
1335 mlx5_ib_page_fault_resume(dev, pfault, 0);
1336 mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x, type: 0x%x, prefetch_activated: %d\n",
1337 pfault->token, pfault->type,
1338 prefetch_activated);
1339
/* A new page fault may already be arriving on the EQ at this point, so the
 * prefetch below must not rely on the pfault context any more; it uses its
 * own bytes_committed instead.
 */
1345 if (prefetch_activated) {
1346 u32 bytes_committed = 0;
1347
1348 ret = pagefault_single_data_segment(dev, NULL, rkey, address,
1349 prefetch_len,
1350 &bytes_committed, NULL);
1351 if (ret < 0 && ret != -EAGAIN) {
1352 mlx5_ib_dbg(dev, "Prefetch failed. ret: %d, QP 0x%x, address: 0x%.16llx, length = 0x%.16x\n",
1353 ret, pfault->token, address, prefetch_len);
1354 }
1355 }
1356 }
1357
1358 static void mlx5_ib_pfault(struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfault)
1359 {
1360 u8 event_subtype = pfault->event_subtype;
1361
1362 switch (event_subtype) {
1363 case MLX5_PFAULT_SUBTYPE_WQE:
1364 mlx5_ib_mr_wqe_pfault_handler(dev, pfault);
1365 break;
1366 case MLX5_PFAULT_SUBTYPE_RDMA:
1367 mlx5_ib_mr_rdma_pfault_handler(dev, pfault);
1368 break;
1369 default:
1370 mlx5_ib_err(dev, "Invalid page fault event subtype: 0x%x\n",
1371 event_subtype);
1372 mlx5_ib_page_fault_resume(dev, pfault, 1);
1373 }
1374 }
1375
1376 static void mlx5_ib_eqe_pf_action(struct work_struct *work)
1377 {
1378 struct mlx5_pagefault *pfault = container_of(work,
1379 struct mlx5_pagefault,
1380 work);
1381 struct mlx5_ib_pf_eq *eq = pfault->eq;
1382
1383 mlx5_ib_pfault(eq->dev, pfault);
1384 mempool_free(pfault, eq->pool);
1385 }
1386
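/*
 * Drain the page-fault EQ: decode each EQE into a struct mlx5_pagefault and
 * hand it to the page-fault workqueue for processing.
 */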
1387 static void mlx5_ib_eq_pf_process(struct mlx5_ib_pf_eq *eq)
1388 {
1389 struct mlx5_eqe_page_fault *pf_eqe;
1390 struct mlx5_pagefault *pfault;
1391 struct mlx5_eqe *eqe;
1392 int cc = 0;
1393
1394 while ((eqe = mlx5_eq_get_eqe(eq->core, cc))) {
1395 pfault = mempool_alloc(eq->pool, GFP_ATOMIC);
1396 if (!pfault) {
1397 schedule_work(&eq->work);
1398 break;
1399 }
1400
1401 pf_eqe = &eqe->data.page_fault;
1402 pfault->event_subtype = eqe->sub_type;
1403 pfault->bytes_committed = be32_to_cpu(pf_eqe->bytes_committed);
1404
1405 mlx5_ib_dbg(eq->dev,
1406 "PAGE_FAULT: subtype: 0x%02x, bytes_committed: 0x%06x\n",
1407 eqe->sub_type, pfault->bytes_committed);
1408
1409 switch (eqe->sub_type) {
1410 case MLX5_PFAULT_SUBTYPE_RDMA:
/* RDMA based event */
1412 pfault->type =
1413 be32_to_cpu(pf_eqe->rdma.pftype_token) >> 24;
1414 pfault->token =
1415 be32_to_cpu(pf_eqe->rdma.pftype_token) &
1416 MLX5_24BIT_MASK;
1417 pfault->rdma.r_key =
1418 be32_to_cpu(pf_eqe->rdma.r_key);
1419 pfault->rdma.packet_size =
1420 be16_to_cpu(pf_eqe->rdma.packet_length);
1421 pfault->rdma.rdma_op_len =
1422 be32_to_cpu(pf_eqe->rdma.rdma_op_len);
1423 pfault->rdma.rdma_va =
1424 be64_to_cpu(pf_eqe->rdma.rdma_va);
1425 mlx5_ib_dbg(eq->dev,
1426 "PAGE_FAULT: type:0x%x, token: 0x%06x, r_key: 0x%08x\n",
1427 pfault->type, pfault->token,
1428 pfault->rdma.r_key);
1429 mlx5_ib_dbg(eq->dev,
1430 "PAGE_FAULT: rdma_op_len: 0x%08x, rdma_va: 0x%016llx\n",
1431 pfault->rdma.rdma_op_len,
1432 pfault->rdma.rdma_va);
1433 break;
1434
1435 case MLX5_PFAULT_SUBTYPE_WQE:
/* WQE based event */
1437 pfault->type =
1438 (be32_to_cpu(pf_eqe->wqe.pftype_wq) >> 24) & 0x7;
1439 pfault->token =
1440 be32_to_cpu(pf_eqe->wqe.token);
1441 pfault->wqe.wq_num =
1442 be32_to_cpu(pf_eqe->wqe.pftype_wq) &
1443 MLX5_24BIT_MASK;
1444 pfault->wqe.wqe_index =
1445 be16_to_cpu(pf_eqe->wqe.wqe_index);
1446 pfault->wqe.packet_size =
1447 be16_to_cpu(pf_eqe->wqe.packet_length);
1448 mlx5_ib_dbg(eq->dev,
1449 "PAGE_FAULT: type:0x%x, token: 0x%06x, wq_num: 0x%06x, wqe_index: 0x%04x\n",
1450 pfault->type, pfault->token,
1451 pfault->wqe.wq_num,
1452 pfault->wqe.wqe_index);
1453 break;
1454
1455 default:
1456 mlx5_ib_warn(eq->dev,
1457 "Unsupported page fault event sub-type: 0x%02hhx\n",
1458 eqe->sub_type);
/* Unsupported page faults should still be
 * resolved by the page fault handler
 */
1462 }
1463
1464 pfault->eq = eq;
1465 INIT_WORK(&pfault->work, mlx5_ib_eqe_pf_action);
1466 queue_work(eq->wq, &pfault->work);
1467
1468 cc = mlx5_eq_update_cc(eq->core, ++cc);
1469 }
1470
1471 mlx5_eq_update_ci(eq->core, cc, 1);
1472 }
1473
1474 static int mlx5_ib_eq_pf_int(struct notifier_block *nb, unsigned long type,
1475 void *data)
1476 {
1477 struct mlx5_ib_pf_eq *eq =
1478 container_of(nb, struct mlx5_ib_pf_eq, irq_nb);
1479 unsigned long flags;
1480
1481 if (spin_trylock_irqsave(&eq->lock, flags)) {
1482 mlx5_ib_eq_pf_process(eq);
1483 spin_unlock_irqrestore(&eq->lock, flags);
1484 } else {
1485 schedule_work(&eq->work);
1486 }
1487
1488 return IRQ_HANDLED;
1489 }
1490
/*
 * mempool_refill() was proposed but not accepted as a kernel API, so top the
 * pool back up by hand.
 */
1495 static void mempool_refill(mempool_t *pool)
1496 {
1497 while (pool->curr_nr < pool->min_nr)
1498 mempool_free(mempool_alloc(pool, GFP_KERNEL), pool);
1499 }
1500
1501 static void mlx5_ib_eq_pf_action(struct work_struct *work)
1502 {
1503 struct mlx5_ib_pf_eq *eq =
1504 container_of(work, struct mlx5_ib_pf_eq, work);
1505
1506 mempool_refill(eq->pool);
1507
1508 spin_lock_irq(&eq->lock);
1509 mlx5_ib_eq_pf_process(eq);
1510 spin_unlock_irq(&eq->lock);
1511 }
1512
1513 enum {
1514 MLX5_IB_NUM_PF_EQE = 0x1000,
1515 MLX5_IB_NUM_PF_DRAIN = 64,
1516 };
1517
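/*
 * Lazily create the ODP page-fault EQ along with its workqueue and mempool.
 * Safe to call more than once; protected by dev->odp_eq_mutex.
 */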
1518 int mlx5r_odp_create_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq)
1519 {
1520 struct mlx5_eq_param param = {};
1521 int err = 0;
1522
1523 mutex_lock(&dev->odp_eq_mutex);
1524 if (eq->core)
1525 goto unlock;
1526 INIT_WORK(&eq->work, mlx5_ib_eq_pf_action);
1527 spin_lock_init(&eq->lock);
1528 eq->dev = dev;
1529
1530 eq->pool = mempool_create_kmalloc_pool(MLX5_IB_NUM_PF_DRAIN,
1531 sizeof(struct mlx5_pagefault));
1532 if (!eq->pool) {
1533 err = -ENOMEM;
1534 goto unlock;
1535 }
1536
1537 eq->wq = alloc_workqueue("mlx5_ib_page_fault",
1538 WQ_HIGHPRI | WQ_UNBOUND | WQ_MEM_RECLAIM,
1539 MLX5_NUM_CMD_EQE);
1540 if (!eq->wq) {
1541 err = -ENOMEM;
1542 goto err_mempool;
1543 }
1544
1545 eq->irq_nb.notifier_call = mlx5_ib_eq_pf_int;
1546 param = (struct mlx5_eq_param) {
1547 .nent = MLX5_IB_NUM_PF_EQE,
1548 };
1549 param.mask[0] = 1ull << MLX5_EVENT_TYPE_PAGE_FAULT;
eq->core = mlx5_eq_create_generic(dev->mdev, &param);
1551 if (IS_ERR(eq->core)) {
1552 err = PTR_ERR(eq->core);
1553 goto err_wq;
1554 }
1555 err = mlx5_eq_enable(dev->mdev, eq->core, &eq->irq_nb);
1556 if (err) {
1557 mlx5_ib_err(dev, "failed to enable odp EQ %d\n", err);
1558 goto err_eq;
1559 }
1560
1561 mutex_unlock(&dev->odp_eq_mutex);
1562 return 0;
1563 err_eq:
1564 mlx5_eq_destroy_generic(dev->mdev, eq->core);
1565 err_wq:
1566 eq->core = NULL;
1567 destroy_workqueue(eq->wq);
1568 err_mempool:
1569 mempool_destroy(eq->pool);
1570 unlock:
1571 mutex_unlock(&dev->odp_eq_mutex);
1572 return err;
1573 }
1574
1575 static int
1576 mlx5_ib_odp_destroy_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq)
1577 {
1578 int err;
1579
1580 if (!eq->core)
1581 return 0;
1582 mlx5_eq_disable(dev->mdev, eq->core, &eq->irq_nb);
1583 err = mlx5_eq_destroy_generic(dev->mdev, eq->core);
1584 cancel_work_sync(&eq->work);
1585 destroy_workqueue(eq->wq);
1586 mempool_destroy(eq->pool);
1587
1588 return err;
1589 }
1590
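/*
 * Configure the mkey cache entries used for the implicit ODP parent (KSM)
 * and child (MTT) MRs when implicit ODP is supported.
 */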
1591 void mlx5_odp_init_mkey_cache_entry(struct mlx5_cache_ent *ent)
1592 {
1593 if (!(ent->dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
1594 return;
1595
1596 switch (ent->order - 2) {
1597 case MLX5_IMR_MTT_CACHE_ENTRY:
1598 ent->page = PAGE_SHIFT;
1599 ent->ndescs = MLX5_IMR_MTT_ENTRIES;
1600 ent->access_mode = MLX5_MKC_ACCESS_MODE_MTT;
1601 ent->limit = 0;
1602 break;
1603
1604 case MLX5_IMR_KSM_CACHE_ENTRY:
1605 ent->page = MLX5_KSM_PAGE_SHIFT;
1606 ent->ndescs = mlx5_imr_ksm_entries;
1607 ent->access_mode = MLX5_MKC_ACCESS_MODE_KSM;
1608 ent->limit = 0;
1609 break;
1610 }
1611 }
1612
1613 static const struct ib_device_ops mlx5_ib_dev_odp_ops = {
1614 .advise_mr = mlx5_ib_advise_mr,
1615 };
1616
1617 int mlx5_ib_odp_init_one(struct mlx5_ib_dev *dev)
1618 {
1619 int ret = 0;
1620
1621 internal_fill_odp_caps(dev);
1622
1623 if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT))
1624 return ret;
1625
1626 ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_odp_ops);
1627
1628 if (dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT) {
1629 ret = mlx5_cmd_null_mkey(dev->mdev, &dev->null_mkey);
1630 if (ret) {
1631 mlx5_ib_err(dev, "Error getting null_mkey %d\n", ret);
1632 return ret;
1633 }
1634 }
1635
1636 mutex_init(&dev->odp_eq_mutex);
1637 return ret;
1638 }
1639
1640 void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *dev)
1641 {
1642 if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT))
1643 return;
1644
1645 mlx5_ib_odp_destroy_eq(dev, &dev->odp_pf_eq);
1646 }
1647
1648 int mlx5_ib_odp_init(void)
1649 {
1650 mlx5_imr_ksm_entries = BIT_ULL(get_order(TASK_SIZE) -
1651 MLX5_IMR_MTT_BITS);
1652
1653 return 0;
1654 }
1655
1656 struct prefetch_mr_work {
1657 struct work_struct work;
1658 u32 pf_flags;
1659 u32 num_sge;
1660 struct {
1661 u64 io_virt;
1662 struct mlx5_ib_mr *mr;
1663 size_t length;
1664 } frags[];
1665 };
1666
1667 static void destroy_prefetch_work(struct prefetch_mr_work *work)
1668 {
1669 u32 i;
1670
1671 for (i = 0; i < work->num_sge; ++i)
1672 mlx5r_deref_odp_mkey(&work->frags[i].mr->mmkey);
1673
1674 kvfree(work);
1675 }
1676
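/*
 * Look up the MR for a prefetch request and validate that it belongs to the
 * given PD and allows the requested access. On success the mkey usecount is
 * elevated and must be dropped by the caller.
 */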
1677 static struct mlx5_ib_mr *
1678 get_prefetchable_mr(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice,
1679 u32 lkey)
1680 {
1681 struct mlx5_ib_dev *dev = to_mdev(pd->device);
1682 struct mlx5_ib_mr *mr = NULL;
1683 struct mlx5_ib_mkey *mmkey;
1684
1685 xa_lock(&dev->odp_mkeys);
1686 mmkey = xa_load(&dev->odp_mkeys, mlx5_base_mkey(lkey));
1687 if (!mmkey || mmkey->key != lkey) {
1688 mr = ERR_PTR(-ENOENT);
1689 goto end;
1690 }
1691 if (mmkey->type != MLX5_MKEY_MR) {
1692 mr = ERR_PTR(-EINVAL);
1693 goto end;
1694 }
1695
1696 mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
1697
1698 if (mr->ibmr.pd != pd) {
1699 mr = ERR_PTR(-EPERM);
1700 goto end;
1701 }
1702
/* prefetch with write-access must be supported by the MR */
1704 if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE &&
1705 !mr->umem->writable) {
1706 mr = ERR_PTR(-EPERM);
1707 goto end;
1708 }
1709
1710 refcount_inc(&mmkey->usecount);
1711 end:
1712 xa_unlock(&dev->odp_mkeys);
1713 return mr;
1714 }
1715
1716 static void mlx5_ib_prefetch_mr_work(struct work_struct *w)
1717 {
1718 struct prefetch_mr_work *work =
1719 container_of(w, struct prefetch_mr_work, work);
1720 u32 bytes_mapped = 0;
1721 int ret;
1722 u32 i;
1723
/* We rely on IB/core that work is executed if we have num_sge != 0 only. */
1725 WARN_ON(!work->num_sge);
1726 for (i = 0; i < work->num_sge; ++i) {
1727 ret = pagefault_mr(work->frags[i].mr, work->frags[i].io_virt,
1728 work->frags[i].length, &bytes_mapped,
1729 work->pf_flags);
1730 if (ret <= 0)
1731 continue;
1732 mlx5_update_odp_stats(work->frags[i].mr, prefetch, ret);
1733 }
1734
1735 destroy_prefetch_work(work);
1736 }
1737
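/*
 * Resolve all SGEs to MRs and stash them in the work item. On failure,
 * work->num_sge is set to the number of MRs already taken so that
 * destroy_prefetch_work() releases only those.
 */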
1738 static int init_prefetch_work(struct ib_pd *pd,
1739 enum ib_uverbs_advise_mr_advice advice,
1740 u32 pf_flags, struct prefetch_mr_work *work,
1741 struct ib_sge *sg_list, u32 num_sge)
1742 {
1743 u32 i;
1744
1745 INIT_WORK(&work->work, mlx5_ib_prefetch_mr_work);
1746 work->pf_flags = pf_flags;
1747
1748 for (i = 0; i < num_sge; ++i) {
1749 struct mlx5_ib_mr *mr;
1750
1751 mr = get_prefetchable_mr(pd, advice, sg_list[i].lkey);
1752 if (IS_ERR(mr)) {
1753 work->num_sge = i;
1754 return PTR_ERR(mr);
1755 }
1756 work->frags[i].io_virt = sg_list[i].addr;
1757 work->frags[i].length = sg_list[i].length;
1758 work->frags[i].mr = mr;
1759 }
1760 work->num_sge = num_sge;
1761 return 0;
1762 }
1763
1764 static int mlx5_ib_prefetch_sg_list(struct ib_pd *pd,
1765 enum ib_uverbs_advise_mr_advice advice,
1766 u32 pf_flags, struct ib_sge *sg_list,
1767 u32 num_sge)
1768 {
1769 u32 bytes_mapped = 0;
1770 int ret = 0;
1771 u32 i;
1772
1773 for (i = 0; i < num_sge; ++i) {
1774 struct mlx5_ib_mr *mr;
1775
1776 mr = get_prefetchable_mr(pd, advice, sg_list[i].lkey);
1777 if (IS_ERR(mr))
1778 return PTR_ERR(mr);
1779 ret = pagefault_mr(mr, sg_list[i].addr, sg_list[i].length,
1780 &bytes_mapped, pf_flags);
1781 if (ret < 0) {
1782 mlx5r_deref_odp_mkey(&mr->mmkey);
1783 return ret;
1784 }
1785 mlx5_update_odp_stats(mr, prefetch, ret);
1786 mlx5r_deref_odp_mkey(&mr->mmkey);
1787 }
1788
1789 return 0;
1790 }
1791
1792 int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,
1793 enum ib_uverbs_advise_mr_advice advice,
1794 u32 flags, struct ib_sge *sg_list, u32 num_sge)
1795 {
1796 u32 pf_flags = 0;
1797 struct prefetch_mr_work *work;
1798 int rc;
1799
1800 if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH)
1801 pf_flags |= MLX5_PF_FLAGS_DOWNGRADE;
1802
1803 if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT)
1804 pf_flags |= MLX5_PF_FLAGS_SNAPSHOT;
1805
1806 if (flags & IB_UVERBS_ADVISE_MR_FLAG_FLUSH)
1807 return mlx5_ib_prefetch_sg_list(pd, advice, pf_flags, sg_list,
1808 num_sge);
1809
1810 work = kvzalloc(struct_size(work, frags, num_sge), GFP_KERNEL);
1811 if (!work)
1812 return -ENOMEM;
1813
1814 rc = init_prefetch_work(pd, advice, pf_flags, work, sg_list, num_sge);
1815 if (rc) {
1816 destroy_prefetch_work(work);
1817 return rc;
1818 }
1819 queue_work(system_unbound_wq, &work->work);
1820 return 0;
1821 }