#include <linux/types.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/task.h>
#include <linux/pid.h>
#include <linux/slab.h>
#include <linux/export.h>
#include <linux/vmalloc.h>
#include <linux/hugetlb.h>
#include <linux/interval_tree.h>
#include <linux/hmm.h>
#include <linux/pagemap.h>

#include <rdma/ib_verbs.h>
#include <rdma/ib_umem.h>
#include <rdma/ib_umem_odp.h>

#include "uverbs.h"

static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp,
				   const struct mmu_interval_notifier_ops *ops)
{
	int ret;

	umem_odp->umem.is_odp = 1;
	mutex_init(&umem_odp->umem_mutex);

	if (!umem_odp->is_implicit_odp) {
		size_t page_size = 1UL << umem_odp->page_shift;
		unsigned long start;
		unsigned long end;
		size_t ndmas, npfns;

		start = ALIGN_DOWN(umem_odp->umem.address, page_size);
		if (check_add_overflow(umem_odp->umem.address,
				       (unsigned long)umem_odp->umem.length,
				       &end))
			return -EOVERFLOW;
		end = ALIGN(end, page_size);
		if (unlikely(end < page_size))
			return -EOVERFLOW;

		ndmas = (end - start) >> umem_odp->page_shift;
		if (!ndmas)
			return -EINVAL;

		npfns = (end - start) >> PAGE_SHIFT;
		umem_odp->pfn_list = kvcalloc(
			npfns, sizeof(*umem_odp->pfn_list), GFP_KERNEL);
		if (!umem_odp->pfn_list)
			return -ENOMEM;

		umem_odp->dma_list = kvcalloc(
			ndmas, sizeof(*umem_odp->dma_list), GFP_KERNEL);
		if (!umem_odp->dma_list) {
			ret = -ENOMEM;
			goto out_pfn_list;
		}

		ret = mmu_interval_notifier_insert(&umem_odp->notifier,
						   umem_odp->umem.owning_mm,
						   start, end - start, ops);
		if (ret)
			goto out_dma_list;
	}

	return 0;

out_dma_list:
	kvfree(umem_odp->dma_list);
out_pfn_list:
	kvfree(umem_odp->pfn_list);
	return ret;
}

/**
 * ib_umem_odp_alloc_implicit - Allocate a parent implicit ODP umem
 *
 * Implicit ODP umems do not have a VA range and do not have any page lists.
 * They exist only to hold the per_mm reference and help the driver create
 * children umems.
 *
 * @device: IB device to create the umem for
 * @access: ib_reg_mr access flags
 */
struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_device *device,
					       int access)
{
	struct ib_umem *umem;
	struct ib_umem_odp *umem_odp;
	int ret;

	if (access & IB_ACCESS_HUGETLB)
		return ERR_PTR(-EINVAL);

	umem_odp = kzalloc(sizeof(*umem_odp), GFP_KERNEL);
	if (!umem_odp)
		return ERR_PTR(-ENOMEM);
	umem = &umem_odp->umem;
	umem->ibdev = device;
	umem->writable = ib_access_writable(access);
	umem->owning_mm = current->mm;
	umem_odp->is_implicit_odp = 1;
	umem_odp->page_shift = PAGE_SHIFT;

	umem_odp->tgid = get_task_pid(current->group_leader, PIDTYPE_PID);
	ret = ib_init_umem_odp(umem_odp, NULL);
	if (ret) {
		put_pid(umem_odp->tgid);
		kfree(umem_odp);
		return ERR_PTR(ret);
	}
	return umem_odp;
}
EXPORT_SYMBOL(ib_umem_odp_alloc_implicit);

/**
 * ib_umem_odp_alloc_child - Allocate a child ODP umem under an implicit
 *                           parent ODP umem
 *
 * @root: The parent umem enclosing the child. This must be allocated using
 *        ib_umem_odp_alloc_implicit()
 * @addr: The starting userspace VA
 * @size: The length of the userspace VA
 * @ops: MMU interval ops, currently only @invalidate
 */
struct ib_umem_odp *
ib_umem_odp_alloc_child(struct ib_umem_odp *root, unsigned long addr,
			size_t size,
			const struct mmu_interval_notifier_ops *ops)
{
	/*
	 * The caller must ensure that root cannot be freed for the duration
	 * of this call.
	 */
	struct ib_umem_odp *odp_data;
	struct ib_umem *umem;
	int ret;

	if (WARN_ON(!root->is_implicit_odp))
		return ERR_PTR(-EINVAL);

	odp_data = kzalloc(sizeof(*odp_data), GFP_KERNEL);
	if (!odp_data)
		return ERR_PTR(-ENOMEM);
	umem = &odp_data->umem;
	umem->ibdev = root->umem.ibdev;
	umem->length = size;
	umem->address = addr;
	umem->writable = root->umem.writable;
	umem->owning_mm = root->umem.owning_mm;
	odp_data->page_shift = PAGE_SHIFT;
	odp_data->notifier.ops = ops;

	/*
	 * A mmget must be held while registering the notifier; at this point
	 * owning_mm only has a mm_grab on it.
	 */
	if (!mmget_not_zero(umem->owning_mm)) {
		ret = -EFAULT;
		goto out_free;
	}

	odp_data->tgid = get_pid(root->tgid);
	ret = ib_init_umem_odp(odp_data, ops);
	if (ret)
		goto out_tgid;
	mmput(umem->owning_mm);
	return odp_data;

out_tgid:
	put_pid(odp_data->tgid);
	mmput(umem->owning_mm);
out_free:
	kfree(odp_data);
	return ERR_PTR(ret);
}
EXPORT_SYMBOL(ib_umem_odp_alloc_child);
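
/*
 * Illustrative sketch (not part of this file): how a driver might pair
 * ib_umem_odp_alloc_implicit() with ib_umem_odp_alloc_child(). The function
 * name, the access flags and the error handling are assumptions for the
 * example only; real drivers create children lazily from their page-fault
 * path and keep the implicit root alive until every child is released.
 */
static __maybe_unused struct ib_umem_odp *
example_odp_alloc_implicit_child(struct ib_device *device, unsigned long addr,
				 size_t size,
				 const struct mmu_interval_notifier_ops *ops)
{
	struct ib_umem_odp *root, *child;

	root = ib_umem_odp_alloc_implicit(device, IB_ACCESS_ON_DEMAND);
	if (IS_ERR(root))
		return root;

	child = ib_umem_odp_alloc_child(root, addr, size, ops);
	if (IS_ERR(child)) {
		ib_umem_odp_release(root);
		return child;
	}

	/*
	 * The child holds a reference on root->tgid; the caller must still
	 * release root only after all of its children are gone.
	 */
	return child;
}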

/**
 * ib_umem_odp_get - Create a umem_odp for a userspace va
 *
 * @device: IB device struct to get the umem for
 * @addr: userspace virtual address to start at
 * @size: length of region to pin
 * @access: IB_ACCESS_xxx flags for memory being pinned
 * @ops: MMU interval ops, currently only @invalidate
 *
 * The driver should use this when the access flags indicate ODP memory. It
 * avoids pinning; instead it stores the mm for future page fault handling in
 * conjunction with MMU notifiers.
 */
struct ib_umem_odp *ib_umem_odp_get(struct ib_device *device,
				    unsigned long addr, size_t size, int access,
				    const struct mmu_interval_notifier_ops *ops)
{
	struct ib_umem_odp *umem_odp;
	int ret;

	if (WARN_ON_ONCE(!(access & IB_ACCESS_ON_DEMAND)))
		return ERR_PTR(-EINVAL);

	umem_odp = kzalloc(sizeof(struct ib_umem_odp), GFP_KERNEL);
	if (!umem_odp)
		return ERR_PTR(-ENOMEM);

	umem_odp->umem.ibdev = device;
	umem_odp->umem.length = size;
	umem_odp->umem.address = addr;
	umem_odp->umem.writable = ib_access_writable(access);
	umem_odp->umem.owning_mm = current->mm;
	umem_odp->notifier.ops = ops;

	umem_odp->page_shift = PAGE_SHIFT;
#ifdef CONFIG_HUGETLB_PAGE
	if (access & IB_ACCESS_HUGETLB)
		umem_odp->page_shift = HPAGE_SHIFT;
#endif

	umem_odp->tgid = get_task_pid(current->group_leader, PIDTYPE_PID);
	ret = ib_init_umem_odp(umem_odp, ops);
	if (ret)
		goto err_put_pid;
	return umem_odp;

err_put_pid:
	put_pid(umem_odp->tgid);
	kfree(umem_odp);
	return ERR_PTR(ret);
}
EXPORT_SYMBOL(ib_umem_odp_get);

void ib_umem_odp_release(struct ib_umem_odp *umem_odp)
{
	/*
	 * Ensure that no more pages are mapped in the umem.
	 *
	 * It is the driver's responsibility to ensure, before calling us,
	 * that the hardware will not attempt to access the MR any more.
	 */
	if (!umem_odp->is_implicit_odp) {
		mutex_lock(&umem_odp->umem_mutex);
		ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp),
					    ib_umem_end(umem_odp));
		mutex_unlock(&umem_odp->umem_mutex);
		mmu_interval_notifier_remove(&umem_odp->notifier);
		kvfree(umem_odp->dma_list);
		kvfree(umem_odp->pfn_list);
	}
	put_pid(umem_odp->tgid);
	kfree(umem_odp);
}
EXPORT_SYMBOL(ib_umem_odp_release);
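
/*
 * Illustrative sketch (not part of this file): the typical registration-time
 * pairing of ib_umem_odp_get() and ib_umem_odp_release(). The function name
 * and the driver-side setup step are assumptions for the example; note that
 * the access flags must include IB_ACCESS_ON_DEMAND or ib_umem_odp_get()
 * refuses the request.
 */
static __maybe_unused struct ib_umem_odp *
example_odp_register_mr(struct ib_device *device, unsigned long addr,
			size_t size,
			const struct mmu_interval_notifier_ops *ops)
{
	struct ib_umem_odp *umem_odp;

	umem_odp = ib_umem_odp_get(device, addr, size,
				   IB_ACCESS_ON_DEMAND | IB_ACCESS_LOCAL_WRITE,
				   ops);
	if (IS_ERR(umem_odp))
		return umem_odp;

	/*
	 * The driver would now wire umem_odp into its MR object. Nothing is
	 * mapped yet; pages are brought in later through
	 * ib_umem_odp_map_dma_and_lock() from the page-fault path, and the
	 * whole thing is torn down with ib_umem_odp_release(umem_odp).
	 */
	return umem_odp;
}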

/*
 * Map a single page for DMA and record it in the on-demand paging DMA list.
 *
 * @umem_odp: the umem to insert the page into.
 * @dma_index: index in the umem's dma_list to add the mapping to.
 * @page: the page struct to map and add.
 * @access_mask: access permissions needed for this page.
 *
 * Returns -EFAULT if the DMA mapping operation fails, 0 otherwise.
 */
static int ib_umem_odp_map_dma_single_page(
		struct ib_umem_odp *umem_odp,
		unsigned int dma_index,
		struct page *page,
		u64 access_mask)
{
	struct ib_device *dev = umem_odp->umem.ibdev;
	dma_addr_t *dma_addr = &umem_odp->dma_list[dma_index];

	if (*dma_addr) {
		/*
		 * If the page is already DMA mapped it went through a
		 * non-invalidating transition, such as read-only to
		 * writable. Just resync the access flags.
		 */
		*dma_addr = (*dma_addr & ODP_DMA_ADDR_MASK) | access_mask;
		return 0;
	}

	*dma_addr = ib_dma_map_page(dev, page, 0, 1 << umem_odp->page_shift,
				    DMA_BIDIRECTIONAL);
	if (ib_dma_mapping_error(dev, *dma_addr)) {
		*dma_addr = 0;
		return -EFAULT;
	}
	umem_odp->npages++;
	*dma_addr |= access_mask;
	return 0;
}

/**
 * ib_umem_odp_map_dma_and_lock - DMA map userspace memory in an ODP MR
 *                                and lock the mutex once the page faulting
 *                                has completed.
 *
 * Maps the range passed in the argument to DMA addresses. The DMA addresses
 * of the mapped pages are updated in umem_odp->dma_list. Upon success the
 * ODP MR will be locked to let the caller complete its device page table
 * update.
 *
 * Returns the number of pages mapped on success, or a negative error code
 * on failure.
 * @umem_odp: the umem to map and pin
 * @user_virt: the address from which we need to map
 * @bcnt: the minimal number of bytes to pin and map. The mapping might be
 *        bigger due to alignment, and may also be smaller in case of an
 *        error pinning or mapping a page. The actual number of pages mapped
 *        is returned in the return value.
 * @access_mask: bit mask of the requested access permissions for the given
 *               range
 * @fault: is faulting required for the given range
 */
int ib_umem_odp_map_dma_and_lock(struct ib_umem_odp *umem_odp, u64 user_virt,
				 u64 bcnt, u64 access_mask, bool fault)
			__acquires(&umem_odp->umem_mutex)
{
	struct task_struct *owning_process = NULL;
	struct mm_struct *owning_mm = umem_odp->umem.owning_mm;
	int pfn_index, dma_index, ret = 0, start_idx;
	unsigned int page_shift, hmm_order, pfn_start_idx;
	unsigned long num_pfns, current_seq;
	struct hmm_range range = {};
	unsigned long timeout;

	if (access_mask == 0)
		return -EINVAL;

	if (user_virt < ib_umem_start(umem_odp) ||
	    user_virt + bcnt > ib_umem_end(umem_odp))
		return -EFAULT;

	page_shift = umem_odp->page_shift;

	/*
	 * owning_process is allowed to be NULL; that means the mm somehow
	 * outlived the originating process. In that case mmget_not_zero()
	 * is expected to fail below.
	 */
	owning_process = get_pid_task(umem_odp->tgid, PIDTYPE_PID);
	if (!owning_process || !mmget_not_zero(owning_mm)) {
		ret = -EINVAL;
		goto out_put_task;
	}

	range.notifier = &umem_odp->notifier;
	range.start = ALIGN_DOWN(user_virt, 1UL << page_shift);
	range.end = ALIGN(user_virt + bcnt, 1UL << page_shift);
	pfn_start_idx = (range.start - ib_umem_start(umem_odp)) >> PAGE_SHIFT;
	num_pfns = (range.end - range.start) >> PAGE_SHIFT;
	if (fault) {
		range.default_flags = HMM_PFN_REQ_FAULT;

		if (access_mask & ODP_WRITE_ALLOWED_BIT)
			range.default_flags |= HMM_PFN_REQ_WRITE;
	}

	range.hmm_pfns = &(umem_odp->pfn_list[pfn_start_idx]);
	timeout = jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);

retry:
	current_seq = range.notifier_seq =
		mmu_interval_read_begin(&umem_odp->notifier);

	mmap_read_lock(owning_mm);
	ret = hmm_range_fault(&range);
	mmap_read_unlock(owning_mm);
	if (unlikely(ret)) {
		if (ret == -EBUSY && !time_after(jiffies, timeout))
			goto retry;
		goto out_put_mm;
	}

	start_idx = (range.start - ib_umem_start(umem_odp)) >> page_shift;
	dma_index = start_idx;

	mutex_lock(&umem_odp->umem_mutex);
	if (mmu_interval_read_retry(&umem_odp->notifier, current_seq)) {
		mutex_unlock(&umem_odp->umem_mutex);
		goto retry;
	}

	for (pfn_index = 0; pfn_index < num_pfns;
	     pfn_index += 1 << (page_shift - PAGE_SHIFT), dma_index++) {

		if (fault) {
			/*
			 * Since we asked hmm_range_fault() to populate the
			 * pages it should not return an error entry on
			 * success.
			 */
			WARN_ON(range.hmm_pfns[pfn_index] & HMM_PFN_ERROR);
			WARN_ON(!(range.hmm_pfns[pfn_index] & HMM_PFN_VALID));
		} else {
			if (!(range.hmm_pfns[pfn_index] & HMM_PFN_VALID)) {
				WARN_ON(umem_odp->dma_list[dma_index]);
				continue;
			}
			access_mask = ODP_READ_ALLOWED_BIT;
			if (range.hmm_pfns[pfn_index] & HMM_PFN_WRITE)
				access_mask |= ODP_WRITE_ALLOWED_BIT;
		}

		hmm_order = hmm_pfn_to_map_order(range.hmm_pfns[pfn_index]);
		/*
		 * If a hugepage was detected and ODP wasn't set for it, the
		 * umem page_shift will be used; the opposite case is an error.
		 */
		if (hmm_order + PAGE_SHIFT < page_shift) {
			ret = -EINVAL;
			ibdev_dbg(umem_odp->umem.ibdev,
				  "%s: unexpected hmm_order %u, page_shift %u\n",
				  __func__, hmm_order, page_shift);
			break;
		}

		ret = ib_umem_odp_map_dma_single_page(
			umem_odp, dma_index,
			hmm_pfn_to_page(range.hmm_pfns[pfn_index]),
			access_mask);
		if (ret < 0) {
			ibdev_dbg(umem_odp->umem.ibdev,
				  "ib_umem_odp_map_dma_single_page failed with error %d\n",
				  ret);
			break;
		}
	}

	if (!ret)
		ret = dma_index - start_idx;
	else
		mutex_unlock(&umem_odp->umem_mutex);

out_put_mm:
	mmput_async(owning_mm);
out_put_task:
	if (owning_process)
		put_task_struct(owning_process);
	return ret;
}
EXPORT_SYMBOL(ib_umem_odp_map_dma_and_lock);
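
/*
 * Illustrative sketch (not part of this file): how a driver's page-fault
 * path might use ib_umem_odp_map_dma_and_lock(). The function name and the
 * step that programs the device page tables are assumptions for the example;
 * the key point is that on success the umem_mutex is held and must be
 * dropped by the caller once its device page tables match dma_list.
 */
static __maybe_unused int example_odp_fault(struct ib_umem_odp *umem_odp,
					    u64 io_virt, size_t bcnt)
{
	u64 access_mask = ODP_READ_ALLOWED_BIT;
	int npages;

	if (umem_odp->umem.writable)
		access_mask |= ODP_WRITE_ALLOWED_BIT;

	npages = ib_umem_odp_map_dma_and_lock(umem_odp, io_virt, bcnt,
					      access_mask, true);
	if (npages < 0)
		return npages;

	/*
	 * umem_odp->dma_list is stable here because umem_mutex is held;
	 * a real driver would now program its device page tables from it.
	 */
	mutex_unlock(&umem_odp->umem_mutex);
	return npages;
}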

void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt,
				 u64 bound)
{
	dma_addr_t dma_addr;
	dma_addr_t dma;
	int idx;
	u64 addr;
	struct ib_device *dev = umem_odp->umem.ibdev;

	lockdep_assert_held(&umem_odp->umem_mutex);

	virt = max_t(u64, virt, ib_umem_start(umem_odp));
	bound = min_t(u64, bound, ib_umem_end(umem_odp));
	for (addr = virt; addr < bound; addr += BIT(umem_odp->page_shift)) {
		idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift;
		dma = umem_odp->dma_list[idx];

		/* Skip entries that were never mapped or already unmapped. */
		if (dma) {
			unsigned long pfn_idx = (addr - ib_umem_start(umem_odp)) >> PAGE_SHIFT;
			struct page *page = hmm_pfn_to_page(umem_odp->pfn_list[pfn_idx]);

			dma_addr = dma & ODP_DMA_ADDR_MASK;
			ib_dma_unmap_page(dev, dma_addr,
					  BIT(umem_odp->page_shift),
					  DMA_BIDIRECTIONAL);
			if (dma & ODP_WRITE_ALLOWED_BIT) {
				struct page *head_page = compound_head(page);

				/*
				 * set_page_dirty() prefers being called with
				 * the page lock, but MMU notifiers run
				 * sometimes with and sometimes without it.
				 * We rely on the umem_mutex instead to keep
				 * other mmu notifiers from continuing and
				 * allowing the page mapping to be removed.
				 */
				set_page_dirty(head_page);
			}
			umem_odp->dma_list[idx] = 0;
			umem_odp->npages--;
		}
	}
}
EXPORT_SYMBOL(ib_umem_odp_unmap_dma_pages);
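
/*
 * Illustrative sketch (not part of this file): the mmu_interval_notifier_ops
 * invalidate callback a driver would pair with the APIs above. The function
 * name and the "invalidate device page tables" step are assumptions for the
 * example; the essential pattern is taking umem_mutex, bumping the notifier
 * sequence, and unmapping the affected range.
 */
static __maybe_unused bool
example_odp_invalidate(struct mmu_interval_notifier *mni,
		       const struct mmu_notifier_range *range,
		       unsigned long cur_seq)
{
	struct ib_umem_odp *umem_odp =
		container_of(mni, struct ib_umem_odp, notifier);

	if (!mmu_notifier_range_blockable(range))
		return false;

	mutex_lock(&umem_odp->umem_mutex);
	mmu_interval_set_seq(mni, cur_seq);

	/* A real driver would invalidate its device page tables here. */

	ib_umem_odp_unmap_dma_pages(umem_odp, range->start, range->end);
	mutex_unlock(&umem_odp->umem_mutex);
	return true;
}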