0024 #include <linux/types.h>
0025 #include <linux/sched/task.h>
0026 #include "amdgpu_sync.h"
0027 #include "amdgpu_object.h"
0028 #include "amdgpu_vm.h"
0029 #include "amdgpu_mn.h"
0030 #include "amdgpu.h"
0031 #include "amdgpu_xgmi.h"
0032 #include "kfd_priv.h"
0033 #include "kfd_svm.h"
0034 #include "kfd_migrate.h"
0035 #include "kfd_smi_events.h"
0036
0037 #ifdef dev_fmt
0038 #undef dev_fmt
0039 #endif
0040 #define dev_fmt(fmt) "kfd_svm: %s: " fmt, __func__
0041
0042 #define AMDGPU_SVM_RANGE_RESTORE_DELAY_MS 1

/* Long enough to ensure no retry fault comes after svm range is restored and
 * page table is updated
 */
0047 #define AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING (2UL * NSEC_PER_MSEC)

/* Giant svm range split into smaller ranges based on this, it is decided using
 * minimum of all dGPU/APU 1/32 VRAM size, between 2MB to 1GB and alignment to
 * power of 2MB to avoid fragmentation
 */
0053 static uint64_t max_svm_range_pages;
0054
0055 struct criu_svm_metadata {
0056 struct list_head list;
0057 struct kfd_criu_svm_range_priv_data data;
0058 };
0059
0060 static void svm_range_evict_svm_bo_worker(struct work_struct *work);
0061 static bool
0062 svm_range_cpu_invalidate_pagetables(struct mmu_interval_notifier *mni,
0063 const struct mmu_notifier_range *range,
0064 unsigned long cur_seq);
0065 static int
0066 svm_range_check_vm(struct kfd_process *p, uint64_t start, uint64_t last,
0067 uint64_t *bo_s, uint64_t *bo_l);
0068 static const struct mmu_interval_notifier_ops svm_range_mn_ops = {
0069 .invalidate = svm_range_cpu_invalidate_pagetables,
0070 };
0071
/**
 * svm_range_unlink - unlink svm_range from lists and interval tree
 * @prange: svm range structure to be removed
 *
 * Remove the svm_range from the svms and svm_bo lists and the svms
 * interval tree.
 *
 * Context: The caller must hold svms->lock
 */
0081 static void svm_range_unlink(struct svm_range *prange)
0082 {
0083 pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx]\n", prange->svms,
0084 prange, prange->start, prange->last);
0085
0086 if (prange->svm_bo) {
0087 spin_lock(&prange->svm_bo->list_lock);
0088 list_del(&prange->svm_bo_list);
0089 spin_unlock(&prange->svm_bo->list_lock);
0090 }
0091
0092 list_del(&prange->list);
0093 if (prange->it_node.start != 0 && prange->it_node.last != 0)
0094 interval_tree_remove(&prange->it_node, &prange->svms->objects);
0095 }
0096
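/* Register an MMU interval notifier covering prange's address range so CPU
 * page table invalidations are forwarded to
 * svm_range_cpu_invalidate_pagetables.
 */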
0097 static void
0098 svm_range_add_notifier_locked(struct mm_struct *mm, struct svm_range *prange)
0099 {
0100 pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx]\n", prange->svms,
0101 prange, prange->start, prange->last);
0102
0103 mmu_interval_notifier_insert_locked(&prange->notifier, mm,
0104 prange->start << PAGE_SHIFT,
0105 prange->npages << PAGE_SHIFT,
0106 &svm_range_mn_ops);
0107 }
0108
/**
 * svm_range_add_to_svms - add svm range to svms
 * @prange: svm range structure to be added
 *
 * Add the svm range to svms interval tree and link list
 *
 * Context: The caller must hold svms->lock
 */
0117 static void svm_range_add_to_svms(struct svm_range *prange)
0118 {
0119 pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx]\n", prange->svms,
0120 prange, prange->start, prange->last);
0121
0122 list_move_tail(&prange->list, &prange->svms->list);
0123 prange->it_node.start = prange->start;
0124 prange->it_node.last = prange->last;
0125 interval_tree_insert(&prange->it_node, &prange->svms->objects);
0126 }
0127
0128 static void svm_range_remove_notifier(struct svm_range *prange)
0129 {
0130 pr_debug("remove notifier svms 0x%p prange 0x%p [0x%lx 0x%lx]\n",
0131 prange->svms, prange,
0132 prange->notifier.interval_tree.start >> PAGE_SHIFT,
0133 prange->notifier.interval_tree.last >> PAGE_SHIFT);
0134
0135 if (prange->notifier.interval_tree.start != 0 &&
0136 prange->notifier.interval_tree.last != 0)
0137 mmu_interval_notifier_remove(&prange->notifier);
0138 }
0139
0140 static bool
0141 svm_is_valid_dma_mapping_addr(struct device *dev, dma_addr_t dma_addr)
0142 {
0143 return dma_addr && !dma_mapping_error(dev, dma_addr) &&
0144 !(dma_addr & SVM_RANGE_VRAM_DOMAIN);
0145 }
0146
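/* DMA-map the pages in [offset, offset + npages) of @prange for one GPU.
 * Zone-device (VRAM) pages are not DMA-mapped; they are translated to the
 * owning device's VRAM offset and tagged with SVM_RANGE_VRAM_DOMAIN instead.
 */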
0147 static int
0148 svm_range_dma_map_dev(struct amdgpu_device *adev, struct svm_range *prange,
0149 unsigned long offset, unsigned long npages,
0150 unsigned long *hmm_pfns, uint32_t gpuidx)
0151 {
0152 enum dma_data_direction dir = DMA_BIDIRECTIONAL;
0153 dma_addr_t *addr = prange->dma_addr[gpuidx];
0154 struct device *dev = adev->dev;
0155 struct page *page;
0156 int i, r;
0157
0158 if (!addr) {
0159 addr = kvcalloc(prange->npages, sizeof(*addr), GFP_KERNEL);
0160 if (!addr)
0161 return -ENOMEM;
0162 prange->dma_addr[gpuidx] = addr;
0163 }
0164
0165 addr += offset;
0166 for (i = 0; i < npages; i++) {
0167 if (svm_is_valid_dma_mapping_addr(dev, addr[i]))
0168 dma_unmap_page(dev, addr[i], PAGE_SIZE, dir);
0169
0170 page = hmm_pfn_to_page(hmm_pfns[i]);
0171 if (is_zone_device_page(page)) {
0172 struct amdgpu_device *bo_adev =
0173 amdgpu_ttm_adev(prange->svm_bo->bo->tbo.bdev);
0174
0175 addr[i] = (hmm_pfns[i] << PAGE_SHIFT) +
0176 bo_adev->vm_manager.vram_base_offset -
0177 bo_adev->kfd.dev->pgmap.range.start;
0178 addr[i] |= SVM_RANGE_VRAM_DOMAIN;
0179 pr_debug_ratelimited("vram address: 0x%llx\n", addr[i]);
0180 continue;
0181 }
0182 addr[i] = dma_map_page(dev, page, 0, PAGE_SIZE, dir);
0183 r = dma_mapping_error(dev, addr[i]);
0184 if (r) {
0185 dev_err(dev, "failed %d dma_map_page\n", r);
0186 return r;
0187 }
0188 pr_debug_ratelimited("dma mapping 0x%llx for page addr 0x%lx\n",
0189 addr[i] >> PAGE_SHIFT, page_to_pfn(page));
0190 }
0191 return 0;
0192 }
0193
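/* DMA-map the pages of @prange for every GPU set in @bitmap */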
0194 static int
0195 svm_range_dma_map(struct svm_range *prange, unsigned long *bitmap,
0196 unsigned long offset, unsigned long npages,
0197 unsigned long *hmm_pfns)
0198 {
0199 struct kfd_process *p;
0200 uint32_t gpuidx;
int r = 0;
0202
0203 p = container_of(prange->svms, struct kfd_process, svms);
0204
0205 for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) {
0206 struct kfd_process_device *pdd;
0207
0208 pr_debug("mapping to gpu idx 0x%x\n", gpuidx);
0209 pdd = kfd_process_device_from_gpuidx(p, gpuidx);
0210 if (!pdd) {
0211 pr_debug("failed to find device idx %d\n", gpuidx);
0212 return -EINVAL;
0213 }
0214
0215 r = svm_range_dma_map_dev(pdd->dev->adev, prange, offset, npages,
0216 hmm_pfns, gpuidx);
0217 if (r)
0218 break;
0219 }
0220
0221 return r;
0222 }
0223
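/* Unmap previously DMA-mapped system pages; VRAM-domain entries are skipped */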
0224 void svm_range_dma_unmap(struct device *dev, dma_addr_t *dma_addr,
0225 unsigned long offset, unsigned long npages)
0226 {
0227 enum dma_data_direction dir = DMA_BIDIRECTIONAL;
0228 int i;
0229
0230 if (!dma_addr)
0231 return;
0232
0233 for (i = offset; i < offset + npages; i++) {
0234 if (!svm_is_valid_dma_mapping_addr(dev, dma_addr[i]))
0235 continue;
0236 pr_debug_ratelimited("unmap 0x%llx\n", dma_addr[i] >> PAGE_SHIFT);
0237 dma_unmap_page(dev, dma_addr[i], PAGE_SIZE, dir);
0238 dma_addr[i] = 0;
0239 }
0240 }
0241
0242 void svm_range_free_dma_mappings(struct svm_range *prange)
0243 {
0244 struct kfd_process_device *pdd;
0245 dma_addr_t *dma_addr;
0246 struct device *dev;
0247 struct kfd_process *p;
0248 uint32_t gpuidx;
0249
0250 p = container_of(prange->svms, struct kfd_process, svms);
0251
0252 for (gpuidx = 0; gpuidx < MAX_GPU_INSTANCE; gpuidx++) {
0253 dma_addr = prange->dma_addr[gpuidx];
0254 if (!dma_addr)
0255 continue;
0256
0257 pdd = kfd_process_device_from_gpuidx(p, gpuidx);
0258 if (!pdd) {
0259 pr_debug("failed to find device idx %d\n", gpuidx);
0260 continue;
0261 }
0262 dev = &pdd->dev->pdev->dev;
0263 svm_range_dma_unmap(dev, dma_addr, 0, prange->npages);
0264 kvfree(dma_addr);
0265 prange->dma_addr[gpuidx] = NULL;
0266 }
0267 }
0268
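/* Release the VRAM BO, DMA mappings and memory-limit accounting of @prange,
 * then free the range structure itself.
 */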
0269 static void svm_range_free(struct svm_range *prange, bool update_mem_usage)
0270 {
0271 uint64_t size = (prange->last - prange->start + 1) << PAGE_SHIFT;
0272 struct kfd_process *p = container_of(prange->svms, struct kfd_process, svms);
0273
0274 pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx]\n", prange->svms, prange,
0275 prange->start, prange->last);
0276
0277 svm_range_vram_node_free(prange);
0278 svm_range_free_dma_mappings(prange);
0279
0280 if (update_mem_usage && !p->xnack_enabled) {
0281 pr_debug("unreserve mem limit: %lld\n", size);
0282 amdgpu_amdkfd_unreserve_mem_limit(NULL, size,
0283 KFD_IOC_ALLOC_MEM_FLAGS_USERPTR);
0284 }
0285 mutex_destroy(&prange->lock);
0286 mutex_destroy(&prange->migrate_mutex);
0287 kfree(prange);
0288 }
0289
0290 static void
0291 svm_range_set_default_attributes(int32_t *location, int32_t *prefetch_loc,
0292 uint8_t *granularity, uint32_t *flags)
0293 {
0294 *location = KFD_IOCTL_SVM_LOCATION_UNDEFINED;
0295 *prefetch_loc = KFD_IOCTL_SVM_LOCATION_UNDEFINED;
0296 *granularity = 9;
0297 *flags =
0298 KFD_IOCTL_SVM_FLAG_HOST_ACCESS | KFD_IOCTL_SVM_FLAG_COHERENT;
0299 }
0300
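/* Allocate and initialize a new svm_range covering [start, last] in pages.
 * When XNACK is disabled and @update_mem_usage is set, the size is charged
 * against the resident system memory limit.
 */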
0301 static struct
0302 svm_range *svm_range_new(struct svm_range_list *svms, uint64_t start,
0303 uint64_t last, bool update_mem_usage)
0304 {
0305 uint64_t size = last - start + 1;
0306 struct svm_range *prange;
0307 struct kfd_process *p;
0308
0309 prange = kzalloc(sizeof(*prange), GFP_KERNEL);
0310 if (!prange)
0311 return NULL;
0312
0313 p = container_of(svms, struct kfd_process, svms);
0314 if (!p->xnack_enabled && update_mem_usage &&
0315 amdgpu_amdkfd_reserve_mem_limit(NULL, size << PAGE_SHIFT,
0316 KFD_IOC_ALLOC_MEM_FLAGS_USERPTR)) {
0317 pr_info("SVM mapping failed, exceeds resident system memory limit\n");
0318 kfree(prange);
0319 return NULL;
0320 }
0321 prange->npages = size;
0322 prange->svms = svms;
0323 prange->start = start;
0324 prange->last = last;
0325 INIT_LIST_HEAD(&prange->list);
0326 INIT_LIST_HEAD(&prange->update_list);
0327 INIT_LIST_HEAD(&prange->svm_bo_list);
0328 INIT_LIST_HEAD(&prange->deferred_list);
0329 INIT_LIST_HEAD(&prange->child_list);
0330 atomic_set(&prange->invalid, 0);
0331 prange->validate_timestamp = 0;
0332 mutex_init(&prange->migrate_mutex);
0333 mutex_init(&prange->lock);
0334
0335 if (p->xnack_enabled)
0336 bitmap_copy(prange->bitmap_access, svms->bitmap_supported,
0337 MAX_GPU_INSTANCE);
0338
0339 svm_range_set_default_attributes(&prange->preferred_loc,
0340 &prange->prefetch_loc,
0341 &prange->granularity, &prange->flags);
0342
0343 pr_debug("svms 0x%p [0x%llx 0x%llx]\n", svms, start, last);
0344
0345 return prange;
0346 }
0347
0348 static bool svm_bo_ref_unless_zero(struct svm_range_bo *svm_bo)
0349 {
0350 if (!svm_bo || !kref_get_unless_zero(&svm_bo->kref))
0351 return false;
0352
0353 return true;
0354 }
0355
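/* Final kref release: detach all ranges still pointing at this svm_bo,
 * signal its eviction fence if needed, and free the backing amdgpu BO.
 */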
0356 static void svm_range_bo_release(struct kref *kref)
0357 {
0358 struct svm_range_bo *svm_bo;
0359
0360 svm_bo = container_of(kref, struct svm_range_bo, kref);
0361 pr_debug("svm_bo 0x%p\n", svm_bo);
0362
0363 spin_lock(&svm_bo->list_lock);
0364 while (!list_empty(&svm_bo->range_list)) {
0365 struct svm_range *prange =
0366 list_first_entry(&svm_bo->range_list,
0367 struct svm_range, svm_bo_list);
0368
/* list_del_init tells a concurrent svm_range_vram_node_new when
 * it is safe to reuse the svm_bo pointer and svm_bo_list head
 */
0371 list_del_init(&prange->svm_bo_list);
0372 spin_unlock(&svm_bo->list_lock);
0373
0374 pr_debug("svms 0x%p [0x%lx 0x%lx]\n", prange->svms,
0375 prange->start, prange->last);
0376 mutex_lock(&prange->lock);
0377 prange->svm_bo = NULL;
0378 mutex_unlock(&prange->lock);
0379
0380 spin_lock(&svm_bo->list_lock);
0381 }
0382 spin_unlock(&svm_bo->list_lock);
0383 if (!dma_fence_is_signaled(&svm_bo->eviction_fence->base)) {
/* We're not in the eviction worker.
 * Signal the fence and synchronize with any
 * pending eviction work.
 */
0388 dma_fence_signal(&svm_bo->eviction_fence->base);
0389 cancel_work_sync(&svm_bo->eviction_work);
0390 }
0391 dma_fence_put(&svm_bo->eviction_fence->base);
0392 amdgpu_bo_unref(&svm_bo->bo);
0393 kfree(svm_bo);
0394 }
0395
0396 static void svm_range_bo_wq_release(struct work_struct *work)
0397 {
0398 struct svm_range_bo *svm_bo;
0399
0400 svm_bo = container_of(work, struct svm_range_bo, release_work);
0401 svm_range_bo_release(&svm_bo->kref);
0402 }
0403
0404 static void svm_range_bo_release_async(struct kref *kref)
0405 {
0406 struct svm_range_bo *svm_bo;
0407
0408 svm_bo = container_of(kref, struct svm_range_bo, kref);
0409 pr_debug("svm_bo 0x%p\n", svm_bo);
0410 INIT_WORK(&svm_bo->release_work, svm_range_bo_wq_release);
0411 schedule_work(&svm_bo->release_work);
0412 }
0413
0414 void svm_range_bo_unref_async(struct svm_range_bo *svm_bo)
0415 {
0416 kref_put(&svm_bo->kref, svm_range_bo_release_async);
0417 }
0418
0419 static void svm_range_bo_unref(struct svm_range_bo *svm_bo)
0420 {
0421 if (svm_bo)
0422 kref_put(&svm_bo->kref, svm_range_bo_release);
0423 }
0424
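/* Check whether @prange can keep using its existing svm_bo on @adev. Returns
 * true if the old VRAM BO can be reused, false if a new one must be allocated
 * (no BO, BO on a different device, or BO currently being evicted).
 */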
0425 static bool
0426 svm_range_validate_svm_bo(struct amdgpu_device *adev, struct svm_range *prange)
0427 {
0428 struct amdgpu_device *bo_adev;
0429
0430 mutex_lock(&prange->lock);
0431 if (!prange->svm_bo) {
0432 mutex_unlock(&prange->lock);
0433 return false;
0434 }
0435 if (prange->ttm_res) {
/* We still have a reference, all is well */
0437 mutex_unlock(&prange->lock);
0438 return true;
0439 }
0440 if (svm_bo_ref_unless_zero(prange->svm_bo)) {
/*
 * Migrate from GPU to GPU, remove range from source svm_bo->bo
 * range list, and return false to allocate svm_bo from destination
 * adev.
 */
0446 bo_adev = amdgpu_ttm_adev(prange->svm_bo->bo->tbo.bdev);
0447 if (bo_adev != adev) {
0448 mutex_unlock(&prange->lock);
0449
0450 spin_lock(&prange->svm_bo->list_lock);
0451 list_del_init(&prange->svm_bo_list);
0452 spin_unlock(&prange->svm_bo->list_lock);
0453
0454 svm_range_bo_unref(prange->svm_bo);
0455 return false;
0456 }
0457 if (READ_ONCE(prange->svm_bo->evicting)) {
0458 struct dma_fence *f;
0459 struct svm_range_bo *svm_bo;

/* The BO is getting evicted,
 * we need to get a new one
 */
0463 mutex_unlock(&prange->lock);
0464 svm_bo = prange->svm_bo;
0465 f = dma_fence_get(&svm_bo->eviction_fence->base);
0466 svm_range_bo_unref(prange->svm_bo);
/* Wait for the fence to avoid a long spin-loop
 * at list_empty_careful below
 */
0470 dma_fence_wait(f, false);
0471 dma_fence_put(f);
0472 } else {
/* The BO was still around and we got
 * a new reference to it
 */
0476 mutex_unlock(&prange->lock);
0477 pr_debug("reuse old bo svms 0x%p [0x%lx 0x%lx]\n",
0478 prange->svms, prange->start, prange->last);
0479
0480 prange->ttm_res = prange->svm_bo->bo->tbo.resource;
0481 return true;
0482 }
0483
0484 } else {
0485 mutex_unlock(&prange->lock);
0486 }
0487
/* We need a new svm_bo. Spin-loop to wait for concurrent
 * svm_range_bo_release to finish removing this range from
 * its range list. After this, it is safe to reuse the
 * svm_bo pointer and svm_bo_list head.
 */
0493 while (!list_empty_careful(&prange->svm_bo_list))
0494 ;
0495
0496 return false;
0497 }
0498
0499 static struct svm_range_bo *svm_range_bo_new(void)
0500 {
0501 struct svm_range_bo *svm_bo;
0502
0503 svm_bo = kzalloc(sizeof(*svm_bo), GFP_KERNEL);
0504 if (!svm_bo)
0505 return NULL;
0506
0507 kref_init(&svm_bo->kref);
0508 INIT_LIST_HEAD(&svm_bo->range_list);
0509 spin_lock_init(&svm_bo->list_lock);
0510
0511 return svm_bo;
0512 }
0513
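/* Allocate (or reuse) a VRAM BO backing @prange on @adev and attach the
 * eviction fence. @clear requests zeroed VRAM.
 */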
0514 int
0515 svm_range_vram_node_new(struct amdgpu_device *adev, struct svm_range *prange,
0516 bool clear)
0517 {
0518 struct amdgpu_bo_param bp;
0519 struct svm_range_bo *svm_bo;
0520 struct amdgpu_bo_user *ubo;
0521 struct amdgpu_bo *bo;
0522 struct kfd_process *p;
0523 struct mm_struct *mm;
0524 int r;
0525
0526 p = container_of(prange->svms, struct kfd_process, svms);
0527 pr_debug("pasid: %x svms 0x%p [0x%lx 0x%lx]\n", p->pasid, prange->svms,
0528 prange->start, prange->last);
0529
0530 if (svm_range_validate_svm_bo(adev, prange))
0531 return 0;
0532
0533 svm_bo = svm_range_bo_new();
0534 if (!svm_bo) {
0535 pr_debug("failed to alloc svm bo\n");
0536 return -ENOMEM;
0537 }
0538 mm = get_task_mm(p->lead_thread);
0539 if (!mm) {
0540 pr_debug("failed to get mm\n");
0541 kfree(svm_bo);
0542 return -ESRCH;
0543 }
0544 svm_bo->eviction_fence =
0545 amdgpu_amdkfd_fence_create(dma_fence_context_alloc(1),
0546 mm,
0547 svm_bo);
0548 mmput(mm);
0549 INIT_WORK(&svm_bo->eviction_work, svm_range_evict_svm_bo_worker);
0550 svm_bo->evicting = 0;
0551 memset(&bp, 0, sizeof(bp));
0552 bp.size = prange->npages * PAGE_SIZE;
0553 bp.byte_align = PAGE_SIZE;
0554 bp.domain = AMDGPU_GEM_DOMAIN_VRAM;
0555 bp.flags = AMDGPU_GEM_CREATE_NO_CPU_ACCESS;
0556 bp.flags |= clear ? AMDGPU_GEM_CREATE_VRAM_CLEARED : 0;
0557 bp.flags |= AMDGPU_GEM_CREATE_DISCARDABLE;
0558 bp.type = ttm_bo_type_device;
0559 bp.resv = NULL;
0560
0561 r = amdgpu_bo_create_user(adev, &bp, &ubo);
0562 if (r) {
0563 pr_debug("failed %d to create bo\n", r);
0564 goto create_bo_failed;
0565 }
0566 bo = &ubo->bo;
0567 r = amdgpu_bo_reserve(bo, true);
0568 if (r) {
0569 pr_debug("failed %d to reserve bo\n", r);
0570 goto reserve_bo_failed;
0571 }
0572
0573 r = dma_resv_reserve_fences(bo->tbo.base.resv, 1);
0574 if (r) {
0575 pr_debug("failed %d to reserve bo\n", r);
0576 amdgpu_bo_unreserve(bo);
0577 goto reserve_bo_failed;
0578 }
0579 amdgpu_bo_fence(bo, &svm_bo->eviction_fence->base, true);
0580
0581 amdgpu_bo_unreserve(bo);
0582
0583 svm_bo->bo = bo;
0584 prange->svm_bo = svm_bo;
0585 prange->ttm_res = bo->tbo.resource;
0586 prange->offset = 0;
0587
0588 spin_lock(&svm_bo->list_lock);
0589 list_add(&prange->svm_bo_list, &svm_bo->range_list);
0590 spin_unlock(&svm_bo->list_lock);
0591
0592 return 0;
0593
0594 reserve_bo_failed:
0595 amdgpu_bo_unref(&bo);
0596 create_bo_failed:
0597 dma_fence_put(&svm_bo->eviction_fence->base);
0598 kfree(svm_bo);
0599 prange->ttm_res = NULL;
0600
0601 return r;
0602 }
0603
0604 void svm_range_vram_node_free(struct svm_range *prange)
0605 {
0606 svm_range_bo_unref(prange->svm_bo);
0607 prange->ttm_res = NULL;
0608 }
0609
0610 struct amdgpu_device *
0611 svm_range_get_adev_by_id(struct svm_range *prange, uint32_t gpu_id)
0612 {
0613 struct kfd_process_device *pdd;
0614 struct kfd_process *p;
0615 int32_t gpu_idx;
0616
0617 p = container_of(prange->svms, struct kfd_process, svms);
0618
0619 gpu_idx = kfd_process_gpuidx_from_gpuid(p, gpu_id);
0620 if (gpu_idx < 0) {
0621 pr_debug("failed to get device by id 0x%x\n", gpu_id);
0622 return NULL;
0623 }
0624 pdd = kfd_process_device_from_gpuidx(p, gpu_idx);
0625 if (!pdd) {
0626 pr_debug("failed to get device by idx 0x%x\n", gpu_idx);
0627 return NULL;
0628 }
0629
0630 return pdd->dev->adev;
0631 }
0632
0633 struct kfd_process_device *
0634 svm_range_get_pdd_by_adev(struct svm_range *prange, struct amdgpu_device *adev)
0635 {
0636 struct kfd_process *p;
0637 int32_t gpu_idx, gpuid;
0638 int r;
0639
0640 p = container_of(prange->svms, struct kfd_process, svms);
0641
0642 r = kfd_process_gpuid_from_adev(p, adev, &gpuid, &gpu_idx);
0643 if (r) {
0644 pr_debug("failed to get device id by adev %p\n", adev);
0645 return NULL;
0646 }
0647
0648 return kfd_process_device_from_gpuidx(p, gpu_idx);
0649 }
0650
0651 static int svm_range_bo_validate(void *param, struct amdgpu_bo *bo)
0652 {
0653 struct ttm_operation_ctx ctx = { false, false };
0654
0655 amdgpu_bo_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_VRAM);
0656
0657 return ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
0658 }
0659
0660 static int
0661 svm_range_check_attr(struct kfd_process *p,
0662 uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs)
0663 {
0664 uint32_t i;
0665
0666 for (i = 0; i < nattr; i++) {
0667 uint32_t val = attrs[i].value;
0668 int gpuidx = MAX_GPU_INSTANCE;
0669
0670 switch (attrs[i].type) {
0671 case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC:
0672 if (val != KFD_IOCTL_SVM_LOCATION_SYSMEM &&
0673 val != KFD_IOCTL_SVM_LOCATION_UNDEFINED)
0674 gpuidx = kfd_process_gpuidx_from_gpuid(p, val);
0675 break;
0676 case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC:
0677 if (val != KFD_IOCTL_SVM_LOCATION_SYSMEM)
0678 gpuidx = kfd_process_gpuidx_from_gpuid(p, val);
0679 break;
0680 case KFD_IOCTL_SVM_ATTR_ACCESS:
0681 case KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE:
0682 case KFD_IOCTL_SVM_ATTR_NO_ACCESS:
0683 gpuidx = kfd_process_gpuidx_from_gpuid(p, val);
0684 break;
0685 case KFD_IOCTL_SVM_ATTR_SET_FLAGS:
0686 break;
0687 case KFD_IOCTL_SVM_ATTR_CLR_FLAGS:
0688 break;
0689 case KFD_IOCTL_SVM_ATTR_GRANULARITY:
0690 break;
0691 default:
0692 pr_debug("unknown attr type 0x%x\n", attrs[i].type);
0693 return -EINVAL;
0694 }
0695
0696 if (gpuidx < 0) {
0697 pr_debug("no GPU 0x%x found\n", val);
0698 return -EINVAL;
0699 } else if (gpuidx < MAX_GPU_INSTANCE &&
0700 !test_bit(gpuidx, p->svms.bitmap_supported)) {
0701 pr_debug("GPU 0x%x not supported\n", val);
0702 return -EINVAL;
0703 }
0704 }
0705
0706 return 0;
0707 }
0708
0709 static void
0710 svm_range_apply_attrs(struct kfd_process *p, struct svm_range *prange,
0711 uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs,
0712 bool *update_mapping)
0713 {
0714 uint32_t i;
0715 int gpuidx;
0716
0717 for (i = 0; i < nattr; i++) {
0718 switch (attrs[i].type) {
0719 case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC:
0720 prange->preferred_loc = attrs[i].value;
0721 break;
0722 case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC:
0723 prange->prefetch_loc = attrs[i].value;
0724 break;
0725 case KFD_IOCTL_SVM_ATTR_ACCESS:
0726 case KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE:
0727 case KFD_IOCTL_SVM_ATTR_NO_ACCESS:
0728 *update_mapping = true;
0729 gpuidx = kfd_process_gpuidx_from_gpuid(p,
0730 attrs[i].value);
0731 if (attrs[i].type == KFD_IOCTL_SVM_ATTR_NO_ACCESS) {
0732 bitmap_clear(prange->bitmap_access, gpuidx, 1);
0733 bitmap_clear(prange->bitmap_aip, gpuidx, 1);
0734 } else if (attrs[i].type == KFD_IOCTL_SVM_ATTR_ACCESS) {
0735 bitmap_set(prange->bitmap_access, gpuidx, 1);
0736 bitmap_clear(prange->bitmap_aip, gpuidx, 1);
0737 } else {
0738 bitmap_clear(prange->bitmap_access, gpuidx, 1);
0739 bitmap_set(prange->bitmap_aip, gpuidx, 1);
0740 }
0741 break;
0742 case KFD_IOCTL_SVM_ATTR_SET_FLAGS:
0743 *update_mapping = true;
0744 prange->flags |= attrs[i].value;
0745 break;
0746 case KFD_IOCTL_SVM_ATTR_CLR_FLAGS:
0747 *update_mapping = true;
0748 prange->flags &= ~attrs[i].value;
0749 break;
0750 case KFD_IOCTL_SVM_ATTR_GRANULARITY:
0751 prange->granularity = attrs[i].value;
0752 break;
0753 default:
0754 WARN_ONCE(1, "svm_range_check_attrs wasn't called?");
0755 }
0756 }
0757 }
0758
0759 static bool
0760 svm_range_is_same_attrs(struct kfd_process *p, struct svm_range *prange,
0761 uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs)
0762 {
0763 uint32_t i;
0764 int gpuidx;
0765
0766 for (i = 0; i < nattr; i++) {
0767 switch (attrs[i].type) {
0768 case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC:
0769 if (prange->preferred_loc != attrs[i].value)
0770 return false;
0771 break;
0772 case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC:
/* Prefetch location is expected to change as pages
 * migrate, so always treat it as different here to
 * trigger the migration
 */
0776 return false;
0777 case KFD_IOCTL_SVM_ATTR_ACCESS:
0778 case KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE:
0779 case KFD_IOCTL_SVM_ATTR_NO_ACCESS:
0780 gpuidx = kfd_process_gpuidx_from_gpuid(p,
0781 attrs[i].value);
0782 if (attrs[i].type == KFD_IOCTL_SVM_ATTR_NO_ACCESS) {
0783 if (test_bit(gpuidx, prange->bitmap_access) ||
0784 test_bit(gpuidx, prange->bitmap_aip))
0785 return false;
0786 } else if (attrs[i].type == KFD_IOCTL_SVM_ATTR_ACCESS) {
0787 if (!test_bit(gpuidx, prange->bitmap_access))
0788 return false;
0789 } else {
0790 if (!test_bit(gpuidx, prange->bitmap_aip))
0791 return false;
0792 }
0793 break;
0794 case KFD_IOCTL_SVM_ATTR_SET_FLAGS:
0795 if ((prange->flags & attrs[i].value) != attrs[i].value)
0796 return false;
0797 break;
0798 case KFD_IOCTL_SVM_ATTR_CLR_FLAGS:
0799 if ((prange->flags & attrs[i].value) != 0)
0800 return false;
0801 break;
0802 case KFD_IOCTL_SVM_ATTR_GRANULARITY:
0803 if (prange->granularity != attrs[i].value)
0804 return false;
0805 break;
0806 default:
0807 WARN_ONCE(1, "svm_range_check_attrs wasn't called?");
0808 }
0809 }
0810
0811 return true;
0812 }
0813
/**
 * svm_range_debug_dump - print all range information from svms
 * @svms: svm range list header
 *
 * debug output svm range start, end, prefetch location from svms
 * interval tree and link list
 *
 * Context: The caller must hold svms->lock
 */
0823 static void svm_range_debug_dump(struct svm_range_list *svms)
0824 {
0825 struct interval_tree_node *node;
0826 struct svm_range *prange;
0827
0828 pr_debug("dump svms 0x%p list\n", svms);
0829 pr_debug("range\tstart\tpage\tend\t\tlocation\n");
0830
0831 list_for_each_entry(prange, &svms->list, list) {
0832 pr_debug("0x%p 0x%lx\t0x%llx\t0x%llx\t0x%x\n",
0833 prange, prange->start, prange->npages,
0834 prange->start + prange->npages - 1,
0835 prange->actual_loc);
0836 }
0837
0838 pr_debug("dump svms 0x%p interval tree\n", svms);
0839 pr_debug("range\tstart\tpage\tend\t\tlocation\n");
0840 node = interval_tree_iter_first(&svms->objects, 0, ~0ULL);
0841 while (node) {
0842 prange = container_of(node, struct svm_range, it_node);
0843 pr_debug("0x%p 0x%lx\t0x%llx\t0x%llx\t0x%x\n",
0844 prange, prange->start, prange->npages,
0845 prange->start + prange->npages - 1,
0846 prange->actual_loc);
0847 node = interval_tree_iter_next(node, 0, ~0ULL);
0848 }
0849 }
0850
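/* Split the per-page array at *ppold into two newly allocated arrays covering
 * the remaining old range and the new range; *ppold and *ppnew are updated on
 * success.
 */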
0851 static int
0852 svm_range_split_array(void *ppnew, void *ppold, size_t size,
0853 uint64_t old_start, uint64_t old_n,
0854 uint64_t new_start, uint64_t new_n)
0855 {
0856 unsigned char *new, *old, *pold;
0857 uint64_t d;
0858
0859 if (!ppold)
0860 return 0;
0861 pold = *(unsigned char **)ppold;
0862 if (!pold)
0863 return 0;
0864
0865 new = kvmalloc_array(new_n, size, GFP_KERNEL);
0866 if (!new)
0867 return -ENOMEM;
0868
0869 d = (new_start - old_start) * size;
0870 memcpy(new, pold + d, new_n * size);
0871
0872 old = kvmalloc_array(old_n, size, GFP_KERNEL);
0873 if (!old) {
0874 kvfree(new);
0875 return -ENOMEM;
0876 }
0877
0878 d = (new_start == old_start) ? new_n * size : 0;
0879 memcpy(old, pold + d, old_n * size);
0880
0881 kvfree(pold);
0882 *(void **)ppold = old;
0883 *(void **)ppnew = new;
0884
0885 return 0;
0886 }
0887
0888 static int
0889 svm_range_split_pages(struct svm_range *new, struct svm_range *old,
0890 uint64_t start, uint64_t last)
0891 {
0892 uint64_t npages = last - start + 1;
0893 int i, r;
0894
0895 for (i = 0; i < MAX_GPU_INSTANCE; i++) {
0896 r = svm_range_split_array(&new->dma_addr[i], &old->dma_addr[i],
0897 sizeof(*old->dma_addr[i]), old->start,
0898 npages, new->start, new->npages);
0899 if (r)
0900 return r;
0901 }
0902
0903 return 0;
0904 }
0905
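/* Share the old range's VRAM BO with the new range and adjust the BO offsets
 * so each half references its own part of the allocation.
 */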
0906 static int
0907 svm_range_split_nodes(struct svm_range *new, struct svm_range *old,
0908 uint64_t start, uint64_t last)
0909 {
0910 uint64_t npages = last - start + 1;
0911
0912 pr_debug("svms 0x%p new prange 0x%p start 0x%lx [0x%llx 0x%llx]\n",
0913 new->svms, new, new->start, start, last);
0914
0915 if (new->start == old->start) {
0916 new->offset = old->offset;
0917 old->offset += new->npages;
0918 } else {
0919 new->offset = old->offset + npages;
0920 }
0921
0922 new->svm_bo = svm_range_bo_ref(old->svm_bo);
0923 new->ttm_res = old->ttm_res;
0924
0925 spin_lock(&new->svm_bo->list_lock);
0926 list_add(&new->svm_bo_list, &new->svm_bo->range_list);
0927 spin_unlock(&new->svm_bo->list_lock);
0928
0929 return 0;
0930 }
0931
/**
 * svm_range_split_adjust - split range and adjust
 *
 * @new: new range
 * @old: the old range
 * @start: the old range adjust to start address in pages
 * @last: the old range adjust to last address in pages
 *
 * Copy system memory dma_addr or vram ttm_res in old range to new
 * range from new_start up to size new->npages, the remaining old range is from
 * new->last + 1 to old->last
 *
 * Return:
 * 0 - OK, -ENOMEM - out of memory
 */
0947 static int
0948 svm_range_split_adjust(struct svm_range *new, struct svm_range *old,
0949 uint64_t start, uint64_t last)
0950 {
0951 int r;
0952
0953 pr_debug("svms 0x%p new 0x%lx old [0x%lx 0x%lx] => [0x%llx 0x%llx]\n",
0954 new->svms, new->start, old->start, old->last, start, last);
0955
0956 if (new->start < old->start ||
0957 new->last > old->last) {
0958 WARN_ONCE(1, "invalid new range start or last\n");
0959 return -EINVAL;
0960 }
0961
0962 r = svm_range_split_pages(new, old, start, last);
0963 if (r)
0964 return r;
0965
0966 if (old->actual_loc && old->ttm_res) {
0967 r = svm_range_split_nodes(new, old, start, last);
0968 if (r)
0969 return r;
0970 }
0971
0972 old->npages = last - start + 1;
0973 old->start = start;
0974 old->last = last;
0975 new->flags = old->flags;
0976 new->preferred_loc = old->preferred_loc;
0977 new->prefetch_loc = old->prefetch_loc;
0978 new->actual_loc = old->actual_loc;
0979 new->granularity = old->granularity;
0980 new->mapped_to_gpu = old->mapped_to_gpu;
0981 bitmap_copy(new->bitmap_access, old->bitmap_access, MAX_GPU_INSTANCE);
0982 bitmap_copy(new->bitmap_aip, old->bitmap_aip, MAX_GPU_INSTANCE);
0983
0984 return 0;
0985 }
0986
/**
 * svm_range_split - split a range in 2 ranges
 *
 * @prange: the svm range to split
 * @start: the remaining range start address in pages
 * @last: the remaining range last address in pages
 * @new: the result new range generated
 *
 * Two cases only:
 * case 1: if start == prange->start
 *         prange ==> prange[start, last]
 *         new range [last + 1, prange->last]
 *
 * case 2: if last == prange->last
 *         prange ==> prange[start, last]
 *         new range [prange->start, start - 1]
 *
 * Return:
 * 0 - OK, -ENOMEM - out of memory, -EINVAL - invalid start, last
 */
1007 static int
1008 svm_range_split(struct svm_range *prange, uint64_t start, uint64_t last,
1009 struct svm_range **new)
1010 {
1011 uint64_t old_start = prange->start;
1012 uint64_t old_last = prange->last;
1013 struct svm_range_list *svms;
1014 int r = 0;
1015
1016 pr_debug("svms 0x%p [0x%llx 0x%llx] to [0x%llx 0x%llx]\n", prange->svms,
1017 old_start, old_last, start, last);
1018
1019 if (old_start != start && old_last != last)
1020 return -EINVAL;
1021 if (start < old_start || last > old_last)
1022 return -EINVAL;
1023
1024 svms = prange->svms;
1025 if (old_start == start)
1026 *new = svm_range_new(svms, last + 1, old_last, false);
1027 else
1028 *new = svm_range_new(svms, old_start, start - 1, false);
1029 if (!*new)
1030 return -ENOMEM;
1031
1032 r = svm_range_split_adjust(*new, prange, start, last);
1033 if (r) {
1034 pr_debug("failed %d split [0x%llx 0x%llx] to [0x%llx 0x%llx]\n",
1035 r, old_start, old_last, start, last);
1036 svm_range_free(*new, false);
1037 *new = NULL;
1038 }
1039
1040 return r;
1041 }
1042
1043 static int
1044 svm_range_split_tail(struct svm_range *prange,
1045 uint64_t new_last, struct list_head *insert_list)
1046 {
1047 struct svm_range *tail;
1048 int r = svm_range_split(prange, prange->start, new_last, &tail);
1049
1050 if (!r)
1051 list_add(&tail->list, insert_list);
1052 return r;
1053 }
1054
1055 static int
1056 svm_range_split_head(struct svm_range *prange,
1057 uint64_t new_start, struct list_head *insert_list)
1058 {
1059 struct svm_range *head;
1060 int r = svm_range_split(prange, new_start, prange->last, &head);
1061
1062 if (!r)
1063 list_add(&head->list, insert_list);
1064 return r;
1065 }
1066
1067 static void
1068 svm_range_add_child(struct svm_range *prange, struct mm_struct *mm,
1069 struct svm_range *pchild, enum svm_work_list_ops op)
1070 {
1071 pr_debug("add child 0x%p [0x%lx 0x%lx] to prange 0x%p child list %d\n",
1072 pchild, pchild->start, pchild->last, prange, op);
1073
1074 pchild->work_item.mm = mm;
1075 pchild->work_item.op = op;
1076 list_add_tail(&pchild->child_list, &prange->child_list);
1077 }
1078
/**
 * svm_range_split_by_granularity - collect ranges within granularity boundary
 *
 * @p: the process with svms list
 * @mm: mm structure
 * @addr: the vm fault address in pages, to split the prange
 * @parent: parent range if prange is from child range list
 * @prange: prange to split
 *
 * Trim @prange down to the granularity-aligned block that contains @addr and
 * add the remaining head/tail pieces to @parent's child list as new ranges
 * (SVM_OP_ADD_RANGE), so only the faulting block is migrated and mapped.
 *
 * Context: caller must hold mmap read lock and prange->lock
 *
 * Return:
 * 0 - OK, otherwise error code
 */
1096 int
1097 svm_range_split_by_granularity(struct kfd_process *p, struct mm_struct *mm,
1098 unsigned long addr, struct svm_range *parent,
1099 struct svm_range *prange)
1100 {
1101 struct svm_range *head, *tail;
1102 unsigned long start, last, size;
1103 int r;
1104
/* Align split range start and size to granularity size, then a single
 * PTE will be used for whole range, this reduces the number of PTE
 * updated and the L1 TLB space used for translation.
 */
1109 size = 1UL << prange->granularity;
1110 start = ALIGN_DOWN(addr, size);
1111 last = ALIGN(addr + 1, size) - 1;
1112
1113 pr_debug("svms 0x%p split [0x%lx 0x%lx] to [0x%lx 0x%lx] size 0x%lx\n",
1114 prange->svms, prange->start, prange->last, start, last, size);
1115
1116 if (start > prange->start) {
1117 r = svm_range_split(prange, start, prange->last, &head);
1118 if (r)
1119 return r;
1120 svm_range_add_child(parent, mm, head, SVM_OP_ADD_RANGE);
1121 }
1122
1123 if (last < prange->last) {
1124 r = svm_range_split(prange, prange->start, last, &tail);
1125 if (r)
1126 return r;
1127 svm_range_add_child(parent, mm, tail, SVM_OP_ADD_RANGE);
1128 }
1129

/* xnack on, update mapping on GPUs with ACCESS_IN_PLACE */
1131 if (p->xnack_enabled && prange->work_item.op == SVM_OP_ADD_RANGE) {
1132 prange->work_item.op = SVM_OP_ADD_RANGE_AND_MAP;
1133 pr_debug("change prange 0x%p [0x%lx 0x%lx] op %d\n",
1134 prange, prange->start, prange->last,
1135 SVM_OP_ADD_RANGE_AND_MAP);
1136 }
1137 return 0;
1138 }
1139
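/* Build GPU PTE flags (MTYPE, snoop, system, RW/exec) for mapping @prange on
 * @adev, depending on whether the pages are in VRAM or system memory and on
 * the range's KFD_IOCTL_SVM_FLAG_* attributes.
 */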
1140 static uint64_t
1141 svm_range_get_pte_flags(struct amdgpu_device *adev, struct svm_range *prange,
1142 int domain)
1143 {
1144 struct amdgpu_device *bo_adev;
1145 uint32_t flags = prange->flags;
1146 uint32_t mapping_flags = 0;
1147 uint64_t pte_flags;
1148 bool snoop = (domain != SVM_RANGE_VRAM_DOMAIN);
1149 bool coherent = flags & KFD_IOCTL_SVM_FLAG_COHERENT;
1150
1151 if (domain == SVM_RANGE_VRAM_DOMAIN)
1152 bo_adev = amdgpu_ttm_adev(prange->svm_bo->bo->tbo.bdev);
1153
1154 switch (KFD_GC_VERSION(adev->kfd.dev)) {
1155 case IP_VERSION(9, 4, 1):
1156 if (domain == SVM_RANGE_VRAM_DOMAIN) {
1157 if (bo_adev == adev) {
1158 mapping_flags |= coherent ?
1159 AMDGPU_VM_MTYPE_CC : AMDGPU_VM_MTYPE_RW;
1160 } else {
1161 mapping_flags |= coherent ?
1162 AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC;
1163 if (amdgpu_xgmi_same_hive(adev, bo_adev))
1164 snoop = true;
1165 }
1166 } else {
1167 mapping_flags |= coherent ?
1168 AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC;
1169 }
1170 break;
1171 case IP_VERSION(9, 4, 2):
1172 if (domain == SVM_RANGE_VRAM_DOMAIN) {
1173 if (bo_adev == adev) {
1174 mapping_flags |= coherent ?
1175 AMDGPU_VM_MTYPE_CC : AMDGPU_VM_MTYPE_RW;
1176 if (adev->gmc.xgmi.connected_to_cpu)
1177 snoop = true;
1178 } else {
1179 mapping_flags |= coherent ?
1180 AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC;
1181 if (amdgpu_xgmi_same_hive(adev, bo_adev))
1182 snoop = true;
1183 }
1184 } else {
1185 mapping_flags |= coherent ?
1186 AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC;
1187 }
1188 break;
1189 default:
1190 mapping_flags |= coherent ?
1191 AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC;
1192 }
1193
1194 mapping_flags |= AMDGPU_VM_PAGE_READABLE | AMDGPU_VM_PAGE_WRITEABLE;
1195
1196 if (flags & KFD_IOCTL_SVM_FLAG_GPU_RO)
1197 mapping_flags &= ~AMDGPU_VM_PAGE_WRITEABLE;
1198 if (flags & KFD_IOCTL_SVM_FLAG_GPU_EXEC)
1199 mapping_flags |= AMDGPU_VM_PAGE_EXECUTABLE;
1200
1201 pte_flags = AMDGPU_PTE_VALID;
1202 pte_flags |= (domain == SVM_RANGE_VRAM_DOMAIN) ? 0 : AMDGPU_PTE_SYSTEM;
1203 pte_flags |= snoop ? AMDGPU_PTE_SNOOPED : 0;
1204
1205 pte_flags |= amdgpu_gem_va_map_flags(adev, mapping_flags);
1206 return pte_flags;
1207 }
1208
1209 static int
1210 svm_range_unmap_from_gpu(struct amdgpu_device *adev, struct amdgpu_vm *vm,
1211 uint64_t start, uint64_t last,
1212 struct dma_fence **fence)
1213 {
1214 uint64_t init_pte_value = 0;
1215
1216 pr_debug("[0x%llx 0x%llx]\n", start, last);
1217
1218 return amdgpu_vm_update_range(adev, vm, false, true, true, NULL, start,
1219 last, init_pte_value, 0, 0, NULL, NULL,
1220 fence);
1221 }
1222
1223 static int
1224 svm_range_unmap_from_gpus(struct svm_range *prange, unsigned long start,
1225 unsigned long last, uint32_t trigger)
1226 {
1227 DECLARE_BITMAP(bitmap, MAX_GPU_INSTANCE);
1228 struct kfd_process_device *pdd;
1229 struct dma_fence *fence = NULL;
1230 struct kfd_process *p;
1231 uint32_t gpuidx;
1232 int r = 0;
1233
1234 if (!prange->mapped_to_gpu) {
1235 pr_debug("prange 0x%p [0x%lx 0x%lx] not mapped to GPU\n",
1236 prange, prange->start, prange->last);
1237 return 0;
1238 }
1239
1240 if (prange->start == start && prange->last == last) {
1241 pr_debug("unmap svms 0x%p prange 0x%p\n", prange->svms, prange);
1242 prange->mapped_to_gpu = false;
1243 }
1244
1245 bitmap_or(bitmap, prange->bitmap_access, prange->bitmap_aip,
1246 MAX_GPU_INSTANCE);
1247 p = container_of(prange->svms, struct kfd_process, svms);
1248
1249 for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) {
1250 pr_debug("unmap from gpu idx 0x%x\n", gpuidx);
1251 pdd = kfd_process_device_from_gpuidx(p, gpuidx);
1252 if (!pdd) {
1253 pr_debug("failed to find device idx %d\n", gpuidx);
1254 return -EINVAL;
1255 }
1256
1257 kfd_smi_event_unmap_from_gpu(pdd->dev, p->lead_thread->pid,
1258 start, last, trigger);
1259
1260 r = svm_range_unmap_from_gpu(pdd->dev->adev,
1261 drm_priv_to_vm(pdd->drm_priv),
1262 start, last, &fence);
1263 if (r)
1264 break;
1265
1266 if (fence) {
1267 r = dma_fence_wait(fence, false);
1268 dma_fence_put(fence);
1269 fence = NULL;
1270 if (r)
1271 break;
1272 }
1273 kfd_flush_tlb(pdd, TLB_FLUSH_HEAVYWEIGHT);
1274 }
1275
1276 return r;
1277 }
1278
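/* Map [offset, offset + npages) of @prange into one GPU VM. Contiguous pages
 * in the same memory domain are batched into a single amdgpu_vm_update_range
 * call; @fence, if provided, returns the last page table update fence.
 */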
1279 static int
1280 svm_range_map_to_gpu(struct kfd_process_device *pdd, struct svm_range *prange,
1281 unsigned long offset, unsigned long npages, bool readonly,
1282 dma_addr_t *dma_addr, struct amdgpu_device *bo_adev,
1283 struct dma_fence **fence, bool flush_tlb)
1284 {
1285 struct amdgpu_device *adev = pdd->dev->adev;
1286 struct amdgpu_vm *vm = drm_priv_to_vm(pdd->drm_priv);
1287 uint64_t pte_flags;
1288 unsigned long last_start;
1289 int last_domain;
1290 int r = 0;
1291 int64_t i, j;
1292
1293 last_start = prange->start + offset;
1294
1295 pr_debug("svms 0x%p [0x%lx 0x%lx] readonly %d\n", prange->svms,
1296 last_start, last_start + npages - 1, readonly);
1297
1298 for (i = offset; i < offset + npages; i++) {
1299 last_domain = dma_addr[i] & SVM_RANGE_VRAM_DOMAIN;
1300 dma_addr[i] &= ~SVM_RANGE_VRAM_DOMAIN;
1301
/* Collect all pages in the same address range and memory domain
 * that can be mapped with a single call to update mapping.
 */
1305 if (i < offset + npages - 1 &&
1306 last_domain == (dma_addr[i + 1] & SVM_RANGE_VRAM_DOMAIN))
1307 continue;
1308
1309 pr_debug("Mapping range [0x%lx 0x%llx] on domain: %s\n",
1310 last_start, prange->start + i, last_domain ? "GPU" : "CPU");
1311
1312 pte_flags = svm_range_get_pte_flags(adev, prange, last_domain);
1313 if (readonly)
1314 pte_flags &= ~AMDGPU_PTE_WRITEABLE;
1315
1316 pr_debug("svms 0x%p map [0x%lx 0x%llx] vram %d PTE 0x%llx\n",
1317 prange->svms, last_start, prange->start + i,
1318 (last_domain == SVM_RANGE_VRAM_DOMAIN) ? 1 : 0,
1319 pte_flags);
1320
1321 r = amdgpu_vm_update_range(adev, vm, false, false, flush_tlb, NULL,
1322 last_start, prange->start + i,
1323 pte_flags,
1324 (last_start - prange->start) << PAGE_SHIFT,
1325 bo_adev ? bo_adev->vm_manager.vram_base_offset : 0,
1326 NULL, dma_addr, &vm->last_update);
1327
1328 for (j = last_start - prange->start; j <= i; j++)
1329 dma_addr[j] |= last_domain;
1330
1331 if (r) {
1332 pr_debug("failed %d to map to gpu 0x%lx\n", r, prange->start);
1333 goto out;
1334 }
1335 last_start = prange->start + i + 1;
1336 }
1337
1338 r = amdgpu_vm_update_pdes(adev, vm, false);
1339 if (r) {
1340 pr_debug("failed %d to update directories 0x%lx\n", r,
1341 prange->start);
1342 goto out;
1343 }
1344
1345 if (fence)
1346 *fence = dma_fence_get(vm->last_update);
1347
1348 out:
1349 return r;
1350 }
1351
1352 static int
1353 svm_range_map_to_gpus(struct svm_range *prange, unsigned long offset,
1354 unsigned long npages, bool readonly,
1355 unsigned long *bitmap, bool wait, bool flush_tlb)
1356 {
1357 struct kfd_process_device *pdd;
1358 struct amdgpu_device *bo_adev;
1359 struct kfd_process *p;
1360 struct dma_fence *fence = NULL;
1361 uint32_t gpuidx;
1362 int r = 0;
1363
1364 if (prange->svm_bo && prange->ttm_res)
1365 bo_adev = amdgpu_ttm_adev(prange->svm_bo->bo->tbo.bdev);
1366 else
1367 bo_adev = NULL;
1368
1369 p = container_of(prange->svms, struct kfd_process, svms);
1370 for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) {
1371 pr_debug("mapping to gpu idx 0x%x\n", gpuidx);
1372 pdd = kfd_process_device_from_gpuidx(p, gpuidx);
1373 if (!pdd) {
1374 pr_debug("failed to find device idx %d\n", gpuidx);
1375 return -EINVAL;
1376 }
1377
1378 pdd = kfd_bind_process_to_device(pdd->dev, p);
1379 if (IS_ERR(pdd))
1380 return -EINVAL;
1381
1382 if (bo_adev && pdd->dev->adev != bo_adev &&
1383 !amdgpu_xgmi_same_hive(pdd->dev->adev, bo_adev)) {
1384 pr_debug("cannot map to device idx %d\n", gpuidx);
1385 continue;
1386 }
1387
1388 r = svm_range_map_to_gpu(pdd, prange, offset, npages, readonly,
1389 prange->dma_addr[gpuidx],
1390 bo_adev, wait ? &fence : NULL,
1391 flush_tlb);
1392 if (r)
1393 break;
1394
1395 if (fence) {
1396 r = dma_fence_wait(fence, false);
1397 dma_fence_put(fence);
1398 fence = NULL;
1399 if (r) {
1400 pr_debug("failed %d to dma fence wait\n", r);
1401 break;
1402 }
1403 }
1404
1405 kfd_flush_tlb(pdd, TLB_FLUSH_LEGACY);
1406 }
1407
1408 return r;
1409 }
1410
1411 struct svm_validate_context {
1412 struct kfd_process *process;
1413 struct svm_range *prange;
1414 bool intr;
1415 DECLARE_BITMAP(bitmap, MAX_GPU_INSTANCE);
1416 struct ttm_validate_buffer tv[MAX_GPU_INSTANCE];
1417 struct list_head validate_list;
1418 struct ww_acquire_ctx ticket;
1419 };
1420
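/* Reserve the page table root BOs of all GPUs set in ctx->bitmap and validate
 * their page table BOs, so the GPU mappings can be updated safely.
 */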
1421 static int svm_range_reserve_bos(struct svm_validate_context *ctx)
1422 {
1423 struct kfd_process_device *pdd;
1424 struct amdgpu_vm *vm;
1425 uint32_t gpuidx;
1426 int r;
1427
1428 INIT_LIST_HEAD(&ctx->validate_list);
1429 for_each_set_bit(gpuidx, ctx->bitmap, MAX_GPU_INSTANCE) {
1430 pdd = kfd_process_device_from_gpuidx(ctx->process, gpuidx);
1431 if (!pdd) {
1432 pr_debug("failed to find device idx %d\n", gpuidx);
1433 return -EINVAL;
1434 }
1435 vm = drm_priv_to_vm(pdd->drm_priv);
1436
1437 ctx->tv[gpuidx].bo = &vm->root.bo->tbo;
1438 ctx->tv[gpuidx].num_shared = 4;
1439 list_add(&ctx->tv[gpuidx].head, &ctx->validate_list);
1440 }
1441
1442 r = ttm_eu_reserve_buffers(&ctx->ticket, &ctx->validate_list,
1443 ctx->intr, NULL);
1444 if (r) {
1445 pr_debug("failed %d to reserve bo\n", r);
1446 return r;
1447 }
1448
1449 for_each_set_bit(gpuidx, ctx->bitmap, MAX_GPU_INSTANCE) {
1450 pdd = kfd_process_device_from_gpuidx(ctx->process, gpuidx);
1451 if (!pdd) {
1452 pr_debug("failed to find device idx %d\n", gpuidx);
1453 r = -EINVAL;
1454 goto unreserve_out;
1455 }
1456
1457 r = amdgpu_vm_validate_pt_bos(pdd->dev->adev,
1458 drm_priv_to_vm(pdd->drm_priv),
1459 svm_range_bo_validate, NULL);
1460 if (r) {
1461 pr_debug("failed %d validate pt bos\n", r);
1462 goto unreserve_out;
1463 }
1464 }
1465
1466 return 0;
1467
1468 unreserve_out:
1469 ttm_eu_backoff_reservation(&ctx->ticket, &ctx->validate_list);
1470 return r;
1471 }
1472
1473 static void svm_range_unreserve_bos(struct svm_validate_context *ctx)
1474 {
1475 ttm_eu_backoff_reservation(&ctx->ticket, &ctx->validate_list);
1476 }
1477
1478 static void *kfd_svm_page_owner(struct kfd_process *p, int32_t gpuidx)
1479 {
1480 struct kfd_process_device *pdd;
1481
1482 pdd = kfd_process_device_from_gpuidx(p, gpuidx);
1483
1484 return SVM_ADEV_PGMAP_OWNER(pdd->dev->adev);
1485 }
1486
/*
 * Validation+GPU mapping with concurrent invalidation (MMU notifiers)
 *
 * To prevent concurrent destruction or change of range attributes, the caller
 * must hold the mmap lock and must not operate on a child range. Concurrent
 * migrations or validations of the same range are serialized by
 * prange->migrate_mutex. For VRAM ranges the SVM BO must already be allocated
 * and valid (protected by its eviction fence).
 *
 * The following sequence ensures race-free validation and GPU mapping:
 *
 * 1. Reserve page table (and SVM BO if range is in VRAM)
 * 2. hmm_range_fault to get page addresses (if system memory)
 * 3. DMA-map pages (if system memory)
 * 4-a. Take notifier lock
 * 4-b. Check that pages still valid (mmu_interval_read_retry)
 * 4-c. Check that the range was not split or otherwise invalidated
 * 4-d. Update GPU page table
 * 4-e. Release notifier lock
 * 5. Release page table (and SVM BO) reservation
 */
1511 static int svm_range_validate_and_map(struct mm_struct *mm,
1512 struct svm_range *prange, int32_t gpuidx,
1513 bool intr, bool wait, bool flush_tlb)
1514 {
1515 struct svm_validate_context ctx;
1516 unsigned long start, end, addr;
1517 struct kfd_process *p;
1518 void *owner;
1519 int32_t idx;
1520 int r = 0;
1521
1522 ctx.process = container_of(prange->svms, struct kfd_process, svms);
1523 ctx.prange = prange;
1524 ctx.intr = intr;
1525
1526 if (gpuidx < MAX_GPU_INSTANCE) {
1527 bitmap_zero(ctx.bitmap, MAX_GPU_INSTANCE);
1528 bitmap_set(ctx.bitmap, gpuidx, 1);
1529 } else if (ctx.process->xnack_enabled) {
1530 bitmap_copy(ctx.bitmap, prange->bitmap_aip, MAX_GPU_INSTANCE);

/* If prefetch range to GPU, or GPU retry fault migrate range to
 * GPU, which has ACCESS attribute to the range, create mapping
 * on that GPU.
 */
1536 if (prange->actual_loc) {
1537 gpuidx = kfd_process_gpuidx_from_gpuid(ctx.process,
1538 prange->actual_loc);
1539 if (gpuidx < 0) {
1540 WARN_ONCE(1, "failed get device by id 0x%x\n",
1541 prange->actual_loc);
1542 return -EINVAL;
1543 }
1544 if (test_bit(gpuidx, prange->bitmap_access))
1545 bitmap_set(ctx.bitmap, gpuidx, 1);
1546 }
1547 } else {
1548 bitmap_or(ctx.bitmap, prange->bitmap_access,
1549 prange->bitmap_aip, MAX_GPU_INSTANCE);
1550 }
1551
1552 if (bitmap_empty(ctx.bitmap, MAX_GPU_INSTANCE)) {
1553 if (!prange->mapped_to_gpu)
1554 return 0;
1555
1556 bitmap_copy(ctx.bitmap, prange->bitmap_access, MAX_GPU_INSTANCE);
1557 }
1558
1559 if (prange->actual_loc && !prange->ttm_res) {
/* This should never happen. actual_loc gets set by
 * svm_migrate_ram_to_vram after allocating a BO.
 */
1563 WARN_ONCE(1, "VRAM BO missing during validation\n");
1564 return -EINVAL;
1565 }
1566
r = svm_range_reserve_bos(&ctx);
if (r)
	return r;
1568
1569 p = container_of(prange->svms, struct kfd_process, svms);
1570 owner = kfd_svm_page_owner(p, find_first_bit(ctx.bitmap,
1571 MAX_GPU_INSTANCE));
1572 for_each_set_bit(idx, ctx.bitmap, MAX_GPU_INSTANCE) {
1573 if (kfd_svm_page_owner(p, idx) != owner) {
1574 owner = NULL;
1575 break;
1576 }
1577 }
1578
1579 start = prange->start << PAGE_SHIFT;
1580 end = (prange->last + 1) << PAGE_SHIFT;
1581 for (addr = start; addr < end && !r; ) {
1582 struct hmm_range *hmm_range;
1583 struct vm_area_struct *vma;
1584 unsigned long next;
1585 unsigned long offset;
1586 unsigned long npages;
1587 bool readonly;
1588
1589 vma = find_vma(mm, addr);
1590 if (!vma || addr < vma->vm_start) {
1591 r = -EFAULT;
1592 goto unreserve_out;
1593 }
1594 readonly = !(vma->vm_flags & VM_WRITE);
1595
1596 next = min(vma->vm_end, end);
1597 npages = (next - addr) >> PAGE_SHIFT;
1598 WRITE_ONCE(p->svms.faulting_task, current);
1599 r = amdgpu_hmm_range_get_pages(&prange->notifier, mm, NULL,
1600 addr, npages, &hmm_range,
1601 readonly, true, owner);
1602 WRITE_ONCE(p->svms.faulting_task, NULL);
1603 if (r) {
1604 pr_debug("failed %d to get svm range pages\n", r);
1605 goto unreserve_out;
1606 }
1607
1608 offset = (addr - start) >> PAGE_SHIFT;
1609 r = svm_range_dma_map(prange, ctx.bitmap, offset, npages,
1610 hmm_range->hmm_pfns);
1611 if (r) {
1612 pr_debug("failed %d to dma map range\n", r);
1613 goto unreserve_out;
1614 }
1615
1616 svm_range_lock(prange);
1617 if (amdgpu_hmm_range_get_pages_done(hmm_range)) {
1618 pr_debug("hmm update the range, need validate again\n");
1619 r = -EAGAIN;
1620 goto unlock_out;
1621 }
1622 if (!list_empty(&prange->child_list)) {
1623 pr_debug("range split by unmap in parallel, validate again\n");
1624 r = -EAGAIN;
1625 goto unlock_out;
1626 }
1627
1628 r = svm_range_map_to_gpus(prange, offset, npages, readonly,
1629 ctx.bitmap, wait, flush_tlb);
1630
1631 unlock_out:
1632 svm_range_unlock(prange);
1633
1634 addr = next;
1635 }
1636
1637 if (addr == end) {
1638 prange->validated_once = true;
1639 prange->mapped_to_gpu = true;
1640 }
1641
1642 unreserve_out:
1643 svm_range_unreserve_bos(&ctx);
1644
1645 if (!r)
1646 prange->validate_timestamp = ktime_get_boottime();
1647
1648 return r;
1649 }
1650
/**
 * svm_range_list_lock_and_flush_work - flush pending deferred work
 *
 * @svms: the svm range list
 * @mm: the mm structure
 *
 * Context: Returns with mmap write lock held, pending deferred work flushed
 */
1660 void
1661 svm_range_list_lock_and_flush_work(struct svm_range_list *svms,
1662 struct mm_struct *mm)
1663 {
1664 retry_flush_work:
1665 flush_work(&svms->deferred_list_work);
1666 mmap_write_lock(mm);
1667
1668 if (list_empty(&svms->deferred_range_list))
1669 return;
1670 mmap_write_unlock(mm);
1671 pr_debug("retry flush\n");
1672 goto retry_flush_work;
1673 }
1674
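/* Delayed work that revalidates and remaps all invalidated ranges after an
 * eviction, then resumes the user queues. Reschedules itself on failure.
 */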
1675 static void svm_range_restore_work(struct work_struct *work)
1676 {
1677 struct delayed_work *dwork = to_delayed_work(work);
1678 struct amdkfd_process_info *process_info;
1679 struct svm_range_list *svms;
1680 struct svm_range *prange;
1681 struct kfd_process *p;
1682 struct mm_struct *mm;
1683 int evicted_ranges;
1684 int invalid;
1685 int r;
1686
1687 svms = container_of(dwork, struct svm_range_list, restore_work);
1688 evicted_ranges = atomic_read(&svms->evicted_ranges);
1689 if (!evicted_ranges)
1690 return;
1691
1692 pr_debug("restore svm ranges\n");
1693
1694 p = container_of(svms, struct kfd_process, svms);
1695 process_info = p->kgd_process_info;
1696
/* Keep mm reference when svm_range_validate_and_map ranges */
1698 mm = get_task_mm(p->lead_thread);
1699 if (!mm) {
1700 pr_debug("svms 0x%p process mm gone\n", svms);
1701 return;
1702 }
1703
1704 mutex_lock(&process_info->lock);
1705 svm_range_list_lock_and_flush_work(svms, mm);
1706 mutex_lock(&svms->lock);
1707
1708 evicted_ranges = atomic_read(&svms->evicted_ranges);
1709
1710 list_for_each_entry(prange, &svms->list, list) {
1711 invalid = atomic_read(&prange->invalid);
1712 if (!invalid)
1713 continue;
1714
1715 pr_debug("restoring svms 0x%p prange 0x%p [0x%lx %lx] inv %d\n",
1716 prange->svms, prange, prange->start, prange->last,
1717 invalid);
1718
/*
 * If range is migrating, wait for migration is done.
 */
1722 mutex_lock(&prange->migrate_mutex);
1723
1724 r = svm_range_validate_and_map(mm, prange, MAX_GPU_INSTANCE,
1725 false, true, false);
1726 if (r)
1727 pr_debug("failed %d to map 0x%lx to gpus\n", r,
1728 prange->start);
1729
1730 mutex_unlock(&prange->migrate_mutex);
1731 if (r)
1732 goto out_reschedule;
1733
1734 if (atomic_cmpxchg(&prange->invalid, invalid, 0) != invalid)
1735 goto out_reschedule;
1736 }
1737
1738 if (atomic_cmpxchg(&svms->evicted_ranges, evicted_ranges, 0) !=
1739 evicted_ranges)
1740 goto out_reschedule;
1741
1742 evicted_ranges = 0;
1743
1744 r = kgd2kfd_resume_mm(mm);
1745 if (r) {
/* No recovery from this failure. Probably the CP is
 * hanging. No point trying again.
 */
1749 pr_debug("failed %d to resume KFD\n", r);
1750 }
1751
1752 pr_debug("restore svm ranges successfully\n");
1753
1754 out_reschedule:
1755 mutex_unlock(&svms->lock);
1756 mmap_write_unlock(mm);
1757 mutex_unlock(&process_info->lock);
1758
/* If validation failed, reschedule another attempt */
1760 if (evicted_ranges) {
1761 pr_debug("reschedule to restore svm range\n");
1762 schedule_delayed_work(&svms->restore_work,
1763 msecs_to_jiffies(AMDGPU_SVM_RANGE_RESTORE_DELAY_MS));
1764
1765 kfd_smi_event_queue_restore_rescheduled(mm);
1766 }
1767 mmput(mm);
1768 }
1769
/**
 * svm_range_evict - evict svm range
 * @prange: svm range structure
 * @mm: current process mm_struct
 * @start: first page (in pages) of the invalidated address range
 * @last: last page (in pages) of the invalidated address range
 * @event: mmu notifier event when range is evicted or migrated
 *
 * Without XNACK (or for always-mapped ranges): stop all queues of the process
 * to ensure the GPU no longer accesses the memory, then let the CPU proceed
 * with its page table update; the restore worker revalidates and remaps the
 * range later.
 *
 * With XNACK: simply unmap the range from the GPUs; retry faults bring the
 * pages back on the next access.
 */
1785 static int
1786 svm_range_evict(struct svm_range *prange, struct mm_struct *mm,
1787 unsigned long start, unsigned long last,
1788 enum mmu_notifier_event event)
1789 {
1790 struct svm_range_list *svms = prange->svms;
1791 struct svm_range *pchild;
1792 struct kfd_process *p;
1793 int r = 0;
1794
1795 p = container_of(svms, struct kfd_process, svms);
1796
1797 pr_debug("invalidate svms 0x%p prange [0x%lx 0x%lx] [0x%lx 0x%lx]\n",
1798 svms, prange->start, prange->last, start, last);
1799
1800 if (!p->xnack_enabled ||
1801 (prange->flags & KFD_IOCTL_SVM_FLAG_GPU_ALWAYS_MAPPED)) {
1802 int evicted_ranges;
1803 bool mapped = prange->mapped_to_gpu;
1804
1805 list_for_each_entry(pchild, &prange->child_list, child_list) {
1806 if (!pchild->mapped_to_gpu)
1807 continue;
1808 mapped = true;
1809 mutex_lock_nested(&pchild->lock, 1);
1810 if (pchild->start <= last && pchild->last >= start) {
1811 pr_debug("increment pchild invalid [0x%lx 0x%lx]\n",
1812 pchild->start, pchild->last);
1813 atomic_inc(&pchild->invalid);
1814 }
1815 mutex_unlock(&pchild->lock);
1816 }
1817
1818 if (!mapped)
1819 return r;
1820
1821 if (prange->start <= last && prange->last >= start)
1822 atomic_inc(&prange->invalid);
1823
1824 evicted_ranges = atomic_inc_return(&svms->evicted_ranges);
1825 if (evicted_ranges != 1)
1826 return r;
1827
1828 pr_debug("evicting svms 0x%p range [0x%lx 0x%lx]\n",
1829 prange->svms, prange->start, prange->last);
1830
/* First eviction, stop the queues */
1832 r = kgd2kfd_quiesce_mm(mm, KFD_QUEUE_EVICTION_TRIGGER_SVM);
1833 if (r)
1834 pr_debug("failed to quiesce KFD\n");
1835
1836 pr_debug("schedule to restore svm %p ranges\n", svms);
1837 schedule_delayed_work(&svms->restore_work,
1838 msecs_to_jiffies(AMDGPU_SVM_RANGE_RESTORE_DELAY_MS));
1839 } else {
1840 unsigned long s, l;
1841 uint32_t trigger;
1842
1843 if (event == MMU_NOTIFY_MIGRATE)
1844 trigger = KFD_SVM_UNMAP_TRIGGER_MMU_NOTIFY_MIGRATE;
1845 else
1846 trigger = KFD_SVM_UNMAP_TRIGGER_MMU_NOTIFY;
1847
1848 pr_debug("invalidate unmap svms 0x%p [0x%lx 0x%lx] from GPUs\n",
1849 prange->svms, start, last);
1850 list_for_each_entry(pchild, &prange->child_list, child_list) {
1851 mutex_lock_nested(&pchild->lock, 1);
1852 s = max(start, pchild->start);
1853 l = min(last, pchild->last);
1854 if (l >= s)
1855 svm_range_unmap_from_gpus(pchild, s, l, trigger);
1856 mutex_unlock(&pchild->lock);
1857 }
1858 s = max(start, prange->start);
1859 l = min(last, prange->last);
1860 if (l >= s)
1861 svm_range_unmap_from_gpus(prange, s, l, trigger);
1862 }
1863
1864 return r;
1865 }
1866
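/* Create a copy of @old with the same attributes, sharing its VRAM BO (if
 * any) but not its DMA mappings.
 */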
1867 static struct svm_range *svm_range_clone(struct svm_range *old)
1868 {
1869 struct svm_range *new;
1870
1871 new = svm_range_new(old->svms, old->start, old->last, false);
1872 if (!new)
1873 return NULL;
1874
1875 if (old->svm_bo) {
1876 new->ttm_res = old->ttm_res;
1877 new->offset = old->offset;
1878 new->svm_bo = svm_range_bo_ref(old->svm_bo);
1879 spin_lock(&new->svm_bo->list_lock);
1880 list_add(&new->svm_bo_list, &new->svm_bo->range_list);
1881 spin_unlock(&new->svm_bo->list_lock);
1882 }
1883 new->flags = old->flags;
1884 new->preferred_loc = old->preferred_loc;
1885 new->prefetch_loc = old->prefetch_loc;
1886 new->actual_loc = old->actual_loc;
1887 new->granularity = old->granularity;
1888 new->mapped_to_gpu = old->mapped_to_gpu;
1889 bitmap_copy(new->bitmap_access, old->bitmap_access, MAX_GPU_INSTANCE);
1890 bitmap_copy(new->bitmap_aip, old->bitmap_aip, MAX_GPU_INSTANCE);
1891
1892 return new;
1893 }
1894
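/* Track the maximum svm range size as the minimum 1/32 VRAM size over all
 * GPUs, clamped to between 2MB and 1GB worth of pages and rounded down to a
 * power of two. Updated locklessly with cmpxchg as GPUs are probed.
 */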
1895 void svm_range_set_max_pages(struct amdgpu_device *adev)
1896 {
1897 uint64_t max_pages;
1898 uint64_t pages, _pages;
1899
/* 1/32 VRAM size in pages */
1901 pages = adev->gmc.real_vram_size >> 17;
1902 pages = clamp(pages, 1ULL << 9, 1ULL << 18);
1903 pages = rounddown_pow_of_two(pages);
1904 do {
1905 max_pages = READ_ONCE(max_svm_range_pages);
1906 _pages = min_not_zero(max_pages, pages);
1907 } while (cmpxchg(&max_svm_range_pages, max_pages, _pages) != max_pages);
1908 }
1909
1910 static int
1911 svm_range_split_new(struct svm_range_list *svms, uint64_t start, uint64_t last,
1912 uint64_t max_pages, struct list_head *insert_list,
1913 struct list_head *update_list)
1914 {
1915 struct svm_range *prange;
1916 uint64_t l;
1917
1918 pr_debug("max_svm_range_pages 0x%llx adding [0x%llx 0x%llx]\n",
1919 max_pages, start, last);
1920
1921 while (last >= start) {
1922 l = min(last, ALIGN_DOWN(start + max_pages, max_pages) - 1);
1923
1924 prange = svm_range_new(svms, start, l, true);
1925 if (!prange)
1926 return -ENOMEM;
1927 list_add(&prange->list, insert_list);
1928 list_add(&prange->update_list, update_list);
1929
1930 start = l + 1;
1931 }
1932 return 0;
1933 }
1934
/**
 * svm_range_add - add svm range and handle overlap
 * @p: the range add to this process svms
 * @start: page size aligned
 * @size: page size aligned
 * @nattr: number of attributes
 * @attrs: array of attributes
 * @update_list: output, the ranges need validate and update GPU mapping
 * @insert_list: output, the ranges need insert to svms
 * @remove_list: output, the ranges are replaced and need remove from svms
 *
 * Check if the virtual address range has overlap with any existing ranges,
 * split partly overlapping ranges and add new ranges in the gaps. All changes
 * should be applied to the range_list and interval tree transactionally. If
 * any range split or allocation fails, the entire update fails. Therefore any
 * existing overlapping svm_ranges are cloned and the original svm_ranges left
 * unchanged.
 *
 * If the transaction succeeds, the caller can update and insert clones and
 * new ranges, then free the originals.
 *
 * Otherwise the caller can free the clones and new ranges, while the old
 * svm_ranges remain unchanged.
 *
 * Context: Process context, caller must hold svms->lock
 *
 * Return:
 * 0 - OK, otherwise error code
 */
1964 static int
1965 svm_range_add(struct kfd_process *p, uint64_t start, uint64_t size,
1966 uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs,
1967 struct list_head *update_list, struct list_head *insert_list,
1968 struct list_head *remove_list)
1969 {
1970 unsigned long last = start + size - 1UL;
1971 struct svm_range_list *svms = &p->svms;
1972 struct interval_tree_node *node;
1973 struct svm_range *prange;
1974 struct svm_range *tmp;
1975 struct list_head new_list;
1976 int r = 0;
1977
1978 pr_debug("svms 0x%p [0x%llx 0x%lx]\n", &p->svms, start, last);
1979
1980 INIT_LIST_HEAD(update_list);
1981 INIT_LIST_HEAD(insert_list);
1982 INIT_LIST_HEAD(remove_list);
1983 INIT_LIST_HEAD(&new_list);
1984
1985 node = interval_tree_iter_first(&svms->objects, start, last);
1986 while (node) {
1987 struct interval_tree_node *next;
1988 unsigned long next_start;
1989
1990 pr_debug("found overlap node [0x%lx 0x%lx]\n", node->start,
1991 node->last);
1992
1993 prange = container_of(node, struct svm_range, it_node);
1994 next = interval_tree_iter_next(node, start, last);
1995 next_start = min(node->last, last) + 1;
1996
1997 if (svm_range_is_same_attrs(p, prange, nattr, attrs)) {
/* nothing to do */
1999 } else if (node->start < start || node->last > last) {
/* node intersects the update range and its attributes
 * will change. Clone and split it, apply updates only
 * to the overlapping part
 */
2004 struct svm_range *old = prange;
2005
2006 prange = svm_range_clone(old);
2007 if (!prange) {
2008 r = -ENOMEM;
2009 goto out;
2010 }
2011
2012 list_add(&old->update_list, remove_list);
2013 list_add(&prange->list, insert_list);
2014 list_add(&prange->update_list, update_list);
2015
2016 if (node->start < start) {
2017 pr_debug("change old range start\n");
2018 r = svm_range_split_head(prange, start,
2019 insert_list);
2020 if (r)
2021 goto out;
2022 }
2023 if (node->last > last) {
2024 pr_debug("change old range last\n");
2025 r = svm_range_split_tail(prange, last,
2026 insert_list);
2027 if (r)
2028 goto out;
2029 }
2030 } else {
/* The node is contained within start..last,
 * just update it
 */
2034 list_add(&prange->update_list, update_list);
2035 }
2036
/* insert a new node if needed */
2038 if (node->start > start) {
2039 r = svm_range_split_new(svms, start, node->start - 1,
2040 READ_ONCE(max_svm_range_pages),
2041 &new_list, update_list);
2042 if (r)
2043 goto out;
2044 }
2045
2046 node = next;
2047 start = next_start;
2048 }
2049
/* add a final range at the end if needed */
2051 if (start <= last)
2052 r = svm_range_split_new(svms, start, last,
2053 READ_ONCE(max_svm_range_pages),
2054 &new_list, update_list);
2055
2056 out:
2057 if (r) {
2058 list_for_each_entry_safe(prange, tmp, insert_list, list)
2059 svm_range_free(prange, false);
2060 list_for_each_entry_safe(prange, tmp, &new_list, list)
2061 svm_range_free(prange, true);
2062 } else {
2063 list_splice(&new_list, insert_list);
2064 }
2065
2066 return r;
2067 }
2068
2069 static void
2070 svm_range_update_notifier_and_interval_tree(struct mm_struct *mm,
2071 struct svm_range *prange)
2072 {
2073 unsigned long start;
2074 unsigned long last;
2075
2076 start = prange->notifier.interval_tree.start >> PAGE_SHIFT;
2077 last = prange->notifier.interval_tree.last >> PAGE_SHIFT;
2078
2079 if (prange->start == start && prange->last == last)
2080 return;
2081
2082 pr_debug("up notifier 0x%p prange 0x%p [0x%lx 0x%lx] [0x%lx 0x%lx]\n",
2083 prange->svms, prange, start, last, prange->start,
2084 prange->last);
2085
2086 if (start != 0 && last != 0) {
2087 interval_tree_remove(&prange->it_node, &prange->svms->objects);
2088 svm_range_remove_notifier(prange);
2089 }
2090 prange->it_node.start = prange->start;
2091 prange->it_node.last = prange->last;
2092
2093 interval_tree_insert(&prange->it_node, &prange->svms->objects);
2094 svm_range_add_notifier_locked(mm, prange);
2095 }
2096
2097 static void
2098 svm_range_handle_list_op(struct svm_range_list *svms, struct svm_range *prange,
2099 struct mm_struct *mm)
2100 {
2101 switch (prange->work_item.op) {
2102 case SVM_OP_NULL:
2103 pr_debug("NULL OP 0x%p prange 0x%p [0x%lx 0x%lx]\n",
2104 svms, prange, prange->start, prange->last);
2105 break;
2106 case SVM_OP_UNMAP_RANGE:
2107 pr_debug("remove 0x%p prange 0x%p [0x%lx 0x%lx]\n",
2108 svms, prange, prange->start, prange->last);
2109 svm_range_unlink(prange);
2110 svm_range_remove_notifier(prange);
2111 svm_range_free(prange, true);
2112 break;
2113 case SVM_OP_UPDATE_RANGE_NOTIFIER:
2114 pr_debug("update notifier 0x%p prange 0x%p [0x%lx 0x%lx]\n",
2115 svms, prange, prange->start, prange->last);
2116 svm_range_update_notifier_and_interval_tree(mm, prange);
2117 break;
2118 case SVM_OP_UPDATE_RANGE_NOTIFIER_AND_MAP:
2119 pr_debug("update and map 0x%p prange 0x%p [0x%lx 0x%lx]\n",
2120 svms, prange, prange->start, prange->last);
2121 svm_range_update_notifier_and_interval_tree(mm, prange);
/* TODO: implement deferred validation and mapping */
2123 break;
2124 case SVM_OP_ADD_RANGE:
2125 pr_debug("add 0x%p prange 0x%p [0x%lx 0x%lx]\n", svms, prange,
2126 prange->start, prange->last);
2127 svm_range_add_to_svms(prange);
2128 svm_range_add_notifier_locked(mm, prange);
2129 break;
2130 case SVM_OP_ADD_RANGE_AND_MAP:
2131 pr_debug("add and map 0x%p prange 0x%p [0x%lx 0x%lx]\n", svms,
2132 prange, prange->start, prange->last);
2133 svm_range_add_to_svms(prange);
2134 svm_range_add_notifier_locked(mm, prange);
2135 /* TODO: implement deferred validation and mapping */
2136 break;
2137 default:
2138 WARN_ONCE(1, "Unknown prange 0x%p work op %d\n", prange,
2139 prange->work_item.op);
2140 }
2141 }
2142
2143 static void svm_range_drain_retry_fault(struct svm_range_list *svms)
2144 {
2145 struct kfd_process_device *pdd;
2146 struct kfd_process *p;
2147 int drain;
2148 uint32_t i;
2149
2150 p = container_of(svms, struct kfd_process, svms);
2151
2152 restart:
2153 drain = atomic_read(&svms->drain_pagefaults);
2154 if (!drain)
2155 return;
2156
2157 for_each_set_bit(i, svms->bitmap_supported, p->n_pdds) {
2158 pdd = p->pdds[i];
2159 if (!pdd)
2160 continue;
2161
2162 pr_debug("drain retry fault gpu %d svms %p\n", i, svms);
2163
2164 amdgpu_ih_wait_on_checkpoint_process_ts(pdd->dev->adev,
2165 &pdd->dev->adev->irq.ih1);
2166 pr_debug("drain retry fault gpu %d svms 0x%p done\n", i, svms);
2167 }
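/* If more faults were flagged for draining while we waited on the IH
 * ring, the cmpxchg fails and we go around again to drain them too.
 */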
2168 if (atomic_cmpxchg(&svms->drain_pagefaults, drain, 0) != drain)
2169 goto restart;
2170 }
2171
2172 static void svm_range_deferred_list_work(struct work_struct *work)
2173 {
2174 struct svm_range_list *svms;
2175 struct svm_range *prange;
2176 struct mm_struct *mm;
2177
2178 svms = container_of(work, struct svm_range_list, deferred_list_work);
2179 pr_debug("enter svms 0x%p\n", svms);
2180
2181 spin_lock(&svms->deferred_list_lock);
2182 while (!list_empty(&svms->deferred_range_list)) {
2183 prange = list_first_entry(&svms->deferred_range_list,
2184 struct svm_range, deferred_list);
2185 spin_unlock(&svms->deferred_list_lock);
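/* The range stays on deferred_range_list while it is being handled, so
 * svm_range_add_list_work can still find it and update work_item.op
 * under the list lock instead of queueing it a second time.
 */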
2186
2187 pr_debug("prange 0x%p [0x%lx 0x%lx] op %d\n", prange,
2188 prange->start, prange->last, prange->work_item.op);
2189
2190 mm = prange->work_item.mm;
2191 retry:
2192 mmap_write_lock(mm);
2193
2194 /* Checking for the need to drain retry faults must be inside
2195 * mmap write lock to serialize with munmap notifiers.
2196 */
2197 if (unlikely(atomic_read(&svms->drain_pagefaults))) {
2198 mmap_write_unlock(mm);
2199 svm_range_drain_retry_fault(svms);
2200 goto retry;
2201 }
2202
2203 /* Removing the range from deferred_list must be done inside the mmap
2204 * write lock, for two race cases:
2205 * 1. unmap_from_cpu may change work_item.op and add the range to
2206 * deferred_list again, causing a use-after-free bug.
2207 * 2. svm_range_list_lock_and_flush_work may hold the mmap write lock
2208 * and continue because deferred_list is empty, while the deferred
2209 * list handler is blocked waiting for the mmap write lock.
2210 */
2211 spin_lock(&svms->deferred_list_lock);
2212 list_del_init(&prange->deferred_list);
2213 spin_unlock(&svms->deferred_list_lock);
2214
2215 mutex_lock(&svms->lock);
2216 mutex_lock(&prange->migrate_mutex);
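/* Handle ranges that were split off this prange before handling the
 * prange itself, so child work items are not lost when the parent is
 * unlinked or freed below.
 */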
2217 while (!list_empty(&prange->child_list)) {
2218 struct svm_range *pchild;
2219
2220 pchild = list_first_entry(&prange->child_list,
2221 struct svm_range, child_list);
2222 pr_debug("child prange 0x%p op %d\n", pchild,
2223 pchild->work_item.op);
2224 list_del_init(&pchild->child_list);
2225 svm_range_handle_list_op(svms, pchild, mm);
2226 }
2227 mutex_unlock(&prange->migrate_mutex);
2228
2229 svm_range_handle_list_op(svms, prange, mm);
2230 mutex_unlock(&svms->lock);
2231 mmap_write_unlock(mm);
2232
2233 /* Pairs with mmget in svm_range_add_list_work */
2234 mmput(mm);
2235
2236 spin_lock(&svms->deferred_list_lock);
2237 }
2238 spin_unlock(&svms->deferred_list_lock);
2239 pr_debug("exit svms 0x%p\n", svms);
2240 }
2241
2242 void
2243 svm_range_add_list_work(struct svm_range_list *svms, struct svm_range *prange,
2244 struct mm_struct *mm, enum svm_work_list_ops op)
2245 {
2246 spin_lock(&svms->deferred_list_lock);
2247
2248 if (!list_empty(&prange->deferred_list)) {
2249 pr_debug("update exist prange 0x%p work op %d\n", prange, op);
2250 WARN_ONCE(prange->work_item.mm != mm, "mismatched mm\n");
2251 if (op != SVM_OP_NULL &&
2252 prange->work_item.op != SVM_OP_UNMAP_RANGE)
2253 prange->work_item.op = op;
2254 } else {
2255 prange->work_item.op = op;
2256
2257 /* Pairs with mmput in deferred_list_work */
2258 mmget(mm);
2259 prange->work_item.mm = mm;
2260 list_add_tail(&prange->deferred_list,
2261 &prange->svms->deferred_range_list);
2262 pr_debug("add prange 0x%p [0x%lx 0x%lx] to work list op %d\n",
2263 prange, prange->start, prange->last, op);
2264 }
2265 spin_unlock(&svms->deferred_list_lock);
2266 }
2267
2268 void schedule_deferred_list_work(struct svm_range_list *svms)
2269 {
2270 spin_lock(&svms->deferred_list_lock);
2271 if (!list_empty(&svms->deferred_range_list))
2272 schedule_work(&svms->deferred_list_work);
2273 spin_unlock(&svms->deferred_list_lock);
2274 }
2275
2276 static void
2277 svm_range_unmap_split(struct mm_struct *mm, struct svm_range *parent,
2278 struct svm_range *prange, unsigned long start,
2279 unsigned long last)
2280 {
2281 struct svm_range *head;
2282 struct svm_range *tail;
2283
2284 if (prange->work_item.op == SVM_OP_UNMAP_RANGE) {
2285 pr_debug("prange 0x%p [0x%lx 0x%lx] is already freed\n", prange,
2286 prange->start, prange->last);
2287 return;
2288 }
2289 if (start > prange->last || last < prange->start)
2290 return;
2291
2292 head = tail = prange;
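/* Split prange so the pages inside [start, last] end up in their own
 * range(s); those pieces are queued below for removal while the
 * surviving parts keep their mappings.
 */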
2293 if (start > prange->start)
2294 svm_range_split(prange, prange->start, start - 1, &tail);
2295 if (last < tail->last)
2296 svm_range_split(tail, last + 1, tail->last, &head);
2297
2298 if (head != prange && tail != prange) {
2299 svm_range_add_child(parent, mm, head, SVM_OP_UNMAP_RANGE);
2300 svm_range_add_child(parent, mm, tail, SVM_OP_ADD_RANGE);
2301 } else if (tail != prange) {
2302 svm_range_add_child(parent, mm, tail, SVM_OP_UNMAP_RANGE);
2303 } else if (head != prange) {
2304 svm_range_add_child(parent, mm, head, SVM_OP_UNMAP_RANGE);
2305 } else if (parent != prange) {
2306 prange->work_item.op = SVM_OP_UNMAP_RANGE;
2307 }
2308 }
2309
2310 static void
2311 svm_range_unmap_from_cpu(struct mm_struct *mm, struct svm_range *prange,
2312 unsigned long start, unsigned long last)
2313 {
2314 uint32_t trigger = KFD_SVM_UNMAP_TRIGGER_UNMAP_FROM_CPU;
2315 struct svm_range_list *svms;
2316 struct svm_range *pchild;
2317 struct kfd_process *p;
2318 unsigned long s, l;
2319 bool unmap_parent;
2320
2321 p = kfd_lookup_process_by_mm(mm);
2322 if (!p)
2323 return;
2324 svms = &p->svms;
2325
2326 pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx] [0x%lx 0x%lx]\n", svms,
2327 prange, prange->start, prange->last, start, last);
2328
2329 /* Serialize this unmap with the page fault handler: make sure pending
2330 * retry faults are drained by the deferred worker before the range is
2331 * freed, to avoid straggler interrupts on already unmapped memory.
2332 */
2333 atomic_inc(&svms->drain_pagefaults);
2334
2335 unmap_parent = start <= prange->start && last >= prange->last;
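/* Only remove the parent range completely when the whole of it is
 * unmapped; for a partial unmap just update its notifier and interval
 * tree entry in the deferred worker.
 */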
2336
2337 list_for_each_entry(pchild, &prange->child_list, child_list) {
2338 mutex_lock_nested(&pchild->lock, 1);
2339 s = max(start, pchild->start);
2340 l = min(last, pchild->last);
2341 if (l >= s)
2342 svm_range_unmap_from_gpus(pchild, s, l, trigger);
2343 svm_range_unmap_split(mm, prange, pchild, start, last);
2344 mutex_unlock(&pchild->lock);
2345 }
2346 s = max(start, prange->start);
2347 l = min(last, prange->last);
2348 if (l >= s)
2349 svm_range_unmap_from_gpus(prange, s, l, trigger);
2350 svm_range_unmap_split(mm, prange, prange, start, last);
2351
2352 if (unmap_parent)
2353 svm_range_add_list_work(svms, prange, mm, SVM_OP_UNMAP_RANGE);
2354 else
2355 svm_range_add_list_work(svms, prange, mm,
2356 SVM_OP_UPDATE_RANGE_NOTIFIER);
2357 schedule_deferred_list_work(svms);
2358
2359 kfd_unref_process(p);
2360 }
2361
2362 /**
2363 * svm_range_cpu_invalidate_pagetables - interval notifier callback
2364 * @mni: mmu_interval_notifier struct
2365 * @range: mmu_notifier_range struct
2366 * @cur_seq: value to pass to mmu_interval_set_seq()
2367 *
2368 * If event is MMU_NOTIFY_UNMAP, this is from CPU unmap range, otherwise, it
2369 * is from migration, or CPU page invalidation callback.
2370 *
2371 * For unmap event, unmap range from GPUs, remove prange from svms in a delayed
2372 * work thread, and split prange if only part of prange is unmapped.
2373 *
2374 * For invalidation event, if GPU retry fault is not enabled, evict the queues,
2375 * then schedule svm_range_restore_work to update GPU mapping and resume queues.
2376 * If GPU retry fault is enabled, unmap the svm range from GPU, retry fault will
2377 * update GPU mapping to recover.
2378 *
2379 * Context: mmap lock, notifier_invalidate_start lock are held
2380 * to protect notifier range.
2381 */
2382 static bool
2383 svm_range_cpu_invalidate_pagetables(struct mmu_interval_notifier *mni,
2384 const struct mmu_notifier_range *range,
2385 unsigned long cur_seq)
2386 {
2387 struct svm_range *prange;
2388 unsigned long start;
2389 unsigned long last;
2390
2391 if (range->event == MMU_NOTIFY_RELEASE)
2392 return true;
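/* If the mm is already being torn down there is nothing left to
 * invalidate; mmget_not_zero() fails once the last user is gone.
 */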
2393 if (!mmget_not_zero(mni->mm))
2394 return true;
2395
2396 start = mni->interval_tree.start;
2397 last = mni->interval_tree.last;
2398 start = max(start, range->start) >> PAGE_SHIFT;
2399 last = min(last, range->end - 1) >> PAGE_SHIFT;
2400 pr_debug("[0x%lx 0x%lx] range[0x%lx 0x%lx] notifier[0x%lx 0x%lx] %d\n",
2401 start, last, range->start >> PAGE_SHIFT,
2402 (range->end - 1) >> PAGE_SHIFT,
2403 mni->interval_tree.start >> PAGE_SHIFT,
2404 mni->interval_tree.last >> PAGE_SHIFT, range->event);
2405
2406 prange = container_of(mni, struct svm_range, notifier);
2407
2408 svm_range_lock(prange);
2409 mmu_interval_set_seq(mni, cur_seq);
2410
2411 switch (range->event) {
2412 case MMU_NOTIFY_UNMAP:
2413 svm_range_unmap_from_cpu(mni->mm, prange, start, last);
2414 break;
2415 default:
2416 svm_range_evict(prange, mni->mm, start, last, range->event);
2417 break;
2418 }
2419
2420 svm_range_unlock(prange);
2421 mmput(mni->mm);
2422
2423 return true;
2424 }
2425
2426 /**
2427 * svm_range_from_addr - find svm range from fault address
2428 * @svms: svm range list header
2429 * @addr: address to search range interval tree, in pages
2430 * @parent: parent range if range is on child list
2431 *
2432 * Context: The caller must hold svms->lock
2433 *
2434 * Return: the svm_range found or NULL
2435 */
2436 struct svm_range *
2437 svm_range_from_addr(struct svm_range_list *svms, unsigned long addr,
2438 struct svm_range **parent)
2439 {
2440 struct interval_tree_node *node;
2441 struct svm_range *prange;
2442 struct svm_range *pchild;
2443
2444 node = interval_tree_iter_first(&svms->objects, addr, addr);
2445 if (!node)
2446 return NULL;
2447
2448 prange = container_of(node, struct svm_range, it_node);
2449 pr_debug("address 0x%lx prange [0x%lx 0x%lx] node [0x%lx 0x%lx]\n",
2450 addr, prange->start, prange->last, node->start, node->last);
2451
2452 if (addr >= prange->start && addr <= prange->last) {
2453 if (parent)
2454 *parent = prange;
2455 return prange;
2456 }
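/* The address may fall into a child range that was split off and is
 * not in the interval tree yet; the enclosing tree node is returned
 * as the parent in that case.
 */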
2457 list_for_each_entry(pchild, &prange->child_list, child_list)
2458 if (addr >= pchild->start && addr <= pchild->last) {
2459 pr_debug("found address 0x%lx pchild [0x%lx 0x%lx]\n",
2460 addr, pchild->start, pchild->last);
2461 if (parent)
2462 *parent = prange;
2463 return pchild;
2464 }
2465
2466 return NULL;
2467 }
2468
2469 /* svm_range_best_restore_location - decide the best fault restore location
2470 * @prange: svm range structure
2471 * @adev: the GPU on which the vm fault happened
2472 * @gpuidx: out, the gpu index of the faulting GPU
2473 *
2474 * This is only called when xnack is on, to decide the best location to restore
2475 * the range mapping after a GPU vm fault. The caller uses the best location to
2476 * migrate the range if the actual location is not the best location, then
2477 * updates the GPU page table to map the best location.
2478 *
2479 * Return:
2480 * The best location to restore the range: 0 for system memory, a gpuid for
2481 * VRAM of that GPU, or -1 if the faulting GPU has no access to the range
2482 */
2490 static int32_t
2491 svm_range_best_restore_location(struct svm_range *prange,
2492 struct amdgpu_device *adev,
2493 int32_t *gpuidx)
2494 {
2495 struct amdgpu_device *bo_adev, *preferred_adev;
2496 struct kfd_process *p;
2497 uint32_t gpuid;
2498 int r;
2499
2500 p = container_of(prange->svms, struct kfd_process, svms);
2501
2502 r = kfd_process_gpuid_from_adev(p, adev, &gpuid, gpuidx);
2503 if (r < 0) {
2504 pr_debug("failed to get gpuid from kgd\n");
2505 return -1;
2506 }
2507
2508 if (prange->preferred_loc == gpuid ||
2509 prange->preferred_loc == KFD_IOCTL_SVM_LOCATION_SYSMEM) {
2510 return prange->preferred_loc;
2511 } else if (prange->preferred_loc != KFD_IOCTL_SVM_LOCATION_UNDEFINED) {
2512 preferred_adev = svm_range_get_adev_by_id(prange,
2513 prange->preferred_loc);
2514 if (amdgpu_xgmi_same_hive(adev, preferred_adev))
2515 return prange->preferred_loc;
2516
2517 }
2518
2519 if (test_bit(*gpuidx, prange->bitmap_access))
2520 return gpuid;
2521
2522 if (test_bit(*gpuidx, prange->bitmap_aip)) {
2523 if (!prange->actual_loc)
2524 return 0;
2525
2526 bo_adev = svm_range_get_adev_by_id(prange, prange->actual_loc);
2527 if (amdgpu_xgmi_same_hive(adev, bo_adev))
2528 return prange->actual_loc;
2529 else
2530 return 0;
2531 }
2532
2533 return -1;
2534 }
2535
2536 static int
2537 svm_range_get_range_boundaries(struct kfd_process *p, int64_t addr,
2538 unsigned long *start, unsigned long *last,
2539 bool *is_heap_stack)
2540 {
2541 struct vm_area_struct *vma;
2542 struct interval_tree_node *node;
2543 unsigned long start_limit, end_limit;
2544
2545 vma = find_vma(p->mm, addr << PAGE_SHIFT);
2546 if (!vma || (addr << PAGE_SHIFT) < vma->vm_start) {
2547 pr_debug("VMA does not exist in address [0x%llx]\n", addr);
2548 return -EFAULT;
2549 }
2550
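/* A VMA that covers the brk area or the initial stack is treated as
 * heap/stack; such ranges later get system memory as their preferred
 * location.
 */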
2551 *is_heap_stack = (vma->vm_start <= vma->vm_mm->brk &&
2552 vma->vm_end >= vma->vm_mm->start_brk) ||
2553 (vma->vm_start <= vma->vm_mm->start_stack &&
2554 vma->vm_end >= vma->vm_mm->start_stack);
2555
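/* 2UL << 8 is 512 pages, i.e. 2MB with 4KB pages: the new range is
 * expanded to 2MB alignment around the fault address, clamped to the
 * VMA boundaries.
 */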
2556 start_limit = max(vma->vm_start >> PAGE_SHIFT,
2557 (unsigned long)ALIGN_DOWN(addr, 2UL << 8));
2558 end_limit = min(vma->vm_end >> PAGE_SHIFT,
2559 (unsigned long)ALIGN(addr + 1, 2UL << 8));
2560
2561 node = interval_tree_iter_first(&p->svms.objects, addr + 1, ULONG_MAX);
2562 if (node) {
2563 end_limit = min(end_limit, node->start);
2564 /* Last range that ends before the fault address */
2565 node = container_of(rb_prev(&node->rb),
2566 struct interval_tree_node, rb);
2567 } else {
2568 /* The last range must end before addr because
2569 * there is no range after addr
2570 */
2571 node = container_of(rb_last(&p->svms.objects.rb_root),
2572 struct interval_tree_node, rb);
2573 }
2574 if (node) {
2575 if (node->last >= addr) {
2576 WARN(1, "Overlap with prev node and page fault addr\n");
2577 return -EFAULT;
2578 }
2579 start_limit = max(start_limit, node->last + 1);
2580 }
2581
2582 *start = start_limit;
2583 *last = end_limit - 1;
2584
2585 pr_debug("vma [0x%lx 0x%lx] range [0x%lx 0x%lx] is_heap_stack %d\n",
2586 vma->vm_start >> PAGE_SHIFT, vma->vm_end >> PAGE_SHIFT,
2587 *start, *last, *is_heap_stack);
2588
2589 return 0;
2590 }
2591
2592 static int
2593 svm_range_check_vm_userptr(struct kfd_process *p, uint64_t start, uint64_t last,
2594 uint64_t *bo_s, uint64_t *bo_l)
2595 {
2596 struct amdgpu_bo_va_mapping *mapping;
2597 struct interval_tree_node *node;
2598 struct amdgpu_bo *bo = NULL;
2599 unsigned long userptr;
2600 uint32_t i;
2601 int r;
2602
2603 for (i = 0; i < p->n_pdds; i++) {
2604 struct amdgpu_vm *vm;
2605
2606 if (!p->pdds[i]->drm_priv)
2607 continue;
2608
2609 vm = drm_priv_to_vm(p->pdds[i]->drm_priv);
2610 r = amdgpu_bo_reserve(vm->root.bo, false);
2611 if (r)
2612 return r;
2613
2614 /* Check userptr by searching the entire vm->va interval tree */
2615 node = interval_tree_iter_first(&vm->va, 0, ~0ULL);
2616 while (node) {
2617 mapping = container_of((struct rb_node *)node,
2618 struct amdgpu_bo_va_mapping, rb);
2619 bo = mapping->bo_va->base.bo;
2620
2621 if (!amdgpu_ttm_tt_affect_userptr(bo->tbo.ttm,
2622 start << PAGE_SHIFT,
2623 last << PAGE_SHIFT,
2624 &userptr)) {
2625 node = interval_tree_iter_next(node, 0, ~0ULL);
2626 continue;
2627 }
2628
2629 pr_debug("[0x%llx 0x%llx] already userptr mapped\n",
2630 start, last);
2631 if (bo_s && bo_l) {
2632 *bo_s = userptr >> PAGE_SHIFT;
2633 *bo_l = *bo_s + bo->tbo.ttm->num_pages - 1;
2634 }
2635 amdgpu_bo_unreserve(vm->root.bo);
2636 return -EADDRINUSE;
2637 }
2638 amdgpu_bo_unreserve(vm->root.bo);
2639 }
2640 return 0;
2641 }
2642
2643 static struct
2644 svm_range *svm_range_create_unregistered_range(struct amdgpu_device *adev,
2645 struct kfd_process *p,
2646 struct mm_struct *mm,
2647 int64_t addr)
2648 {
2649 struct svm_range *prange = NULL;
2650 unsigned long start, last;
2651 uint32_t gpuid, gpuidx;
2652 bool is_heap_stack;
2653 uint64_t bo_s = 0;
2654 uint64_t bo_l = 0;
2655 int r;
2656
2657 if (svm_range_get_range_boundaries(p, addr, &start, &last,
2658 &is_heap_stack))
2659 return NULL;
2660
2661 r = svm_range_check_vm(p, start, last, &bo_s, &bo_l);
2662 if (r != -EADDRINUSE)
2663 r = svm_range_check_vm_userptr(p, start, last, &bo_s, &bo_l);
2664
2665 if (r == -EADDRINUSE) {
2666 if (addr >= bo_s && addr <= bo_l)
2667 return NULL;
2668
2669 /* Overlap with an existing mapping; fall back to a single-page range at the fault address */
2670 start = addr;
2671 last = addr;
2672 }
2673
2674 prange = svm_range_new(&p->svms, start, last, true);
2675 if (!prange) {
2676 pr_debug("Failed to create prange in address [0x%llx]\n", addr);
2677 return NULL;
2678 }
2679 if (kfd_process_gpuid_from_adev(p, adev, &gpuid, &gpuidx)) {
2680 pr_debug("failed to get gpuid from kgd\n");
2681 svm_range_free(prange, true);
2682 return NULL;
2683 }
2684
2685 if (is_heap_stack)
2686 prange->preferred_loc = KFD_IOCTL_SVM_LOCATION_SYSMEM;
2687
2688 svm_range_add_to_svms(prange);
2689 svm_range_add_notifier_locked(mm, prange);
2690
2691 return prange;
2692 }
2693
2694 /* svm_range_skip_recover - decide if prange can be recovered
2695 * @prange: svm range structure
2696 *
2697 * The GPU vm retry fault handler skips recovering the range for these cases:
2698 * 1. prange is on the deferred list to be removed after unmap; it is a stale
2699 * fault, deferred list work will drain the stale fault, then restore the range.
2700 * 2. prange is on the deferred list to add the interval notifier after split, or
2701 * 3. prange is a child range, it is split from the parent prange, recover later
2702 * after the interval notifier is added.
2703 *
2704 * Return: true to skip recover, false to recover
2705 */
2706 static bool svm_range_skip_recover(struct svm_range *prange)
2707 {
2708 struct svm_range_list *svms = prange->svms;
2709
2710 spin_lock(&svms->deferred_list_lock);
2711 if (list_empty(&prange->deferred_list) &&
2712 list_empty(&prange->child_list)) {
2713 spin_unlock(&svms->deferred_list_lock);
2714 return false;
2715 }
2716 spin_unlock(&svms->deferred_list_lock);
2717
2718 if (prange->work_item.op == SVM_OP_UNMAP_RANGE) {
2719 pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx] unmapped\n",
2720 svms, prange, prange->start, prange->last);
2721 return true;
2722 }
2723 if (prange->work_item.op == SVM_OP_ADD_RANGE_AND_MAP ||
2724 prange->work_item.op == SVM_OP_ADD_RANGE) {
2725 pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx] not added yet\n",
2726 svms, prange, prange->start, prange->last);
2727 return true;
2728 }
2729 return false;
2730 }
2731
2732 static void
2733 svm_range_count_fault(struct amdgpu_device *adev, struct kfd_process *p,
2734 int32_t gpuidx)
2735 {
2736 struct kfd_process_device *pdd;
2737
2738 /* fault is on a different page of the same range
2739 * or fault is skipped to recover later
2740 * or fault is on an invalid virtual address
2741 */
2742 if (gpuidx == MAX_GPU_INSTANCE) {
2743 uint32_t gpuid;
2744 int r;
2745
2746 r = kfd_process_gpuid_from_adev(p, adev, &gpuid, &gpuidx);
2747 if (r < 0)
2748 return;
2749 }
2750
2751 /* fault is recovered
2752 * or fault cannot recover because the GPU has no access on the range
2753 */
2754 pdd = kfd_process_device_from_gpuidx(p, gpuidx);
2755 if (pdd)
2756 WRITE_ONCE(pdd->faults, pdd->faults + 1);
2757 }
2758
2759 static bool
2760 svm_fault_allowed(struct vm_area_struct *vma, bool write_fault)
2761 {
2762 unsigned long requested = VM_READ;
2763
2764 if (write_fault)
2765 requested |= VM_WRITE;
2766
2767 pr_debug("requested 0x%lx, vma permission flags 0x%lx\n", requested,
2768 vma->vm_flags);
2769 return (vma->vm_flags & requested) == requested;
2770 }
2771
2772 int
2773 svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
2774 uint64_t addr, bool write_fault)
2775 {
2776 struct mm_struct *mm = NULL;
2777 struct svm_range_list *svms;
2778 struct svm_range *prange;
2779 struct kfd_process *p;
2780 ktime_t timestamp = ktime_get_boottime();
2781 int32_t best_loc;
2782 int32_t gpuidx = MAX_GPU_INSTANCE;
2783 bool write_locked = false;
2784 struct vm_area_struct *vma;
2785 bool migration = false;
2786 int r = 0;
2787
2788 if (!KFD_IS_SVM_API_SUPPORTED(adev->kfd.dev)) {
2789 pr_debug("device does not support SVM\n");
2790 return -EFAULT;
2791 }
2792
2793 p = kfd_lookup_process_by_pasid(pasid);
2794 if (!p) {
2795 pr_debug("kfd process not found, pasid 0x%x\n", pasid);
2796 return 0;
2797 }
2798 svms = &p->svms;
2799
2800 pr_debug("restoring svms 0x%p fault address 0x%llx\n", svms, addr);
2801
2802 if (atomic_read(&svms->drain_pagefaults)) {
2803 pr_debug("draining retry fault, drop fault 0x%llx\n", addr);
2804 r = 0;
2805 goto out;
2806 }
2807
2808 if (!p->xnack_enabled) {
2809 pr_debug("XNACK not enabled for pasid 0x%x\n", pasid);
2810 r = -EFAULT;
2811 goto out;
2812 }
2813
2814 /* p->lead_thread is available as kfd_process_wq_release flushes the work
2815 * before releasing the task ref.
2816 */
2817 mm = get_task_mm(p->lead_thread);
2818 if (!mm) {
2819 pr_debug("svms 0x%p failed to get mm\n", svms);
2820 r = 0;
2821 goto out;
2822 }
2823
2824 mmap_read_lock(mm);
2825 retry_write_locked:
2826 mutex_lock(&svms->lock);
2827 prange = svm_range_from_addr(svms, addr, NULL);
2828 if (!prange) {
2829 pr_debug("failed to find prange svms 0x%p address [0x%llx]\n",
2830 svms, addr);
2831 if (!write_locked) {
2832 /* Need the write lock to create a new range with MMU notifier.
2833 * Also flush pending deferred work to make sure the interval
2834 * tree is up to date before we add a new range.
2835 */
2836 mutex_unlock(&svms->lock);
2837 mmap_read_unlock(mm);
2838 mmap_write_lock(mm);
2839 write_locked = true;
2840 goto retry_write_locked;
2841 }
2842 prange = svm_range_create_unregistered_range(adev, p, mm, addr);
2843 if (!prange) {
2844 pr_debug("failed to create unregistered range svms 0x%p address [0x%llx]\n",
2845 svms, addr);
2846 mmap_write_downgrade(mm);
2847 r = -EFAULT;
2848 goto out_unlock_svms;
2849 }
2850 }
2851 if (write_locked)
2852 mmap_write_downgrade(mm);
2853
2854 mutex_lock(&prange->migrate_mutex);
2855
2856 if (svm_range_skip_recover(prange)) {
2857 amdgpu_gmc_filter_faults_remove(adev, addr, pasid);
2858 r = 0;
2859 goto out_unlock_range;
2860 }
2861
2862 /* skip duplicate vm faults on different pages of the same range */
2863 if (ktime_before(timestamp, ktime_add_ns(prange->validate_timestamp,
2864 AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING))) {
2865 pr_debug("svms 0x%p [0x%lx 0x%lx] already restored\n",
2866 svms, prange->start, prange->last);
2867 r = 0;
2868 goto out_unlock_range;
2869 }
2870
2871 /* __do_munmap removed the VMA, return success as we are handling a stale
2872 * retry fault.
2873 */
2874 vma = find_vma(mm, addr << PAGE_SHIFT);
2875 if (!vma || (addr << PAGE_SHIFT) < vma->vm_start) {
2876 pr_debug("address 0x%llx VMA is removed\n", addr);
2877 r = 0;
2878 goto out_unlock_range;
2879 }
2880
2881 if (!svm_fault_allowed(vma, write_fault)) {
2882 pr_debug("fault addr 0x%llx no %s permission\n", addr,
2883 write_fault ? "write" : "read");
2884 r = -EPERM;
2885 goto out_unlock_range;
2886 }
2887
2888 best_loc = svm_range_best_restore_location(prange, adev, &gpuidx);
2889 if (best_loc == -1) {
2890 pr_debug("svms %p failed to get best restore loc [0x%lx 0x%lx]\n",
2891 svms, prange->start, prange->last);
2892 r = -EACCES;
2893 goto out_unlock_range;
2894 }
2895
2896 pr_debug("svms %p [0x%lx 0x%lx] best restore 0x%x, actual loc 0x%x\n",
2897 svms, prange->start, prange->last, best_loc,
2898 prange->actual_loc);
2899
2900 kfd_smi_event_page_fault_start(adev->kfd.dev, p->lead_thread->pid, addr,
2901 write_fault, timestamp);
2902
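/* Migrate the range first if it is not already resident at the best
 * restore location, then map it to the faulting GPU below.
 */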
2903 if (prange->actual_loc != best_loc) {
2904 migration = true;
2905 if (best_loc) {
2906 r = svm_migrate_to_vram(prange, best_loc, mm,
2907 KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU);
2908 if (r) {
2909 pr_debug("svm_migrate_to_vram failed (%d) at %llx, falling back to system memory\n",
2910 r, addr);
2911 /* Fall back to system memory if migration to
2912 * VRAM failed
2913 */
2914 if (prange->actual_loc)
2915 r = svm_migrate_vram_to_ram(prange, mm,
2916 KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU);
2917 else
2918 r = 0;
2919 }
2920 } else {
2921 r = svm_migrate_vram_to_ram(prange, mm,
2922 KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU);
2923 }
2924 if (r) {
2925 pr_debug("failed %d to migrate svms %p [0x%lx 0x%lx]\n",
2926 r, svms, prange->start, prange->last);
2927 goto out_unlock_range;
2928 }
2929 }
2930
2931 r = svm_range_validate_and_map(mm, prange, gpuidx, false, false, false);
2932 if (r)
2933 pr_debug("failed %d to map svms 0x%p [0x%lx 0x%lx] to gpus\n",
2934 r, svms, prange->start, prange->last);
2935
2936 kfd_smi_event_page_fault_end(adev->kfd.dev, p->lead_thread->pid, addr,
2937 migration);
2938
2939 out_unlock_range:
2940 mutex_unlock(&prange->migrate_mutex);
2941 out_unlock_svms:
2942 mutex_unlock(&svms->lock);
2943 mmap_read_unlock(mm);
2944
2945 svm_range_count_fault(adev, p, gpuidx);
2946
2947 mmput(mm);
2948 out:
2949 kfd_unref_process(p);
2950
2951 if (r == -EAGAIN) {
2952 pr_debug("recover vm fault later\n");
2953 amdgpu_gmc_filter_faults_remove(adev, addr, pasid);
2954 r = 0;
2955 }
2956 return r;
2957 }
2958
2959 void svm_range_list_fini(struct kfd_process *p)
2960 {
2961 struct svm_range *prange;
2962 struct svm_range *next;
2963
2964 pr_debug("pasid 0x%x svms 0x%p\n", p->pasid, &p->svms);
2965
2966 cancel_delayed_work_sync(&p->svms.restore_work);
2967
2968 /* Ensure deferred list work is finished before the process is destroyed */
2969 flush_work(&p->svms.deferred_list_work);
2970
2971 /*
2972 * Ensure no retry fault comes in afterwards, as the page fault handler will
2973 * not find the kfd process and take the mm lock to recover the fault.
2974 */
2975 atomic_inc(&p->svms.drain_pagefaults);
2976 svm_range_drain_retry_fault(&p->svms);
2977
2978 list_for_each_entry_safe(prange, next, &p->svms.list, list) {
2979 svm_range_unlink(prange);
2980 svm_range_remove_notifier(prange);
2981 svm_range_free(prange, true);
2982 }
2983
2984 mutex_destroy(&p->svms.lock);
2985
2986 pr_debug("pasid 0x%x svms 0x%p done\n", p->pasid, &p->svms);
2987 }
2988
2989 int svm_range_list_init(struct kfd_process *p)
2990 {
2991 struct svm_range_list *svms = &p->svms;
2992 int i;
2993
2994 svms->objects = RB_ROOT_CACHED;
2995 mutex_init(&svms->lock);
2996 INIT_LIST_HEAD(&svms->list);
2997 atomic_set(&svms->evicted_ranges, 0);
2998 atomic_set(&svms->drain_pagefaults, 0);
2999 INIT_DELAYED_WORK(&svms->restore_work, svm_range_restore_work);
3000 INIT_WORK(&svms->deferred_list_work, svm_range_deferred_list_work);
3001 INIT_LIST_HEAD(&svms->deferred_range_list);
3002 INIT_LIST_HEAD(&svms->criu_svm_metadata_list);
3003 spin_lock_init(&svms->deferred_list_lock);
3004
3005 for (i = 0; i < p->n_pdds; i++)
3006 if (KFD_IS_SVM_API_SUPPORTED(p->pdds[i]->dev))
3007 bitmap_set(svms->bitmap_supported, i, 1);
3008
3009 return 0;
3010 }
3011
3012 /**
3013 * svm_range_check_vm - check if the virtual address range is mapped already
3014 * @p: current kfd_process
3015 * @start: range start address, in pages
3016 * @last: range last address, in pages
3017 * @bo_s: mapping start address in pages if the address range is already mapped
3018 * @bo_l: mapping last address in pages if the address range is already mapped
3019 *
3020 * The purpose is to avoid virtual address ranges already allocated by
3021 * the kfd_ioctl_alloc_memory_of_gpu ioctl.
3022 * It looks at each pdd in the kfd_process.
3023 *
3024 * Context: Process context
3025 *
3026 * Return 0 - OK, if the range is not mapped.
3027 * Otherwise error code:
3028 * -EADDRINUSE - if the address is mapped already by kfd_ioctl_alloc_memory_of_gpu
3029 * -ERESTARTSYS - A wait for the buffer to become unreserved was interrupted by
3030 * a signal. Calling processes will be interrupted.
3031 */
3032 static int
3033 svm_range_check_vm(struct kfd_process *p, uint64_t start, uint64_t last,
3034 uint64_t *bo_s, uint64_t *bo_l)
3035 {
3036 struct amdgpu_bo_va_mapping *mapping;
3037 struct interval_tree_node *node;
3038 uint32_t i;
3039 int r;
3040
3041 for (i = 0; i < p->n_pdds; i++) {
3042 struct amdgpu_vm *vm;
3043
3044 if (!p->pdds[i]->drm_priv)
3045 continue;
3046
3047 vm = drm_priv_to_vm(p->pdds[i]->drm_priv);
3048 r = amdgpu_bo_reserve(vm->root.bo, false);
3049 if (r)
3050 return r;
3051
3052 node = interval_tree_iter_first(&vm->va, start, last);
3053 if (node) {
3054 pr_debug("range [0x%llx 0x%llx] already TTM mapped\n",
3055 start, last);
3056 mapping = container_of((struct rb_node *)node,
3057 struct amdgpu_bo_va_mapping, rb);
3058 if (bo_s && bo_l) {
3059 *bo_s = mapping->start;
3060 *bo_l = mapping->last;
3061 }
3062 amdgpu_bo_unreserve(vm->root.bo);
3063 return -EADDRINUSE;
3064 }
3065 amdgpu_bo_unreserve(vm->root.bo);
3066 }
3067
3068 return 0;
3069 }
3070
3071 /**
3072 * svm_range_is_valid - check if the virtual address range is valid
3073 * @p: current kfd_process
3074 * @start: range start address, in pages
3075 * @size: range size, in pages
3076 *
3077 * A valid virtual address range means it belongs to one or more VMAs
3078 *
3079 * Context: Process context
3080 *
3081 * Return:
3082 * 0 - OK, otherwise error code
3083 */
3084 static int
3085 svm_range_is_valid(struct kfd_process *p, uint64_t start, uint64_t size)
3086 {
3087 const unsigned long device_vma = VM_IO | VM_PFNMAP | VM_MIXEDMAP;
3088 struct vm_area_struct *vma;
3089 unsigned long end;
3090 unsigned long start_unchg = start;
3091
3092 start <<= PAGE_SHIFT;
3093 end = start + (size << PAGE_SHIFT);
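/* Every page in [start, end) must be covered by a VMA, and none of the
 * VMAs may be a device mapping (VM_IO/VM_PFNMAP/VM_MIXEDMAP).
 */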
3094 do {
3095 vma = find_vma(p->mm, start);
3096 if (!vma || start < vma->vm_start ||
3097 (vma->vm_flags & device_vma))
3098 return -EFAULT;
3099 start = min(end, vma->vm_end);
3100 } while (start < end);
3101
3102 return svm_range_check_vm(p, start_unchg, (end - 1) >> PAGE_SHIFT, NULL,
3103 NULL);
3104 }
3105
3106 /**
3107 * svm_range_best_prefetch_location - decide the best prefetch location
3108 * @prange: svm range structure
3109 *
3110 * For xnack off:
3111 * If the range maps to a single GPU, the best prefetch location is
3112 * prefetch_loc, which can be CPU or GPU.
3113 *
3114 * If the range is ACCESS or ACCESS_IN_PLACE by mGPUs, only if the mGPU
3115 * connection is on the same XGMI hive, the best prefetch location is the
3116 * prefetch_loc GPU, otherwise the best prefetch location is always CPU,
3117 * because a GPU cannot have a coherent mapping of the VRAM of other GPUs
3118 * even with a large-BAR PCIe connection.
3119 *
3120 * For xnack on:
3121 * If the range is not ACCESS_IN_PLACE by mGPUs, the best prefetch location is
3122 * prefetch_loc; other GPU access will generate vm faults and trigger migration.
3123 *
3124 * If the range is ACCESS_IN_PLACE by mGPUs, only if the mGPU connection is on
3125 * the same XGMI hive, the best prefetch location is the prefetch_loc GPU,
3126 * otherwise the best prefetch location is always CPU.
3127 *
3128 * Context: Process context
3129 *
3130 * Return:
3131 * 0 for CPU or GPU id
3132 */
3132 static uint32_t
3133 svm_range_best_prefetch_location(struct svm_range *prange)
3134 {
3135 DECLARE_BITMAP(bitmap, MAX_GPU_INSTANCE);
3136 uint32_t best_loc = prange->prefetch_loc;
3137 struct kfd_process_device *pdd;
3138 struct amdgpu_device *bo_adev;
3139 struct kfd_process *p;
3140 uint32_t gpuidx;
3141
3142 p = container_of(prange->svms, struct kfd_process, svms);
3143
3144 if (!best_loc || best_loc == KFD_IOCTL_SVM_LOCATION_UNDEFINED)
3145 goto out;
3146
3147 bo_adev = svm_range_get_adev_by_id(prange, best_loc);
3148 if (!bo_adev) {
3149 WARN_ONCE(1, "failed to get device by id 0x%x\n", best_loc);
3150 best_loc = 0;
3151 goto out;
3152 }
3153
3154 if (p->xnack_enabled)
3155 bitmap_copy(bitmap, prange->bitmap_aip, MAX_GPU_INSTANCE);
3156 else
3157 bitmap_or(bitmap, prange->bitmap_access, prange->bitmap_aip,
3158 MAX_GPU_INSTANCE);
3159
3160 for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) {
3161 pdd = kfd_process_device_from_gpuidx(p, gpuidx);
3162 if (!pdd) {
3163 pr_debug("failed to get device by idx 0x%x\n", gpuidx);
3164 continue;
3165 }
3166
3167 if (pdd->dev->adev == bo_adev)
3168 continue;
3169
3170 if (!amdgpu_xgmi_same_hive(pdd->dev->adev, bo_adev)) {
3171 best_loc = 0;
3172 break;
3173 }
3174 }
3175
3176 out:
3177 pr_debug("xnack %d svms 0x%p [0x%lx 0x%lx] best loc 0x%x\n",
3178 p->xnack_enabled, &p->svms, prange->start, prange->last,
3179 best_loc);
3180
3181 return best_loc;
3182 }
3183
3184 /* FIXME: This is a workaround for a page locking bug when some pages are
3185 * invalid during migration to VRAM
3186 */
3187 void svm_range_prefault(struct svm_range *prange, struct mm_struct *mm,
3188 void *owner)
3189 {
3190 struct hmm_range *hmm_range;
3191 int r;
3192
3193 if (prange->validated_once)
3194 return;
3195
3196 r = amdgpu_hmm_range_get_pages(&prange->notifier, mm, NULL,
3197 prange->start << PAGE_SHIFT,
3198 prange->npages, &hmm_range,
3199 false, true, owner);
3200 if (!r) {
3201 amdgpu_hmm_range_get_pages_done(hmm_range);
3202 prange->validated_once = true;
3203 }
3204 }
3205
3206 /* svm_range_trigger_migration - migrate svm range to best prefetch location
3207 * @mm: the process mm structure
3208 * @prange: svm range structure
3209 * @migrated: output, true if migration is triggered
3210 *
3211 * If the range prefetch_loc is a GPU and the actual location is system memory,
3212 * migrate the range from system memory to VRAM of that GPU.
3213 * If the range prefetch_loc is system memory and the actual location is GPU
3214 * VRAM, migrate the range from VRAM back to system memory.
3215 *
3216 * Context: Process context, caller holds mmap read lock, svms lock and
3217 * prange->migrate_mutex
3218 *
3219 * Return:
3220 * 0 - OK, otherwise - error code of migration
3221 */
3230 static int
3231 svm_range_trigger_migration(struct mm_struct *mm, struct svm_range *prange,
3232 bool *migrated)
3233 {
3234 uint32_t best_loc;
3235 int r = 0;
3236
3237 *migrated = false;
3238 best_loc = svm_range_best_prefetch_location(prange);
3239
3240 if (best_loc == KFD_IOCTL_SVM_LOCATION_UNDEFINED ||
3241 best_loc == prange->actual_loc)
3242 return 0;
3243
3244 if (!best_loc) {
3245 r = svm_migrate_vram_to_ram(prange, mm, KFD_MIGRATE_TRIGGER_PREFETCH);
3246 *migrated = !r;
3247 return r;
3248 }
3249
3250 r = svm_migrate_to_vram(prange, best_loc, mm, KFD_MIGRATE_TRIGGER_PREFETCH);
3251 *migrated = !r;
3252
3253 return r;
3254 }
3255
3256 int svm_range_schedule_evict_svm_bo(struct amdgpu_amdkfd_fence *fence)
3257 {
3258 if (!fence)
3259 return -EINVAL;
3260
3261 if (dma_fence_is_signaled(&fence->base))
3262 return 0;
3263
3264 if (fence->svm_bo) {
3265 WRITE_ONCE(fence->svm_bo->evicting, 1);
3266 schedule_work(&fence->svm_bo->eviction_work);
3267 }
3268
3269 return 0;
3270 }
3271
3272 static void svm_range_evict_svm_bo_worker(struct work_struct *work)
3273 {
3274 struct svm_range_bo *svm_bo;
3275 struct mm_struct *mm;
3276 int r = 0;
3277
3278 svm_bo = container_of(work, struct svm_range_bo, eviction_work);
3279 if (!svm_bo_ref_unless_zero(svm_bo))
3280 return;
3281
3282 if (mmget_not_zero(svm_bo->eviction_fence->mm)) {
3283 mm = svm_bo->eviction_fence->mm;
3284 } else {
3285 svm_range_bo_unref(svm_bo);
3286 return;
3287 }
3288
3289 mmap_read_lock(mm);
3290 spin_lock(&svm_bo->list_lock);
3291 while (!list_empty(&svm_bo->range_list) && !r) {
3292 struct svm_range *prange =
3293 list_first_entry(&svm_bo->range_list,
3294 struct svm_range, svm_bo_list);
3295 int retries = 3;
3296
3297 list_del_init(&prange->svm_bo_list);
3298 spin_unlock(&svm_bo->list_lock);
3299
3300 pr_debug("svms 0x%p [0x%lx 0x%lx]\n", prange->svms,
3301 prange->start, prange->last);
3302
3303 mutex_lock(&prange->migrate_mutex);
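/* Migrate VRAM pages back to system memory; retry a few times in case
 * some pages could not be migrated on the first pass.
 */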
3304 do {
3305 r = svm_migrate_vram_to_ram(prange, mm,
3306 KFD_MIGRATE_TRIGGER_TTM_EVICTION);
3307 } while (!r && prange->actual_loc && --retries);
3308
3309 if (!r && prange->actual_loc)
3310 pr_info_once("Migration failed during eviction");
3311
3312 if (!prange->actual_loc) {
3313 mutex_lock(&prange->lock);
3314 prange->svm_bo = NULL;
3315 mutex_unlock(&prange->lock);
3316 }
3317 mutex_unlock(&prange->migrate_mutex);
3318
3319 spin_lock(&svm_bo->list_lock);
3320 }
3321 spin_unlock(&svm_bo->list_lock);
3322 mmap_read_unlock(mm);
3323 mmput(mm);
3324
3325 dma_fence_signal(&svm_bo->eviction_fence->base);
3326
3327 /* This is the last reference to svm_bo, after svm_range_vram_node_free
3328 * has been called in svm_migrate_vram_to_ram
3329 */
3330 WARN_ONCE(!r && kref_read(&svm_bo->kref) != 1, "This was not the last reference\n");
3331 svm_range_bo_unref(svm_bo);
3332 }
3333
3334 static int
3335 svm_range_set_attr(struct kfd_process *p, struct mm_struct *mm,
3336 uint64_t start, uint64_t size, uint32_t nattr,
3337 struct kfd_ioctl_svm_attribute *attrs)
3338 {
3339 struct amdkfd_process_info *process_info = p->kgd_process_info;
3340 struct list_head update_list;
3341 struct list_head insert_list;
3342 struct list_head remove_list;
3343 struct svm_range_list *svms;
3344 struct svm_range *prange;
3345 struct svm_range *next;
3346 bool update_mapping = false;
3347 bool flush_tlb;
3348 int r = 0;
3349
3350 pr_debug("pasid 0x%x svms 0x%p [0x%llx 0x%llx] pages 0x%llx\n",
3351 p->pasid, &p->svms, start, start + size - 1, size);
3352
3353 r = svm_range_check_attr(p, nattr, attrs);
3354 if (r)
3355 return r;
3356
3357 svms = &p->svms;
3358
3359 mutex_lock(&process_info->lock);
3360
3361 svm_range_list_lock_and_flush_work(svms, mm);
3362
3363 r = svm_range_is_valid(p, start, size);
3364 if (r) {
3365 pr_debug("invalid range r=%d\n", r);
3366 mmap_write_unlock(mm);
3367 goto out;
3368 }
3369
3370 mutex_lock(&svms->lock);
3371
3372 /* Add new range and split existing ranges as needed */
3373 r = svm_range_add(p, start, size, nattr, attrs, &update_list,
3374 &insert_list, &remove_list);
3375 if (r) {
3376 mutex_unlock(&svms->lock);
3377 mmap_write_unlock(mm);
3378 goto out;
3379 }
3380
3381 list_for_each_entry_safe(prange, next, &insert_list, list) {
3382 svm_range_add_to_svms(prange);
3383 svm_range_add_notifier_locked(mm, prange);
3384 }
3385 list_for_each_entry(prange, &update_list, update_list) {
3386 svm_range_apply_attrs(p, prange, nattr, attrs, &update_mapping);
3387 /* TODO: unmap ranges from GPUs that lost access */
3388 }
3389 list_for_each_entry_safe(prange, next, &remove_list, update_list) {
3390 pr_debug("unlink old 0x%p prange 0x%p [0x%lx 0x%lx]\n",
3391 prange->svms, prange, prange->start,
3392 prange->last);
3393 svm_range_unlink(prange);
3394 svm_range_remove_notifier(prange);
3395 svm_range_free(prange, false);
3396 }
3397
3398 mmap_write_downgrade(mm);
3399 /* Trigger migrations and revalidate and map to GPUs as needed. If
3400 * this fails we may be left with partially completed actions. There
3401 * is no clean way of rolling back to the previous state in such a
3402 * case because the rollback wouldn't be guaranteed to work either.
3403 */
3404 list_for_each_entry(prange, &update_list, update_list) {
3405 bool migrated;
3406
3407 mutex_lock(&prange->migrate_mutex);
3408
3409 r = svm_range_trigger_migration(mm, prange, &migrated);
3410 if (r)
3411 goto out_unlock_range;
3412
3413 if (migrated && (!p->xnack_enabled ||
3414 (prange->flags & KFD_IOCTL_SVM_FLAG_GPU_ALWAYS_MAPPED)) &&
3415 prange->mapped_to_gpu) {
3416 pr_debug("restore_work will update mappings of GPUs\n");
3417 mutex_unlock(&prange->migrate_mutex);
3418 continue;
3419 }
3420
3421 if (!migrated && !update_mapping) {
3422 mutex_unlock(&prange->migrate_mutex);
3423 continue;
3424 }
3425
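/* A TLB flush is only needed when an existing GPU mapping is updated
 * in place; a migration already invalidated the old mappings.
 */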
3426 flush_tlb = !migrated && update_mapping && prange->mapped_to_gpu;
3427
3428 r = svm_range_validate_and_map(mm, prange, MAX_GPU_INSTANCE,
3429 true, true, flush_tlb);
3430 if (r)
3431 pr_debug("failed %d to map svm range\n", r);
3432
3433 out_unlock_range:
3434 mutex_unlock(&prange->migrate_mutex);
3435 if (r)
3436 break;
3437 }
3438
3439 svm_range_debug_dump(svms);
3440
3441 mutex_unlock(&svms->lock);
3442 mmap_read_unlock(mm);
3443 out:
3444 mutex_unlock(&process_info->lock);
3445
3446 pr_debug("pasid 0x%x svms 0x%p [0x%llx 0x%llx] done, r=%d\n", p->pasid,
3447 &p->svms, start, start + size - 1, r);
3448
3449 return r;
3450 }
3451
3452 static int
3453 svm_range_get_attr(struct kfd_process *p, struct mm_struct *mm,
3454 uint64_t start, uint64_t size, uint32_t nattr,
3455 struct kfd_ioctl_svm_attribute *attrs)
3456 {
3457 DECLARE_BITMAP(bitmap_access, MAX_GPU_INSTANCE);
3458 DECLARE_BITMAP(bitmap_aip, MAX_GPU_INSTANCE);
3459 bool get_preferred_loc = false;
3460 bool get_prefetch_loc = false;
3461 bool get_granularity = false;
3462 bool get_accessible = false;
3463 bool get_flags = false;
3464 uint64_t last = start + size - 1UL;
3465 uint8_t granularity = 0xff;
3466 struct interval_tree_node *node;
3467 struct svm_range_list *svms;
3468 struct svm_range *prange;
3469 uint32_t prefetch_loc = KFD_IOCTL_SVM_LOCATION_UNDEFINED;
3470 uint32_t location = KFD_IOCTL_SVM_LOCATION_UNDEFINED;
3471 uint32_t flags_and = 0xffffffff;
3472 uint32_t flags_or = 0;
3473 int gpuidx;
3474 uint32_t i;
3475 int r = 0;
3476
3477 pr_debug("svms 0x%p [0x%llx 0x%llx] nattr 0x%x\n", &p->svms, start,
3478 start + size - 1, nattr);
3479
3480 /* Flush pending deferred work to avoid racing with deferred actions from
3481 * previous memory map changes (e.g. munmap). Concurrent memory map changes
3482 * can still race with get_attr because we don't hold the mmap lock. But that
3483 * would be a race condition in the application anyway, and undefined
3484 * behaviour is acceptable in that case.
3485 */
3486 flush_work(&p->svms.deferred_list_work);
3487
3488 mmap_read_lock(mm);
3489 r = svm_range_is_valid(p, start, size);
3490 mmap_read_unlock(mm);
3491 if (r) {
3492 pr_debug("invalid range r=%d\n", r);
3493 return r;
3494 }
3495
3496 for (i = 0; i < nattr; i++) {
3497 switch (attrs[i].type) {
3498 case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC:
3499 get_preferred_loc = true;
3500 break;
3501 case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC:
3502 get_prefetch_loc = true;
3503 break;
3504 case KFD_IOCTL_SVM_ATTR_ACCESS:
3505 get_accessible = true;
3506 break;
3507 case KFD_IOCTL_SVM_ATTR_SET_FLAGS:
3508 case KFD_IOCTL_SVM_ATTR_CLR_FLAGS:
3509 get_flags = true;
3510 break;
3511 case KFD_IOCTL_SVM_ATTR_GRANULARITY:
3512 get_granularity = true;
3513 break;
3514 case KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE:
3515 case KFD_IOCTL_SVM_ATTR_NO_ACCESS:
3516 fallthrough;
3517 default:
3518 pr_debug("get invalid attr type 0x%x\n", attrs[i].type);
3519 return -EINVAL;
3520 }
3521 }
3522
3523 svms = &p->svms;
3524
3525 mutex_lock(&svms->lock);
3526
3527 node = interval_tree_iter_first(&svms->objects, start, last);
3528 if (!node) {
3529 pr_debug("range attrs not found, return default values\n");
3530 svm_range_set_default_attributes(&location, &prefetch_loc,
3531 &granularity, &flags_and);
3532 flags_or = flags_and;
3533 if (p->xnack_enabled)
3534 bitmap_copy(bitmap_access, svms->bitmap_supported,
3535 MAX_GPU_INSTANCE);
3536 else
3537 bitmap_zero(bitmap_access, MAX_GPU_INSTANCE);
3538 bitmap_zero(bitmap_aip, MAX_GPU_INSTANCE);
3539 goto fill_values;
3540 }
3541 bitmap_copy(bitmap_access, svms->bitmap_supported, MAX_GPU_INSTANCE);
3542 bitmap_copy(bitmap_aip, svms->bitmap_supported, MAX_GPU_INSTANCE);
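/* Aggregate the attributes of every range overlapping the query
 * interval: locations must agree or become UNDEFINED, access bitmaps
 * are intersected, flags are combined with AND/OR, and the smallest
 * granularity wins.
 */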
3543
3544 while (node) {
3545 struct interval_tree_node *next;
3546
3547 prange = container_of(node, struct svm_range, it_node);
3548 next = interval_tree_iter_next(node, start, last);
3549
3550 if (get_preferred_loc) {
3551 if (prange->preferred_loc ==
3552 KFD_IOCTL_SVM_LOCATION_UNDEFINED ||
3553 (location != KFD_IOCTL_SVM_LOCATION_UNDEFINED &&
3554 location != prange->preferred_loc)) {
3555 location = KFD_IOCTL_SVM_LOCATION_UNDEFINED;
3556 get_preferred_loc = false;
3557 } else {
3558 location = prange->preferred_loc;
3559 }
3560 }
3561 if (get_prefetch_loc) {
3562 if (prange->prefetch_loc ==
3563 KFD_IOCTL_SVM_LOCATION_UNDEFINED ||
3564 (prefetch_loc != KFD_IOCTL_SVM_LOCATION_UNDEFINED &&
3565 prefetch_loc != prange->prefetch_loc)) {
3566 prefetch_loc = KFD_IOCTL_SVM_LOCATION_UNDEFINED;
3567 get_prefetch_loc = false;
3568 } else {
3569 prefetch_loc = prange->prefetch_loc;
3570 }
3571 }
3572 if (get_accessible) {
3573 bitmap_and(bitmap_access, bitmap_access,
3574 prange->bitmap_access, MAX_GPU_INSTANCE);
3575 bitmap_and(bitmap_aip, bitmap_aip,
3576 prange->bitmap_aip, MAX_GPU_INSTANCE);
3577 }
3578 if (get_flags) {
3579 flags_and &= prange->flags;
3580 flags_or |= prange->flags;
3581 }
3582
3583 if (get_granularity && prange->granularity < granularity)
3584 granularity = prange->granularity;
3585
3586 node = next;
3587 }
3588 fill_values:
3589 mutex_unlock(&svms->lock);
3590
3591 for (i = 0; i < nattr; i++) {
3592 switch (attrs[i].type) {
3593 case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC:
3594 attrs[i].value = location;
3595 break;
3596 case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC:
3597 attrs[i].value = prefetch_loc;
3598 break;
3599 case KFD_IOCTL_SVM_ATTR_ACCESS:
3600 gpuidx = kfd_process_gpuidx_from_gpuid(p,
3601 attrs[i].value);
3602 if (gpuidx < 0) {
3603 pr_debug("invalid gpuid %x\n", attrs[i].value);
3604 return -EINVAL;
3605 }
3606 if (test_bit(gpuidx, bitmap_access))
3607 attrs[i].type = KFD_IOCTL_SVM_ATTR_ACCESS;
3608 else if (test_bit(gpuidx, bitmap_aip))
3609 attrs[i].type =
3610 KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE;
3611 else
3612 attrs[i].type = KFD_IOCTL_SVM_ATTR_NO_ACCESS;
3613 break;
3614 case KFD_IOCTL_SVM_ATTR_SET_FLAGS:
3615 attrs[i].value = flags_and;
3616 break;
3617 case KFD_IOCTL_SVM_ATTR_CLR_FLAGS:
3618 attrs[i].value = ~flags_or;
3619 break;
3620 case KFD_IOCTL_SVM_ATTR_GRANULARITY:
3621 attrs[i].value = (uint32_t)granularity;
3622 break;
3623 }
3624 }
3625
3626 return 0;
3627 }
3628
3629 int kfd_criu_resume_svm(struct kfd_process *p)
3630 {
3631 struct kfd_ioctl_svm_attribute *set_attr_new, *set_attr = NULL;
3632 int nattr_common = 4, nattr_accessibility = 1;
3633 struct criu_svm_metadata *criu_svm_md = NULL;
3634 struct svm_range_list *svms = &p->svms;
3635 struct criu_svm_metadata *next = NULL;
3636 uint32_t set_flags = 0xffffffff;
3637 int i, j, num_attrs, ret = 0;
3638 uint64_t set_attr_size;
3639 struct mm_struct *mm;
3640
3641 if (list_empty(&svms->criu_svm_metadata_list)) {
3642 pr_debug("No SVM data from CRIU restore stage 2\n");
3643 return ret;
3644 }
3645
3646 mm = get_task_mm(p->lead_thread);
3647 if (!mm) {
3648 pr_err("failed to get mm for the target process\n");
3649 return -ESRCH;
3650 }
3651
3652 num_attrs = nattr_common + (nattr_accessibility * p->n_pdds);
3653
3654 i = j = 0;
3655 list_for_each_entry(criu_svm_md, &svms->criu_svm_metadata_list, list) {
3656 pr_debug("criu_svm_md[%d]\n\tstart: 0x%llx size: 0x%llx (npages)\n",
3657 i, criu_svm_md->data.start_addr, criu_svm_md->data.size);
3658
3659 for (j = 0; j < num_attrs; j++) {
3660 pr_debug("\ncriu_svm_md[%d]->attrs[%d].type : 0x%x\ncriu_svm_md[%d]->attrs[%d].value : 0x%x\n",
3661 i, j, criu_svm_md->data.attrs[j].type,
3662 i, j, criu_svm_md->data.attrs[j].value);
3663 switch (criu_svm_md->data.attrs[j].type) {
3664 /* The prefetch location checkpointed via svm_range_get_attr can be
3665 * KFD_IOCTL_SVM_LOCATION_UNDEFINED, e.g. when the attribute differed
3666 * across the saved range. UNDEFINED is not accepted by
3667 * svm_range_set_attr for PREFETCH_LOC, so such an entry is replaced
3668 * with a SET_FLAGS attribute of value 0, which is a no-op, to keep
3669 * the attribute count unchanged for the restore call below.
3670 */
3674 case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC:
3675 if (criu_svm_md->data.attrs[j].value ==
3676 KFD_IOCTL_SVM_LOCATION_UNDEFINED) {
3677 criu_svm_md->data.attrs[j].type =
3678 KFD_IOCTL_SVM_ATTR_SET_FLAGS;
3679 criu_svm_md->data.attrs[j].value = 0;
3680 }
3681 break;
3682 case KFD_IOCTL_SVM_ATTR_SET_FLAGS:
3683 set_flags = criu_svm_md->data.attrs[j].value;
3684 break;
3685 default:
3686 break;
3687 }
3688 }
3689
3690 /* CLR_FLAGS is not available via get_attr during checkpoint but
3691 * it needs to be inserted before restoring the ranges, so
3692 * allocate extra space for it before calling set_attr
3693 */
3694 set_attr_size = sizeof(struct kfd_ioctl_svm_attribute) *
3695 (num_attrs + 1);
3696 set_attr_new = krealloc(set_attr, set_attr_size,
3697 GFP_KERNEL);
3698 if (!set_attr_new) {
3699 ret = -ENOMEM;
3700 goto exit;
3701 }
3702 set_attr = set_attr_new;
3703
3704 memcpy(set_attr, criu_svm_md->data.attrs, num_attrs *
3705 sizeof(struct kfd_ioctl_svm_attribute));
3706 set_attr[num_attrs].type = KFD_IOCTL_SVM_ATTR_CLR_FLAGS;
3707 set_attr[num_attrs].value = ~set_flags;
3708
3709 ret = svm_range_set_attr(p, mm, criu_svm_md->data.start_addr,
3710 criu_svm_md->data.size, num_attrs + 1,
3711 set_attr);
3712 if (ret) {
3713 pr_err("CRIU: failed to set range attributes\n");
3714 goto exit;
3715 }
3716
3717 i++;
3718 }
3719 exit:
3720 kfree(set_attr);
3721 list_for_each_entry_safe(criu_svm_md, next, &svms->criu_svm_metadata_list, list) {
3722 pr_debug("freeing criu_svm_md[]\n\tstart: 0x%llx\n",
3723 criu_svm_md->data.start_addr);
3724 kfree(criu_svm_md);
3725 }
3726
3727 mmput(mm);
3728 return ret;
3729
3730 }
3731
3732 int kfd_criu_restore_svm(struct kfd_process *p,
3733 uint8_t __user *user_priv_ptr,
3734 uint64_t *priv_data_offset,
3735 uint64_t max_priv_data_size)
3736 {
3737 uint64_t svm_priv_data_size, svm_object_md_size, svm_attrs_size;
3738 int nattr_common = 4, nattr_accessibility = 1;
3739 struct criu_svm_metadata *criu_svm_md = NULL;
3740 struct svm_range_list *svms = &p->svms;
3741 uint32_t num_devices;
3742 int ret = 0;
3743
3744 num_devices = p->n_pdds;
3745
3746 /* Handle one SVM range object at a time. The number of gpus is
3747 * assumed to be the same on the restored node; checking must be done
3748 * while evaluating the topology earlier.
3749 */
3750 svm_attrs_size = sizeof(struct kfd_ioctl_svm_attribute) *
3751 (nattr_common + nattr_accessibility * num_devices);
3752 svm_object_md_size = sizeof(struct criu_svm_metadata) + svm_attrs_size;
3753
3754 svm_priv_data_size = sizeof(struct kfd_criu_svm_range_priv_data) +
3755 svm_attrs_size;
3756
3757 criu_svm_md = kzalloc(svm_object_md_size, GFP_KERNEL);
3758 if (!criu_svm_md) {
3759 pr_err("failed to allocate memory to store svm metadata\n");
3760 return -ENOMEM;
3761 }
3762 if (*priv_data_offset + svm_priv_data_size > max_priv_data_size) {
3763 ret = -EINVAL;
3764 goto exit;
3765 }
3766
3767 ret = copy_from_user(&criu_svm_md->data, user_priv_ptr + *priv_data_offset,
3768 svm_priv_data_size);
3769 if (ret) {
3770 ret = -EFAULT;
3771 goto exit;
3772 }
3773 *priv_data_offset += svm_priv_data_size;
3774
3775 list_add_tail(&criu_svm_md->list, &svms->criu_svm_metadata_list);
3776
3777 return 0;
3778
3779
3780 exit:
3781 kfree(criu_svm_md);
3782 return ret;
3783 }
3784
3785 int svm_range_get_info(struct kfd_process *p, uint32_t *num_svm_ranges,
3786 uint64_t *svm_priv_data_size)
3787 {
3788 uint64_t total_size, accessibility_size, common_attr_size;
3789 int nattr_common = 4, nattr_accessibility = 1;
3790 int num_devices = p->n_pdds;
3791 struct svm_range_list *svms;
3792 struct svm_range *prange;
3793 uint32_t count = 0;
3794
3795 *svm_priv_data_size = 0;
3796
3797 svms = &p->svms;
3798 if (!svms)
3799 return -EINVAL;
3800
3801 mutex_lock(&svms->lock);
3802 list_for_each_entry(prange, &svms->list, list) {
3803 pr_debug("prange: 0x%p start: 0x%lx\t npages: 0x%llx\t end: 0x%llx\n",
3804 prange, prange->start, prange->npages,
3805 prange->start + prange->npages - 1);
3806 count++;
3807 }
3808 mutex_unlock(&svms->lock);
3809
3810 *num_svm_ranges = count;
3811
3812 /* Only the accessibility attributes need to be queried for all the gpus
3813 * individually, the remaining ones span the entire process regardless
3814 * of the various gpu nodes. Of the remaining attributes,
3815 * KFD_IOCTL_SVM_ATTR_CLR_FLAGS need not be saved.
3816 *
3817 * KFD_IOCTL_SVM_ATTR_PREFERRED_LOC
3818 * KFD_IOCTL_SVM_ATTR_PREFETCH_LOC
3819 * KFD_IOCTL_SVM_ATTR_SET_FLAGS
3820 * KFD_IOCTL_SVM_ATTR_GRANULARITY
3821 *
3822 * ** ACCESSIBILITY ATTRIBUTES **
3823 * (Considered as one, type is altered during query, value is gpuid)
3824 * KFD_IOCTL_SVM_ATTR_ACCESS
3825 * KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE / NO_ACCESS
3826 */
3827 if (*num_svm_ranges > 0) {
3828 common_attr_size = sizeof(struct kfd_ioctl_svm_attribute) *
3829 nattr_common;
3830 accessibility_size = sizeof(struct kfd_ioctl_svm_attribute) *
3831 nattr_accessibility * num_devices;
3832
3833 total_size = sizeof(struct kfd_criu_svm_range_priv_data) +
3834 common_attr_size + accessibility_size;
3835
3836 *svm_priv_data_size = *num_svm_ranges * total_size;
3837 }
3838
3839 pr_debug("num_svm_ranges %u total_priv_size %llu\n", *num_svm_ranges,
3840 *svm_priv_data_size);
3841 return 0;
3842 }
3843
3844 int kfd_criu_checkpoint_svm(struct kfd_process *p,
3845 uint8_t __user *user_priv_data,
3846 uint64_t *priv_data_offset)
3847 {
3848 struct kfd_criu_svm_range_priv_data *svm_priv = NULL;
3849 struct kfd_ioctl_svm_attribute *query_attr = NULL;
3850 uint64_t svm_priv_data_size, query_attr_size = 0;
3851 int index, nattr_common = 4, ret = 0;
3852 struct svm_range_list *svms;
3853 int num_devices = p->n_pdds;
3854 struct svm_range *prange;
3855 struct mm_struct *mm;
3856
3857 svms = &p->svms;
3858 if (!svms)
3859 return -EINVAL;
3860
3861 mm = get_task_mm(p->lead_thread);
3862 if (!mm) {
3863 pr_err("failed to get mm for the target process\n");
3864 return -ESRCH;
3865 }
3866
3867 query_attr_size = sizeof(struct kfd_ioctl_svm_attribute) *
3868 (nattr_common + num_devices);
3869
3870 query_attr = kzalloc(query_attr_size, GFP_KERNEL);
3871 if (!query_attr) {
3872 ret = -ENOMEM;
3873 goto exit;
3874 }
3875
3876 query_attr[0].type = KFD_IOCTL_SVM_ATTR_PREFERRED_LOC;
3877 query_attr[1].type = KFD_IOCTL_SVM_ATTR_PREFETCH_LOC;
3878 query_attr[2].type = KFD_IOCTL_SVM_ATTR_SET_FLAGS;
3879 query_attr[3].type = KFD_IOCTL_SVM_ATTR_GRANULARITY;
3880
3881 for (index = 0; index < num_devices; index++) {
3882 struct kfd_process_device *pdd = p->pdds[index];
3883
3884 query_attr[index + nattr_common].type =
3885 KFD_IOCTL_SVM_ATTR_ACCESS;
3886 query_attr[index + nattr_common].value = pdd->user_gpu_id;
3887 }
3888
3889 svm_priv_data_size = sizeof(*svm_priv) + query_attr_size;
3890
3891 svm_priv = kzalloc(svm_priv_data_size, GFP_KERNEL);
3892 if (!svm_priv) {
3893 ret = -ENOMEM;
3894 goto exit_query;
3895 }
3896
3897 index = 0;
3898 list_for_each_entry(prange, &svms->list, list) {
3899
3900 svm_priv->object_type = KFD_CRIU_OBJECT_TYPE_SVM_RANGE;
3901 svm_priv->start_addr = prange->start;
3902 svm_priv->size = prange->npages;
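/* start_addr and size are stored in pages, the same units
 * svm_range_set_attr expects when the checkpoint is restored.
 */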
3903 memcpy(&svm_priv->attrs, query_attr, query_attr_size);
3904 pr_debug("CRIU: prange: 0x%p start: 0x%lx\t npages: 0x%llx end: 0x%llx\t size: 0x%llx\n",
3905 prange, prange->start, prange->npages,
3906 prange->start + prange->npages - 1,
3907 prange->npages * PAGE_SIZE);
3908
3909 ret = svm_range_get_attr(p, mm, svm_priv->start_addr,
3910 svm_priv->size,
3911 (nattr_common + num_devices),
3912 svm_priv->attrs);
3913 if (ret) {
3914 pr_err("CRIU: failed to obtain range attributes\n");
3915 goto exit_priv;
3916 }
3917
3918 if (copy_to_user(user_priv_data + *priv_data_offset, svm_priv,
3919 svm_priv_data_size)) {
3920 pr_err("Failed to copy svm priv to user\n");
3921 ret = -EFAULT;
3922 goto exit_priv;
3923 }
3924
3925 *priv_data_offset += svm_priv_data_size;
3926
3927 }
3928
3929
3930 exit_priv:
3931 kfree(svm_priv);
3932 exit_query:
3933 kfree(query_attr);
3934 exit:
3935 mmput(mm);
3936 return ret;
3937 }
3938
3939 int
3940 svm_ioctl(struct kfd_process *p, enum kfd_ioctl_svm_op op, uint64_t start,
3941 uint64_t size, uint32_t nattrs, struct kfd_ioctl_svm_attribute *attrs)
3942 {
3943 struct mm_struct *mm = current->mm;
3944 int r;
3945
3946 start >>= PAGE_SHIFT;
3947 size >>= PAGE_SHIFT;
3948
3949 switch (op) {
3950 case KFD_IOCTL_SVM_OP_SET_ATTR:
3951 r = svm_range_set_attr(p, mm, start, size, nattrs, attrs);
3952 break;
3953 case KFD_IOCTL_SVM_OP_GET_ATTR:
3954 r = svm_range_get_attr(p, mm, start, size, nattrs, attrs);
3955 break;
3956 default:
3957 r = -EINVAL;
3958 break;
3959 }
3960
3961 return r;
3962 }