0001 // SPDX-License-Identifier: GPL-2.0 OR MIT
0002 /*
0003  * Copyright 2020-2021 Advanced Micro Devices, Inc.
0004  *
0005  * Permission is hereby granted, free of charge, to any person obtaining a
0006  * copy of this software and associated documentation files (the "Software"),
0007  * to deal in the Software without restriction, including without limitation
0008  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
0009  * and/or sell copies of the Software, and to permit persons to whom the
0010  * Software is furnished to do so, subject to the following conditions:
0011  *
0012  * The above copyright notice and this permission notice shall be included in
0013  * all copies or substantial portions of the Software.
0014  *
0015  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
0016  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
0017  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
0018  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
0019  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
0020  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
0021  * OTHER DEALINGS IN THE SOFTWARE.
0022  */
0023 
0024 #include <linux/types.h>
0025 #include <linux/sched/task.h>
0026 #include "amdgpu_sync.h"
0027 #include "amdgpu_object.h"
0028 #include "amdgpu_vm.h"
0029 #include "amdgpu_mn.h"
0030 #include "amdgpu.h"
0031 #include "amdgpu_xgmi.h"
0032 #include "kfd_priv.h"
0033 #include "kfd_svm.h"
0034 #include "kfd_migrate.h"
0035 #include "kfd_smi_events.h"
0036 
0037 #ifdef dev_fmt
0038 #undef dev_fmt
0039 #endif
0040 #define dev_fmt(fmt) "kfd_svm: %s: " fmt, __func__
0041 
0042 #define AMDGPU_SVM_RANGE_RESTORE_DELAY_MS 1
0043 
0044 /* Long enough to ensure no retry fault comes after svm range is restored and
0045  * page table is updated.
0046  */
0047 #define AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING    (2UL * NSEC_PER_MSEC)
0048 
0049 /* A giant svm range is split into smaller ranges based on this value. It is
0050  * the minimum over all dGPUs/APUs of 1/32 of the VRAM size, clamped between
0051  * 2MB and 1GB and rounded to a power of two.
0052  */
0053 static uint64_t max_svm_range_pages;
0054 
0055 struct criu_svm_metadata {
0056     struct list_head list;
0057     struct kfd_criu_svm_range_priv_data data;
0058 };
0059 
0060 static void svm_range_evict_svm_bo_worker(struct work_struct *work);
0061 static bool
0062 svm_range_cpu_invalidate_pagetables(struct mmu_interval_notifier *mni,
0063                     const struct mmu_notifier_range *range,
0064                     unsigned long cur_seq);
0065 static int
0066 svm_range_check_vm(struct kfd_process *p, uint64_t start, uint64_t last,
0067            uint64_t *bo_s, uint64_t *bo_l);
0068 static const struct mmu_interval_notifier_ops svm_range_mn_ops = {
0069     .invalidate = svm_range_cpu_invalidate_pagetables,
0070 };
0071 
0072 /**
0073  * svm_range_unlink - unlink svm_range from lists and interval tree
0074  * @prange: svm range structure to be removed
0075  *
0076  * Remove the svm_range from the svms and svm_bo lists and the svms
0077  * interval tree.
0078  *
0079  * Context: The caller must hold svms->lock
0080  */
0081 static void svm_range_unlink(struct svm_range *prange)
0082 {
0083     pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx]\n", prange->svms,
0084          prange, prange->start, prange->last);
0085 
0086     if (prange->svm_bo) {
0087         spin_lock(&prange->svm_bo->list_lock);
0088         list_del(&prange->svm_bo_list);
0089         spin_unlock(&prange->svm_bo->list_lock);
0090     }
0091 
0092     list_del(&prange->list);
0093     if (prange->it_node.start != 0 && prange->it_node.last != 0)
0094         interval_tree_remove(&prange->it_node, &prange->svms->objects);
0095 }
0096 
0097 static void
0098 svm_range_add_notifier_locked(struct mm_struct *mm, struct svm_range *prange)
0099 {
0100     pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx]\n", prange->svms,
0101          prange, prange->start, prange->last);
0102 
0103     mmu_interval_notifier_insert_locked(&prange->notifier, mm,
0104                      prange->start << PAGE_SHIFT,
0105                      prange->npages << PAGE_SHIFT,
0106                      &svm_range_mn_ops);
0107 }
0108 
0109 /**
0110  * svm_range_add_to_svms - add svm range to svms
0111  * @prange: svm range structure to be added
0112  *
0113  * Add the svm range to the svms interval tree and linked list
0114  *
0115  * Context: The caller must hold svms->lock
0116  */
0117 static void svm_range_add_to_svms(struct svm_range *prange)
0118 {
0119     pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx]\n", prange->svms,
0120          prange, prange->start, prange->last);
0121 
0122     list_move_tail(&prange->list, &prange->svms->list);
0123     prange->it_node.start = prange->start;
0124     prange->it_node.last = prange->last;
0125     interval_tree_insert(&prange->it_node, &prange->svms->objects);
0126 }
0127 
0128 static void svm_range_remove_notifier(struct svm_range *prange)
0129 {
0130     pr_debug("remove notifier svms 0x%p prange 0x%p [0x%lx 0x%lx]\n",
0131          prange->svms, prange,
0132          prange->notifier.interval_tree.start >> PAGE_SHIFT,
0133          prange->notifier.interval_tree.last >> PAGE_SHIFT);
0134 
0135     if (prange->notifier.interval_tree.start != 0 &&
0136         prange->notifier.interval_tree.last != 0)
0137         mmu_interval_notifier_remove(&prange->notifier);
0138 }
0139 
0140 static bool
0141 svm_is_valid_dma_mapping_addr(struct device *dev, dma_addr_t dma_addr)
0142 {
0143     return dma_addr && !dma_mapping_error(dev, dma_addr) &&
0144            !(dma_addr & SVM_RANGE_VRAM_DOMAIN);
0145 }
0146 
0147 static int
0148 svm_range_dma_map_dev(struct amdgpu_device *adev, struct svm_range *prange,
0149               unsigned long offset, unsigned long npages,
0150               unsigned long *hmm_pfns, uint32_t gpuidx)
0151 {
0152     enum dma_data_direction dir = DMA_BIDIRECTIONAL;
0153     dma_addr_t *addr = prange->dma_addr[gpuidx];
0154     struct device *dev = adev->dev;
0155     struct page *page;
0156     int i, r;
0157 
0158     if (!addr) {
0159         addr = kvcalloc(prange->npages, sizeof(*addr), GFP_KERNEL);
0160         if (!addr)
0161             return -ENOMEM;
0162         prange->dma_addr[gpuidx] = addr;
0163     }
0164 
0165     addr += offset;
0166     for (i = 0; i < npages; i++) {
0167         if (svm_is_valid_dma_mapping_addr(dev, addr[i]))
0168             dma_unmap_page(dev, addr[i], PAGE_SIZE, dir);
0169 
0170         page = hmm_pfn_to_page(hmm_pfns[i]);
0171         if (is_zone_device_page(page)) {
0172             struct amdgpu_device *bo_adev =
0173                     amdgpu_ttm_adev(prange->svm_bo->bo->tbo.bdev);
0174 
0175             addr[i] = (hmm_pfns[i] << PAGE_SHIFT) +
0176                    bo_adev->vm_manager.vram_base_offset -
0177                    bo_adev->kfd.dev->pgmap.range.start;
0178             addr[i] |= SVM_RANGE_VRAM_DOMAIN;
0179             pr_debug_ratelimited("vram address: 0x%llx\n", addr[i]);
0180             continue;
0181         }
0182         addr[i] = dma_map_page(dev, page, 0, PAGE_SIZE, dir);
0183         r = dma_mapping_error(dev, addr[i]);
0184         if (r) {
0185             dev_err(dev, "failed %d dma_map_page\n", r);
0186             return r;
0187         }
0188         pr_debug_ratelimited("dma mapping 0x%llx for page addr 0x%lx\n",
0189                      addr[i] >> PAGE_SHIFT, page_to_pfn(page));
0190     }
0191     return 0;
0192 }
0193 
0194 static int
0195 svm_range_dma_map(struct svm_range *prange, unsigned long *bitmap,
0196           unsigned long offset, unsigned long npages,
0197           unsigned long *hmm_pfns)
0198 {
0199     struct kfd_process *p;
0200     uint32_t gpuidx;
0201     int r;
0202 
0203     p = container_of(prange->svms, struct kfd_process, svms);
0204 
0205     for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) {
0206         struct kfd_process_device *pdd;
0207 
0208         pr_debug("mapping to gpu idx 0x%x\n", gpuidx);
0209         pdd = kfd_process_device_from_gpuidx(p, gpuidx);
0210         if (!pdd) {
0211             pr_debug("failed to find device idx %d\n", gpuidx);
0212             return -EINVAL;
0213         }
0214 
0215         r = svm_range_dma_map_dev(pdd->dev->adev, prange, offset, npages,
0216                       hmm_pfns, gpuidx);
0217         if (r)
0218             break;
0219     }
0220 
0221     return r;
0222 }
0223 
0224 void svm_range_dma_unmap(struct device *dev, dma_addr_t *dma_addr,
0225              unsigned long offset, unsigned long npages)
0226 {
0227     enum dma_data_direction dir = DMA_BIDIRECTIONAL;
0228     int i;
0229 
0230     if (!dma_addr)
0231         return;
0232 
0233     for (i = offset; i < offset + npages; i++) {
0234         if (!svm_is_valid_dma_mapping_addr(dev, dma_addr[i]))
0235             continue;
0236         pr_debug_ratelimited("unmap 0x%llx\n", dma_addr[i] >> PAGE_SHIFT);
0237         dma_unmap_page(dev, dma_addr[i], PAGE_SIZE, dir);
0238         dma_addr[i] = 0;
0239     }
0240 }
0241 
0242 void svm_range_free_dma_mappings(struct svm_range *prange)
0243 {
0244     struct kfd_process_device *pdd;
0245     dma_addr_t *dma_addr;
0246     struct device *dev;
0247     struct kfd_process *p;
0248     uint32_t gpuidx;
0249 
0250     p = container_of(prange->svms, struct kfd_process, svms);
0251 
0252     for (gpuidx = 0; gpuidx < MAX_GPU_INSTANCE; gpuidx++) {
0253         dma_addr = prange->dma_addr[gpuidx];
0254         if (!dma_addr)
0255             continue;
0256 
0257         pdd = kfd_process_device_from_gpuidx(p, gpuidx);
0258         if (!pdd) {
0259             pr_debug("failed to find device idx %d\n", gpuidx);
0260             continue;
0261         }
0262         dev = &pdd->dev->pdev->dev;
0263         svm_range_dma_unmap(dev, dma_addr, 0, prange->npages);
0264         kvfree(dma_addr);
0265         prange->dma_addr[gpuidx] = NULL;
0266     }
0267 }
0268 
0269 static void svm_range_free(struct svm_range *prange, bool update_mem_usage)
0270 {
0271     uint64_t size = (prange->last - prange->start + 1) << PAGE_SHIFT;
0272     struct kfd_process *p = container_of(prange->svms, struct kfd_process, svms);
0273 
0274     pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx]\n", prange->svms, prange,
0275          prange->start, prange->last);
0276 
0277     svm_range_vram_node_free(prange);
0278     svm_range_free_dma_mappings(prange);
0279 
0280     if (update_mem_usage && !p->xnack_enabled) {
0281         pr_debug("unreserve mem limit: %lld\n", size);
0282         amdgpu_amdkfd_unreserve_mem_limit(NULL, size,
0283                     KFD_IOC_ALLOC_MEM_FLAGS_USERPTR);
0284     }
0285     mutex_destroy(&prange->lock);
0286     mutex_destroy(&prange->migrate_mutex);
0287     kfree(prange);
0288 }
0289 
0290 static void
0291 svm_range_set_default_attributes(int32_t *location, int32_t *prefetch_loc,
0292                  uint8_t *granularity, uint32_t *flags)
0293 {
0294     *location = KFD_IOCTL_SVM_LOCATION_UNDEFINED;
0295     *prefetch_loc = KFD_IOCTL_SVM_LOCATION_UNDEFINED;
0296     *granularity = 9;
0297     *flags =
0298         KFD_IOCTL_SVM_FLAG_HOST_ACCESS | KFD_IOCTL_SVM_FLAG_COHERENT;
0299 }
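     /* With a 4KB page size, the default granularity of 9 above corresponds to
      * 2^9 = 512 pages, i.e. a 2MB migration and mapping granule.
      */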
0300 
0301 static struct
0302 svm_range *svm_range_new(struct svm_range_list *svms, uint64_t start,
0303              uint64_t last, bool update_mem_usage)
0304 {
0305     uint64_t size = last - start + 1;
0306     struct svm_range *prange;
0307     struct kfd_process *p;
0308 
0309     prange = kzalloc(sizeof(*prange), GFP_KERNEL);
0310     if (!prange)
0311         return NULL;
0312 
0313     p = container_of(svms, struct kfd_process, svms);
0314     if (!p->xnack_enabled && update_mem_usage &&
0315         amdgpu_amdkfd_reserve_mem_limit(NULL, size << PAGE_SHIFT,
0316                         KFD_IOC_ALLOC_MEM_FLAGS_USERPTR)) {
0317         pr_info("SVM mapping failed, exceeds resident system memory limit\n");
0318         kfree(prange);
0319         return NULL;
0320     }
0321     prange->npages = size;
0322     prange->svms = svms;
0323     prange->start = start;
0324     prange->last = last;
0325     INIT_LIST_HEAD(&prange->list);
0326     INIT_LIST_HEAD(&prange->update_list);
0327     INIT_LIST_HEAD(&prange->svm_bo_list);
0328     INIT_LIST_HEAD(&prange->deferred_list);
0329     INIT_LIST_HEAD(&prange->child_list);
0330     atomic_set(&prange->invalid, 0);
0331     prange->validate_timestamp = 0;
0332     mutex_init(&prange->migrate_mutex);
0333     mutex_init(&prange->lock);
0334 
0335     if (p->xnack_enabled)
0336         bitmap_copy(prange->bitmap_access, svms->bitmap_supported,
0337                 MAX_GPU_INSTANCE);
0338 
0339     svm_range_set_default_attributes(&prange->preferred_loc,
0340                      &prange->prefetch_loc,
0341                      &prange->granularity, &prange->flags);
0342 
0343     pr_debug("svms 0x%p [0x%llx 0x%llx]\n", svms, start, last);
0344 
0345     return prange;
0346 }
0347 
0348 static bool svm_bo_ref_unless_zero(struct svm_range_bo *svm_bo)
0349 {
0350     if (!svm_bo || !kref_get_unless_zero(&svm_bo->kref))
0351         return false;
0352 
0353     return true;
0354 }
0355 
0356 static void svm_range_bo_release(struct kref *kref)
0357 {
0358     struct svm_range_bo *svm_bo;
0359 
0360     svm_bo = container_of(kref, struct svm_range_bo, kref);
0361     pr_debug("svm_bo 0x%p\n", svm_bo);
0362 
0363     spin_lock(&svm_bo->list_lock);
0364     while (!list_empty(&svm_bo->range_list)) {
0365         struct svm_range *prange =
0366                 list_first_entry(&svm_bo->range_list,
0367                         struct svm_range, svm_bo_list);
0368         /* list_del_init tells a concurrent svm_range_vram_node_new when
0369          * it's safe to reuse the svm_bo pointer and svm_bo_list head.
0370          */
0371         list_del_init(&prange->svm_bo_list);
0372         spin_unlock(&svm_bo->list_lock);
0373 
0374         pr_debug("svms 0x%p [0x%lx 0x%lx]\n", prange->svms,
0375              prange->start, prange->last);
0376         mutex_lock(&prange->lock);
0377         prange->svm_bo = NULL;
0378         mutex_unlock(&prange->lock);
0379 
0380         spin_lock(&svm_bo->list_lock);
0381     }
0382     spin_unlock(&svm_bo->list_lock);
0383     if (!dma_fence_is_signaled(&svm_bo->eviction_fence->base)) {
0384         /* We're not in the eviction worker.
0385          * Signal the fence and synchronize with any
0386          * pending eviction work.
0387          */
0388         dma_fence_signal(&svm_bo->eviction_fence->base);
0389         cancel_work_sync(&svm_bo->eviction_work);
0390     }
0391     dma_fence_put(&svm_bo->eviction_fence->base);
0392     amdgpu_bo_unref(&svm_bo->bo);
0393     kfree(svm_bo);
0394 }
0395 
0396 static void svm_range_bo_wq_release(struct work_struct *work)
0397 {
0398     struct svm_range_bo *svm_bo;
0399 
0400     svm_bo = container_of(work, struct svm_range_bo, release_work);
0401     svm_range_bo_release(&svm_bo->kref);
0402 }
0403 
0404 static void svm_range_bo_release_async(struct kref *kref)
0405 {
0406     struct svm_range_bo *svm_bo;
0407 
0408     svm_bo = container_of(kref, struct svm_range_bo, kref);
0409     pr_debug("svm_bo 0x%p\n", svm_bo);
0410     INIT_WORK(&svm_bo->release_work, svm_range_bo_wq_release);
0411     schedule_work(&svm_bo->release_work);
0412 }
0413 
0414 void svm_range_bo_unref_async(struct svm_range_bo *svm_bo)
0415 {
0416     kref_put(&svm_bo->kref, svm_range_bo_release_async);
0417 }
0418 
0419 static void svm_range_bo_unref(struct svm_range_bo *svm_bo)
0420 {
0421     if (svm_bo)
0422         kref_put(&svm_bo->kref, svm_range_bo_release);
0423 }
0424 
0425 static bool
0426 svm_range_validate_svm_bo(struct amdgpu_device *adev, struct svm_range *prange)
0427 {
0428     struct amdgpu_device *bo_adev;
0429 
0430     mutex_lock(&prange->lock);
0431     if (!prange->svm_bo) {
0432         mutex_unlock(&prange->lock);
0433         return false;
0434     }
0435     if (prange->ttm_res) {
0436         /* We still have a reference, all is well */
0437         mutex_unlock(&prange->lock);
0438         return true;
0439     }
0440     if (svm_bo_ref_unless_zero(prange->svm_bo)) {
0441         /*
0442          * Migrate from GPU to GPU, remove range from source bo_adev
0443          * svm_bo range list, and return false to allocate svm_bo from
0444          * destination adev.
0445          */
0446         bo_adev = amdgpu_ttm_adev(prange->svm_bo->bo->tbo.bdev);
0447         if (bo_adev != adev) {
0448             mutex_unlock(&prange->lock);
0449 
0450             spin_lock(&prange->svm_bo->list_lock);
0451             list_del_init(&prange->svm_bo_list);
0452             spin_unlock(&prange->svm_bo->list_lock);
0453 
0454             svm_range_bo_unref(prange->svm_bo);
0455             return false;
0456         }
0457         if (READ_ONCE(prange->svm_bo->evicting)) {
0458             struct dma_fence *f;
0459             struct svm_range_bo *svm_bo;
0460             /* The BO is getting evicted,
0461              * we need to get a new one
0462              */
0463             mutex_unlock(&prange->lock);
0464             svm_bo = prange->svm_bo;
0465             f = dma_fence_get(&svm_bo->eviction_fence->base);
0466             svm_range_bo_unref(prange->svm_bo);
0467             /* wait for the fence to avoid long spin-loop
0468              * at list_empty_careful
0469              */
0470             dma_fence_wait(f, false);
0471             dma_fence_put(f);
0472         } else {
0473             /* The BO was still around and we got
0474              * a new reference to it
0475              */
0476             mutex_unlock(&prange->lock);
0477             pr_debug("reuse old bo svms 0x%p [0x%lx 0x%lx]\n",
0478                  prange->svms, prange->start, prange->last);
0479 
0480             prange->ttm_res = prange->svm_bo->bo->tbo.resource;
0481             return true;
0482         }
0483 
0484     } else {
0485         mutex_unlock(&prange->lock);
0486     }
0487 
0488     /* We need a new svm_bo. Spin-loop to wait for concurrent
0489      * svm_range_bo_release to finish removing this range from
0490      * its range list. After this, it is safe to reuse the
0491      * svm_bo pointer and svm_bo_list head.
0492      */
0493     while (!list_empty_careful(&prange->svm_bo_list))
0494         ;
0495 
0496     return false;
0497 }
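     /* Summary: svm_range_validate_svm_bo() returns true only when the existing
      * svm_bo can be reused on this adev (ttm_res is still set, or a new
      * reference was taken and the BO is not being evicted). It returns false
      * when there is no svm_bo, the BO belongs to a different adev (GPU-to-GPU
      * migration) or the BO is being evicted, so the caller allocates a new one.
      */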
0498 
0499 static struct svm_range_bo *svm_range_bo_new(void)
0500 {
0501     struct svm_range_bo *svm_bo;
0502 
0503     svm_bo = kzalloc(sizeof(*svm_bo), GFP_KERNEL);
0504     if (!svm_bo)
0505         return NULL;
0506 
0507     kref_init(&svm_bo->kref);
0508     INIT_LIST_HEAD(&svm_bo->range_list);
0509     spin_lock_init(&svm_bo->list_lock);
0510 
0511     return svm_bo;
0512 }
0513 
0514 int
0515 svm_range_vram_node_new(struct amdgpu_device *adev, struct svm_range *prange,
0516             bool clear)
0517 {
0518     struct amdgpu_bo_param bp;
0519     struct svm_range_bo *svm_bo;
0520     struct amdgpu_bo_user *ubo;
0521     struct amdgpu_bo *bo;
0522     struct kfd_process *p;
0523     struct mm_struct *mm;
0524     int r;
0525 
0526     p = container_of(prange->svms, struct kfd_process, svms);
0527     pr_debug("pasid: %x svms 0x%p [0x%lx 0x%lx]\n", p->pasid, prange->svms,
0528          prange->start, prange->last);
0529 
0530     if (svm_range_validate_svm_bo(adev, prange))
0531         return 0;
0532 
0533     svm_bo = svm_range_bo_new();
0534     if (!svm_bo) {
0535         pr_debug("failed to alloc svm bo\n");
0536         return -ENOMEM;
0537     }
0538     mm = get_task_mm(p->lead_thread);
0539     if (!mm) {
0540         pr_debug("failed to get mm\n");
0541         kfree(svm_bo);
0542         return -ESRCH;
0543     }
0544     svm_bo->eviction_fence =
0545         amdgpu_amdkfd_fence_create(dma_fence_context_alloc(1),
0546                        mm,
0547                        svm_bo);
0548     mmput(mm);
0549     INIT_WORK(&svm_bo->eviction_work, svm_range_evict_svm_bo_worker);
0550     svm_bo->evicting = 0;
0551     memset(&bp, 0, sizeof(bp));
0552     bp.size = prange->npages * PAGE_SIZE;
0553     bp.byte_align = PAGE_SIZE;
0554     bp.domain = AMDGPU_GEM_DOMAIN_VRAM;
0555     bp.flags = AMDGPU_GEM_CREATE_NO_CPU_ACCESS;
0556     bp.flags |= clear ? AMDGPU_GEM_CREATE_VRAM_CLEARED : 0;
0557     bp.flags |= AMDGPU_GEM_CREATE_DISCARDABLE;
0558     bp.type = ttm_bo_type_device;
0559     bp.resv = NULL;
0560 
0561     r = amdgpu_bo_create_user(adev, &bp, &ubo);
0562     if (r) {
0563         pr_debug("failed %d to create bo\n", r);
0564         goto create_bo_failed;
0565     }
0566     bo = &ubo->bo;
0567     r = amdgpu_bo_reserve(bo, true);
0568     if (r) {
0569         pr_debug("failed %d to reserve bo\n", r);
0570         goto reserve_bo_failed;
0571     }
0572 
0573     r = dma_resv_reserve_fences(bo->tbo.base.resv, 1);
0574     if (r) {
0575         pr_debug("failed %d to reserve bo\n", r);
0576         amdgpu_bo_unreserve(bo);
0577         goto reserve_bo_failed;
0578     }
0579     amdgpu_bo_fence(bo, &svm_bo->eviction_fence->base, true);
0580 
0581     amdgpu_bo_unreserve(bo);
0582 
0583     svm_bo->bo = bo;
0584     prange->svm_bo = svm_bo;
0585     prange->ttm_res = bo->tbo.resource;
0586     prange->offset = 0;
0587 
0588     spin_lock(&svm_bo->list_lock);
0589     list_add(&prange->svm_bo_list, &svm_bo->range_list);
0590     spin_unlock(&svm_bo->list_lock);
0591 
0592     return 0;
0593 
0594 reserve_bo_failed:
0595     amdgpu_bo_unref(&bo);
0596 create_bo_failed:
0597     dma_fence_put(&svm_bo->eviction_fence->base);
0598     kfree(svm_bo);
0599     prange->ttm_res = NULL;
0600 
0601     return r;
0602 }
0603 
0604 void svm_range_vram_node_free(struct svm_range *prange)
0605 {
0606     svm_range_bo_unref(prange->svm_bo);
0607     prange->ttm_res = NULL;
0608 }
0609 
0610 struct amdgpu_device *
0611 svm_range_get_adev_by_id(struct svm_range *prange, uint32_t gpu_id)
0612 {
0613     struct kfd_process_device *pdd;
0614     struct kfd_process *p;
0615     int32_t gpu_idx;
0616 
0617     p = container_of(prange->svms, struct kfd_process, svms);
0618 
0619     gpu_idx = kfd_process_gpuidx_from_gpuid(p, gpu_id);
0620     if (gpu_idx < 0) {
0621         pr_debug("failed to get device by id 0x%x\n", gpu_id);
0622         return NULL;
0623     }
0624     pdd = kfd_process_device_from_gpuidx(p, gpu_idx);
0625     if (!pdd) {
0626         pr_debug("failed to get device by idx 0x%x\n", gpu_idx);
0627         return NULL;
0628     }
0629 
0630     return pdd->dev->adev;
0631 }
0632 
0633 struct kfd_process_device *
0634 svm_range_get_pdd_by_adev(struct svm_range *prange, struct amdgpu_device *adev)
0635 {
0636     struct kfd_process *p;
0637     int32_t gpu_idx, gpuid;
0638     int r;
0639 
0640     p = container_of(prange->svms, struct kfd_process, svms);
0641 
0642     r = kfd_process_gpuid_from_adev(p, adev, &gpuid, &gpu_idx);
0643     if (r) {
0644         pr_debug("failed to get device id by adev %p\n", adev);
0645         return NULL;
0646     }
0647 
0648     return kfd_process_device_from_gpuidx(p, gpu_idx);
0649 }
0650 
0651 static int svm_range_bo_validate(void *param, struct amdgpu_bo *bo)
0652 {
0653     struct ttm_operation_ctx ctx = { false, false };
0654 
0655     amdgpu_bo_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_VRAM);
0656 
0657     return ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
0658 }
0659 
0660 static int
0661 svm_range_check_attr(struct kfd_process *p,
0662              uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs)
0663 {
0664     uint32_t i;
0665 
0666     for (i = 0; i < nattr; i++) {
0667         uint32_t val = attrs[i].value;
0668         int gpuidx = MAX_GPU_INSTANCE;
0669 
0670         switch (attrs[i].type) {
0671         case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC:
0672             if (val != KFD_IOCTL_SVM_LOCATION_SYSMEM &&
0673                 val != KFD_IOCTL_SVM_LOCATION_UNDEFINED)
0674                 gpuidx = kfd_process_gpuidx_from_gpuid(p, val);
0675             break;
0676         case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC:
0677             if (val != KFD_IOCTL_SVM_LOCATION_SYSMEM)
0678                 gpuidx = kfd_process_gpuidx_from_gpuid(p, val);
0679             break;
0680         case KFD_IOCTL_SVM_ATTR_ACCESS:
0681         case KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE:
0682         case KFD_IOCTL_SVM_ATTR_NO_ACCESS:
0683             gpuidx = kfd_process_gpuidx_from_gpuid(p, val);
0684             break;
0685         case KFD_IOCTL_SVM_ATTR_SET_FLAGS:
0686             break;
0687         case KFD_IOCTL_SVM_ATTR_CLR_FLAGS:
0688             break;
0689         case KFD_IOCTL_SVM_ATTR_GRANULARITY:
0690             break;
0691         default:
0692             pr_debug("unknown attr type 0x%x\n", attrs[i].type);
0693             return -EINVAL;
0694         }
0695 
0696         if (gpuidx < 0) {
0697             pr_debug("no GPU 0x%x found\n", val);
0698             return -EINVAL;
0699         } else if (gpuidx < MAX_GPU_INSTANCE &&
0700                !test_bit(gpuidx, p->svms.bitmap_supported)) {
0701             pr_debug("GPU 0x%x not supported\n", val);
0702             return -EINVAL;
0703         }
0704     }
0705 
0706     return 0;
0707 }
0708 
0709 static void
0710 svm_range_apply_attrs(struct kfd_process *p, struct svm_range *prange,
0711               uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs,
0712               bool *update_mapping)
0713 {
0714     uint32_t i;
0715     int gpuidx;
0716 
0717     for (i = 0; i < nattr; i++) {
0718         switch (attrs[i].type) {
0719         case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC:
0720             prange->preferred_loc = attrs[i].value;
0721             break;
0722         case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC:
0723             prange->prefetch_loc = attrs[i].value;
0724             break;
0725         case KFD_IOCTL_SVM_ATTR_ACCESS:
0726         case KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE:
0727         case KFD_IOCTL_SVM_ATTR_NO_ACCESS:
0728             *update_mapping = true;
0729             gpuidx = kfd_process_gpuidx_from_gpuid(p,
0730                                    attrs[i].value);
0731             if (attrs[i].type == KFD_IOCTL_SVM_ATTR_NO_ACCESS) {
0732                 bitmap_clear(prange->bitmap_access, gpuidx, 1);
0733                 bitmap_clear(prange->bitmap_aip, gpuidx, 1);
0734             } else if (attrs[i].type == KFD_IOCTL_SVM_ATTR_ACCESS) {
0735                 bitmap_set(prange->bitmap_access, gpuidx, 1);
0736                 bitmap_clear(prange->bitmap_aip, gpuidx, 1);
0737             } else {
0738                 bitmap_clear(prange->bitmap_access, gpuidx, 1);
0739                 bitmap_set(prange->bitmap_aip, gpuidx, 1);
0740             }
0741             break;
0742         case KFD_IOCTL_SVM_ATTR_SET_FLAGS:
0743             *update_mapping = true;
0744             prange->flags |= attrs[i].value;
0745             break;
0746         case KFD_IOCTL_SVM_ATTR_CLR_FLAGS:
0747             *update_mapping = true;
0748             prange->flags &= ~attrs[i].value;
0749             break;
0750         case KFD_IOCTL_SVM_ATTR_GRANULARITY:
0751             prange->granularity = attrs[i].value;
0752             break;
0753         default:
0754             WARN_ONCE(1, "svm_range_check_attr wasn't called?");
0755         }
0756     }
0757 }
0758 
0759 static bool
0760 svm_range_is_same_attrs(struct kfd_process *p, struct svm_range *prange,
0761             uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs)
0762 {
0763     uint32_t i;
0764     int gpuidx;
0765 
0766     for (i = 0; i < nattr; i++) {
0767         switch (attrs[i].type) {
0768         case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC:
0769             if (prange->preferred_loc != attrs[i].value)
0770                 return false;
0771             break;
0772         case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC:
0773             /* Prefetch should always trigger a migration even
0774              * if the value of the attribute didn't change.
0775              */
0776             return false;
0777         case KFD_IOCTL_SVM_ATTR_ACCESS:
0778         case KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE:
0779         case KFD_IOCTL_SVM_ATTR_NO_ACCESS:
0780             gpuidx = kfd_process_gpuidx_from_gpuid(p,
0781                                    attrs[i].value);
0782             if (attrs[i].type == KFD_IOCTL_SVM_ATTR_NO_ACCESS) {
0783                 if (test_bit(gpuidx, prange->bitmap_access) ||
0784                     test_bit(gpuidx, prange->bitmap_aip))
0785                     return false;
0786             } else if (attrs[i].type == KFD_IOCTL_SVM_ATTR_ACCESS) {
0787                 if (!test_bit(gpuidx, prange->bitmap_access))
0788                     return false;
0789             } else {
0790                 if (!test_bit(gpuidx, prange->bitmap_aip))
0791                     return false;
0792             }
0793             break;
0794         case KFD_IOCTL_SVM_ATTR_SET_FLAGS:
0795             if ((prange->flags & attrs[i].value) != attrs[i].value)
0796                 return false;
0797             break;
0798         case KFD_IOCTL_SVM_ATTR_CLR_FLAGS:
0799             if ((prange->flags & attrs[i].value) != 0)
0800                 return false;
0801             break;
0802         case KFD_IOCTL_SVM_ATTR_GRANULARITY:
0803             if (prange->granularity != attrs[i].value)
0804                 return false;
0805             break;
0806         default:
0807             WARN_ONCE(1, "svm_range_check_attr wasn't called?");
0808         }
0809     }
0810 
0811     return true;
0812 }
0813 
0814 /**
0815  * svm_range_debug_dump - print all range information from svms
0816  * @svms: svm range list header
0817  *
0818  * Debug output of svm range start, number of pages, end and actual
0819  * location from the svms linked list and interval tree
0820  *
0821  * Context: The caller must hold svms->lock
0822  */
0823 static void svm_range_debug_dump(struct svm_range_list *svms)
0824 {
0825     struct interval_tree_node *node;
0826     struct svm_range *prange;
0827 
0828     pr_debug("dump svms 0x%p list\n", svms);
0829     pr_debug("range\tstart\tpage\tend\t\tlocation\n");
0830 
0831     list_for_each_entry(prange, &svms->list, list) {
0832         pr_debug("0x%p 0x%lx\t0x%llx\t0x%llx\t0x%x\n",
0833              prange, prange->start, prange->npages,
0834              prange->start + prange->npages - 1,
0835              prange->actual_loc);
0836     }
0837 
0838     pr_debug("dump svms 0x%p interval tree\n", svms);
0839     pr_debug("range\tstart\tpage\tend\t\tlocation\n");
0840     node = interval_tree_iter_first(&svms->objects, 0, ~0ULL);
0841     while (node) {
0842         prange = container_of(node, struct svm_range, it_node);
0843         pr_debug("0x%p 0x%lx\t0x%llx\t0x%llx\t0x%x\n",
0844              prange, prange->start, prange->npages,
0845              prange->start + prange->npages - 1,
0846              prange->actual_loc);
0847         node = interval_tree_iter_next(node, 0, ~0ULL);
0848     }
0849 }
0850 
0851 static int
0852 svm_range_split_array(void *ppnew, void *ppold, size_t size,
0853               uint64_t old_start, uint64_t old_n,
0854               uint64_t new_start, uint64_t new_n)
0855 {
0856     unsigned char *new, *old, *pold;
0857     uint64_t d;
0858 
0859     if (!ppold)
0860         return 0;
0861     pold = *(unsigned char **)ppold;
0862     if (!pold)
0863         return 0;
0864 
0865     new = kvmalloc_array(new_n, size, GFP_KERNEL);
0866     if (!new)
0867         return -ENOMEM;
0868 
0869     d = (new_start - old_start) * size;
0870     memcpy(new, pold + d, new_n * size);
0871 
0872     old = kvmalloc_array(old_n, size, GFP_KERNEL);
0873     if (!old) {
0874         kvfree(new);
0875         return -ENOMEM;
0876     }
0877 
0878     d = (new_start == old_start) ? new_n * size : 0;
0879     memcpy(old, pold + d, old_n * size);
0880 
0881     kvfree(pold);
0882     *(void **)ppold = old;
0883     *(void **)ppnew = new;
0884 
0885     return 0;
0886 }
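     /* For example, if the old dma_addr array holds entries e0..e7 for 8 pages
      * and the tail half is split off (old_n = 4, new_n = 4,
      * new_start = old_start + 4), then e4..e7 are copied into the new array
      * and the old array is reallocated to hold e0..e3.
      */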
0887 
0888 static int
0889 svm_range_split_pages(struct svm_range *new, struct svm_range *old,
0890               uint64_t start, uint64_t last)
0891 {
0892     uint64_t npages = last - start + 1;
0893     int i, r;
0894 
0895     for (i = 0; i < MAX_GPU_INSTANCE; i++) {
0896         r = svm_range_split_array(&new->dma_addr[i], &old->dma_addr[i],
0897                       sizeof(*old->dma_addr[i]), old->start,
0898                       npages, new->start, new->npages);
0899         if (r)
0900             return r;
0901     }
0902 
0903     return 0;
0904 }
0905 
0906 static int
0907 svm_range_split_nodes(struct svm_range *new, struct svm_range *old,
0908               uint64_t start, uint64_t last)
0909 {
0910     uint64_t npages = last - start + 1;
0911 
0912     pr_debug("svms 0x%p new prange 0x%p start 0x%lx [0x%llx 0x%llx]\n",
0913          new->svms, new, new->start, start, last);
0914 
0915     if (new->start == old->start) {
0916         new->offset = old->offset;
0917         old->offset += new->npages;
0918     } else {
0919         new->offset = old->offset + npages;
0920     }
0921 
0922     new->svm_bo = svm_range_bo_ref(old->svm_bo);
0923     new->ttm_res = old->ttm_res;
0924 
0925     spin_lock(&new->svm_bo->list_lock);
0926     list_add(&new->svm_bo_list, &new->svm_bo->range_list);
0927     spin_unlock(&new->svm_bo->list_lock);
0928 
0929     return 0;
0930 }
0931 
0932 /**
0933  * svm_range_split_adjust - split range and adjust
0934  *
0935  * @new: new range
0936  * @old: the old range
0937  * @start: the start address (in pages) the old range is adjusted to
0938  * @last: the last address (in pages) the old range is adjusted to
0939  *
0940  * Copy the system memory dma_addr or VRAM ttm_res of the old range to the new
0941  * range, covering new->start up to new->npages pages; the remaining old range
0942  * is from start to last.
0943  *
0944  * Return:
0945  * 0 - OK, -ENOMEM - out of memory
0946  */
0947 static int
0948 svm_range_split_adjust(struct svm_range *new, struct svm_range *old,
0949               uint64_t start, uint64_t last)
0950 {
0951     int r;
0952 
0953     pr_debug("svms 0x%p new 0x%lx old [0x%lx 0x%lx] => [0x%llx 0x%llx]\n",
0954          new->svms, new->start, old->start, old->last, start, last);
0955 
0956     if (new->start < old->start ||
0957         new->last > old->last) {
0958         WARN_ONCE(1, "invalid new range start or last\n");
0959         return -EINVAL;
0960     }
0961 
0962     r = svm_range_split_pages(new, old, start, last);
0963     if (r)
0964         return r;
0965 
0966     if (old->actual_loc && old->ttm_res) {
0967         r = svm_range_split_nodes(new, old, start, last);
0968         if (r)
0969             return r;
0970     }
0971 
0972     old->npages = last - start + 1;
0973     old->start = start;
0974     old->last = last;
0975     new->flags = old->flags;
0976     new->preferred_loc = old->preferred_loc;
0977     new->prefetch_loc = old->prefetch_loc;
0978     new->actual_loc = old->actual_loc;
0979     new->granularity = old->granularity;
0980     new->mapped_to_gpu = old->mapped_to_gpu;
0981     bitmap_copy(new->bitmap_access, old->bitmap_access, MAX_GPU_INSTANCE);
0982     bitmap_copy(new->bitmap_aip, old->bitmap_aip, MAX_GPU_INSTANCE);
0983 
0984     return 0;
0985 }
0986 
0987 /**
0988  * svm_range_split - split a range in 2 ranges
0989  *
0990  * @prange: the svm range to split
0991  * @start: the remaining range start address in pages
0992  * @last: the remaining range last address in pages
0993  * @new: the resulting new range
0994  *
0995  * Two cases only:
0996  * case 1: if start == prange->start
0997  *         prange ==> prange[start, last]
0998  *         new range [last + 1, prange->last]
0999  *
1000  * case 2: if last == prange->last
1001  *         prange ==> prange[start, last]
1002  *         new range [prange->start, start - 1]
1003  *
1004  * Return:
1005  * 0 - OK, -ENOMEM - out of memory, -EINVAL - invalid start, last
1006  */
1007 static int
1008 svm_range_split(struct svm_range *prange, uint64_t start, uint64_t last,
1009         struct svm_range **new)
1010 {
1011     uint64_t old_start = prange->start;
1012     uint64_t old_last = prange->last;
1013     struct svm_range_list *svms;
1014     int r = 0;
1015 
1016     pr_debug("svms 0x%p [0x%llx 0x%llx] to [0x%llx 0x%llx]\n", prange->svms,
1017          old_start, old_last, start, last);
1018 
1019     if (old_start != start && old_last != last)
1020         return -EINVAL;
1021     if (start < old_start || last > old_last)
1022         return -EINVAL;
1023 
1024     svms = prange->svms;
1025     if (old_start == start)
1026         *new = svm_range_new(svms, last + 1, old_last, false);
1027     else
1028         *new = svm_range_new(svms, old_start, start - 1, false);
1029     if (!*new)
1030         return -ENOMEM;
1031 
1032     r = svm_range_split_adjust(*new, prange, start, last);
1033     if (r) {
1034         pr_debug("failed %d split [0x%llx 0x%llx] to [0x%llx 0x%llx]\n",
1035              r, old_start, old_last, start, last);
1036         svm_range_free(*new, false);
1037         *new = NULL;
1038     }
1039 
1040     return r;
1041 }
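     /* For example, given prange [0x1000 0x1fff]:
      * svm_range_split(prange, 0x1000, 0x17ff, &new) keeps prange as
      * [0x1000 0x17ff] and returns new = [0x1800 0x1fff] (case 1), while
      * svm_range_split(prange, 0x1800, 0x1fff, &new) keeps prange as
      * [0x1800 0x1fff] and returns new = [0x1000 0x17ff] (case 2).
      */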
1042 
1043 static int
1044 svm_range_split_tail(struct svm_range *prange,
1045              uint64_t new_last, struct list_head *insert_list)
1046 {
1047     struct svm_range *tail;
1048     int r = svm_range_split(prange, prange->start, new_last, &tail);
1049 
1050     if (!r)
1051         list_add(&tail->list, insert_list);
1052     return r;
1053 }
1054 
1055 static int
1056 svm_range_split_head(struct svm_range *prange,
1057              uint64_t new_start, struct list_head *insert_list)
1058 {
1059     struct svm_range *head;
1060     int r = svm_range_split(prange, new_start, prange->last, &head);
1061 
1062     if (!r)
1063         list_add(&head->list, insert_list);
1064     return r;
1065 }
1066 
1067 static void
1068 svm_range_add_child(struct svm_range *prange, struct mm_struct *mm,
1069             struct svm_range *pchild, enum svm_work_list_ops op)
1070 {
1071     pr_debug("add child 0x%p [0x%lx 0x%lx] to prange 0x%p child list %d\n",
1072          pchild, pchild->start, pchild->last, prange, op);
1073 
1074     pchild->work_item.mm = mm;
1075     pchild->work_item.op = op;
1076     list_add_tail(&pchild->child_list, &prange->child_list);
1077 }
1078 
1079 /**
1080  * svm_range_split_by_granularity - collect ranges within granularity boundary
1081  *
1082  * @p: the process with svms list
1083  * @mm: mm structure
1084  * @addr: the vm fault address in pages, to split the prange
1085  * @parent: parent range if prange is from child list
1086  * @prange: prange to split
1087  *
1088  * Trims @prange to be a single aligned block of prange->granularity if
1089  * possible. The head and tail are added to the child_list in @parent.
1090  *
1091  * Context: caller must hold mmap_read_lock and prange->lock
1092  *
1093  * Return:
1094  * 0 - OK, otherwise error code
1095  */
1096 int
1097 svm_range_split_by_granularity(struct kfd_process *p, struct mm_struct *mm,
1098                    unsigned long addr, struct svm_range *parent,
1099                    struct svm_range *prange)
1100 {
1101     struct svm_range *head, *tail;
1102     unsigned long start, last, size;
1103     int r;
1104 
1105     /* Align the split range start and size to the granularity size so that a
1106      * single PTE can be used for the whole range; this reduces the number of
1107      * PTEs updated and the L1 TLB space used for translation.
1108      */
1109     size = 1UL << prange->granularity;
1110     start = ALIGN_DOWN(addr, size);
1111     last = ALIGN(addr + 1, size) - 1;
1112 
1113     pr_debug("svms 0x%p split [0x%lx 0x%lx] to [0x%lx 0x%lx] size 0x%lx\n",
1114          prange->svms, prange->start, prange->last, start, last, size);
1115 
1116     if (start > prange->start) {
1117         r = svm_range_split(prange, start, prange->last, &head);
1118         if (r)
1119             return r;
1120         svm_range_add_child(parent, mm, head, SVM_OP_ADD_RANGE);
1121     }
1122 
1123     if (last < prange->last) {
1124         r = svm_range_split(prange, prange->start, last, &tail);
1125         if (r)
1126             return r;
1127         svm_range_add_child(parent, mm, tail, SVM_OP_ADD_RANGE);
1128     }
1129 
1130     /* xnack on, update mapping on GPUs with ACCESS_IN_PLACE */
1131     if (p->xnack_enabled && prange->work_item.op == SVM_OP_ADD_RANGE) {
1132         prange->work_item.op = SVM_OP_ADD_RANGE_AND_MAP;
1133         pr_debug("change prange 0x%p [0x%lx 0x%lx] op %d\n",
1134              prange, prange->start, prange->last,
1135              SVM_OP_ADD_RANGE_AND_MAP);
1136     }
1137     return 0;
1138 }
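     /* For example, with granularity 9 the granule size is 1UL << 9 = 0x200
      * pages (2MB with 4KB pages); a fault at page address 0x1234 gives
      * start = ALIGN_DOWN(0x1234, 0x200) = 0x1200 and
      * last = ALIGN(0x1235, 0x200) - 1 = 0x13ff, i.e. exactly one granule.
      */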
1139 
1140 static uint64_t
1141 svm_range_get_pte_flags(struct amdgpu_device *adev, struct svm_range *prange,
1142             int domain)
1143 {
1144     struct amdgpu_device *bo_adev;
1145     uint32_t flags = prange->flags;
1146     uint32_t mapping_flags = 0;
1147     uint64_t pte_flags;
1148     bool snoop = (domain != SVM_RANGE_VRAM_DOMAIN);
1149     bool coherent = flags & KFD_IOCTL_SVM_FLAG_COHERENT;
1150 
1151     if (domain == SVM_RANGE_VRAM_DOMAIN)
1152         bo_adev = amdgpu_ttm_adev(prange->svm_bo->bo->tbo.bdev);
1153 
1154     switch (KFD_GC_VERSION(adev->kfd.dev)) {
1155     case IP_VERSION(9, 4, 1):
1156         if (domain == SVM_RANGE_VRAM_DOMAIN) {
1157             if (bo_adev == adev) {
1158                 mapping_flags |= coherent ?
1159                     AMDGPU_VM_MTYPE_CC : AMDGPU_VM_MTYPE_RW;
1160             } else {
1161                 mapping_flags |= coherent ?
1162                     AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC;
1163                 if (amdgpu_xgmi_same_hive(adev, bo_adev))
1164                     snoop = true;
1165             }
1166         } else {
1167             mapping_flags |= coherent ?
1168                 AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC;
1169         }
1170         break;
1171     case IP_VERSION(9, 4, 2):
1172         if (domain == SVM_RANGE_VRAM_DOMAIN) {
1173             if (bo_adev == adev) {
1174                 mapping_flags |= coherent ?
1175                     AMDGPU_VM_MTYPE_CC : AMDGPU_VM_MTYPE_RW;
1176                 if (adev->gmc.xgmi.connected_to_cpu)
1177                     snoop = true;
1178             } else {
1179                 mapping_flags |= coherent ?
1180                     AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC;
1181                 if (amdgpu_xgmi_same_hive(adev, bo_adev))
1182                     snoop = true;
1183             }
1184         } else {
1185             mapping_flags |= coherent ?
1186                 AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC;
1187         }
1188         break;
1189     default:
1190         mapping_flags |= coherent ?
1191             AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC;
1192     }
1193 
1194     mapping_flags |= AMDGPU_VM_PAGE_READABLE | AMDGPU_VM_PAGE_WRITEABLE;
1195 
1196     if (flags & KFD_IOCTL_SVM_FLAG_GPU_RO)
1197         mapping_flags &= ~AMDGPU_VM_PAGE_WRITEABLE;
1198     if (flags & KFD_IOCTL_SVM_FLAG_GPU_EXEC)
1199         mapping_flags |= AMDGPU_VM_PAGE_EXECUTABLE;
1200 
1201     pte_flags = AMDGPU_PTE_VALID;
1202     pte_flags |= (domain == SVM_RANGE_VRAM_DOMAIN) ? 0 : AMDGPU_PTE_SYSTEM;
1203     pte_flags |= snoop ? AMDGPU_PTE_SNOOPED : 0;
1204 
1205     pte_flags |= amdgpu_gem_va_map_flags(adev, mapping_flags);
1206     return pte_flags;
1207 }
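     /* For example, a system memory mapping (domain != SVM_RANGE_VRAM_DOMAIN)
      * always sets snoop, so pte_flags contains AMDGPU_PTE_VALID |
      * AMDGPU_PTE_SYSTEM | AMDGPU_PTE_SNOOPED in addition to the flags that
      * amdgpu_gem_va_map_flags() derives from the MTYPE and
      * READABLE/WRITEABLE/EXECUTABLE mapping flags above.
      */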
1208 
1209 static int
1210 svm_range_unmap_from_gpu(struct amdgpu_device *adev, struct amdgpu_vm *vm,
1211              uint64_t start, uint64_t last,
1212              struct dma_fence **fence)
1213 {
1214     uint64_t init_pte_value = 0;
1215 
1216     pr_debug("[0x%llx 0x%llx]\n", start, last);
1217 
1218     return amdgpu_vm_update_range(adev, vm, false, true, true, NULL, start,
1219                       last, init_pte_value, 0, 0, NULL, NULL,
1220                       fence);
1221 }
1222 
1223 static int
1224 svm_range_unmap_from_gpus(struct svm_range *prange, unsigned long start,
1225               unsigned long last, uint32_t trigger)
1226 {
1227     DECLARE_BITMAP(bitmap, MAX_GPU_INSTANCE);
1228     struct kfd_process_device *pdd;
1229     struct dma_fence *fence = NULL;
1230     struct kfd_process *p;
1231     uint32_t gpuidx;
1232     int r = 0;
1233 
1234     if (!prange->mapped_to_gpu) {
1235         pr_debug("prange 0x%p [0x%lx 0x%lx] not mapped to GPU\n",
1236              prange, prange->start, prange->last);
1237         return 0;
1238     }
1239 
1240     if (prange->start == start && prange->last == last) {
1241         pr_debug("unmap svms 0x%p prange 0x%p\n", prange->svms, prange);
1242         prange->mapped_to_gpu = false;
1243     }
1244 
1245     bitmap_or(bitmap, prange->bitmap_access, prange->bitmap_aip,
1246           MAX_GPU_INSTANCE);
1247     p = container_of(prange->svms, struct kfd_process, svms);
1248 
1249     for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) {
1250         pr_debug("unmap from gpu idx 0x%x\n", gpuidx);
1251         pdd = kfd_process_device_from_gpuidx(p, gpuidx);
1252         if (!pdd) {
1253             pr_debug("failed to find device idx %d\n", gpuidx);
1254             return -EINVAL;
1255         }
1256 
1257         kfd_smi_event_unmap_from_gpu(pdd->dev, p->lead_thread->pid,
1258                          start, last, trigger);
1259 
1260         r = svm_range_unmap_from_gpu(pdd->dev->adev,
1261                          drm_priv_to_vm(pdd->drm_priv),
1262                          start, last, &fence);
1263         if (r)
1264             break;
1265 
1266         if (fence) {
1267             r = dma_fence_wait(fence, false);
1268             dma_fence_put(fence);
1269             fence = NULL;
1270             if (r)
1271                 break;
1272         }
1273         kfd_flush_tlb(pdd, TLB_FLUSH_HEAVYWEIGHT);
1274     }
1275 
1276     return r;
1277 }
1278 
1279 static int
1280 svm_range_map_to_gpu(struct kfd_process_device *pdd, struct svm_range *prange,
1281              unsigned long offset, unsigned long npages, bool readonly,
1282              dma_addr_t *dma_addr, struct amdgpu_device *bo_adev,
1283              struct dma_fence **fence, bool flush_tlb)
1284 {
1285     struct amdgpu_device *adev = pdd->dev->adev;
1286     struct amdgpu_vm *vm = drm_priv_to_vm(pdd->drm_priv);
1287     uint64_t pte_flags;
1288     unsigned long last_start;
1289     int last_domain;
1290     int r = 0;
1291     int64_t i, j;
1292 
1293     last_start = prange->start + offset;
1294 
1295     pr_debug("svms 0x%p [0x%lx 0x%lx] readonly %d\n", prange->svms,
1296          last_start, last_start + npages - 1, readonly);
1297 
1298     for (i = offset; i < offset + npages; i++) {
1299         last_domain = dma_addr[i] & SVM_RANGE_VRAM_DOMAIN;
1300         dma_addr[i] &= ~SVM_RANGE_VRAM_DOMAIN;
1301 
1302         /* Collect all pages in the same address range and memory domain
1303          * that can be mapped with a single call to update mapping.
1304          */
1305         if (i < offset + npages - 1 &&
1306             last_domain == (dma_addr[i + 1] & SVM_RANGE_VRAM_DOMAIN))
1307             continue;
1308 
1309         pr_debug("Mapping range [0x%lx 0x%llx] on domain: %s\n",
1310              last_start, prange->start + i, last_domain ? "GPU" : "CPU");
1311 
1312         pte_flags = svm_range_get_pte_flags(adev, prange, last_domain);
1313         if (readonly)
1314             pte_flags &= ~AMDGPU_PTE_WRITEABLE;
1315 
1316         pr_debug("svms 0x%p map [0x%lx 0x%llx] vram %d PTE 0x%llx\n",
1317              prange->svms, last_start, prange->start + i,
1318              (last_domain == SVM_RANGE_VRAM_DOMAIN) ? 1 : 0,
1319              pte_flags);
1320 
1321         r = amdgpu_vm_update_range(adev, vm, false, false, flush_tlb, NULL,
1322                        last_start, prange->start + i,
1323                        pte_flags,
1324                        (last_start - prange->start) << PAGE_SHIFT,
1325                        bo_adev ? bo_adev->vm_manager.vram_base_offset : 0,
1326                        NULL, dma_addr, &vm->last_update);
1327 
1328         for (j = last_start - prange->start; j <= i; j++)
1329             dma_addr[j] |= last_domain;
1330 
1331         if (r) {
1332             pr_debug("failed %d to map to gpu 0x%lx\n", r, prange->start);
1333             goto out;
1334         }
1335         last_start = prange->start + i + 1;
1336     }
1337 
1338     r = amdgpu_vm_update_pdes(adev, vm, false);
1339     if (r) {
1340         pr_debug("failed %d to update directories 0x%lx\n", r,
1341              prange->start);
1342         goto out;
1343     }
1344 
1345     if (fence)
1346         *fence = dma_fence_get(vm->last_update);
1347 
1348 out:
1349     return r;
1350 }
1351 
1352 static int
1353 svm_range_map_to_gpus(struct svm_range *prange, unsigned long offset,
1354               unsigned long npages, bool readonly,
1355               unsigned long *bitmap, bool wait, bool flush_tlb)
1356 {
1357     struct kfd_process_device *pdd;
1358     struct amdgpu_device *bo_adev;
1359     struct kfd_process *p;
1360     struct dma_fence *fence = NULL;
1361     uint32_t gpuidx;
1362     int r = 0;
1363 
1364     if (prange->svm_bo && prange->ttm_res)
1365         bo_adev = amdgpu_ttm_adev(prange->svm_bo->bo->tbo.bdev);
1366     else
1367         bo_adev = NULL;
1368 
1369     p = container_of(prange->svms, struct kfd_process, svms);
1370     for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) {
1371         pr_debug("mapping to gpu idx 0x%x\n", gpuidx);
1372         pdd = kfd_process_device_from_gpuidx(p, gpuidx);
1373         if (!pdd) {
1374             pr_debug("failed to find device idx %d\n", gpuidx);
1375             return -EINVAL;
1376         }
1377 
1378         pdd = kfd_bind_process_to_device(pdd->dev, p);
1379         if (IS_ERR(pdd))
1380             return -EINVAL;
1381 
1382         if (bo_adev && pdd->dev->adev != bo_adev &&
1383             !amdgpu_xgmi_same_hive(pdd->dev->adev, bo_adev)) {
1384             pr_debug("cannot map to device idx %d\n", gpuidx);
1385             continue;
1386         }
1387 
1388         r = svm_range_map_to_gpu(pdd, prange, offset, npages, readonly,
1389                      prange->dma_addr[gpuidx],
1390                      bo_adev, wait ? &fence : NULL,
1391                      flush_tlb);
1392         if (r)
1393             break;
1394 
1395         if (fence) {
1396             r = dma_fence_wait(fence, false);
1397             dma_fence_put(fence);
1398             fence = NULL;
1399             if (r) {
1400                 pr_debug("failed %d to dma fence wait\n", r);
1401                 break;
1402             }
1403         }
1404 
1405         kfd_flush_tlb(pdd, TLB_FLUSH_LEGACY);
1406     }
1407 
1408     return r;
1409 }
1410 
1411 struct svm_validate_context {
1412     struct kfd_process *process;
1413     struct svm_range *prange;
1414     bool intr;
1415     DECLARE_BITMAP(bitmap, MAX_GPU_INSTANCE);
1416     struct ttm_validate_buffer tv[MAX_GPU_INSTANCE];
1417     struct list_head validate_list;
1418     struct ww_acquire_ctx ticket;
1419 };
1420 
1421 static int svm_range_reserve_bos(struct svm_validate_context *ctx)
1422 {
1423     struct kfd_process_device *pdd;
1424     struct amdgpu_vm *vm;
1425     uint32_t gpuidx;
1426     int r;
1427 
1428     INIT_LIST_HEAD(&ctx->validate_list);
1429     for_each_set_bit(gpuidx, ctx->bitmap, MAX_GPU_INSTANCE) {
1430         pdd = kfd_process_device_from_gpuidx(ctx->process, gpuidx);
1431         if (!pdd) {
1432             pr_debug("failed to find device idx %d\n", gpuidx);
1433             return -EINVAL;
1434         }
1435         vm = drm_priv_to_vm(pdd->drm_priv);
1436 
1437         ctx->tv[gpuidx].bo = &vm->root.bo->tbo;
1438         ctx->tv[gpuidx].num_shared = 4;
1439         list_add(&ctx->tv[gpuidx].head, &ctx->validate_list);
1440     }
1441 
1442     r = ttm_eu_reserve_buffers(&ctx->ticket, &ctx->validate_list,
1443                    ctx->intr, NULL);
1444     if (r) {
1445         pr_debug("failed %d to reserve bo\n", r);
1446         return r;
1447     }
1448 
1449     for_each_set_bit(gpuidx, ctx->bitmap, MAX_GPU_INSTANCE) {
1450         pdd = kfd_process_device_from_gpuidx(ctx->process, gpuidx);
1451         if (!pdd) {
1452             pr_debug("failed to find device idx %d\n", gpuidx);
1453             r = -EINVAL;
1454             goto unreserve_out;
1455         }
1456 
1457         r = amdgpu_vm_validate_pt_bos(pdd->dev->adev,
1458                           drm_priv_to_vm(pdd->drm_priv),
1459                           svm_range_bo_validate, NULL);
1460         if (r) {
1461             pr_debug("failed %d validate pt bos\n", r);
1462             goto unreserve_out;
1463         }
1464     }
1465 
1466     return 0;
1467 
1468 unreserve_out:
1469     ttm_eu_backoff_reservation(&ctx->ticket, &ctx->validate_list);
1470     return r;
1471 }
1472 
1473 static void svm_range_unreserve_bos(struct svm_validate_context *ctx)
1474 {
1475     ttm_eu_backoff_reservation(&ctx->ticket, &ctx->validate_list);
1476 }
1477 
1478 static void *kfd_svm_page_owner(struct kfd_process *p, int32_t gpuidx)
1479 {
1480     struct kfd_process_device *pdd;
1481 
1482     pdd = kfd_process_device_from_gpuidx(p, gpuidx);
1483 
1484     return SVM_ADEV_PGMAP_OWNER(pdd->dev->adev);
1485 }
1486 
1487 /*
1488  * Validation+GPU mapping with concurrent invalidation (MMU notifiers)
1489  *
1490  * To prevent concurrent destruction or change of range attributes, the
1491  * svm_read_lock must be held. The caller must not hold the svm_write_lock
1492  * because that would block concurrent evictions and lead to deadlocks. To
1493  * serialize concurrent migrations or validations of the same range, the
1494  * prange->migrate_mutex must be held.
1495  *
1496  * For VRAM ranges, the SVM BO must be allocated and valid (protected by its
1497  * eviction fence).
1498  *
1499  * The following sequence ensures race-free validation and GPU mapping:
1500  *
1501  * 1. Reserve page table (and SVM BO if range is in VRAM)
1502  * 2. hmm_range_fault to get page addresses (if system memory)
1503  * 3. DMA-map pages (if system memory)
1504  * 4-a. Take notifier lock
1505  * 4-b. Check that pages still valid (mmu_interval_read_retry)
1506  * 4-c. Check that the range was not split or otherwise invalidated
1507  * 4-d. Update GPU page table
1508  * 4-e. Release notifier lock
1509  * 5. Release page table (and SVM BO) reservation
1510  */
1511 static int svm_range_validate_and_map(struct mm_struct *mm,
1512                       struct svm_range *prange, int32_t gpuidx,
1513                       bool intr, bool wait, bool flush_tlb)
1514 {
1515     struct svm_validate_context ctx;
1516     unsigned long start, end, addr;
1517     struct kfd_process *p;
1518     void *owner;
1519     int32_t idx;
1520     int r = 0;
1521 
1522     ctx.process = container_of(prange->svms, struct kfd_process, svms);
1523     ctx.prange = prange;
1524     ctx.intr = intr;
1525 
1526     if (gpuidx < MAX_GPU_INSTANCE) {
1527         bitmap_zero(ctx.bitmap, MAX_GPU_INSTANCE);
1528         bitmap_set(ctx.bitmap, gpuidx, 1);
1529     } else if (ctx.process->xnack_enabled) {
1530         bitmap_copy(ctx.bitmap, prange->bitmap_aip, MAX_GPU_INSTANCE);
1531 
1532         /* If the range is prefetched to a GPU, or a GPU retry fault
1533          * migrates the range to a GPU that has the ACCESS attribute for
1534          * the range, create the mapping on that GPU.
1535          */
1536         if (prange->actual_loc) {
1537             gpuidx = kfd_process_gpuidx_from_gpuid(ctx.process,
1538                             prange->actual_loc);
1539             if (gpuidx < 0) {
1540                 WARN_ONCE(1, "failed get device by id 0x%x\n",
1541                      prange->actual_loc);
1542                 return -EINVAL;
1543             }
1544             if (test_bit(gpuidx, prange->bitmap_access))
1545                 bitmap_set(ctx.bitmap, gpuidx, 1);
1546         }
1547     } else {
1548         bitmap_or(ctx.bitmap, prange->bitmap_access,
1549               prange->bitmap_aip, MAX_GPU_INSTANCE);
1550     }
1551 
1552     if (bitmap_empty(ctx.bitmap, MAX_GPU_INSTANCE)) {
1553         if (!prange->mapped_to_gpu)
1554             return 0;
1555 
1556         bitmap_copy(ctx.bitmap, prange->bitmap_access, MAX_GPU_INSTANCE);
1557     }
1558 
1559     if (prange->actual_loc && !prange->ttm_res) {
1560         /* This should never happen. actual_loc gets set by
1561          * svm_migrate_ram_to_vram after allocating a BO.
1562          */
1563         WARN_ONCE(1, "VRAM BO missing during validation\n");
1564         return -EINVAL;
1565     }
1566 
1567     svm_range_reserve_bos(&ctx);
1568 
1569     p = container_of(prange->svms, struct kfd_process, svms);
1570     owner = kfd_svm_page_owner(p, find_first_bit(ctx.bitmap,
1571                         MAX_GPU_INSTANCE));
1572     for_each_set_bit(idx, ctx.bitmap, MAX_GPU_INSTANCE) {
1573         if (kfd_svm_page_owner(p, idx) != owner) {
1574             owner = NULL;
1575             break;
1576         }
1577     }
1578 
1579     start = prange->start << PAGE_SHIFT;
1580     end = (prange->last + 1) << PAGE_SHIFT;
1581     for (addr = start; addr < end && !r; ) {
1582         struct hmm_range *hmm_range;
1583         struct vm_area_struct *vma;
1584         unsigned long next;
1585         unsigned long offset;
1586         unsigned long npages;
1587         bool readonly;
1588 
1589         vma = find_vma(mm, addr);
1590         if (!vma || addr < vma->vm_start) {
1591             r = -EFAULT;
1592             goto unreserve_out;
1593         }
1594         readonly = !(vma->vm_flags & VM_WRITE);
1595 
1596         next = min(vma->vm_end, end);
1597         npages = (next - addr) >> PAGE_SHIFT;
1598         WRITE_ONCE(p->svms.faulting_task, current);
1599         r = amdgpu_hmm_range_get_pages(&prange->notifier, mm, NULL,
1600                            addr, npages, &hmm_range,
1601                            readonly, true, owner);
1602         WRITE_ONCE(p->svms.faulting_task, NULL);
1603         if (r) {
1604             pr_debug("failed %d to get svm range pages\n", r);
1605             goto unreserve_out;
1606         }
1607 
1608         offset = (addr - start) >> PAGE_SHIFT;
1609         r = svm_range_dma_map(prange, ctx.bitmap, offset, npages,
1610                       hmm_range->hmm_pfns);
1611         if (r) {
1612             pr_debug("failed %d to dma map range\n", r);
1613             goto unreserve_out;
1614         }
1615 
1616         svm_range_lock(prange);
1617         if (amdgpu_hmm_range_get_pages_done(hmm_range)) {
1618             pr_debug("hmm update the range, need validate again\n");
1619             r = -EAGAIN;
1620             goto unlock_out;
1621         }
1622         if (!list_empty(&prange->child_list)) {
1623             pr_debug("range split by unmap in parallel, validate again\n");
1624             r = -EAGAIN;
1625             goto unlock_out;
1626         }
1627 
1628         r = svm_range_map_to_gpus(prange, offset, npages, readonly,
1629                       ctx.bitmap, wait, flush_tlb);
1630 
1631 unlock_out:
1632         svm_range_unlock(prange);
1633 
1634         addr = next;
1635     }
1636 
1637     if (addr == end) {
1638         prange->validated_once = true;
1639         prange->mapped_to_gpu = true;
1640     }
1641 
1642 unreserve_out:
1643     svm_range_unreserve_bos(&ctx);
1644 
1645     if (!r)
1646         prange->validate_timestamp = ktime_get_boottime();
1647 
1648     return r;
1649 }
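
/*
 * Editorial sketch (not part of the driver): one way a caller could drive
 * svm_range_validate_and_map() under the locking rules described in the
 * comment above, revalidating when a concurrent invalidation forces -EAGAIN.
 * The helper name example_validate_retry() is hypothetical; the real callers
 * in this file (svm_range_restore_work, svm_range_restore_pages) handle the
 * retry by rescheduling the work or by relying on another GPU retry fault.
 * The caller is assumed to already hold whatever keeps prange alive
 * (e.g. svms->lock) and must not hold the write side of the svm lock.
 */
static int example_validate_retry(struct mm_struct *mm, struct svm_range *prange)
{
    int r;

    mmap_read_lock(mm);                 /* needed for find_vma()/HMM inside */
    mutex_lock(&prange->migrate_mutex); /* serialize with migration/validation */
    do {
        /* MAX_GPU_INSTANCE: map on all GPUs selected by the range bitmaps */
        r = svm_range_validate_and_map(mm, prange, MAX_GPU_INSTANCE,
                                       false, true, false);
    } while (r == -EAGAIN);             /* pages changed or range split, redo */
    mutex_unlock(&prange->migrate_mutex);
    mmap_read_unlock(mm);

    return r;
}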
1650 
1651 /**
1652  * svm_range_list_lock_and_flush_work - flush pending deferred work
1653  *
1654  * @svms: the svm range list
1655  * @mm: the mm structure
1656  *
1657  * Context: Returns with mmap write lock held, pending deferred work flushed
1658  *
1659  */
1660 void
1661 svm_range_list_lock_and_flush_work(struct svm_range_list *svms,
1662                    struct mm_struct *mm)
1663 {
1664 retry_flush_work:
1665     flush_work(&svms->deferred_list_work);
1666     mmap_write_lock(mm);
1667 
1668     if (list_empty(&svms->deferred_range_list))
1669         return;
1670     mmap_write_unlock(mm);
1671     pr_debug("retry flush\n");
1672     goto retry_flush_work;
1673 }
1674 
1675 static void svm_range_restore_work(struct work_struct *work)
1676 {
1677     struct delayed_work *dwork = to_delayed_work(work);
1678     struct amdkfd_process_info *process_info;
1679     struct svm_range_list *svms;
1680     struct svm_range *prange;
1681     struct kfd_process *p;
1682     struct mm_struct *mm;
1683     int evicted_ranges;
1684     int invalid;
1685     int r;
1686 
1687     svms = container_of(dwork, struct svm_range_list, restore_work);
1688     evicted_ranges = atomic_read(&svms->evicted_ranges);
1689     if (!evicted_ranges)
1690         return;
1691 
1692     pr_debug("restore svm ranges\n");
1693 
1694     p = container_of(svms, struct kfd_process, svms);
1695     process_info = p->kgd_process_info;
1696 
1697     /* Hold an mm reference while svm_range_validate_and_map restores the ranges */
1698     mm = get_task_mm(p->lead_thread);
1699     if (!mm) {
1700         pr_debug("svms 0x%p process mm gone\n", svms);
1701         return;
1702     }
1703 
1704     mutex_lock(&process_info->lock);
1705     svm_range_list_lock_and_flush_work(svms, mm);
1706     mutex_lock(&svms->lock);
1707 
1708     evicted_ranges = atomic_read(&svms->evicted_ranges);
1709 
1710     list_for_each_entry(prange, &svms->list, list) {
1711         invalid = atomic_read(&prange->invalid);
1712         if (!invalid)
1713             continue;
1714 
1715         pr_debug("restoring svms 0x%p prange 0x%p [0x%lx %lx] inv %d\n",
1716              prange->svms, prange, prange->start, prange->last,
1717              invalid);
1718 
1719         /*
1720          * If the range is migrating, wait until the migration is done.
1721          */
1722         mutex_lock(&prange->migrate_mutex);
1723 
1724         r = svm_range_validate_and_map(mm, prange, MAX_GPU_INSTANCE,
1725                            false, true, false);
1726         if (r)
1727             pr_debug("failed %d to map 0x%lx to gpus\n", r,
1728                  prange->start);
1729 
1730         mutex_unlock(&prange->migrate_mutex);
1731         if (r)
1732             goto out_reschedule;
1733 
1734         if (atomic_cmpxchg(&prange->invalid, invalid, 0) != invalid)
1735             goto out_reschedule;
1736     }
1737 
1738     if (atomic_cmpxchg(&svms->evicted_ranges, evicted_ranges, 0) !=
1739         evicted_ranges)
1740         goto out_reschedule;
1741 
1742     evicted_ranges = 0;
1743 
1744     r = kgd2kfd_resume_mm(mm);
1745     if (r) {
1746         /* No recovery from this failure. Probably the CP is
1747          * hanging. No point trying again.
1748          */
1749         pr_debug("failed %d to resume KFD\n", r);
1750     }
1751 
1752     pr_debug("restored svm ranges successfully\n");
1753 
1754 out_reschedule:
1755     mutex_unlock(&svms->lock);
1756     mmap_write_unlock(mm);
1757     mutex_unlock(&process_info->lock);
1758 
1759     /* If validation failed, reschedule another attempt */
1760     if (evicted_ranges) {
1761         pr_debug("reschedule to restore svm range\n");
1762         schedule_delayed_work(&svms->restore_work,
1763             msecs_to_jiffies(AMDGPU_SVM_RANGE_RESTORE_DELAY_MS));
1764 
1765         kfd_smi_event_queue_restore_rescheduled(mm);
1766     }
1767     mmput(mm);
1768 }
1769 
1770 /**
1771  * svm_range_evict - evict svm range
1772  * @prange: svm range structure
1773  * @mm: current process mm_struct
1774  * @start: first page of the invalidated address range, in pages
1775  * @last: last page of the invalidated address range, in pages
1776  *
1777  * Stop all queues of the process to ensure the GPU doesn't access the memory,
1778  * then return to let the CPU evict the buffer and update the CPU page table.
1779  *
1780  * No lock is needed to synchronize the CPU page table invalidation with GPU
1781  * execution. If an invalidation happens while the restore work is running, the
1782  * restore work restarts to pick up the latest CPU page mapping for the GPU
1783  * before restarting the queues.
1784  */
1785 static int
1786 svm_range_evict(struct svm_range *prange, struct mm_struct *mm,
1787         unsigned long start, unsigned long last,
1788         enum mmu_notifier_event event)
1789 {
1790     struct svm_range_list *svms = prange->svms;
1791     struct svm_range *pchild;
1792     struct kfd_process *p;
1793     int r = 0;
1794 
1795     p = container_of(svms, struct kfd_process, svms);
1796 
1797     pr_debug("invalidate svms 0x%p prange [0x%lx 0x%lx] [0x%lx 0x%lx]\n",
1798          svms, prange->start, prange->last, start, last);
1799 
1800     if (!p->xnack_enabled ||
1801         (prange->flags & KFD_IOCTL_SVM_FLAG_GPU_ALWAYS_MAPPED)) {
1802         int evicted_ranges;
1803         bool mapped = prange->mapped_to_gpu;
1804 
1805         list_for_each_entry(pchild, &prange->child_list, child_list) {
1806             if (!pchild->mapped_to_gpu)
1807                 continue;
1808             mapped = true;
1809             mutex_lock_nested(&pchild->lock, 1);
1810             if (pchild->start <= last && pchild->last >= start) {
1811                 pr_debug("increment pchild invalid [0x%lx 0x%lx]\n",
1812                      pchild->start, pchild->last);
1813                 atomic_inc(&pchild->invalid);
1814             }
1815             mutex_unlock(&pchild->lock);
1816         }
1817 
1818         if (!mapped)
1819             return r;
1820 
1821         if (prange->start <= last && prange->last >= start)
1822             atomic_inc(&prange->invalid);
1823 
1824         evicted_ranges = atomic_inc_return(&svms->evicted_ranges);
1825         if (evicted_ranges != 1)
1826             return r;
1827 
1828         pr_debug("evicting svms 0x%p range [0x%lx 0x%lx]\n",
1829              prange->svms, prange->start, prange->last);
1830 
1831         /* First eviction, stop the queues */
1832         r = kgd2kfd_quiesce_mm(mm, KFD_QUEUE_EVICTION_TRIGGER_SVM);
1833         if (r)
1834             pr_debug("failed to quiesce KFD\n");
1835 
1836         pr_debug("schedule to restore svm %p ranges\n", svms);
1837         schedule_delayed_work(&svms->restore_work,
1838             msecs_to_jiffies(AMDGPU_SVM_RANGE_RESTORE_DELAY_MS));
1839     } else {
1840         unsigned long s, l;
1841         uint32_t trigger;
1842 
1843         if (event == MMU_NOTIFY_MIGRATE)
1844             trigger = KFD_SVM_UNMAP_TRIGGER_MMU_NOTIFY_MIGRATE;
1845         else
1846             trigger = KFD_SVM_UNMAP_TRIGGER_MMU_NOTIFY;
1847 
1848         pr_debug("invalidate unmap svms 0x%p [0x%lx 0x%lx] from GPUs\n",
1849              prange->svms, start, last);
1850         list_for_each_entry(pchild, &prange->child_list, child_list) {
1851             mutex_lock_nested(&pchild->lock, 1);
1852             s = max(start, pchild->start);
1853             l = min(last, pchild->last);
1854             if (l >= s)
1855                 svm_range_unmap_from_gpus(pchild, s, l, trigger);
1856             mutex_unlock(&pchild->lock);
1857         }
1858         s = max(start, prange->start);
1859         l = min(last, prange->last);
1860         if (l >= s)
1861             svm_range_unmap_from_gpus(prange, s, l, trigger);
1862     }
1863 
1864     return r;
1865 }
1866 
1867 static struct svm_range *svm_range_clone(struct svm_range *old)
1868 {
1869     struct svm_range *new;
1870 
1871     new = svm_range_new(old->svms, old->start, old->last, false);
1872     if (!new)
1873         return NULL;
1874 
1875     if (old->svm_bo) {
1876         new->ttm_res = old->ttm_res;
1877         new->offset = old->offset;
1878         new->svm_bo = svm_range_bo_ref(old->svm_bo);
1879         spin_lock(&new->svm_bo->list_lock);
1880         list_add(&new->svm_bo_list, &new->svm_bo->range_list);
1881         spin_unlock(&new->svm_bo->list_lock);
1882     }
1883     new->flags = old->flags;
1884     new->preferred_loc = old->preferred_loc;
1885     new->prefetch_loc = old->prefetch_loc;
1886     new->actual_loc = old->actual_loc;
1887     new->granularity = old->granularity;
1888     new->mapped_to_gpu = old->mapped_to_gpu;
1889     bitmap_copy(new->bitmap_access, old->bitmap_access, MAX_GPU_INSTANCE);
1890     bitmap_copy(new->bitmap_aip, old->bitmap_aip, MAX_GPU_INSTANCE);
1891 
1892     return new;
1893 }
1894 
1895 void svm_range_set_max_pages(struct amdgpu_device *adev)
1896 {
1897     uint64_t max_pages;
1898     uint64_t pages, _pages;
1899 
1900     /* 1/32 VRAM size in pages */
1901     pages = adev->gmc.real_vram_size >> 17;
1902     pages = clamp(pages, 1ULL << 9, 1ULL << 18);
1903     pages = rounddown_pow_of_two(pages);
1904     do {
1905         max_pages = READ_ONCE(max_svm_range_pages);
1906         _pages = min_not_zero(max_pages, pages);
1907     } while (cmpxchg(&max_svm_range_pages, max_pages, _pages) != max_pages);
1908 }
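
/*
 * Editorial worked example (hypothetical device, assuming 4 KiB pages): with
 * 16 GiB of VRAM, real_vram_size >> 17 = 2^34 >> 17 = 131072 pages, which is
 * 1/32 of the VRAM (512 MiB). 131072 lies inside the clamp [2^9, 2^18]
 * (2 MiB .. 1 GiB) and is already a power of two, so the cmpxchg loop above
 * lowers the global max_svm_range_pages to min_not_zero(old value, 131072),
 * i.e. the smallest such limit across all dGPUs/APUs in the system.
 */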
1909 
1910 static int
1911 svm_range_split_new(struct svm_range_list *svms, uint64_t start, uint64_t last,
1912             uint64_t max_pages, struct list_head *insert_list,
1913             struct list_head *update_list)
1914 {
1915     struct svm_range *prange;
1916     uint64_t l;
1917 
1918     pr_debug("max_svm_range_pages 0x%llx adding [0x%llx 0x%llx]\n",
1919          max_pages, start, last);
1920 
1921     while (last >= start) {
1922         l = min(last, ALIGN_DOWN(start + max_pages, max_pages) - 1);
1923 
1924         prange = svm_range_new(svms, start, l, true);
1925         if (!prange)
1926             return -ENOMEM;
1927         list_add(&prange->list, insert_list);
1928         list_add(&prange->update_list, update_list);
1929 
1930         start = l + 1;
1931     }
1932     return 0;
1933 }
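
/*
 * Editorial worked example: with max_pages = 512 (2 MiB worth of 4 KiB pages),
 * a new range covering pages [0x300, 0x900] is carved into chunks whose ends
 * fall on max_pages-aligned boundaries:
 *     [0x300, 0x3ff], [0x400, 0x5ff], [0x600, 0x7ff], [0x800, 0x900]
 * Each chunk becomes its own svm_range and is queued on both insert_list and
 * update_list by the loop above.
 */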
1934 
1935 /**
1936  * svm_range_add - add svm range and handle overlap
1937  * @p: the process to add the range to
1938  * @start: range start address, in pages
1939  * @size: range size, in pages
1940  * @nattr: number of attributes
1941  * @attrs: array of attributes
1942  * @update_list: output, the ranges that need validation and GPU mapping update
1943  * @insert_list: output, the ranges that need to be inserted into svms
1944  * @remove_list: output, the ranges that are replaced and must be removed from svms
1945  *
1946  * Check if the virtual address range has overlap with any existing ranges,
1947  * split partly overlapping ranges and add new ranges in the gaps. All changes
1948  * should be applied to the range_list and interval tree transactionally. If
1949  * any range split or allocation fails, the entire update fails. Therefore any
1950  * existing overlapping svm_ranges are cloned and the original svm_ranges are
1951  * left unchanged.
1952  *
1953  * If the transaction succeeds, the caller can update and insert clones and
1954  * new ranges, then free the originals.
1955  *
1956  * Otherwise the caller can free the clones and new ranges, while the old
1957  * svm_ranges remain unchanged.
1958  *
1959  * Context: Process context, caller must hold svms->lock
1960  *
1961  * Return:
1962  * 0 - OK, otherwise error code
1963  */
1964 static int
1965 svm_range_add(struct kfd_process *p, uint64_t start, uint64_t size,
1966           uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs,
1967           struct list_head *update_list, struct list_head *insert_list,
1968           struct list_head *remove_list)
1969 {
1970     unsigned long last = start + size - 1UL;
1971     struct svm_range_list *svms = &p->svms;
1972     struct interval_tree_node *node;
1973     struct svm_range *prange;
1974     struct svm_range *tmp;
1975     struct list_head new_list;
1976     int r = 0;
1977 
1978     pr_debug("svms 0x%p [0x%llx 0x%lx]\n", &p->svms, start, last);
1979 
1980     INIT_LIST_HEAD(update_list);
1981     INIT_LIST_HEAD(insert_list);
1982     INIT_LIST_HEAD(remove_list);
1983     INIT_LIST_HEAD(&new_list);
1984 
1985     node = interval_tree_iter_first(&svms->objects, start, last);
1986     while (node) {
1987         struct interval_tree_node *next;
1988         unsigned long next_start;
1989 
1990         pr_debug("found overlap node [0x%lx 0x%lx]\n", node->start,
1991              node->last);
1992 
1993         prange = container_of(node, struct svm_range, it_node);
1994         next = interval_tree_iter_next(node, start, last);
1995         next_start = min(node->last, last) + 1;
1996 
1997         if (svm_range_is_same_attrs(p, prange, nattr, attrs)) {
1998             /* nothing to do */
1999         } else if (node->start < start || node->last > last) {
2000             /* node intersects the update range and its attributes
2001              * will change. Clone and split it, apply updates only
2002              * to the overlapping part
2003              */
2004             struct svm_range *old = prange;
2005 
2006             prange = svm_range_clone(old);
2007             if (!prange) {
2008                 r = -ENOMEM;
2009                 goto out;
2010             }
2011 
2012             list_add(&old->update_list, remove_list);
2013             list_add(&prange->list, insert_list);
2014             list_add(&prange->update_list, update_list);
2015 
2016             if (node->start < start) {
2017                 pr_debug("change old range start\n");
2018                 r = svm_range_split_head(prange, start,
2019                              insert_list);
2020                 if (r)
2021                     goto out;
2022             }
2023             if (node->last > last) {
2024                 pr_debug("change old range last\n");
2025                 r = svm_range_split_tail(prange, last,
2026                              insert_list);
2027                 if (r)
2028                     goto out;
2029             }
2030         } else {
2031             /* The node is contained within start..last,
2032              * just update it
2033              */
2034             list_add(&prange->update_list, update_list);
2035         }
2036 
2037         /* insert a new node if needed */
2038         if (node->start > start) {
2039             r = svm_range_split_new(svms, start, node->start - 1,
2040                         READ_ONCE(max_svm_range_pages),
2041                         &new_list, update_list);
2042             if (r)
2043                 goto out;
2044         }
2045 
2046         node = next;
2047         start = next_start;
2048     }
2049 
2050     /* add a final range at the end if needed */
2051     if (start <= last)
2052         r = svm_range_split_new(svms, start, last,
2053                     READ_ONCE(max_svm_range_pages),
2054                     &new_list, update_list);
2055 
2056 out:
2057     if (r) {
2058         list_for_each_entry_safe(prange, tmp, insert_list, list)
2059             svm_range_free(prange, false);
2060         list_for_each_entry_safe(prange, tmp, &new_list, list)
2061             svm_range_free(prange, true);
2062     } else {
2063         list_splice(&new_list, insert_list);
2064     }
2065 
2066     return r;
2067 }
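
/*
 * Editorial sketch (not part of the driver): how a caller might consume the
 * transactional output lists of svm_range_add(), per the contract documented
 * above. The helper name example_commit_ranges() is hypothetical; the real
 * consumer (the set-attributes ioctl path elsewhere in this file) also
 * registers MMU notifiers and validates/maps the updated ranges, which is
 * omitted here. svm_range_add() initializes all three output lists and frees
 * everything it allocated on failure.
 */
static int example_commit_ranges(struct kfd_process *p, uint64_t start,
                                 uint64_t size, uint32_t nattr,
                                 struct kfd_ioctl_svm_attribute *attrs)
{
    struct list_head update_list, insert_list, remove_list;
    struct svm_range *prange, *next;
    int r;

    mutex_lock(&p->svms.lock);
    r = svm_range_add(p, start, size, nattr, attrs,
                      &update_list, &insert_list, &remove_list);
    if (r) {
        /* Transaction failed: originals untouched, nothing to clean up here */
        mutex_unlock(&p->svms.lock);
        return r;
    }

    /* Commit: insert the clones and newly created ranges into svms ... */
    list_for_each_entry_safe(prange, next, &insert_list, list)
        svm_range_add_to_svms(prange);

    /* ... and unlink and free the originals that were replaced by clones */
    list_for_each_entry_safe(prange, next, &remove_list, update_list) {
        svm_range_unlink(prange);
        svm_range_remove_notifier(prange);
        svm_range_free(prange, false);  /* accounting carried by the clone */
    }
    mutex_unlock(&p->svms.lock);

    return 0;
}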
2068 
2069 static void
2070 svm_range_update_notifier_and_interval_tree(struct mm_struct *mm,
2071                         struct svm_range *prange)
2072 {
2073     unsigned long start;
2074     unsigned long last;
2075 
2076     start = prange->notifier.interval_tree.start >> PAGE_SHIFT;
2077     last = prange->notifier.interval_tree.last >> PAGE_SHIFT;
2078 
2079     if (prange->start == start && prange->last == last)
2080         return;
2081 
2082     pr_debug("up notifier 0x%p prange 0x%p [0x%lx 0x%lx] [0x%lx 0x%lx]\n",
2083           prange->svms, prange, start, last, prange->start,
2084           prange->last);
2085 
2086     if (start != 0 && last != 0) {
2087         interval_tree_remove(&prange->it_node, &prange->svms->objects);
2088         svm_range_remove_notifier(prange);
2089     }
2090     prange->it_node.start = prange->start;
2091     prange->it_node.last = prange->last;
2092 
2093     interval_tree_insert(&prange->it_node, &prange->svms->objects);
2094     svm_range_add_notifier_locked(mm, prange);
2095 }
2096 
2097 static void
2098 svm_range_handle_list_op(struct svm_range_list *svms, struct svm_range *prange,
2099              struct mm_struct *mm)
2100 {
2101     switch (prange->work_item.op) {
2102     case SVM_OP_NULL:
2103         pr_debug("NULL OP 0x%p prange 0x%p [0x%lx 0x%lx]\n",
2104              svms, prange, prange->start, prange->last);
2105         break;
2106     case SVM_OP_UNMAP_RANGE:
2107         pr_debug("remove 0x%p prange 0x%p [0x%lx 0x%lx]\n",
2108              svms, prange, prange->start, prange->last);
2109         svm_range_unlink(prange);
2110         svm_range_remove_notifier(prange);
2111         svm_range_free(prange, true);
2112         break;
2113     case SVM_OP_UPDATE_RANGE_NOTIFIER:
2114         pr_debug("update notifier 0x%p prange 0x%p [0x%lx 0x%lx]\n",
2115              svms, prange, prange->start, prange->last);
2116         svm_range_update_notifier_and_interval_tree(mm, prange);
2117         break;
2118     case SVM_OP_UPDATE_RANGE_NOTIFIER_AND_MAP:
2119         pr_debug("update and map 0x%p prange 0x%p [0x%lx 0x%lx]\n",
2120              svms, prange, prange->start, prange->last);
2121         svm_range_update_notifier_and_interval_tree(mm, prange);
2122         /* TODO: implement deferred validation and mapping */
2123         break;
2124     case SVM_OP_ADD_RANGE:
2125         pr_debug("add 0x%p prange 0x%p [0x%lx 0x%lx]\n", svms, prange,
2126              prange->start, prange->last);
2127         svm_range_add_to_svms(prange);
2128         svm_range_add_notifier_locked(mm, prange);
2129         break;
2130     case SVM_OP_ADD_RANGE_AND_MAP:
2131         pr_debug("add and map 0x%p prange 0x%p [0x%lx 0x%lx]\n", svms,
2132              prange, prange->start, prange->last);
2133         svm_range_add_to_svms(prange);
2134         svm_range_add_notifier_locked(mm, prange);
2135         /* TODO: implement deferred validation and mapping */
2136         break;
2137     default:
2138         WARN_ONCE(1, "Unknown prange 0x%p work op %d\n", prange,
2139              prange->work_item.op);
2140     }
2141 }
2142 
2143 static void svm_range_drain_retry_fault(struct svm_range_list *svms)
2144 {
2145     struct kfd_process_device *pdd;
2146     struct kfd_process *p;
2147     int drain;
2148     uint32_t i;
2149 
2150     p = container_of(svms, struct kfd_process, svms);
2151 
2152 restart:
2153     drain = atomic_read(&svms->drain_pagefaults);
2154     if (!drain)
2155         return;
2156 
2157     for_each_set_bit(i, svms->bitmap_supported, p->n_pdds) {
2158         pdd = p->pdds[i];
2159         if (!pdd)
2160             continue;
2161 
2162         pr_debug("drain retry fault gpu %d svms %p\n", i, svms);
2163 
2164         amdgpu_ih_wait_on_checkpoint_process_ts(pdd->dev->adev,
2165                              &pdd->dev->adev->irq.ih1);
2166         pr_debug("drain retry fault gpu %d svms 0x%p done\n", i, svms);
2167     }
2168     if (atomic_cmpxchg(&svms->drain_pagefaults, drain, 0) != drain)
2169         goto restart;
2170 }
2171 
2172 static void svm_range_deferred_list_work(struct work_struct *work)
2173 {
2174     struct svm_range_list *svms;
2175     struct svm_range *prange;
2176     struct mm_struct *mm;
2177 
2178     svms = container_of(work, struct svm_range_list, deferred_list_work);
2179     pr_debug("enter svms 0x%p\n", svms);
2180 
2181     spin_lock(&svms->deferred_list_lock);
2182     while (!list_empty(&svms->deferred_range_list)) {
2183         prange = list_first_entry(&svms->deferred_range_list,
2184                       struct svm_range, deferred_list);
2185         spin_unlock(&svms->deferred_list_lock);
2186 
2187         pr_debug("prange 0x%p [0x%lx 0x%lx] op %d\n", prange,
2188              prange->start, prange->last, prange->work_item.op);
2189 
2190         mm = prange->work_item.mm;
2191 retry:
2192         mmap_write_lock(mm);
2193 
2194         /* Checking for the need to drain retry faults must be inside
2195          * mmap write lock to serialize with munmap notifiers.
2196          */
2197         if (unlikely(atomic_read(&svms->drain_pagefaults))) {
2198             mmap_write_unlock(mm);
2199             svm_range_drain_retry_fault(svms);
2200             goto retry;
2201         }
2202 
2203         /* Removal from deferred_list must happen inside the mmap write lock,
2204          * to avoid two races:
2205          * 1. unmap_from_cpu may change work_item.op and add the range
2206          *    to deferred_list again, causing a use-after-free bug.
2207          * 2. svm_range_list_lock_and_flush_work may hold mmap write
2208          *    lock and continue because deferred_list is empty, but
2209          *    deferred_list work is actually waiting for mmap lock.
2210          */
2211         spin_lock(&svms->deferred_list_lock);
2212         list_del_init(&prange->deferred_list);
2213         spin_unlock(&svms->deferred_list_lock);
2214 
2215         mutex_lock(&svms->lock);
2216         mutex_lock(&prange->migrate_mutex);
2217         while (!list_empty(&prange->child_list)) {
2218             struct svm_range *pchild;
2219 
2220             pchild = list_first_entry(&prange->child_list,
2221                         struct svm_range, child_list);
2222             pr_debug("child prange 0x%p op %d\n", pchild,
2223                  pchild->work_item.op);
2224             list_del_init(&pchild->child_list);
2225             svm_range_handle_list_op(svms, pchild, mm);
2226         }
2227         mutex_unlock(&prange->migrate_mutex);
2228 
2229         svm_range_handle_list_op(svms, prange, mm);
2230         mutex_unlock(&svms->lock);
2231         mmap_write_unlock(mm);
2232 
2233         /* Pairs with mmget in svm_range_add_list_work */
2234         mmput(mm);
2235 
2236         spin_lock(&svms->deferred_list_lock);
2237     }
2238     spin_unlock(&svms->deferred_list_lock);
2239     pr_debug("exit svms 0x%p\n", svms);
2240 }
2241 
2242 void
2243 svm_range_add_list_work(struct svm_range_list *svms, struct svm_range *prange,
2244             struct mm_struct *mm, enum svm_work_list_ops op)
2245 {
2246     spin_lock(&svms->deferred_list_lock);
2247     /* if prange is on the deferred list */
2248     if (!list_empty(&prange->deferred_list)) {
2249         pr_debug("update exist prange 0x%p work op %d\n", prange, op);
2250         WARN_ONCE(prange->work_item.mm != mm, "mismatched mm\n");
2251         if (op != SVM_OP_NULL &&
2252             prange->work_item.op != SVM_OP_UNMAP_RANGE)
2253             prange->work_item.op = op;
2254     } else {
2255         prange->work_item.op = op;
2256 
2257         /* Pairs with mmput in deferred_list_work */
2258         mmget(mm);
2259         prange->work_item.mm = mm;
2260         list_add_tail(&prange->deferred_list,
2261                   &prange->svms->deferred_range_list);
2262         pr_debug("add prange 0x%p [0x%lx 0x%lx] to work list op %d\n",
2263              prange, prange->start, prange->last, op);
2264     }
2265     spin_unlock(&svms->deferred_list_lock);
2266 }
2267 
2268 void schedule_deferred_list_work(struct svm_range_list *svms)
2269 {
2270     spin_lock(&svms->deferred_list_lock);
2271     if (!list_empty(&svms->deferred_range_list))
2272         schedule_work(&svms->deferred_list_work);
2273     spin_unlock(&svms->deferred_list_lock);
2274 }
2275 
2276 static void
2277 svm_range_unmap_split(struct mm_struct *mm, struct svm_range *parent,
2278               struct svm_range *prange, unsigned long start,
2279               unsigned long last)
2280 {
2281     struct svm_range *head;
2282     struct svm_range *tail;
2283 
2284     if (prange->work_item.op == SVM_OP_UNMAP_RANGE) {
2285         pr_debug("prange 0x%p [0x%lx 0x%lx] is already freed\n", prange,
2286              prange->start, prange->last);
2287         return;
2288     }
2289     if (start > prange->last || last < prange->start)
2290         return;
2291 
2292     head = tail = prange;
2293     if (start > prange->start)
2294         svm_range_split(prange, prange->start, start - 1, &tail);
2295     if (last < tail->last)
2296         svm_range_split(tail, last + 1, tail->last, &head);
2297 
2298     if (head != prange && tail != prange) {
2299         svm_range_add_child(parent, mm, head, SVM_OP_UNMAP_RANGE);
2300         svm_range_add_child(parent, mm, tail, SVM_OP_ADD_RANGE);
2301     } else if (tail != prange) {
2302         svm_range_add_child(parent, mm, tail, SVM_OP_UNMAP_RANGE);
2303     } else if (head != prange) {
2304         svm_range_add_child(parent, mm, head, SVM_OP_UNMAP_RANGE);
2305     } else if (parent != prange) {
2306         prange->work_item.op = SVM_OP_UNMAP_RANGE;
2307     }
2308 }
2309 
2310 static void
2311 svm_range_unmap_from_cpu(struct mm_struct *mm, struct svm_range *prange,
2312              unsigned long start, unsigned long last)
2313 {
2314     uint32_t trigger = KFD_SVM_UNMAP_TRIGGER_UNMAP_FROM_CPU;
2315     struct svm_range_list *svms;
2316     struct svm_range *pchild;
2317     struct kfd_process *p;
2318     unsigned long s, l;
2319     bool unmap_parent;
2320 
2321     p = kfd_lookup_process_by_mm(mm);
2322     if (!p)
2323         return;
2324     svms = &p->svms;
2325 
2326     pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx] [0x%lx 0x%lx]\n", svms,
2327          prange, prange->start, prange->last, start, last);
2328 
2329     /* Make sure pending page faults are drained in the deferred worker
2330      * before the range is freed to avoid straggler interrupts on
2331      * unmapped memory causing "phantom faults".
2332      */
2333     atomic_inc(&svms->drain_pagefaults);
2334 
2335     unmap_parent = start <= prange->start && last >= prange->last;
2336 
2337     list_for_each_entry(pchild, &prange->child_list, child_list) {
2338         mutex_lock_nested(&pchild->lock, 1);
2339         s = max(start, pchild->start);
2340         l = min(last, pchild->last);
2341         if (l >= s)
2342             svm_range_unmap_from_gpus(pchild, s, l, trigger);
2343         svm_range_unmap_split(mm, prange, pchild, start, last);
2344         mutex_unlock(&pchild->lock);
2345     }
2346     s = max(start, prange->start);
2347     l = min(last, prange->last);
2348     if (l >= s)
2349         svm_range_unmap_from_gpus(prange, s, l, trigger);
2350     svm_range_unmap_split(mm, prange, prange, start, last);
2351 
2352     if (unmap_parent)
2353         svm_range_add_list_work(svms, prange, mm, SVM_OP_UNMAP_RANGE);
2354     else
2355         svm_range_add_list_work(svms, prange, mm,
2356                     SVM_OP_UPDATE_RANGE_NOTIFIER);
2357     schedule_deferred_list_work(svms);
2358 
2359     kfd_unref_process(p);
2360 }
2361 
2362 /**
2363  * svm_range_cpu_invalidate_pagetables - interval notifier callback
2364  * @mni: mmu_interval_notifier struct
2365  * @range: mmu_notifier_range struct
2366  * @cur_seq: value to pass to mmu_interval_set_seq()
2367  *
2368  * If the event is MMU_NOTIFY_UNMAP, the callback comes from a CPU unmap of the
2369  * range; otherwise it comes from migration or a CPU page invalidation callback.
2370  *
2371  * For an unmap event, unmap the range from the GPUs, remove the prange from svms
2372  * in a deferred work thread, and split the prange if only part of it is unmapped.
2373  *
2374  * For an invalidation event, if GPU retry faults are not enabled, evict the
2375  * queues, then schedule svm_range_restore_work to update the GPU mapping and
2376  * resume the queues. If GPU retry faults are enabled, unmap the svm range from
2377  * the GPU; the retry fault will update the GPU mapping to recover.
2378  *
2379  * Context: the mmap lock and the notifier_invalidate_start lock are held for
2380  *          the invalidate event; the prange lock is held if this is from migration
2381  */
2382 static bool
2383 svm_range_cpu_invalidate_pagetables(struct mmu_interval_notifier *mni,
2384                     const struct mmu_notifier_range *range,
2385                     unsigned long cur_seq)
2386 {
2387     struct svm_range *prange;
2388     unsigned long start;
2389     unsigned long last;
2390 
2391     if (range->event == MMU_NOTIFY_RELEASE)
2392         return true;
2393     if (!mmget_not_zero(mni->mm))
2394         return true;
2395 
2396     start = mni->interval_tree.start;
2397     last = mni->interval_tree.last;
2398     start = max(start, range->start) >> PAGE_SHIFT;
2399     last = min(last, range->end - 1) >> PAGE_SHIFT;
2400     pr_debug("[0x%lx 0x%lx] range[0x%lx 0x%lx] notifier[0x%lx 0x%lx] %d\n",
2401          start, last, range->start >> PAGE_SHIFT,
2402          (range->end - 1) >> PAGE_SHIFT,
2403          mni->interval_tree.start >> PAGE_SHIFT,
2404          mni->interval_tree.last >> PAGE_SHIFT, range->event);
2405 
2406     prange = container_of(mni, struct svm_range, notifier);
2407 
2408     svm_range_lock(prange);
2409     mmu_interval_set_seq(mni, cur_seq);
2410 
2411     switch (range->event) {
2412     case MMU_NOTIFY_UNMAP:
2413         svm_range_unmap_from_cpu(mni->mm, prange, start, last);
2414         break;
2415     default:
2416         svm_range_evict(prange, mni->mm, start, last, range->event);
2417         break;
2418     }
2419 
2420     svm_range_unlock(prange);
2421     mmput(mni->mm);
2422 
2423     return true;
2424 }
2425 
2426 /**
2427  * svm_range_from_addr - find svm range from fault address
2428  * @svms: svm range list header
2429  * @addr: address to search range interval tree, in pages
2430  * @parent: parent range if range is on child list
2431  *
2432  * Context: The caller must hold svms->lock
2433  *
2434  * Return: the svm_range found or NULL
2435  */
2436 struct svm_range *
2437 svm_range_from_addr(struct svm_range_list *svms, unsigned long addr,
2438             struct svm_range **parent)
2439 {
2440     struct interval_tree_node *node;
2441     struct svm_range *prange;
2442     struct svm_range *pchild;
2443 
2444     node = interval_tree_iter_first(&svms->objects, addr, addr);
2445     if (!node)
2446         return NULL;
2447 
2448     prange = container_of(node, struct svm_range, it_node);
2449     pr_debug("address 0x%lx prange [0x%lx 0x%lx] node [0x%lx 0x%lx]\n",
2450          addr, prange->start, prange->last, node->start, node->last);
2451 
2452     if (addr >= prange->start && addr <= prange->last) {
2453         if (parent)
2454             *parent = prange;
2455         return prange;
2456     }
2457     list_for_each_entry(pchild, &prange->child_list, child_list)
2458         if (addr >= pchild->start && addr <= pchild->last) {
2459             pr_debug("found address 0x%lx pchild [0x%lx 0x%lx]\n",
2460                  addr, pchild->start, pchild->last);
2461             if (parent)
2462                 *parent = prange;
2463             return pchild;
2464         }
2465 
2466     return NULL;
2467 }
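
/*
 * Editorial sketch (not part of the driver): minimal use of
 * svm_range_from_addr() under the documented requirement that svms->lock is
 * held for the lookup. The helper name example_page_is_registered() is
 * hypothetical; the GPU fault handler below does the real work, including
 * creating unregistered ranges and migrating to the best restore location.
 */
static bool example_page_is_registered(struct svm_range_list *svms,
                                       unsigned long addr)
{
    struct svm_range *prange;

    mutex_lock(&svms->lock);
    prange = svm_range_from_addr(svms, addr, NULL);
    if (prange)
        pr_debug("page 0x%lx is in range [0x%lx 0x%lx]\n",
                 addr, prange->start, prange->last);
    mutex_unlock(&svms->lock);

    return prange != NULL;
}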
2468 
2469 /* svm_range_best_restore_location - decide the best fault restore location
2470  * @prange: svm range structure
2471  * @adev: the GPU on which vm fault happened
2472  *
2473  * This is only called when xnack is on, to decide the best location to restore
2474  * the range mapping after a GPU vm fault. The caller uses the best location to
2475  * migrate the range if the actual location is not the best location, then
2476  * updates the GPU page table mapping to the best location.
2477  *
2478  * If the preferred loc is accessible by the faulting GPU, use the preferred loc.
2479  * If the faulting GPU is in the range's ACCESSIBLE bitmap, best_loc is that GPU.
2480  * If the faulting GPU is in the range's ACCESSIBLE_IN_PLACE bitmap, then
2481  *    if the range's actual loc is the CPU, best_loc is the CPU;
2482  *    if the faulting GPU is in the same XGMI hive as the actual loc GPU,
2483  *    best_loc is the range's actual loc.
2484  * Otherwise the GPU has no access and best_loc is -1.
2485  *
2486  * Return:
2487  * -1 if the faulting GPU has no access
2488  * 0 for the CPU, otherwise the GPU id
2489  */
2490 static int32_t
2491 svm_range_best_restore_location(struct svm_range *prange,
2492                 struct amdgpu_device *adev,
2493                 int32_t *gpuidx)
2494 {
2495     struct amdgpu_device *bo_adev, *preferred_adev;
2496     struct kfd_process *p;
2497     uint32_t gpuid;
2498     int r;
2499 
2500     p = container_of(prange->svms, struct kfd_process, svms);
2501 
2502     r = kfd_process_gpuid_from_adev(p, adev, &gpuid, gpuidx);
2503     if (r < 0) {
2504         pr_debug("failed to get gpuid from kgd\n");
2505         return -1;
2506     }
2507 
2508     if (prange->preferred_loc == gpuid ||
2509         prange->preferred_loc == KFD_IOCTL_SVM_LOCATION_SYSMEM) {
2510         return prange->preferred_loc;
2511     } else if (prange->preferred_loc != KFD_IOCTL_SVM_LOCATION_UNDEFINED) {
2512         preferred_adev = svm_range_get_adev_by_id(prange,
2513                             prange->preferred_loc);
2514         if (amdgpu_xgmi_same_hive(adev, preferred_adev))
2515             return prange->preferred_loc;
2516         /* fall through */
2517     }
2518 
2519     if (test_bit(*gpuidx, prange->bitmap_access))
2520         return gpuid;
2521 
2522     if (test_bit(*gpuidx, prange->bitmap_aip)) {
2523         if (!prange->actual_loc)
2524             return 0;
2525 
2526         bo_adev = svm_range_get_adev_by_id(prange, prange->actual_loc);
2527         if (amdgpu_xgmi_same_hive(adev, bo_adev))
2528             return prange->actual_loc;
2529         else
2530             return 0;
2531     }
2532 
2533     return -1;
2534 }
2535 
2536 static int
2537 svm_range_get_range_boundaries(struct kfd_process *p, int64_t addr,
2538                    unsigned long *start, unsigned long *last,
2539                    bool *is_heap_stack)
2540 {
2541     struct vm_area_struct *vma;
2542     struct interval_tree_node *node;
2543     unsigned long start_limit, end_limit;
2544 
2545     vma = find_vma(p->mm, addr << PAGE_SHIFT);
2546     if (!vma || (addr << PAGE_SHIFT) < vma->vm_start) {
2547         pr_debug("VMA does not exist in address [0x%llx]\n", addr);
2548         return -EFAULT;
2549     }
2550 
2551     *is_heap_stack = (vma->vm_start <= vma->vm_mm->brk &&
2552               vma->vm_end >= vma->vm_mm->start_brk) ||
2553              (vma->vm_start <= vma->vm_mm->start_stack &&
2554               vma->vm_end >= vma->vm_mm->start_stack);
2555 
2556     start_limit = max(vma->vm_start >> PAGE_SHIFT,
2557               (unsigned long)ALIGN_DOWN(addr, 2UL << 8));
2558     end_limit = min(vma->vm_end >> PAGE_SHIFT,
2559             (unsigned long)ALIGN(addr + 1, 2UL << 8));
2560     /* First range that starts after the fault address */
2561     node = interval_tree_iter_first(&p->svms.objects, addr + 1, ULONG_MAX);
2562     if (node) {
2563         end_limit = min(end_limit, node->start);
2564         /* Last range that ends before the fault address */
2565         node = container_of(rb_prev(&node->rb),
2566                     struct interval_tree_node, rb);
2567     } else {
2568         /* Last range must end before addr because
2569          * there was no range after addr
2570          */
2571         node = container_of(rb_last(&p->svms.objects.rb_root),
2572                     struct interval_tree_node, rb);
2573     }
2574     if (node) {
2575         if (node->last >= addr) {
2576             WARN(1, "Overlap with prev node and page fault addr\n");
2577             return -EFAULT;
2578         }
2579         start_limit = max(start_limit, node->last + 1);
2580     }
2581 
2582     *start = start_limit;
2583     *last = end_limit - 1;
2584 
2585     pr_debug("vma [0x%lx 0x%lx] range [0x%lx 0x%lx] is_heap_stack %d\n",
2586          vma->vm_start >> PAGE_SHIFT, vma->vm_end >> PAGE_SHIFT,
2587          *start, *last, *is_heap_stack);
2588 
2589     return 0;
2590 }
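
/*
 * Editorial worked example (assuming 4 KiB pages): 2UL << 8 is 512 pages,
 * i.e. a 2 MiB window. For a fault at page addr = 0x12345 the initial window
 * is [ALIGN_DOWN(0x12345, 0x200), ALIGN(0x12346, 0x200)) = [0x12200, 0x12400),
 * which is then shrunk to stay inside the containing VMA and inside the gap
 * between the neighbouring registered svm ranges, giving *start and *last.
 */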
2591 
2592 static int
2593 svm_range_check_vm_userptr(struct kfd_process *p, uint64_t start, uint64_t last,
2594                uint64_t *bo_s, uint64_t *bo_l)
2595 {
2596     struct amdgpu_bo_va_mapping *mapping;
2597     struct interval_tree_node *node;
2598     struct amdgpu_bo *bo = NULL;
2599     unsigned long userptr;
2600     uint32_t i;
2601     int r;
2602 
2603     for (i = 0; i < p->n_pdds; i++) {
2604         struct amdgpu_vm *vm;
2605 
2606         if (!p->pdds[i]->drm_priv)
2607             continue;
2608 
2609         vm = drm_priv_to_vm(p->pdds[i]->drm_priv);
2610         r = amdgpu_bo_reserve(vm->root.bo, false);
2611         if (r)
2612             return r;
2613 
2614         /* Check userptr by searching entire vm->va interval tree */
2615         node = interval_tree_iter_first(&vm->va, 0, ~0ULL);
2616         while (node) {
2617             mapping = container_of((struct rb_node *)node,
2618                            struct amdgpu_bo_va_mapping, rb);
2619             bo = mapping->bo_va->base.bo;
2620 
2621             if (!amdgpu_ttm_tt_affect_userptr(bo->tbo.ttm,
2622                              start << PAGE_SHIFT,
2623                              last << PAGE_SHIFT,
2624                              &userptr)) {
2625                 node = interval_tree_iter_next(node, 0, ~0ULL);
2626                 continue;
2627             }
2628 
2629             pr_debug("[0x%llx 0x%llx] already userptr mapped\n",
2630                  start, last);
2631             if (bo_s && bo_l) {
2632                 *bo_s = userptr >> PAGE_SHIFT;
2633                 *bo_l = *bo_s + bo->tbo.ttm->num_pages - 1;
2634             }
2635             amdgpu_bo_unreserve(vm->root.bo);
2636             return -EADDRINUSE;
2637         }
2638         amdgpu_bo_unreserve(vm->root.bo);
2639     }
2640     return 0;
2641 }
2642 
2643 static struct
2644 svm_range *svm_range_create_unregistered_range(struct amdgpu_device *adev,
2645                         struct kfd_process *p,
2646                         struct mm_struct *mm,
2647                         int64_t addr)
2648 {
2649     struct svm_range *prange = NULL;
2650     unsigned long start, last;
2651     uint32_t gpuid, gpuidx;
2652     bool is_heap_stack;
2653     uint64_t bo_s = 0;
2654     uint64_t bo_l = 0;
2655     int r;
2656 
2657     if (svm_range_get_range_boundaries(p, addr, &start, &last,
2658                        &is_heap_stack))
2659         return NULL;
2660 
2661     r = svm_range_check_vm(p, start, last, &bo_s, &bo_l);
2662     if (r != -EADDRINUSE)
2663         r = svm_range_check_vm_userptr(p, start, last, &bo_s, &bo_l);
2664 
2665     if (r == -EADDRINUSE) {
2666         if (addr >= bo_s && addr <= bo_l)
2667             return NULL;
2668 
2669         /* Create a one-page svm range if the 2MB range overlaps an existing mapping */
2670         start = addr;
2671         last = addr;
2672     }
2673 
2674     prange = svm_range_new(&p->svms, start, last, true);
2675     if (!prange) {
2676         pr_debug("Failed to create prange in address [0x%llx]\n", addr);
2677         return NULL;
2678     }
2679     if (kfd_process_gpuid_from_adev(p, adev, &gpuid, &gpuidx)) {
2680         pr_debug("failed to get gpuid from kgd\n");
2681         svm_range_free(prange, true);
2682         return NULL;
2683     }
2684 
2685     if (is_heap_stack)
2686         prange->preferred_loc = KFD_IOCTL_SVM_LOCATION_SYSMEM;
2687 
2688     svm_range_add_to_svms(prange);
2689     svm_range_add_notifier_locked(mm, prange);
2690 
2691     return prange;
2692 }
2693 
2694 /* svm_range_skip_recover - decide if prange can be recovered
2695  * @prange: svm range structure
2696  *
2697  * The GPU vm retry fault handler skips recovering the range in these cases:
2698  * 1. prange is on the deferred list to be removed after unmap; this is a stale
2699  *    fault, and the deferred list work will drain it before freeing the prange.
2700  * 2. prange is on the deferred list to add an interval notifier after a split, or
2701  * 3. prange is a child range split from a parent prange; recover later,
2702  *    after the interval notifier is added.
2703  *
2704  * Return: true to skip recover, false to recover
2705  */
2706 static bool svm_range_skip_recover(struct svm_range *prange)
2707 {
2708     struct svm_range_list *svms = prange->svms;
2709 
2710     spin_lock(&svms->deferred_list_lock);
2711     if (list_empty(&prange->deferred_list) &&
2712         list_empty(&prange->child_list)) {
2713         spin_unlock(&svms->deferred_list_lock);
2714         return false;
2715     }
2716     spin_unlock(&svms->deferred_list_lock);
2717 
2718     if (prange->work_item.op == SVM_OP_UNMAP_RANGE) {
2719         pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx] unmapped\n",
2720              svms, prange, prange->start, prange->last);
2721         return true;
2722     }
2723     if (prange->work_item.op == SVM_OP_ADD_RANGE_AND_MAP ||
2724         prange->work_item.op == SVM_OP_ADD_RANGE) {
2725         pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx] not added yet\n",
2726              svms, prange, prange->start, prange->last);
2727         return true;
2728     }
2729     return false;
2730 }
2731 
2732 static void
2733 svm_range_count_fault(struct amdgpu_device *adev, struct kfd_process *p,
2734               int32_t gpuidx)
2735 {
2736     struct kfd_process_device *pdd;
2737 
2738     /* The fault is on a different page of the same range,
2739      * or the fault is skipped to be recovered later,
2740      * or the fault is on an invalid virtual address
2741      */
2742     if (gpuidx == MAX_GPU_INSTANCE) {
2743         uint32_t gpuid;
2744         int r;
2745 
2746         r = kfd_process_gpuid_from_adev(p, adev, &gpuid, &gpuidx);
2747         if (r < 0)
2748             return;
2749     }
2750 
2751     /* The fault is recovered,
2752      * or it cannot be recovered because the GPU has no access to the range
2753      */
2754     pdd = kfd_process_device_from_gpuidx(p, gpuidx);
2755     if (pdd)
2756         WRITE_ONCE(pdd->faults, pdd->faults + 1);
2757 }
2758 
2759 static bool
2760 svm_fault_allowed(struct vm_area_struct *vma, bool write_fault)
2761 {
2762     unsigned long requested = VM_READ;
2763 
2764     if (write_fault)
2765         requested |= VM_WRITE;
2766 
2767     pr_debug("requested 0x%lx, vma permission flags 0x%lx\n", requested,
2768         vma->vm_flags);
2769     return (vma->vm_flags & requested) == requested;
2770 }
2771 
2772 int
2773 svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
2774             uint64_t addr, bool write_fault)
2775 {
2776     struct mm_struct *mm = NULL;
2777     struct svm_range_list *svms;
2778     struct svm_range *prange;
2779     struct kfd_process *p;
2780     ktime_t timestamp = ktime_get_boottime();
2781     int32_t best_loc;
2782     int32_t gpuidx = MAX_GPU_INSTANCE;
2783     bool write_locked = false;
2784     struct vm_area_struct *vma;
2785     bool migration = false;
2786     int r = 0;
2787 
2788     if (!KFD_IS_SVM_API_SUPPORTED(adev->kfd.dev)) {
2789         pr_debug("device does not support SVM\n");
2790         return -EFAULT;
2791     }
2792 
2793     p = kfd_lookup_process_by_pasid(pasid);
2794     if (!p) {
2795         pr_debug("kfd process not found for pasid 0x%x\n", pasid);
2796         return 0;
2797     }
2798     svms = &p->svms;
2799 
2800     pr_debug("restoring svms 0x%p fault address 0x%llx\n", svms, addr);
2801 
2802     if (atomic_read(&svms->drain_pagefaults)) {
2803         pr_debug("draining retry fault, drop fault 0x%llx\n", addr);
2804         r = 0;
2805         goto out;
2806     }
2807 
2808     if (!p->xnack_enabled) {
2809         pr_debug("XNACK not enabled for pasid 0x%x\n", pasid);
2810         r = -EFAULT;
2811         goto out;
2812     }
2813 
2814     /* p->lead_thread is available because kfd_process_wq_release flushes the work
2815      * before releasing the task ref.
2816      */
2817     mm = get_task_mm(p->lead_thread);
2818     if (!mm) {
2819         pr_debug("svms 0x%p failed to get mm\n", svms);
2820         r = 0;
2821         goto out;
2822     }
2823 
2824     mmap_read_lock(mm);
2825 retry_write_locked:
2826     mutex_lock(&svms->lock);
2827     prange = svm_range_from_addr(svms, addr, NULL);
2828     if (!prange) {
2829         pr_debug("failed to find prange svms 0x%p address [0x%llx]\n",
2830              svms, addr);
2831         if (!write_locked) {
2832             /* Need the write lock to create a new range with the MMU notifier.
2833              * Also flush pending deferred work to make sure the interval
2834              * tree is up to date before we add a new range.
2835              */
2836             mutex_unlock(&svms->lock);
2837             mmap_read_unlock(mm);
2838             mmap_write_lock(mm);
2839             write_locked = true;
2840             goto retry_write_locked;
2841         }
2842         prange = svm_range_create_unregistered_range(adev, p, mm, addr);
2843         if (!prange) {
2844             pr_debug("failed to create unregistered range svms 0x%p address [0x%llx]\n",
2845                  svms, addr);
2846             mmap_write_downgrade(mm);
2847             r = -EFAULT;
2848             goto out_unlock_svms;
2849         }
2850     }
2851     if (write_locked)
2852         mmap_write_downgrade(mm);
2853 
2854     mutex_lock(&prange->migrate_mutex);
2855 
2856     if (svm_range_skip_recover(prange)) {
2857         amdgpu_gmc_filter_faults_remove(adev, addr, pasid);
2858         r = 0;
2859         goto out_unlock_range;
2860     }
2861 
2862     /* skip duplicate vm fault on different pages of same range */
2863     if (ktime_before(timestamp, ktime_add_ns(prange->validate_timestamp,
2864                 AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING))) {
2865         pr_debug("svms 0x%p [0x%lx %lx] already restored\n",
2866              svms, prange->start, prange->last);
2867         r = 0;
2868         goto out_unlock_range;
2869     }
2870 
2871     /* __do_munmap removed the VMA; return success as we are handling a stale
2872      * retry fault.
2873      */
2874     vma = find_vma(mm, addr << PAGE_SHIFT);
2875     if (!vma || (addr << PAGE_SHIFT) < vma->vm_start) {
2876         pr_debug("address 0x%llx VMA is removed\n", addr);
2877         r = 0;
2878         goto out_unlock_range;
2879     }
2880 
2881     if (!svm_fault_allowed(vma, write_fault)) {
2882         pr_debug("fault addr 0x%llx no %s permission\n", addr,
2883             write_fault ? "write" : "read");
2884         r = -EPERM;
2885         goto out_unlock_range;
2886     }
2887 
2888     best_loc = svm_range_best_restore_location(prange, adev, &gpuidx);
2889     if (best_loc == -1) {
2890         pr_debug("svms %p failed get best restore loc [0x%lx 0x%lx]\n",
2891              svms, prange->start, prange->last);
2892         r = -EACCES;
2893         goto out_unlock_range;
2894     }
2895 
2896     pr_debug("svms %p [0x%lx 0x%lx] best restore 0x%x, actual loc 0x%x\n",
2897          svms, prange->start, prange->last, best_loc,
2898          prange->actual_loc);
2899 
2900     kfd_smi_event_page_fault_start(adev->kfd.dev, p->lead_thread->pid, addr,
2901                        write_fault, timestamp);
2902 
2903     if (prange->actual_loc != best_loc) {
2904         migration = true;
2905         if (best_loc) {
2906             r = svm_migrate_to_vram(prange, best_loc, mm,
2907                     KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU);
2908             if (r) {
2909                 pr_debug("svm_migrate_to_vram failed (%d) at %llx, falling back to system memory\n",
2910                      r, addr);
2911                 /* Fallback to system memory if migration to
2912                  * VRAM failed
2913                  */
2914                 if (prange->actual_loc)
2915                     r = svm_migrate_vram_to_ram(prange, mm,
2916                        KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU);
2917                 else
2918                     r = 0;
2919             }
2920         } else {
2921             r = svm_migrate_vram_to_ram(prange, mm,
2922                     KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU);
2923         }
2924         if (r) {
2925             pr_debug("failed %d to migrate svms %p [0x%lx 0x%lx]\n",
2926                  r, svms, prange->start, prange->last);
2927             goto out_unlock_range;
2928         }
2929     }
2930 
2931     r = svm_range_validate_and_map(mm, prange, gpuidx, false, false, false);
2932     if (r)
2933         pr_debug("failed %d to map svms 0x%p [0x%lx 0x%lx] to gpus\n",
2934              r, svms, prange->start, prange->last);
2935 
2936     kfd_smi_event_page_fault_end(adev->kfd.dev, p->lead_thread->pid, addr,
2937                      migration);
2938 
2939 out_unlock_range:
2940     mutex_unlock(&prange->migrate_mutex);
2941 out_unlock_svms:
2942     mutex_unlock(&svms->lock);
2943     mmap_read_unlock(mm);
2944 
2945     svm_range_count_fault(adev, p, gpuidx);
2946 
2947     mmput(mm);
2948 out:
2949     kfd_unref_process(p);
2950 
2951     if (r == -EAGAIN) {
2952         pr_debug("recover vm fault later\n");
2953         amdgpu_gmc_filter_faults_remove(adev, addr, pasid);
2954         r = 0;
2955     }
2956     return r;
2957 }
2958 
2959 void svm_range_list_fini(struct kfd_process *p)
2960 {
2961     struct svm_range *prange;
2962     struct svm_range *next;
2963 
2964     pr_debug("pasid 0x%x svms 0x%p\n", p->pasid, &p->svms);
2965 
2966     cancel_delayed_work_sync(&p->svms.restore_work);
2967 
2968     /* Ensure list work is finished before process is destroyed */
2969     flush_work(&p->svms.deferred_list_work);
2970 
2971     /*
2972      * Ensure no retry fault comes in afterwards, as the page fault handler
2973      * would no longer find the kfd process or take the mm lock to recover the fault.
2974      */
2975     atomic_inc(&p->svms.drain_pagefaults);
2976     svm_range_drain_retry_fault(&p->svms);
2977 
2978     list_for_each_entry_safe(prange, next, &p->svms.list, list) {
2979         svm_range_unlink(prange);
2980         svm_range_remove_notifier(prange);
2981         svm_range_free(prange, true);
2982     }
2983 
2984     mutex_destroy(&p->svms.lock);
2985 
2986     pr_debug("pasid 0x%x svms 0x%p done\n", p->pasid, &p->svms);
2987 }
2988 
2989 int svm_range_list_init(struct kfd_process *p)
2990 {
2991     struct svm_range_list *svms = &p->svms;
2992     int i;
2993 
2994     svms->objects = RB_ROOT_CACHED;
2995     mutex_init(&svms->lock);
2996     INIT_LIST_HEAD(&svms->list);
2997     atomic_set(&svms->evicted_ranges, 0);
2998     atomic_set(&svms->drain_pagefaults, 0);
2999     INIT_DELAYED_WORK(&svms->restore_work, svm_range_restore_work);
3000     INIT_WORK(&svms->deferred_list_work, svm_range_deferred_list_work);
3001     INIT_LIST_HEAD(&svms->deferred_range_list);
3002     INIT_LIST_HEAD(&svms->criu_svm_metadata_list);
3003     spin_lock_init(&svms->deferred_list_lock);
3004 
3005     for (i = 0; i < p->n_pdds; i++)
3006         if (KFD_IS_SVM_API_SUPPORTED(p->pdds[i]->dev))
3007             bitmap_set(svms->bitmap_supported, i, 1);
3008 
3009     return 0;
3010 }
3011 
3012 /**
3013  * svm_range_check_vm - check if virtual address range mapped already
3014  * @p: current kfd_process
3015  * @start: range start address, in pages
3016  * @last: range last address, in pages
3017  * @bo_s: mapping start address in pages if address range already mapped
3018  * @bo_l: mapping last address in pages if address range already mapped
3019  *
3020  * The purpose is to avoid virtual address ranges already allocated by
3021  * kfd_ioctl_alloc_memory_of_gpu ioctl.
3022  * It checks the VM of each pdd in the kfd_process.
3023  *
3024  * Context: Process context
3025  *
3026  * Return 0 - OK, if the range is not mapped.
3027  * Otherwise error code:
3028  * -EADDRINUSE - if address is mapped already by kfd_ioctl_alloc_memory_of_gpu
3029  * -ERESTARTSYS - A wait for the buffer to become unreserved was interrupted by
3030  * a signal. Release all buffer reservations and return to user-space.
3031  */
3032 static int
3033 svm_range_check_vm(struct kfd_process *p, uint64_t start, uint64_t last,
3034            uint64_t *bo_s, uint64_t *bo_l)
3035 {
3036     struct amdgpu_bo_va_mapping *mapping;
3037     struct interval_tree_node *node;
3038     uint32_t i;
3039     int r;
3040 
3041     for (i = 0; i < p->n_pdds; i++) {
3042         struct amdgpu_vm *vm;
3043 
3044         if (!p->pdds[i]->drm_priv)
3045             continue;
3046 
3047         vm = drm_priv_to_vm(p->pdds[i]->drm_priv);
3048         r = amdgpu_bo_reserve(vm->root.bo, false);
3049         if (r)
3050             return r;
3051 
3052         node = interval_tree_iter_first(&vm->va, start, last);
3053         if (node) {
3054             pr_debug("range [0x%llx 0x%llx] already TTM mapped\n",
3055                  start, last);
3056             mapping = container_of((struct rb_node *)node,
3057                            struct amdgpu_bo_va_mapping, rb);
3058             if (bo_s && bo_l) {
3059                 *bo_s = mapping->start;
3060                 *bo_l = mapping->last;
3061             }
3062             amdgpu_bo_unreserve(vm->root.bo);
3063             return -EADDRINUSE;
3064         }
3065         amdgpu_bo_unreserve(vm->root.bo);
3066     }
3067 
3068     return 0;
3069 }
3070 
3071 /**
3072  * svm_range_is_valid - check if virtual address range is valid
3073  * @p: current kfd_process
3074  * @start: range start address, in pages
3075  * @size: range size, in pages
3076  *
3077  * Valid virtual address range means it belongs to one or more VMAs
3078  *
3079  * Context: Process context
3080  *
3081  * Return:
3082  *  0 - OK, otherwise error code
3083  */
3084 static int
3085 svm_range_is_valid(struct kfd_process *p, uint64_t start, uint64_t size)
3086 {
3087     const unsigned long device_vma = VM_IO | VM_PFNMAP | VM_MIXEDMAP;
3088     struct vm_area_struct *vma;
3089     unsigned long end;
3090     unsigned long start_unchg = start;
3091 
3092     start <<= PAGE_SHIFT;
3093     end = start + (size << PAGE_SHIFT);
3094     do {
3095         vma = find_vma(p->mm, start);
3096         if (!vma || start < vma->vm_start ||
3097             (vma->vm_flags & device_vma))
3098             return -EFAULT;
3099         start = min(end, vma->vm_end);
3100     } while (start < end);
3101 
3102     return svm_range_check_vm(p, start_unchg, (end - 1) >> PAGE_SHIFT, NULL,
3103                   NULL);
3104 }
3105 
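/*
 * Editor's sketch, not part of the driver: the unit convention the helpers
 * above rely on. Callers pass start and size in pages (svm_ioctl() at the
 * end of this file does the byte-to-page shift) and are expected to hold
 * the mmap lock of p->mm, as svm_range_set_attr()/svm_range_get_attr() do.
 * The function name and the byte-based addr/bytes parameters are hypothetical.
 */
static int example_validate_byte_range(struct kfd_process *p,
                       uint64_t addr, uint64_t bytes)
{
    /* page-granular start and size, as expected by svm_range_is_valid() */
    uint64_t start = addr >> PAGE_SHIFT;
    uint64_t size = bytes >> PAGE_SHIFT;

    return svm_range_is_valid(p, start, size);
}
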
3106 /**
3107  * svm_range_best_prefetch_location - decide the best prefetch location
3108  * @prange: svm range structure
3109  *
3110  * For xnack off:
3111  * If the range maps to a single GPU, the best prefetch location is prefetch_loc,
3112  * which can be CPU or GPU.
3113  *
3114  * If the range is ACCESS or ACCESS_IN_PLACE by multiple GPUs, the best prefetch
3115  * location is the prefetch_loc GPU only if all those GPUs share an XGMI hive with
3116  * it; otherwise the best prefetch location is always CPU, because a GPU cannot map
3117  * the VRAM of other GPUs coherently, even over a large-BAR PCIe connection.
3118  *
3119  * For xnack on:
3120  * If the range is not ACCESS_IN_PLACE by multiple GPUs, the best prefetch location
3121  * is prefetch_loc; access from other GPUs generates a vm fault and triggers migration.
3122  *
3123  * If the range is ACCESS_IN_PLACE by multiple GPUs, the best prefetch location is
3124  * the prefetch_loc GPU only if all those GPUs share an XGMI hive with it; otherwise
3125  * the best prefetch location is always CPU.
3126  *
3127  * Context: Process context
3128  *
3129  * Return:
3130  * 0 for CPU, or the GPU id of the best prefetch location
3131  */
3132 static uint32_t
3133 svm_range_best_prefetch_location(struct svm_range *prange)
3134 {
3135     DECLARE_BITMAP(bitmap, MAX_GPU_INSTANCE);
3136     uint32_t best_loc = prange->prefetch_loc;
3137     struct kfd_process_device *pdd;
3138     struct amdgpu_device *bo_adev;
3139     struct kfd_process *p;
3140     uint32_t gpuidx;
3141 
3142     p = container_of(prange->svms, struct kfd_process, svms);
3143 
3144     if (!best_loc || best_loc == KFD_IOCTL_SVM_LOCATION_UNDEFINED)
3145         goto out;
3146 
3147     bo_adev = svm_range_get_adev_by_id(prange, best_loc);
3148     if (!bo_adev) {
3149         WARN_ONCE(1, "failed to get device by id 0x%x\n", best_loc);
3150         best_loc = 0;
3151         goto out;
3152     }
3153 
3154     if (p->xnack_enabled)
3155         bitmap_copy(bitmap, prange->bitmap_aip, MAX_GPU_INSTANCE);
3156     else
3157         bitmap_or(bitmap, prange->bitmap_access, prange->bitmap_aip,
3158               MAX_GPU_INSTANCE);
3159 
3160     for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) {
3161         pdd = kfd_process_device_from_gpuidx(p, gpuidx);
3162         if (!pdd) {
3163             pr_debug("failed to get device by idx 0x%x\n", gpuidx);
3164             continue;
3165         }
3166 
3167         if (pdd->dev->adev == bo_adev)
3168             continue;
3169 
3170         if (!amdgpu_xgmi_same_hive(pdd->dev->adev, bo_adev)) {
3171             best_loc = 0;
3172             break;
3173         }
3174     }
3175 
3176 out:
3177     pr_debug("xnack %d svms 0x%p [0x%lx 0x%lx] best loc 0x%x\n",
3178          p->xnack_enabled, &p->svms, prange->start, prange->last,
3179          best_loc);
3180 
3181     return best_loc;
3182 }
3183 
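/*
 * Editor's illustration, not part of the driver: the bitmap idiom used by
 * svm_range_best_prefetch_location() above. With xnack on, only the
 * ACCESS_IN_PLACE GPUs constrain the prefetch location; with xnack off,
 * every accessing GPU does. The function name and the dummy bitmap
 * contents are hypothetical.
 */
static void example_walk_constraining_gpus(bool xnack_enabled)
{
    DECLARE_BITMAP(constraining, MAX_GPU_INSTANCE);
    DECLARE_BITMAP(access, MAX_GPU_INSTANCE);
    DECLARE_BITMAP(aip, MAX_GPU_INSTANCE);
    unsigned int gpuidx;

    bitmap_zero(access, MAX_GPU_INSTANCE);
    bitmap_zero(aip, MAX_GPU_INSTANCE);
    bitmap_set(access, 0, 1);    /* GPU index 0: ACCESS */
    bitmap_set(aip, 1, 1);       /* GPU index 1: ACCESS_IN_PLACE */

    if (xnack_enabled)
        bitmap_copy(constraining, aip, MAX_GPU_INSTANCE);
    else
        bitmap_or(constraining, access, aip, MAX_GPU_INSTANCE);

    for_each_set_bit(gpuidx, constraining, MAX_GPU_INSTANCE)
        pr_debug("gpuidx %u constrains the prefetch location\n", gpuidx);
}
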
3184 /* FIXME: This is a workaround for a page locking bug that occurs when some
3185  * pages are invalid during migration to VRAM.
3186  */
3187 void svm_range_prefault(struct svm_range *prange, struct mm_struct *mm,
3188             void *owner)
3189 {
3190     struct hmm_range *hmm_range;
3191     int r;
3192 
3193     if (prange->validated_once)
3194         return;
3195 
3196     r = amdgpu_hmm_range_get_pages(&prange->notifier, mm, NULL,
3197                        prange->start << PAGE_SHIFT,
3198                        prange->npages, &hmm_range,
3199                        false, true, owner);
3200     if (!r) {
3201         amdgpu_hmm_range_get_pages_done(hmm_range);
3202         prange->validated_once = true;
3203     }
3204 }
3205 
3206 /* svm_range_trigger_migration - start page migration if prefetch loc changed
3207  * @mm: current process mm_struct
3208  * @prange: svm range structure
3209  * @migrated: output, true if migration is triggered
3210  *
3211  * If the range prefetch_loc is a GPU and the actual loc is CPU (0), migrate the
3212  * range from ram to vram.
3213  * If the range prefetch_loc is CPU (0) and the actual loc is a GPU, migrate the
3214  * range from vram to ram.
3215  *
3216  * If GPU vm fault retry is not enabled, migration interacts with the MMU notifier
3217  * and restore work:
3218  * 1. migrate_vma_setup invalidates pages; the MMU notifier callback svm_range_evict
3219  *    stops all queues and schedules restore work
3220  * 2. svm_range_restore_work waits for the migration to finish via
3221  *    a. svm_range_validate_vram taking prange->migrate_mutex
3222  *    b. svm_range_validate_ram HMM get pages waiting for the CPU fault handler to return
3223  * 3. restore work updates GPU mappings and resumes all queues.
3224  *
3225  * Context: Process context
3226  *
3227  * Return:
3228  * 0 - OK, otherwise - error code of migration
3229  */
3230 static int
3231 svm_range_trigger_migration(struct mm_struct *mm, struct svm_range *prange,
3232                 bool *migrated)
3233 {
3234     uint32_t best_loc;
3235     int r = 0;
3236 
3237     *migrated = false;
3238     best_loc = svm_range_best_prefetch_location(prange);
3239 
3240     if (best_loc == KFD_IOCTL_SVM_LOCATION_UNDEFINED ||
3241         best_loc == prange->actual_loc)
3242         return 0;
3243 
3244     if (!best_loc) {
3245         r = svm_migrate_vram_to_ram(prange, mm, KFD_MIGRATE_TRIGGER_PREFETCH);
3246         *migrated = !r;
3247         return r;
3248     }
3249 
3250     r = svm_migrate_to_vram(prange, best_loc, mm, KFD_MIGRATE_TRIGGER_PREFETCH);
3251     *migrated = !r;
3252 
3253     return r;
3254 }
3255 
3256 int svm_range_schedule_evict_svm_bo(struct amdgpu_amdkfd_fence *fence)
3257 {
3258     if (!fence)
3259         return -EINVAL;
3260 
3261     if (dma_fence_is_signaled(&fence->base))
3262         return 0;
3263 
3264     if (fence->svm_bo) {
3265         WRITE_ONCE(fence->svm_bo->evicting, 1);
3266         schedule_work(&fence->svm_bo->eviction_work);
3267     }
3268 
3269     return 0;
3270 }
3271 
3272 static void svm_range_evict_svm_bo_worker(struct work_struct *work)
3273 {
3274     struct svm_range_bo *svm_bo;
3275     struct mm_struct *mm;
3276     int r = 0;
3277 
3278     svm_bo = container_of(work, struct svm_range_bo, eviction_work);
3279     if (!svm_bo_ref_unless_zero(svm_bo))
3280         return; /* svm_bo was freed while eviction was pending */
3281 
3282     if (mmget_not_zero(svm_bo->eviction_fence->mm)) {
3283         mm = svm_bo->eviction_fence->mm;
3284     } else {
3285         svm_range_bo_unref(svm_bo);
3286         return;
3287     }
3288 
3289     mmap_read_lock(mm);
3290     spin_lock(&svm_bo->list_lock);
3291     while (!list_empty(&svm_bo->range_list) && !r) {
3292         struct svm_range *prange =
3293                 list_first_entry(&svm_bo->range_list,
3294                         struct svm_range, svm_bo_list);
3295         int retries = 3;
3296 
3297         list_del_init(&prange->svm_bo_list);
3298         spin_unlock(&svm_bo->list_lock);
3299 
3300         pr_debug("svms 0x%p [0x%lx 0x%lx]\n", prange->svms,
3301              prange->start, prange->last);
3302 
3303         mutex_lock(&prange->migrate_mutex);
3304         do {
3305             r = svm_migrate_vram_to_ram(prange, mm,
3306                         KFD_MIGRATE_TRIGGER_TTM_EVICTION);
3307         } while (!r && prange->actual_loc && --retries);
3308 
3309         if (!r && prange->actual_loc)
3310             pr_info_once("Migration failed during eviction");
3311 
3312         if (!prange->actual_loc) {
3313             mutex_lock(&prange->lock);
3314             prange->svm_bo = NULL;
3315             mutex_unlock(&prange->lock);
3316         }
3317         mutex_unlock(&prange->migrate_mutex);
3318 
3319         spin_lock(&svm_bo->list_lock);
3320     }
3321     spin_unlock(&svm_bo->list_lock);
3322     mmap_read_unlock(mm);
3323     mmput(mm);
3324 
3325     dma_fence_signal(&svm_bo->eviction_fence->base);
3326 
3327     /* This is the last reference to svm_bo, after svm_range_vram_node_free
3328      * has been called in svm_migrate_vram_to_ram
3329      */
3330     WARN_ONCE(!r && kref_read(&svm_bo->kref) != 1, "This was not the last reference\n");
3331     svm_range_bo_unref(svm_bo);
3332 }
3333 
3334 static int
3335 svm_range_set_attr(struct kfd_process *p, struct mm_struct *mm,
3336            uint64_t start, uint64_t size, uint32_t nattr,
3337            struct kfd_ioctl_svm_attribute *attrs)
3338 {
3339     struct amdkfd_process_info *process_info = p->kgd_process_info;
3340     struct list_head update_list;
3341     struct list_head insert_list;
3342     struct list_head remove_list;
3343     struct svm_range_list *svms;
3344     struct svm_range *prange;
3345     struct svm_range *next;
3346     bool update_mapping = false;
3347     bool flush_tlb;
3348     int r = 0;
3349 
3350     pr_debug("pasid 0x%x svms 0x%p [0x%llx 0x%llx] pages 0x%llx\n",
3351          p->pasid, &p->svms, start, start + size - 1, size);
3352 
3353     r = svm_range_check_attr(p, nattr, attrs);
3354     if (r)
3355         return r;
3356 
3357     svms = &p->svms;
3358 
3359     mutex_lock(&process_info->lock);
3360 
3361     svm_range_list_lock_and_flush_work(svms, mm);
3362 
3363     r = svm_range_is_valid(p, start, size);
3364     if (r) {
3365         pr_debug("invalid range r=%d\n", r);
3366         mmap_write_unlock(mm);
3367         goto out;
3368     }
3369 
3370     mutex_lock(&svms->lock);
3371 
3372     /* Add new range and split existing ranges as needed */
3373     r = svm_range_add(p, start, size, nattr, attrs, &update_list,
3374               &insert_list, &remove_list);
3375     if (r) {
3376         mutex_unlock(&svms->lock);
3377         mmap_write_unlock(mm);
3378         goto out;
3379     }
3380     /* Apply changes as a transaction */
3381     list_for_each_entry_safe(prange, next, &insert_list, list) {
3382         svm_range_add_to_svms(prange);
3383         svm_range_add_notifier_locked(mm, prange);
3384     }
3385     list_for_each_entry(prange, &update_list, update_list) {
3386         svm_range_apply_attrs(p, prange, nattr, attrs, &update_mapping);
3387         /* TODO: unmap ranges from GPU that lost access */
3388     }
3389     list_for_each_entry_safe(prange, next, &remove_list, update_list) {
3390         pr_debug("unlink old 0x%p prange 0x%p [0x%lx 0x%lx]\n",
3391              prange->svms, prange, prange->start,
3392              prange->last);
3393         svm_range_unlink(prange);
3394         svm_range_remove_notifier(prange);
3395         svm_range_free(prange, false);
3396     }
3397 
3398     mmap_write_downgrade(mm);
3399     /* Trigger migrations and revalidate and map to GPUs as needed. If
3400      * this fails we may be left with partially completed actions. There
3401      * is no clean way of rolling back to the previous state in such a
3402      * case because the rollback wouldn't be guaranteed to work either.
3403      */
3404     list_for_each_entry(prange, &update_list, update_list) {
3405         bool migrated;
3406 
3407         mutex_lock(&prange->migrate_mutex);
3408 
3409         r = svm_range_trigger_migration(mm, prange, &migrated);
3410         if (r)
3411             goto out_unlock_range;
3412 
3413         if (migrated && (!p->xnack_enabled ||
3414             (prange->flags & KFD_IOCTL_SVM_FLAG_GPU_ALWAYS_MAPPED)) &&
3415             prange->mapped_to_gpu) {
3416             pr_debug("restore_work will update mappings of GPUs\n");
3417             mutex_unlock(&prange->migrate_mutex);
3418             continue;
3419         }
3420 
3421         if (!migrated && !update_mapping) {
3422             mutex_unlock(&prange->migrate_mutex);
3423             continue;
3424         }
3425 
3426         flush_tlb = !migrated && update_mapping && prange->mapped_to_gpu;
3427 
3428         r = svm_range_validate_and_map(mm, prange, MAX_GPU_INSTANCE,
3429                            true, true, flush_tlb);
3430         if (r)
3431             pr_debug("failed %d to map svm range\n", r);
3432 
3433 out_unlock_range:
3434         mutex_unlock(&prange->migrate_mutex);
3435         if (r)
3436             break;
3437     }
3438 
3439     svm_range_debug_dump(svms);
3440 
3441     mutex_unlock(&svms->lock);
3442     mmap_read_unlock(mm);
3443 out:
3444     mutex_unlock(&process_info->lock);
3445 
3446     pr_debug("pasid 0x%x svms 0x%p [0x%llx 0x%llx] done, r=%d\n", p->pasid,
3447          &p->svms, start, start + size - 1, r);
3448 
3449     return r;
3450 }
3451 
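/*
 * Editor's usage sketch, not part of the driver: the attribute array a
 * caller could pass to svm_range_set_attr() above (normally via the SVM
 * ioctl) to prefetch a page range to one GPU and grant it access. The
 * function name is hypothetical; gpu_id is a user GPU id and start/size
 * are page-based, as in svm_ioctl().
 */
static int example_prefetch_range_to_gpu(struct kfd_process *p,
                     struct mm_struct *mm,
                     uint64_t start, uint64_t size,
                     uint32_t gpu_id)
{
    struct kfd_ioctl_svm_attribute attrs[] = {
        { .type = KFD_IOCTL_SVM_ATTR_PREFETCH_LOC, .value = gpu_id },
        { .type = KFD_IOCTL_SVM_ATTR_ACCESS, .value = gpu_id },
    };

    return svm_range_set_attr(p, mm, start, size, ARRAY_SIZE(attrs), attrs);
}
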
3452 static int
3453 svm_range_get_attr(struct kfd_process *p, struct mm_struct *mm,
3454            uint64_t start, uint64_t size, uint32_t nattr,
3455            struct kfd_ioctl_svm_attribute *attrs)
3456 {
3457     DECLARE_BITMAP(bitmap_access, MAX_GPU_INSTANCE);
3458     DECLARE_BITMAP(bitmap_aip, MAX_GPU_INSTANCE);
3459     bool get_preferred_loc = false;
3460     bool get_prefetch_loc = false;
3461     bool get_granularity = false;
3462     bool get_accessible = false;
3463     bool get_flags = false;
3464     uint64_t last = start + size - 1UL;
3465     uint8_t granularity = 0xff;
3466     struct interval_tree_node *node;
3467     struct svm_range_list *svms;
3468     struct svm_range *prange;
3469     uint32_t prefetch_loc = KFD_IOCTL_SVM_LOCATION_UNDEFINED;
3470     uint32_t location = KFD_IOCTL_SVM_LOCATION_UNDEFINED;
3471     uint32_t flags_and = 0xffffffff;
3472     uint32_t flags_or = 0;
3473     int gpuidx;
3474     uint32_t i;
3475     int r = 0;
3476 
3477     pr_debug("svms 0x%p [0x%llx 0x%llx] nattr 0x%x\n", &p->svms, start,
3478          start + size - 1, nattr);
3479 
3480     /* Flush pending deferred work to avoid racing with deferred actions from
3481      * previous memory map changes (e.g. munmap). Concurrent memory map changes
3482      * can still race with get_attr because we don't hold the mmap lock. But that
3483      * would be a race condition in the application anyway, and undefined
3484      * behaviour is acceptable in that case.
3485      */
3486     flush_work(&p->svms.deferred_list_work);
3487 
3488     mmap_read_lock(mm);
3489     r = svm_range_is_valid(p, start, size);
3490     mmap_read_unlock(mm);
3491     if (r) {
3492         pr_debug("invalid range r=%d\n", r);
3493         return r;
3494     }
3495 
3496     for (i = 0; i < nattr; i++) {
3497         switch (attrs[i].type) {
3498         case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC:
3499             get_preferred_loc = true;
3500             break;
3501         case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC:
3502             get_prefetch_loc = true;
3503             break;
3504         case KFD_IOCTL_SVM_ATTR_ACCESS:
3505             get_accessible = true;
3506             break;
3507         case KFD_IOCTL_SVM_ATTR_SET_FLAGS:
3508         case KFD_IOCTL_SVM_ATTR_CLR_FLAGS:
3509             get_flags = true;
3510             break;
3511         case KFD_IOCTL_SVM_ATTR_GRANULARITY:
3512             get_granularity = true;
3513             break;
3514         case KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE:
3515         case KFD_IOCTL_SVM_ATTR_NO_ACCESS:
3516             fallthrough;
3517         default:
3518             pr_debug("get invalid attr type 0x%x\n", attrs[i].type);
3519             return -EINVAL;
3520         }
3521     }
3522 
3523     svms = &p->svms;
3524 
3525     mutex_lock(&svms->lock);
3526 
3527     node = interval_tree_iter_first(&svms->objects, start, last);
3528     if (!node) {
3529         pr_debug("range attrs not found return default values\n");
3530         svm_range_set_default_attributes(&location, &prefetch_loc,
3531                          &granularity, &flags_and);
3532         flags_or = flags_and;
3533         if (p->xnack_enabled)
3534             bitmap_copy(bitmap_access, svms->bitmap_supported,
3535                     MAX_GPU_INSTANCE);
3536         else
3537             bitmap_zero(bitmap_access, MAX_GPU_INSTANCE);
3538         bitmap_zero(bitmap_aip, MAX_GPU_INSTANCE);
3539         goto fill_values;
3540     }
3541     bitmap_copy(bitmap_access, svms->bitmap_supported, MAX_GPU_INSTANCE);
3542     bitmap_copy(bitmap_aip, svms->bitmap_supported, MAX_GPU_INSTANCE);
3543 
3544     while (node) {
3545         struct interval_tree_node *next;
3546 
3547         prange = container_of(node, struct svm_range, it_node);
3548         next = interval_tree_iter_next(node, start, last);
3549 
3550         if (get_preferred_loc) {
3551             if (prange->preferred_loc ==
3552                     KFD_IOCTL_SVM_LOCATION_UNDEFINED ||
3553                 (location != KFD_IOCTL_SVM_LOCATION_UNDEFINED &&
3554                  location != prange->preferred_loc)) {
3555                 location = KFD_IOCTL_SVM_LOCATION_UNDEFINED;
3556                 get_preferred_loc = false;
3557             } else {
3558                 location = prange->preferred_loc;
3559             }
3560         }
3561         if (get_prefetch_loc) {
3562             if (prange->prefetch_loc ==
3563                     KFD_IOCTL_SVM_LOCATION_UNDEFINED ||
3564                 (prefetch_loc != KFD_IOCTL_SVM_LOCATION_UNDEFINED &&
3565                  prefetch_loc != prange->prefetch_loc)) {
3566                 prefetch_loc = KFD_IOCTL_SVM_LOCATION_UNDEFINED;
3567                 get_prefetch_loc = false;
3568             } else {
3569                 prefetch_loc = prange->prefetch_loc;
3570             }
3571         }
3572         if (get_accessible) {
3573             bitmap_and(bitmap_access, bitmap_access,
3574                    prange->bitmap_access, MAX_GPU_INSTANCE);
3575             bitmap_and(bitmap_aip, bitmap_aip,
3576                    prange->bitmap_aip, MAX_GPU_INSTANCE);
3577         }
3578         if (get_flags) {
3579             flags_and &= prange->flags;
3580             flags_or |= prange->flags;
3581         }
3582 
3583         if (get_granularity && prange->granularity < granularity)
3584             granularity = prange->granularity;
3585 
3586         node = next;
3587     }
3588 fill_values:
3589     mutex_unlock(&svms->lock);
3590 
3591     for (i = 0; i < nattr; i++) {
3592         switch (attrs[i].type) {
3593         case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC:
3594             attrs[i].value = location;
3595             break;
3596         case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC:
3597             attrs[i].value = prefetch_loc;
3598             break;
3599         case KFD_IOCTL_SVM_ATTR_ACCESS:
3600             gpuidx = kfd_process_gpuidx_from_gpuid(p,
3601                                    attrs[i].value);
3602             if (gpuidx < 0) {
3603                 pr_debug("invalid gpuid %x\n", attrs[i].value);
3604                 return -EINVAL;
3605             }
3606             if (test_bit(gpuidx, bitmap_access))
3607                 attrs[i].type = KFD_IOCTL_SVM_ATTR_ACCESS;
3608             else if (test_bit(gpuidx, bitmap_aip))
3609                 attrs[i].type =
3610                     KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE;
3611             else
3612                 attrs[i].type = KFD_IOCTL_SVM_ATTR_NO_ACCESS;
3613             break;
3614         case KFD_IOCTL_SVM_ATTR_SET_FLAGS:
3615             attrs[i].value = flags_and;
3616             break;
3617         case KFD_IOCTL_SVM_ATTR_CLR_FLAGS:
3618             attrs[i].value = ~flags_or;
3619             break;
3620         case KFD_IOCTL_SVM_ATTR_GRANULARITY:
3621             attrs[i].value = (uint32_t)granularity;
3622             break;
3623         }
3624     }
3625 
3626     return 0;
3627 }
3628 
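/*
 * Editor's usage sketch, not part of the driver: querying which flags are
 * set on every range in an interval and which are clear on every range,
 * using the intersection/union logic of svm_range_get_attr() above. The
 * function name is hypothetical; start/size are page-based.
 */
static int example_query_common_flags(struct kfd_process *p,
                      struct mm_struct *mm,
                      uint64_t start, uint64_t size,
                      uint32_t *set_flags, uint32_t *clr_flags)
{
    struct kfd_ioctl_svm_attribute attrs[] = {
        { .type = KFD_IOCTL_SVM_ATTR_SET_FLAGS },
        { .type = KFD_IOCTL_SVM_ATTR_CLR_FLAGS },
    };
    int r;

    r = svm_range_get_attr(p, mm, start, size, ARRAY_SIZE(attrs), attrs);
    if (r)
        return r;

    *set_flags = attrs[0].value;    /* AND of flags over the interval */
    *clr_flags = attrs[1].value;    /* complement of the OR of flags */
    return 0;
}
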
3629 int kfd_criu_resume_svm(struct kfd_process *p)
3630 {
3631     struct kfd_ioctl_svm_attribute *set_attr_new, *set_attr = NULL;
3632     int nattr_common = 4, nattr_accessibility = 1;
3633     struct criu_svm_metadata *criu_svm_md = NULL;
3634     struct svm_range_list *svms = &p->svms;
3635     struct criu_svm_metadata *next = NULL;
3636     uint32_t set_flags = 0xffffffff;
3637     int i, j, num_attrs, ret = 0;
3638     uint64_t set_attr_size;
3639     struct mm_struct *mm;
3640 
3641     if (list_empty(&svms->criu_svm_metadata_list)) {
3642         pr_debug("No SVM data from CRIU restore stage 2\n");
3643         return ret;
3644     }
3645 
3646     mm = get_task_mm(p->lead_thread);
3647     if (!mm) {
3648         pr_err("failed to get mm for the target process\n");
3649         return -ESRCH;
3650     }
3651 
3652     num_attrs = nattr_common + (nattr_accessibility * p->n_pdds);
3653 
3654     i = j = 0;
3655     list_for_each_entry(criu_svm_md, &svms->criu_svm_metadata_list, list) {
3656         pr_debug("criu_svm_md[%d]\n\tstart: 0x%llx size: 0x%llx (npages)\n",
3657              i, criu_svm_md->data.start_addr, criu_svm_md->data.size);
3658 
3659         for (j = 0; j < num_attrs; j++) {
3660             pr_debug("\ncriu_svm_md[%d]->attrs[%d].type : 0x%x\ncriu_svm_md[%d]->attrs[%d].value : 0x%x\n",
3661                  i, j, criu_svm_md->data.attrs[j].type,
3662                  i, j, criu_svm_md->data.attrs[j].value);
3663             switch (criu_svm_md->data.attrs[j].type) {
3664             /* During a checkpoint operation, the query for the
3665              * KFD_IOCTL_SVM_ATTR_PREFETCH_LOC attribute may return
3666              * KFD_IOCTL_SVM_LOCATION_UNDEFINED if the attribute was
3667              * not used by the checkpointed range. Take care not to
3668              * restore such an invalid value: the gpuidx derived from
3669              * it would be invalid and set_attr would eventually
3670              * fail. Replace those entries with a harmless dummy
3671              * attribute such as
3672              * KFD_IOCTL_SVM_ATTR_SET_FLAGS.
3673              */
3674             case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC:
3675                 if (criu_svm_md->data.attrs[j].value ==
3676                     KFD_IOCTL_SVM_LOCATION_UNDEFINED) {
3677                     criu_svm_md->data.attrs[j].type =
3678                         KFD_IOCTL_SVM_ATTR_SET_FLAGS;
3679                     criu_svm_md->data.attrs[j].value = 0;
3680                 }
3681                 break;
3682             case KFD_IOCTL_SVM_ATTR_SET_FLAGS:
3683                 set_flags = criu_svm_md->data.attrs[j].value;
3684                 break;
3685             default:
3686                 break;
3687             }
3688         }
3689 
3690         /* CLR_FLAGS is not available via get_attr during checkpoint, but it
3691          * needs to be inserted before restoring the ranges, so allocate extra
3692          * space for it (the krealloc idiom here is sketched after this function)
3693          */
3694         set_attr_size = sizeof(struct kfd_ioctl_svm_attribute) *
3695                         (num_attrs + 1);
3696         set_attr_new = krealloc(set_attr, set_attr_size,
3697                         GFP_KERNEL);
3698         if (!set_attr_new) {
3699             ret = -ENOMEM;
3700             goto exit;
3701         }
3702         set_attr = set_attr_new;
3703 
3704         memcpy(set_attr, criu_svm_md->data.attrs, num_attrs *
3705                     sizeof(struct kfd_ioctl_svm_attribute));
3706         set_attr[num_attrs].type = KFD_IOCTL_SVM_ATTR_CLR_FLAGS;
3707         set_attr[num_attrs].value = ~set_flags;
3708 
3709         ret = svm_range_set_attr(p, mm, criu_svm_md->data.start_addr,
3710                      criu_svm_md->data.size, num_attrs + 1,
3711                      set_attr);
3712         if (ret) {
3713             pr_err("CRIU: failed to set range attributes\n");
3714             goto exit;
3715         }
3716 
3717         i++;
3718     }
3719 exit:
3720     kfree(set_attr);
3721     list_for_each_entry_safe(criu_svm_md, next, &svms->criu_svm_metadata_list, list) {
3722         pr_debug("freeing criu_svm_md[]\n\tstart: 0x%llx\n",
3723                         criu_svm_md->data.start_addr);
3724         kfree(criu_svm_md);
3725     }
3726 
3727     mmput(mm);
3728     return ret;
3729 
3730 }
3731 
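/*
 * Editor's sketch, not part of the driver: the krealloc idiom used in
 * kfd_criu_resume_svm() above. Assign the result to a temporary first so
 * the original buffer is not leaked if the allocation fails. The function
 * name and the attrs/n parameters are hypothetical.
 */
static struct kfd_ioctl_svm_attribute *
example_grow_attr_array(struct kfd_ioctl_svm_attribute *attrs, int n)
{
    struct kfd_ioctl_svm_attribute *tmp;

    tmp = krealloc(attrs, (n + 1) * sizeof(*attrs), GFP_KERNEL);
    if (!tmp)
        return NULL;    /* caller still owns attrs and must free it */

    return tmp;
}
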
3732 int kfd_criu_restore_svm(struct kfd_process *p,
3733              uint8_t __user *user_priv_ptr,
3734              uint64_t *priv_data_offset,
3735              uint64_t max_priv_data_size)
3736 {
3737     uint64_t svm_priv_data_size, svm_object_md_size, svm_attrs_size;
3738     int nattr_common = 4, nattr_accessibility = 1;
3739     struct criu_svm_metadata *criu_svm_md = NULL;
3740     struct svm_range_list *svms = &p->svms;
3741     uint32_t num_devices;
3742     int ret = 0;
3743 
3744     num_devices = p->n_pdds;
3745     /* Handle one SVM range object at a time. The number of GPUs is
3746      * assumed to be the same on the restore node; this must be checked
3747      * earlier, while evaluating the topology.
3748      */
3749 
3750     svm_attrs_size = sizeof(struct kfd_ioctl_svm_attribute) *
3751         (nattr_common + nattr_accessibility * num_devices);
3752     svm_object_md_size = sizeof(struct criu_svm_metadata) + svm_attrs_size;
3753 
3754     svm_priv_data_size = sizeof(struct kfd_criu_svm_range_priv_data) +
3755                                 svm_attrs_size;
3756 
3757     criu_svm_md = kzalloc(svm_object_md_size, GFP_KERNEL);
3758     if (!criu_svm_md) {
3759         pr_err("failed to allocate memory to store svm metadata\n");
3760         return -ENOMEM;
3761     }
3762     if (*priv_data_offset + svm_priv_data_size > max_priv_data_size) {
3763         ret = -EINVAL;
3764         goto exit;
3765     }
3766 
3767     ret = copy_from_user(&criu_svm_md->data, user_priv_ptr + *priv_data_offset,
3768                  svm_priv_data_size);
3769     if (ret) {
3770         ret = -EFAULT;
3771         goto exit;
3772     }
3773     *priv_data_offset += svm_priv_data_size;
3774 
3775     list_add_tail(&criu_svm_md->list, &svms->criu_svm_metadata_list);
3776 
3777     return 0;
3778 
3779 
3780 exit:
3781     kfree(criu_svm_md);
3782     return ret;
3783 }
3784 
3785 int svm_range_get_info(struct kfd_process *p, uint32_t *num_svm_ranges,
3786                uint64_t *svm_priv_data_size)
3787 {
3788     uint64_t total_size, accessibility_size, common_attr_size;
3789     int nattr_common = 4, nattr_accessibility = 1;
3790     int num_devices = p->n_pdds;
3791     struct svm_range_list *svms;
3792     struct svm_range *prange;
3793     uint32_t count = 0;
3794 
3795     *svm_priv_data_size = 0;
3796 
3797     svms = &p->svms;
3798     if (!svms)
3799         return -EINVAL;
3800 
3801     mutex_lock(&svms->lock);
3802     list_for_each_entry(prange, &svms->list, list) {
3803         pr_debug("prange: 0x%p start: 0x%lx\t npages: 0x%llx\t end: 0x%llx\n",
3804              prange, prange->start, prange->npages,
3805              prange->start + prange->npages - 1);
3806         count++;
3807     }
3808     mutex_unlock(&svms->lock);
3809 
3810     *num_svm_ranges = count;
3811     /* Only the accessibility attributes need to be queried for each GPU
3812      * individually; the remaining attributes apply to the whole process
3813      * regardless of GPU node. Of those, KFD_IOCTL_SVM_ATTR_CLR_FLAGS need
3814      * not be saved. (A worked size sketch follows this function.)
3815      *
3816      * KFD_IOCTL_SVM_ATTR_PREFERRED_LOC
3817      * KFD_IOCTL_SVM_ATTR_PREFETCH_LOC
3818      * KFD_IOCTL_SVM_ATTR_SET_FLAGS
3819      * KFD_IOCTL_SVM_ATTR_GRANULARITY
3820      *
3821      * ** ACCESSIBILITY ATTRIBUTES **
3822      * (Considered as one; the type is altered during the query, the value is the gpuid)
3823      * KFD_IOCTL_SVM_ATTR_ACCESS
3824      * KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE
3825      * KFD_IOCTL_SVM_ATTR_NO_ACCESS
3826      */
3827     if (*num_svm_ranges > 0) {
3828         common_attr_size = sizeof(struct kfd_ioctl_svm_attribute) *
3829             nattr_common;
3830         accessibility_size = sizeof(struct kfd_ioctl_svm_attribute) *
3831             nattr_accessibility * num_devices;
3832 
3833         total_size = sizeof(struct kfd_criu_svm_range_priv_data) +
3834             common_attr_size + accessibility_size;
3835 
3836         *svm_priv_data_size = *num_svm_ranges * total_size;
3837     }
3838 
3839     pr_debug("num_svm_ranges %u total_priv_size %llu\n", *num_svm_ranges,
3840          *svm_priv_data_size);
3841     return 0;
3842 }
3843 
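/*
 * Editor's worked example, not part of the driver: the size accounting of
 * svm_range_get_info() above. With 4 common attributes plus one
 * accessibility attribute per GPU, a process with two GPUs needs room for
 * 6 attributes per checkpointed range, on top of the fixed private-data
 * header. The function name is hypothetical.
 */
static uint64_t example_priv_size_per_range(int num_devices)
{
    int nattrs = 4 /* common */ + 1 * num_devices /* accessibility */;

    return sizeof(struct kfd_criu_svm_range_priv_data) +
           nattrs * sizeof(struct kfd_ioctl_svm_attribute);
}
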
3844 int kfd_criu_checkpoint_svm(struct kfd_process *p,
3845                 uint8_t __user *user_priv_data,
3846                 uint64_t *priv_data_offset)
3847 {
3848     struct kfd_criu_svm_range_priv_data *svm_priv = NULL;
3849     struct kfd_ioctl_svm_attribute *query_attr = NULL;
3850     uint64_t svm_priv_data_size, query_attr_size = 0;
3851     int index, nattr_common = 4, ret = 0;
3852     struct svm_range_list *svms;
3853     int num_devices = p->n_pdds;
3854     struct svm_range *prange;
3855     struct mm_struct *mm;
3856 
3857     svms = &p->svms;
3858     if (!svms)
3859         return -EINVAL;
3860 
3861     mm = get_task_mm(p->lead_thread);
3862     if (!mm) {
3863         pr_err("failed to get mm for the target process\n");
3864         return -ESRCH;
3865     }
3866 
3867     query_attr_size = sizeof(struct kfd_ioctl_svm_attribute) *
3868                 (nattr_common + num_devices);
3869 
3870     query_attr = kzalloc(query_attr_size, GFP_KERNEL);
3871     if (!query_attr) {
3872         ret = -ENOMEM;
3873         goto exit;
3874     }
3875 
3876     query_attr[0].type = KFD_IOCTL_SVM_ATTR_PREFERRED_LOC;
3877     query_attr[1].type = KFD_IOCTL_SVM_ATTR_PREFETCH_LOC;
3878     query_attr[2].type = KFD_IOCTL_SVM_ATTR_SET_FLAGS;
3879     query_attr[3].type = KFD_IOCTL_SVM_ATTR_GRANULARITY;
3880 
3881     for (index = 0; index < num_devices; index++) {
3882         struct kfd_process_device *pdd = p->pdds[index];
3883 
3884         query_attr[index + nattr_common].type =
3885             KFD_IOCTL_SVM_ATTR_ACCESS;
3886         query_attr[index + nattr_common].value = pdd->user_gpu_id;
3887     }
3888 
3889     svm_priv_data_size = sizeof(*svm_priv) + query_attr_size;
3890 
3891     svm_priv = kzalloc(svm_priv_data_size, GFP_KERNEL);
3892     if (!svm_priv) {
3893         ret = -ENOMEM;
3894         goto exit_query;
3895     }
3896 
3897     index = 0;
3898     list_for_each_entry(prange, &svms->list, list) {
3899 
3900         svm_priv->object_type = KFD_CRIU_OBJECT_TYPE_SVM_RANGE;
3901         svm_priv->start_addr = prange->start;
3902         svm_priv->size = prange->npages;
3903         memcpy(&svm_priv->attrs, query_attr, query_attr_size);
3904         pr_debug("CRIU: prange: 0x%p start: 0x%lx\t npages: 0x%llx end: 0x%llx\t size: 0x%llx\n",
3905              prange, prange->start, prange->npages,
3906              prange->start + prange->npages - 1,
3907              prange->npages * PAGE_SIZE);
3908 
3909         ret = svm_range_get_attr(p, mm, svm_priv->start_addr,
3910                      svm_priv->size,
3911                      (nattr_common + num_devices),
3912                      svm_priv->attrs);
3913         if (ret) {
3914             pr_err("CRIU: failed to obtain range attributes\n");
3915             goto exit_priv;
3916         }
3917 
3918         if (copy_to_user(user_priv_data + *priv_data_offset, svm_priv,
3919                  svm_priv_data_size)) {
3920             pr_err("Failed to copy svm priv to user\n");
3921             ret = -EFAULT;
3922             goto exit_priv;
3923         }
3924 
3925         *priv_data_offset += svm_priv_data_size;
3926 
3927     }
3928 
3929 
3930 exit_priv:
3931     kfree(svm_priv);
3932 exit_query:
3933     kfree(query_attr);
3934 exit:
3935     mmput(mm);
3936     return ret;
3937 }
3938 
3939 int
3940 svm_ioctl(struct kfd_process *p, enum kfd_ioctl_svm_op op, uint64_t start,
3941       uint64_t size, uint32_t nattrs, struct kfd_ioctl_svm_attribute *attrs)
3942 {
3943     struct mm_struct *mm = current->mm;
3944     int r;
3945 
3946     start >>= PAGE_SHIFT;
3947     size >>= PAGE_SHIFT;
3948 
3949     switch (op) {
3950     case KFD_IOCTL_SVM_OP_SET_ATTR:
3951         r = svm_range_set_attr(p, mm, start, size, nattrs, attrs);
3952         break;
3953     case KFD_IOCTL_SVM_OP_GET_ATTR:
3954         r = svm_range_get_attr(p, mm, start, size, nattrs, attrs);
3955         break;
3956     default:
3957         r = -EINVAL;
3958         break;
3959     }
3960 
3961     return r;
3962 }