0001 // SPDX-License-Identifier: GPL-2.0
0002 /*
0003  * This is a module to test the HMM (Heterogeneous Memory Management)
0004  * mirror and zone device private memory migration APIs of the kernel.
0005  * Userspace programs can register with the driver to mirror their own address
0006  * space and can use the device to read/write any valid virtual address.
0007  */
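/*
 * Illustrative sketch (not part of the driver) of how a userspace test
 * might drive this module. The device node name and the ioctl request
 * macros come from the selftest setup and test_hmm_uapi.h and are
 * assumptions here; the hmm_dmirror_cmd fields match their use below.
 *
 *	struct hmm_dmirror_cmd cmd = { 0 };
 *	int fd = open("/dev/hmm_dmirror0", O_RDWR);  // registers a mirror of this mm
 *	char *buf = malloc(len);
 *
 *	cmd.addr = (uintptr_t)some_mapped_address;   // must be page aligned
 *	cmd.ptr = (uintptr_t)buf;                    // buffer for the result
 *	cmd.npages = len / page_size;
 *	ioctl(fd, HMM_DMIRROR_READ, &cmd);           // fault + copy through the mirror
 *	// cmd.cpages and cmd.faults report what the driver did
 */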
0008 #include <linux/init.h>
0009 #include <linux/fs.h>
0010 #include <linux/mm.h>
0011 #include <linux/module.h>
0012 #include <linux/kernel.h>
0013 #include <linux/cdev.h>
0014 #include <linux/device.h>
0015 #include <linux/memremap.h>
0016 #include <linux/mutex.h>
0017 #include <linux/rwsem.h>
0018 #include <linux/sched.h>
0019 #include <linux/slab.h>
0020 #include <linux/highmem.h>
0021 #include <linux/delay.h>
0022 #include <linux/pagemap.h>
0023 #include <linux/hmm.h>
0024 #include <linux/vmalloc.h>
0025 #include <linux/swap.h>
0026 #include <linux/swapops.h>
0027 #include <linux/sched/mm.h>
0028 #include <linux/platform_device.h>
0029 #include <linux/rmap.h>
0030 #include <linux/mmu_notifier.h>
0031 #include <linux/migrate.h>
0032 
0033 #include "test_hmm_uapi.h"
0034 
0035 #define DMIRROR_NDEVICES        4
0036 #define DMIRROR_RANGE_FAULT_TIMEOUT 1000
0037 #define DEVMEM_CHUNK_SIZE       (256 * 1024 * 1024U)
0038 #define DEVMEM_CHUNKS_RESERVE       16
0039 
0040 /*
0041  * For device_private pages, dpage is just a dummy struct page
0042  * representing a piece of device memory. dmirror_devmem_alloc_page
0043  * allocates a real system memory page as backing storage to fake a
0044  * real device. zone_device_data points to that backing page. But
0045  * for device_coherent memory, the struct page represents real
0046  * physical CPU-accessible memory that we can use directly.
0047  */
0048 #define BACKING_PAGE(page) (is_device_private_page((page)) ? \
0049                (page)->zone_device_data : (page))
0050 
0051 static unsigned long spm_addr_dev0;
0052 module_param(spm_addr_dev0, long, 0644);
0053 MODULE_PARM_DESC(spm_addr_dev0,
0054         "Specify start address for SPM (special purpose memory) used for device 0. By setting this, the Coherent device type will be used. Make sure spm_addr_dev1 is set too. Minimum SPM size should be DEVMEM_CHUNK_SIZE.");
0055 
0056 static unsigned long spm_addr_dev1;
0057 module_param(spm_addr_dev1, long, 0644);
0058 MODULE_PARM_DESC(spm_addr_dev1,
0059         "Specify start address for SPM (special purpose memory) used for device 1. By setting this, the Coherent device type will be used. Make sure spm_addr_dev0 is set too. Minimum SPM size should be DEVMEM_CHUNK_SIZE.");
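/*
 * Both parameters must be supplied for the coherent devices to be created
 * (see hmm_dmirror_init()), e.g. at load time (the module name is assumed
 * to be test_hmm and the addresses below are examples only):
 *
 *	modprobe test_hmm spm_addr_dev0=0x40000000 spm_addr_dev1=0x50000000
 */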
0060 
0061 static const struct dev_pagemap_ops dmirror_devmem_ops;
0062 static const struct mmu_interval_notifier_ops dmirror_min_ops;
0063 static dev_t dmirror_dev;
0064 
0065 struct dmirror_device;
0066 
0067 struct dmirror_bounce {
0068     void            *ptr;
0069     unsigned long       size;
0070     unsigned long       addr;
0071     unsigned long       cpages;
0072 };
0073 
0074 #define DPT_XA_TAG_ATOMIC 1UL
0075 #define DPT_XA_TAG_WRITE 3UL
0076 
0077 /*
0078  * Data structure to track address ranges and register for mmu interval
0079  * notifier updates.
0080  */
0081 struct dmirror_interval {
0082     struct mmu_interval_notifier    notifier;
0083     struct dmirror          *dmirror;
0084 };
0085 
0086 /*
0087  * Data attached to the open device file.
0088  * Note that it might be shared after a fork().
0089  */
0090 struct dmirror {
0091     struct dmirror_device       *mdevice;
0092     struct xarray           pt;
0093     struct mmu_interval_notifier    notifier;
0094     struct mutex            mutex;
0095 };
0096 
0097 /*
0098  * ZONE_DEVICE pages for migration and simulating device memory.
0099  */
0100 struct dmirror_chunk {
0101     struct dev_pagemap  pagemap;
0102     struct dmirror_device   *mdevice;
0103 };
0104 
0105 /*
0106  * Per device data.
0107  */
0108 struct dmirror_device {
0109     struct cdev     cdevice;
0110     struct hmm_devmem   *devmem;
0111     unsigned int            zone_device_type;
0112 
0113     unsigned int        devmem_capacity;
0114     unsigned int        devmem_count;
0115     struct dmirror_chunk    **devmem_chunks;
0116     struct mutex        devmem_lock;    /* protects the above */
0117 
0118     unsigned long       calloc;
0119     unsigned long       cfree;
0120     struct page     *free_pages;
0121     spinlock_t      lock;       /* protects the above */
0122 };
0123 
0124 static struct dmirror_device dmirror_devices[DMIRROR_NDEVICES];
0125 
0126 static int dmirror_bounce_init(struct dmirror_bounce *bounce,
0127                    unsigned long addr,
0128                    unsigned long size)
0129 {
0130     bounce->addr = addr;
0131     bounce->size = size;
0132     bounce->cpages = 0;
0133     bounce->ptr = vmalloc(size);
0134     if (!bounce->ptr)
0135         return -ENOMEM;
0136     return 0;
0137 }
0138 
0139 static bool dmirror_is_private_zone(struct dmirror_device *mdevice)
0140 {
0141     return (mdevice->zone_device_type ==
0142         HMM_DMIRROR_MEMORY_DEVICE_PRIVATE) ? true : false;
0143 }
0144 
0145 static enum migrate_vma_direction
0146 dmirror_select_device(struct dmirror *dmirror)
0147 {
0148     return (dmirror->mdevice->zone_device_type ==
0149         HMM_DMIRROR_MEMORY_DEVICE_PRIVATE) ?
0150         MIGRATE_VMA_SELECT_DEVICE_PRIVATE :
0151         MIGRATE_VMA_SELECT_DEVICE_COHERENT;
0152 }
0153 
0154 static void dmirror_bounce_fini(struct dmirror_bounce *bounce)
0155 {
0156     vfree(bounce->ptr);
0157 }
0158 
0159 static int dmirror_fops_open(struct inode *inode, struct file *filp)
0160 {
0161     struct cdev *cdev = inode->i_cdev;
0162     struct dmirror *dmirror;
0163     int ret;
0164 
0165     /* Mirror this process address space */
0166     dmirror = kzalloc(sizeof(*dmirror), GFP_KERNEL);
0167     if (dmirror == NULL)
0168         return -ENOMEM;
0169 
0170     dmirror->mdevice = container_of(cdev, struct dmirror_device, cdevice);
0171     mutex_init(&dmirror->mutex);
0172     xa_init(&dmirror->pt);
0173 
0174     ret = mmu_interval_notifier_insert(&dmirror->notifier, current->mm,
0175                 0, ULONG_MAX & PAGE_MASK, &dmirror_min_ops);
0176     if (ret) {
0177         kfree(dmirror);
0178         return ret;
0179     }
0180 
0181     filp->private_data = dmirror;
0182     return 0;
0183 }
0184 
0185 static int dmirror_fops_release(struct inode *inode, struct file *filp)
0186 {
0187     struct dmirror *dmirror = filp->private_data;
0188 
0189     mmu_interval_notifier_remove(&dmirror->notifier);
0190     xa_destroy(&dmirror->pt);
0191     kfree(dmirror);
0192     return 0;
0193 }
0194 
0195 static struct dmirror_device *dmirror_page_to_device(struct page *page)
0196 
0197 {
0198     return container_of(page->pgmap, struct dmirror_chunk,
0199                 pagemap)->mdevice;
0200 }
0201 
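/*
 * Record the results of a successful hmm_range_fault() in the mirror's
 * page table: store the page pointer for each PFN in the XArray, tagged
 * with DPT_XA_TAG_WRITE when the CPU mapping is writable. Called with
 * dmirror->mutex held and the notifier sequence already validated.
 */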
0202 static int dmirror_do_fault(struct dmirror *dmirror, struct hmm_range *range)
0203 {
0204     unsigned long *pfns = range->hmm_pfns;
0205     unsigned long pfn;
0206 
0207     for (pfn = (range->start >> PAGE_SHIFT);
0208          pfn < (range->end >> PAGE_SHIFT);
0209          pfn++, pfns++) {
0210         struct page *page;
0211         void *entry;
0212 
0213         /*
0214          * Since we asked for hmm_range_fault() to populate pages,
0215          * it shouldn't return an error entry on success.
0216          */
0217         WARN_ON(*pfns & HMM_PFN_ERROR);
0218         WARN_ON(!(*pfns & HMM_PFN_VALID));
0219 
0220         page = hmm_pfn_to_page(*pfns);
0221         WARN_ON(!page);
0222 
0223         entry = page;
0224         if (*pfns & HMM_PFN_WRITE)
0225             entry = xa_tag_pointer(entry, DPT_XA_TAG_WRITE);
0226         else if (WARN_ON(range->default_flags & HMM_PFN_WRITE))
0227             return -EFAULT;
0228         entry = xa_store(&dmirror->pt, pfn, entry, GFP_ATOMIC);
0229         if (xa_is_err(entry))
0230             return xa_err(entry);
0231     }
0232 
0233     return 0;
0234 }
0235 
0236 static void dmirror_do_update(struct dmirror *dmirror, unsigned long start,
0237                   unsigned long end)
0238 {
0239     unsigned long pfn;
0240     void *entry;
0241 
0242     /*
0243      * The XArray doesn't hold references to pages since it relies on
0244      * the mmu notifier to clear page pointers when they become stale.
0245      * Therefore, it is OK to just clear the entry.
0246      */
0247     xa_for_each_range(&dmirror->pt, pfn, entry, start >> PAGE_SHIFT,
0248               end >> PAGE_SHIFT)
0249         xa_erase(&dmirror->pt, pfn);
0250 }
0251 
0252 static bool dmirror_interval_invalidate(struct mmu_interval_notifier *mni,
0253                 const struct mmu_notifier_range *range,
0254                 unsigned long cur_seq)
0255 {
0256     struct dmirror *dmirror = container_of(mni, struct dmirror, notifier);
0257 
0258     /*
0259      * Ignore invalidation callbacks for device private pages since
0260      * the invalidation is handled as part of the migration process.
0261      */
0262     if (range->event == MMU_NOTIFY_MIGRATE &&
0263         range->owner == dmirror->mdevice)
0264         return true;
0265 
0266     if (mmu_notifier_range_blockable(range))
0267         mutex_lock(&dmirror->mutex);
0268     else if (!mutex_trylock(&dmirror->mutex))
0269         return false;
0270 
0271     mmu_interval_set_seq(mni, cur_seq);
0272     dmirror_do_update(dmirror, range->start, range->end);
0273 
0274     mutex_unlock(&dmirror->mutex);
0275     return true;
0276 }
0277 
0278 static const struct mmu_interval_notifier_ops dmirror_min_ops = {
0279     .invalidate = dmirror_interval_invalidate,
0280 };
0281 
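/*
 * Fault in a range and update the mirror using the usual HMM retry
 * protocol: sample the notifier sequence with mmu_interval_read_begin(),
 * call hmm_range_fault() under the mmap read lock, then take the driver
 * lock and check mmu_interval_read_retry() before committing the result.
 * Retries until it succeeds or HMM_RANGE_DEFAULT_TIMEOUT expires.
 */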
0282 static int dmirror_range_fault(struct dmirror *dmirror,
0283                 struct hmm_range *range)
0284 {
0285     struct mm_struct *mm = dmirror->notifier.mm;
0286     unsigned long timeout =
0287         jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
0288     int ret;
0289 
0290     while (true) {
0291         if (time_after(jiffies, timeout)) {
0292             ret = -EBUSY;
0293             goto out;
0294         }
0295 
0296         range->notifier_seq = mmu_interval_read_begin(range->notifier);
0297         mmap_read_lock(mm);
0298         ret = hmm_range_fault(range);
0299         mmap_read_unlock(mm);
0300         if (ret) {
0301             if (ret == -EBUSY)
0302                 continue;
0303             goto out;
0304         }
0305 
0306         mutex_lock(&dmirror->mutex);
0307         if (mmu_interval_read_retry(range->notifier,
0308                         range->notifier_seq)) {
0309             mutex_unlock(&dmirror->mutex);
0310             continue;
0311         }
0312         break;
0313     }
0314 
0315     ret = dmirror_do_fault(dmirror, range);
0316 
0317     mutex_unlock(&dmirror->mutex);
0318 out:
0319     return ret;
0320 }
0321 
0322 static int dmirror_fault(struct dmirror *dmirror, unsigned long start,
0323              unsigned long end, bool write)
0324 {
0325     struct mm_struct *mm = dmirror->notifier.mm;
0326     unsigned long addr;
0327     unsigned long pfns[64];
0328     struct hmm_range range = {
0329         .notifier = &dmirror->notifier,
0330         .hmm_pfns = pfns,
0331         .pfn_flags_mask = 0,
0332         .default_flags =
0333             HMM_PFN_REQ_FAULT | (write ? HMM_PFN_REQ_WRITE : 0),
0334         .dev_private_owner = dmirror->mdevice,
0335     };
0336     int ret = 0;
0337 
0338     /* Since the mm is for the mirrored process, get a reference first. */
0339     if (!mmget_not_zero(mm))
0340         return 0;
0341 
0342     for (addr = start; addr < end; addr = range.end) {
0343         range.start = addr;
0344         range.end = min(addr + (ARRAY_SIZE(pfns) << PAGE_SHIFT), end);
0345 
0346         ret = dmirror_range_fault(dmirror, &range);
0347         if (ret)
0348             break;
0349     }
0350 
0351     mmput(mm);
0352     return ret;
0353 }
0354 
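/*
 * Copy data from the mirrored pages into the bounce buffer, one page at a
 * time. Returns -ENOENT if a page is not present in the mirror so the
 * caller can fault it in and retry. Called with dmirror->mutex held.
 */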
0355 static int dmirror_do_read(struct dmirror *dmirror, unsigned long start,
0356                unsigned long end, struct dmirror_bounce *bounce)
0357 {
0358     unsigned long pfn;
0359     void *ptr;
0360 
0361     ptr = bounce->ptr + ((start - bounce->addr) & PAGE_MASK);
0362 
0363     for (pfn = start >> PAGE_SHIFT; pfn < (end >> PAGE_SHIFT); pfn++) {
0364         void *entry;
0365         struct page *page;
0366         void *tmp;
0367 
0368         entry = xa_load(&dmirror->pt, pfn);
0369         page = xa_untag_pointer(entry);
0370         if (!page)
0371             return -ENOENT;
0372 
0373         tmp = kmap(page);
0374         memcpy(ptr, tmp, PAGE_SIZE);
0375         kunmap(page);
0376 
0377         ptr += PAGE_SIZE;
0378         bounce->cpages++;
0379     }
0380 
0381     return 0;
0382 }
0383 
0384 static int dmirror_read(struct dmirror *dmirror, struct hmm_dmirror_cmd *cmd)
0385 {
0386     struct dmirror_bounce bounce;
0387     unsigned long start, end;
0388     unsigned long size = cmd->npages << PAGE_SHIFT;
0389     int ret;
0390 
0391     start = cmd->addr;
0392     end = start + size;
0393     if (end < start)
0394         return -EINVAL;
0395 
0396     ret = dmirror_bounce_init(&bounce, start, size);
0397     if (ret)
0398         return ret;
0399 
0400     while (1) {
0401         mutex_lock(&dmirror->mutex);
0402         ret = dmirror_do_read(dmirror, start, end, &bounce);
0403         mutex_unlock(&dmirror->mutex);
0404         if (ret != -ENOENT)
0405             break;
0406 
0407         start = cmd->addr + (bounce.cpages << PAGE_SHIFT);
0408         ret = dmirror_fault(dmirror, start, end, false);
0409         if (ret)
0410             break;
0411         cmd->faults++;
0412     }
0413 
0414     if (ret == 0) {
0415         if (copy_to_user(u64_to_user_ptr(cmd->ptr), bounce.ptr,
0416                  bounce.size))
0417             ret = -EFAULT;
0418     }
0419     cmd->cpages = bounce.cpages;
0420     dmirror_bounce_fini(&bounce);
0421     return ret;
0422 }
0423 
0424 static int dmirror_do_write(struct dmirror *dmirror, unsigned long start,
0425                 unsigned long end, struct dmirror_bounce *bounce)
0426 {
0427     unsigned long pfn;
0428     void *ptr;
0429 
0430     ptr = bounce->ptr + ((start - bounce->addr) & PAGE_MASK);
0431 
0432     for (pfn = start >> PAGE_SHIFT; pfn < (end >> PAGE_SHIFT); pfn++) {
0433         void *entry;
0434         struct page *page;
0435         void *tmp;
0436 
0437         entry = xa_load(&dmirror->pt, pfn);
0438         page = xa_untag_pointer(entry);
0439         if (!page || xa_pointer_tag(entry) != DPT_XA_TAG_WRITE)
0440             return -ENOENT;
0441 
0442         tmp = kmap(page);
0443         memcpy(tmp, ptr, PAGE_SIZE);
0444         kunmap(page);
0445 
0446         ptr += PAGE_SIZE;
0447         bounce->cpages++;
0448     }
0449 
0450     return 0;
0451 }
0452 
0453 static int dmirror_write(struct dmirror *dmirror, struct hmm_dmirror_cmd *cmd)
0454 {
0455     struct dmirror_bounce bounce;
0456     unsigned long start, end;
0457     unsigned long size = cmd->npages << PAGE_SHIFT;
0458     int ret;
0459 
0460     start = cmd->addr;
0461     end = start + size;
0462     if (end < start)
0463         return -EINVAL;
0464 
0465     ret = dmirror_bounce_init(&bounce, start, size);
0466     if (ret)
0467         return ret;
0468     if (copy_from_user(bounce.ptr, u64_to_user_ptr(cmd->ptr),
0469                bounce.size)) {
0470         ret = -EFAULT;
0471         goto fini;
0472     }
0473 
0474     while (1) {
0475         mutex_lock(&dmirror->mutex);
0476         ret = dmirror_do_write(dmirror, start, end, &bounce);
0477         mutex_unlock(&dmirror->mutex);
0478         if (ret != -ENOENT)
0479             break;
0480 
0481         start = cmd->addr + (bounce.cpages << PAGE_SHIFT);
0482         ret = dmirror_fault(dmirror, start, end, true);
0483         if (ret)
0484             break;
0485         cmd->faults++;
0486     }
0487 
0488 fini:
0489     cmd->cpages = bounce.cpages;
0490     dmirror_bounce_fini(&bounce);
0491     return ret;
0492 }
0493 
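/*
 * Add another DEVMEM_CHUNK_SIZE chunk of simulated device memory. For the
 * device private case a free physical range is requested from iomem; for
 * the device coherent case the range given by spm_addr_dev0/1 is used.
 * The chunk is mapped with memremap_pages() and its struct pages are
 * threaded onto mdevice->free_pages. If ppage is non-NULL, one page is
 * handed straight back to the caller.
 */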
0494 static int dmirror_allocate_chunk(struct dmirror_device *mdevice,
0495                    struct page **ppage)
0496 {
0497     struct dmirror_chunk *devmem;
0498     struct resource *res = NULL;
0499     unsigned long pfn;
0500     unsigned long pfn_first;
0501     unsigned long pfn_last;
0502     void *ptr;
0503     int ret = -ENOMEM;
0504 
0505     devmem = kzalloc(sizeof(*devmem), GFP_KERNEL);
0506     if (!devmem)
0507         return ret;
0508 
0509     switch (mdevice->zone_device_type) {
0510     case HMM_DMIRROR_MEMORY_DEVICE_PRIVATE:
0511         res = request_free_mem_region(&iomem_resource, DEVMEM_CHUNK_SIZE,
0512                           "hmm_dmirror");
0513         if (IS_ERR_OR_NULL(res))
0514             goto err_devmem;
0515         devmem->pagemap.range.start = res->start;
0516         devmem->pagemap.range.end = res->end;
0517         devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;
0518         break;
0519     case HMM_DMIRROR_MEMORY_DEVICE_COHERENT:
0520         devmem->pagemap.range.start = (MINOR(mdevice->cdevice.dev) - 2) ?
0521                             spm_addr_dev0 :
0522                             spm_addr_dev1;
0523         devmem->pagemap.range.end = devmem->pagemap.range.start +
0524                         DEVMEM_CHUNK_SIZE - 1;
0525         devmem->pagemap.type = MEMORY_DEVICE_COHERENT;
0526         break;
0527     default:
0528         ret = -EINVAL;
0529         goto err_devmem;
0530     }
0531 
0532     devmem->pagemap.nr_range = 1;
0533     devmem->pagemap.ops = &dmirror_devmem_ops;
0534     devmem->pagemap.owner = mdevice;
0535 
0536     mutex_lock(&mdevice->devmem_lock);
0537 
0538     if (mdevice->devmem_count == mdevice->devmem_capacity) {
0539         struct dmirror_chunk **new_chunks;
0540         unsigned int new_capacity;
0541 
0542         new_capacity = mdevice->devmem_capacity +
0543                 DEVMEM_CHUNKS_RESERVE;
0544         new_chunks = krealloc(mdevice->devmem_chunks,
0545                 sizeof(new_chunks[0]) * new_capacity,
0546                 GFP_KERNEL);
0547         if (!new_chunks)
0548             goto err_release;
0549         mdevice->devmem_capacity = new_capacity;
0550         mdevice->devmem_chunks = new_chunks;
0551     }
0552     ptr = memremap_pages(&devmem->pagemap, numa_node_id());
0553     if (IS_ERR_OR_NULL(ptr)) {
0554         if (ptr)
0555             ret = PTR_ERR(ptr);
0556         else
0557             ret = -EFAULT;
0558         goto err_release;
0559     }
0560 
0561     devmem->mdevice = mdevice;
0562     pfn_first = devmem->pagemap.range.start >> PAGE_SHIFT;
0563     pfn_last = pfn_first + (range_len(&devmem->pagemap.range) >> PAGE_SHIFT);
0564     mdevice->devmem_chunks[mdevice->devmem_count++] = devmem;
0565 
0566     mutex_unlock(&mdevice->devmem_lock);
0567 
0568     pr_info("added new %u MB chunk (total %u chunks, %u MB) PFNs [0x%lx 0x%lx)\n",
0569         DEVMEM_CHUNK_SIZE / (1024 * 1024),
0570         mdevice->devmem_count,
0571         mdevice->devmem_count * (DEVMEM_CHUNK_SIZE / (1024 * 1024)),
0572         pfn_first, pfn_last);
0573 
0574     spin_lock(&mdevice->lock);
0575     for (pfn = pfn_first; pfn < pfn_last; pfn++) {
0576         struct page *page = pfn_to_page(pfn);
0577 
0578         page->zone_device_data = mdevice->free_pages;
0579         mdevice->free_pages = page;
0580     }
0581     if (ppage) {
0582         *ppage = mdevice->free_pages;
0583         mdevice->free_pages = (*ppage)->zone_device_data;
0584         mdevice->calloc++;
0585     }
0586     spin_unlock(&mdevice->lock);
0587 
0588     return 0;
0589 
0590 err_release:
0591     mutex_unlock(&mdevice->devmem_lock);
0592     if (res && devmem->pagemap.type == MEMORY_DEVICE_PRIVATE)
0593         release_mem_region(devmem->pagemap.range.start,
0594                    range_len(&devmem->pagemap.range));
0595 err_devmem:
0596     kfree(devmem);
0597 
0598     return ret;
0599 }
0600 
0601 static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice)
0602 {
0603     struct page *dpage = NULL;
0604     struct page *rpage = NULL;
0605 
0606     /*
0607      * For ZONE_DEVICE private type, this is a fake device so we allocate
0608      * real system memory to store our device memory.
0609      * For ZONE_DEVICE coherent type we use the actual dpage to store the
0610      * data and ignore rpage.
0611      */
0612     if (dmirror_is_private_zone(mdevice)) {
0613         rpage = alloc_page(GFP_HIGHUSER);
0614         if (!rpage)
0615             return NULL;
0616     }
0617     spin_lock(&mdevice->lock);
0618 
0619     if (mdevice->free_pages) {
0620         dpage = mdevice->free_pages;
0621         mdevice->free_pages = dpage->zone_device_data;
0622         mdevice->calloc++;
0623         spin_unlock(&mdevice->lock);
0624     } else {
0625         spin_unlock(&mdevice->lock);
0626         if (dmirror_allocate_chunk(mdevice, &dpage))
0627             goto error;
0628     }
0629 
0630     dpage->zone_device_data = rpage;
0631     lock_page(dpage);
0632     return dpage;
0633 
0634 error:
0635     if (rpage)
0636         __free_page(rpage);
0637     return NULL;
0638 }
0639 
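/*
 * "Alloc and copy" step when migrating to device memory: for every source
 * PFN marked MIGRATE_PFN_MIGRATE, allocate a device page, copy (or clear)
 * the source data into its backing page and report the destination PFN
 * back to the migrate_vma core, preserving write permission.
 */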
0640 static void dmirror_migrate_alloc_and_copy(struct migrate_vma *args,
0641                        struct dmirror *dmirror)
0642 {
0643     struct dmirror_device *mdevice = dmirror->mdevice;
0644     const unsigned long *src = args->src;
0645     unsigned long *dst = args->dst;
0646     unsigned long addr;
0647 
0648     for (addr = args->start; addr < args->end; addr += PAGE_SIZE,
0649                            src++, dst++) {
0650         struct page *spage;
0651         struct page *dpage;
0652         struct page *rpage;
0653 
0654         if (!(*src & MIGRATE_PFN_MIGRATE))
0655             continue;
0656 
0657         /*
0658          * Note that spage might be NULL which is OK since it is an
0659          * unallocated pte_none() or read-only zero page.
0660          */
0661         spage = migrate_pfn_to_page(*src);
0662         if (WARN(spage && is_zone_device_page(spage),
0663              "page already in device spage pfn: 0x%lx\n",
0664              page_to_pfn(spage)))
0665             continue;
0666 
0667         dpage = dmirror_devmem_alloc_page(mdevice);
0668         if (!dpage)
0669             continue;
0670 
0671         rpage = BACKING_PAGE(dpage);
0672         if (spage)
0673             copy_highpage(rpage, spage);
0674         else
0675             clear_highpage(rpage);
0676 
0677         /*
0678          * Normally, a device would use the page->zone_device_data to
0679          * point to the mirror but here we use it to hold the page for
0680          * the simulated device memory and that page holds the pointer
0681          * to the mirror.
0682          */
0683         rpage->zone_device_data = dmirror;
0684 
0685         pr_debug("migrating from sys to dev pfn src: 0x%lx pfn dst: 0x%lx\n",
0686              page_to_pfn(spage), page_to_pfn(dpage));
0687         *dst = migrate_pfn(page_to_pfn(dpage));
0688         if ((*src & MIGRATE_PFN_WRITE) ||
0689             (!spage && args->vma->vm_flags & VM_WRITE))
0690             *dst |= MIGRATE_PFN_WRITE;
0691     }
0692 }
0693 
0694 static int dmirror_check_atomic(struct dmirror *dmirror, unsigned long start,
0695                  unsigned long end)
0696 {
0697     unsigned long pfn;
0698 
0699     for (pfn = start >> PAGE_SHIFT; pfn < (end >> PAGE_SHIFT); pfn++) {
0700         void *entry;
0701 
0702         entry = xa_load(&dmirror->pt, pfn);
0703         if (xa_pointer_tag(entry) == DPT_XA_TAG_ATOMIC)
0704             return -EPERM;
0705     }
0706 
0707     return 0;
0708 }
0709 
0710 static int dmirror_atomic_map(unsigned long start, unsigned long end,
0711                   struct page **pages, struct dmirror *dmirror)
0712 {
0713     unsigned long pfn, mapped = 0;
0714     int i;
0715 
0716     /* Map the migrated pages into the device's page tables. */
0717     mutex_lock(&dmirror->mutex);
0718 
0719     for (i = 0, pfn = start >> PAGE_SHIFT; pfn < (end >> PAGE_SHIFT); pfn++, i++) {
0720         void *entry;
0721 
0722         if (!pages[i])
0723             continue;
0724 
0725         entry = pages[i];
0726         entry = xa_tag_pointer(entry, DPT_XA_TAG_ATOMIC);
0727         entry = xa_store(&dmirror->pt, pfn, entry, GFP_ATOMIC);
0728         if (xa_is_err(entry)) {
0729             mutex_unlock(&dmirror->mutex);
0730             return xa_err(entry);
0731         }
0732 
0733         mapped++;
0734     }
0735 
0736     mutex_unlock(&dmirror->mutex);
0737     return mapped;
0738 }
0739 
0740 static int dmirror_migrate_finalize_and_map(struct migrate_vma *args,
0741                         struct dmirror *dmirror)
0742 {
0743     unsigned long start = args->start;
0744     unsigned long end = args->end;
0745     const unsigned long *src = args->src;
0746     const unsigned long *dst = args->dst;
0747     unsigned long pfn;
0748 
0749     /* Map the migrated pages into the device's page tables. */
0750     mutex_lock(&dmirror->mutex);
0751 
0752     for (pfn = start >> PAGE_SHIFT; pfn < (end >> PAGE_SHIFT); pfn++,
0753                                 src++, dst++) {
0754         struct page *dpage;
0755         void *entry;
0756 
0757         if (!(*src & MIGRATE_PFN_MIGRATE))
0758             continue;
0759 
0760         dpage = migrate_pfn_to_page(*dst);
0761         if (!dpage)
0762             continue;
0763 
0764         entry = BACKING_PAGE(dpage);
0765         if (*dst & MIGRATE_PFN_WRITE)
0766             entry = xa_tag_pointer(entry, DPT_XA_TAG_WRITE);
0767         entry = xa_store(&dmirror->pt, pfn, entry, GFP_ATOMIC);
0768         if (xa_is_err(entry)) {
0769             mutex_unlock(&dmirror->mutex);
0770             return xa_err(entry);
0771         }
0772     }
0773 
0774     mutex_unlock(&dmirror->mutex);
0775     return 0;
0776 }
0777 
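/*
 * HMM_DMIRROR_EXCLUSIVE: mark a range for exclusive device access with
 * make_device_exclusive_range() and record the pages in the mirror with
 * DPT_XA_TAG_ATOMIC (via dmirror_atomic_map()), then copy the data back
 * to userspace for verification.
 */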
0778 static int dmirror_exclusive(struct dmirror *dmirror,
0779                  struct hmm_dmirror_cmd *cmd)
0780 {
0781     unsigned long start, end, addr;
0782     unsigned long size = cmd->npages << PAGE_SHIFT;
0783     struct mm_struct *mm = dmirror->notifier.mm;
0784     struct page *pages[64];
0785     struct dmirror_bounce bounce;
0786     unsigned long next;
0787     int ret;
0788 
0789     start = cmd->addr;
0790     end = start + size;
0791     if (end < start)
0792         return -EINVAL;
0793 
0794     /* Since the mm is for the mirrored process, get a reference first. */
0795     if (!mmget_not_zero(mm))
0796         return -EINVAL;
0797 
0798     mmap_read_lock(mm);
0799     for (addr = start; addr < end; addr = next) {
0800         unsigned long mapped = 0;
0801         int i;
0802 
0803         if (end < addr + (ARRAY_SIZE(pages) << PAGE_SHIFT))
0804             next = end;
0805         else
0806             next = addr + (ARRAY_SIZE(pages) << PAGE_SHIFT);
0807 
0808         ret = make_device_exclusive_range(mm, addr, next, pages, NULL);
0809         /*
0810          * Do dmirror_atomic_map() iff all pages are marked for
0811          * exclusive access to avoid accessing uninitialized
0812          * fields of pages.
0813          */
0814         if (ret == (next - addr) >> PAGE_SHIFT)
0815             mapped = dmirror_atomic_map(addr, next, pages, dmirror);
0816         for (i = 0; i < ret; i++) {
0817             if (pages[i]) {
0818                 unlock_page(pages[i]);
0819                 put_page(pages[i]);
0820             }
0821         }
0822 
0823         if (addr + (mapped << PAGE_SHIFT) < next) {
0824             mmap_read_unlock(mm);
0825             mmput(mm);
0826             return -EBUSY;
0827         }
0828     }
0829     mmap_read_unlock(mm);
0830     mmput(mm);
0831 
0832     /* Return the migrated data for verification. */
0833     ret = dmirror_bounce_init(&bounce, start, size);
0834     if (ret)
0835         return ret;
0836     mutex_lock(&dmirror->mutex);
0837     ret = dmirror_do_read(dmirror, start, end, &bounce);
0838     mutex_unlock(&dmirror->mutex);
0839     if (ret == 0) {
0840         if (copy_to_user(u64_to_user_ptr(cmd->ptr), bounce.ptr,
0841                  bounce.size))
0842             ret = -EFAULT;
0843     }
0844 
0845     cmd->cpages = bounce.cpages;
0846     dmirror_bounce_fini(&bounce);
0847     return ret;
0848 }
0849 
0850 static vm_fault_t dmirror_devmem_fault_alloc_and_copy(struct migrate_vma *args,
0851                               struct dmirror *dmirror)
0852 {
0853     const unsigned long *src = args->src;
0854     unsigned long *dst = args->dst;
0855     unsigned long start = args->start;
0856     unsigned long end = args->end;
0857     unsigned long addr;
0858 
0859     for (addr = start; addr < end; addr += PAGE_SIZE,
0860                        src++, dst++) {
0861         struct page *dpage, *spage;
0862 
0863         spage = migrate_pfn_to_page(*src);
0864         if (!spage || !(*src & MIGRATE_PFN_MIGRATE))
0865             continue;
0866 
0867         if (WARN_ON(!is_device_private_page(spage) &&
0868                 !is_device_coherent_page(spage)))
0869             continue;
0870         spage = BACKING_PAGE(spage);
0871         dpage = alloc_page_vma(GFP_HIGHUSER_MOVABLE, args->vma, addr);
0872         if (!dpage)
0873             continue;
0874         pr_debug("migrating from dev to sys pfn src: 0x%lx pfn dst: 0x%lx\n",
0875              page_to_pfn(spage), page_to_pfn(dpage));
0876 
0877         lock_page(dpage);
0878         xa_erase(&dmirror->pt, addr >> PAGE_SHIFT);
0879         copy_highpage(dpage, spage);
0880         *dst = migrate_pfn(page_to_pfn(dpage));
0881         if (*src & MIGRATE_PFN_WRITE)
0882             *dst |= MIGRATE_PFN_WRITE;
0883     }
0884     return 0;
0885 }
0886 
0887 static unsigned long
0888 dmirror_successful_migrated_pages(struct migrate_vma *migrate)
0889 {
0890     unsigned long cpages = 0;
0891     unsigned long i;
0892 
0893     for (i = 0; i < migrate->npages; i++) {
0894         if (migrate->src[i] & MIGRATE_PFN_VALID &&
0895             migrate->src[i] & MIGRATE_PFN_MIGRATE)
0896             cpages++;
0897     }
0898     return cpages;
0899 }
0900 
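/*
 * HMM_DMIRROR_MIGRATE_TO_SYS: migrate any device pages in the range back
 * to system memory, 64 pages at a time, reporting the number of pages
 * successfully migrated in cmd->cpages.
 */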
0901 static int dmirror_migrate_to_system(struct dmirror *dmirror,
0902                      struct hmm_dmirror_cmd *cmd)
0903 {
0904     unsigned long start, end, addr;
0905     unsigned long size = cmd->npages << PAGE_SHIFT;
0906     struct mm_struct *mm = dmirror->notifier.mm;
0907     struct vm_area_struct *vma;
0908     unsigned long src_pfns[64] = { 0 };
0909     unsigned long dst_pfns[64] = { 0 };
0910     struct migrate_vma args;
0911     unsigned long next;
0912     int ret;
0913 
0914     start = cmd->addr;
0915     end = start + size;
0916     if (end < start)
0917         return -EINVAL;
0918 
0919     /* Since the mm is for the mirrored process, get a reference first. */
0920     if (!mmget_not_zero(mm))
0921         return -EINVAL;
0922 
0923     cmd->cpages = 0;
0924     mmap_read_lock(mm);
0925     for (addr = start; addr < end; addr = next) {
0926         vma = vma_lookup(mm, addr);
0927         if (!vma || !(vma->vm_flags & VM_READ)) {
0928             ret = -EINVAL;
0929             goto out;
0930         }
0931         next = min(end, addr + (ARRAY_SIZE(src_pfns) << PAGE_SHIFT));
0932         if (next > vma->vm_end)
0933             next = vma->vm_end;
0934 
0935         args.vma = vma;
0936         args.src = src_pfns;
0937         args.dst = dst_pfns;
0938         args.start = addr;
0939         args.end = next;
0940         args.pgmap_owner = dmirror->mdevice;
0941         args.flags = dmirror_select_device(dmirror);
0942 
0943         ret = migrate_vma_setup(&args);
0944         if (ret)
0945             goto out;
0946 
0947         pr_debug("Migrating from device mem to sys mem\n");
0948         dmirror_devmem_fault_alloc_and_copy(&args, dmirror);
0949 
0950         migrate_vma_pages(&args);
0951         cmd->cpages += dmirror_successful_migrated_pages(&args);
0952         migrate_vma_finalize(&args);
0953     }
0954 out:
0955     mmap_read_unlock(mm);
0956     mmput(mm);
0957 
0958     return ret;
0959 }
0960 
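/*
 * HMM_DMIRROR_MIGRATE_TO_DEV: migrate system memory in the range to the
 * simulated device, following the migrate_vma protocol:
 * migrate_vma_setup(), copy into freshly allocated device pages
 * (dmirror_migrate_alloc_and_copy()), migrate_vma_pages(), update the
 * mirror's page table (dmirror_migrate_finalize_and_map()) and finally
 * migrate_vma_finalize(). The migrated data is then read back through
 * the mirror and returned to userspace for verification.
 */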
0961 static int dmirror_migrate_to_device(struct dmirror *dmirror,
0962                 struct hmm_dmirror_cmd *cmd)
0963 {
0964     unsigned long start, end, addr;
0965     unsigned long size = cmd->npages << PAGE_SHIFT;
0966     struct mm_struct *mm = dmirror->notifier.mm;
0967     struct vm_area_struct *vma;
0968     unsigned long src_pfns[64] = { 0 };
0969     unsigned long dst_pfns[64] = { 0 };
0970     struct dmirror_bounce bounce;
0971     struct migrate_vma args;
0972     unsigned long next;
0973     int ret;
0974 
0975     start = cmd->addr;
0976     end = start + size;
0977     if (end < start)
0978         return -EINVAL;
0979 
0980     /* Since the mm is for the mirrored process, get a reference first. */
0981     if (!mmget_not_zero(mm))
0982         return -EINVAL;
0983 
0984     mmap_read_lock(mm);
0985     for (addr = start; addr < end; addr = next) {
0986         vma = vma_lookup(mm, addr);
0987         if (!vma || !(vma->vm_flags & VM_READ)) {
0988             ret = -EINVAL;
0989             goto out;
0990         }
0991         next = min(end, addr + (ARRAY_SIZE(src_pfns) << PAGE_SHIFT));
0992         if (next > vma->vm_end)
0993             next = vma->vm_end;
0994 
0995         args.vma = vma;
0996         args.src = src_pfns;
0997         args.dst = dst_pfns;
0998         args.start = addr;
0999         args.end = next;
1000         args.pgmap_owner = dmirror->mdevice;
1001         args.flags = MIGRATE_VMA_SELECT_SYSTEM;
1002         ret = migrate_vma_setup(&args);
1003         if (ret)
1004             goto out;
1005 
1006         pr_debug("Migrating from sys mem to device mem\n");
1007         dmirror_migrate_alloc_and_copy(&args, dmirror);
1008         migrate_vma_pages(&args);
1009         dmirror_migrate_finalize_and_map(&args, dmirror);
1010         migrate_vma_finalize(&args);
1011     }
1012     mmap_read_unlock(mm);
1013     mmput(mm);
1014 
1015     /*
1016      * Return the migrated data for verification.
1017      * Only for pages in device zone
1018      */
1019     ret = dmirror_bounce_init(&bounce, start, size);
1020     if (ret)
1021         return ret;
1022     mutex_lock(&dmirror->mutex);
1023     ret = dmirror_do_read(dmirror, start, end, &bounce);
1024     mutex_unlock(&dmirror->mutex);
1025     if (ret == 0) {
1026         if (copy_to_user(u64_to_user_ptr(cmd->ptr), bounce.ptr,
1027                  bounce.size))
1028             ret = -EFAULT;
1029     }
1030     cmd->cpages = bounce.cpages;
1031     dmirror_bounce_fini(&bounce);
1032     return ret;
1033 
1034 out:
1035     mmap_read_unlock(mm);
1036     mmput(mm);
1037     return ret;
1038 }
1039 
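/*
 * Convert one hmm_range_fault() result into the HMM_DMIRROR_PROT_* byte
 * reported by HMM_DMIRROR_SNAPSHOT: error/none/zero page, whether the
 * page lives in this or another device's private/coherent memory, read
 * vs. write permission, and whether it is mapped at PMD or PUD size.
 */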
1040 static void dmirror_mkentry(struct dmirror *dmirror, struct hmm_range *range,
1041                 unsigned char *perm, unsigned long entry)
1042 {
1043     struct page *page;
1044 
1045     if (entry & HMM_PFN_ERROR) {
1046         *perm = HMM_DMIRROR_PROT_ERROR;
1047         return;
1048     }
1049     if (!(entry & HMM_PFN_VALID)) {
1050         *perm = HMM_DMIRROR_PROT_NONE;
1051         return;
1052     }
1053 
1054     page = hmm_pfn_to_page(entry);
1055     if (is_device_private_page(page)) {
1056         /* Is the page migrated to this device or some other? */
1057         if (dmirror->mdevice == dmirror_page_to_device(page))
1058             *perm = HMM_DMIRROR_PROT_DEV_PRIVATE_LOCAL;
1059         else
1060             *perm = HMM_DMIRROR_PROT_DEV_PRIVATE_REMOTE;
1061     } else if (is_device_coherent_page(page)) {
1062         /* Is the page migrated to this device or some other? */
1063         if (dmirror->mdevice == dmirror_page_to_device(page))
1064             *perm = HMM_DMIRROR_PROT_DEV_COHERENT_LOCAL;
1065         else
1066             *perm = HMM_DMIRROR_PROT_DEV_COHERENT_REMOTE;
1067     } else if (is_zero_pfn(page_to_pfn(page)))
1068         *perm = HMM_DMIRROR_PROT_ZERO;
1069     else
1070         *perm = HMM_DMIRROR_PROT_NONE;
1071     if (entry & HMM_PFN_WRITE)
1072         *perm |= HMM_DMIRROR_PROT_WRITE;
1073     else
1074         *perm |= HMM_DMIRROR_PROT_READ;
1075     if (hmm_pfn_to_map_order(entry) + PAGE_SHIFT == PMD_SHIFT)
1076         *perm |= HMM_DMIRROR_PROT_PMD;
1077     else if (hmm_pfn_to_map_order(entry) + PAGE_SHIFT == PUD_SHIFT)
1078         *perm |= HMM_DMIRROR_PROT_PUD;
1079 }
1080 
1081 static bool dmirror_snapshot_invalidate(struct mmu_interval_notifier *mni,
1082                 const struct mmu_notifier_range *range,
1083                 unsigned long cur_seq)
1084 {
1085     struct dmirror_interval *dmi =
1086         container_of(mni, struct dmirror_interval, notifier);
1087     struct dmirror *dmirror = dmi->dmirror;
1088 
1089     if (mmu_notifier_range_blockable(range))
1090         mutex_lock(&dmirror->mutex);
1091     else if (!mutex_trylock(&dmirror->mutex))
1092         return false;
1093 
1094     /*
1095      * Snapshots only need to set the sequence number since any
1096      * invalidation in the interval invalidates the whole snapshot.
1097      */
1098     mmu_interval_set_seq(mni, cur_seq);
1099 
1100     mutex_unlock(&dmirror->mutex);
1101     return true;
1102 }
1103 
1104 static const struct mmu_interval_notifier_ops dmirror_mrn_ops = {
1105     .invalidate = dmirror_snapshot_invalidate,
1106 };
1107 
1108 static int dmirror_range_snapshot(struct dmirror *dmirror,
1109                   struct hmm_range *range,
1110                   unsigned char *perm)
1111 {
1112     struct mm_struct *mm = dmirror->notifier.mm;
1113     struct dmirror_interval notifier;
1114     unsigned long timeout =
1115         jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
1116     unsigned long i;
1117     unsigned long n;
1118     int ret = 0;
1119 
1120     notifier.dmirror = dmirror;
1121     range->notifier = &notifier.notifier;
1122 
1123     ret = mmu_interval_notifier_insert(range->notifier, mm,
1124             range->start, range->end - range->start,
1125             &dmirror_mrn_ops);
1126     if (ret)
1127         return ret;
1128 
1129     while (true) {
1130         if (time_after(jiffies, timeout)) {
1131             ret = -EBUSY;
1132             goto out;
1133         }
1134 
1135         range->notifier_seq = mmu_interval_read_begin(range->notifier);
1136 
1137         mmap_read_lock(mm);
1138         ret = hmm_range_fault(range);
1139         mmap_read_unlock(mm);
1140         if (ret) {
1141             if (ret == -EBUSY)
1142                 continue;
1143             goto out;
1144         }
1145 
1146         mutex_lock(&dmirror->mutex);
1147         if (mmu_interval_read_retry(range->notifier,
1148                         range->notifier_seq)) {
1149             mutex_unlock(&dmirror->mutex);
1150             continue;
1151         }
1152         break;
1153     }
1154 
1155     n = (range->end - range->start) >> PAGE_SHIFT;
1156     for (i = 0; i < n; i++)
1157         dmirror_mkentry(dmirror, range, perm + i, range->hmm_pfns[i]);
1158 
1159     mutex_unlock(&dmirror->mutex);
1160 out:
1161     mmu_interval_notifier_remove(range->notifier);
1162     return ret;
1163 }
1164 
1165 static int dmirror_snapshot(struct dmirror *dmirror,
1166                 struct hmm_dmirror_cmd *cmd)
1167 {
1168     struct mm_struct *mm = dmirror->notifier.mm;
1169     unsigned long start, end;
1170     unsigned long size = cmd->npages << PAGE_SHIFT;
1171     unsigned long addr;
1172     unsigned long next;
1173     unsigned long pfns[64];
1174     unsigned char perm[64];
1175     char __user *uptr;
1176     struct hmm_range range = {
1177         .hmm_pfns = pfns,
1178         .dev_private_owner = dmirror->mdevice,
1179     };
1180     int ret = 0;
1181 
1182     start = cmd->addr;
1183     end = start + size;
1184     if (end < start)
1185         return -EINVAL;
1186 
1187     /* Since the mm is for the mirrored process, get a reference first. */
1188     if (!mmget_not_zero(mm))
1189         return -EINVAL;
1190 
1191     /*
1192      * Register a temporary notifier to detect invalidations even if it
1193      * overlaps with other mmu_interval_notifiers.
1194      */
1195     uptr = u64_to_user_ptr(cmd->ptr);
1196     for (addr = start; addr < end; addr = next) {
1197         unsigned long n;
1198 
1199         next = min(addr + (ARRAY_SIZE(pfns) << PAGE_SHIFT), end);
1200         range.start = addr;
1201         range.end = next;
1202 
1203         ret = dmirror_range_snapshot(dmirror, &range, perm);
1204         if (ret)
1205             break;
1206 
1207         n = (range.end - range.start) >> PAGE_SHIFT;
1208         if (copy_to_user(uptr, perm, n)) {
1209             ret = -EFAULT;
1210             break;
1211         }
1212 
1213         cmd->cpages += n;
1214         uptr += n;
1215     }
1216     mmput(mm);
1217 
1218     return ret;
1219 }
1220 
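/*
 * ioctl entry point: copy in the hmm_dmirror_cmd, check that the address
 * is page aligned and the range does not wrap, dispatch on the command
 * and copy the updated cmd (cpages/faults counters) back to userspace.
 */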
1221 static long dmirror_fops_unlocked_ioctl(struct file *filp,
1222                     unsigned int command,
1223                     unsigned long arg)
1224 {
1225     void __user *uarg = (void __user *)arg;
1226     struct hmm_dmirror_cmd cmd;
1227     struct dmirror *dmirror;
1228     int ret;
1229 
1230     dmirror = filp->private_data;
1231     if (!dmirror)
1232         return -EINVAL;
1233 
1234     if (copy_from_user(&cmd, uarg, sizeof(cmd)))
1235         return -EFAULT;
1236 
1237     if (cmd.addr & ~PAGE_MASK)
1238         return -EINVAL;
1239     if (cmd.addr >= (cmd.addr + (cmd.npages << PAGE_SHIFT)))
1240         return -EINVAL;
1241 
1242     cmd.cpages = 0;
1243     cmd.faults = 0;
1244 
1245     switch (command) {
1246     case HMM_DMIRROR_READ:
1247         ret = dmirror_read(dmirror, &cmd);
1248         break;
1249 
1250     case HMM_DMIRROR_WRITE:
1251         ret = dmirror_write(dmirror, &cmd);
1252         break;
1253 
1254     case HMM_DMIRROR_MIGRATE_TO_DEV:
1255         ret = dmirror_migrate_to_device(dmirror, &cmd);
1256         break;
1257 
1258     case HMM_DMIRROR_MIGRATE_TO_SYS:
1259         ret = dmirror_migrate_to_system(dmirror, &cmd);
1260         break;
1261 
1262     case HMM_DMIRROR_EXCLUSIVE:
1263         ret = dmirror_exclusive(dmirror, &cmd);
1264         break;
1265 
1266     case HMM_DMIRROR_CHECK_EXCLUSIVE:
1267         ret = dmirror_check_atomic(dmirror, cmd.addr,
1268                     cmd.addr + (cmd.npages << PAGE_SHIFT));
1269         break;
1270 
1271     case HMM_DMIRROR_SNAPSHOT:
1272         ret = dmirror_snapshot(dmirror, &cmd);
1273         break;
1274 
1275     default:
1276         return -EINVAL;
1277     }
1278     if (ret)
1279         return ret;
1280 
1281     if (copy_to_user(uarg, &cmd, sizeof(cmd)))
1282         return -EFAULT;
1283 
1284     return 0;
1285 }
1286 
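/*
 * Back the test mapping with freshly allocated, zeroed kernel pages.
 * vm_insert_page() takes its own reference on each page, so the reference
 * from alloc_page() is dropped with put_page() once the page is inserted.
 */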
1287 static int dmirror_fops_mmap(struct file *file, struct vm_area_struct *vma)
1288 {
1289     unsigned long addr;
1290 
1291     for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) {
1292         struct page *page;
1293         int ret;
1294 
1295         page = alloc_page(GFP_KERNEL | __GFP_ZERO);
1296         if (!page)
1297             return -ENOMEM;
1298 
1299         ret = vm_insert_page(vma, addr, page);
1300         if (ret) {
1301             __free_page(page);
1302             return ret;
1303         }
1304         put_page(page);
1305     }
1306 
1307     return 0;
1308 }
1309 
1310 static const struct file_operations dmirror_fops = {
1311     .open       = dmirror_fops_open,
1312     .release    = dmirror_fops_release,
1313     .mmap       = dmirror_fops_mmap,
1314     .unlocked_ioctl = dmirror_fops_unlocked_ioctl,
1315     .llseek     = default_llseek,
1316     .owner      = THIS_MODULE,
1317 };
1318 
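/*
 * dev_pagemap_ops.page_free callback: free the backing system page
 * (device private case only) and return the device page to the device's
 * free list.
 */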
1319 static void dmirror_devmem_free(struct page *page)
1320 {
1321     struct page *rpage = BACKING_PAGE(page);
1322     struct dmirror_device *mdevice;
1323 
1324     if (rpage != page)
1325         __free_page(rpage);
1326 
1327     mdevice = dmirror_page_to_device(page);
1328     spin_lock(&mdevice->lock);
1329     mdevice->cfree++;
1330     page->zone_device_data = mdevice->free_pages;
1331     mdevice->free_pages = page;
1332     spin_unlock(&mdevice->lock);
1333 }
1334 
1335 static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf)
1336 {
1337     struct migrate_vma args;
1338     unsigned long src_pfns = 0;
1339     unsigned long dst_pfns = 0;
1340     struct page *rpage;
1341     struct dmirror *dmirror;
1342     vm_fault_t ret;
1343 
1344     /*
1345      * Normally, a device would use the page->zone_device_data to point to
1346      * the mirror but here we use it to hold the page for the simulated
1347      * device memory and that page holds the pointer to the mirror.
1348      */
1349     rpage = vmf->page->zone_device_data;
1350     dmirror = rpage->zone_device_data;
1351 
1352     /* FIXME demonstrate how we can adjust migrate range */
1353     args.vma = vmf->vma;
1354     args.start = vmf->address;
1355     args.end = args.start + PAGE_SIZE;
1356     args.src = &src_pfns;
1357     args.dst = &dst_pfns;
1358     args.pgmap_owner = dmirror->mdevice;
1359     args.flags = dmirror_select_device(dmirror);
1360 
1361     if (migrate_vma_setup(&args))
1362         return VM_FAULT_SIGBUS;
1363 
1364     ret = dmirror_devmem_fault_alloc_and_copy(&args, dmirror);
1365     if (ret)
1366         return ret;
1367     migrate_vma_pages(&args);
1368     /*
1369      * No device finalize step is needed since
1370      * dmirror_devmem_fault_alloc_and_copy() will have already
1371      * invalidated the device page table.
1372      */
1373     migrate_vma_finalize(&args);
1374     return 0;
1375 }
1376 
1377 static const struct dev_pagemap_ops dmirror_devmem_ops = {
1378     .page_free  = dmirror_devmem_free,
1379     .migrate_to_ram = dmirror_devmem_fault,
1380 };
1381 
1382 static int dmirror_device_init(struct dmirror_device *mdevice, int id)
1383 {
1384     dev_t dev;
1385     int ret;
1386 
1387     dev = MKDEV(MAJOR(dmirror_dev), id);
1388     mutex_init(&mdevice->devmem_lock);
1389     spin_lock_init(&mdevice->lock);
1390 
1391     cdev_init(&mdevice->cdevice, &dmirror_fops);
1392     mdevice->cdevice.owner = THIS_MODULE;
1393     ret = cdev_add(&mdevice->cdevice, dev, 1);
1394     if (ret)
1395         return ret;
1396 
1397     /* Build a list of free ZONE_DEVICE struct pages */
1398     return dmirror_allocate_chunk(mdevice, NULL);
1399 }
1400 
1401 static void dmirror_device_remove(struct dmirror_device *mdevice)
1402 {
1403     unsigned int i;
1404 
1405     if (mdevice->devmem_chunks) {
1406         for (i = 0; i < mdevice->devmem_count; i++) {
1407             struct dmirror_chunk *devmem =
1408                 mdevice->devmem_chunks[i];
1409 
1410             memunmap_pages(&devmem->pagemap);
1411             if (devmem->pagemap.type == MEMORY_DEVICE_PRIVATE)
1412                 release_mem_region(devmem->pagemap.range.start,
1413                            range_len(&devmem->pagemap.range));
1414             kfree(devmem);
1415         }
1416         kfree(mdevice->devmem_chunks);
1417     }
1418 
1419     cdev_del(&mdevice->cdevice);
1420 }
1421 
1422 static int __init hmm_dmirror_init(void)
1423 {
1424     int ret;
1425     int id = 0;
1426     int ndevices = 0;
1427 
1428     ret = alloc_chrdev_region(&dmirror_dev, 0, DMIRROR_NDEVICES,
1429                   "HMM_DMIRROR");
1430     if (ret)
1431         goto err_unreg;
1432 
1433     memset(dmirror_devices, 0, DMIRROR_NDEVICES * sizeof(dmirror_devices[0]));
1434     dmirror_devices[ndevices++].zone_device_type =
1435                 HMM_DMIRROR_MEMORY_DEVICE_PRIVATE;
1436     dmirror_devices[ndevices++].zone_device_type =
1437                 HMM_DMIRROR_MEMORY_DEVICE_PRIVATE;
1438     if (spm_addr_dev0 && spm_addr_dev1) {
1439         dmirror_devices[ndevices++].zone_device_type =
1440                     HMM_DMIRROR_MEMORY_DEVICE_COHERENT;
1441         dmirror_devices[ndevices++].zone_device_type =
1442                     HMM_DMIRROR_MEMORY_DEVICE_COHERENT;
1443     }
1444     for (id = 0; id < ndevices; id++) {
1445         ret = dmirror_device_init(dmirror_devices + id, id);
1446         if (ret)
1447             goto err_chrdev;
1448     }
1449 
1450     pr_info("HMM test module loaded. This is only for testing HMM.\n");
1451     return 0;
1452 
1453 err_chrdev:
1454     while (--id >= 0)
1455         dmirror_device_remove(dmirror_devices + id);
1456     unregister_chrdev_region(dmirror_dev, DMIRROR_NDEVICES);
1457 err_unreg:
1458     return ret;
1459 }
1460 
1461 static void __exit hmm_dmirror_exit(void)
1462 {
1463     int id;
1464 
1465     for (id = 0; id < DMIRROR_NDEVICES; id++)
1466         if (dmirror_devices[id].zone_device_type)
1467             dmirror_device_remove(dmirror_devices + id);
1468     unregister_chrdev_region(dmirror_dev, DMIRROR_NDEVICES);
1469 }
1470 
1471 module_init(hmm_dmirror_init);
1472 module_exit(hmm_dmirror_exit);
1473 MODULE_LICENSE("GPL");