// SPDX-License-Identifier: GPL-2.0
/*
 * Device Memory Migration functionality.
 */
#include <linux/export.h>
#include <linux/memremap.h>
#include <linux/migrate.h>
#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>
#include <linux/oom.h>
#include <linux/pagewalk.h>
#include <linux/rmap.h>
#include <linux/swapops.h>
#include <asm/tlbflush.h>
#include "internal.h"
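
/*
 * Mark all pages in the range [start, end) as not migratable: leave both the
 * src and dst entries at zero so the later stages simply skip them.
 */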
static int migrate_vma_collect_skip(unsigned long start,
				    unsigned long end,
				    struct mm_walk *walk)
{
	struct migrate_vma *migrate = walk->private;
	unsigned long addr;

	for (addr = start; addr < end; addr += PAGE_SIZE) {
		migrate->dst[migrate->npages] = 0;
		migrate->src[migrate->npages++] = 0;
	}

	return 0;
}
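
/*
 * pte_hole callback: addresses without page table backing can still be
 * migrated when the VMA is anonymous, since the driver may want to allocate
 * device memory for them, so mark them migratable with no source page.
 */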
static int migrate_vma_collect_hole(unsigned long start,
				    unsigned long end,
				    __always_unused int depth,
				    struct mm_walk *walk)
{
	struct migrate_vma *migrate = walk->private;
	unsigned long addr;

	/* Only allow populating anonymous memory. */
	if (!vma_is_anonymous(walk->vma))
		return migrate_vma_collect_skip(start, end, walk);

	for (addr = start; addr < end; addr += PAGE_SIZE) {
		migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE;
		migrate->dst[migrate->npages] = 0;
		migrate->npages++;
		migrate->cpages++;
	}

	return 0;
}
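
/*
 * pmd_entry callback: walk the ptes under one pmd, record a migrate pfn for
 * each page that may be migrated and, where possible, replace the pte with a
 * migration entry right away.
 */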
static int migrate_vma_collect_pmd(pmd_t *pmdp,
				   unsigned long start,
				   unsigned long end,
				   struct mm_walk *walk)
{
	struct migrate_vma *migrate = walk->private;
	struct vm_area_struct *vma = walk->vma;
	struct mm_struct *mm = vma->vm_mm;
	unsigned long addr = start, unmapped = 0;
	spinlock_t *ptl;
	pte_t *ptep;

again:
	if (pmd_none(*pmdp))
		return migrate_vma_collect_hole(start, end, -1, walk);
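
	/*
	 * A huge pmd cannot be collected one page at a time: split the huge
	 * zero page in place, or split a real THP after taking a reference
	 * and locking it. If the split fails, skip the whole range.
	 */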
	if (pmd_trans_huge(*pmdp)) {
		struct page *page;

		ptl = pmd_lock(mm, pmdp);
		if (unlikely(!pmd_trans_huge(*pmdp))) {
			spin_unlock(ptl);
			goto again;
		}

		page = pmd_page(*pmdp);
		if (is_huge_zero_page(page)) {
			spin_unlock(ptl);
			split_huge_pmd(vma, pmdp, addr);
			if (pmd_trans_unstable(pmdp))
				return migrate_vma_collect_skip(start, end,
								walk);
		} else {
			int ret;

			get_page(page);
			spin_unlock(ptl);
			if (unlikely(!trylock_page(page)))
				return migrate_vma_collect_skip(start, end,
								walk);
			ret = split_huge_page(page);
			unlock_page(page);
			put_page(page);
			if (ret)
				return migrate_vma_collect_skip(start, end,
								walk);
			if (pmd_none(*pmdp))
				return migrate_vma_collect_hole(start, end, -1,
								walk);
		}
	}

	if (unlikely(pmd_bad(*pmdp)))
		return migrate_vma_collect_skip(start, end, walk);

	ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
	arch_enter_lazy_mmu_mode();
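
	/*
	 * For each pte in the pmd decide whether the backing page can be
	 * migrated, record it in the src array and, when the page can be
	 * locked, install a migration entry in place of the pte now.
	 */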
	for (; addr < end; addr += PAGE_SIZE, ptep++) {
		unsigned long mpfn = 0, pfn;
		struct page *page;
		swp_entry_t entry;
		pte_t pte;

		pte = *ptep;

		if (pte_none(pte)) {
			if (vma_is_anonymous(vma)) {
				mpfn = MIGRATE_PFN_MIGRATE;
				migrate->cpages++;
			}
			goto next;
		}

		if (!pte_present(pte)) {
			/*
			 * Only care about unaddressable device page special
			 * page table entry. Other special swap entries are not
			 * migratable, and we ignore regular swapped page.
			 */
			entry = pte_to_swp_entry(pte);
			if (!is_device_private_entry(entry))
				goto next;

			page = pfn_swap_entry_to_page(entry);
			if (!(migrate->flags &
				MIGRATE_VMA_SELECT_DEVICE_PRIVATE) ||
			    page->pgmap->owner != migrate->pgmap_owner)
				goto next;

			mpfn = migrate_pfn(page_to_pfn(page)) |
					MIGRATE_PFN_MIGRATE;
			if (is_writable_device_private_entry(entry))
				mpfn |= MIGRATE_PFN_WRITE;
		} else {
			pfn = pte_pfn(pte);
			if (is_zero_pfn(pfn) &&
			    (migrate->flags & MIGRATE_VMA_SELECT_SYSTEM)) {
				mpfn = MIGRATE_PFN_MIGRATE;
				migrate->cpages++;
				goto next;
			}
			page = vm_normal_page(migrate->vma, addr, pte);
			if (page && !is_zone_device_page(page) &&
			    !(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM))
				goto next;
			else if (page && is_device_coherent_page(page) &&
			    (!(migrate->flags & MIGRATE_VMA_SELECT_DEVICE_COHERENT) ||
			     page->pgmap->owner != migrate->pgmap_owner))
				goto next;
			mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
			mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
		}

		/* FIXME support THP */
		if (!page || !page->mapping || PageTransCompound(page)) {
			mpfn = 0;
			goto next;
		}
		/*
		 * By getting a reference on the page we pin it and that blocks
		 * any kind of migration. Side effect is that it "freezes" the
		 * pte.
		 *
		 * We drop this reference after isolating the page from the lru
		 * for non device page (device page are not on the lru and thus
		 * can't be dropped from it).
		 */
		get_page(page);

		/*
		 * Optimize for the common case where page is only mapped once
		 * in one process. If we can lock the page, then we can safely
		 * set up a special migration page table entry now.
		 */
		if (trylock_page(page)) {
			bool anon_exclusive;
			pte_t swp_pte;

			flush_cache_page(vma, addr, pte_pfn(*ptep));
			anon_exclusive = PageAnon(page) && PageAnonExclusive(page);
			if (anon_exclusive) {
				pte = ptep_clear_flush(vma, addr, ptep);

				if (page_try_share_anon_rmap(page)) {
					set_pte_at(mm, addr, ptep, pte);
					unlock_page(page);
					put_page(page);
					mpfn = 0;
					goto next;
				}
			} else {
				pte = ptep_get_and_clear(mm, addr, ptep);
			}

			migrate->cpages++;

			/* Set the dirty flag on the folio now the pte is gone. */
			if (pte_dirty(pte))
				folio_mark_dirty(page_folio(page));

			/* Setup special migration page table entry */
			if (mpfn & MIGRATE_PFN_WRITE)
				entry = make_writable_migration_entry(
							page_to_pfn(page));
			else if (anon_exclusive)
				entry = make_readable_exclusive_migration_entry(
							page_to_pfn(page));
			else
				entry = make_readable_migration_entry(
							page_to_pfn(page));
			swp_pte = swp_entry_to_pte(entry);
			if (pte_present(pte)) {
				if (pte_soft_dirty(pte))
					swp_pte = pte_swp_mksoft_dirty(swp_pte);
				if (pte_uffd_wp(pte))
					swp_pte = pte_swp_mkuffd_wp(swp_pte);
			} else {
				if (pte_swp_soft_dirty(pte))
					swp_pte = pte_swp_mksoft_dirty(swp_pte);
				if (pte_swp_uffd_wp(pte))
					swp_pte = pte_swp_mkuffd_wp(swp_pte);
			}
			set_pte_at(mm, addr, ptep, swp_pte);

			/*
			 * This is like try_to_migrate() without walking the
			 * rmap: drop the mapping and the reference that the
			 * pte held, while the reference taken above keeps the
			 * page alive for the src array.
			 */
			page_remove_rmap(page, vma, false);
			put_page(page);

			if (pte_present(pte))
				unmapped++;
		} else {
			put_page(page);
			mpfn = 0;
		}

next:
		migrate->dst[migrate->npages] = 0;
		migrate->src[migrate->npages++] = mpfn;
	}

	/* Only flush the TLB if we actually modified any entries */
	if (unmapped)
		flush_tlb_range(walk->vma, start, end);

	arch_leave_lazy_mmu_mode();
	pte_unmap_unlock(ptep - 1, ptl);

	return 0;
}
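
/*
 * Walk ops used by migrate_vma_collect(): populated pmds go through
 * migrate_vma_collect_pmd(), unpopulated ranges through
 * migrate_vma_collect_hole().
 */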
static const struct mm_walk_ops migrate_vma_walk_ops = {
	.pmd_entry = migrate_vma_collect_pmd,
	.pte_hole = migrate_vma_collect_hole,
};

/*
 * migrate_vma_collect() - collect pages over a range of virtual addresses
 * @migrate: migrate struct containing all migration information
 *
 * This will walk the CPU page table. For each virtual address backed by a
 * valid page, it updates the src array and takes a reference on the page, in
 * order to pin the page until we lock it and unmap it.
 */
static void migrate_vma_collect(struct migrate_vma *migrate)
{
	struct mmu_notifier_range range;

	/*
	 * Note that the pgmap_owner is passed to the mmu notifier callback so
	 * that the registered device driver can skip invalidating device
	 * private page mappings that won't be migrated.
	 */
	mmu_notifier_range_init_owner(&range, MMU_NOTIFY_MIGRATE, 0,
		migrate->vma, migrate->vma->vm_mm, migrate->start, migrate->end,
		migrate->pgmap_owner);
	mmu_notifier_invalidate_range_start(&range);

	walk_page_range(migrate->vma->vm_mm, migrate->start, migrate->end,
			&migrate_vma_walk_ops, migrate);

	mmu_notifier_invalidate_range_end(&range);
	migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT);
}
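
/*
 * migrate_vma_check_page() - check if page is pinned or not
 * @page: struct page to check
 *
 * Pinned pages cannot be migrated. This is the same test as in
 * folio_migrate_mapping(), except that here we allow migration of a
 * ZONE_DEVICE page.
 */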
static bool migrate_vma_check_page(struct page *page)
{
	/*
	 * One extra reference is held by the caller: either from
	 * isolate_lru_page() for a regular page, or from
	 * migrate_vma_collect_pmd() for a device page.
	 */
	int extra = 1;

	/*
	 * FIXME support THP (transparent huge page), it is a bit more complex
	 * to check them than regular pages, because they can be mapped with a
	 * pmd or with a pte (split pte mapping).
	 */
	if (PageCompound(page))
		return false;

	/* Pages from ZONE_DEVICE have one extra reference */
	if (is_zone_device_page(page))
		extra++;

	/* For file-backed pages */
	if (page_mapping(page))
		extra += 1 + page_has_private(page);

	if ((page_count(page) - extra) > page_mapcount(page))
		return false;

	return true;
}
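
/*
 * Unmap the pages collected in the src array: isolate them from the LRU,
 * replace their mappings with migration entries and check that they are not
 * pinned. Pinned pages are restored because they cannot be migrated.
 *
 * This is the last step before the caller allocates destination memory and
 * copies the source pages over.
 */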
static void migrate_vma_unmap(struct migrate_vma *migrate)
{
	const unsigned long npages = migrate->npages;
	unsigned long i, restore = 0;
	bool allow_drain = true;

	lru_add_drain();

	for (i = 0; i < npages; i++) {
		struct page *page = migrate_pfn_to_page(migrate->src[i]);
		struct folio *folio;

		if (!page)
			continue;

		/* ZONE_DEVICE pages are not on LRU */
		if (!is_zone_device_page(page)) {
			if (!PageLRU(page) && allow_drain) {
				/* Drain CPU's pagevec */
				lru_add_drain_all();
				allow_drain = false;
			}

			if (isolate_lru_page(page)) {
				migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
				migrate->cpages--;
				restore++;
				continue;
			}

			/* Drop the reference we took in collect */
			put_page(page);
		}

		folio = page_folio(page);
		if (folio_mapped(folio))
			try_to_migrate(folio, 0);

		if (page_mapped(page) || !migrate_vma_check_page(page)) {
			if (!is_zone_device_page(page)) {
				get_page(page);
				putback_lru_page(page);
			}

			migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
			migrate->cpages--;
			restore++;
			continue;
		}
	}

	for (i = 0; i < npages && restore; i++) {
		struct page *page = migrate_pfn_to_page(migrate->src[i]);
		struct folio *folio;

		if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE))
			continue;

		folio = page_folio(page);
		remove_migration_ptes(folio, folio, false);

		migrate->src[i] = 0;
		folio_unlock(folio);
		folio_put(folio);
		restore--;
	}
}
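
/**
 * migrate_vma_setup() - prepare to migrate a range of memory
 * @args: contains the vma, start, and pfns arrays for the migration
 *
 * Returns: negative errno on failures, 0 when 0 or more pages were migrated
 * without an error.
 *
 * Collect all the pages backing each virtual address in the range, then lock
 * and unmap them. Pages that could not be unmapped, or that are pinned, are
 * restored; the others have MIGRATE_PFN_MIGRATE set in the corresponding src
 * array entry. Empty (pte_none()) entries in anonymous VMAs also get
 * MIGRATE_PFN_MIGRATE so the caller may allocate device memory for them.
 *
 * The caller is then expected to allocate destination memory, copy the source
 * pages over, fill the dst array with the new pfns (via migrate_pfn(), with
 * the destination pages locked), and call migrate_vma_pages() followed by
 * migrate_vma_finalize().
 */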
int migrate_vma_setup(struct migrate_vma *args)
{
	long nr_pages = (args->end - args->start) >> PAGE_SHIFT;

	args->start &= PAGE_MASK;
	args->end &= PAGE_MASK;
	if (!args->vma || is_vm_hugetlb_page(args->vma) ||
	    (args->vma->vm_flags & VM_SPECIAL) || vma_is_dax(args->vma))
		return -EINVAL;
	if (nr_pages <= 0)
		return -EINVAL;
	if (args->start < args->vma->vm_start ||
	    args->start >= args->vma->vm_end)
		return -EINVAL;
	if (args->end <= args->vma->vm_start || args->end > args->vma->vm_end)
		return -EINVAL;
	if (!args->src || !args->dst)
		return -EINVAL;

	memset(args->src, 0, sizeof(*args->src) * nr_pages);
	args->cpages = 0;
	args->npages = 0;

	migrate_vma_collect(args);

	if (args->cpages)
		migrate_vma_unmap(args);

	/*
	 * At this point pages are locked and unmapped, and thus they have
	 * stable content and can safely be copied to destination memory that
	 * is allocated by the drivers.
	 */
	return 0;

}
EXPORT_SYMBOL(migrate_vma_setup);
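
/*
 * This code closely matches the code in:
 *   __handle_mm_fault()
 *     handle_pte_fault()
 *       do_anonymous_page()
 * to map in an anonymous zero page except the struct page will be a
 * ZONE_DEVICE page.
 */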
static void migrate_vma_insert_page(struct migrate_vma *migrate,
				    unsigned long addr,
				    struct page *page,
				    unsigned long *src)
{
	struct vm_area_struct *vma = migrate->vma;
	struct mm_struct *mm = vma->vm_mm;
	bool flush = false;
	spinlock_t *ptl;
	pte_t entry;
	pgd_t *pgdp;
	p4d_t *p4dp;
	pud_t *pudp;
	pmd_t *pmdp;
	pte_t *ptep;

	/* Only allow populating anonymous memory */
	if (!vma_is_anonymous(vma))
		goto abort;

	pgdp = pgd_offset(mm, addr);
	p4dp = p4d_alloc(mm, pgdp, addr);
	if (!p4dp)
		goto abort;
	pudp = pud_alloc(mm, p4dp, addr);
	if (!pudp)
		goto abort;
	pmdp = pmd_alloc(mm, pudp, addr);
	if (!pmdp)
		goto abort;

	if (pmd_trans_huge(*pmdp) || pmd_devmap(*pmdp))
		goto abort;

	/*
	 * Use pte_alloc() instead of pte_alloc_map(). We can't run
	 * pte_offset_map() on pmds where a huge pmd might be created
	 * from a different thread.
	 *
	 * pte_alloc_map() is safe to use under mmap_write_lock(mm) or when
	 * parallel threads are excluded by other means.
	 *
	 * Here we only have mmap_read_lock(mm).
	 */
	if (pte_alloc(mm, pmdp))
		goto abort;

	/* Bail out if the pmd turned huge or was cleared under us. */
	if (unlikely(pmd_trans_unstable(pmdp)))
		goto abort;

	if (unlikely(anon_vma_prepare(vma)))
		goto abort;
	if (mem_cgroup_charge(page_folio(page), vma->vm_mm, GFP_KERNEL))
		goto abort;

	/*
	 * The memory barrier inside __SetPageUptodate makes sure that
	 * preceding stores to the page contents become visible before
	 * the set_pte_at() write.
	 */
	__SetPageUptodate(page);

	if (is_device_private_page(page)) {
		swp_entry_t swp_entry;

		if (vma->vm_flags & VM_WRITE)
			swp_entry = make_writable_device_private_entry(
						page_to_pfn(page));
		else
			swp_entry = make_readable_device_private_entry(
						page_to_pfn(page));
		entry = swp_entry_to_pte(swp_entry);
	} else {
		if (is_zone_device_page(page) &&
		    !is_device_coherent_page(page)) {
			pr_warn_once("Unsupported ZONE_DEVICE page type.\n");
			goto abort;
		}
		entry = mk_pte(page, vma->vm_page_prot);
		if (vma->vm_flags & VM_WRITE)
			entry = pte_mkwrite(pte_mkdirty(entry));
	}

	ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);

	if (check_stable_address_space(mm))
		goto unlock_abort;

	if (pte_present(*ptep)) {
		unsigned long pfn = pte_pfn(*ptep);

		if (!is_zero_pfn(pfn))
			goto unlock_abort;
		flush = true;
	} else if (!pte_none(*ptep))
		goto unlock_abort;

	/*
	 * Check for userfaultfd but do not deliver the fault. Instead,
	 * just back off.
	 */
	if (userfaultfd_missing(vma))
		goto unlock_abort;

	inc_mm_counter(mm, MM_ANONPAGES);
	page_add_new_anon_rmap(page, vma, addr);
	if (!is_zone_device_page(page))
		lru_cache_add_inactive_or_unevictable(page, vma);
	get_page(page);

	if (flush) {
		flush_cache_page(vma, addr, pte_pfn(*ptep));
		ptep_clear_flush_notify(vma, addr, ptep);
		set_pte_at_notify(mm, addr, ptep, entry);
		update_mmu_cache(vma, addr, ptep);
	} else {
		/* No need to invalidate - it was non-present before */
		set_pte_at(mm, addr, ptep, entry);
		update_mmu_cache(vma, addr, ptep);
	}

	pte_unmap_unlock(ptep, ptl);
	*src = MIGRATE_PFN_MIGRATE;
	return;

unlock_abort:
	pte_unmap_unlock(ptep, ptl);
abort:
	*src &= ~MIGRATE_PFN_MIGRATE;
}
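
/**
 * migrate_vma_pages() - migrate meta-data from src page to dst page
 * @migrate: migrate struct containing all migration information
 *
 * This migrates struct page meta-data from source struct page to destination
 * struct page. This effectively finishes the migration from source page to
 * the destination page.
 */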
void migrate_vma_pages(struct migrate_vma *migrate)
{
	const unsigned long npages = migrate->npages;
	const unsigned long start = migrate->start;
	struct mmu_notifier_range range;
	unsigned long addr, i;
	bool notified = false;

	for (i = 0, addr = start; i < npages; addr += PAGE_SIZE, i++) {
		struct page *newpage = migrate_pfn_to_page(migrate->dst[i]);
		struct page *page = migrate_pfn_to_page(migrate->src[i]);
		struct address_space *mapping;
		int r;

		if (!newpage) {
			migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
			continue;
		}

		if (!page) {
			/*
			 * The only time there is no vma is when called from
			 * migrate_device_coherent_page(). However this isn't
			 * called if the page could not be unmapped.
			 */
			VM_BUG_ON(!migrate->vma);
			if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE))
				continue;
			if (!notified) {
				notified = true;

				mmu_notifier_range_init_owner(&range,
					MMU_NOTIFY_MIGRATE, 0, migrate->vma,
					migrate->vma->vm_mm, addr, migrate->end,
					migrate->pgmap_owner);
				mmu_notifier_invalidate_range_start(&range);
			}
			migrate_vma_insert_page(migrate, addr, newpage,
						&migrate->src[i]);
			continue;
		}

		mapping = page_mapping(page);

		if (is_device_private_page(newpage) ||
		    is_device_coherent_page(newpage)) {
			/*
			 * For now only support anonymous memory migrating to
			 * device private or coherent memory.
			 */
			if (mapping) {
				migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
				continue;
			}
		} else if (is_zone_device_page(newpage)) {
			/*
			 * Other types of ZONE_DEVICE page are not supported.
			 */
			migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
			continue;
		}

		r = migrate_folio(mapping, page_folio(newpage),
				  page_folio(page), MIGRATE_SYNC_NO_COPY);
		if (r != MIGRATEPAGE_SUCCESS)
			migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
	}

	/*
	 * No need to double call mmu_notifier->invalidate_range() callback as
	 * the above ptep_clear_flush_notify() inside migrate_vma_insert_page()
	 * did already call it.
	 */
	if (notified)
		mmu_notifier_invalidate_range_only_end(&range);
}
EXPORT_SYMBOL(migrate_vma_pages);
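
/**
 * migrate_vma_finalize() - restore CPU page table entry
 * @migrate: migrate struct containing all migration information
 *
 * This replaces the special migration pte entry with either a mapping to the
 * new page if migration was successful for that page, or to the original page
 * otherwise.
 *
 * This also unlocks the pages and puts them back on the lru, or drops the
 * extra refcount, for device pages.
 */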
void migrate_vma_finalize(struct migrate_vma *migrate)
{
	const unsigned long npages = migrate->npages;
	unsigned long i;

	for (i = 0; i < npages; i++) {
		struct folio *dst, *src;
		struct page *newpage = migrate_pfn_to_page(migrate->dst[i]);
		struct page *page = migrate_pfn_to_page(migrate->src[i]);

		if (!page) {
			if (newpage) {
				unlock_page(newpage);
				put_page(newpage);
			}
			continue;
		}

		if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE) || !newpage) {
			if (newpage) {
				unlock_page(newpage);
				put_page(newpage);
			}
			newpage = page;
		}

		src = page_folio(page);
		dst = page_folio(newpage);
		remove_migration_ptes(src, dst, false);
		folio_unlock(src);

		if (is_zone_device_page(page))
			put_page(page);
		else
			putback_lru_page(page);

		if (newpage != page) {
			unlock_page(newpage);
			if (is_zone_device_page(newpage))
				put_page(newpage);
			else
				putback_lru_page(newpage);
		}
	}
}
EXPORT_SYMBOL(migrate_vma_finalize);
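
/*
 * Migrate a device coherent page back to normal memory. The caller should have
 * a reference on page which will be copied to the new page if migration is
 * successful or dropped on failure.
 */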
int migrate_device_coherent_page(struct page *page)
{
	unsigned long src_pfn, dst_pfn = 0;
	struct migrate_vma args;
	struct page *dpage;

	WARN_ON_ONCE(PageCompound(page));

	lock_page(page);
	src_pfn = migrate_pfn(page_to_pfn(page)) | MIGRATE_PFN_MIGRATE;
	args.src = &src_pfn;
	args.dst = &dst_pfn;
	args.cpages = 1;
	args.npages = 1;
	args.vma = NULL;

	/*
	 * We don't have a VMA and don't need to walk the mmaps to find the
	 * source page. So call migrate_vma_unmap() directly to unmap the page
	 * as migrate_vma_setup() will fail if args.vma == NULL.
	 */
	migrate_vma_unmap(&args);
	if (!(src_pfn & MIGRATE_PFN_MIGRATE))
		return -EBUSY;

	dpage = alloc_page(GFP_USER | __GFP_NOWARN);
	if (dpage) {
		lock_page(dpage);
		dst_pfn = migrate_pfn(page_to_pfn(dpage));
	}

	migrate_vma_pages(&args);
	if (src_pfn & MIGRATE_PFN_MIGRATE)
		copy_highpage(dpage, page);
	migrate_vma_finalize(&args);

	if (src_pfn & MIGRATE_PFN_MIGRATE)
		return 0;
	return -EBUSY;
}