// SPDX-License-Identifier: GPL-2.0
/*
 * Device Memory Migration functionality.
 */
#include <linux/export.h>
#include <linux/memremap.h>
#include <linux/migrate.h>
#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>
#include <linux/oom.h>
#include <linux/pagewalk.h>
#include <linux/rmap.h>
#include <linux/swapops.h>
#include <asm/tlbflush.h>
#include "internal.h"
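
/*
 * Mark all pages in the range [start, end) as not migratable: leave both the
 * src and dst entries at zero so the later stages simply skip them.
 */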
static int migrate_vma_collect_skip(unsigned long start,
				    unsigned long end,
				    struct mm_walk *walk)
{
	struct migrate_vma *migrate = walk->private;
	unsigned long addr;

	for (addr = start; addr < end; addr += PAGE_SIZE) {
		migrate->dst[migrate->npages] = 0;
		migrate->src[migrate->npages++] = 0;
	}

	return 0;
}
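
/*
 * pte_hole callback: addresses without page table backing can still be
 * migrated when the VMA is anonymous, since the driver may want to allocate
 * device memory for them, so mark them migratable with no source page.
 */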
static int migrate_vma_collect_hole(unsigned long start,
				    unsigned long end,
				    __always_unused int depth,
				    struct mm_walk *walk)
{
	struct migrate_vma *migrate = walk->private;
	unsigned long addr;

	/* Only allow populating anonymous memory. */
	if (!vma_is_anonymous(walk->vma))
		return migrate_vma_collect_skip(start, end, walk);

	for (addr = start; addr < end; addr += PAGE_SIZE) {
		migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE;
		migrate->dst[migrate->npages] = 0;
		migrate->npages++;
		migrate->cpages++;
	}

	return 0;
}
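
/*
 * pmd_entry callback: walk the ptes under one pmd, record a migrate pfn for
 * each page that may be migrated and, where possible, replace the pte with a
 * migration entry right away.
 */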
static int migrate_vma_collect_pmd(pmd_t *pmdp,
				   unsigned long start,
				   unsigned long end,
				   struct mm_walk *walk)
{
	struct migrate_vma *migrate = walk->private;
	struct vm_area_struct *vma = walk->vma;
	struct mm_struct *mm = vma->vm_mm;
	unsigned long addr = start, unmapped = 0;
	spinlock_t *ptl;
	pte_t *ptep;

again:
	if (pmd_none(*pmdp))
		return migrate_vma_collect_hole(start, end, -1, walk);
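
	/*
	 * A huge pmd cannot be collected one page at a time: split the huge
	 * zero page in place, or split a real THP after taking a reference
	 * and locking it. If the split fails, skip the whole range.
	 */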
	if (pmd_trans_huge(*pmdp)) {
		struct page *page;

		ptl = pmd_lock(mm, pmdp);
		if (unlikely(!pmd_trans_huge(*pmdp))) {
			spin_unlock(ptl);
			goto again;
		}

		page = pmd_page(*pmdp);
		if (is_huge_zero_page(page)) {
			spin_unlock(ptl);
			split_huge_pmd(vma, pmdp, addr);
			if (pmd_trans_unstable(pmdp))
				return migrate_vma_collect_skip(start, end,
								walk);
		} else {
			int ret;

			get_page(page);
			spin_unlock(ptl);
			if (unlikely(!trylock_page(page)))
				return migrate_vma_collect_skip(start, end,
								walk);
			ret = split_huge_page(page);
			unlock_page(page);
			put_page(page);
			if (ret)
				return migrate_vma_collect_skip(start, end,
								walk);
			if (pmd_none(*pmdp))
				return migrate_vma_collect_hole(start, end, -1,
								walk);
		}
	}

	if (unlikely(pmd_bad(*pmdp)))
		return migrate_vma_collect_skip(start, end, walk);

	ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
	arch_enter_lazy_mmu_mode();
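
	/*
	 * For each pte in the pmd decide whether the backing page can be
	 * migrated, record it in the src array and, when the page can be
	 * locked, install a migration entry in place of the pte now.
	 */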
	for (; addr < end; addr += PAGE_SIZE, ptep++) {
		unsigned long mpfn = 0, pfn;
		struct page *page;
		swp_entry_t entry;
		pte_t pte;

		pte = *ptep;

		if (pte_none(pte)) {
			if (vma_is_anonymous(vma)) {
				mpfn = MIGRATE_PFN_MIGRATE;
				migrate->cpages++;
			}
			goto next;
		}

		if (!pte_present(pte)) {
			/*
			 * Only care about unaddressable device page special
			 * page table entry. Other special swap entries are not
			 * migratable, and we ignore regular swapped page.
			 */
			entry = pte_to_swp_entry(pte);
			if (!is_device_private_entry(entry))
				goto next;

			page = pfn_swap_entry_to_page(entry);
			if (!(migrate->flags &
				MIGRATE_VMA_SELECT_DEVICE_PRIVATE) ||
			    page->pgmap->owner != migrate->pgmap_owner)
				goto next;

			mpfn = migrate_pfn(page_to_pfn(page)) |
					MIGRATE_PFN_MIGRATE;
			if (is_writable_device_private_entry(entry))
				mpfn |= MIGRATE_PFN_WRITE;
		} else {
			pfn = pte_pfn(pte);
			if (is_zero_pfn(pfn) &&
			    (migrate->flags & MIGRATE_VMA_SELECT_SYSTEM)) {
				mpfn = MIGRATE_PFN_MIGRATE;
				migrate->cpages++;
				goto next;
			}
			page = vm_normal_page(migrate->vma, addr, pte);
			if (page && !is_zone_device_page(page) &&
			    !(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM))
				goto next;
			else if (page && is_device_coherent_page(page) &&
			    (!(migrate->flags & MIGRATE_VMA_SELECT_DEVICE_COHERENT) ||
			     page->pgmap->owner != migrate->pgmap_owner))
				goto next;
			mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
			mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
		}

		/* FIXME support THP */
		if (!page || !page->mapping || PageTransCompound(page)) {
			mpfn = 0;
			goto next;
		}
		/*
		 * By getting a reference on the page we pin it and that blocks
		 * any kind of migration. Side effect is that it "freezes" the
		 * pte.
		 *
		 * We drop this reference after isolating the page from the lru
		 * for non device page (device page are not on the lru and thus
		 * can't be dropped from it).
		 */
		get_page(page);

		/*
		 * Optimize for the common case where page is only mapped once
		 * in one process. If we can lock the page, then we can safely
		 * set up a special migration page table entry now.
		 */
		if (trylock_page(page)) {
			bool anon_exclusive;
			pte_t swp_pte;

			flush_cache_page(vma, addr, pte_pfn(*ptep));
			anon_exclusive = PageAnon(page) && PageAnonExclusive(page);
			if (anon_exclusive) {
				pte = ptep_clear_flush(vma, addr, ptep);

				if (page_try_share_anon_rmap(page)) {
					set_pte_at(mm, addr, ptep, pte);
					unlock_page(page);
					put_page(page);
					mpfn = 0;
					goto next;
				}
			} else {
				pte = ptep_get_and_clear(mm, addr, ptep);
			}

			migrate->cpages++;

			/* Set the dirty flag on the folio now the pte is gone. */
			if (pte_dirty(pte))
				folio_mark_dirty(page_folio(page));

			/* Setup special migration page table entry */
			if (mpfn & MIGRATE_PFN_WRITE)
				entry = make_writable_migration_entry(
							page_to_pfn(page));
			else if (anon_exclusive)
				entry = make_readable_exclusive_migration_entry(
							page_to_pfn(page));
			else
				entry = make_readable_migration_entry(
							page_to_pfn(page));
			swp_pte = swp_entry_to_pte(entry);
			if (pte_present(pte)) {
				if (pte_soft_dirty(pte))
					swp_pte = pte_swp_mksoft_dirty(swp_pte);
				if (pte_uffd_wp(pte))
					swp_pte = pte_swp_mkuffd_wp(swp_pte);
			} else {
				if (pte_swp_soft_dirty(pte))
					swp_pte = pte_swp_mksoft_dirty(swp_pte);
				if (pte_swp_uffd_wp(pte))
					swp_pte = pte_swp_mkuffd_wp(swp_pte);
			}
			set_pte_at(mm, addr, ptep, swp_pte);

			/*
			 * This is like try_to_migrate() without walking the
			 * rmap: drop the mapping and the reference that the
			 * pte held, while the reference taken above keeps the
			 * page alive for the src array.
			 */
			page_remove_rmap(page, vma, false);
			put_page(page);

			if (pte_present(pte))
				unmapped++;
		} else {
			put_page(page);
			mpfn = 0;
		}

next:
		migrate->dst[migrate->npages] = 0;
		migrate->src[migrate->npages++] = mpfn;
	}

	/* Only flush the TLB if we actually modified any entries */
	if (unmapped)
		flush_tlb_range(walk->vma, start, end);

	arch_leave_lazy_mmu_mode();
	pte_unmap_unlock(ptep - 1, ptl);

	return 0;
}
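
/*
 * Walk ops used by migrate_vma_collect(): populated pmds go through
 * migrate_vma_collect_pmd(), unpopulated ranges through
 * migrate_vma_collect_hole().
 */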
static const struct mm_walk_ops migrate_vma_walk_ops = {
	.pmd_entry = migrate_vma_collect_pmd,
	.pte_hole = migrate_vma_collect_hole,
};

/*
 * migrate_vma_collect() - collect pages over a range of virtual addresses
 * @migrate: migrate struct containing all migration information
 *
 * This will walk the CPU page table. For each virtual address backed by a
 * valid page, it updates the src array and takes a reference on the page, in
 * order to pin the page until we lock it and unmap it.
 */
static void migrate_vma_collect(struct migrate_vma *migrate)
{
	struct mmu_notifier_range range;

	/*
	 * Note that the pgmap_owner is passed to the mmu notifier callback so
	 * that the registered device driver can skip invalidating device
	 * private page mappings that won't be migrated.
	 */
	mmu_notifier_range_init_owner(&range, MMU_NOTIFY_MIGRATE, 0,
		migrate->vma, migrate->vma->vm_mm, migrate->start, migrate->end,
		migrate->pgmap_owner);
	mmu_notifier_invalidate_range_start(&range);

	walk_page_range(migrate->vma->vm_mm, migrate->start, migrate->end,
			&migrate_vma_walk_ops, migrate);

	mmu_notifier_invalidate_range_end(&range);
	migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT);
}
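
/*
 * migrate_vma_check_page() - check if page is pinned or not
 * @page: struct page to check
 *
 * Pinned pages cannot be migrated. This is the same test as in
 * folio_migrate_mapping(), except that here we allow migration of a
 * ZONE_DEVICE page.
 */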
static bool migrate_vma_check_page(struct page *page)
{
	/*
	 * One extra reference is held by the caller: either from
	 * isolate_lru_page() for a regular page, or from
	 * migrate_vma_collect_pmd() for a device page.
	 */
	int extra = 1;

	/*
	 * FIXME support THP (transparent huge page), it is a bit more complex
	 * to check them than regular pages, because they can be mapped with a
	 * pmd or with a pte (split pte mapping).
	 */
	if (PageCompound(page))
		return false;

	/* Pages from ZONE_DEVICE have one extra reference */
	if (is_zone_device_page(page))
		extra++;

	/* For file-backed pages */
	if (page_mapping(page))
		extra += 1 + page_has_private(page);

	if ((page_count(page) - extra) > page_mapcount(page))
		return false;

	return true;
}
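
/*
 * Unmap the pages collected in the src array: isolate them from the LRU,
 * replace their mappings with migration entries and check that they are not
 * pinned. Pinned pages are restored because they cannot be migrated.
 *
 * This is the last step before the caller allocates destination memory and
 * copies the source pages over.
 */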
static void migrate_vma_unmap(struct migrate_vma *migrate)
{
	const unsigned long npages = migrate->npages;
	unsigned long i, restore = 0;
	bool allow_drain = true;

	lru_add_drain();

	for (i = 0; i < npages; i++) {
		struct page *page = migrate_pfn_to_page(migrate->src[i]);
		struct folio *folio;

		if (!page)
			continue;

		/* ZONE_DEVICE pages are not on LRU */
		if (!is_zone_device_page(page)) {
			if (!PageLRU(page) && allow_drain) {
				/* Drain CPU's pagevec */
				lru_add_drain_all();
				allow_drain = false;
			}

			if (isolate_lru_page(page)) {
				migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
				migrate->cpages--;
				restore++;
				continue;
			}

			/* Drop the reference we took in collect */
			put_page(page);
		}

		folio = page_folio(page);
		if (folio_mapped(folio))
			try_to_migrate(folio, 0);

		if (page_mapped(page) || !migrate_vma_check_page(page)) {
			if (!is_zone_device_page(page)) {
				get_page(page);
				putback_lru_page(page);
			}

			migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
			migrate->cpages--;
			restore++;
			continue;
		}
	}

	for (i = 0; i < npages && restore; i++) {
		struct page *page = migrate_pfn_to_page(migrate->src[i]);
		struct folio *folio;

		if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE))
			continue;

		folio = page_folio(page);
		remove_migration_ptes(folio, folio, false);

		migrate->src[i] = 0;
		folio_unlock(folio);
		folio_put(folio);
		restore--;
	}
}
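
/**
 * migrate_vma_setup() - prepare to migrate a range of memory
 * @args: contains the vma, start, and pfns arrays for the migration
 *
 * Returns: negative errno on failures, 0 when 0 or more pages were migrated
 * without an error.
 *
 * Collect all the pages backing each virtual address in the range, then lock
 * and unmap them. Pages that could not be unmapped, or that are pinned, are
 * restored; the others have MIGRATE_PFN_MIGRATE set in the corresponding src
 * array entry. Empty (pte_none()) entries in anonymous VMAs also get
 * MIGRATE_PFN_MIGRATE so the caller may allocate device memory for them.
 *
 * The caller is then expected to allocate destination memory, copy the source
 * pages over, fill the dst array with the new pfns (via migrate_pfn(), with
 * the destination pages locked), and call migrate_vma_pages() followed by
 * migrate_vma_finalize().
 */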
int migrate_vma_setup(struct migrate_vma *args)
{
	long nr_pages = (args->end - args->start) >> PAGE_SHIFT;

	args->start &= PAGE_MASK;
	args->end &= PAGE_MASK;
	if (!args->vma || is_vm_hugetlb_page(args->vma) ||
	    (args->vma->vm_flags & VM_SPECIAL) || vma_is_dax(args->vma))
		return -EINVAL;
	if (nr_pages <= 0)
		return -EINVAL;
	if (args->start < args->vma->vm_start ||
	    args->start >= args->vma->vm_end)
		return -EINVAL;
	if (args->end <= args->vma->vm_start || args->end > args->vma->vm_end)
		return -EINVAL;
	if (!args->src || !args->dst)
		return -EINVAL;

	memset(args->src, 0, sizeof(*args->src) * nr_pages);
	args->cpages = 0;
	args->npages = 0;

	migrate_vma_collect(args);

	if (args->cpages)
		migrate_vma_unmap(args);

	/*
	 * At this point pages are locked and unmapped, and thus they have
	 * stable content and can safely be copied to destination memory that
	 * is allocated by the drivers.
	 */
	return 0;

}
EXPORT_SYMBOL(migrate_vma_setup);
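
/*
 * This code closely matches the code in:
 *   __handle_mm_fault()
 *     handle_pte_fault()
 *       do_anonymous_page()
 * to map in an anonymous zero page except the struct page will be a
 * ZONE_DEVICE page.
 */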
static void migrate_vma_insert_page(struct migrate_vma *migrate,
				    unsigned long addr,
				    struct page *page,
				    unsigned long *src)
{
	struct vm_area_struct *vma = migrate->vma;
	struct mm_struct *mm = vma->vm_mm;
	bool flush = false;
	spinlock_t *ptl;
	pte_t entry;
	pgd_t *pgdp;
	p4d_t *p4dp;
	pud_t *pudp;
	pmd_t *pmdp;
	pte_t *ptep;

	/* Only allow populating anonymous memory */
	if (!vma_is_anonymous(vma))
		goto abort;

	pgdp = pgd_offset(mm, addr);
	p4dp = p4d_alloc(mm, pgdp, addr);
	if (!p4dp)
		goto abort;
	pudp = pud_alloc(mm, p4dp, addr);
	if (!pudp)
		goto abort;
	pmdp = pmd_alloc(mm, pudp, addr);
	if (!pmdp)
		goto abort;

	if (pmd_trans_huge(*pmdp) || pmd_devmap(*pmdp))
		goto abort;

	/*
	 * Use pte_alloc() instead of pte_alloc_map(). We can't run
	 * pte_offset_map() on pmds where a huge pmd might be created
	 * from a different thread.
	 *
	 * pte_alloc_map() is safe to use under mmap_write_lock(mm) or when
	 * parallel threads are excluded by other means.
	 *
	 * Here we only have mmap_read_lock(mm).
	 */
	if (pte_alloc(mm, pmdp))
		goto abort;

	/* Bail out if the pmd turned huge or was cleared under us. */
	if (unlikely(pmd_trans_unstable(pmdp)))
		goto abort;

	if (unlikely(anon_vma_prepare(vma)))
		goto abort;
	if (mem_cgroup_charge(page_folio(page), vma->vm_mm, GFP_KERNEL))
		goto abort;

	/*
	 * The memory barrier inside __SetPageUptodate makes sure that
	 * preceding stores to the page contents become visible before
	 * the set_pte_at() write.
	 */
	__SetPageUptodate(page);

	if (is_device_private_page(page)) {
		swp_entry_t swp_entry;

		if (vma->vm_flags & VM_WRITE)
			swp_entry = make_writable_device_private_entry(
						page_to_pfn(page));
		else
			swp_entry = make_readable_device_private_entry(
						page_to_pfn(page));
		entry = swp_entry_to_pte(swp_entry);
	} else {
		if (is_zone_device_page(page) &&
		    !is_device_coherent_page(page)) {
			pr_warn_once("Unsupported ZONE_DEVICE page type.\n");
			goto abort;
		}
		entry = mk_pte(page, vma->vm_page_prot);
		if (vma->vm_flags & VM_WRITE)
			entry = pte_mkwrite(pte_mkdirty(entry));
	}

	ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);

	if (check_stable_address_space(mm))
		goto unlock_abort;

	if (pte_present(*ptep)) {
		unsigned long pfn = pte_pfn(*ptep);

		if (!is_zero_pfn(pfn))
			goto unlock_abort;
		flush = true;
	} else if (!pte_none(*ptep))
		goto unlock_abort;

	/*
	 * Check for userfaultfd but do not deliver the fault. Instead,
	 * just back off.
	 */
	if (userfaultfd_missing(vma))
		goto unlock_abort;

	inc_mm_counter(mm, MM_ANONPAGES);
	page_add_new_anon_rmap(page, vma, addr);
	if (!is_zone_device_page(page))
		lru_cache_add_inactive_or_unevictable(page, vma);
	get_page(page);

	if (flush) {
		flush_cache_page(vma, addr, pte_pfn(*ptep));
		ptep_clear_flush_notify(vma, addr, ptep);
		set_pte_at_notify(mm, addr, ptep, entry);
		update_mmu_cache(vma, addr, ptep);
	} else {
		/* No need to invalidate - it was non-present before */
		set_pte_at(mm, addr, ptep, entry);
		update_mmu_cache(vma, addr, ptep);
	}

	pte_unmap_unlock(ptep, ptl);
	*src = MIGRATE_PFN_MIGRATE;
	return;

unlock_abort:
	pte_unmap_unlock(ptep, ptl);
abort:
	*src &= ~MIGRATE_PFN_MIGRATE;
}
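
/**
 * migrate_vma_pages() - migrate meta-data from src page to dst page
 * @migrate: migrate struct containing all migration information
 *
 * This migrates struct page meta-data from source struct page to destination
 * struct page. This effectively finishes the migration from source page to
 * the destination page.
 */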
void migrate_vma_pages(struct migrate_vma *migrate)
{
	const unsigned long npages = migrate->npages;
	const unsigned long start = migrate->start;
	struct mmu_notifier_range range;
	unsigned long addr, i;
	bool notified = false;

	for (i = 0, addr = start; i < npages; addr += PAGE_SIZE, i++) {
		struct page *newpage = migrate_pfn_to_page(migrate->dst[i]);
		struct page *page = migrate_pfn_to_page(migrate->src[i]);
		struct address_space *mapping;
		int r;

		if (!newpage) {
			migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
			continue;
		}

		if (!page) {
			/*
			 * The only time there is no vma is when called from
			 * migrate_device_coherent_page(). However this isn't
			 * called if the page could not be unmapped.
			 */
			VM_BUG_ON(!migrate->vma);
			if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE))
				continue;
			if (!notified) {
				notified = true;

				mmu_notifier_range_init_owner(&range,
					MMU_NOTIFY_MIGRATE, 0, migrate->vma,
					migrate->vma->vm_mm, addr, migrate->end,
					migrate->pgmap_owner);
				mmu_notifier_invalidate_range_start(&range);
			}
			migrate_vma_insert_page(migrate, addr, newpage,
						&migrate->src[i]);
			continue;
		}

		mapping = page_mapping(page);

		if (is_device_private_page(newpage) ||
		    is_device_coherent_page(newpage)) {
			/*
			 * For now only support anonymous memory migrating to
			 * device private or coherent memory.
			 */
			if (mapping) {
				migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
				continue;
			}
		} else if (is_zone_device_page(newpage)) {
			/*
			 * Other types of ZONE_DEVICE page are not supported.
			 */
			migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
			continue;
		}

		r = migrate_folio(mapping, page_folio(newpage),
				  page_folio(page), MIGRATE_SYNC_NO_COPY);
		if (r != MIGRATEPAGE_SUCCESS)
			migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
	}

	/*
	 * No need to double call mmu_notifier->invalidate_range() callback as
	 * the above ptep_clear_flush_notify() inside migrate_vma_insert_page()
	 * did already call it.
	 */
	if (notified)
		mmu_notifier_invalidate_range_only_end(&range);
}
EXPORT_SYMBOL(migrate_vma_pages);
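
/**
 * migrate_vma_finalize() - restore CPU page table entry
 * @migrate: migrate struct containing all migration information
 *
 * This replaces the special migration pte entry with either a mapping to the
 * new page if migration was successful for that page, or to the original page
 * otherwise.
 *
 * This also unlocks the pages and puts them back on the lru, or drops the
 * extra refcount, for device pages.
 */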
void migrate_vma_finalize(struct migrate_vma *migrate)
{
	const unsigned long npages = migrate->npages;
	unsigned long i;

	for (i = 0; i < npages; i++) {
		struct folio *dst, *src;
		struct page *newpage = migrate_pfn_to_page(migrate->dst[i]);
		struct page *page = migrate_pfn_to_page(migrate->src[i]);

		if (!page) {
			if (newpage) {
				unlock_page(newpage);
				put_page(newpage);
			}
			continue;
		}

		if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE) || !newpage) {
			if (newpage) {
				unlock_page(newpage);
				put_page(newpage);
			}
			newpage = page;
		}

		src = page_folio(page);
		dst = page_folio(newpage);
		remove_migration_ptes(src, dst, false);
		folio_unlock(src);

		if (is_zone_device_page(page))
			put_page(page);
		else
			putback_lru_page(page);

		if (newpage != page) {
			unlock_page(newpage);
			if (is_zone_device_page(newpage))
				put_page(newpage);
			else
				putback_lru_page(newpage);
		}
	}
}
EXPORT_SYMBOL(migrate_vma_finalize);
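
/*
 * Migrate a device coherent page back to normal memory. The caller should have
 * a reference on page which will be copied to the new page if migration is
 * successful or dropped on failure.
 */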
int migrate_device_coherent_page(struct page *page)
{
	unsigned long src_pfn, dst_pfn = 0;
	struct migrate_vma args;
	struct page *dpage;

	WARN_ON_ONCE(PageCompound(page));

	lock_page(page);
	src_pfn = migrate_pfn(page_to_pfn(page)) | MIGRATE_PFN_MIGRATE;
	args.src = &src_pfn;
	args.dst = &dst_pfn;
	args.cpages = 1;
	args.npages = 1;
	args.vma = NULL;

	/*
	 * We don't have a VMA and don't need to walk the mmaps to find the
	 * source page. So call migrate_vma_unmap() directly to unmap the page
	 * as migrate_vma_setup() will fail if args.vma == NULL.
	 */
	migrate_vma_unmap(&args);
	if (!(src_pfn & MIGRATE_PFN_MIGRATE))
		return -EBUSY;

	dpage = alloc_page(GFP_USER | __GFP_NOWARN);
	if (dpage) {
		lock_page(dpage);
		dst_pfn = migrate_pfn(page_to_pfn(dpage));
	}

	migrate_vma_pages(&args);
	if (src_pfn & MIGRATE_PFN_MIGRATE)
		copy_highpage(dpage, page);
	migrate_vma_finalize(&args);

	if (src_pfn & MIGRATE_PFN_MIGRATE)
		return 0;
	return -EBUSY;
}