// SPDX-License-Identifier: GPL-2.0
/*
 * Memory Migration functionality - linux/mm/migrate.c
 *
 * Page migration was first developed in the context of the memory
 * hotplug project.
 */

0016 #include <linux/migrate.h>
0017 #include <linux/export.h>
0018 #include <linux/swap.h>
0019 #include <linux/swapops.h>
0020 #include <linux/pagemap.h>
0021 #include <linux/buffer_head.h>
0022 #include <linux/mm_inline.h>
0023 #include <linux/nsproxy.h>
0024 #include <linux/pagevec.h>
0025 #include <linux/ksm.h>
0026 #include <linux/rmap.h>
0027 #include <linux/topology.h>
0028 #include <linux/cpu.h>
0029 #include <linux/cpuset.h>
0030 #include <linux/writeback.h>
0031 #include <linux/mempolicy.h>
0032 #include <linux/vmalloc.h>
0033 #include <linux/security.h>
0034 #include <linux/backing-dev.h>
0035 #include <linux/compaction.h>
0036 #include <linux/syscalls.h>
0037 #include <linux/compat.h>
0038 #include <linux/hugetlb.h>
0039 #include <linux/hugetlb_cgroup.h>
0040 #include <linux/gfp.h>
0041 #include <linux/pfn_t.h>
0042 #include <linux/memremap.h>
0043 #include <linux/userfaultfd_k.h>
0044 #include <linux/balloon_compaction.h>
0045 #include <linux/page_idle.h>
0046 #include <linux/page_owner.h>
0047 #include <linux/sched/mm.h>
0048 #include <linux/ptrace.h>
0049 #include <linux/oom.h>
0050 #include <linux/memory.h>
0051 #include <linux/random.h>
0052 #include <linux/sched/sysctl.h>
0053
0054 #include <asm/tlbflush.h>
0055
0056 #include <trace/events/migrate.h>
0057
0058 #include "internal.h"
0059
0060 int isolate_movable_page(struct page *page, isolate_mode_t mode)
0061 {
0062 const struct movable_operations *mops;

/*
 * Avoid burning cycles on pages that are about to be freed or have just
 * been freed under us: only proceed if we can take a reference.
 *
 * If we do win a race with __free_pages() and pin a page that is being
 * freed, the put_page() on the error path below drops that reference
 * again, so the page is not leaked.
 */
0073 if (unlikely(!get_page_unless_zero(page)))
0074 goto out;
0075
/*
 * Check __PageMovable() before taking the page lock: the owner of a
 * newly allocated page assumes nobody touches its PG_locked bit, so
 * grabbing the lock unconditionally would upset the owner.
 */
0081 if (unlikely(!__PageMovable(page)))
0082 goto out_putpage;
0083
/*
 * Movable pages are not isolated from an LRU list, so concurrent
 * compaction threads can race against page migration as well as
 * against the page being released.
 *
 * To avoid (wrongly) re-isolating a page that is already under
 * migration, and to avoid isolating a page that is being released,
 * make sure we hold the page lock before proceeding with the
 * isolation steps.
 */
0094 if (unlikely(!trylock_page(page)))
0095 goto out_putpage;
0096
0097 if (!PageMovable(page) || PageIsolated(page))
0098 goto out_no_isolated;
0099
0100 mops = page_movable_ops(page);
0101 VM_BUG_ON_PAGE(!mops, page);
0102
0103 if (!mops->isolate_page(page, mode))
0104 goto out_no_isolated;
0105
0106
0107 WARN_ON_ONCE(PageIsolated(page));
0108 SetPageIsolated(page);
0109 unlock_page(page);
0110
0111 return 0;
0112
0113 out_no_isolated:
0114 unlock_page(page);
0115 out_putpage:
0116 put_page(page);
0117 out:
0118 return -EBUSY;
0119 }
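
/*
 * Illustrative sketch (not part of the original file): how a caller such as
 * memory compaction might pair isolate_movable_page() with
 * putback_movable_pages().  The list name below is made up for the example;
 * real callers keep their own list of isolated pages.
 *
 *	LIST_HEAD(migratelist);
 *
 *	if (!isolate_movable_page(page, ISOLATE_UNEVICTABLE))
 *		list_add(&page->lru, &migratelist);
 *
 *	... attempt migration of the list ...
 *
 *	putback_movable_pages(&migratelist);	(for pages left un-migrated)
 */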
0120
0121 static void putback_movable_page(struct page *page)
0122 {
0123 const struct movable_operations *mops = page_movable_ops(page);
0124
0125 mops->putback_page(page);
0126 ClearPageIsolated(page);
0127 }

/*
 * Put previously isolated pages back onto the appropriate lists
 * from where they were once taken off for compaction/migration.
 *
 * LRU pages go back to their LRU lists; isolated non-LRU movable pages
 * are handed back to their driver via movable_operations->putback_page().
 */
0137 void putback_movable_pages(struct list_head *l)
0138 {
0139 struct page *page;
0140 struct page *page2;
0141
0142 list_for_each_entry_safe(page, page2, l, lru) {
0143 if (unlikely(PageHuge(page))) {
0144 putback_active_hugepage(page);
0145 continue;
0146 }
0147 list_del(&page->lru);

/*
 * We isolated this page as a non-LRU movable page, so we can rely on
 * __PageMovable() here: an LRU page's mapping never carries
 * PAGE_MAPPING_MOVABLE.
 */
0153 if (unlikely(__PageMovable(page))) {
0154 VM_BUG_ON_PAGE(!PageIsolated(page), page);
0155 lock_page(page);
0156 if (PageMovable(page))
0157 putback_movable_page(page);
0158 else
0159 ClearPageIsolated(page);
0160 unlock_page(page);
0161 put_page(page);
0162 } else {
0163 mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
0164 page_is_file_lru(page), -thp_nr_pages(page));
0165 putback_lru_page(page);
0166 }
0167 }
0168 }
0169
/*
 * Restore a potential migration pte to a working pte entry.
 */
0173 static bool remove_migration_pte(struct folio *folio,
0174 struct vm_area_struct *vma, unsigned long addr, void *old)
0175 {
0176 DEFINE_FOLIO_VMA_WALK(pvmw, old, vma, addr, PVMW_SYNC | PVMW_MIGRATION);
0177
0178 while (page_vma_mapped_walk(&pvmw)) {
0179 rmap_t rmap_flags = RMAP_NONE;
0180 pte_t pte;
0181 swp_entry_t entry;
0182 struct page *new;
0183 unsigned long idx = 0;
0184
0185
0186 if (folio_test_large(folio) && !folio_test_hugetlb(folio))
0187 idx = linear_page_index(vma, pvmw.address) - pvmw.pgoff;
0188 new = folio_page(folio, idx);
0189
0190 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
0191
0192 if (!pvmw.pte) {
0193 VM_BUG_ON_FOLIO(folio_test_hugetlb(folio) ||
0194 !folio_test_pmd_mappable(folio), folio);
0195 remove_migration_pmd(&pvmw, new);
0196 continue;
0197 }
0198 #endif
0199
0200 folio_get(folio);
0201 pte = pte_mkold(mk_pte(new, READ_ONCE(vma->vm_page_prot)));
0202 if (pte_swp_soft_dirty(*pvmw.pte))
0203 pte = pte_mksoft_dirty(pte);
0204
0205
0206
0207
0208 entry = pte_to_swp_entry(*pvmw.pte);
0209 if (is_writable_migration_entry(entry))
0210 pte = maybe_mkwrite(pte, vma);
0211 else if (pte_swp_uffd_wp(*pvmw.pte))
0212 pte = pte_mkuffd_wp(pte);
0213
0214 if (folio_test_anon(folio) && !is_readable_migration_entry(entry))
0215 rmap_flags |= RMAP_EXCLUSIVE;
0216
0217 if (unlikely(is_device_private_page(new))) {
0218 if (pte_write(pte))
0219 entry = make_writable_device_private_entry(
0220 page_to_pfn(new));
0221 else
0222 entry = make_readable_device_private_entry(
0223 page_to_pfn(new));
0224 pte = swp_entry_to_pte(entry);
0225 if (pte_swp_soft_dirty(*pvmw.pte))
0226 pte = pte_swp_mksoft_dirty(pte);
0227 if (pte_swp_uffd_wp(*pvmw.pte))
0228 pte = pte_swp_mkuffd_wp(pte);
0229 }
0230
0231 #ifdef CONFIG_HUGETLB_PAGE
0232 if (folio_test_hugetlb(folio)) {
0233 unsigned int shift = huge_page_shift(hstate_vma(vma));
0234
0235 pte = pte_mkhuge(pte);
0236 pte = arch_make_huge_pte(pte, shift, vma->vm_flags);
0237 if (folio_test_anon(folio))
0238 hugepage_add_anon_rmap(new, vma, pvmw.address,
0239 rmap_flags);
0240 else
0241 page_dup_file_rmap(new, true);
0242 set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
0243 } else
0244 #endif
0245 {
0246 if (folio_test_anon(folio))
0247 page_add_anon_rmap(new, vma, pvmw.address,
0248 rmap_flags);
0249 else
0250 page_add_file_rmap(new, vma, false);
0251 set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
0252 }
0253 if (vma->vm_flags & VM_LOCKED)
0254 mlock_page_drain_local();
0255
0256 trace_remove_migration_pte(pvmw.address, pte_val(pte),
0257 compound_order(new));
0258
0259
0260 update_mmu_cache(vma, pvmw.address, pvmw.pte);
0261 }
0262
0263 return true;
0264 }
0265
/*
 * Get rid of all migration entries and replace them by
 * references to the indicated folio.
 */
0270 void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked)
0271 {
0272 struct rmap_walk_control rwc = {
0273 .rmap_one = remove_migration_pte,
0274 .arg = src,
0275 };
0276
0277 if (locked)
0278 rmap_walk_locked(dst, &rwc);
0279 else
0280 rmap_walk(dst, &rwc);
0281 }
0282
/*
 * Something used the pte of a page under migration. We need to
 * get to the page and wait until migration is finished.
 * When we return from this function the fault will be retried.
 */
0288 void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
0289 spinlock_t *ptl)
0290 {
0291 pte_t pte;
0292 swp_entry_t entry;
0293
0294 spin_lock(ptl);
0295 pte = *ptep;
0296 if (!is_swap_pte(pte))
0297 goto out;
0298
0299 entry = pte_to_swp_entry(pte);
0300 if (!is_migration_entry(entry))
0301 goto out;
0302
0303 migration_entry_wait_on_locked(entry, ptep, ptl);
0304 return;
0305 out:
0306 pte_unmap_unlock(ptep, ptl);
0307 }
0308
0309 void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
0310 unsigned long address)
0311 {
0312 spinlock_t *ptl = pte_lockptr(mm, pmd);
0313 pte_t *ptep = pte_offset_map(pmd, address);
0314 __migration_entry_wait(mm, ptep, ptl);
0315 }
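
/*
 * Illustrative sketch (not part of the original file): the typical caller is
 * the page-fault path, which, on finding a migration entry in a swap pte,
 * waits for the migration to finish and then retries the fault.  The field
 * names follow struct vm_fault as used elsewhere in mm/.
 *
 *	entry = pte_to_swp_entry(vmf->orig_pte);
 *	if (is_migration_entry(entry)) {
 *		migration_entry_wait(vmf->vma->vm_mm, vmf->pmd, vmf->address);
 *		return 0;
 *	}
 */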
0316
0317 #ifdef CONFIG_HUGETLB_PAGE
0318 void __migration_entry_wait_huge(pte_t *ptep, spinlock_t *ptl)
0319 {
0320 pte_t pte;
0321
0322 spin_lock(ptl);
0323 pte = huge_ptep_get(ptep);
0324
0325 if (unlikely(!is_hugetlb_entry_migration(pte)))
0326 spin_unlock(ptl);
0327 else
0328 migration_entry_wait_on_locked(pte_to_swp_entry(pte), NULL, ptl);
0329 }
0330
0331 void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte)
0332 {
0333 spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), vma->vm_mm, pte);
0334
0335 __migration_entry_wait_huge(pte, ptl);
0336 }
0337 #endif
0338
0339 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
0340 void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd)
0341 {
0342 spinlock_t *ptl;
0343
0344 ptl = pmd_lock(mm, pmd);
0345 if (!is_pmd_migration_entry(*pmd))
0346 goto unlock;
0347 migration_entry_wait_on_locked(pmd_to_swp_entry(*pmd), NULL, ptl);
0348 return;
0349 unlock:
0350 spin_unlock(ptl);
0351 }
0352 #endif
0353
0354 static int folio_expected_refs(struct address_space *mapping,
0355 struct folio *folio)
0356 {
0357 int refs = 1;
0358 if (!mapping)
0359 return refs;
0360
0361 refs += folio_nr_pages(folio);
0362 if (folio_test_private(folio))
0363 refs++;
0364
0365 return refs;
0366 }
0367
/*
 * Replace the folio in the mapping.
 *
 * The number of remaining references must match what
 * folio_expected_refs() computes: the isolation reference, plus one
 * per page once the folio is in the page cache, plus one more if the
 * folio has private data attached.
 */
0376 int folio_migrate_mapping(struct address_space *mapping,
0377 struct folio *newfolio, struct folio *folio, int extra_count)
0378 {
0379 XA_STATE(xas, &mapping->i_pages, folio_index(folio));
0380 struct zone *oldzone, *newzone;
0381 int dirty;
0382 int expected_count = folio_expected_refs(mapping, folio) + extra_count;
0383 long nr = folio_nr_pages(folio);
0384
0385 if (!mapping) {
0386
0387 if (folio_ref_count(folio) != expected_count)
0388 return -EAGAIN;
0389
0390
0391 newfolio->index = folio->index;
0392 newfolio->mapping = folio->mapping;
0393 if (folio_test_swapbacked(folio))
0394 __folio_set_swapbacked(newfolio);
0395
0396 return MIGRATEPAGE_SUCCESS;
0397 }
0398
0399 oldzone = folio_zone(folio);
0400 newzone = folio_zone(newfolio);
0401
0402 xas_lock_irq(&xas);
0403 if (!folio_ref_freeze(folio, expected_count)) {
0404 xas_unlock_irq(&xas);
0405 return -EAGAIN;
0406 }
0407
0408
0409
0410
0411
0412 newfolio->index = folio->index;
0413 newfolio->mapping = folio->mapping;
0414 folio_ref_add(newfolio, nr);
0415 if (folio_test_swapbacked(folio)) {
0416 __folio_set_swapbacked(newfolio);
0417 if (folio_test_swapcache(folio)) {
0418 folio_set_swapcache(newfolio);
0419 newfolio->private = folio_get_private(folio);
0420 }
0421 } else {
0422 VM_BUG_ON_FOLIO(folio_test_swapcache(folio), folio);
0423 }
0424
0425
0426 dirty = folio_test_dirty(folio);
0427 if (dirty) {
0428 folio_clear_dirty(folio);
0429 folio_set_dirty(newfolio);
0430 }
0431
0432 xas_store(&xas, newfolio);
0433
0434
0435
0436
0437
0438
0439 folio_ref_unfreeze(folio, expected_count - nr);
0440
0441 xas_unlock(&xas);
0442
/*
 * If the folio moved to a different zone, the zone and node counters
 * have to be transferred as well.  Note that this runs with interrupts
 * still disabled: the xarray lock was taken with xas_lock_irq() and
 * only the lock itself was dropped above, which is why the block ends
 * with local_irq_enable().
 *
 * Swap-backed folios that are not in the swap cache (shmem) are
 * accounted as NR_SHMEM in addition to NR_FILE_PAGES.
 */
0454 if (newzone != oldzone) {
0455 struct lruvec *old_lruvec, *new_lruvec;
0456 struct mem_cgroup *memcg;
0457
0458 memcg = folio_memcg(folio);
0459 old_lruvec = mem_cgroup_lruvec(memcg, oldzone->zone_pgdat);
0460 new_lruvec = mem_cgroup_lruvec(memcg, newzone->zone_pgdat);
0461
0462 __mod_lruvec_state(old_lruvec, NR_FILE_PAGES, -nr);
0463 __mod_lruvec_state(new_lruvec, NR_FILE_PAGES, nr);
0464 if (folio_test_swapbacked(folio) && !folio_test_swapcache(folio)) {
0465 __mod_lruvec_state(old_lruvec, NR_SHMEM, -nr);
0466 __mod_lruvec_state(new_lruvec, NR_SHMEM, nr);
0467 }
0468 #ifdef CONFIG_SWAP
0469 if (folio_test_swapcache(folio)) {
0470 __mod_lruvec_state(old_lruvec, NR_SWAPCACHE, -nr);
0471 __mod_lruvec_state(new_lruvec, NR_SWAPCACHE, nr);
0472 }
0473 #endif
0474 if (dirty && mapping_can_writeback(mapping)) {
0475 __mod_lruvec_state(old_lruvec, NR_FILE_DIRTY, -nr);
0476 __mod_zone_page_state(oldzone, NR_ZONE_WRITE_PENDING, -nr);
0477 __mod_lruvec_state(new_lruvec, NR_FILE_DIRTY, nr);
0478 __mod_zone_page_state(newzone, NR_ZONE_WRITE_PENDING, nr);
0479 }
0480 }
0481 local_irq_enable();
0482
0483 return MIGRATEPAGE_SUCCESS;
0484 }
0485 EXPORT_SYMBOL(folio_migrate_mapping);
0486
/*
 * Special counterpart of folio_migrate_mapping() for hugetlb folios:
 * the source is expected to hold exactly two references (the caller's
 * and the page cache's), plus one if private data is attached.
 */
0491 int migrate_huge_page_move_mapping(struct address_space *mapping,
0492 struct folio *dst, struct folio *src)
0493 {
0494 XA_STATE(xas, &mapping->i_pages, folio_index(src));
0495 int expected_count;
0496
0497 xas_lock_irq(&xas);
0498 expected_count = 2 + folio_has_private(src);
0499 if (!folio_ref_freeze(src, expected_count)) {
0500 xas_unlock_irq(&xas);
0501 return -EAGAIN;
0502 }
0503
0504 dst->index = src->index;
0505 dst->mapping = src->mapping;
0506
0507 folio_get(dst);
0508
0509 xas_store(&xas, dst);
0510
0511 folio_ref_unfreeze(src, expected_count - 1);
0512
0513 xas_unlock_irq(&xas);
0514
0515 return MIGRATEPAGE_SUCCESS;
0516 }
0517
/*
 * Copy the flags and some other ancillary information
 */
0521 void folio_migrate_flags(struct folio *newfolio, struct folio *folio)
0522 {
0523 int cpupid;
0524
0525 if (folio_test_error(folio))
0526 folio_set_error(newfolio);
0527 if (folio_test_referenced(folio))
0528 folio_set_referenced(newfolio);
0529 if (folio_test_uptodate(folio))
0530 folio_mark_uptodate(newfolio);
0531 if (folio_test_clear_active(folio)) {
0532 VM_BUG_ON_FOLIO(folio_test_unevictable(folio), folio);
0533 folio_set_active(newfolio);
0534 } else if (folio_test_clear_unevictable(folio))
0535 folio_set_unevictable(newfolio);
0536 if (folio_test_workingset(folio))
0537 folio_set_workingset(newfolio);
0538 if (folio_test_checked(folio))
0539 folio_set_checked(newfolio);
0540
0541
0542
0543
0544
0545
0546 if (folio_test_mappedtodisk(folio))
0547 folio_set_mappedtodisk(newfolio);
0548
0549
0550 if (folio_test_dirty(folio))
0551 folio_set_dirty(newfolio);
0552
0553 if (folio_test_young(folio))
0554 folio_set_young(newfolio);
0555 if (folio_test_idle(folio))
0556 folio_set_idle(newfolio);
0557
0558
0559
0560
0561
0562 cpupid = page_cpupid_xchg_last(&folio->page, -1);
0563 page_cpupid_xchg_last(&newfolio->page, cpupid);
0564
0565 folio_migrate_ksm(newfolio, folio);
0566
0567
0568
0569
0570 if (folio_test_swapcache(folio))
0571 folio_clear_swapcache(folio);
0572 folio_clear_private(folio);
0573
0574
0575 if (!folio_test_hugetlb(folio))
0576 folio->private = NULL;
0577
0578
0579
0580
0581
0582 if (folio_test_writeback(newfolio))
0583 folio_end_writeback(newfolio);
0584
0585
0586
0587
0588
0589
0590 if (folio_test_readahead(folio))
0591 folio_set_readahead(newfolio);
0592
0593 folio_copy_owner(newfolio, folio);
0594
0595 if (!folio_test_hugetlb(folio))
0596 mem_cgroup_migrate(folio, newfolio);
0597 }
0598 EXPORT_SYMBOL(folio_migrate_flags);
0599
0600 void folio_migrate_copy(struct folio *newfolio, struct folio *folio)
0601 {
0602 folio_copy(newfolio, folio);
0603 folio_migrate_flags(newfolio, folio);
0604 }
0605 EXPORT_SYMBOL(folio_migrate_copy);
0606
/**
 * migrate_folio() - Simple folio migration.
 * @mapping: The address_space containing the folio.
 * @dst: The folio to migrate the data to.
 * @src: The folio containing the current data.
 * @mode: How to migrate the folio.
 *
 * Common logic to directly migrate a single LRU folio suitable for
 * folios that do not use PagePrivate/PagePrivate2.
 *
 * Folios are locked upon entry and exit.
 */
0623 int migrate_folio(struct address_space *mapping, struct folio *dst,
0624 struct folio *src, enum migrate_mode mode)
0625 {
0626 int rc;
0627
0628 BUG_ON(folio_test_writeback(src));
0629
0630 rc = folio_migrate_mapping(mapping, dst, src, 0);
0631
0632 if (rc != MIGRATEPAGE_SUCCESS)
0633 return rc;
0634
0635 if (mode != MIGRATE_SYNC_NO_COPY)
0636 folio_migrate_copy(dst, src);
0637 else
0638 folio_migrate_flags(dst, src);
0639 return MIGRATEPAGE_SUCCESS;
0640 }
0641 EXPORT_SYMBOL(migrate_folio);
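
/*
 * Illustrative sketch (not part of the original file): a filesystem whose
 * folios carry no private data can wire migrate_folio() directly into its
 * address_space_operations.  "example_aops" and the other callbacks are
 * made-up names for this sketch.
 *
 *	static const struct address_space_operations example_aops = {
 *		.readahead	= example_readahead,
 *		.writepages	= example_writepages,
 *		.migrate_folio	= migrate_folio,
 *	};
 */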
0642
0643 #ifdef CONFIG_BLOCK
0644
0645 static bool buffer_migrate_lock_buffers(struct buffer_head *head,
0646 enum migrate_mode mode)
0647 {
0648 struct buffer_head *bh = head;
0649
0650
0651 if (mode != MIGRATE_ASYNC) {
0652 do {
0653 lock_buffer(bh);
0654 bh = bh->b_this_page;
0655
0656 } while (bh != head);
0657
0658 return true;
0659 }
0660
0661
0662 do {
0663 if (!trylock_buffer(bh)) {
0664
0665
0666
0667
0668 struct buffer_head *failed_bh = bh;
0669 bh = head;
0670 while (bh != failed_bh) {
0671 unlock_buffer(bh);
0672 bh = bh->b_this_page;
0673 }
0674 return false;
0675 }
0676
0677 bh = bh->b_this_page;
0678 } while (bh != head);
0679 return true;
0680 }
0681
0682 static int __buffer_migrate_folio(struct address_space *mapping,
0683 struct folio *dst, struct folio *src, enum migrate_mode mode,
0684 bool check_refs)
0685 {
0686 struct buffer_head *bh, *head;
0687 int rc;
0688 int expected_count;
0689
0690 head = folio_buffers(src);
0691 if (!head)
0692 return migrate_folio(mapping, dst, src, mode);
0693
0694
0695 expected_count = folio_expected_refs(mapping, src);
0696 if (folio_ref_count(src) != expected_count)
0697 return -EAGAIN;
0698
0699 if (!buffer_migrate_lock_buffers(head, mode))
0700 return -EAGAIN;
0701
0702 if (check_refs) {
0703 bool busy;
0704 bool invalidated = false;
0705
0706 recheck_buffers:
0707 busy = false;
0708 spin_lock(&mapping->private_lock);
0709 bh = head;
0710 do {
0711 if (atomic_read(&bh->b_count)) {
0712 busy = true;
0713 break;
0714 }
0715 bh = bh->b_this_page;
0716 } while (bh != head);
0717 if (busy) {
0718 if (invalidated) {
0719 rc = -EAGAIN;
0720 goto unlock_buffers;
0721 }
0722 spin_unlock(&mapping->private_lock);
0723 invalidate_bh_lrus();
0724 invalidated = true;
0725 goto recheck_buffers;
0726 }
0727 }
0728
0729 rc = folio_migrate_mapping(mapping, dst, src, 0);
0730 if (rc != MIGRATEPAGE_SUCCESS)
0731 goto unlock_buffers;
0732
0733 folio_attach_private(dst, folio_detach_private(src));
0734
0735 bh = head;
0736 do {
0737 set_bh_page(bh, &dst->page, bh_offset(bh));
0738 bh = bh->b_this_page;
0739 } while (bh != head);
0740
0741 if (mode != MIGRATE_SYNC_NO_COPY)
0742 folio_migrate_copy(dst, src);
0743 else
0744 folio_migrate_flags(dst, src);
0745
0746 rc = MIGRATEPAGE_SUCCESS;
0747 unlock_buffers:
0748 if (check_refs)
0749 spin_unlock(&mapping->private_lock);
0750 bh = head;
0751 do {
0752 unlock_buffer(bh);
0753 bh = bh->b_this_page;
0754 } while (bh != head);
0755
0756 return rc;
0757 }
0758
/**
 * buffer_migrate_folio() - Migration function for folios with buffers.
 * @mapping: The address space containing @src.
 * @dst: The folio to migrate to.
 * @src: The folio to migrate from.
 * @mode: How to migrate the folio.
 *
 * This variant does not check whether anybody else holds references to
 * the buffer heads, so it may only be used when the filesystem
 * guarantees that no such references exist.
 *
 * Return: 0 on success or a negative errno on failure.
 */
0774 int buffer_migrate_folio(struct address_space *mapping,
0775 struct folio *dst, struct folio *src, enum migrate_mode mode)
0776 {
0777 return __buffer_migrate_folio(mapping, dst, src, mode, false);
0778 }
0779 EXPORT_SYMBOL(buffer_migrate_folio);
0780
/**
 * buffer_migrate_folio_norefs() - Migration function for folios with buffers.
 * @mapping: The address space containing @src.
 * @dst: The folio to migrate to.
 * @src: The folio to migrate from.
 * @mode: How to migrate the folio.
 *
 * Like buffer_migrate_folio(), but it also checks under
 * mapping->private_lock that nobody holds references to the buffer
 * heads.  This is the right choice for mappings where buffer heads are
 * looked up and referenced directly, such as block device mappings.
 *
 * Return: 0 on success or a negative errno on failure.
 */
0795 int buffer_migrate_folio_norefs(struct address_space *mapping,
0796 struct folio *dst, struct folio *src, enum migrate_mode mode)
0797 {
0798 return __buffer_migrate_folio(mapping, dst, src, mode, true);
0799 }
0800 #endif
0801
0802 int filemap_migrate_folio(struct address_space *mapping,
0803 struct folio *dst, struct folio *src, enum migrate_mode mode)
0804 {
0805 int ret;
0806
0807 ret = folio_migrate_mapping(mapping, dst, src, 0);
0808 if (ret != MIGRATEPAGE_SUCCESS)
0809 return ret;
0810
0811 if (folio_get_private(src))
0812 folio_attach_private(dst, folio_detach_private(src));
0813
0814 if (mode != MIGRATE_SYNC_NO_COPY)
0815 folio_migrate_copy(dst, src);
0816 else
0817 folio_migrate_flags(dst, src);
0818 return MIGRATEPAGE_SUCCESS;
0819 }
0820 EXPORT_SYMBOL_GPL(filemap_migrate_folio);
0821
/*
 * Writeback a folio to clean the dirty state.
 */
0825 static int writeout(struct address_space *mapping, struct folio *folio)
0826 {
0827 struct writeback_control wbc = {
0828 .sync_mode = WB_SYNC_NONE,
0829 .nr_to_write = 1,
0830 .range_start = 0,
0831 .range_end = LLONG_MAX,
0832 .for_reclaim = 1
0833 };
0834 int rc;
0835
0836 if (!mapping->a_ops->writepage)
0837
0838 return -EINVAL;
0839
0840 if (!folio_clear_dirty_for_io(folio))
0841
0842 return -EAGAIN;
0843
/*
 * A dirty folio may imply that the underlying filesystem has the folio
 * on some queue, so it must be written back before migration can
 * succeed.  Writeout may drop the folio lock and change the folio
 * state under us; at this point we already know that this particular
 * migration attempt cannot succeed, which is why the migration ptes
 * are removed again before ->writepage() is called.
 */
0852 remove_migration_ptes(folio, folio, false);
0853
0854 rc = mapping->a_ops->writepage(&folio->page, &wbc);
0855
0856 if (rc != AOP_WRITEPAGE_ACTIVATE)
0857
0858 folio_lock(folio);
0859
0860 return (rc < 0) ? -EIO : -EAGAIN;
0861 }
0862
/*
 * Default handling if a filesystem does not provide a migration function.
 */
0866 static int fallback_migrate_folio(struct address_space *mapping,
0867 struct folio *dst, struct folio *src, enum migrate_mode mode)
0868 {
0869 if (folio_test_dirty(src)) {
0870
0871 switch (mode) {
0872 case MIGRATE_SYNC:
0873 case MIGRATE_SYNC_NO_COPY:
0874 break;
0875 default:
0876 return -EBUSY;
0877 }
0878 return writeout(mapping, src);
0879 }
0880
0881
0882
0883
0884
0885 if (folio_test_private(src) &&
0886 !filemap_release_folio(src, GFP_KERNEL))
0887 return mode == MIGRATE_SYNC ? -EAGAIN : -EBUSY;
0888
0889 return migrate_folio(mapping, dst, src, mode);
0890 }
0891
/*
 * Move a folio to a newly allocated folio.
 *
 * The src folio is locked and all of its ptes have been replaced by
 * migration entries.  On success the dst folio takes over the role of
 * the src folio.
 *
 * Return value:
 *   < 0 - error code
 *  MIGRATEPAGE_SUCCESS - success
 */
0903 static int move_to_new_folio(struct folio *dst, struct folio *src,
0904 enum migrate_mode mode)
0905 {
0906 int rc = -EAGAIN;
0907 bool is_lru = !__PageMovable(&src->page);
0908
0909 VM_BUG_ON_FOLIO(!folio_test_locked(src), src);
0910 VM_BUG_ON_FOLIO(!folio_test_locked(dst), dst);
0911
0912 if (likely(is_lru)) {
0913 struct address_space *mapping = folio_mapping(src);
0914
0915 if (!mapping)
0916 rc = migrate_folio(mapping, dst, src, mode);
0917 else if (mapping->a_ops->migrate_folio)
0918
0919
0920
0921
0922
0923
0924
0925 rc = mapping->a_ops->migrate_folio(mapping, dst, src,
0926 mode);
0927 else
0928 rc = fallback_migrate_folio(mapping, dst, src, mode);
0929 } else {
0930 const struct movable_operations *mops;
0931
0932
0933
0934
0935
0936 VM_BUG_ON_FOLIO(!folio_test_isolated(src), src);
0937 if (!folio_test_movable(src)) {
0938 rc = MIGRATEPAGE_SUCCESS;
0939 folio_clear_isolated(src);
0940 goto out;
0941 }
0942
0943 mops = page_movable_ops(&src->page);
0944 rc = mops->migrate_page(&dst->page, &src->page, mode);
0945 WARN_ON_ONCE(rc == MIGRATEPAGE_SUCCESS &&
0946 !folio_test_isolated(src));
0947 }
0948
0949
0950
0951
0952
0953 if (rc == MIGRATEPAGE_SUCCESS) {
0954 if (__PageMovable(&src->page)) {
0955 VM_BUG_ON_FOLIO(!folio_test_isolated(src), src);
0956
0957
0958
0959
0960
0961 folio_clear_isolated(src);
0962 }
0963
0964
0965
0966
0967
0968
0969 if (!folio_mapping_flags(src))
0970 src->mapping = NULL;
0971
0972 if (likely(!folio_is_zone_device(dst)))
0973 flush_dcache_folio(dst);
0974 }
0975 out:
0976 return rc;
0977 }
0978
0979 static int __unmap_and_move(struct page *page, struct page *newpage,
0980 int force, enum migrate_mode mode)
0981 {
0982 struct folio *folio = page_folio(page);
0983 struct folio *dst = page_folio(newpage);
0984 int rc = -EAGAIN;
0985 bool page_was_mapped = false;
0986 struct anon_vma *anon_vma = NULL;
0987 bool is_lru = !__PageMovable(page);
0988
0989 if (!trylock_page(page)) {
0990 if (!force || mode == MIGRATE_ASYNC)
0991 goto out;
0992
/*
 * It's not safe for direct compaction to call lock_page() here.  For
 * example, during readahead pages are added to the LRU locked and are
 * only unlocked once the I/O completes; if an allocation made while
 * building such a batch enters direct compaction, the task could end
 * up sleeping on a page it locked itself.  Rather than trying to be
 * clever about which pages may safely be locked, bail out when running
 * in PF_MEMALLOC context altogether.
 */
1006 if (current->flags & PF_MEMALLOC)
1007 goto out;
1008
1009 lock_page(page);
1010 }
1011
1012 if (PageWriteback(page)) {
1013
1014
1015
1016
1017
1018
1019 switch (mode) {
1020 case MIGRATE_SYNC:
1021 case MIGRATE_SYNC_NO_COPY:
1022 break;
1023 default:
1024 rc = -EBUSY;
1025 goto out_unlock;
1026 }
1027 if (!force)
1028 goto out_unlock;
1029 wait_on_page_writeback(page);
1030 }
1031
/*
 * By the time try_to_migrate() has run, page->mapcount may have gone
 * to zero, at which point the anon_vma could be freed under us.  Take
 * a reference on it here so that it stays alive until migration has
 * finished.  If page_get_anon_vma() fails, the page is no longer
 * mapped (and cannot be remapped while we hold the page lock), so the
 * anon_vma is not needed anyway.  File-backed pages are not affected
 * because they are protected by the page lock throughout.
 */
1046 if (PageAnon(page) && !PageKsm(page))
1047 anon_vma = page_get_anon_vma(page);
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057 if (unlikely(!trylock_page(newpage)))
1058 goto out_unlock;
1059
1060 if (unlikely(!is_lru)) {
1061 rc = move_to_new_folio(dst, folio, mode);
1062 goto out_unlock_both;
1063 }
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077 if (!page->mapping) {
1078 VM_BUG_ON_PAGE(PageAnon(page), page);
1079 if (page_has_private(page)) {
1080 try_to_free_buffers(folio);
1081 goto out_unlock_both;
1082 }
1083 } else if (page_mapped(page)) {
1084
1085 VM_BUG_ON_PAGE(PageAnon(page) && !PageKsm(page) && !anon_vma,
1086 page);
1087 try_to_migrate(folio, 0);
1088 page_was_mapped = true;
1089 }
1090
1091 if (!page_mapped(page))
1092 rc = move_to_new_folio(dst, folio, mode);
1093
/*
 * When successful, push the new page onto the LRU immediately, so that
 * if it turns out to be mlocked, remove_migration_ptes() can build up
 * the correct mlock state for it while re-establishing the mappings.
 */
1103 if (rc == MIGRATEPAGE_SUCCESS) {
1104 lru_cache_add(newpage);
1105 if (page_was_mapped)
1106 lru_add_drain();
1107 }
1108
1109 if (page_was_mapped)
1110 remove_migration_ptes(folio,
1111 rc == MIGRATEPAGE_SUCCESS ? dst : folio, false);
1112
1113 out_unlock_both:
1114 unlock_page(newpage);
1115 out_unlock:
1116
1117 if (anon_vma)
1118 put_anon_vma(anon_vma);
1119 unlock_page(page);
1120 out:
/*
 * If migration was successful, drop the reference obtained when the
 * new page was allocated; the references taken over during migration
 * keep the page alive.  On failure the unused new page is released by
 * the caller, unmap_and_move().
 */
1126 if (rc == MIGRATEPAGE_SUCCESS)
1127 put_page(newpage);
1128
1129 return rc;
1130 }
1131
/*
 * Obtain the lock on the page, remove all ptes and migrate the page
 * to the newly allocated page in newpage.
 */
1136 static int unmap_and_move(new_page_t get_new_page,
1137 free_page_t put_new_page,
1138 unsigned long private, struct page *page,
1139 int force, enum migrate_mode mode,
1140 enum migrate_reason reason,
1141 struct list_head *ret)
1142 {
1143 int rc = MIGRATEPAGE_SUCCESS;
1144 struct page *newpage = NULL;
1145
1146 if (!thp_migration_supported() && PageTransHuge(page))
1147 return -ENOSYS;
1148
1149 if (page_count(page) == 1) {
1150
1151 ClearPageActive(page);
1152 ClearPageUnevictable(page);
1153
1154 goto out;
1155 }
1156
1157 newpage = get_new_page(page, private);
1158 if (!newpage)
1159 return -ENOMEM;
1160
1161 newpage->private = 0;
1162 rc = __unmap_and_move(page, newpage, force, mode);
1163 if (rc == MIGRATEPAGE_SUCCESS)
1164 set_page_owner_migrate_reason(newpage, reason);
1165
1166 out:
1167 if (rc != -EAGAIN) {
1168
1169
1170
1171
1172
1173 list_del(&page->lru);
1174 }
1175
1176
1177
1178
1179
1180
1181 if (rc == MIGRATEPAGE_SUCCESS) {
1182
1183
1184
1185
1186
1187 if (likely(!__PageMovable(page)))
1188 mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
1189 page_is_file_lru(page), -thp_nr_pages(page));
1190
1191 if (reason != MR_MEMORY_FAILURE)
1192
1193
1194
1195 put_page(page);
1196 } else {
1197 if (rc != -EAGAIN)
1198 list_add_tail(&page->lru, ret);
1199
1200 if (put_new_page)
1201 put_new_page(newpage, private);
1202 else
1203 put_page(newpage);
1204 }
1205
1206 return rc;
1207 }
1208
/*
 * Counterpart of unmap_and_move() for hugetlb page migration.
 *
 * Unlike the base-page path this does not wait for writeback: hugetlb
 * I/O only happens through direct I/O, where no page lock is held and
 * PG_writeback is irrelevant.  In-flight direct I/O shows up as extra
 * references on the head page, so the reference freeze simply fails
 * and migration is aborted without risking data corruption.  Direct
 * I/O issued while the page is under migration waits in the page-fault
 * path until the migration entry has been removed.
 */
1227 static int unmap_and_move_huge_page(new_page_t get_new_page,
1228 free_page_t put_new_page, unsigned long private,
1229 struct page *hpage, int force,
1230 enum migrate_mode mode, int reason,
1231 struct list_head *ret)
1232 {
1233 struct folio *dst, *src = page_folio(hpage);
1234 int rc = -EAGAIN;
1235 int page_was_mapped = 0;
1236 struct page *new_hpage;
1237 struct anon_vma *anon_vma = NULL;
1238 struct address_space *mapping = NULL;
1239
1240
1241
1242
1243
1244
1245
1246
1247 if (!hugepage_migration_supported(page_hstate(hpage))) {
1248 list_move_tail(&hpage->lru, ret);
1249 return -ENOSYS;
1250 }
1251
1252 if (page_count(hpage) == 1) {
1253
1254 putback_active_hugepage(hpage);
1255 return MIGRATEPAGE_SUCCESS;
1256 }
1257
1258 new_hpage = get_new_page(hpage, private);
1259 if (!new_hpage)
1260 return -ENOMEM;
1261 dst = page_folio(new_hpage);
1262
1263 if (!trylock_page(hpage)) {
1264 if (!force)
1265 goto out;
1266 switch (mode) {
1267 case MIGRATE_SYNC:
1268 case MIGRATE_SYNC_NO_COPY:
1269 break;
1270 default:
1271 goto out;
1272 }
1273 lock_page(hpage);
1274 }
1275
1276
1277
1278
1279
1280
1281 if (hugetlb_page_subpool(hpage) && !page_mapping(hpage)) {
1282 rc = -EBUSY;
1283 goto out_unlock;
1284 }
1285
1286 if (PageAnon(hpage))
1287 anon_vma = page_get_anon_vma(hpage);
1288
1289 if (unlikely(!trylock_page(new_hpage)))
1290 goto put_anon;
1291
1292 if (page_mapped(hpage)) {
1293 enum ttu_flags ttu = 0;
1294
1295 if (!PageAnon(hpage)) {
1296
1297
1298
1299
1300
1301
1302 mapping = hugetlb_page_mapping_lock_write(hpage);
1303 if (unlikely(!mapping))
1304 goto unlock_put_anon;
1305
1306 ttu = TTU_RMAP_LOCKED;
1307 }
1308
1309 try_to_migrate(src, ttu);
1310 page_was_mapped = 1;
1311
1312 if (ttu & TTU_RMAP_LOCKED)
1313 i_mmap_unlock_write(mapping);
1314 }
1315
1316 if (!page_mapped(hpage))
1317 rc = move_to_new_folio(dst, src, mode);
1318
1319 if (page_was_mapped)
1320 remove_migration_ptes(src,
1321 rc == MIGRATEPAGE_SUCCESS ? dst : src, false);
1322
1323 unlock_put_anon:
1324 unlock_page(new_hpage);
1325
1326 put_anon:
1327 if (anon_vma)
1328 put_anon_vma(anon_vma);
1329
1330 if (rc == MIGRATEPAGE_SUCCESS) {
1331 move_hugetlb_state(hpage, new_hpage, reason);
1332 put_new_page = NULL;
1333 }
1334
1335 out_unlock:
1336 unlock_page(hpage);
1337 out:
1338 if (rc == MIGRATEPAGE_SUCCESS)
1339 putback_active_hugepage(hpage);
1340 else if (rc != -EAGAIN)
1341 list_move_tail(&hpage->lru, ret);
1342
1343
1344
1345
1346
1347
1348 if (put_new_page)
1349 put_new_page(new_hpage, private);
1350 else
1351 putback_active_hugepage(new_hpage);
1352
1353 return rc;
1354 }
1355
1356 static inline int try_split_thp(struct page *page, struct page **page2,
1357 struct list_head *from)
1358 {
1359 int rc = 0;
1360
1361 lock_page(page);
1362 rc = split_huge_page_to_list(page, from);
1363 unlock_page(page);
1364 if (!rc)
1365 list_safe_reset_next(page, *page2, lru);
1366
1367 return rc;
1368 }
1369
/*
 * migrate_pages - migrate the pages specified in a list to the free
 *                 pages supplied as the target for the page migration.
 *
 * @from:		The list of pages to be migrated.
 * @get_new_page:	The function used to allocate free pages to be used
 *			as the target of the page migration.
 * @put_new_page:	The function used to free target pages if migration
 *			fails, or NULL if no special handling is necessary.
 * @private:		Private data to be passed on to get_new_page().
 * @mode:		The migration mode that specifies the constraints for
 *			page migration, if any.
 * @reason:		The reason for page migration.
 * @ret_succeeded:	Set to the number of normal pages migrated successfully
 *			if the caller passes a non-NULL pointer.
 *
 * The function returns after 10 attempts or if no pages are movable any more
 * because the list has become empty or no retryable pages exist any more.
 * It is the caller's responsibility to call putback_movable_pages() to return
 * pages to the LRU or free list if the return value is not zero.
 *
 * Returns the number of {normal page, THP} that were not migrated, or an
 * error code.  THP splits are counted as non-migrated THP, no matter how
 * many of the split subpages were migrated successfully.
 */
1395 int migrate_pages(struct list_head *from, new_page_t get_new_page,
1396 free_page_t put_new_page, unsigned long private,
1397 enum migrate_mode mode, int reason, unsigned int *ret_succeeded)
1398 {
1399 int retry = 1;
1400 int thp_retry = 1;
1401 int nr_failed = 0;
1402 int nr_failed_pages = 0;
1403 int nr_succeeded = 0;
1404 int nr_thp_succeeded = 0;
1405 int nr_thp_failed = 0;
1406 int nr_thp_split = 0;
1407 int pass = 0;
1408 bool is_thp = false;
1409 struct page *page;
1410 struct page *page2;
1411 int rc, nr_subpages;
1412 LIST_HEAD(ret_pages);
1413 LIST_HEAD(thp_split_pages);
1414 bool nosplit = (reason == MR_NUMA_MISPLACED);
1415 bool no_subpage_counting = false;
1416
1417 trace_mm_migrate_pages_start(mode, reason);
1418
1419 thp_subpage_migration:
1420 for (pass = 0; pass < 10 && (retry || thp_retry); pass++) {
1421 retry = 0;
1422 thp_retry = 0;
1423
1424 list_for_each_entry_safe(page, page2, from, lru) {
1425 retry:
/*
 * THP statistics are based on the source huge page, so capture the
 * information that might get lost once the page has been migrated,
 * split or freed.
 */
1431 is_thp = PageTransHuge(page) && !PageHuge(page);
1432 nr_subpages = compound_nr(page);
1433 cond_resched();
1434
1435 if (PageHuge(page))
1436 rc = unmap_and_move_huge_page(get_new_page,
1437 put_new_page, private, page,
1438 pass > 2, mode, reason,
1439 &ret_pages);
1440 else
1441 rc = unmap_and_move(get_new_page, put_new_page,
1442 private, page, pass > 2, mode,
1443 reason, &ret_pages);

/*
 * The rules for the page are:
 *	Success: non-hugetlb pages are freed, hugetlb pages put back
 *	-EAGAIN: stay on the from list and get retried
 *	-ENOMEM: stay on the from list, abort the whole run
 *	-ENOSYS: stay on the from list (possibly after a THP split)
 *	Other errno: move to the ret_pages list, handed back to caller
 */
1453 switch(rc) {
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465 case -ENOSYS:
1466
1467 if (is_thp) {
1468 nr_thp_failed++;
1469 if (!try_split_thp(page, &page2, &thp_split_pages)) {
1470 nr_thp_split++;
1471 goto retry;
1472 }
1473
1474 } else if (!no_subpage_counting) {
1475 nr_failed++;
1476 }
1477
1478 nr_failed_pages += nr_subpages;
1479 break;
1480 case -ENOMEM:
1481
1482
1483
1484
1485
1486 if (is_thp && !nosplit) {
1487 nr_thp_failed++;
1488 if (!try_split_thp(page, &page2, &thp_split_pages)) {
1489 nr_thp_split++;
1490 goto retry;
1491 }
1492 } else if (!no_subpage_counting) {
1493 nr_failed++;
1494 }
1495
1496 nr_failed_pages += nr_subpages;
1497
1498
1499
1500
1501
1502
1503 list_splice_init(&thp_split_pages, from);
1504 nr_thp_failed += thp_retry;
1505 goto out;
1506 case -EAGAIN:
1507 if (is_thp)
1508 thp_retry++;
1509 else
1510 retry++;
1511 break;
1512 case MIGRATEPAGE_SUCCESS:
1513 nr_succeeded += nr_subpages;
1514 if (is_thp)
1515 nr_thp_succeeded++;
1516 break;
1517 default:
1518
1519
1520
1521
1522
1523
1524 if (is_thp)
1525 nr_thp_failed++;
1526 else if (!no_subpage_counting)
1527 nr_failed++;
1528
1529 nr_failed_pages += nr_subpages;
1530 break;
1531 }
1532 }
1533 }
1534 nr_failed += retry;
1535 nr_thp_failed += thp_retry;
1536
1537
1538
1539
1540
1541 if (!list_empty(&thp_split_pages)) {
1542
1543
1544
1545
1546 list_splice_init(from, &ret_pages);
1547 list_splice_init(&thp_split_pages, from);
1548 no_subpage_counting = true;
1549 retry = 1;
1550 goto thp_subpage_migration;
1551 }
1552
1553 rc = nr_failed + nr_thp_failed;
1554 out:
1555
1556
1557
1558
1559 list_splice(&ret_pages, from);
1560
1561 count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
1562 count_vm_events(PGMIGRATE_FAIL, nr_failed_pages);
1563 count_vm_events(THP_MIGRATION_SUCCESS, nr_thp_succeeded);
1564 count_vm_events(THP_MIGRATION_FAIL, nr_thp_failed);
1565 count_vm_events(THP_MIGRATION_SPLIT, nr_thp_split);
1566 trace_mm_migrate_pages(nr_succeeded, nr_failed_pages, nr_thp_succeeded,
1567 nr_thp_failed, nr_thp_split, mode, reason);
1568
1569 if (ret_succeeded)
1570 *ret_succeeded = nr_succeeded;
1571
1572 return rc;
1573 }
1574
1575 struct page *alloc_migration_target(struct page *page, unsigned long private)
1576 {
1577 struct folio *folio = page_folio(page);
1578 struct migration_target_control *mtc;
1579 gfp_t gfp_mask;
1580 unsigned int order = 0;
1581 struct folio *new_folio = NULL;
1582 int nid;
1583 int zidx;
1584
1585 mtc = (struct migration_target_control *)private;
1586 gfp_mask = mtc->gfp_mask;
1587 nid = mtc->nid;
1588 if (nid == NUMA_NO_NODE)
1589 nid = folio_nid(folio);
1590
1591 if (folio_test_hugetlb(folio)) {
1592 struct hstate *h = page_hstate(&folio->page);
1593
1594 gfp_mask = htlb_modify_alloc_mask(h, gfp_mask);
1595 return alloc_huge_page_nodemask(h, nid, mtc->nmask, gfp_mask);
1596 }
1597
1598 if (folio_test_large(folio)) {
1599
1600
1601
1602
1603 gfp_mask &= ~__GFP_RECLAIM;
1604 gfp_mask |= GFP_TRANSHUGE;
1605 order = folio_order(folio);
1606 }
1607 zidx = zone_idx(folio_zone(folio));
1608 if (is_highmem_idx(zidx) || zidx == ZONE_MOVABLE)
1609 gfp_mask |= __GFP_HIGHMEM;
1610
1611 new_folio = __folio_alloc(gfp_mask, order, nid, mtc->nmask);
1612
1613 return &new_folio->page;
1614 }
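
/*
 * Illustrative sketch (not part of the original file): alloc_migration_target()
 * is meant to be passed to migrate_pages() together with a
 * struct migration_target_control, much like do_move_pages_to_node() below
 * does.  "isolated" stands for a caller-built list of isolated pages and
 * "target_nid" for the chosen destination node.
 *
 *	struct migration_target_control mtc = {
 *		.nid = target_nid,
 *		.gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
 *	};
 *	unsigned int nr_succeeded;
 *	int err;
 *
 *	err = migrate_pages(&isolated, alloc_migration_target, NULL,
 *			    (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL,
 *			    &nr_succeeded);
 *	if (err)
 *		putback_movable_pages(&isolated);
 */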
1615
1616 #ifdef CONFIG_NUMA
1617
1618 static int store_status(int __user *status, int start, int value, int nr)
1619 {
1620 while (nr-- > 0) {
1621 if (put_user(value, status + start))
1622 return -EFAULT;
1623 start++;
1624 }
1625
1626 return 0;
1627 }
1628
1629 static int do_move_pages_to_node(struct mm_struct *mm,
1630 struct list_head *pagelist, int node)
1631 {
1632 int err;
1633 struct migration_target_control mtc = {
1634 .nid = node,
1635 .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
1636 };
1637
1638 err = migrate_pages(pagelist, alloc_migration_target, NULL,
1639 (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL);
1640 if (err)
1641 putback_movable_pages(pagelist);
1642 return err;
1643 }
1644
/*
 * Resolves the given address to a struct page, isolates it from the
 * LRU and puts it on @pagelist for migration to @node.
 *
 * Returns:
 *     errno - if the page could not be found or isolated
 *     0     - if the page is already on the target node
 *     1     - if the page has been queued for migration
 */
1654 static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
1655 int node, struct list_head *pagelist, bool migrate_all)
1656 {
1657 struct vm_area_struct *vma;
1658 struct page *page;
1659 int err;
1660
1661 mmap_read_lock(mm);
1662 err = -EFAULT;
1663 vma = vma_lookup(mm, addr);
1664 if (!vma || !vma_migratable(vma))
1665 goto out;
1666
1667
1668 page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
1669
1670 err = PTR_ERR(page);
1671 if (IS_ERR(page))
1672 goto out;
1673
1674 err = -ENOENT;
1675 if (!page || is_zone_device_page(page))
1676 goto out;
1677
1678 err = 0;
1679 if (page_to_nid(page) == node)
1680 goto out_putpage;
1681
1682 err = -EACCES;
1683 if (page_mapcount(page) > 1 && !migrate_all)
1684 goto out_putpage;
1685
1686 if (PageHuge(page)) {
1687 if (PageHead(page)) {
1688 err = isolate_hugetlb(page, pagelist);
1689 if (!err)
1690 err = 1;
1691 }
1692 } else {
1693 struct page *head;
1694
1695 head = compound_head(page);
1696 err = isolate_lru_page(head);
1697 if (err)
1698 goto out_putpage;
1699
1700 err = 1;
1701 list_add_tail(&head->lru, pagelist);
1702 mod_node_page_state(page_pgdat(head),
1703 NR_ISOLATED_ANON + page_is_file_lru(head),
1704 thp_nr_pages(head));
1705 }
1706 out_putpage:
1707
1708
1709
1710
1711
1712 put_page(page);
1713 out:
1714 mmap_read_unlock(mm);
1715 return err;
1716 }
1717
1718 static int move_pages_and_store_status(struct mm_struct *mm, int node,
1719 struct list_head *pagelist, int __user *status,
1720 int start, int i, unsigned long nr_pages)
1721 {
1722 int err;
1723
1724 if (list_empty(pagelist))
1725 return 0;
1726
1727 err = do_move_pages_to_node(mm, pagelist, node);
1728 if (err) {
1729
1730
1731
1732
1733
1734
1735
1736
1737 if (err > 0)
1738 err += nr_pages - i - 1;
1739 return err;
1740 }
1741 return store_status(status, start, node, i - start);
1742 }
1743
/*
 * Migrate an array of page address onto an array of nodes and fill
 * the corresponding array of status.
 */
1748 static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
1749 unsigned long nr_pages,
1750 const void __user * __user *pages,
1751 const int __user *nodes,
1752 int __user *status, int flags)
1753 {
1754 int current_node = NUMA_NO_NODE;
1755 LIST_HEAD(pagelist);
1756 int start, i;
1757 int err = 0, err1;
1758
1759 lru_cache_disable();
1760
1761 for (i = start = 0; i < nr_pages; i++) {
1762 const void __user *p;
1763 unsigned long addr;
1764 int node;
1765
1766 err = -EFAULT;
1767 if (get_user(p, pages + i))
1768 goto out_flush;
1769 if (get_user(node, nodes + i))
1770 goto out_flush;
1771 addr = (unsigned long)untagged_addr(p);
1772
1773 err = -ENODEV;
1774 if (node < 0 || node >= MAX_NUMNODES)
1775 goto out_flush;
1776 if (!node_state(node, N_MEMORY))
1777 goto out_flush;
1778
1779 err = -EACCES;
1780 if (!node_isset(node, task_nodes))
1781 goto out_flush;
1782
1783 if (current_node == NUMA_NO_NODE) {
1784 current_node = node;
1785 start = i;
1786 } else if (node != current_node) {
1787 err = move_pages_and_store_status(mm, current_node,
1788 &pagelist, status, start, i, nr_pages);
1789 if (err)
1790 goto out;
1791 start = i;
1792 current_node = node;
1793 }
1794
/*
 * Errors in the page lookup or isolation are not fatal: they are
 * simply reported through the status array and we move on to the
 * next page.
 */
1799 err = add_page_for_migration(mm, addr, current_node,
1800 &pagelist, flags & MPOL_MF_MOVE_ALL);
1801
1802 if (err > 0) {
1803
1804 continue;
1805 }
1806
1807
1808
1809
1810
1811 if (err == -EEXIST)
1812 err = -EFAULT;
1813
1814
1815
1816
1817
1818 err = store_status(status, i, err ? : current_node, 1);
1819 if (err)
1820 goto out_flush;
1821
1822 err = move_pages_and_store_status(mm, current_node, &pagelist,
1823 status, start, i, nr_pages);
1824 if (err)
1825 goto out;
1826 current_node = NUMA_NO_NODE;
1827 }
1828 out_flush:
1829
1830 err1 = move_pages_and_store_status(mm, current_node, &pagelist,
1831 status, start, i, nr_pages);
1832 if (err >= 0)
1833 err = err1;
1834 out:
1835 lru_cache_enable();
1836 return err;
1837 }
1838
/*
 * Determine the nodes of an array of pages and store it in an array of status.
 */
1842 static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
1843 const void __user **pages, int *status)
1844 {
1845 unsigned long i;
1846
1847 mmap_read_lock(mm);
1848
1849 for (i = 0; i < nr_pages; i++) {
1850 unsigned long addr = (unsigned long)(*pages);
1851 struct vm_area_struct *vma;
1852 struct page *page;
1853 int err = -EFAULT;
1854
1855 vma = vma_lookup(mm, addr);
1856 if (!vma)
1857 goto set_status;
1858
1859
1860 page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
1861
1862 err = PTR_ERR(page);
1863 if (IS_ERR(page))
1864 goto set_status;
1865
1866 if (page && !is_zone_device_page(page)) {
1867 err = page_to_nid(page);
1868 put_page(page);
1869 } else {
1870 err = -ENOENT;
1871 }
1872 set_status:
1873 *status = err;
1874
1875 pages++;
1876 status++;
1877 }
1878
1879 mmap_read_unlock(mm);
1880 }
1881
1882 static int get_compat_pages_array(const void __user *chunk_pages[],
1883 const void __user * __user *pages,
1884 unsigned long chunk_nr)
1885 {
1886 compat_uptr_t __user *pages32 = (compat_uptr_t __user *)pages;
1887 compat_uptr_t p;
1888 int i;
1889
1890 for (i = 0; i < chunk_nr; i++) {
1891 if (get_user(p, pages32 + i))
1892 return -EFAULT;
1893 chunk_pages[i] = compat_ptr(p);
1894 }
1895
1896 return 0;
1897 }
1898
/*
 * Determine the nodes of a user array of pages and store it in
 * a user array of status.
 */
1903 static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
1904 const void __user * __user *pages,
1905 int __user *status)
1906 {
1907 #define DO_PAGES_STAT_CHUNK_NR 16UL
1908 const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR];
1909 int chunk_status[DO_PAGES_STAT_CHUNK_NR];
1910
1911 while (nr_pages) {
1912 unsigned long chunk_nr = min(nr_pages, DO_PAGES_STAT_CHUNK_NR);
1913
1914 if (in_compat_syscall()) {
1915 if (get_compat_pages_array(chunk_pages, pages,
1916 chunk_nr))
1917 break;
1918 } else {
1919 if (copy_from_user(chunk_pages, pages,
1920 chunk_nr * sizeof(*chunk_pages)))
1921 break;
1922 }
1923
1924 do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status);
1925
1926 if (copy_to_user(status, chunk_status, chunk_nr * sizeof(*status)))
1927 break;
1928
1929 pages += chunk_nr;
1930 status += chunk_nr;
1931 nr_pages -= chunk_nr;
1932 }
1933 return nr_pages ? -EFAULT : 0;
1934 }
1935
1936 static struct mm_struct *find_mm_struct(pid_t pid, nodemask_t *mem_nodes)
1937 {
1938 struct task_struct *task;
1939 struct mm_struct *mm;
1940
1941
1942
1943
1944
1945 if (!pid) {
1946 mmget(current->mm);
1947 *mem_nodes = cpuset_mems_allowed(current);
1948 return current->mm;
1949 }
1950
1951
1952 rcu_read_lock();
1953 task = find_task_by_vpid(pid);
1954 if (!task) {
1955 rcu_read_unlock();
1956 return ERR_PTR(-ESRCH);
1957 }
1958 get_task_struct(task);
1959
1960
1961
1962
1963
1964 if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
1965 rcu_read_unlock();
1966 mm = ERR_PTR(-EPERM);
1967 goto out;
1968 }
1969 rcu_read_unlock();
1970
1971 mm = ERR_PTR(security_task_movememory(task));
1972 if (IS_ERR(mm))
1973 goto out;
1974 *mem_nodes = cpuset_mems_allowed(task);
1975 mm = get_task_mm(task);
1976 out:
1977 put_task_struct(task);
1978 if (!mm)
1979 mm = ERR_PTR(-EINVAL);
1980 return mm;
1981 }
1982
/*
 * Move a list of pages in the address space of the currently executing
 * process.
 */
1987 static int kernel_move_pages(pid_t pid, unsigned long nr_pages,
1988 const void __user * __user *pages,
1989 const int __user *nodes,
1990 int __user *status, int flags)
1991 {
1992 struct mm_struct *mm;
1993 int err;
1994 nodemask_t task_nodes;
1995
1996
1997 if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
1998 return -EINVAL;
1999
2000 if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
2001 return -EPERM;
2002
2003 mm = find_mm_struct(pid, &task_nodes);
2004 if (IS_ERR(mm))
2005 return PTR_ERR(mm);
2006
2007 if (nodes)
2008 err = do_pages_move(mm, task_nodes, nr_pages, pages,
2009 nodes, status, flags);
2010 else
2011 err = do_pages_stat(mm, nr_pages, pages, status);
2012
2013 mmput(mm);
2014 return err;
2015 }
2016
2017 SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
2018 const void __user * __user *, pages,
2019 const int __user *, nodes,
2020 int __user *, status, int, flags)
2021 {
2022 return kernel_move_pages(pid, nr_pages, pages, nodes, status, flags);
2023 }
2024
2025 #ifdef CONFIG_NUMA_BALANCING
/*
 * Returns true if this is a safe migration target node for misplaced NUMA
 * pages. Currently it only checks the watermarks which is crude.
 */
2030 static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
2031 unsigned long nr_migrate_pages)
2032 {
2033 int z;
2034
2035 for (z = pgdat->nr_zones - 1; z >= 0; z--) {
2036 struct zone *zone = pgdat->node_zones + z;
2037
2038 if (!managed_zone(zone))
2039 continue;
2040
2041
2042 if (!zone_watermark_ok(zone, 0,
2043 high_wmark_pages(zone) +
2044 nr_migrate_pages,
2045 ZONE_MOVABLE, 0))
2046 continue;
2047 return true;
2048 }
2049 return false;
2050 }
2051
2052 static struct page *alloc_misplaced_dst_page(struct page *page,
2053 unsigned long data)
2054 {
2055 int nid = (int) data;
2056 int order = compound_order(page);
2057 gfp_t gfp = __GFP_THISNODE;
2058 struct folio *new;
2059
2060 if (order > 0)
2061 gfp |= GFP_TRANSHUGE_LIGHT;
2062 else {
2063 gfp |= GFP_HIGHUSER_MOVABLE | __GFP_NOMEMALLOC | __GFP_NORETRY |
2064 __GFP_NOWARN;
2065 gfp &= ~__GFP_RECLAIM;
2066 }
2067 new = __folio_alloc_node(gfp, order, nid);
2068
2069 return &new->page;
2070 }
2071
2072 static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
2073 {
2074 int nr_pages = thp_nr_pages(page);
2075 int order = compound_order(page);
2076
2077 VM_BUG_ON_PAGE(order && !PageTransHuge(page), page);
2078
2079
2080 if (PageTransHuge(page) && total_mapcount(page) > 1)
2081 return 0;
2082
2083
2084 if (!migrate_balanced_pgdat(pgdat, nr_pages)) {
2085 int z;
2086
2087 if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING))
2088 return 0;
2089 for (z = pgdat->nr_zones - 1; z >= 0; z--) {
2090 if (managed_zone(pgdat->node_zones + z))
2091 break;
2092 }
2093 wakeup_kswapd(pgdat->node_zones + z, 0, order, ZONE_MOVABLE);
2094 return 0;
2095 }
2096
2097 if (isolate_lru_page(page))
2098 return 0;
2099
2100 mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_is_file_lru(page),
2101 nr_pages);
2102
2103
2104
2105
2106
2107
2108 put_page(page);
2109 return 1;
2110 }
2111
/*
 * Attempt to migrate a misplaced page to the specified destination node.
 * The caller is expected to hold an elevated reference on the page, which
 * this function drops before returning.
 */
2117 int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
2118 int node)
2119 {
2120 pg_data_t *pgdat = NODE_DATA(node);
2121 int isolated;
2122 int nr_remaining;
2123 unsigned int nr_succeeded;
2124 LIST_HEAD(migratepages);
2125 int nr_pages = thp_nr_pages(page);
2126
2127
2128
2129
2130
2131 if (page_mapcount(page) != 1 && page_is_file_lru(page) &&
2132 (vma->vm_flags & VM_EXEC))
2133 goto out;
2134
2135
2136
2137
2138
2139 if (page_is_file_lru(page) && PageDirty(page))
2140 goto out;
2141
2142 isolated = numamigrate_isolate_page(pgdat, page);
2143 if (!isolated)
2144 goto out;
2145
2146 list_add(&page->lru, &migratepages);
2147 nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page,
2148 NULL, node, MIGRATE_ASYNC,
2149 MR_NUMA_MISPLACED, &nr_succeeded);
2150 if (nr_remaining) {
2151 if (!list_empty(&migratepages)) {
2152 list_del(&page->lru);
2153 mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
2154 page_is_file_lru(page), -nr_pages);
2155 putback_lru_page(page);
2156 }
2157 isolated = 0;
2158 }
2159 if (nr_succeeded) {
2160 count_vm_numa_events(NUMA_PAGE_MIGRATE, nr_succeeded);
2161 if (!node_is_toptier(page_to_nid(page)) && node_is_toptier(node))
2162 mod_node_page_state(pgdat, PGPROMOTE_SUCCESS,
2163 nr_succeeded);
2164 }
2165 BUG_ON(!list_empty(&migratepages));
2166 return isolated;
2167
2168 out:
2169 put_page(page);
2170 return 0;
2171 }
2172 #endif
2173
/*
 * Reclaim-based memory-tier demotion.
 *
 * Consider a system where each socket has several classes of memory
 * attached (for example fast DRAM plus slower memory such as PMEM),
 * each class sitting in its own NUMA node, with the CPUs placed in the
 * nodes holding the fast memory.  When a fast node fills up, reclaim
 * can migrate ("demote") its contents to the next-slower node instead
 * of swapping them out, and so on down the chain: the demotion path
 * starts at the nodes with CPUs, where allocations default to, and
 * ends at the slowest nodes.
 *
 * node_demotion[] records, for every node, the set of next-tier nodes
 * that reclaim may demote its pages to.  The table is rebuilt whenever
 * the set of online nodes with memory or CPUs changes, and readers
 * walk it under RCU so that a node that has just gone offline is never
 * handed out as a demotion target.
 */
2228 #define DEFAULT_DEMOTION_TARGET_NODES 15
2229
2230 #if MAX_NUMNODES < DEFAULT_DEMOTION_TARGET_NODES
2231 #define DEMOTION_TARGET_NODES (MAX_NUMNODES - 1)
2232 #else
2233 #define DEMOTION_TARGET_NODES DEFAULT_DEMOTION_TARGET_NODES
2234 #endif
2235
2236 struct demotion_nodes {
2237 unsigned short nr;
2238 short nodes[DEMOTION_TARGET_NODES];
2239 };
2240
2241 static struct demotion_nodes *node_demotion __read_mostly;
2242
/**
 * next_demotion_node() - Get the next node in the demotion path
 * @node: The starting node to lookup the next node
 *
 * Return: node id for next memory node in the demotion path hierarchy
 * from @node; NUMA_NO_NODE if @node is terminal.  This does not keep
 * the returned node from being taken offline; callers must be prepared
 * for allocations on it to fail.
 */
2252 int next_demotion_node(int node)
2253 {
2254 struct demotion_nodes *nd;
2255 unsigned short target_nr, index;
2256 int target;
2257
2258 if (!node_demotion)
2259 return NUMA_NO_NODE;
2260
2261 nd = &node_demotion[node];
2262
/*
 * node_demotion[] is updated without excluding this function from
 * running: the update side first disables all targets and waits for an
 * RCU grace period before rebuilding the table.  Read the target count
 * and the chosen entry with READ_ONCE() under rcu_read_lock() and
 * tolerate a stale answer.
 */
2272 rcu_read_lock();
2273 target_nr = READ_ONCE(nd->nr);
2274
2275 switch (target_nr) {
2276 case 0:
2277 target = NUMA_NO_NODE;
2278 goto out;
2279 case 1:
2280 index = 0;
2281 break;
2282 default:
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295 index = get_random_int() % target_nr;
2296 break;
2297 }
2298
2299 target = READ_ONCE(nd->nodes[index]);
2300
2301 out:
2302 rcu_read_unlock();
2303 return target;
2304 }
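
/*
 * Illustrative sketch (not part of the original file): reclaim (mm/vmscan.c)
 * is the main consumer of next_demotion_node().  Instead of swapping pages
 * out, it can migrate them one tier down when a target exists and
 * numa_demotion_enabled is set.  "alloc_demote_target" and "demote_list" are
 * made-up names for this sketch; the callback allocates on the target node.
 *
 *	int target_nid = next_demotion_node(pgdat->node_id);
 *
 *	if (numa_demotion_enabled && target_nid != NUMA_NO_NODE)
 *		err = migrate_pages(&demote_list, alloc_demote_target, NULL,
 *				    target_nid, MIGRATE_ASYNC, MR_DEMOTION,
 *				    &nr_succeeded);
 */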
2305
2306
2307 static void __disable_all_migrate_targets(void)
2308 {
2309 int node, i;
2310
2311 if (!node_demotion)
2312 return;
2313
2314 for_each_online_node(node) {
2315 node_demotion[node].nr = 0;
2316 for (i = 0; i < DEMOTION_TARGET_NODES; i++)
2317 node_demotion[node].nodes[i] = NUMA_NO_NODE;
2318 }
2319 }
2320
2321 static void disable_all_migrate_targets(void)
2322 {
2323 __disable_all_migrate_targets();
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337 synchronize_rcu();
2338 }
2339
/*
 * Find an automatic demotion target for 'node'.
 * Failing here is OK.  It might just indicate
 * being at the end of a chain.
 */
2345 static int establish_migrate_target(int node, nodemask_t *used,
2346 int best_distance)
2347 {
2348 int migration_target, index, val;
2349 struct demotion_nodes *nd;
2350
2351 if (!node_demotion)
2352 return NUMA_NO_NODE;
2353
2354 nd = &node_demotion[node];
2355
2356 migration_target = find_next_best_node(node, used);
2357 if (migration_target == NUMA_NO_NODE)
2358 return NUMA_NO_NODE;
2359
2360
2361
2362
2363
2364
2365
2366 if (best_distance != -1) {
2367 val = node_distance(node, migration_target);
2368 if (val > best_distance)
2369 goto out_clear;
2370 }
2371
2372 index = nd->nr;
2373 if (WARN_ONCE(index >= DEMOTION_TARGET_NODES,
2374 "Exceeds maximum demotion target nodes\n"))
2375 goto out_clear;
2376
2377 nd->nodes[index] = migration_target;
2378 nd->nr++;
2379
2380 return migration_target;
2381 out_clear:
2382 node_clear(migration_target, *used);
2383 return NUMA_NO_NODE;
2384 }
2385
/*
 * When memory fills up on a node, its contents can be migrated to a
 * less-full node in the next memory tier.  Establish the demotion
 * targets for every node: walk outward from the nodes that have CPUs
 * (the top tier) and, pass by pass, hand out the closest not-yet-used
 * nodes as targets.  The used-node mask guarantees that each node
 * demotes strictly "down" the tiers and that no cycles can form.
 */
2406 static void __set_migration_target_nodes(void)
2407 {
2408 nodemask_t next_pass;
2409 nodemask_t this_pass;
2410 nodemask_t used_targets = NODE_MASK_NONE;
2411 int node, best_distance;
2412
2413
2414
2415
2416
2417
2418 disable_all_migrate_targets();
2419
2420
2421
2422
2423
2424 next_pass = node_states[N_CPU];
2425 again:
2426 this_pass = next_pass;
2427 next_pass = NODE_MASK_NONE;
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439 nodes_or(used_targets, used_targets, this_pass);
2440
2441 for_each_node_mask(node, this_pass) {
2442 best_distance = -1;
2443
2444
2445
2446
2447
2448
2449 do {
2450 int target_node =
2451 establish_migrate_target(node, &used_targets,
2452 best_distance);
2453
2454 if (target_node == NUMA_NO_NODE)
2455 break;
2456
2457 if (best_distance == -1)
2458 best_distance = node_distance(node, target_node);
2459
2460
2461
2462
2463
2464
2465 node_set(target_node, next_pass);
2466 } while (1);
2467 }
2468
2469
2470
2471
2472
2473 if (!nodes_empty(next_pass))
2474 goto again;
2475 }
2476
/*
 * For callers that do not hold get_online_mems() already.
 */
2480 void set_migration_target_nodes(void)
2481 {
2482 get_online_mems();
2483 __set_migration_target_nodes();
2484 put_online_mems();
2485 }
2486
/*
 * React to hotplug events that change the set of usable nodes.  This
 * leaves reclaim-based migration transiently disabled between the
 * MEM_GOING_OFFLINE and MEM_OFFLINE events.  It runs whether or not
 * reclaim-based migration is currently enabled, so the user can toggle
 * numa_demotion_enabled at any time without the targets having to be
 * recomputed first.
 */
2498 #ifdef CONFIG_MEMORY_HOTPLUG
2499 static int __meminit migrate_on_reclaim_callback(struct notifier_block *self,
2500 unsigned long action, void *_arg)
2501 {
2502 struct memory_notify *arg = _arg;
2503
2504
2505
2506
2507
2508
2509 if (arg->status_change_nid < 0)
2510 return notifier_from_errno(0);
2511
2512 switch (action) {
2513 case MEM_GOING_OFFLINE:
2514
2515
2516
2517
2518
2519
2520 disable_all_migrate_targets();
2521 break;
2522 case MEM_OFFLINE:
2523 case MEM_ONLINE:
2524
2525
2526
2527
2528 __set_migration_target_nodes();
2529 break;
2530 case MEM_CANCEL_OFFLINE:
2531
2532
2533
2534
2535 __set_migration_target_nodes();
2536 break;
2537 case MEM_GOING_ONLINE:
2538 case MEM_CANCEL_ONLINE:
2539 break;
2540 }
2541
2542 return notifier_from_errno(0);
2543 }
2544 #endif
2545
2546 void __init migrate_on_reclaim_init(void)
2547 {
2548 node_demotion = kcalloc(nr_node_ids,
2549 sizeof(struct demotion_nodes),
2550 GFP_KERNEL);
2551 WARN_ON(!node_demotion);
2552 #ifdef CONFIG_MEMORY_HOTPLUG
2553 hotplug_memory_notifier(migrate_on_reclaim_callback, 100);
2554 #endif
2555
2556
2557
2558
2559
2560
2561 cpus_read_lock();
2562 set_migration_target_nodes();
2563 cpus_read_unlock();
2564 }
2565
2566 bool numa_demotion_enabled = false;
2567
2568 #ifdef CONFIG_SYSFS
2569 static ssize_t numa_demotion_enabled_show(struct kobject *kobj,
2570 struct kobj_attribute *attr, char *buf)
2571 {
2572 return sysfs_emit(buf, "%s\n",
2573 numa_demotion_enabled ? "true" : "false");
2574 }
2575
2576 static ssize_t numa_demotion_enabled_store(struct kobject *kobj,
2577 struct kobj_attribute *attr,
2578 const char *buf, size_t count)
2579 {
2580 ssize_t ret;
2581
2582 ret = kstrtobool(buf, &numa_demotion_enabled);
2583 if (ret)
2584 return ret;
2585
2586 return count;
2587 }
2588
2589 static struct kobj_attribute numa_demotion_enabled_attr =
2590 __ATTR(demotion_enabled, 0644, numa_demotion_enabled_show,
2591 numa_demotion_enabled_store);
2592
2593 static struct attribute *numa_attrs[] = {
2594 &numa_demotion_enabled_attr.attr,
2595 NULL,
2596 };
2597
2598 static const struct attribute_group numa_attr_group = {
2599 .attrs = numa_attrs,
2600 };
2601
2602 static int __init numa_init_sysfs(void)
2603 {
2604 int err;
2605 struct kobject *numa_kobj;
2606
2607 numa_kobj = kobject_create_and_add("numa", mm_kobj);
2608 if (!numa_kobj) {
2609 pr_err("failed to create numa kobject\n");
2610 return -ENOMEM;
2611 }
2612 err = sysfs_create_group(numa_kobj, &numa_attr_group);
2613 if (err) {
2614 pr_err("failed to register numa group\n");
2615 goto delete_obj;
2616 }
2617 return 0;
2618
2619 delete_obj:
2620 kobject_put(numa_kobj);
2621 return err;
2622 }
2623 subsys_initcall(numa_init_sysfs);
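
/*
 * Usage note (not part of the original file): with CONFIG_SYSFS enabled the
 * knob created above appears as /sys/kernel/mm/numa/demotion_enabled, since
 * the "numa" kobject is created under mm_kobj.  Reclaim-based demotion can
 * then be toggled at runtime, e.g.:
 *
 *	echo true  > /sys/kernel/mm/numa/demotion_enabled
 *	echo false > /sys/kernel/mm/numa/demotion_enabled
 */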
2624 #endif
2625 #endif