/*
 * mm/rmap.c - physical-to-virtual reverse mappings
 *
 * Reverse-mapping machinery: given a page or folio, find every page table
 * entry that maps it.  Anonymous memory is tracked through anon_vma
 * structures and their interval trees of VMAs; file-backed memory is
 * tracked through the address_space i_mmap interval tree.  On top of this
 * sit folio_referenced(), folio_mkclean(), try_to_unmap(),
 * try_to_migrate() and the generic rmap_walk() used by the rest of mm.
 */
0055 #include <linux/mm.h>
0056 #include <linux/sched/mm.h>
0057 #include <linux/sched/task.h>
0058 #include <linux/pagemap.h>
0059 #include <linux/swap.h>
0060 #include <linux/swapops.h>
0061 #include <linux/slab.h>
0062 #include <linux/init.h>
0063 #include <linux/ksm.h>
0064 #include <linux/rmap.h>
0065 #include <linux/rcupdate.h>
0066 #include <linux/export.h>
0067 #include <linux/memcontrol.h>
0068 #include <linux/mmu_notifier.h>
0069 #include <linux/migrate.h>
0070 #include <linux/hugetlb.h>
0071 #include <linux/huge_mm.h>
0072 #include <linux/backing-dev.h>
0073 #include <linux/page_idle.h>
0074 #include <linux/memremap.h>
0075 #include <linux/userfaultfd_k.h>
0076 #include <linux/mm_inline.h>
0077
0078 #include <asm/tlbflush.h>
0079
0080 #define CREATE_TRACE_POINTS
0081 #include <trace/events/tlb.h>
0082 #include <trace/events/migrate.h>
0083
0084 #include "internal.h"
0085
0086 static struct kmem_cache *anon_vma_cachep;
0087 static struct kmem_cache *anon_vma_chain_cachep;
0088
0089 static inline struct anon_vma *anon_vma_alloc(void)
0090 {
0091 struct anon_vma *anon_vma;
0092
0093 anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
0094 if (anon_vma) {
0095 atomic_set(&anon_vma->refcount, 1);
0096 anon_vma->num_children = 0;
0097 anon_vma->num_active_vmas = 0;
0098 anon_vma->parent = anon_vma;
0099
0100
0101
0102
0103 anon_vma->root = anon_vma;
0104 }
0105
0106 return anon_vma;
0107 }
0108
0109 static inline void anon_vma_free(struct anon_vma *anon_vma)
0110 {
0111 VM_BUG_ON(atomic_read(&anon_vma->refcount));
0112
0113
0114
0115
0116
0117
0118
0119
0120
0121
0122
0123
0124
0125
0126
0127
0128
0129
0130 might_sleep();
0131 if (rwsem_is_locked(&anon_vma->root->rwsem)) {
0132 anon_vma_lock_write(anon_vma);
0133 anon_vma_unlock_write(anon_vma);
0134 }
0135
0136 kmem_cache_free(anon_vma_cachep, anon_vma);
0137 }
0138
0139 static inline struct anon_vma_chain *anon_vma_chain_alloc(gfp_t gfp)
0140 {
0141 return kmem_cache_alloc(anon_vma_chain_cachep, gfp);
0142 }
0143
0144 static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
0145 {
0146 kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain);
0147 }
0148
0149 static void anon_vma_chain_link(struct vm_area_struct *vma,
0150 struct anon_vma_chain *avc,
0151 struct anon_vma *anon_vma)
0152 {
0153 avc->vma = vma;
0154 avc->anon_vma = anon_vma;
0155 list_add(&avc->same_vma, &vma->anon_vma_chain);
0156 anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
0157 }
0158
0159
0160
0161
0162
0163
0164
0165
0166
0167
0168
0169
0170
0171
0172
0173
0174
0175
0176
0177
0178
0179
0180
0181
0182
0183
0184
0185
0186
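/*
 * Attach an anon_vma to @vma so anonymous pages mapped into it can be
 * tracked by rmap.  A mergeable anon_vma from an adjacent VMA is reused
 * when possible; otherwise a fresh one is allocated.  The check and
 * assignment of vma->anon_vma happen under the anon_vma write lock and
 * mm->page_table_lock, so an allocation that loses the race is simply
 * dropped again.  Returns 0 on success, -ENOMEM on allocation failure.
 */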
0187 int __anon_vma_prepare(struct vm_area_struct *vma)
0188 {
0189 struct mm_struct *mm = vma->vm_mm;
0190 struct anon_vma *anon_vma, *allocated;
0191 struct anon_vma_chain *avc;
0192
0193 might_sleep();
0194
0195 avc = anon_vma_chain_alloc(GFP_KERNEL);
0196 if (!avc)
0197 goto out_enomem;
0198
0199 anon_vma = find_mergeable_anon_vma(vma);
0200 allocated = NULL;
0201 if (!anon_vma) {
0202 anon_vma = anon_vma_alloc();
0203 if (unlikely(!anon_vma))
0204 goto out_enomem_free_avc;
0205 anon_vma->num_children++;
0206 allocated = anon_vma;
0207 }
0208
0209 anon_vma_lock_write(anon_vma);
0210
0211 spin_lock(&mm->page_table_lock);
0212 if (likely(!vma->anon_vma)) {
0213 vma->anon_vma = anon_vma;
0214 anon_vma_chain_link(vma, avc, anon_vma);
0215 anon_vma->num_active_vmas++;
0216 allocated = NULL;
0217 avc = NULL;
0218 }
0219 spin_unlock(&mm->page_table_lock);
0220 anon_vma_unlock_write(anon_vma);
0221
0222 if (unlikely(allocated))
0223 put_anon_vma(allocated);
0224 if (unlikely(avc))
0225 anon_vma_chain_free(avc);
0226
0227 return 0;
0228
0229 out_enomem_free_avc:
0230 anon_vma_chain_free(avc);
0231 out_enomem:
0232 return -ENOMEM;
0233 }
0234
0235
0236
0237
0238
0239
0240
0241
0242
0243 static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct anon_vma *anon_vma)
0244 {
0245 struct anon_vma *new_root = anon_vma->root;
0246 if (new_root != root) {
0247 if (WARN_ON_ONCE(root))
0248 up_write(&root->rwsem);
0249 root = new_root;
0250 down_write(&root->rwsem);
0251 }
0252 return root;
0253 }
0254
0255 static inline void unlock_anon_vma_root(struct anon_vma *root)
0256 {
0257 if (root)
0258 up_write(&root->rwsem);
0259 }
0260
0261
0262
0263
0264
0265
0266
0267
0268
0269
0270
0271
0272
0273
0274
0275
0276
0277
0278
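/*
 * Attach @dst to the anon_vma chain of @src: every anon_vma that @src is
 * linked to gets a new anon_vma_chain linking @dst as well, so rmap can
 * find pages through either VMA.  An existing anon_vma with at most one
 * child and no active VMAs may be reused as dst->anon_vma to limit
 * anon_vma proliferation.  Returns 0 on success or -ENOMEM, in which case
 * any links already made are torn down again.
 */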
0279 int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
0280 {
0281 struct anon_vma_chain *avc, *pavc;
0282 struct anon_vma *root = NULL;
0283
0284 list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
0285 struct anon_vma *anon_vma;
0286
0287 avc = anon_vma_chain_alloc(GFP_NOWAIT | __GFP_NOWARN);
0288 if (unlikely(!avc)) {
0289 unlock_anon_vma_root(root);
0290 root = NULL;
0291 avc = anon_vma_chain_alloc(GFP_KERNEL);
0292 if (!avc)
0293 goto enomem_failure;
0294 }
0295 anon_vma = pavc->anon_vma;
0296 root = lock_anon_vma_root(root, anon_vma);
0297 anon_vma_chain_link(dst, avc, anon_vma);
0298
0299
0300
0301
0302
0303
0304
0305
0306 if (!dst->anon_vma && src->anon_vma &&
0307 anon_vma->num_children < 2 &&
0308 anon_vma->num_active_vmas == 0)
0309 dst->anon_vma = anon_vma;
0310 }
0311 if (dst->anon_vma)
0312 dst->anon_vma->num_active_vmas++;
0313 unlock_anon_vma_root(root);
0314 return 0;
0315
0316 enomem_failure:
0317
0318
0319
0320
0321
0322
0323 dst->anon_vma = NULL;
0324 unlink_anon_vmas(dst);
0325 return -ENOMEM;
0326 }
0327
0328
0329
0330
0331
0332
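/*
 * Set up the anon_vma hierarchy for a VMA copied at fork time.  The
 * parent's chain is cloned into @vma first; if no existing anon_vma could
 * be reused, a new one is allocated for the child, sharing the parent's
 * root (and taking a reference on that root).  Returns 0 or -ENOMEM.
 */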
0333 int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
0334 {
0335 struct anon_vma_chain *avc;
0336 struct anon_vma *anon_vma;
0337 int error;
0338
0339
0340 if (!pvma->anon_vma)
0341 return 0;
0342
0343
0344 vma->anon_vma = NULL;
0345
0346
0347
0348
0349
0350 error = anon_vma_clone(vma, pvma);
0351 if (error)
0352 return error;
0353
0354
0355 if (vma->anon_vma)
0356 return 0;
0357
0358
0359 anon_vma = anon_vma_alloc();
0360 if (!anon_vma)
0361 goto out_error;
0362 anon_vma->num_active_vmas++;
0363 avc = anon_vma_chain_alloc(GFP_KERNEL);
0364 if (!avc)
0365 goto out_error_free_anon_vma;
0366
0367
0368
0369
0370
0371 anon_vma->root = pvma->anon_vma->root;
0372 anon_vma->parent = pvma->anon_vma;
0373
0374
0375
0376
0377
0378 get_anon_vma(anon_vma->root);
0379
0380 vma->anon_vma = anon_vma;
0381 anon_vma_lock_write(anon_vma);
0382 anon_vma_chain_link(vma, avc, anon_vma);
0383 anon_vma->parent->num_children++;
0384 anon_vma_unlock_write(anon_vma);
0385
0386 return 0;
0387
0388 out_error_free_anon_vma:
0389 put_anon_vma(anon_vma);
0390 out_error:
0391 unlink_anon_vmas(vma);
0392 return -ENOMEM;
0393 }
0394
0395 void unlink_anon_vmas(struct vm_area_struct *vma)
0396 {
0397 struct anon_vma_chain *avc, *next;
0398 struct anon_vma *root = NULL;
0399
0400
0401
0402
0403
0404 list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
0405 struct anon_vma *anon_vma = avc->anon_vma;
0406
0407 root = lock_anon_vma_root(root, anon_vma);
0408 anon_vma_interval_tree_remove(avc, &anon_vma->rb_root);
0409
0410
0411
0412
0413
0414 if (RB_EMPTY_ROOT(&anon_vma->rb_root.rb_root)) {
0415 anon_vma->parent->num_children--;
0416 continue;
0417 }
0418
0419 list_del(&avc->same_vma);
0420 anon_vma_chain_free(avc);
0421 }
0422 if (vma->anon_vma) {
0423 vma->anon_vma->num_active_vmas--;
0424
0425
0426
0427
0428
0429 vma->anon_vma = NULL;
0430 }
0431 unlock_anon_vma_root(root);
0432
0433
0434
0435
0436
0437
0438 list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
0439 struct anon_vma *anon_vma = avc->anon_vma;
0440
0441 VM_WARN_ON(anon_vma->num_children);
0442 VM_WARN_ON(anon_vma->num_active_vmas);
0443 put_anon_vma(anon_vma);
0444
0445 list_del(&avc->same_vma);
0446 anon_vma_chain_free(avc);
0447 }
0448 }
0449
0450 static void anon_vma_ctor(void *data)
0451 {
0452 struct anon_vma *anon_vma = data;
0453
0454 init_rwsem(&anon_vma->rwsem);
0455 atomic_set(&anon_vma->refcount, 0);
0456 anon_vma->rb_root = RB_ROOT_CACHED;
0457 }
0458
0459 void __init anon_vma_init(void)
0460 {
0461 anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
0462 0, SLAB_TYPESAFE_BY_RCU|SLAB_PANIC|SLAB_ACCOUNT,
0463 anon_vma_ctor);
0464 anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain,
0465 SLAB_PANIC|SLAB_ACCOUNT);
0466 }
0467
0468
0469
0470
0471
0472
0473
0474
0475
0476
0477
0478
0479
0480
0481
0482
0483
0484
0485
0486
0487
0488
0489
0490
0491
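/*
 * Look up the anon_vma of an anonymous page and take a reference on it.
 * Runs under rcu_read_lock() so it can race with unmapping and freeing:
 * the reference is only taken via atomic_inc_not_zero(), and NULL is
 * returned if the page is (or becomes) unmapped or the anon_vma is
 * already on its way out.
 */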
0492 struct anon_vma *page_get_anon_vma(struct page *page)
0493 {
0494 struct anon_vma *anon_vma = NULL;
0495 unsigned long anon_mapping;
0496
0497 rcu_read_lock();
0498 anon_mapping = (unsigned long)READ_ONCE(page->mapping);
0499 if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
0500 goto out;
0501 if (!page_mapped(page))
0502 goto out;
0503
0504 anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
0505 if (!atomic_inc_not_zero(&anon_vma->refcount)) {
0506 anon_vma = NULL;
0507 goto out;
0508 }
0509
0510
0511
0512
0513
0514
0515
0516
0517 if (!page_mapped(page)) {
0518 rcu_read_unlock();
0519 put_anon_vma(anon_vma);
0520 return NULL;
0521 }
0522 out:
0523 rcu_read_unlock();
0524
0525 return anon_vma;
0526 }
0527
0528
0529
0530
0531
0532
0533
0534
0535
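/*
 * Similar to page_get_anon_vma() but also returns with the anon_vma read
 * lock held.  The fast path trylocks the root rwsem while still under
 * RCU; if that fails and the caller asked for try_lock only, NULL is
 * returned with rwc->contended set.  Otherwise a reference is taken so
 * the lock can be acquired sleepably outside the RCU section.
 */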
0536 struct anon_vma *folio_lock_anon_vma_read(struct folio *folio,
0537 struct rmap_walk_control *rwc)
0538 {
0539 struct anon_vma *anon_vma = NULL;
0540 struct anon_vma *root_anon_vma;
0541 unsigned long anon_mapping;
0542
0543 rcu_read_lock();
0544 anon_mapping = (unsigned long)READ_ONCE(folio->mapping);
0545 if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
0546 goto out;
0547 if (!folio_mapped(folio))
0548 goto out;
0549
0550 anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
0551 root_anon_vma = READ_ONCE(anon_vma->root);
0552 if (down_read_trylock(&root_anon_vma->rwsem)) {
0553
0554
0555
0556
0557
0558 if (!folio_mapped(folio)) {
0559 up_read(&root_anon_vma->rwsem);
0560 anon_vma = NULL;
0561 }
0562 goto out;
0563 }
0564
0565 if (rwc && rwc->try_lock) {
0566 anon_vma = NULL;
0567 rwc->contended = true;
0568 goto out;
0569 }
0570
0571
0572 if (!atomic_inc_not_zero(&anon_vma->refcount)) {
0573 anon_vma = NULL;
0574 goto out;
0575 }
0576
0577 if (!folio_mapped(folio)) {
0578 rcu_read_unlock();
0579 put_anon_vma(anon_vma);
0580 return NULL;
0581 }
0582
0583
0584 rcu_read_unlock();
0585 anon_vma_lock_read(anon_vma);
0586
0587 if (atomic_dec_and_test(&anon_vma->refcount)) {
0588
0589
0590
0591
0592
0593 anon_vma_unlock_read(anon_vma);
0594 __put_anon_vma(anon_vma);
0595 anon_vma = NULL;
0596 }
0597
0598 return anon_vma;
0599
0600 out:
0601 rcu_read_unlock();
0602 return anon_vma;
0603 }
0604
0605 void page_unlock_anon_vma_read(struct anon_vma *anon_vma)
0606 {
0607 anon_vma_unlock_read(anon_vma);
0608 }
0609
0610 #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
0611
0612
0613
0614
0615
0616
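/*
 * Flush any TLB invalidations this task has batched up during reclaim
 * (see set_tlb_ubc_flush_pending()) and reset the batch state.
 */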
0617 void try_to_unmap_flush(void)
0618 {
0619 struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
0620
0621 if (!tlb_ubc->flush_required)
0622 return;
0623
0624 arch_tlbbatch_flush(&tlb_ubc->arch);
0625 tlb_ubc->flush_required = false;
0626 tlb_ubc->writable = false;
0627 }
0628
0629
0630 void try_to_unmap_flush_dirty(void)
0631 {
0632 struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
0633
0634 if (tlb_ubc->writable)
0635 try_to_unmap_flush();
0636 }
0637
0638
0639
0640
0641
0642 #define TLB_FLUSH_BATCH_FLUSHED_SHIFT 16
0643 #define TLB_FLUSH_BATCH_PENDING_MASK \
0644 ((1 << (TLB_FLUSH_BATCH_FLUSHED_SHIFT - 1)) - 1)
0645 #define TLB_FLUSH_BATCH_PENDING_LARGE \
0646 (TLB_FLUSH_BATCH_PENDING_MASK / 2)
0647
0648 static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
0649 {
0650 struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
0651 int batch, nbatch;
0652
0653 arch_tlbbatch_add_mm(&tlb_ubc->arch, mm);
0654 tlb_ubc->flush_required = true;
0655
0656
0657
0658
0659
0660 barrier();
0661 batch = atomic_read(&mm->tlb_flush_batched);
0662 retry:
0663 if ((batch & TLB_FLUSH_BATCH_PENDING_MASK) > TLB_FLUSH_BATCH_PENDING_LARGE) {
0664
0665
0666
0667
0668
0669 nbatch = atomic_cmpxchg(&mm->tlb_flush_batched, batch, 1);
0670 if (nbatch != batch) {
0671 batch = nbatch;
0672 goto retry;
0673 }
0674 } else {
0675 atomic_inc(&mm->tlb_flush_batched);
0676 }
0677
0678
0679
0680
0681
0682
0683 if (writable)
0684 tlb_ubc->writable = true;
0685 }
0686
0687
0688
0689
0690
0691 static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
0692 {
0693 bool should_defer = false;
0694
0695 if (!(flags & TTU_BATCH_FLUSH))
0696 return false;
0697
0698
0699 if (cpumask_any_but(mm_cpumask(mm), get_cpu()) < nr_cpu_ids)
0700 should_defer = true;
0701 put_cpu();
0702
0703 return should_defer;
0704 }
0705
0706
0707
0708
0709
0710
0711
0712
0713
0714
0715
0716
0717
0718
0719
0720
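/*
 * If a parallel reclaimer has batched a TLB flush for this mm but not yet
 * executed it, flush the whole mm here so stale, possibly-writable TLB
 * entries cannot be used while the page table lock is held.  The pending
 * and flushed counters are then folded together with a cmpxchg; losing
 * that race only costs a later, unnecessary flush.
 */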
0721 void flush_tlb_batched_pending(struct mm_struct *mm)
0722 {
0723 int batch = atomic_read(&mm->tlb_flush_batched);
0724 int pending = batch & TLB_FLUSH_BATCH_PENDING_MASK;
0725 int flushed = batch >> TLB_FLUSH_BATCH_FLUSHED_SHIFT;
0726
0727 if (pending != flushed) {
0728 flush_tlb_mm(mm);
0729
0730
0731
0732
0733 atomic_cmpxchg(&mm->tlb_flush_batched, batch,
0734 pending | (pending << TLB_FLUSH_BATCH_FLUSHED_SHIFT));
0735 }
0736 }
0737 #else
0738 static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
0739 {
0740 }
0741
0742 static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
0743 {
0744 return false;
0745 }
0746 #endif
0747
0748
0749
0750
0751
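/*
 * At what user virtual address is @page mapped in @vma?  Returns -EFAULT
 * if the page's anon_vma or file mapping does not match the VMA.
 */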
0752 unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
0753 {
0754 struct folio *folio = page_folio(page);
0755 if (folio_test_anon(folio)) {
0756 struct anon_vma *page__anon_vma = folio_anon_vma(folio);
0757
0758
0759
0760
0761 if (!vma->anon_vma || !page__anon_vma ||
0762 vma->anon_vma->root != page__anon_vma->root)
0763 return -EFAULT;
0764 } else if (!vma->vm_file) {
0765 return -EFAULT;
0766 } else if (vma->vm_file->f_mapping != folio->mapping) {
0767 return -EFAULT;
0768 }
0769
0770 return vma_address(page, vma);
0771 }
0772
0773 pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
0774 {
0775 pgd_t *pgd;
0776 p4d_t *p4d;
0777 pud_t *pud;
0778 pmd_t *pmd = NULL;
0779 pmd_t pmde;
0780
0781 pgd = pgd_offset(mm, address);
0782 if (!pgd_present(*pgd))
0783 goto out;
0784
0785 p4d = p4d_offset(pgd, address);
0786 if (!p4d_present(*p4d))
0787 goto out;
0788
0789 pud = pud_offset(p4d, address);
0790 if (!pud_present(*pud))
0791 goto out;
0792
0793 pmd = pmd_offset(pud, address);
0794
0795
0796
0797
0798
0799 pmde = *pmd;
0800 barrier();
0801 if (!pmd_present(pmde) || pmd_trans_huge(pmde))
0802 pmd = NULL;
0803 out:
0804 return pmd;
0805 }
0806
0807 struct folio_referenced_arg {
0808 int mapcount;
0809 int referenced;
0810 unsigned long vm_flags;
0811 struct mem_cgroup *memcg;
0812 };
0813
0814
0815
0816 static bool folio_referenced_one(struct folio *folio,
0817 struct vm_area_struct *vma, unsigned long address, void *arg)
0818 {
0819 struct folio_referenced_arg *pra = arg;
0820 DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
0821 int referenced = 0;
0822
0823 while (page_vma_mapped_walk(&pvmw)) {
0824 address = pvmw.address;
0825
0826 if ((vma->vm_flags & VM_LOCKED) &&
0827 (!folio_test_large(folio) || !pvmw.pte)) {
0828
0829 mlock_vma_folio(folio, vma, !pvmw.pte);
0830 page_vma_mapped_walk_done(&pvmw);
0831 pra->vm_flags |= VM_LOCKED;
0832 return false;
0833 }
0834
0835 if (pvmw.pte) {
0836 if (ptep_clear_flush_young_notify(vma, address,
0837 pvmw.pte)) {
0838
0839
0840
0841
0842
0843
0844
0845
0846 if (likely(!(vma->vm_flags & VM_SEQ_READ)))
0847 referenced++;
0848 }
0849 } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
0850 if (pmdp_clear_flush_young_notify(vma, address,
0851 pvmw.pmd))
0852 referenced++;
0853 } else {
0854
0855 WARN_ON_ONCE(1);
0856 }
0857
0858 pra->mapcount--;
0859 }
0860
0861 if (referenced)
0862 folio_clear_idle(folio);
0863 if (folio_test_clear_young(folio))
0864 referenced++;
0865
0866 if (referenced) {
0867 pra->referenced++;
0868 pra->vm_flags |= vma->vm_flags & ~VM_LOCKED;
0869 }
0870
0871 if (!pra->mapcount)
0872 return false;
0873
0874 return true;
0875 }
0876
0877 static bool invalid_folio_referenced_vma(struct vm_area_struct *vma, void *arg)
0878 {
0879 struct folio_referenced_arg *pra = arg;
0880 struct mem_cgroup *memcg = pra->memcg;
0881
0882 if (!mm_match_cgroup(vma->vm_mm, memcg))
0883 return true;
0884
0885 return false;
0886 }
0887
0888
0889
0890
0891
0892
0893
0894
0895
0896
0897
0898
0899
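/*
 * Count the VMAs (optionally restricted to @memcg) whose page tables
 * referenced @folio, clearing the accessed bits as it goes, and report
 * the union of their vm_flags in @vm_flags.  Returns -1 if the rmap lock
 * was contended and the try_lock walk gave up, so callers can treat the
 * folio as referenced rather than stall.
 */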
0900 int folio_referenced(struct folio *folio, int is_locked,
0901 struct mem_cgroup *memcg, unsigned long *vm_flags)
0902 {
0903 int we_locked = 0;
0904 struct folio_referenced_arg pra = {
0905 .mapcount = folio_mapcount(folio),
0906 .memcg = memcg,
0907 };
0908 struct rmap_walk_control rwc = {
0909 .rmap_one = folio_referenced_one,
0910 .arg = (void *)&pra,
0911 .anon_lock = folio_lock_anon_vma_read,
0912 .try_lock = true,
0913 };
0914
0915 *vm_flags = 0;
0916 if (!pra.mapcount)
0917 return 0;
0918
0919 if (!folio_raw_mapping(folio))
0920 return 0;
0921
0922 if (!is_locked && (!folio_test_anon(folio) || folio_test_ksm(folio))) {
0923 we_locked = folio_trylock(folio);
0924 if (!we_locked)
0925 return 1;
0926 }
0927
0928
0929
0930
0931
0932
0933 if (memcg) {
0934 rwc.invalid_vma = invalid_folio_referenced_vma;
0935 }
0936
0937 rmap_walk(folio, &rwc);
0938 *vm_flags = pra.vm_flags;
0939
0940 if (we_locked)
0941 folio_unlock(folio);
0942
0943 return rwc.contended ? -1 : pra.referenced;
0944 }
0945
0946 static int page_vma_mkclean_one(struct page_vma_mapped_walk *pvmw)
0947 {
0948 int cleaned = 0;
0949 struct vm_area_struct *vma = pvmw->vma;
0950 struct mmu_notifier_range range;
0951 unsigned long address = pvmw->address;
0952
0953
0954
0955
0956
0957 mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE,
0958 0, vma, vma->vm_mm, address,
0959 vma_address_end(pvmw));
0960 mmu_notifier_invalidate_range_start(&range);
0961
0962 while (page_vma_mapped_walk(pvmw)) {
0963 int ret = 0;
0964
0965 address = pvmw->address;
0966 if (pvmw->pte) {
0967 pte_t entry;
0968 pte_t *pte = pvmw->pte;
0969
0970 if (!pte_dirty(*pte) && !pte_write(*pte))
0971 continue;
0972
0973 flush_cache_page(vma, address, pte_pfn(*pte));
0974 entry = ptep_clear_flush(vma, address, pte);
0975 entry = pte_wrprotect(entry);
0976 entry = pte_mkclean(entry);
0977 set_pte_at(vma->vm_mm, address, pte, entry);
0978 ret = 1;
0979 } else {
0980 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
0981 pmd_t *pmd = pvmw->pmd;
0982 pmd_t entry;
0983
0984 if (!pmd_dirty(*pmd) && !pmd_write(*pmd))
0985 continue;
0986
0987 flush_cache_range(vma, address,
0988 address + HPAGE_PMD_SIZE);
0989 entry = pmdp_invalidate(vma, address, pmd);
0990 entry = pmd_wrprotect(entry);
0991 entry = pmd_mkclean(entry);
0992 set_pmd_at(vma->vm_mm, address, pmd, entry);
0993 ret = 1;
0994 #else
0995
0996 WARN_ON_ONCE(1);
0997 #endif
0998 }
0999
1000
1001
1002
1003
1004
1005
1006
1007 if (ret)
1008 cleaned++;
1009 }
1010
1011 mmu_notifier_invalidate_range_end(&range);
1012
1013 return cleaned;
1014 }
1015
1016 static bool page_mkclean_one(struct folio *folio, struct vm_area_struct *vma,
1017 unsigned long address, void *arg)
1018 {
1019 DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, PVMW_SYNC);
1020 int *cleaned = arg;
1021
1022 *cleaned += page_vma_mkclean_one(&pvmw);
1023
1024 return true;
1025 }
1026
1027 static bool invalid_mkclean_vma(struct vm_area_struct *vma, void *arg)
1028 {
1029 if (vma->vm_flags & VM_SHARED)
1030 return false;
1031
1032 return true;
1033 }
1034
1035 int folio_mkclean(struct folio *folio)
1036 {
1037 int cleaned = 0;
1038 struct address_space *mapping;
1039 struct rmap_walk_control rwc = {
1040 .arg = (void *)&cleaned,
1041 .rmap_one = page_mkclean_one,
1042 .invalid_vma = invalid_mkclean_vma,
1043 };
1044
1045 BUG_ON(!folio_test_locked(folio));
1046
1047 if (!folio_mapped(folio))
1048 return 0;
1049
1050 mapping = folio_mapping(folio);
1051 if (!mapping)
1052 return 0;
1053
1054 rmap_walk(folio, &rwc);
1055
1056 return cleaned;
1057 }
1058 EXPORT_SYMBOL_GPL(folio_mkclean);
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
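/*
 * Write-protect and mark clean the page table entries mapping the given
 * pfn range at @pgoff inside a VM_SHARED @vma.  Returns the number of
 * entries cleaned.
 */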
1072 int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
1073 struct vm_area_struct *vma)
1074 {
1075 struct page_vma_mapped_walk pvmw = {
1076 .pfn = pfn,
1077 .nr_pages = nr_pages,
1078 .pgoff = pgoff,
1079 .vma = vma,
1080 .flags = PVMW_SYNC,
1081 };
1082
1083 if (invalid_mkclean_vma(vma, NULL))
1084 return 0;
1085
1086 pvmw.address = vma_pgoff_address(pgoff, nr_pages, vma);
1087 VM_BUG_ON_VMA(pvmw.address == -EFAULT, vma);
1088
1089 return page_vma_mkclean_one(&pvmw);
1090 }
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
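/*
 * Re-point page->mapping at @vma's own anon_vma (rather than a parent's)
 * once the page is known to be exclusively owned by this VMA, and mark
 * the subpage anon-exclusive.  Caller must hold the page lock.
 */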
1102 void page_move_anon_rmap(struct page *page, struct vm_area_struct *vma)
1103 {
1104 struct anon_vma *anon_vma = vma->anon_vma;
1105 struct page *subpage = page;
1106
1107 page = compound_head(page);
1108
1109 VM_BUG_ON_PAGE(!PageLocked(page), page);
1110 VM_BUG_ON_VMA(!anon_vma, vma);
1111
1112 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
1113
1114
1115
1116
1117
1118 WRITE_ONCE(page->mapping, (struct address_space *) anon_vma);
1119 SetPageAnonExclusive(subpage);
1120 }
1121
1122
1123
1124
1125
1126
1127
1128
1129 static void __page_set_anon_rmap(struct page *page,
1130 struct vm_area_struct *vma, unsigned long address, int exclusive)
1131 {
1132 struct anon_vma *anon_vma = vma->anon_vma;
1133
1134 BUG_ON(!anon_vma);
1135
1136 if (PageAnon(page))
1137 goto out;
1138
1139
1140
1141
1142
1143
1144 if (!exclusive)
1145 anon_vma = anon_vma->root;
1146
1147
1148
1149
1150
1151
1152
1153 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
1154 WRITE_ONCE(page->mapping, (struct address_space *) anon_vma);
1155 page->index = linear_page_index(vma, address);
1156 out:
1157 if (exclusive)
1158 SetPageAnonExclusive(page);
1159 }
1160
1161
1162
1163
1164
1165
1166
1167 static void __page_check_anon_rmap(struct page *page,
1168 struct vm_area_struct *vma, unsigned long address)
1169 {
1170 struct folio *folio = page_folio(page);
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182 VM_BUG_ON_FOLIO(folio_anon_vma(folio)->root != vma->anon_vma->root,
1183 folio);
1184 VM_BUG_ON_PAGE(page_to_pgoff(page) != linear_page_index(vma, address),
1185 page);
1186 }
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
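/*
 * Account a new pte (or, with RMAP_COMPOUND, pmd) mapping of an anonymous
 * page.  The first mapping updates NR_ANON_MAPPED/NR_ANON_THPS and sets
 * up page->mapping and page->index via __page_set_anon_rmap(); later
 * mappings only sanity-check the existing anon_vma.  The page must be
 * locked unless it is a KSM page.
 */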
1200 void page_add_anon_rmap(struct page *page,
1201 struct vm_area_struct *vma, unsigned long address, rmap_t flags)
1202 {
1203 bool compound = flags & RMAP_COMPOUND;
1204 bool first;
1205
1206 if (unlikely(PageKsm(page)))
1207 lock_page_memcg(page);
1208 else
1209 VM_BUG_ON_PAGE(!PageLocked(page), page);
1210
1211 if (compound) {
1212 atomic_t *mapcount;
1213 VM_BUG_ON_PAGE(!PageLocked(page), page);
1214 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
1215 mapcount = compound_mapcount_ptr(page);
1216 first = atomic_inc_and_test(mapcount);
1217 } else {
1218 first = atomic_inc_and_test(&page->_mapcount);
1219 }
1220 VM_BUG_ON_PAGE(!first && (flags & RMAP_EXCLUSIVE), page);
1221 VM_BUG_ON_PAGE(!first && PageAnonExclusive(page), page);
1222
1223 if (first) {
1224 int nr = compound ? thp_nr_pages(page) : 1;
1225
1226
1227
1228
1229
1230
1231 if (compound)
1232 __mod_lruvec_page_state(page, NR_ANON_THPS, nr);
1233 __mod_lruvec_page_state(page, NR_ANON_MAPPED, nr);
1234 }
1235
1236 if (unlikely(PageKsm(page)))
1237 unlock_page_memcg(page);
1238
1239
1240 else if (first)
1241 __page_set_anon_rmap(page, vma, address,
1242 !!(flags & RMAP_EXCLUSIVE));
1243 else
1244 __page_check_anon_rmap(page, vma, address);
1245
1246 mlock_vma_page(page, vma, compound);
1247 }
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
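/*
 * Like page_add_anon_rmap() but for a freshly allocated page mapped for
 * the first time, so no other process can see it yet: the mapcounts are
 * initialised directly and the mapping is always set up anon-exclusive.
 * The address must lie within @vma.
 */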
1262 void page_add_new_anon_rmap(struct page *page,
1263 struct vm_area_struct *vma, unsigned long address)
1264 {
1265 const bool compound = PageCompound(page);
1266 int nr = compound ? thp_nr_pages(page) : 1;
1267
1268 VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
1269 __SetPageSwapBacked(page);
1270 if (compound) {
1271 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
1272
1273 atomic_set(compound_mapcount_ptr(page), 0);
1274 atomic_set(compound_pincount_ptr(page), 0);
1275
1276 __mod_lruvec_page_state(page, NR_ANON_THPS, nr);
1277 } else {
1278
1279 atomic_set(&page->_mapcount, 0);
1280 }
1281 __mod_lruvec_page_state(page, NR_ANON_MAPPED, nr);
1282 __page_set_anon_rmap(page, vma, address, 1);
1283 }
1284
1285
1286
1287
1288
1289
1290
1291
1292
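/*
 * Account a new mapping of a file or shmem page, updating NR_FILE_MAPPED
 * and, for PMD-mapped THPs, NR_SHMEM_PMDMAPPED/NR_FILE_PMDMAPPED as well
 * as the PageDoubleMap state that tracks simultaneous pte and pmd maps.
 */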
1293 void page_add_file_rmap(struct page *page,
1294 struct vm_area_struct *vma, bool compound)
1295 {
1296 int i, nr = 0;
1297
1298 VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page);
1299 lock_page_memcg(page);
1300 if (compound && PageTransHuge(page)) {
1301 int nr_pages = thp_nr_pages(page);
1302
1303 for (i = 0; i < nr_pages; i++) {
1304 if (atomic_inc_and_test(&page[i]._mapcount))
1305 nr++;
1306 }
1307 if (!atomic_inc_and_test(compound_mapcount_ptr(page)))
1308 goto out;
1309
1310
1311
1312
1313
1314
1315
1316 VM_WARN_ON_ONCE(!PageLocked(page));
1317 if (nr == nr_pages && PageDoubleMap(page))
1318 ClearPageDoubleMap(page);
1319
1320 if (PageSwapBacked(page))
1321 __mod_lruvec_page_state(page, NR_SHMEM_PMDMAPPED,
1322 nr_pages);
1323 else
1324 __mod_lruvec_page_state(page, NR_FILE_PMDMAPPED,
1325 nr_pages);
1326 } else {
1327 if (PageTransCompound(page) && page_mapping(page)) {
1328 VM_WARN_ON_ONCE(!PageLocked(page));
1329 SetPageDoubleMap(compound_head(page));
1330 }
1331 if (atomic_inc_and_test(&page->_mapcount))
1332 nr++;
1333 }
1334 out:
1335 if (nr)
1336 __mod_lruvec_page_state(page, NR_FILE_MAPPED, nr);
1337 unlock_page_memcg(page);
1338
1339 mlock_vma_page(page, vma, compound);
1340 }
1341
1342 static void page_remove_file_rmap(struct page *page, bool compound)
1343 {
1344 int i, nr = 0;
1345
1346 VM_BUG_ON_PAGE(compound && !PageHead(page), page);
1347
1348
1349 if (unlikely(PageHuge(page))) {
1350
1351 atomic_dec(compound_mapcount_ptr(page));
1352 return;
1353 }
1354
1355
1356 if (compound && PageTransHuge(page)) {
1357 int nr_pages = thp_nr_pages(page);
1358
1359 for (i = 0; i < nr_pages; i++) {
1360 if (atomic_add_negative(-1, &page[i]._mapcount))
1361 nr++;
1362 }
1363 if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
1364 goto out;
1365 if (PageSwapBacked(page))
1366 __mod_lruvec_page_state(page, NR_SHMEM_PMDMAPPED,
1367 -nr_pages);
1368 else
1369 __mod_lruvec_page_state(page, NR_FILE_PMDMAPPED,
1370 -nr_pages);
1371 } else {
1372 if (atomic_add_negative(-1, &page->_mapcount))
1373 nr++;
1374 }
1375 out:
1376 if (nr)
1377 __mod_lruvec_page_state(page, NR_FILE_MAPPED, -nr);
1378 }
1379
1380 static void page_remove_anon_compound_rmap(struct page *page)
1381 {
1382 int i, nr;
1383
1384 if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
1385 return;
1386
1387
1388 if (unlikely(PageHuge(page)))
1389 return;
1390
1391 if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
1392 return;
1393
1394 __mod_lruvec_page_state(page, NR_ANON_THPS, -thp_nr_pages(page));
1395
1396 if (TestClearPageDoubleMap(page)) {
1397
1398
1399
1400
1401 for (i = 0, nr = 0; i < thp_nr_pages(page); i++) {
1402 if (atomic_add_negative(-1, &page[i]._mapcount))
1403 nr++;
1404 }
1405
1406
1407
1408
1409
1410
1411 if (nr && nr < thp_nr_pages(page))
1412 deferred_split_huge_page(page);
1413 } else {
1414 nr = thp_nr_pages(page);
1415 }
1416
1417 if (nr)
1418 __mod_lruvec_page_state(page, NR_ANON_MAPPED, -nr);
1419 }
1420
1421
1422
1423
1424
1425
1426
1427
1428
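/*
 * Take down one pte (or pmd, if @compound) mapping of @page and update
 * the mapcount and per-node counters accordingly.  The caller holds the
 * pte lock; the page itself is not freed here, that is left to the final
 * put_page() once the last reference goes away.
 */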
1429 void page_remove_rmap(struct page *page,
1430 struct vm_area_struct *vma, bool compound)
1431 {
1432 lock_page_memcg(page);
1433
1434 if (!PageAnon(page)) {
1435 page_remove_file_rmap(page, compound);
1436 goto out;
1437 }
1438
1439 if (compound) {
1440 page_remove_anon_compound_rmap(page);
1441 goto out;
1442 }
1443
1444
1445 if (!atomic_add_negative(-1, &page->_mapcount))
1446 goto out;
1447
1448
1449
1450
1451
1452
1453 __dec_lruvec_page_state(page, NR_ANON_MAPPED);
1454
1455 if (PageTransCompound(page))
1456 deferred_split_huge_page(compound_head(page));
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467 out:
1468 unlock_page_memcg(page);
1469
1470 munlock_vma_page(page, vma, compound);
1471 }
1472
1473
1474
1475
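/*
 * rmap_one callback for try_to_unmap(): walk every pte mapping @folio in
 * @vma, clear it, and replace it with a swap entry, an hwpoison entry, or
 * nothing at all for clean lazyfree/pte_unused pages.  @arg carries the
 * enum ttu_flags.  Returns false to stop the rmap walk early (mlocked
 * VMA, failed swap allocation, re-dirtied lazyfree page, ...).
 */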
1476 static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
1477 unsigned long address, void *arg)
1478 {
1479 struct mm_struct *mm = vma->vm_mm;
1480 DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
1481 pte_t pteval;
1482 struct page *subpage;
1483 bool anon_exclusive, ret = true;
1484 struct mmu_notifier_range range;
1485 enum ttu_flags flags = (enum ttu_flags)(long)arg;
1486
1487
1488
1489
1490
1491
1492
1493 if (flags & TTU_SYNC)
1494 pvmw.flags = PVMW_SYNC;
1495
1496 if (flags & TTU_SPLIT_HUGE_PMD)
1497 split_huge_pmd_address(vma, address, false, folio);
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507 range.end = vma_address_end(&pvmw);
1508 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
1509 address, range.end);
1510 if (folio_test_hugetlb(folio)) {
1511
1512
1513
1514
1515 adjust_range_if_pmd_sharing_possible(vma, &range.start,
1516 &range.end);
1517 }
1518 mmu_notifier_invalidate_range_start(&range);
1519
1520 while (page_vma_mapped_walk(&pvmw)) {
1521
1522 VM_BUG_ON_FOLIO(!pvmw.pte, folio);
1523
1524
1525
1526
1527 if (!(flags & TTU_IGNORE_MLOCK) &&
1528 (vma->vm_flags & VM_LOCKED)) {
1529
1530 mlock_vma_folio(folio, vma, false);
1531 page_vma_mapped_walk_done(&pvmw);
1532 ret = false;
1533 break;
1534 }
1535
1536 subpage = folio_page(folio,
1537 pte_pfn(*pvmw.pte) - folio_pfn(folio));
1538 address = pvmw.address;
1539 anon_exclusive = folio_test_anon(folio) &&
1540 PageAnonExclusive(subpage);
1541
1542 if (folio_test_hugetlb(folio)) {
1543 bool anon = folio_test_anon(folio);
1544
1545
1546
1547
1548
1549 VM_BUG_ON_PAGE(!PageHWPoison(subpage), subpage);
1550
1551
1552
1553
1554
1555
1556
1557 flush_cache_range(vma, range.start, range.end);
1558
1559
1560
1561
1562
1563
1564 VM_BUG_ON(!anon && !(flags & TTU_RMAP_LOCKED));
1565 if (!anon && huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
1566 flush_tlb_range(vma, range.start, range.end);
1567 mmu_notifier_invalidate_range(mm, range.start,
1568 range.end);
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579 page_vma_mapped_walk_done(&pvmw);
1580 break;
1581 }
1582 pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
1583 } else {
1584 flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
1585
1586
1587
1588
1589 if (should_defer_flush(mm, flags) && !anon_exclusive) {
1590
1591
1592
1593
1594
1595
1596
1597
1598 pteval = ptep_get_and_clear(mm, address, pvmw.pte);
1599
1600 set_tlb_ubc_flush_pending(mm, pte_dirty(pteval));
1601 } else {
1602 pteval = ptep_clear_flush(vma, address, pvmw.pte);
1603 }
1604 }
1605
1606
1607
1608
1609
1610
1611 pte_install_uffd_wp_if_needed(vma, address, pvmw.pte, pteval);
1612
1613
1614 if (pte_dirty(pteval))
1615 folio_mark_dirty(folio);
1616
1617
1618 update_hiwater_rss(mm);
1619
1620 if (PageHWPoison(subpage) && !(flags & TTU_IGNORE_HWPOISON)) {
1621 pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
1622 if (folio_test_hugetlb(folio)) {
1623 hugetlb_count_sub(folio_nr_pages(folio), mm);
1624 set_huge_pte_at(mm, address, pvmw.pte, pteval);
1625 } else {
1626 dec_mm_counter(mm, mm_counter(&folio->page));
1627 set_pte_at(mm, address, pvmw.pte, pteval);
1628 }
1629
1630 } else if (pte_unused(pteval) && !userfaultfd_armed(vma)) {
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641 dec_mm_counter(mm, mm_counter(&folio->page));
1642
1643 mmu_notifier_invalidate_range(mm, address,
1644 address + PAGE_SIZE);
1645 } else if (folio_test_anon(folio)) {
1646 swp_entry_t entry = { .val = page_private(subpage) };
1647 pte_t swp_pte;
1648
1649
1650
1651
1652 if (unlikely(folio_test_swapbacked(folio) !=
1653 folio_test_swapcache(folio))) {
1654 WARN_ON_ONCE(1);
1655 ret = false;
1656
1657 mmu_notifier_invalidate_range(mm, address,
1658 address + PAGE_SIZE);
1659 page_vma_mapped_walk_done(&pvmw);
1660 break;
1661 }
1662
1663
1664 if (!folio_test_swapbacked(folio)) {
1665 int ref_count, map_count;
1666
1667
1668
1669
1670
1671
1672 smp_mb();
1673
1674 ref_count = folio_ref_count(folio);
1675 map_count = folio_mapcount(folio);
1676
1677
1678
1679
1680
1681 smp_rmb();
1682
1683
1684
1685
1686
1687 if (ref_count == 1 + map_count &&
1688 !folio_test_dirty(folio)) {
1689
1690 mmu_notifier_invalidate_range(mm,
1691 address, address + PAGE_SIZE);
1692 dec_mm_counter(mm, MM_ANONPAGES);
1693 goto discard;
1694 }
1695
1696
1697
1698
1699
1700 set_pte_at(mm, address, pvmw.pte, pteval);
1701 folio_set_swapbacked(folio);
1702 ret = false;
1703 page_vma_mapped_walk_done(&pvmw);
1704 break;
1705 }
1706
1707 if (swap_duplicate(entry) < 0) {
1708 set_pte_at(mm, address, pvmw.pte, pteval);
1709 ret = false;
1710 page_vma_mapped_walk_done(&pvmw);
1711 break;
1712 }
1713 if (arch_unmap_one(mm, vma, address, pteval) < 0) {
1714 swap_free(entry);
1715 set_pte_at(mm, address, pvmw.pte, pteval);
1716 ret = false;
1717 page_vma_mapped_walk_done(&pvmw);
1718 break;
1719 }
1720 if (anon_exclusive &&
1721 page_try_share_anon_rmap(subpage)) {
1722 swap_free(entry);
1723 set_pte_at(mm, address, pvmw.pte, pteval);
1724 ret = false;
1725 page_vma_mapped_walk_done(&pvmw);
1726 break;
1727 }
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739 if (list_empty(&mm->mmlist)) {
1740 spin_lock(&mmlist_lock);
1741 if (list_empty(&mm->mmlist))
1742 list_add(&mm->mmlist, &init_mm.mmlist);
1743 spin_unlock(&mmlist_lock);
1744 }
1745 dec_mm_counter(mm, MM_ANONPAGES);
1746 inc_mm_counter(mm, MM_SWAPENTS);
1747 swp_pte = swp_entry_to_pte(entry);
1748 if (anon_exclusive)
1749 swp_pte = pte_swp_mkexclusive(swp_pte);
1750 if (pte_soft_dirty(pteval))
1751 swp_pte = pte_swp_mksoft_dirty(swp_pte);
1752 if (pte_uffd_wp(pteval))
1753 swp_pte = pte_swp_mkuffd_wp(swp_pte);
1754 set_pte_at(mm, address, pvmw.pte, swp_pte);
1755
1756 mmu_notifier_invalidate_range(mm, address,
1757 address + PAGE_SIZE);
1758 } else {
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770 dec_mm_counter(mm, mm_counter_file(&folio->page));
1771 }
1772 discard:
1773
1774
1775
1776
1777
1778
1779
1780 page_remove_rmap(subpage, vma, folio_test_hugetlb(folio));
1781 if (vma->vm_flags & VM_LOCKED)
1782 mlock_page_drain_local();
1783 folio_put(folio);
1784 }
1785
1786 mmu_notifier_invalidate_range_end(&range);
1787
1788 return ret;
1789 }
1790
1791 static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg)
1792 {
1793 return vma_is_temporary_stack(vma);
1794 }
1795
1796 static int page_not_mapped(struct folio *folio)
1797 {
1798 return !folio_mapped(folio);
1799 }
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
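/*
 * Try to remove all page table entries mapping @folio.  The walk may bail
 * out early, so it is up to the caller to check folio_mapped() afterwards
 * (passing TTU_SYNC avoids racing with a concurrent walk when doing so).
 */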
1812 void try_to_unmap(struct folio *folio, enum ttu_flags flags)
1813 {
1814 struct rmap_walk_control rwc = {
1815 .rmap_one = try_to_unmap_one,
1816 .arg = (void *)flags,
1817 .done = page_not_mapped,
1818 .anon_lock = folio_lock_anon_vma_read,
1819 };
1820
1821 if (flags & TTU_RMAP_LOCKED)
1822 rmap_walk_locked(folio, &rwc);
1823 else
1824 rmap_walk(folio, &rwc);
1825 }
1826
1827
1828
1829
1830
1831
1832
1833 static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
1834 unsigned long address, void *arg)
1835 {
1836 struct mm_struct *mm = vma->vm_mm;
1837 DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
1838 pte_t pteval;
1839 struct page *subpage;
1840 bool anon_exclusive, ret = true;
1841 struct mmu_notifier_range range;
1842 enum ttu_flags flags = (enum ttu_flags)(long)arg;
1843
1844
1845
1846
1847
1848
1849
1850 if (flags & TTU_SYNC)
1851 pvmw.flags = PVMW_SYNC;
1852
1853
1854
1855
1856
1857 if (flags & TTU_SPLIT_HUGE_PMD)
1858 split_huge_pmd_address(vma, address, true, folio);
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868 range.end = vma_address_end(&pvmw);
1869 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
1870 address, range.end);
1871 if (folio_test_hugetlb(folio)) {
1872
1873
1874
1875
1876 adjust_range_if_pmd_sharing_possible(vma, &range.start,
1877 &range.end);
1878 }
1879 mmu_notifier_invalidate_range_start(&range);
1880
1881 while (page_vma_mapped_walk(&pvmw)) {
1882 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
1883
1884 if (!pvmw.pte) {
1885 subpage = folio_page(folio,
1886 pmd_pfn(*pvmw.pmd) - folio_pfn(folio));
1887 VM_BUG_ON_FOLIO(folio_test_hugetlb(folio) ||
1888 !folio_test_pmd_mappable(folio), folio);
1889
1890 if (set_pmd_migration_entry(&pvmw, subpage)) {
1891 ret = false;
1892 page_vma_mapped_walk_done(&pvmw);
1893 break;
1894 }
1895 continue;
1896 }
1897 #endif
1898
1899
1900 VM_BUG_ON_FOLIO(!pvmw.pte, folio);
1901
1902 if (folio_is_zone_device(folio)) {
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913 VM_BUG_ON_FOLIO(folio_nr_pages(folio) > 1, folio);
1914 subpage = &folio->page;
1915 } else {
1916 subpage = folio_page(folio,
1917 pte_pfn(*pvmw.pte) - folio_pfn(folio));
1918 }
1919 address = pvmw.address;
1920 anon_exclusive = folio_test_anon(folio) &&
1921 PageAnonExclusive(subpage);
1922
1923 if (folio_test_hugetlb(folio)) {
1924 bool anon = folio_test_anon(folio);
1925
1926
1927
1928
1929
1930
1931
1932
1933 flush_cache_range(vma, range.start, range.end);
1934
1935
1936
1937
1938
1939
1940 VM_BUG_ON(!anon && !(flags & TTU_RMAP_LOCKED));
1941 if (!anon && huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
1942 flush_tlb_range(vma, range.start, range.end);
1943 mmu_notifier_invalidate_range(mm, range.start,
1944 range.end);
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955 page_vma_mapped_walk_done(&pvmw);
1956 break;
1957 }
1958
1959
1960 pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
1961 } else {
1962 flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
1963
1964 pteval = ptep_clear_flush(vma, address, pvmw.pte);
1965 }
1966
1967
1968 if (pte_dirty(pteval))
1969 folio_mark_dirty(folio);
1970
1971
1972 update_hiwater_rss(mm);
1973
1974 if (folio_is_device_private(folio)) {
1975 unsigned long pfn = folio_pfn(folio);
1976 swp_entry_t entry;
1977 pte_t swp_pte;
1978
1979 if (anon_exclusive)
1980 BUG_ON(page_try_share_anon_rmap(subpage));
1981
1982
1983
1984
1985
1986
1987 entry = pte_to_swp_entry(pteval);
1988 if (is_writable_device_private_entry(entry))
1989 entry = make_writable_migration_entry(pfn);
1990 else if (anon_exclusive)
1991 entry = make_readable_exclusive_migration_entry(pfn);
1992 else
1993 entry = make_readable_migration_entry(pfn);
1994 swp_pte = swp_entry_to_pte(entry);
1995
1996
1997
1998
1999
2000 if (pte_swp_soft_dirty(pteval))
2001 swp_pte = pte_swp_mksoft_dirty(swp_pte);
2002 if (pte_swp_uffd_wp(pteval))
2003 swp_pte = pte_swp_mkuffd_wp(swp_pte);
2004 set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte);
2005 trace_set_migration_pte(pvmw.address, pte_val(swp_pte),
2006 compound_order(&folio->page));
2007
2008
2009
2010
2011 } else if (PageHWPoison(subpage)) {
2012 pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
2013 if (folio_test_hugetlb(folio)) {
2014 hugetlb_count_sub(folio_nr_pages(folio), mm);
2015 set_huge_pte_at(mm, address, pvmw.pte, pteval);
2016 } else {
2017 dec_mm_counter(mm, mm_counter(&folio->page));
2018 set_pte_at(mm, address, pvmw.pte, pteval);
2019 }
2020
2021 } else if (pte_unused(pteval) && !userfaultfd_armed(vma)) {
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032 dec_mm_counter(mm, mm_counter(&folio->page));
2033
2034 mmu_notifier_invalidate_range(mm, address,
2035 address + PAGE_SIZE);
2036 } else {
2037 swp_entry_t entry;
2038 pte_t swp_pte;
2039
2040 if (arch_unmap_one(mm, vma, address, pteval) < 0) {
2041 if (folio_test_hugetlb(folio))
2042 set_huge_pte_at(mm, address, pvmw.pte, pteval);
2043 else
2044 set_pte_at(mm, address, pvmw.pte, pteval);
2045 ret = false;
2046 page_vma_mapped_walk_done(&pvmw);
2047 break;
2048 }
2049 VM_BUG_ON_PAGE(pte_write(pteval) && folio_test_anon(folio) &&
2050 !anon_exclusive, subpage);
2051 if (anon_exclusive &&
2052 page_try_share_anon_rmap(subpage)) {
2053 if (folio_test_hugetlb(folio))
2054 set_huge_pte_at(mm, address, pvmw.pte, pteval);
2055 else
2056 set_pte_at(mm, address, pvmw.pte, pteval);
2057 ret = false;
2058 page_vma_mapped_walk_done(&pvmw);
2059 break;
2060 }
2061
2062
2063
2064
2065
2066
2067 if (pte_write(pteval))
2068 entry = make_writable_migration_entry(
2069 page_to_pfn(subpage));
2070 else if (anon_exclusive)
2071 entry = make_readable_exclusive_migration_entry(
2072 page_to_pfn(subpage));
2073 else
2074 entry = make_readable_migration_entry(
2075 page_to_pfn(subpage));
2076
2077 swp_pte = swp_entry_to_pte(entry);
2078 if (pte_soft_dirty(pteval))
2079 swp_pte = pte_swp_mksoft_dirty(swp_pte);
2080 if (pte_uffd_wp(pteval))
2081 swp_pte = pte_swp_mkuffd_wp(swp_pte);
2082 if (folio_test_hugetlb(folio))
2083 set_huge_pte_at(mm, address, pvmw.pte, swp_pte);
2084 else
2085 set_pte_at(mm, address, pvmw.pte, swp_pte);
2086 trace_set_migration_pte(address, pte_val(swp_pte),
2087 compound_order(&folio->page));
2088
2089
2090
2091
2092 }
2093
2094
2095
2096
2097
2098
2099
2100
2101 page_remove_rmap(subpage, vma, folio_test_hugetlb(folio));
2102 if (vma->vm_flags & VM_LOCKED)
2103 mlock_page_drain_local();
2104 folio_put(folio);
2105 }
2106
2107 mmu_notifier_invalidate_range_end(&range);
2108
2109 return ret;
2110 }
2111
2112
2113
2114
2115
2116
2117
2118
2119
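/*
 * Replace all page table mappings of @folio with migration entries, so
 * the folio can be migrated while faulting tasks wait on the entry.
 * Only TTU_RMAP_LOCKED, TTU_SPLIT_HUGE_PMD and TTU_SYNC are meaningful
 * here; other flags are rejected with a warning.
 */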
2120 void try_to_migrate(struct folio *folio, enum ttu_flags flags)
2121 {
2122 struct rmap_walk_control rwc = {
2123 .rmap_one = try_to_migrate_one,
2124 .arg = (void *)flags,
2125 .done = page_not_mapped,
2126 .anon_lock = folio_lock_anon_vma_read,
2127 };
2128
2129
2130
2131
2132
2133 if (WARN_ON_ONCE(flags & ~(TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD |
2134 TTU_SYNC)))
2135 return;
2136
2137 if (folio_is_zone_device(folio) &&
2138 (!folio_is_device_private(folio) && !folio_is_device_coherent(folio)))
2139 return;
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149 if (!folio_test_ksm(folio) && folio_test_anon(folio))
2150 rwc.invalid_vma = invalid_migration_vma;
2151
2152 if (flags & TTU_RMAP_LOCKED)
2153 rmap_walk_locked(folio, &rwc);
2154 else
2155 rmap_walk(folio, &rwc);
2156 }
2157
2158 #ifdef CONFIG_DEVICE_PRIVATE
2159 struct make_exclusive_args {
2160 struct mm_struct *mm;
2161 unsigned long address;
2162 void *owner;
2163 bool valid;
2164 };
2165
2166 static bool page_make_device_exclusive_one(struct folio *folio,
2167 struct vm_area_struct *vma, unsigned long address, void *priv)
2168 {
2169 struct mm_struct *mm = vma->vm_mm;
2170 DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
2171 struct make_exclusive_args *args = priv;
2172 pte_t pteval;
2173 struct page *subpage;
2174 bool ret = true;
2175 struct mmu_notifier_range range;
2176 swp_entry_t entry;
2177 pte_t swp_pte;
2178
2179 mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0, vma,
2180 vma->vm_mm, address, min(vma->vm_end,
2181 address + folio_size(folio)),
2182 args->owner);
2183 mmu_notifier_invalidate_range_start(&range);
2184
2185 while (page_vma_mapped_walk(&pvmw)) {
2186
2187 VM_BUG_ON_FOLIO(!pvmw.pte, folio);
2188
2189 if (!pte_present(*pvmw.pte)) {
2190 ret = false;
2191 page_vma_mapped_walk_done(&pvmw);
2192 break;
2193 }
2194
2195 subpage = folio_page(folio,
2196 pte_pfn(*pvmw.pte) - folio_pfn(folio));
2197 address = pvmw.address;
2198
2199
2200 flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
2201 pteval = ptep_clear_flush(vma, address, pvmw.pte);
2202
2203
2204 if (pte_dirty(pteval))
2205 folio_mark_dirty(folio);
2206
2207
2208
2209
2210
2211 if (args->mm == mm && args->address == address &&
2212 pte_write(pteval))
2213 args->valid = true;
2214
2215
2216
2217
2218
2219
2220 if (pte_write(pteval))
2221 entry = make_writable_device_exclusive_entry(
2222 page_to_pfn(subpage));
2223 else
2224 entry = make_readable_device_exclusive_entry(
2225 page_to_pfn(subpage));
2226 swp_pte = swp_entry_to_pte(entry);
2227 if (pte_soft_dirty(pteval))
2228 swp_pte = pte_swp_mksoft_dirty(swp_pte);
2229 if (pte_uffd_wp(pteval))
2230 swp_pte = pte_swp_mkuffd_wp(swp_pte);
2231
2232 set_pte_at(mm, address, pvmw.pte, swp_pte);
2233
2234
2235
2236
2237
2238 page_remove_rmap(subpage, vma, false);
2239 }
2240
2241 mmu_notifier_invalidate_range_end(&range);
2242
2243 return ret;
2244 }
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261 static bool folio_make_device_exclusive(struct folio *folio,
2262 struct mm_struct *mm, unsigned long address, void *owner)
2263 {
2264 struct make_exclusive_args args = {
2265 .mm = mm,
2266 .address = address,
2267 .owner = owner,
2268 .valid = false,
2269 };
2270 struct rmap_walk_control rwc = {
2271 .rmap_one = page_make_device_exclusive_one,
2272 .done = page_not_mapped,
2273 .anon_lock = folio_lock_anon_vma_read,
2274 .arg = &args,
2275 };
2276
2277
2278
2279
2280
2281 if (!folio_test_anon(folio))
2282 return false;
2283
2284 rmap_walk(folio, &rwc);
2285
2286 return args.valid && !folio_mapcount(folio);
2287 }
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
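/*
 * Pin the pages in [start, end) of @mm with get_user_pages_remote(), lock
 * each one, and replace its page table entries with device-exclusive swap
 * entries so the next CPU access faults and the entry can be restored.
 * @owner is passed to the mmu notifier so a driver can filter out its own
 * invalidations.  Array slots whose page could not be made exclusive are
 * set to NULL.  Returns the number of pages looked up or a negative errno.
 */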
2310 int make_device_exclusive_range(struct mm_struct *mm, unsigned long start,
2311 unsigned long end, struct page **pages,
2312 void *owner)
2313 {
2314 long npages = (end - start) >> PAGE_SHIFT;
2315 long i;
2316
2317 npages = get_user_pages_remote(mm, start, npages,
2318 FOLL_GET | FOLL_WRITE | FOLL_SPLIT_PMD,
2319 pages, NULL, NULL);
2320 if (npages < 0)
2321 return npages;
2322
2323 for (i = 0; i < npages; i++, start += PAGE_SIZE) {
2324 struct folio *folio = page_folio(pages[i]);
2325 if (PageTail(pages[i]) || !folio_trylock(folio)) {
2326 folio_put(folio);
2327 pages[i] = NULL;
2328 continue;
2329 }
2330
2331 if (!folio_make_device_exclusive(folio, mm, start, owner)) {
2332 folio_unlock(folio);
2333 folio_put(folio);
2334 pages[i] = NULL;
2335 }
2336 }
2337
2338 return npages;
2339 }
2340 EXPORT_SYMBOL_GPL(make_device_exclusive_range);
2341 #endif
2342
2343 void __put_anon_vma(struct anon_vma *anon_vma)
2344 {
2345 struct anon_vma *root = anon_vma->root;
2346
2347 anon_vma_free(anon_vma);
2348 if (root != anon_vma && atomic_dec_and_test(&root->refcount))
2349 anon_vma_free(root);
2350 }
2351
2352 static struct anon_vma *rmap_walk_anon_lock(struct folio *folio,
2353 struct rmap_walk_control *rwc)
2354 {
2355 struct anon_vma *anon_vma;
2356
2357 if (rwc->anon_lock)
2358 return rwc->anon_lock(folio, rwc);
2359
2360
2361
2362
2363
2364
2365
2366 anon_vma = folio_anon_vma(folio);
2367 if (!anon_vma)
2368 return NULL;
2369
2370 if (anon_vma_trylock_read(anon_vma))
2371 goto out;
2372
2373 if (rwc->try_lock) {
2374 anon_vma = NULL;
2375 rwc->contended = true;
2376 goto out;
2377 }
2378
2379 anon_vma_lock_read(anon_vma);
2380 out:
2381 return anon_vma;
2382 }
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
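/*
 * Walk every VMA that might map an anonymous @folio, via the anon_vma
 * interval tree, calling rwc->rmap_one for each.  With @locked the caller
 * already holds the anon_vma lock (and keeps it); otherwise it is taken
 * and released here.
 */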
2393 static void rmap_walk_anon(struct folio *folio,
2394 struct rmap_walk_control *rwc, bool locked)
2395 {
2396 struct anon_vma *anon_vma;
2397 pgoff_t pgoff_start, pgoff_end;
2398 struct anon_vma_chain *avc;
2399
2400 if (locked) {
2401 anon_vma = folio_anon_vma(folio);
2402
2403 VM_BUG_ON_FOLIO(!anon_vma, folio);
2404 } else {
2405 anon_vma = rmap_walk_anon_lock(folio, rwc);
2406 }
2407 if (!anon_vma)
2408 return;
2409
2410 pgoff_start = folio_pgoff(folio);
2411 pgoff_end = pgoff_start + folio_nr_pages(folio) - 1;
2412 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root,
2413 pgoff_start, pgoff_end) {
2414 struct vm_area_struct *vma = avc->vma;
2415 unsigned long address = vma_address(&folio->page, vma);
2416
2417 VM_BUG_ON_VMA(address == -EFAULT, vma);
2418 cond_resched();
2419
2420 if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
2421 continue;
2422
2423 if (!rwc->rmap_one(folio, vma, address, rwc->arg))
2424 break;
2425 if (rwc->done && rwc->done(folio))
2426 break;
2427 }
2428
2429 if (!locked)
2430 anon_vma_unlock_read(anon_vma);
2431 }
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441 static void rmap_walk_file(struct folio *folio,
2442 struct rmap_walk_control *rwc, bool locked)
2443 {
2444 struct address_space *mapping = folio_mapping(folio);
2445 pgoff_t pgoff_start, pgoff_end;
2446 struct vm_area_struct *vma;
2447
2448
2449
2450
2451
2452
2453
2454 VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
2455
2456 if (!mapping)
2457 return;
2458
2459 pgoff_start = folio_pgoff(folio);
2460 pgoff_end = pgoff_start + folio_nr_pages(folio) - 1;
2461 if (!locked) {
2462 if (i_mmap_trylock_read(mapping))
2463 goto lookup;
2464
2465 if (rwc->try_lock) {
2466 rwc->contended = true;
2467 return;
2468 }
2469
2470 i_mmap_lock_read(mapping);
2471 }
2472 lookup:
2473 vma_interval_tree_foreach(vma, &mapping->i_mmap,
2474 pgoff_start, pgoff_end) {
2475 unsigned long address = vma_address(&folio->page, vma);
2476
2477 VM_BUG_ON_VMA(address == -EFAULT, vma);
2478 cond_resched();
2479
2480 if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
2481 continue;
2482
2483 if (!rwc->rmap_one(folio, vma, address, rwc->arg))
2484 goto done;
2485 if (rwc->done && rwc->done(folio))
2486 goto done;
2487 }
2488
2489 done:
2490 if (!locked)
2491 i_mmap_unlock_read(mapping);
2492 }
2493
2494 void rmap_walk(struct folio *folio, struct rmap_walk_control *rwc)
2495 {
2496 if (unlikely(folio_test_ksm(folio)))
2497 rmap_walk_ksm(folio, rwc);
2498 else if (folio_test_anon(folio))
2499 rmap_walk_anon(folio, rwc, false);
2500 else
2501 rmap_walk_file(folio, rwc, false);
2502 }
2503
2504
2505 void rmap_walk_locked(struct folio *folio, struct rmap_walk_control *rwc)
2506 {
2507
2508 VM_BUG_ON_FOLIO(folio_test_ksm(folio), folio);
2509 if (folio_test_anon(folio))
2510 rmap_walk_anon(folio, rwc, true);
2511 else
2512 rmap_walk_file(folio, rwc, true);
2513 }
2514
2515 #ifdef CONFIG_HUGETLB_PAGE
2516
2517
2518
2519
2520
2521
2522
2523 void hugepage_add_anon_rmap(struct page *page, struct vm_area_struct *vma,
2524 unsigned long address, rmap_t flags)
2525 {
2526 struct anon_vma *anon_vma = vma->anon_vma;
2527 int first;
2528
2529 BUG_ON(!PageLocked(page));
2530 BUG_ON(!anon_vma);
2531
2532 first = atomic_inc_and_test(compound_mapcount_ptr(page));
2533 VM_BUG_ON_PAGE(!first && (flags & RMAP_EXCLUSIVE), page);
2534 VM_BUG_ON_PAGE(!first && PageAnonExclusive(page), page);
2535 if (first)
2536 __page_set_anon_rmap(page, vma, address,
2537 !!(flags & RMAP_EXCLUSIVE));
2538 }
2539
2540 void hugepage_add_new_anon_rmap(struct page *page,
2541 struct vm_area_struct *vma, unsigned long address)
2542 {
2543 BUG_ON(address < vma->vm_start || address >= vma->vm_end);
2544 atomic_set(compound_mapcount_ptr(page), 0);
2545 atomic_set(compound_pincount_ptr(page), 0);
2546
2547 __page_set_anon_rmap(page, vma, address, 1);
2548 }
2549 #endif