0001 // SPDX-License-Identifier: GPL-2.0-only
0002 /*
0003  *  linux/mm/memory.c
0004  *
0005  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
0006  */
0007
0042 #include <linux/kernel_stat.h>
0043 #include <linux/mm.h>
0044 #include <linux/mm_inline.h>
0045 #include <linux/sched/mm.h>
0046 #include <linux/sched/coredump.h>
0047 #include <linux/sched/numa_balancing.h>
0048 #include <linux/sched/task.h>
0049 #include <linux/hugetlb.h>
0050 #include <linux/mman.h>
0051 #include <linux/swap.h>
0052 #include <linux/highmem.h>
0053 #include <linux/pagemap.h>
0054 #include <linux/memremap.h>
0055 #include <linux/ksm.h>
0056 #include <linux/rmap.h>
0057 #include <linux/export.h>
0058 #include <linux/delayacct.h>
0059 #include <linux/init.h>
0060 #include <linux/pfn_t.h>
0061 #include <linux/writeback.h>
0062 #include <linux/memcontrol.h>
0063 #include <linux/mmu_notifier.h>
0064 #include <linux/swapops.h>
0065 #include <linux/elf.h>
0066 #include <linux/gfp.h>
0067 #include <linux/migrate.h>
0068 #include <linux/string.h>
0069 #include <linux/debugfs.h>
0070 #include <linux/userfaultfd_k.h>
0071 #include <linux/dax.h>
0072 #include <linux/oom.h>
0073 #include <linux/numa.h>
0074 #include <linux/perf_event.h>
0075 #include <linux/ptrace.h>
0076 #include <linux/vmalloc.h>
0077
0078 #include <trace/events/kmem.h>
0079
0080 #include <asm/io.h>
0081 #include <asm/mmu_context.h>
0082 #include <asm/pgalloc.h>
0083 #include <linux/uaccess.h>
0084 #include <asm/tlb.h>
0085 #include <asm/tlbflush.h>
0086
0087 #include "pgalloc-track.h"
0088 #include "internal.h"
0089 #include "swap.h"
0090
0091 #if defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) && !defined(CONFIG_COMPILE_TEST)
0092 #warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
0093 #endif
0094
0095 #ifndef CONFIG_NUMA
0096 unsigned long max_mapnr;
0097 EXPORT_SYMBOL(max_mapnr);
0098
0099 struct page *mem_map;
0100 EXPORT_SYMBOL(mem_map);
0101 #endif
0102
0103 static vm_fault_t do_fault(struct vm_fault *vmf);
0104
0105
0106
0107
0108
0109
0110
0111
0112 void *high_memory;
0113 EXPORT_SYMBOL(high_memory);
0114
0115
0116
0117
0118
0119
0120
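/*
 * Address-space randomisation policy: 2 also randomises the brk area,
 * 1 leaves brk alone for legacy binaries (CONFIG_COMPAT_BRK), and 0
 * (set via the "norandmaps" boot option below) disables it entirely.
 */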
0121 int randomize_va_space __read_mostly =
0122 #ifdef CONFIG_COMPAT_BRK
0123 1;
0124 #else
0125 2;
0126 #endif
0127
0128 #ifndef arch_faults_on_old_pte
0129 static inline bool arch_faults_on_old_pte(void)
0130 {
0131
0132
0133
0134
0135
0136 return true;
0137 }
0138 #endif
0139
0140 #ifndef arch_wants_old_prefaulted_pte
0141 static inline bool arch_wants_old_prefaulted_pte(void)
0142 {
0143
0144
0145
0146
0147
0148 return false;
0149 }
0150 #endif
0151
0152 static int __init disable_randmaps(char *s)
0153 {
0154 randomize_va_space = 0;
0155 return 1;
0156 }
0157 __setup("norandmaps", disable_randmaps);
0158
0159 unsigned long zero_pfn __read_mostly;
0160 EXPORT_SYMBOL(zero_pfn);
0161
0162 unsigned long highest_memmap_pfn __read_mostly;
0163
0164
0165
0166
0167 static int __init init_zero_pfn(void)
0168 {
0169 zero_pfn = page_to_pfn(ZERO_PAGE(0));
0170 return 0;
0171 }
0172 early_initcall(init_zero_pfn);
0173
0174 void mm_trace_rss_stat(struct mm_struct *mm, int member, long count)
0175 {
0176 trace_rss_stat(mm, member, count);
0177 }
0178
0179 #if defined(SPLIT_RSS_COUNTING)
0180
0181 void sync_mm_rss(struct mm_struct *mm)
0182 {
0183 int i;
0184
0185 for (i = 0; i < NR_MM_COUNTERS; i++) {
0186 if (current->rss_stat.count[i]) {
0187 add_mm_counter(mm, i, current->rss_stat.count[i]);
0188 current->rss_stat.count[i] = 0;
0189 }
0190 }
0191 current->rss_stat.events = 0;
0192 }
0193
0194 static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
0195 {
0196 struct task_struct *task = current;
0197
0198 if (likely(task->mm == mm))
0199 task->rss_stat.count[member] += val;
0200 else
0201 add_mm_counter(mm, member, val);
0202 }
0203 #define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1)
0204 #define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1)
0205
0206
0207 #define TASK_RSS_EVENTS_THRESH (64)
0208 static void check_sync_rss_stat(struct task_struct *task)
0209 {
0210 if (unlikely(task != current))
0211 return;
0212 if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
0213 sync_mm_rss(task->mm);
0214 }
0215 #else
0216
0217 #define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
0218 #define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)
0219
0220 static void check_sync_rss_stat(struct task_struct *task)
0221 {
0222 }
0223
0224 #endif
0225
0226
0227
0228
0229
0230 static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
0231 unsigned long addr)
0232 {
0233 pgtable_t token = pmd_pgtable(*pmd);
0234 pmd_clear(pmd);
0235 pte_free_tlb(tlb, token, addr);
0236 mm_dec_nr_ptes(tlb->mm);
0237 }
0238
0239 static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
0240 unsigned long addr, unsigned long end,
0241 unsigned long floor, unsigned long ceiling)
0242 {
0243 pmd_t *pmd;
0244 unsigned long next;
0245 unsigned long start;
0246
0247 start = addr;
0248 pmd = pmd_offset(pud, addr);
0249 do {
0250 next = pmd_addr_end(addr, end);
0251 if (pmd_none_or_clear_bad(pmd))
0252 continue;
0253 free_pte_range(tlb, pmd, addr);
0254 } while (pmd++, addr = next, addr != end);
0255
0256 start &= PUD_MASK;
0257 if (start < floor)
0258 return;
0259 if (ceiling) {
0260 ceiling &= PUD_MASK;
0261 if (!ceiling)
0262 return;
0263 }
0264 if (end - 1 > ceiling - 1)
0265 return;
0266
0267 pmd = pmd_offset(pud, start);
0268 pud_clear(pud);
0269 pmd_free_tlb(tlb, pmd, start);
0270 mm_dec_nr_pmds(tlb->mm);
0271 }
0272
0273 static inline void free_pud_range(struct mmu_gather *tlb, p4d_t *p4d,
0274 unsigned long addr, unsigned long end,
0275 unsigned long floor, unsigned long ceiling)
0276 {
0277 pud_t *pud;
0278 unsigned long next;
0279 unsigned long start;
0280
0281 start = addr;
0282 pud = pud_offset(p4d, addr);
0283 do {
0284 next = pud_addr_end(addr, end);
0285 if (pud_none_or_clear_bad(pud))
0286 continue;
0287 free_pmd_range(tlb, pud, addr, next, floor, ceiling);
0288 } while (pud++, addr = next, addr != end);
0289
0290 start &= P4D_MASK;
0291 if (start < floor)
0292 return;
0293 if (ceiling) {
0294 ceiling &= P4D_MASK;
0295 if (!ceiling)
0296 return;
0297 }
0298 if (end - 1 > ceiling - 1)
0299 return;
0300
0301 pud = pud_offset(p4d, start);
0302 p4d_clear(p4d);
0303 pud_free_tlb(tlb, pud, start);
0304 mm_dec_nr_puds(tlb->mm);
0305 }
0306
0307 static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd,
0308 unsigned long addr, unsigned long end,
0309 unsigned long floor, unsigned long ceiling)
0310 {
0311 p4d_t *p4d;
0312 unsigned long next;
0313 unsigned long start;
0314
0315 start = addr;
0316 p4d = p4d_offset(pgd, addr);
0317 do {
0318 next = p4d_addr_end(addr, end);
0319 if (p4d_none_or_clear_bad(p4d))
0320 continue;
0321 free_pud_range(tlb, p4d, addr, next, floor, ceiling);
0322 } while (p4d++, addr = next, addr != end);
0323
0324 start &= PGDIR_MASK;
0325 if (start < floor)
0326 return;
0327 if (ceiling) {
0328 ceiling &= PGDIR_MASK;
0329 if (!ceiling)
0330 return;
0331 }
0332 if (end - 1 > ceiling - 1)
0333 return;
0334
0335 p4d = p4d_offset(pgd, start);
0336 pgd_clear(pgd);
0337 p4d_free_tlb(tlb, p4d, start);
0338 }
0339
0340
0341
0342
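/*
 * This function frees user-level page tables of a process.  The floor
 * and ceiling bounds limit how far the walk may prune upper-level
 * tables that could still be shared with neighbouring regions.
 */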
0343 void free_pgd_range(struct mmu_gather *tlb,
0344 unsigned long addr, unsigned long end,
0345 unsigned long floor, unsigned long ceiling)
0346 {
0347 pgd_t *pgd;
0348 unsigned long next;
0349
0375
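/*
 * Trim addr and end so that no PMD-level table is freed while it is
 * still needed for mappings below floor or above ceiling; the "- 1"
 * comparisons keep the arithmetic safe when ceiling is 0, which means
 * "no upper limit".
 */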
0376 addr &= PMD_MASK;
0377 if (addr < floor) {
0378 addr += PMD_SIZE;
0379 if (!addr)
0380 return;
0381 }
0382 if (ceiling) {
0383 ceiling &= PMD_MASK;
0384 if (!ceiling)
0385 return;
0386 }
0387 if (end - 1 > ceiling - 1)
0388 end -= PMD_SIZE;
0389 if (addr > end - 1)
0390 return;
0391
0392
0393
0394
0395 tlb_change_page_size(tlb, PAGE_SIZE);
0396 pgd = pgd_offset(tlb->mm, addr);
0397 do {
0398 next = pgd_addr_end(addr, end);
0399 if (pgd_none_or_clear_bad(pgd))
0400 continue;
0401 free_p4d_range(tlb, pgd, addr, next, floor, ceiling);
0402 } while (pgd++, addr = next, addr != end);
0403 }
0404
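/*
 * Tear down the page tables backing a list of vmas that are going away:
 * unlink each vma from its anon_vma chains and file rmap first, batch
 * runs of adjacent non-hugetlb vmas, and hand the address ranges to
 * free_pgd_range() (or the hugetlb equivalent).
 */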
0405 void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
0406 unsigned long floor, unsigned long ceiling)
0407 {
0408 while (vma) {
0409 struct vm_area_struct *next = vma->vm_next;
0410 unsigned long addr = vma->vm_start;
0411
0412
0413
0414
0415
0416 unlink_anon_vmas(vma);
0417 unlink_file_vma(vma);
0418
0419 if (is_vm_hugetlb_page(vma)) {
0420 hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
0421 floor, next ? next->vm_start : ceiling);
0422 } else {
0423
0424
0425
0426 while (next && next->vm_start <= vma->vm_end + PMD_SIZE
0427 && !is_vm_hugetlb_page(next)) {
0428 vma = next;
0429 next = vma->vm_next;
0430 unlink_anon_vmas(vma);
0431 unlink_file_vma(vma);
0432 }
0433 free_pgd_range(tlb, addr, vma->vm_end,
0434 floor, next ? next->vm_start : ceiling);
0435 }
0436 vma = next;
0437 }
0438 }
0439
0440 void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte)
0441 {
0442 spinlock_t *ptl = pmd_lock(mm, pmd);
0443
0444 if (likely(pmd_none(*pmd))) {
0445 mm_inc_nr_ptes(mm);
0458
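/*
 * Ensure the freshly initialised page-table page (lock, zeroed entries)
 * is fully visible before the pmd entry publishes it; the lockless
 * page-table walkers rely on the matching data dependency when they
 * follow the pmd to the pte page.
 */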
0459 smp_wmb();
0460 pmd_populate(mm, pmd, *pte);
0461 *pte = NULL;
0462 }
0463 spin_unlock(ptl);
0464 }
0465
0466 int __pte_alloc(struct mm_struct *mm, pmd_t *pmd)
0467 {
0468 pgtable_t new = pte_alloc_one(mm);
0469 if (!new)
0470 return -ENOMEM;
0471
0472 pmd_install(mm, pmd, &new);
0473 if (new)
0474 pte_free(mm, new);
0475 return 0;
0476 }
0477
0478 int __pte_alloc_kernel(pmd_t *pmd)
0479 {
0480 pte_t *new = pte_alloc_one_kernel(&init_mm);
0481 if (!new)
0482 return -ENOMEM;
0483
0484 spin_lock(&init_mm.page_table_lock);
0485 if (likely(pmd_none(*pmd))) {
0486 smp_wmb();
0487 pmd_populate_kernel(&init_mm, pmd, new);
0488 new = NULL;
0489 }
0490 spin_unlock(&init_mm.page_table_lock);
0491 if (new)
0492 pte_free_kernel(&init_mm, new);
0493 return 0;
0494 }
0495
0496 static inline void init_rss_vec(int *rss)
0497 {
0498 memset(rss, 0, sizeof(int) * NR_MM_COUNTERS);
0499 }
0500
0501 static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
0502 {
0503 int i;
0504
0505 if (current->mm == mm)
0506 sync_mm_rss(mm);
0507 for (i = 0; i < NR_MM_COUNTERS; i++)
0508 if (rss[i])
0509 add_mm_counter(mm, i, rss[i]);
0510 }
0511
0512
0513
0514
0515
0516
0517
0518
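/*
 * Report a pte that maps something it should not ("Bad page map"),
 * rate-limited to bursts of 60 reports per minute, dump the offending
 * page and vma state, and taint the kernel.
 */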
0519 static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
0520 pte_t pte, struct page *page)
0521 {
0522 pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
0523 p4d_t *p4d = p4d_offset(pgd, addr);
0524 pud_t *pud = pud_offset(p4d, addr);
0525 pmd_t *pmd = pmd_offset(pud, addr);
0526 struct address_space *mapping;
0527 pgoff_t index;
0528 static unsigned long resume;
0529 static unsigned long nr_shown;
0530 static unsigned long nr_unshown;
0531
0532
0533
0534
0535
0536 if (nr_shown == 60) {
0537 if (time_before(jiffies, resume)) {
0538 nr_unshown++;
0539 return;
0540 }
0541 if (nr_unshown) {
0542 pr_alert("BUG: Bad page map: %lu messages suppressed\n",
0543 nr_unshown);
0544 nr_unshown = 0;
0545 }
0546 nr_shown = 0;
0547 }
0548 if (nr_shown++ == 0)
0549 resume = jiffies + 60 * HZ;
0550
0551 mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
0552 index = linear_page_index(vma, addr);
0553
0554 pr_alert("BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n",
0555 current->comm,
0556 (long long)pte_val(pte), (long long)pmd_val(*pmd));
0557 if (page)
0558 dump_page(page, "bad pte");
0559 pr_alert("addr:%px vm_flags:%08lx anon_vma:%px mapping:%px index:%lx\n",
0560 (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
0561 pr_alert("file:%pD fault:%ps mmap:%ps read_folio:%ps\n",
0562 vma->vm_file,
0563 vma->vm_ops ? vma->vm_ops->fault : NULL,
0564 vma->vm_file ? vma->vm_file->f_op->mmap : NULL,
0565 mapping ? mapping->a_ops->read_folio : NULL);
0566 dump_stack();
0567 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
0568 }
0569
0611
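/*
 * vm_normal_page -- return the "struct page" a pte maps, or NULL for
 * "special" mappings the core mm must leave alone: raw PFN mappings
 * (VM_PFNMAP), out-of-range pfns in VM_MIXEDMAP, the zero page and
 * devmap ptes.  With CONFIG_ARCH_HAS_PTE_SPECIAL the pte itself carries
 * a special bit; otherwise the decision falls back to checks on the vma
 * flags.
 */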
0612 struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
0613 pte_t pte)
0614 {
0615 unsigned long pfn = pte_pfn(pte);
0616
0617 if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL)) {
0618 if (likely(!pte_special(pte)))
0619 goto check_pfn;
0620 if (vma->vm_ops && vma->vm_ops->find_special_page)
0621 return vma->vm_ops->find_special_page(vma, addr);
0622 if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
0623 return NULL;
0624 if (is_zero_pfn(pfn))
0625 return NULL;
0626 if (pte_devmap(pte))
0627
0628
0629
0630
0631
0632
0633
0634
0635 return NULL;
0636
0637 print_bad_pte(vma, addr, pte, NULL);
0638 return NULL;
0639 }
0640
0641
0642
0643 if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
0644 if (vma->vm_flags & VM_MIXEDMAP) {
0645 if (!pfn_valid(pfn))
0646 return NULL;
0647 goto out;
0648 } else {
0649 unsigned long off;
0650 off = (addr - vma->vm_start) >> PAGE_SHIFT;
0651 if (pfn == vma->vm_pgoff + off)
0652 return NULL;
0653 if (!is_cow_mapping(vma->vm_flags))
0654 return NULL;
0655 }
0656 }
0657
0658 if (is_zero_pfn(pfn))
0659 return NULL;
0660
0661 check_pfn:
0662 if (unlikely(pfn > highest_memmap_pfn)) {
0663 print_bad_pte(vma, addr, pte, NULL);
0664 return NULL;
0665 }
0666
0667
0668
0669
0670
0671 out:
0672 return pfn_to_page(pfn);
0673 }
0674
0675 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
0676 struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
0677 pmd_t pmd)
0678 {
0679 unsigned long pfn = pmd_pfn(pmd);
0680
0681
0682
0683
0684
0685
0686 if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
0687 if (vma->vm_flags & VM_MIXEDMAP) {
0688 if (!pfn_valid(pfn))
0689 return NULL;
0690 goto out;
0691 } else {
0692 unsigned long off;
0693 off = (addr - vma->vm_start) >> PAGE_SHIFT;
0694 if (pfn == vma->vm_pgoff + off)
0695 return NULL;
0696 if (!is_cow_mapping(vma->vm_flags))
0697 return NULL;
0698 }
0699 }
0700
0701 if (pmd_devmap(pmd))
0702 return NULL;
0703 if (is_huge_zero_pmd(pmd))
0704 return NULL;
0705 if (unlikely(pfn > highest_memmap_pfn))
0706 return NULL;
0707
0708
0709
0710
0711
0712 out:
0713 return pfn_to_page(pfn);
0714 }
0715 #endif
0716
0717 static void restore_exclusive_pte(struct vm_area_struct *vma,
0718 struct page *page, unsigned long address,
0719 pte_t *ptep)
0720 {
0721 pte_t pte;
0722 swp_entry_t entry;
0723
0724 pte = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot)));
0725 if (pte_swp_soft_dirty(*ptep))
0726 pte = pte_mksoft_dirty(pte);
0727
0728 entry = pte_to_swp_entry(*ptep);
0729 if (pte_swp_uffd_wp(*ptep))
0730 pte = pte_mkuffd_wp(pte);
0731 else if (is_writable_device_exclusive_entry(entry))
0732 pte = maybe_mkwrite(pte_mkdirty(pte), vma);
0733
0734 VM_BUG_ON(pte_write(pte) && !(PageAnon(page) && PageAnonExclusive(page)));
0735
0736
0737
0738
0739
0740 if (PageAnon(page))
0741 page_add_anon_rmap(page, vma, address, RMAP_NONE);
0742 else
0743
0744
0745
0746
0747 WARN_ON_ONCE(1);
0748
0749 set_pte_at(vma->vm_mm, address, ptep, pte);
0750
0751
0752
0753
0754
0755 update_mmu_cache(vma, address, ptep);
0756 }
0757
0758
0759
0760
0761
0762 static int
0763 try_restore_exclusive_pte(pte_t *src_pte, struct vm_area_struct *vma,
0764 unsigned long addr)
0765 {
0766 swp_entry_t entry = pte_to_swp_entry(*src_pte);
0767 struct page *page = pfn_swap_entry_to_page(entry);
0768
0769 if (trylock_page(page)) {
0770 restore_exclusive_pte(vma, page, addr, src_pte);
0771 unlock_page(page);
0772 return 0;
0773 }
0774
0775 return -EBUSY;
0776 }
0777
0778
0779
0780
0781
0782
0783
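/*
 * Copy one non-present source pte into the child at fork.  Returns 0 on
 * success, -EIO when a swap-count continuation must be allocated by the
 * caller, -EBUSY when a device-exclusive entry could not be restored,
 * and -ENOENT when the entry was turned back into a present pte that
 * the caller should copy via copy_present_pte() instead.
 */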
0784 static unsigned long
0785 copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
0786 pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *dst_vma,
0787 struct vm_area_struct *src_vma, unsigned long addr, int *rss)
0788 {
0789 unsigned long vm_flags = dst_vma->vm_flags;
0790 pte_t pte = *src_pte;
0791 struct page *page;
0792 swp_entry_t entry = pte_to_swp_entry(pte);
0793
0794 if (likely(!non_swap_entry(entry))) {
0795 if (swap_duplicate(entry) < 0)
0796 return -EIO;
0797
0798
0799 if (unlikely(list_empty(&dst_mm->mmlist))) {
0800 spin_lock(&mmlist_lock);
0801 if (list_empty(&dst_mm->mmlist))
0802 list_add(&dst_mm->mmlist,
0803 &src_mm->mmlist);
0804 spin_unlock(&mmlist_lock);
0805 }
0806
0807 if (pte_swp_exclusive(*src_pte)) {
0808 pte = pte_swp_clear_exclusive(*src_pte);
0809 set_pte_at(src_mm, addr, src_pte, pte);
0810 }
0811 rss[MM_SWAPENTS]++;
0812 } else if (is_migration_entry(entry)) {
0813 page = pfn_swap_entry_to_page(entry);
0814
0815 rss[mm_counter(page)]++;
0816
0817 if (!is_readable_migration_entry(entry) &&
0818 is_cow_mapping(vm_flags)) {
0819
0820
0821
0822
0823
0824 entry = make_readable_migration_entry(
0825 swp_offset(entry));
0826 pte = swp_entry_to_pte(entry);
0827 if (pte_swp_soft_dirty(*src_pte))
0828 pte = pte_swp_mksoft_dirty(pte);
0829 if (pte_swp_uffd_wp(*src_pte))
0830 pte = pte_swp_mkuffd_wp(pte);
0831 set_pte_at(src_mm, addr, src_pte, pte);
0832 }
0833 } else if (is_device_private_entry(entry)) {
0834 page = pfn_swap_entry_to_page(entry);
0835
0836
0837
0838
0839
0840
0841
0842
0843
0844
0845 get_page(page);
0846 rss[mm_counter(page)]++;
0847
0848 BUG_ON(page_try_dup_anon_rmap(page, false, src_vma));
0849
0850
0851
0852
0853
0854
0855
0856
0857 if (is_writable_device_private_entry(entry) &&
0858 is_cow_mapping(vm_flags)) {
0859 entry = make_readable_device_private_entry(
0860 swp_offset(entry));
0861 pte = swp_entry_to_pte(entry);
0862 if (pte_swp_uffd_wp(*src_pte))
0863 pte = pte_swp_mkuffd_wp(pte);
0864 set_pte_at(src_mm, addr, src_pte, pte);
0865 }
0866 } else if (is_device_exclusive_entry(entry)) {
0867
0868
0869
0870
0871
0872
0873 VM_BUG_ON(!is_cow_mapping(src_vma->vm_flags));
0874 if (try_restore_exclusive_pte(src_pte, src_vma, addr))
0875 return -EBUSY;
0876 return -ENOENT;
0877 } else if (is_pte_marker_entry(entry)) {
0878
0879
0880
0881
0882 WARN_ON_ONCE(!userfaultfd_wp(dst_vma));
0883 set_pte_at(dst_mm, addr, dst_pte, pte);
0884 return 0;
0885 }
0886 if (!userfaultfd_wp(dst_vma))
0887 pte = pte_swp_clear_uffd_wp(pte);
0888 set_pte_at(dst_mm, addr, dst_pte, pte);
0889 return 0;
0890 }
0891
0903
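/*
 * Copy a present anonymous page that cannot simply be shared with the
 * child (for example because it may be pinned): take the preallocated
 * destination page, copy the contents into it and map the copy in the
 * child.  Returns -EAGAIN if no preallocated page is available yet.
 */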
0904 static inline int
0905 copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
0906 pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss,
0907 struct page **prealloc, struct page *page)
0908 {
0909 struct page *new_page;
0910 pte_t pte;
0911
0912 new_page = *prealloc;
0913 if (!new_page)
0914 return -EAGAIN;
0915
0916
0917
0918
0919
0920 *prealloc = NULL;
0921 copy_user_highpage(new_page, page, addr, src_vma);
0922 __SetPageUptodate(new_page);
0923 page_add_new_anon_rmap(new_page, dst_vma, addr);
0924 lru_cache_add_inactive_or_unevictable(new_page, dst_vma);
0925 rss[mm_counter(new_page)]++;
0926
0927
0928 pte = mk_pte(new_page, dst_vma->vm_page_prot);
0929 pte = maybe_mkwrite(pte_mkdirty(pte), dst_vma);
0930 if (userfaultfd_pte_wp(dst_vma, *src_pte))
0931
0932 pte = pte_wrprotect(pte_mkuffd_wp(pte));
0933 set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
0934 return 0;
0935 }
0936
0937
0938
0939
0940
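/*
 * Copy one present pte into the child, write-protecting both copies
 * for COW mappings.  Returns -EAGAIN when the page has to be copied
 * through copy_present_page() and no preallocated page is available.
 */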
0941 static inline int
0942 copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
0943 pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss,
0944 struct page **prealloc)
0945 {
0946 struct mm_struct *src_mm = src_vma->vm_mm;
0947 unsigned long vm_flags = src_vma->vm_flags;
0948 pte_t pte = *src_pte;
0949 struct page *page;
0950
0951 page = vm_normal_page(src_vma, addr, pte);
0952 if (page && PageAnon(page)) {
0953
0954
0955
0956
0957
0958
0959 get_page(page);
0960 if (unlikely(page_try_dup_anon_rmap(page, false, src_vma))) {
0961
0962 put_page(page);
0963 return copy_present_page(dst_vma, src_vma, dst_pte, src_pte,
0964 addr, rss, prealloc, page);
0965 }
0966 rss[mm_counter(page)]++;
0967 } else if (page) {
0968 get_page(page);
0969 page_dup_file_rmap(page, false);
0970 rss[mm_counter(page)]++;
0971 }
0972
0973
0974
0975
0976
0977 if (is_cow_mapping(vm_flags) && pte_write(pte)) {
0978 ptep_set_wrprotect(src_mm, addr, src_pte);
0979 pte = pte_wrprotect(pte);
0980 }
0981 VM_BUG_ON(page && PageAnon(page) && PageAnonExclusive(page));
0982
0983
0984
0985
0986
0987 if (vm_flags & VM_SHARED)
0988 pte = pte_mkclean(pte);
0989 pte = pte_mkold(pte);
0990
0991 if (!userfaultfd_wp(dst_vma))
0992 pte = pte_clear_uffd_wp(pte);
0993
0994 set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
0995 return 0;
0996 }
0997
0998 static inline struct page *
0999 page_copy_prealloc(struct mm_struct *src_mm, struct vm_area_struct *vma,
1000 unsigned long addr)
1001 {
1002 struct page *new_page;
1003
1004 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, addr);
1005 if (!new_page)
1006 return NULL;
1007
1008 if (mem_cgroup_charge(page_folio(new_page), src_mm, GFP_KERNEL)) {
1009 put_page(new_page);
1010 return NULL;
1011 }
1012 cgroup_throttle_swaprate(new_page, GFP_KERNEL);
1013
1014 return new_page;
1015 }
1016
1017 static int
1018 copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
1019 pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
1020 unsigned long end)
1021 {
1022 struct mm_struct *dst_mm = dst_vma->vm_mm;
1023 struct mm_struct *src_mm = src_vma->vm_mm;
1024 pte_t *orig_src_pte, *orig_dst_pte;
1025 pte_t *src_pte, *dst_pte;
1026 spinlock_t *src_ptl, *dst_ptl;
1027 int progress, ret = 0;
1028 int rss[NR_MM_COUNTERS];
1029 swp_entry_t entry = (swp_entry_t){0};
1030 struct page *prealloc = NULL;
1031
1032 again:
1033 progress = 0;
1034 init_rss_vec(rss);
1035
1036 dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
1037 if (!dst_pte) {
1038 ret = -ENOMEM;
1039 goto out;
1040 }
1041 src_pte = pte_offset_map(src_pmd, addr);
1042 src_ptl = pte_lockptr(src_mm, src_pmd);
1043 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
1044 orig_src_pte = src_pte;
1045 orig_dst_pte = dst_pte;
1046 arch_enter_lazy_mmu_mode();
1047
1048 do {
1049
1050
1051
1052
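/*
 * We are holding two page-table locks here, either of which could
 * cause latency for another task on another CPU, so periodically
 * break out to reschedule or to relieve lock contention.
 */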
1053 if (progress >= 32) {
1054 progress = 0;
1055 if (need_resched() ||
1056 spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
1057 break;
1058 }
1059 if (pte_none(*src_pte)) {
1060 progress++;
1061 continue;
1062 }
1063 if (unlikely(!pte_present(*src_pte))) {
1064 ret = copy_nonpresent_pte(dst_mm, src_mm,
1065 dst_pte, src_pte,
1066 dst_vma, src_vma,
1067 addr, rss);
1068 if (ret == -EIO) {
1069 entry = pte_to_swp_entry(*src_pte);
1070 break;
1071 } else if (ret == -EBUSY) {
1072 break;
1073 } else if (!ret) {
1074 progress += 8;
1075 continue;
1076 }
1077
1078
1079
1080
1081
1082 WARN_ON_ONCE(ret != -ENOENT);
1083 }
1084
1085 ret = copy_present_pte(dst_vma, src_vma, dst_pte, src_pte,
1086 addr, rss, &prealloc);
1087
1088
1089
1090
1091 if (unlikely(ret == -EAGAIN))
1092 break;
1093 if (unlikely(prealloc)) {
1094
1095
1096
1097
1098
1099
1100 put_page(prealloc);
1101 prealloc = NULL;
1102 }
1103 progress += 8;
1104 } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
1105
1106 arch_leave_lazy_mmu_mode();
1107 spin_unlock(src_ptl);
1108 pte_unmap(orig_src_pte);
1109 add_mm_rss_vec(dst_mm, rss);
1110 pte_unmap_unlock(orig_dst_pte, dst_ptl);
1111 cond_resched();
1112
1113 if (ret == -EIO) {
1114 VM_WARN_ON_ONCE(!entry.val);
1115 if (add_swap_count_continuation(entry, GFP_KERNEL) < 0) {
1116 ret = -ENOMEM;
1117 goto out;
1118 }
1119 entry.val = 0;
1120 } else if (ret == -EBUSY) {
1121 goto out;
1122 } else if (ret == -EAGAIN) {
1123 prealloc = page_copy_prealloc(src_mm, src_vma, addr);
1124 if (!prealloc)
1125 return -ENOMEM;
1126 } else if (ret) {
1127 VM_WARN_ON_ONCE(1);
1128 }
1129
1130
1131 ret = 0;
1132
1133 if (addr != end)
1134 goto again;
1135 out:
1136 if (unlikely(prealloc))
1137 put_page(prealloc);
1138 return ret;
1139 }
1140
1141 static inline int
1142 copy_pmd_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
1143 pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
1144 unsigned long end)
1145 {
1146 struct mm_struct *dst_mm = dst_vma->vm_mm;
1147 struct mm_struct *src_mm = src_vma->vm_mm;
1148 pmd_t *src_pmd, *dst_pmd;
1149 unsigned long next;
1150
1151 dst_pmd = pmd_alloc(dst_mm, dst_pud, addr);
1152 if (!dst_pmd)
1153 return -ENOMEM;
1154 src_pmd = pmd_offset(src_pud, addr);
1155 do {
1156 next = pmd_addr_end(addr, end);
1157 if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd)
1158 || pmd_devmap(*src_pmd)) {
1159 int err;
1160 VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, src_vma);
1161 err = copy_huge_pmd(dst_mm, src_mm, dst_pmd, src_pmd,
1162 addr, dst_vma, src_vma);
1163 if (err == -ENOMEM)
1164 return -ENOMEM;
1165 if (!err)
1166 continue;
1167
1168 }
1169 if (pmd_none_or_clear_bad(src_pmd))
1170 continue;
1171 if (copy_pte_range(dst_vma, src_vma, dst_pmd, src_pmd,
1172 addr, next))
1173 return -ENOMEM;
1174 } while (dst_pmd++, src_pmd++, addr = next, addr != end);
1175 return 0;
1176 }
1177
1178 static inline int
1179 copy_pud_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
1180 p4d_t *dst_p4d, p4d_t *src_p4d, unsigned long addr,
1181 unsigned long end)
1182 {
1183 struct mm_struct *dst_mm = dst_vma->vm_mm;
1184 struct mm_struct *src_mm = src_vma->vm_mm;
1185 pud_t *src_pud, *dst_pud;
1186 unsigned long next;
1187
1188 dst_pud = pud_alloc(dst_mm, dst_p4d, addr);
1189 if (!dst_pud)
1190 return -ENOMEM;
1191 src_pud = pud_offset(src_p4d, addr);
1192 do {
1193 next = pud_addr_end(addr, end);
1194 if (pud_trans_huge(*src_pud) || pud_devmap(*src_pud)) {
1195 int err;
1196
1197 VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, src_vma);
1198 err = copy_huge_pud(dst_mm, src_mm,
1199 dst_pud, src_pud, addr, src_vma);
1200 if (err == -ENOMEM)
1201 return -ENOMEM;
1202 if (!err)
1203 continue;
1204
1205 }
1206 if (pud_none_or_clear_bad(src_pud))
1207 continue;
1208 if (copy_pmd_range(dst_vma, src_vma, dst_pud, src_pud,
1209 addr, next))
1210 return -ENOMEM;
1211 } while (dst_pud++, src_pud++, addr = next, addr != end);
1212 return 0;
1213 }
1214
1215 static inline int
1216 copy_p4d_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
1217 pgd_t *dst_pgd, pgd_t *src_pgd, unsigned long addr,
1218 unsigned long end)
1219 {
1220 struct mm_struct *dst_mm = dst_vma->vm_mm;
1221 p4d_t *src_p4d, *dst_p4d;
1222 unsigned long next;
1223
1224 dst_p4d = p4d_alloc(dst_mm, dst_pgd, addr);
1225 if (!dst_p4d)
1226 return -ENOMEM;
1227 src_p4d = p4d_offset(src_pgd, addr);
1228 do {
1229 next = p4d_addr_end(addr, end);
1230 if (p4d_none_or_clear_bad(src_p4d))
1231 continue;
1232 if (copy_pud_range(dst_vma, src_vma, dst_p4d, src_p4d,
1233 addr, next))
1234 return -ENOMEM;
1235 } while (dst_p4d++, src_p4d++, addr = next, addr != end);
1236 return 0;
1237 }
1238
1239
1240
1241
1242
1243
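/*
 * Decide whether fork() must copy this vma's page tables up front.
 * VMAs with uffd-wp enabled, raw PFN or mixed mappings, or existing
 * anonymous pages must be copied; other mappings can simply be
 * re-faulted lazily in the child.
 */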
1244 static bool
1245 vma_needs_copy(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
1246 {
1247
1248
1249
1250
1251
1252
1253 if (userfaultfd_wp(dst_vma))
1254 return true;
1255
1256 if (src_vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
1257 return true;
1258
1259 if (src_vma->anon_vma)
1260 return true;
1261
1262
1263
1264
1265
1266
1267
1268 return false;
1269 }
1270
1271 int
1272 copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
1273 {
1274 pgd_t *src_pgd, *dst_pgd;
1275 unsigned long next;
1276 unsigned long addr = src_vma->vm_start;
1277 unsigned long end = src_vma->vm_end;
1278 struct mm_struct *dst_mm = dst_vma->vm_mm;
1279 struct mm_struct *src_mm = src_vma->vm_mm;
1280 struct mmu_notifier_range range;
1281 bool is_cow;
1282 int ret;
1283
1284 if (!vma_needs_copy(dst_vma, src_vma))
1285 return 0;
1286
1287 if (is_vm_hugetlb_page(src_vma))
1288 return copy_hugetlb_page_range(dst_mm, src_mm, dst_vma, src_vma);
1289
1290 if (unlikely(src_vma->vm_flags & VM_PFNMAP)) {
1291
1292
1293
1294
1295 ret = track_pfn_copy(src_vma);
1296 if (ret)
1297 return ret;
1298 }
1299
1300
1301
1302
1303
1304
1305
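/*
 * A COW copy write-protects live ptes in the parent, so lockless
 * page-table walkers (fast GUP) are told to back off for the duration:
 * the copy is bracketed by mmu-notifier invalidation and the
 * mm->write_protect_seq sequence count.
 */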
1306 is_cow = is_cow_mapping(src_vma->vm_flags);
1307
1308 if (is_cow) {
1309 mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE,
1310 0, src_vma, src_mm, addr, end);
1311 mmu_notifier_invalidate_range_start(&range);
1312
1313
1314
1315
1316
1317
1318
1319 mmap_assert_write_locked(src_mm);
1320 raw_write_seqcount_begin(&src_mm->write_protect_seq);
1321 }
1322
1323 ret = 0;
1324 dst_pgd = pgd_offset(dst_mm, addr);
1325 src_pgd = pgd_offset(src_mm, addr);
1326 do {
1327 next = pgd_addr_end(addr, end);
1328 if (pgd_none_or_clear_bad(src_pgd))
1329 continue;
1330 if (unlikely(copy_p4d_range(dst_vma, src_vma, dst_pgd, src_pgd,
1331 addr, next))) {
1332 ret = -ENOMEM;
1333 break;
1334 }
1335 } while (dst_pgd++, src_pgd++, addr = next, addr != end);
1336
1337 if (is_cow) {
1338 raw_write_seqcount_end(&src_mm->write_protect_seq);
1339 mmu_notifier_invalidate_range_end(&range);
1340 }
1341 return ret;
1342 }
1343
1344
1345
1346
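/*
 * Parameters for the zap_*_range() family: restrict the unmap to a
 * single (locked) folio, optionally zap anonymous COW pages as well
 * (even_cows), and carry flags such as ZAP_FLAG_DROP_MARKER for
 * uffd-wp markers.
 */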
1347 struct zap_details {
1348 struct folio *single_folio;
1349 bool even_cows;
1350 zap_flags_t zap_flags;
1351 };
1352
1353
1354 static inline bool should_zap_cows(struct zap_details *details)
1355 {
1356
1357 if (!details)
1358 return true;
1359
1360
1361 return details->even_cows;
1362 }
1363
1364
1365 static inline bool should_zap_page(struct zap_details *details, struct page *page)
1366 {
1367
1368 if (should_zap_cows(details))
1369 return true;
1370
1371
1372 if (!page)
1373 return true;
1374
1375
1376 return !PageAnon(page);
1377 }
1378
1379 static inline bool zap_drop_file_uffd_wp(struct zap_details *details)
1380 {
1381 if (!details)
1382 return false;
1383
1384 return details->zap_flags & ZAP_FLAG_DROP_MARKER;
1385 }
1386
1387
1388
1389
1390
1391 static inline void
1392 zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
1393 unsigned long addr, pte_t *pte,
1394 struct zap_details *details, pte_t pteval)
1395 {
1396 if (zap_drop_file_uffd_wp(details))
1397 return;
1398
1399 pte_install_uffd_wp_if_needed(vma, addr, pte, pteval);
1400 }
1401
1402 static unsigned long zap_pte_range(struct mmu_gather *tlb,
1403 struct vm_area_struct *vma, pmd_t *pmd,
1404 unsigned long addr, unsigned long end,
1405 struct zap_details *details)
1406 {
1407 struct mm_struct *mm = tlb->mm;
1408 int force_flush = 0;
1409 int rss[NR_MM_COUNTERS];
1410 spinlock_t *ptl;
1411 pte_t *start_pte;
1412 pte_t *pte;
1413 swp_entry_t entry;
1414
1415 tlb_change_page_size(tlb, PAGE_SIZE);
1416 again:
1417 init_rss_vec(rss);
1418 start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
1419 pte = start_pte;
1420 flush_tlb_batched_pending(mm);
1421 arch_enter_lazy_mmu_mode();
1422 do {
1423 pte_t ptent = *pte;
1424 struct page *page;
1425
1426 if (pte_none(ptent))
1427 continue;
1428
1429 if (need_resched())
1430 break;
1431
1432 if (pte_present(ptent)) {
1433 page = vm_normal_page(vma, addr, ptent);
1434 if (unlikely(!should_zap_page(details, page)))
1435 continue;
1436 ptent = ptep_get_and_clear_full(mm, addr, pte,
1437 tlb->fullmm);
1438 tlb_remove_tlb_entry(tlb, pte, addr);
1439 zap_install_uffd_wp_if_needed(vma, addr, pte, details,
1440 ptent);
1441 if (unlikely(!page))
1442 continue;
1443
1444 if (!PageAnon(page)) {
1445 if (pte_dirty(ptent)) {
1446 force_flush = 1;
1447 set_page_dirty(page);
1448 }
1449 if (pte_young(ptent) &&
1450 likely(!(vma->vm_flags & VM_SEQ_READ)))
1451 mark_page_accessed(page);
1452 }
1453 rss[mm_counter(page)]--;
1454 page_remove_rmap(page, vma, false);
1455 if (unlikely(page_mapcount(page) < 0))
1456 print_bad_pte(vma, addr, ptent, page);
1457 if (unlikely(__tlb_remove_page(tlb, page))) {
1458 force_flush = 1;
1459 addr += PAGE_SIZE;
1460 break;
1461 }
1462 continue;
1463 }
1464
1465 entry = pte_to_swp_entry(ptent);
1466 if (is_device_private_entry(entry) ||
1467 is_device_exclusive_entry(entry)) {
1468 page = pfn_swap_entry_to_page(entry);
1469 if (unlikely(!should_zap_page(details, page)))
1470 continue;
1471
1472
1473
1474
1475
1476
1477 WARN_ON_ONCE(!vma_is_anonymous(vma));
1478 rss[mm_counter(page)]--;
1479 if (is_device_private_entry(entry))
1480 page_remove_rmap(page, vma, false);
1481 put_page(page);
1482 } else if (!non_swap_entry(entry)) {
1483
1484 if (!should_zap_cows(details))
1485 continue;
1486 rss[MM_SWAPENTS]--;
1487 if (unlikely(!free_swap_and_cache(entry)))
1488 print_bad_pte(vma, addr, ptent, NULL);
1489 } else if (is_migration_entry(entry)) {
1490 page = pfn_swap_entry_to_page(entry);
1491 if (!should_zap_page(details, page))
1492 continue;
1493 rss[mm_counter(page)]--;
1494 } else if (pte_marker_entry_uffd_wp(entry)) {
1495
1496 if (!zap_drop_file_uffd_wp(details))
1497 continue;
1498 } else if (is_hwpoison_entry(entry) ||
1499 is_swapin_error_entry(entry)) {
1500 if (!should_zap_cows(details))
1501 continue;
1502 } else {
1503
1504 WARN_ON_ONCE(1);
1505 }
1506 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
1507 zap_install_uffd_wp_if_needed(vma, addr, pte, details, ptent);
1508 } while (pte++, addr += PAGE_SIZE, addr != end);
1509
1510 add_mm_rss_vec(mm, rss);
1511 arch_leave_lazy_mmu_mode();
1512
1513
1514 if (force_flush)
1515 tlb_flush_mmu_tlbonly(tlb);
1516 pte_unmap_unlock(start_pte, ptl);
1517
1518
1519
1520
1521
1522
1523
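/*
 * If we forced a TLB flush (because we ran out of batch buffers or had
 * to flush dirty shared entries before dropping the pte lock), free the
 * batched pages now as well, then restart if the range was not
 * finished.
 */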
1524 if (force_flush) {
1525 force_flush = 0;
1526 tlb_flush_mmu(tlb);
1527 }
1528
1529 if (addr != end) {
1530 cond_resched();
1531 goto again;
1532 }
1533
1534 return addr;
1535 }
1536
1537 static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
1538 struct vm_area_struct *vma, pud_t *pud,
1539 unsigned long addr, unsigned long end,
1540 struct zap_details *details)
1541 {
1542 pmd_t *pmd;
1543 unsigned long next;
1544
1545 pmd = pmd_offset(pud, addr);
1546 do {
1547 next = pmd_addr_end(addr, end);
1548 if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
1549 if (next - addr != HPAGE_PMD_SIZE)
1550 __split_huge_pmd(vma, pmd, addr, false, NULL);
1551 else if (zap_huge_pmd(tlb, vma, pmd, addr))
1552 goto next;
1553
1554 } else if (details && details->single_folio &&
1555 folio_test_pmd_mappable(details->single_folio) &&
1556 next - addr == HPAGE_PMD_SIZE && pmd_none(*pmd)) {
1557 spinlock_t *ptl = pmd_lock(tlb->mm, pmd);
1558
1559
1560
1561
1562
1563 spin_unlock(ptl);
1564 }
1565
1566
1567
1568
1569
1570
1571
1572
1573 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
1574 goto next;
1575 next = zap_pte_range(tlb, vma, pmd, addr, next, details);
1576 next:
1577 cond_resched();
1578 } while (pmd++, addr = next, addr != end);
1579
1580 return addr;
1581 }
1582
1583 static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
1584 struct vm_area_struct *vma, p4d_t *p4d,
1585 unsigned long addr, unsigned long end,
1586 struct zap_details *details)
1587 {
1588 pud_t *pud;
1589 unsigned long next;
1590
1591 pud = pud_offset(p4d, addr);
1592 do {
1593 next = pud_addr_end(addr, end);
1594 if (pud_trans_huge(*pud) || pud_devmap(*pud)) {
1595 if (next - addr != HPAGE_PUD_SIZE) {
1596 mmap_assert_locked(tlb->mm);
1597 split_huge_pud(vma, pud, addr);
1598 } else if (zap_huge_pud(tlb, vma, pud, addr))
1599 goto next;
1600
1601 }
1602 if (pud_none_or_clear_bad(pud))
1603 continue;
1604 next = zap_pmd_range(tlb, vma, pud, addr, next, details);
1605 next:
1606 cond_resched();
1607 } while (pud++, addr = next, addr != end);
1608
1609 return addr;
1610 }
1611
1612 static inline unsigned long zap_p4d_range(struct mmu_gather *tlb,
1613 struct vm_area_struct *vma, pgd_t *pgd,
1614 unsigned long addr, unsigned long end,
1615 struct zap_details *details)
1616 {
1617 p4d_t *p4d;
1618 unsigned long next;
1619
1620 p4d = p4d_offset(pgd, addr);
1621 do {
1622 next = p4d_addr_end(addr, end);
1623 if (p4d_none_or_clear_bad(p4d))
1624 continue;
1625 next = zap_pud_range(tlb, vma, p4d, addr, next, details);
1626 } while (p4d++, addr = next, addr != end);
1627
1628 return addr;
1629 }
1630
1631 void unmap_page_range(struct mmu_gather *tlb,
1632 struct vm_area_struct *vma,
1633 unsigned long addr, unsigned long end,
1634 struct zap_details *details)
1635 {
1636 pgd_t *pgd;
1637 unsigned long next;
1638
1639 BUG_ON(addr >= end);
1640 tlb_start_vma(tlb, vma);
1641 pgd = pgd_offset(vma->vm_mm, addr);
1642 do {
1643 next = pgd_addr_end(addr, end);
1644 if (pgd_none_or_clear_bad(pgd))
1645 continue;
1646 next = zap_p4d_range(tlb, vma, pgd, addr, next, details);
1647 } while (pgd++, addr = next, addr != end);
1648 tlb_end_vma(tlb, vma);
1649 }
1650
1651
1652 static void unmap_single_vma(struct mmu_gather *tlb,
1653 struct vm_area_struct *vma, unsigned long start_addr,
1654 unsigned long end_addr,
1655 struct zap_details *details)
1656 {
1657 unsigned long start = max(vma->vm_start, start_addr);
1658 unsigned long end;
1659
1660 if (start >= vma->vm_end)
1661 return;
1662 end = min(vma->vm_end, end_addr);
1663 if (end <= vma->vm_start)
1664 return;
1665
1666 if (vma->vm_file)
1667 uprobe_munmap(vma, start, end);
1668
1669 if (unlikely(vma->vm_flags & VM_PFNMAP))
1670 untrack_pfn(vma, 0, 0);
1671
1672 if (start != end) {
1673 if (unlikely(is_vm_hugetlb_page(vma))) {
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
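/*
 * Hugetlb mappings are torn down under the file's i_mmap lock so that
 * huge-pmd sharing is handled safely.  A NULL vm_file can only happen
 * when the hugetlbfs ->mmap() method failed, in which case no ptes were
 * ever set up and there is nothing to unmap.
 */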
1685 if (vma->vm_file) {
1686 zap_flags_t zap_flags = details ?
1687 details->zap_flags : 0;
1688 i_mmap_lock_write(vma->vm_file->f_mapping);
1689 __unmap_hugepage_range_final(tlb, vma, start, end,
1690 NULL, zap_flags);
1691 i_mmap_unlock_write(vma->vm_file->f_mapping);
1692 }
1693 } else
1694 unmap_page_range(tlb, vma, start, end, details);
1695 }
1696 }
1697
1715
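/*
 * unmap_vmas - unmap a range of memory covered by a list of vma's
 * @tlb: address of the caller's struct mmu_gather
 * @vma: the starting vma
 * @start_addr: virtual address at which to start unmapping
 * @end_addr: virtual address at which to end unmapping
 *
 * Only addresses between the given bounds are touched.  The caller must
 * hold the mmap lock and is responsible for flushing the TLB and
 * freeing the gathered pages via the mmu_gather.
 */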
1716 void unmap_vmas(struct mmu_gather *tlb,
1717 struct vm_area_struct *vma, unsigned long start_addr,
1718 unsigned long end_addr)
1719 {
1720 struct mmu_notifier_range range;
1721 struct zap_details details = {
1722 .zap_flags = ZAP_FLAG_DROP_MARKER,
1723
1724 .even_cows = true,
1725 };
1726
1727 mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm,
1728 start_addr, end_addr);
1729 mmu_notifier_invalidate_range_start(&range);
1730 for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
1731 unmap_single_vma(tlb, vma, start_addr, end_addr, &details);
1732 mmu_notifier_invalidate_range_end(&range);
1733 }
1734
1735
1736
1737
1738
1739
1740
1741
1742
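/*
 * zap_page_range - remove user pages in a given range
 * @vma: vm_area_struct holding the applicable pages
 * @start: starting address of the pages to zap
 * @size: number of bytes to zap
 *
 * The caller must protect the vma list while this runs.
 */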
1743 void zap_page_range(struct vm_area_struct *vma, unsigned long start,
1744 unsigned long size)
1745 {
1746 struct mmu_notifier_range range;
1747 struct mmu_gather tlb;
1748
1749 lru_add_drain();
1750 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
1751 start, start + size);
1752 tlb_gather_mmu(&tlb, vma->vm_mm);
1753 update_hiwater_rss(vma->vm_mm);
1754 mmu_notifier_invalidate_range_start(&range);
1755 for ( ; vma && vma->vm_start < range.end; vma = vma->vm_next)
1756 unmap_single_vma(&tlb, vma, start, range.end, NULL);
1757 mmu_notifier_invalidate_range_end(&range);
1758 tlb_finish_mmu(&tlb);
1759 }
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
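/*
 * Like zap_page_range(), but limited to a single vma, with an optional
 * zap_details argument controlling what may be dropped.
 */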
1770 static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
1771 unsigned long size, struct zap_details *details)
1772 {
1773 struct mmu_notifier_range range;
1774 struct mmu_gather tlb;
1775
1776 lru_add_drain();
1777 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
1778 address, address + size);
1779 tlb_gather_mmu(&tlb, vma->vm_mm);
1780 update_hiwater_rss(vma->vm_mm);
1781 mmu_notifier_invalidate_range_start(&range);
1782 unmap_single_vma(&tlb, vma, address, range.end, details);
1783 mmu_notifier_invalidate_range_end(&range);
1784 tlb_finish_mmu(&tlb);
1785 }
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
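/*
 * zap_vma_ptes - remove ptes mapping the vma
 * @vma: vm_area_struct holding the ptes to be zapped
 * @address: starting address of the pages to zap
 * @size: number of bytes to zap
 *
 * Only VM_PFNMAP ranges that lie entirely inside the vma are accepted;
 * anything else is silently ignored.
 */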
1798 void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
1799 unsigned long size)
1800 {
1801 if (!range_in_vma(vma, address, address + size) ||
1802 !(vma->vm_flags & VM_PFNMAP))
1803 return;
1804
1805 zap_page_range_single(vma, address, size, NULL);
1806 }
1807 EXPORT_SYMBOL_GPL(zap_vma_ptes);
1808
1809 static pmd_t *walk_to_pmd(struct mm_struct *mm, unsigned long addr)
1810 {
1811 pgd_t *pgd;
1812 p4d_t *p4d;
1813 pud_t *pud;
1814 pmd_t *pmd;
1815
1816 pgd = pgd_offset(mm, addr);
1817 p4d = p4d_alloc(mm, pgd, addr);
1818 if (!p4d)
1819 return NULL;
1820 pud = pud_alloc(mm, p4d, addr);
1821 if (!pud)
1822 return NULL;
1823 pmd = pmd_alloc(mm, pud, addr);
1824 if (!pmd)
1825 return NULL;
1826
1827 VM_BUG_ON(pmd_trans_huge(*pmd));
1828 return pmd;
1829 }
1830
1831 pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
1832 spinlock_t **ptl)
1833 {
1834 pmd_t *pmd = walk_to_pmd(mm, addr);
1835
1836 if (!pmd)
1837 return NULL;
1838 return pte_alloc_map_lock(mm, pmd, addr, ptl);
1839 }
1840
1841 static int validate_page_before_insert(struct page *page)
1842 {
1843 if (PageAnon(page) || PageSlab(page) || page_has_type(page))
1844 return -EINVAL;
1845 flush_dcache_page(page);
1846 return 0;
1847 }
1848
1849 static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte,
1850 unsigned long addr, struct page *page, pgprot_t prot)
1851 {
1852 if (!pte_none(*pte))
1853 return -EBUSY;
1854
1855 get_page(page);
1856 inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
1857 page_add_file_rmap(page, vma, false);
1858 set_pte_at(vma->vm_mm, addr, pte, mk_pte(page, prot));
1859 return 0;
1860 }
1861
1862
1863
1864
1865
1866
1867
1868
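/*
 * Map a single kernel page into a user vma at @addr: validate the page,
 * take a reference, bump the file rmap and RSS counters and install the
 * pte under the page-table lock.
 */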
1869 static int insert_page(struct vm_area_struct *vma, unsigned long addr,
1870 struct page *page, pgprot_t prot)
1871 {
1872 int retval;
1873 pte_t *pte;
1874 spinlock_t *ptl;
1875
1876 retval = validate_page_before_insert(page);
1877 if (retval)
1878 goto out;
1879 retval = -ENOMEM;
1880 pte = get_locked_pte(vma->vm_mm, addr, &ptl);
1881 if (!pte)
1882 goto out;
1883 retval = insert_page_into_pte_locked(vma, pte, addr, page, prot);
1884 pte_unmap_unlock(pte, ptl);
1885 out:
1886 return retval;
1887 }
1888
1889 #ifdef pte_index
1890 static int insert_page_in_batch_locked(struct vm_area_struct *vma, pte_t *pte,
1891 unsigned long addr, struct page *page, pgprot_t prot)
1892 {
1893 int err;
1894
1895 if (!page_count(page))
1896 return -EINVAL;
1897 err = validate_page_before_insert(page);
1898 if (err)
1899 return err;
1900 return insert_page_into_pte_locked(vma, pte, addr, page, prot);
1901 }
1902
1903
1904
1905
1906 static int insert_pages(struct vm_area_struct *vma, unsigned long addr,
1907 struct page **pages, unsigned long *num, pgprot_t prot)
1908 {
1909 pmd_t *pmd = NULL;
1910 pte_t *start_pte, *pte;
1911 spinlock_t *pte_lock;
1912 struct mm_struct *const mm = vma->vm_mm;
1913 unsigned long curr_page_idx = 0;
1914 unsigned long remaining_pages_total = *num;
1915 unsigned long pages_to_write_in_pmd;
1916 int ret;
1917 more:
1918 ret = -EFAULT;
1919 pmd = walk_to_pmd(mm, addr);
1920 if (!pmd)
1921 goto out;
1922
1923 pages_to_write_in_pmd = min_t(unsigned long,
1924 remaining_pages_total, PTRS_PER_PTE - pte_index(addr));
1925
1926
1927 ret = -ENOMEM;
1928 if (pte_alloc(mm, pmd))
1929 goto out;
1930
1931 while (pages_to_write_in_pmd) {
1932 int pte_idx = 0;
1933 const int batch_size = min_t(int, pages_to_write_in_pmd, 8);
1934
1935 start_pte = pte_offset_map_lock(mm, pmd, addr, &pte_lock);
1936 for (pte = start_pte; pte_idx < batch_size; ++pte, ++pte_idx) {
1937 int err = insert_page_in_batch_locked(vma, pte,
1938 addr, pages[curr_page_idx], prot);
1939 if (unlikely(err)) {
1940 pte_unmap_unlock(start_pte, pte_lock);
1941 ret = err;
1942 remaining_pages_total -= pte_idx;
1943 goto out;
1944 }
1945 addr += PAGE_SIZE;
1946 ++curr_page_idx;
1947 }
1948 pte_unmap_unlock(start_pte, pte_lock);
1949 pages_to_write_in_pmd -= batch_size;
1950 remaining_pages_total -= batch_size;
1951 }
1952 if (remaining_pages_total)
1953 goto more;
1954 ret = 0;
1955 out:
1956 *num = remaining_pages_total;
1957 return ret;
1958 }
1959 #endif
1960
1975
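/*
 * vm_insert_pages - insert multiple pages into a user vma in one call
 * @vma: user vma to map into
 * @addr: target start address
 * @pages: array of source pages
 * @num: in: number of pages to map; out: number of pages left unmapped
 *
 * Batched variant of vm_insert_page() that takes the page-table lock at
 * most once per PMD instead of once per page.  Returns 0 on success, or
 * the first error with *num set to the number of pages not yet mapped.
 */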
1976 int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr,
1977 struct page **pages, unsigned long *num)
1978 {
1979 #ifdef pte_index
1980 const unsigned long end_addr = addr + (*num * PAGE_SIZE) - 1;
1981
1982 if (addr < vma->vm_start || end_addr >= vma->vm_end)
1983 return -EFAULT;
1984 if (!(vma->vm_flags & VM_MIXEDMAP)) {
1985 BUG_ON(mmap_read_trylock(vma->vm_mm));
1986 BUG_ON(vma->vm_flags & VM_PFNMAP);
1987 vma->vm_flags |= VM_MIXEDMAP;
1988 }
1989
1990 return insert_pages(vma, addr, pages, num, vma->vm_page_prot);
1991 #else
1992 unsigned long idx = 0, pgcount = *num;
1993 int err = -EINVAL;
1994
1995 for (; idx < pgcount; ++idx) {
1996 err = vm_insert_page(vma, addr + (PAGE_SIZE * idx), pages[idx]);
1997 if (err)
1998 break;
1999 }
2000 *num = pgcount - idx;
2001 return err;
2002 #endif
2003 }
2004 EXPORT_SYMBOL(vm_insert_pages);
2005
2034
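/*
 * vm_insert_page - insert a single page into a user vma
 * @vma: user vma to map into
 * @addr: target user address of this page
 * @page: source kernel page
 *
 * Lets a driver map a kernel-allocated page into user space; the page
 * must not be an anonymous, slab or otherwise typed page, and the
 * caller keeps ownership of it.  The first call marks the vma
 * VM_MIXEDMAP, so it must happen while the mmap lock is held for
 * writing (typically from the driver's ->mmap() handler).
 */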
2035 int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
2036 struct page *page)
2037 {
2038 if (addr < vma->vm_start || addr >= vma->vm_end)
2039 return -EFAULT;
2040 if (!page_count(page))
2041 return -EINVAL;
2042 if (!(vma->vm_flags & VM_MIXEDMAP)) {
2043 BUG_ON(mmap_read_trylock(vma->vm_mm));
2044 BUG_ON(vma->vm_flags & VM_PFNMAP);
2045 vma->vm_flags |= VM_MIXEDMAP;
2046 }
2047 return insert_page(vma, addr, page, vma->vm_page_prot);
2048 }
2049 EXPORT_SYMBOL(vm_insert_page);
2050
2061
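/*
 * Helper for vm_map_pages() and vm_map_pages_zero(): map a contiguous
 * run of pages from @pages, starting @offset entries in, across the
 * whole vma.  Fails with -ENXIO if the vma does not fit within the
 * supplied page array.
 */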
2062 static int __vm_map_pages(struct vm_area_struct *vma, struct page **pages,
2063 unsigned long num, unsigned long offset)
2064 {
2065 unsigned long count = vma_pages(vma);
2066 unsigned long uaddr = vma->vm_start;
2067 int ret, i;
2068
2069
2070 if (offset >= num)
2071 return -ENXIO;
2072
2073
2074 if (count > num - offset)
2075 return -ENXIO;
2076
2077 for (i = 0; i < count; i++) {
2078 ret = vm_insert_page(vma, uaddr, pages[offset + i]);
2079 if (ret < 0)
2080 return ret;
2081 uaddr += PAGE_SIZE;
2082 }
2083
2084 return 0;
2085 }
2086
2104
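/*
 * vm_map_pages - map an array of pages into a user vma, honouring vm_pgoff
 * @vma: user vma to map into
 * @pages: pointer to the array of source kernel pages
 * @num: number of pages in the array
 *
 * The vma's vm_pgoff is treated as an offset (in pages) into @pages,
 * which lets user space mmap() parts of the buffer.
 */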
2105 int vm_map_pages(struct vm_area_struct *vma, struct page **pages,
2106 unsigned long num)
2107 {
2108 return __vm_map_pages(vma, pages, num, vma->vm_pgoff);
2109 }
2110 EXPORT_SYMBOL(vm_map_pages);
2111
2124
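/*
 * vm_map_pages_zero - like vm_map_pages(), but ignoring vm_pgoff and
 * always mapping from the start of the page array.
 */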
2125 int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
2126 unsigned long num)
2127 {
2128 return __vm_map_pages(vma, pages, num, 0);
2129 }
2130 EXPORT_SYMBOL(vm_map_pages_zero);
2131
2132 static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
2133 pfn_t pfn, pgprot_t prot, bool mkwrite)
2134 {
2135 struct mm_struct *mm = vma->vm_mm;
2136 pte_t *pte, entry;
2137 spinlock_t *ptl;
2138
2139 pte = get_locked_pte(mm, addr, &ptl);
2140 if (!pte)
2141 return VM_FAULT_OOM;
2142 if (!pte_none(*pte)) {
2143 if (mkwrite) {
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
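/*
 * A pte is already installed.  For the *_mkwrite variants, upgrade it
 * to writable, but only if it still maps the pfn we were asked to
 * insert; otherwise we are racing with an invalidation and simply back
 * off.
 */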
2154 if (pte_pfn(*pte) != pfn_t_to_pfn(pfn)) {
2155 WARN_ON_ONCE(!is_zero_pfn(pte_pfn(*pte)));
2156 goto out_unlock;
2157 }
2158 entry = pte_mkyoung(*pte);
2159 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2160 if (ptep_set_access_flags(vma, addr, pte, entry, 1))
2161 update_mmu_cache(vma, addr, pte);
2162 }
2163 goto out_unlock;
2164 }
2165
2166
2167 if (pfn_t_devmap(pfn))
2168 entry = pte_mkdevmap(pfn_t_pte(pfn, prot));
2169 else
2170 entry = pte_mkspecial(pfn_t_pte(pfn, prot));
2171
2172 if (mkwrite) {
2173 entry = pte_mkyoung(entry);
2174 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2175 }
2176
2177 set_pte_at(mm, addr, pte, entry);
2178 update_mmu_cache(vma, addr, pte);
2179
2180 out_unlock:
2181 pte_unmap_unlock(pte, ptl);
2182 return VM_FAULT_NOPAGE;
2183 }
2184
2205
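/*
 * vmf_insert_pfn_prot - insert a single pfn into a user vma with the
 * supplied page protection
 * @vma: user vma to map into
 * @addr: target user address of this pfn
 * @pfn: source kernel pfn
 * @pgprot: pgprot flags for the inserted pte
 *
 * Like vmf_insert_pfn(), but lets the caller override the vma's
 * vm_page_prot.  Returns a VM_FAULT_ code rather than an errno.
 */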
2206 vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
2207 unsigned long pfn, pgprot_t pgprot)
2208 {
2209
2210
2211
2212
2213
2214
2215 BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
2216 BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
2217 (VM_PFNMAP|VM_MIXEDMAP));
2218 BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
2219 BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
2220
2221 if (addr < vma->vm_start || addr >= vma->vm_end)
2222 return VM_FAULT_SIGBUS;
2223
2224 if (!pfn_modify_allowed(pfn, pgprot))
2225 return VM_FAULT_SIGBUS;
2226
2227 track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV));
2228
2229 return insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot,
2230 false);
2231 }
2232 EXPORT_SYMBOL(vmf_insert_pfn_prot);
2233
2253
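/*
 * vmf_insert_pfn - insert a single pfn into a user vma
 * @vma: user vma to map into
 * @addr: target user address of this pfn
 * @pfn: source kernel pfn
 *
 * Similar to vm_insert_page(), but for pfns that may have no struct
 * page (VM_PFNMAP mappings).  Intended for use from fault handlers;
 * returns a VM_FAULT_ code.
 */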
2254 vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
2255 unsigned long pfn)
2256 {
2257 return vmf_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot);
2258 }
2259 EXPORT_SYMBOL(vmf_insert_pfn);
2260
2261 static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn)
2262 {
2263
2264 if (vma->vm_flags & VM_MIXEDMAP)
2265 return true;
2266 if (pfn_t_devmap(pfn))
2267 return true;
2268 if (pfn_t_special(pfn))
2269 return true;
2270 if (is_zero_pfn(pfn_t_to_pfn(pfn)))
2271 return true;
2272 return false;
2273 }
2274
2275 static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
2276 unsigned long addr, pfn_t pfn, pgprot_t pgprot,
2277 bool mkwrite)
2278 {
2279 int err;
2280
2281 BUG_ON(!vm_mixed_ok(vma, pfn));
2282
2283 if (addr < vma->vm_start || addr >= vma->vm_end)
2284 return VM_FAULT_SIGBUS;
2285
2286 track_pfn_insert(vma, &pgprot, pfn);
2287
2288 if (!pfn_modify_allowed(pfn_t_to_pfn(pfn), pgprot))
2289 return VM_FAULT_SIGBUS;
2290
2291
2292
2293
2294
2295
2296
2297
2298 if (!IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) &&
2299 !pfn_t_devmap(pfn) && pfn_t_valid(pfn)) {
2300 struct page *page;
2301
2302
2303
2304
2305
2306
2307 page = pfn_to_page(pfn_t_to_pfn(pfn));
2308 err = insert_page(vma, addr, page, pgprot);
2309 } else {
2310 return insert_pfn(vma, addr, pfn, pgprot, mkwrite);
2311 }
2312
2313 if (err == -ENOMEM)
2314 return VM_FAULT_OOM;
2315 if (err < 0 && err != -EBUSY)
2316 return VM_FAULT_SIGBUS;
2317
2318 return VM_FAULT_NOPAGE;
2319 }
2320
2346
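/*
 * vmf_insert_mixed_prot - insert a pfn into a VM_MIXEDMAP vma with a
 * caller-supplied protection.  When the architecture cannot mark raw
 * pfn ptes as special and the pfn is backed by a struct page, this
 * falls back to insert_page() so the page is refcounted normally,
 * matching what vm_normal_page() will later report.
 */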
2347 vm_fault_t vmf_insert_mixed_prot(struct vm_area_struct *vma, unsigned long addr,
2348 pfn_t pfn, pgprot_t pgprot)
2349 {
2350 return __vm_insert_mixed(vma, addr, pfn, pgprot, false);
2351 }
2352 EXPORT_SYMBOL(vmf_insert_mixed_prot);
2353
2354 vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
2355 pfn_t pfn)
2356 {
2357 return __vm_insert_mixed(vma, addr, pfn, vma->vm_page_prot, false);
2358 }
2359 EXPORT_SYMBOL(vmf_insert_mixed);
2360
2361
2362
2363
2364
2365
2366 vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
2367 unsigned long addr, pfn_t pfn)
2368 {
2369 return __vm_insert_mixed(vma, addr, pfn, vma->vm_page_prot, true);
2370 }
2371 EXPORT_SYMBOL(vmf_insert_mixed_mkwrite);
2372
2373
2374
2375
2376
2377
2378 static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
2379 unsigned long addr, unsigned long end,
2380 unsigned long pfn, pgprot_t prot)
2381 {
2382 pte_t *pte, *mapped_pte;
2383 spinlock_t *ptl;
2384 int err = 0;
2385
2386 mapped_pte = pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
2387 if (!pte)
2388 return -ENOMEM;
2389 arch_enter_lazy_mmu_mode();
2390 do {
2391 BUG_ON(!pte_none(*pte));
2392 if (!pfn_modify_allowed(pfn, prot)) {
2393 err = -EACCES;
2394 break;
2395 }
2396 set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
2397 pfn++;
2398 } while (pte++, addr += PAGE_SIZE, addr != end);
2399 arch_leave_lazy_mmu_mode();
2400 pte_unmap_unlock(mapped_pte, ptl);
2401 return err;
2402 }
2403
2404 static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
2405 unsigned long addr, unsigned long end,
2406 unsigned long pfn, pgprot_t prot)
2407 {
2408 pmd_t *pmd;
2409 unsigned long next;
2410 int err;
2411
2412 pfn -= addr >> PAGE_SHIFT;
2413 pmd = pmd_alloc(mm, pud, addr);
2414 if (!pmd)
2415 return -ENOMEM;
2416 VM_BUG_ON(pmd_trans_huge(*pmd));
2417 do {
2418 next = pmd_addr_end(addr, end);
2419 err = remap_pte_range(mm, pmd, addr, next,
2420 pfn + (addr >> PAGE_SHIFT), prot);
2421 if (err)
2422 return err;
2423 } while (pmd++, addr = next, addr != end);
2424 return 0;
2425 }
2426
2427 static inline int remap_pud_range(struct mm_struct *mm, p4d_t *p4d,
2428 unsigned long addr, unsigned long end,
2429 unsigned long pfn, pgprot_t prot)
2430 {
2431 pud_t *pud;
2432 unsigned long next;
2433 int err;
2434
2435 pfn -= addr >> PAGE_SHIFT;
2436 pud = pud_alloc(mm, p4d, addr);
2437 if (!pud)
2438 return -ENOMEM;
2439 do {
2440 next = pud_addr_end(addr, end);
2441 err = remap_pmd_range(mm, pud, addr, next,
2442 pfn + (addr >> PAGE_SHIFT), prot);
2443 if (err)
2444 return err;
2445 } while (pud++, addr = next, addr != end);
2446 return 0;
2447 }
2448
2449 static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd,
2450 unsigned long addr, unsigned long end,
2451 unsigned long pfn, pgprot_t prot)
2452 {
2453 p4d_t *p4d;
2454 unsigned long next;
2455 int err;
2456
2457 pfn -= addr >> PAGE_SHIFT;
2458 p4d = p4d_alloc(mm, pgd, addr);
2459 if (!p4d)
2460 return -ENOMEM;
2461 do {
2462 next = p4d_addr_end(addr, end);
2463 err = remap_pud_range(mm, p4d, addr, next,
2464 pfn + (addr >> PAGE_SHIFT), prot);
2465 if (err)
2466 return err;
2467 } while (p4d++, addr = next, addr != end);
2468 return 0;
2469 }
2470
2471
2472
2473
2474
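/*
 * Variant of remap_pfn_range() that skips the PAT/track_pfn
 * bookkeeping; the caller must have performed it already (or know it
 * is unnecessary).
 */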
2475 int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
2476 unsigned long pfn, unsigned long size, pgprot_t prot)
2477 {
2478 pgd_t *pgd;
2479 unsigned long next;
2480 unsigned long end = addr + PAGE_ALIGN(size);
2481 struct mm_struct *mm = vma->vm_mm;
2482 int err;
2483
2484 if (WARN_ON_ONCE(!PAGE_ALIGNED(addr)))
2485 return -EINVAL;
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505 if (is_cow_mapping(vma->vm_flags)) {
2506 if (addr != vma->vm_start || end != vma->vm_end)
2507 return -EINVAL;
2508 vma->vm_pgoff = pfn;
2509 }
2510
2511 vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
2512
2513 BUG_ON(addr >= end);
2514 pfn -= addr >> PAGE_SHIFT;
2515 pgd = pgd_offset(mm, addr);
2516 flush_cache_range(vma, addr, end);
2517 do {
2518 next = pgd_addr_end(addr, end);
2519 err = remap_p4d_range(mm, pgd, addr, next,
2520 pfn + (addr >> PAGE_SHIFT), prot);
2521 if (err)
2522 return err;
2523 } while (pgd++, addr = next, addr != end);
2524
2525 return 0;
2526 }
2527
2539
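/*
 * remap_pfn_range - remap kernel memory to userspace
 * @vma: user vma to map to
 * @addr: target page-aligned user address to start at
 * @pfn: page frame number of the kernel physical memory to map
 * @size: size of the mapping area
 * @prot: page protection flags for this mapping
 *
 * Note: this is only safe if the mm semaphore is held when called.
 */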
2540 int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
2541 unsigned long pfn, unsigned long size, pgprot_t prot)
2542 {
2543 int err;
2544
2545 err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size));
2546 if (err)
2547 return -EINVAL;
2548
2549 err = remap_pfn_range_notrack(vma, addr, pfn, size, prot);
2550 if (err)
2551 untrack_pfn(vma, pfn, PAGE_ALIGN(size));
2552 return err;
2553 }
2554 EXPORT_SYMBOL(remap_pfn_range);
2555
2570
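/*
 * vm_iomap_memory - remap memory to userspace
 * @vma: user vma to map to
 * @start: start of the physical memory to be mapped
 * @len: size of the area
 *
 * A simplified io_remap_pfn_range() wrapper for drivers: it maps the
 * whole vma onto the physical range described by @start/@len, taking
 * the vma's vm_pgoff into account and checking that everything fits.
 */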
2571 int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len)
2572 {
2573 unsigned long vm_len, pfn, pages;
2574
2575
2576 if (start + len < start)
2577 return -EINVAL;
2578
2579
2580
2581
2582
2583 len += start & ~PAGE_MASK;
2584 pfn = start >> PAGE_SHIFT;
2585 pages = (len + ~PAGE_MASK) >> PAGE_SHIFT;
2586 if (pfn + pages < pfn)
2587 return -EINVAL;
2588
2589
2590 if (vma->vm_pgoff > pages)
2591 return -EINVAL;
2592 pfn += vma->vm_pgoff;
2593 pages -= vma->vm_pgoff;
2594
2595
2596 vm_len = vma->vm_end - vma->vm_start;
2597 if (vm_len >> PAGE_SHIFT > pages)
2598 return -EINVAL;
2599
2600
2601 return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot);
2602 }
2603 EXPORT_SYMBOL(vm_iomap_memory);
2604
2605 static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
2606 unsigned long addr, unsigned long end,
2607 pte_fn_t fn, void *data, bool create,
2608 pgtbl_mod_mask *mask)
2609 {
2610 pte_t *pte, *mapped_pte;
2611 int err = 0;
2612 spinlock_t *ptl;
2613
2614 if (create) {
2615 mapped_pte = pte = (mm == &init_mm) ?
2616 pte_alloc_kernel_track(pmd, addr, mask) :
2617 pte_alloc_map_lock(mm, pmd, addr, &ptl);
2618 if (!pte)
2619 return -ENOMEM;
2620 } else {
2621 mapped_pte = pte = (mm == &init_mm) ?
2622 pte_offset_kernel(pmd, addr) :
2623 pte_offset_map_lock(mm, pmd, addr, &ptl);
2624 }
2625
2626 BUG_ON(pmd_huge(*pmd));
2627
2628 arch_enter_lazy_mmu_mode();
2629
2630 if (fn) {
2631 do {
2632 if (create || !pte_none(*pte)) {
2633 err = fn(pte++, addr, data);
2634 if (err)
2635 break;
2636 }
2637 } while (addr += PAGE_SIZE, addr != end);
2638 }
2639 *mask |= PGTBL_PTE_MODIFIED;
2640
2641 arch_leave_lazy_mmu_mode();
2642
2643 if (mm != &init_mm)
2644 pte_unmap_unlock(mapped_pte, ptl);
2645 return err;
2646 }
2647
2648 static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
2649 unsigned long addr, unsigned long end,
2650 pte_fn_t fn, void *data, bool create,
2651 pgtbl_mod_mask *mask)
2652 {
2653 pmd_t *pmd;
2654 unsigned long next;
2655 int err = 0;
2656
2657 BUG_ON(pud_huge(*pud));
2658
2659 if (create) {
2660 pmd = pmd_alloc_track(mm, pud, addr, mask);
2661 if (!pmd)
2662 return -ENOMEM;
2663 } else {
2664 pmd = pmd_offset(pud, addr);
2665 }
2666 do {
2667 next = pmd_addr_end(addr, end);
2668 if (pmd_none(*pmd) && !create)
2669 continue;
2670 if (WARN_ON_ONCE(pmd_leaf(*pmd)))
2671 return -EINVAL;
2672 if (!pmd_none(*pmd) && WARN_ON_ONCE(pmd_bad(*pmd))) {
2673 if (!create)
2674 continue;
2675 pmd_clear_bad(pmd);
2676 }
2677 err = apply_to_pte_range(mm, pmd, addr, next,
2678 fn, data, create, mask);
2679 if (err)
2680 break;
2681 } while (pmd++, addr = next, addr != end);
2682
2683 return err;
2684 }
2685
2686 static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d,
2687 unsigned long addr, unsigned long end,
2688 pte_fn_t fn, void *data, bool create,
2689 pgtbl_mod_mask *mask)
2690 {
2691 pud_t *pud;
2692 unsigned long next;
2693 int err = 0;
2694
2695 if (create) {
2696 pud = pud_alloc_track(mm, p4d, addr, mask);
2697 if (!pud)
2698 return -ENOMEM;
2699 } else {
2700 pud = pud_offset(p4d, addr);
2701 }
2702 do {
2703 next = pud_addr_end(addr, end);
2704 if (pud_none(*pud) && !create)
2705 continue;
2706 if (WARN_ON_ONCE(pud_leaf(*pud)))
2707 return -EINVAL;
2708 if (!pud_none(*pud) && WARN_ON_ONCE(pud_bad(*pud))) {
2709 if (!create)
2710 continue;
2711 pud_clear_bad(pud);
2712 }
2713 err = apply_to_pmd_range(mm, pud, addr, next,
2714 fn, data, create, mask);
2715 if (err)
2716 break;
2717 } while (pud++, addr = next, addr != end);
2718
2719 return err;
2720 }
2721
2722 static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd,
2723 unsigned long addr, unsigned long end,
2724 pte_fn_t fn, void *data, bool create,
2725 pgtbl_mod_mask *mask)
2726 {
2727 p4d_t *p4d;
2728 unsigned long next;
2729 int err = 0;
2730
2731 if (create) {
2732 p4d = p4d_alloc_track(mm, pgd, addr, mask);
2733 if (!p4d)
2734 return -ENOMEM;
2735 } else {
2736 p4d = p4d_offset(pgd, addr);
2737 }
2738 do {
2739 next = p4d_addr_end(addr, end);
2740 if (p4d_none(*p4d) && !create)
2741 continue;
2742 if (WARN_ON_ONCE(p4d_leaf(*p4d)))
2743 return -EINVAL;
2744 if (!p4d_none(*p4d) && WARN_ON_ONCE(p4d_bad(*p4d))) {
2745 if (!create)
2746 continue;
2747 p4d_clear_bad(p4d);
2748 }
2749 err = apply_to_pud_range(mm, p4d, addr, next,
2750 fn, data, create, mask);
2751 if (err)
2752 break;
2753 } while (p4d++, addr = next, addr != end);
2754
2755 return err;
2756 }
2757
2758 static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr,
2759 unsigned long size, pte_fn_t fn,
2760 void *data, bool create)
2761 {
2762 pgd_t *pgd;
2763 unsigned long start = addr, next;
2764 unsigned long end = addr + size;
2765 pgtbl_mod_mask mask = 0;
2766 int err = 0;
2767
2768 if (WARN_ON(addr >= end))
2769 return -EINVAL;
2770
2771 pgd = pgd_offset(mm, addr);
2772 do {
2773 next = pgd_addr_end(addr, end);
2774 if (pgd_none(*pgd) && !create)
2775 continue;
2776 if (WARN_ON_ONCE(pgd_leaf(*pgd)))
2777 return -EINVAL;
2778 if (!pgd_none(*pgd) && WARN_ON_ONCE(pgd_bad(*pgd))) {
2779 if (!create)
2780 continue;
2781 pgd_clear_bad(pgd);
2782 }
2783 err = apply_to_p4d_range(mm, pgd, addr, next,
2784 fn, data, create, &mask);
2785 if (err)
2786 break;
2787 } while (pgd++, addr = next, addr != end);
2788
2789 if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
2790 arch_sync_kernel_mappings(start, start + size);
2791
2792 return err;
2793 }
2794
2795 /*
2796  * Scan a region of virtual memory, filling in page tables as necessary
2797  * and calling a provided function on each leaf page table entry.
2798  */
2799 int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
2800 unsigned long size, pte_fn_t fn, void *data)
2801 {
2802 return __apply_to_page_range(mm, addr, size, fn, data, true);
2803 }
2804 EXPORT_SYMBOL_GPL(apply_to_page_range);
2805
2806 /*
2807  * Scan a region of virtual memory, calling a provided function on
2808  * each leaf page table entry where it exists.
2809  *
2810  * Unlike apply_to_page_range(), this does _not_ fill in page tables
2811  * where they are absent.
2812  */
2813 int apply_to_existing_page_range(struct mm_struct *mm, unsigned long addr,
2814 unsigned long size, pte_fn_t fn, void *data)
2815 {
2816 return __apply_to_page_range(mm, addr, size, fn, data, false);
2817 }
2818 EXPORT_SYMBOL_GPL(apply_to_existing_page_range);
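
/*
 * Usage sketch (illustrative, hypothetical names): the pte_fn_t callback is
 * invoked once per leaf PTE slot.  The callback below merely counts populated
 * entries; apply_to_existing_page_range() visits only page tables that already
 * exist, while apply_to_page_range() would also allocate the missing ones.
 * "vaddr" and "size" are assumed to describe a kernel virtual address range.
 *
 *	static int count_present_pte(pte_t *pte, unsigned long addr, void *data)
 *	{
 *		unsigned long *count = data;
 *
 *		if (!pte_none(*pte))
 *			(*count)++;
 *		return 0;
 *	}
 *
 *	unsigned long count = 0;
 *
 *	apply_to_existing_page_range(&init_mm, vaddr, size,
 *				     count_present_pte, &count);
 */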
2819
2820 /*
2821  * handle_pte_fault() chooses a page fault handler according to an entry which
2822  * was read non-atomically.  Before making any commitment, on those
2823  * architectures or configurations (e.g. i386 with PAE) which might give a mix
2824  * of unmatched parts, do_swap_page() must check under lock before unmapping
2825  * the pte and proceeding (but do_wp_page() is only called after already making
2826  * such a check; and do_anonymous_page() can safely check later on).
2827  */
2828 static inline int pte_unmap_same(struct vm_fault *vmf)
2829 {
2830 int same = 1;
2831 #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPTION)
2832 if (sizeof(pte_t) > sizeof(unsigned long)) {
2833 spinlock_t *ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
2834 spin_lock(ptl);
2835 same = pte_same(*vmf->pte, vmf->orig_pte);
2836 spin_unlock(ptl);
2837 }
2838 #endif
2839 pte_unmap(vmf->pte);
2840 vmf->pte = NULL;
2841 return same;
2842 }
2843
2844 static inline bool __wp_page_copy_user(struct page *dst, struct page *src,
2845 struct vm_fault *vmf)
2846 {
2847 bool ret;
2848 void *kaddr;
2849 void __user *uaddr;
2850 bool locked = false;
2851 struct vm_area_struct *vma = vmf->vma;
2852 struct mm_struct *mm = vma->vm_mm;
2853 unsigned long addr = vmf->address;
2854
2855 if (likely(src)) {
2856 copy_user_highpage(dst, src, addr, vma);
2857 return true;
2858 }
2859
2860
2861
2862
2863
2864
2865
2866 kaddr = kmap_atomic(dst);
2867 uaddr = (void __user *)(addr & PAGE_MASK);
2868
2869
2870
2871
2872
2873 if (arch_faults_on_old_pte() && !pte_young(vmf->orig_pte)) {
2874 pte_t entry;
2875
2876 vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
2877 locked = true;
2878 if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) {
2879
2880
2881
2882
2883 update_mmu_tlb(vma, addr, vmf->pte);
2884 ret = false;
2885 goto pte_unlock;
2886 }
2887
2888 entry = pte_mkyoung(vmf->orig_pte);
2889 if (ptep_set_access_flags(vma, addr, vmf->pte, entry, 0))
2890 update_mmu_cache(vma, addr, vmf->pte);
2891 }
2892
2893
2894
2895
2896
2897
2898
2899 if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) {
2900 if (locked)
2901 goto warn;
2902
2903
2904 vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
2905 locked = true;
2906 if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) {
2907
2908 update_mmu_tlb(vma, addr, vmf->pte);
2909 ret = false;
2910 goto pte_unlock;
2911 }
2912
2913
2914
2915
2916
2917 if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) {
2918
2919
2920
2921
2922 warn:
2923 WARN_ON_ONCE(1);
2924 clear_page(kaddr);
2925 }
2926 }
2927
2928 ret = true;
2929
2930 pte_unlock:
2931 if (locked)
2932 pte_unmap_unlock(vmf->pte, vmf->ptl);
2933 kunmap_atomic(kaddr);
2934 flush_dcache_page(dst);
2935
2936 return ret;
2937 }
2938
2939 static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma)
2940 {
2941 struct file *vm_file = vma->vm_file;
2942
2943 if (vm_file)
2944 return mapping_gfp_mask(vm_file->f_mapping) | __GFP_FS | __GFP_IO;
2945
2946
2947
2948
2949
2950 return GFP_KERNEL;
2951 }
2952
2953
2954
2955
2956
2957
2958
2959 static vm_fault_t do_page_mkwrite(struct vm_fault *vmf)
2960 {
2961 vm_fault_t ret;
2962 struct page *page = vmf->page;
2963 unsigned int old_flags = vmf->flags;
2964
2965 vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
2966
2967 if (vmf->vma->vm_file &&
2968 IS_SWAPFILE(vmf->vma->vm_file->f_mapping->host))
2969 return VM_FAULT_SIGBUS;
2970
2971 ret = vmf->vma->vm_ops->page_mkwrite(vmf);
2972
2973 vmf->flags = old_flags;
2974 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
2975 return ret;
2976 if (unlikely(!(ret & VM_FAULT_LOCKED))) {
2977 lock_page(page);
2978 if (!page->mapping) {
2979 unlock_page(page);
2980 return 0;
2981 }
2982 ret |= VM_FAULT_LOCKED;
2983 } else
2984 VM_BUG_ON_PAGE(!PageLocked(page), page);
2985 return ret;
2986 }
2987
2988
2989
2990
2991
2992
2993 static vm_fault_t fault_dirty_shared_page(struct vm_fault *vmf)
2994 {
2995 struct vm_area_struct *vma = vmf->vma;
2996 struct address_space *mapping;
2997 struct page *page = vmf->page;
2998 bool dirtied;
2999 bool page_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite;
3000
3001 dirtied = set_page_dirty(page);
3002 VM_BUG_ON_PAGE(PageAnon(page), page);
3003
3004
3005
3006
3007
3008
3009 mapping = page_rmapping(page);
3010 unlock_page(page);
3011
3012 if (!page_mkwrite)
3013 file_update_time(vma->vm_file);
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024 if ((dirtied || page_mkwrite) && mapping) {
3025 struct file *fpin;
3026
3027 fpin = maybe_unlock_mmap_for_io(vmf, NULL);
3028 balance_dirty_pages_ratelimited(mapping);
3029 if (fpin) {
3030 fput(fpin);
3031 return VM_FAULT_COMPLETED;
3032 }
3033 }
3034
3035 return 0;
3036 }
3037
3038 /*
3039  * Handle write page faults for pages that can be reused in the current vma.
3040  *
3041  * This can happen either due to the mapping being with the VM_SHARED flag,
3042  * or due to us being the last reference standing to the page.  In either
3043  * case, all we need to do here is to mark the page as writable and update
3044  * any related book-keeping.
3045  */
3046 static inline void wp_page_reuse(struct vm_fault *vmf)
3047 __releases(vmf->ptl)
3048 {
3049 struct vm_area_struct *vma = vmf->vma;
3050 struct page *page = vmf->page;
3051 pte_t entry;
3052
3053 VM_BUG_ON(!(vmf->flags & FAULT_FLAG_WRITE));
3054 VM_BUG_ON(page && PageAnon(page) && !PageAnonExclusive(page));
3055
3056
3057
3058
3059
3060
3061 if (page)
3062 page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1);
3063
3064 flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
3065 entry = pte_mkyoung(vmf->orig_pte);
3066 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
3067 if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1))
3068 update_mmu_cache(vma, vmf->address, vmf->pte);
3069 pte_unmap_unlock(vmf->pte, vmf->ptl);
3070 count_vm_event(PGREUSE);
3071 }
3072
3073 /*
3074  * Handle the case of a page which we actually need to copy to a new page,
3075  * either due to COW or unsharing.
3076  *
3077  * Called with mmap_lock locked and the old page referenced, but
3078  * without the ptl held.
3079  *
3080  * High level logic flow:
3081  *
3082  * - Allocate a page, copy the content of the old page to the new one.
3083  * - Handle book keeping and accounting - cgroups, mmu-notifiers, etc.
3084  * - Take the PTL.  If the pte changed, bail out and release the allocated page.
3085  * - If the pte is still the way we remember it, update the page table and all
3086  *   relevant references.  This includes dropping the reference the page table
3087  *   held to the old page, as well as updating the rmap.
3088  * - In any case, unlock the PTL and drop the reference we took to the old page.
3089  */
3090 static vm_fault_t wp_page_copy(struct vm_fault *vmf)
3091 {
3092 const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
3093 struct vm_area_struct *vma = vmf->vma;
3094 struct mm_struct *mm = vma->vm_mm;
3095 struct page *old_page = vmf->page;
3096 struct page *new_page = NULL;
3097 pte_t entry;
3098 int page_copied = 0;
3099 struct mmu_notifier_range range;
3100
3101 delayacct_wpcopy_start();
3102
3103 if (unlikely(anon_vma_prepare(vma)))
3104 goto oom;
3105
3106 if (is_zero_pfn(pte_pfn(vmf->orig_pte))) {
3107 new_page = alloc_zeroed_user_highpage_movable(vma,
3108 vmf->address);
3109 if (!new_page)
3110 goto oom;
3111 } else {
3112 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
3113 vmf->address);
3114 if (!new_page)
3115 goto oom;
3116
3117 if (!__wp_page_copy_user(new_page, old_page, vmf)) {
3118
3119
3120
3121
3122
3123
3124 put_page(new_page);
3125 if (old_page)
3126 put_page(old_page);
3127
3128 delayacct_wpcopy_end();
3129 return 0;
3130 }
3131 }
3132
3133 if (mem_cgroup_charge(page_folio(new_page), mm, GFP_KERNEL))
3134 goto oom_free_new;
3135 cgroup_throttle_swaprate(new_page, GFP_KERNEL);
3136
3137 __SetPageUptodate(new_page);
3138
3139 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
3140 vmf->address & PAGE_MASK,
3141 (vmf->address & PAGE_MASK) + PAGE_SIZE);
3142 mmu_notifier_invalidate_range_start(&range);
3143
3144
3145
3146
3147 vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl);
3148 if (likely(pte_same(*vmf->pte, vmf->orig_pte))) {
3149 if (old_page) {
3150 if (!PageAnon(old_page)) {
3151 dec_mm_counter_fast(mm,
3152 mm_counter_file(old_page));
3153 inc_mm_counter_fast(mm, MM_ANONPAGES);
3154 }
3155 } else {
3156 inc_mm_counter_fast(mm, MM_ANONPAGES);
3157 }
3158 flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
3159 entry = mk_pte(new_page, vma->vm_page_prot);
3160 entry = pte_sw_mkyoung(entry);
3161 if (unlikely(unshare)) {
3162 if (pte_soft_dirty(vmf->orig_pte))
3163 entry = pte_mksoft_dirty(entry);
3164 if (pte_uffd_wp(vmf->orig_pte))
3165 entry = pte_mkuffd_wp(entry);
3166 } else {
3167 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
3168 }
3169
3170
3171
3172
3173
3174
3175
3176
3177 ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
3178 page_add_new_anon_rmap(new_page, vma, vmf->address);
3179 lru_cache_add_inactive_or_unevictable(new_page, vma);
3180
3181
3182
3183
3184
3185 BUG_ON(unshare && pte_write(entry));
3186 set_pte_at_notify(mm, vmf->address, vmf->pte, entry);
3187 update_mmu_cache(vma, vmf->address, vmf->pte);
3188 if (old_page) {
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211 page_remove_rmap(old_page, vma, false);
3212 }
3213
3214
3215 new_page = old_page;
3216 page_copied = 1;
3217 } else {
3218 update_mmu_tlb(vma, vmf->address, vmf->pte);
3219 }
3220
3221 if (new_page)
3222 put_page(new_page);
3223
3224 pte_unmap_unlock(vmf->pte, vmf->ptl);
3225
3226
3227
3228
3229 mmu_notifier_invalidate_range_only_end(&range);
3230 if (old_page) {
3231 if (page_copied)
3232 free_swap_cache(old_page);
3233 put_page(old_page);
3234 }
3235
3236 delayacct_wpcopy_end();
3237 return (page_copied && !unshare) ? VM_FAULT_WRITE : 0;
3238 oom_free_new:
3239 put_page(new_page);
3240 oom:
3241 if (old_page)
3242 put_page(old_page);
3243
3244 delayacct_wpcopy_end();
3245 return VM_FAULT_OOM;
3246 }
3247
3248 /**
3249  * finish_mkwrite_fault - finish page fault for a shared mapping, making PTE
3250  *			  writeable once the page is prepared
3251  *
3252  * @vmf: structure describing the fault
3253  *
3254  * This function handles all that is needed to finish a write page fault in a
3255  * shared mapping due to the PTE being read-only once the mapped page is
3256  * prepared.  It handles locking of the PTE and modifying it.
3257  *
3258  * The function expects the page to be locked or other protection against
3259  * concurrent faults / writeback (such as DAX radix tree locks) to be held.
3260  *
3261  * Return: %0 on success, %VM_FAULT_NOPAGE when the PTE got changed before
3262  * we acquired the PTE lock.
3263  */
3264 vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf)
3265 {
3266 WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED));
3267 vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address,
3268 &vmf->ptl);
3269
3270
3271
3272
3273 if (!pte_same(*vmf->pte, vmf->orig_pte)) {
3274 update_mmu_tlb(vmf->vma, vmf->address, vmf->pte);
3275 pte_unmap_unlock(vmf->pte, vmf->ptl);
3276 return VM_FAULT_NOPAGE;
3277 }
3278 wp_page_reuse(vmf);
3279 return 0;
3280 }
3281
3282
3283
3284
3285
3286 static vm_fault_t wp_pfn_shared(struct vm_fault *vmf)
3287 {
3288 struct vm_area_struct *vma = vmf->vma;
3289
3290 if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) {
3291 vm_fault_t ret;
3292
3293 pte_unmap_unlock(vmf->pte, vmf->ptl);
3294 vmf->flags |= FAULT_FLAG_MKWRITE;
3295 ret = vma->vm_ops->pfn_mkwrite(vmf);
3296 if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))
3297 return ret;
3298 return finish_mkwrite_fault(vmf);
3299 }
3300 wp_page_reuse(vmf);
3301 return VM_FAULT_WRITE;
3302 }
3303
3304 static vm_fault_t wp_page_shared(struct vm_fault *vmf)
3305 __releases(vmf->ptl)
3306 {
3307 struct vm_area_struct *vma = vmf->vma;
3308 vm_fault_t ret = VM_FAULT_WRITE;
3309
3310 get_page(vmf->page);
3311
3312 if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
3313 vm_fault_t tmp;
3314
3315 pte_unmap_unlock(vmf->pte, vmf->ptl);
3316 tmp = do_page_mkwrite(vmf);
3317 if (unlikely(!tmp || (tmp &
3318 (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
3319 put_page(vmf->page);
3320 return tmp;
3321 }
3322 tmp = finish_mkwrite_fault(vmf);
3323 if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
3324 unlock_page(vmf->page);
3325 put_page(vmf->page);
3326 return tmp;
3327 }
3328 } else {
3329 wp_page_reuse(vmf);
3330 lock_page(vmf->page);
3331 }
3332 ret |= fault_dirty_shared_page(vmf);
3333 put_page(vmf->page);
3334
3335 return ret;
3336 }
3337
3338 /*
3339  * This routine handles present pages, when
3340  * * users try to write to a shared page (FAULT_FLAG_WRITE)
3341  * * GUP wants to take a R/O pin on a possibly shared anonymous page
3342  *   (FAULT_FLAG_UNSHARE)
3343  *
3344  * It is done by copying the page to a new address and decrementing the
3345  * shared-page counter for the old page.
3346  *
3347  * Note that this routine assumes that the protection checks have been
3348  * done by the caller (the low-level page fault routine in most cases).
3349  * Thus, with FAULT_FLAG_WRITE, we can safely just mark it writable once
3350  * we've done any necessary COW.
3351  *
3352  * In case of FAULT_FLAG_WRITE, we also mark the page dirty at this point
3353  * even though the page will change only once the write actually happens.
3354  * This avoids a few races, and potentially makes it more efficient.
3355  *
3356  * We enter with non-exclusive mmap_lock (to exclude vma changes,
3357  * but allow concurrent faults), with the pte both mapped and locked.
3358  * We return with mmap_lock still held, but pte unmapped and unlocked.
3359  */
3360 static vm_fault_t do_wp_page(struct vm_fault *vmf)
3361 __releases(vmf->ptl)
3362 {
3363 const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
3364 struct vm_area_struct *vma = vmf->vma;
3365
3366 VM_BUG_ON(unshare && (vmf->flags & FAULT_FLAG_WRITE));
3367 VM_BUG_ON(!unshare && !(vmf->flags & FAULT_FLAG_WRITE));
3368
3369 if (likely(!unshare)) {
3370 if (userfaultfd_pte_wp(vma, *vmf->pte)) {
3371 pte_unmap_unlock(vmf->pte, vmf->ptl);
3372 return handle_userfault(vmf, VM_UFFD_WP);
3373 }
3374
3375
3376
3377
3378
3379 if (unlikely(userfaultfd_wp(vmf->vma) &&
3380 mm_tlb_flush_pending(vmf->vma->vm_mm)))
3381 flush_tlb_page(vmf->vma, vmf->address);
3382 }
3383
3384 vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte);
3385 if (!vmf->page) {
3386 if (unlikely(unshare)) {
3387
3388 pte_unmap_unlock(vmf->pte, vmf->ptl);
3389 return 0;
3390 }
3391
3392
3393
3394
3395
3396
3397
3398
3399 if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
3400 (VM_WRITE|VM_SHARED))
3401 return wp_pfn_shared(vmf);
3402
3403 pte_unmap_unlock(vmf->pte, vmf->ptl);
3404 return wp_page_copy(vmf);
3405 }
3406
3407
3408
3409
3410
3411 if (PageAnon(vmf->page)) {
3412 struct page *page = vmf->page;
3413
3414
3415
3416
3417
3418 if (PageAnonExclusive(page))
3419 goto reuse;
3420
3421
3422
3423
3424
3425
3426
3427
3428 if (PageKsm(page) || page_count(page) > 3)
3429 goto copy;
3430 if (!PageLRU(page))
3431
3432
3433
3434
3435 lru_add_drain();
3436 if (page_count(page) > 1 + PageSwapCache(page))
3437 goto copy;
3438 if (!trylock_page(page))
3439 goto copy;
3440 if (PageSwapCache(page))
3441 try_to_free_swap(page);
3442 if (PageKsm(page) || page_count(page) != 1) {
3443 unlock_page(page);
3444 goto copy;
3445 }
3446
3447
3448
3449
3450
3451 page_move_anon_rmap(page, vma);
3452 unlock_page(page);
3453 reuse:
3454 if (unlikely(unshare)) {
3455 pte_unmap_unlock(vmf->pte, vmf->ptl);
3456 return 0;
3457 }
3458 wp_page_reuse(vmf);
3459 return VM_FAULT_WRITE;
3460 } else if (unshare) {
3461
3462 pte_unmap_unlock(vmf->pte, vmf->ptl);
3463 return 0;
3464 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
3465 (VM_WRITE|VM_SHARED))) {
3466 return wp_page_shared(vmf);
3467 }
3468 copy:
3469
3470
3471
3472 get_page(vmf->page);
3473
3474 pte_unmap_unlock(vmf->pte, vmf->ptl);
3475 #ifdef CONFIG_KSM
3476 if (PageKsm(vmf->page))
3477 count_vm_event(COW_KSM);
3478 #endif
3479 return wp_page_copy(vmf);
3480 }
3481
3482 static void unmap_mapping_range_vma(struct vm_area_struct *vma,
3483 unsigned long start_addr, unsigned long end_addr,
3484 struct zap_details *details)
3485 {
3486 zap_page_range_single(vma, start_addr, end_addr - start_addr, details);
3487 }
3488
3489 static inline void unmap_mapping_range_tree(struct rb_root_cached *root,
3490 pgoff_t first_index,
3491 pgoff_t last_index,
3492 struct zap_details *details)
3493 {
3494 struct vm_area_struct *vma;
3495 pgoff_t vba, vea, zba, zea;
3496
3497 vma_interval_tree_foreach(vma, root, first_index, last_index) {
3498 vba = vma->vm_pgoff;
3499 vea = vba + vma_pages(vma) - 1;
3500 zba = max(first_index, vba);
3501 zea = min(last_index, vea);
3502
3503 unmap_mapping_range_vma(vma,
3504 ((zba - vba) << PAGE_SHIFT) + vma->vm_start,
3505 ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
3506 details);
3507 }
3508 }
3509
3510 /**
3511  * unmap_mapping_folio() - Unmap single folio from processes.
3512  * @folio: The locked folio to be unmapped.
3513  *
3514  * Unmap this folio from any userspace process which still has it mmaped.
3515  * Typically, for efficiency, the range of nearby pages has already been
3516  * unmapped by unmap_mapping_pages() or unmap_mapping_range().  But once
3517  * truncation or invalidation holds the lock on a folio, it may find the
3518  * folio mapped again in the meantime and must then unmap this single
3519  * folio on its own.  The caller must hold the folio lock.
3520  */
3521 void unmap_mapping_folio(struct folio *folio)
3522 {
3523 struct address_space *mapping = folio->mapping;
3524 struct zap_details details = { };
3525 pgoff_t first_index;
3526 pgoff_t last_index;
3527
3528 VM_BUG_ON(!folio_test_locked(folio));
3529
3530 first_index = folio->index;
3531 last_index = folio->index + folio_nr_pages(folio) - 1;
3532
3533 details.even_cows = false;
3534 details.single_folio = folio;
3535 details.zap_flags = ZAP_FLAG_DROP_MARKER;
3536
3537 i_mmap_lock_read(mapping);
3538 if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
3539 unmap_mapping_range_tree(&mapping->i_mmap, first_index,
3540 last_index, &details);
3541 i_mmap_unlock_read(mapping);
3542 }
3543
3544 /**
3545  * unmap_mapping_pages() - Unmap pages from processes.
3546  * @mapping: The address space containing pages to be unmapped.
3547  * @start: Index of first page to be unmapped.
3548  * @nr: Number of pages to be unmapped.  0 to unmap to end of file.
3549  * @even_cows: Whether to unmap even private COWed pages.
3550  *
3551  * Unmap the pages in this address space from any userspace process which
3552  * has them mmaped.  Generally, you want to remove COWed pages as well when
3553  * a file is being truncated, but not when invalidating pages from the page
3554  * cache.
3555  */
3556 void unmap_mapping_pages(struct address_space *mapping, pgoff_t start,
3557 pgoff_t nr, bool even_cows)
3558 {
3559 struct zap_details details = { };
3560 pgoff_t first_index = start;
3561 pgoff_t last_index = start + nr - 1;
3562
3563 details.even_cows = even_cows;
3564 if (last_index < first_index)
3565 last_index = ULONG_MAX;
3566
3567 i_mmap_lock_read(mapping);
3568 if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
3569 unmap_mapping_range_tree(&mapping->i_mmap, first_index,
3570 last_index, &details);
3571 i_mmap_unlock_read(mapping);
3572 }
3573 EXPORT_SYMBOL_GPL(unmap_mapping_pages);
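
/*
 * Usage sketch (illustrative): a filesystem invalidating part of its page
 * cache can drop the user mappings of that range while leaving private COW
 * copies alone; "mapping", "index" and "nr" are assumed to come from the
 * caller:
 *
 *	unmap_mapping_pages(mapping, index, nr, false);
 */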
3574
3575 /**
3576  * unmap_mapping_range - unmap the portion of all mmaps in the specified
3577  * address_space corresponding to the specified byte range in the underlying
3578  * file.
3579  *
3580  * @mapping: the address space containing mmaps to be unmapped.
3581  * @holebegin: byte in first page to unmap, relative to the start of
3582  * the underlying file.  This will be rounded down to a PAGE_SIZE
3583  * boundary.  Note that this is different from truncate_pagecache(), which
3584  * must keep the partial page.  In contrast, we must get rid of
3585  * partial pages.
3586  * @holelen: size of prospective hole in bytes.  This will be rounded
3587  * up to a PAGE_SIZE boundary.  A holelen of zero truncates to the
3588  * end of the file.
3589  * @even_cows: 1 when truncating a file, unmap even private COWed pages;
3590  * but 0 when invalidating pagecache, don't throw away private data.
3591  */
3592 void unmap_mapping_range(struct address_space *mapping,
3593 loff_t const holebegin, loff_t const holelen, int even_cows)
3594 {
3595 pgoff_t hba = holebegin >> PAGE_SHIFT;
3596 pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
3597
3598
3599 if (sizeof(holelen) > sizeof(hlen)) {
3600 long long holeend =
3601 (holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
3602 if (holeend & ~(long long)ULONG_MAX)
3603 hlen = ULONG_MAX - hba + 1;
3604 }
3605
3606 unmap_mapping_pages(mapping, hba, hlen, even_cows);
3607 }
3608 EXPORT_SYMBOL(unmap_mapping_range);
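
/*
 * Usage sketch (illustrative): the classic caller is file truncation, where
 * even private COW copies beyond the new size must be removed, so even_cows
 * is set.  Compare truncate_pagecache(); "inode" and "newsize" are assumed:
 *
 *	struct address_space *mapping = inode->i_mapping;
 *	loff_t holebegin = round_up(newsize, PAGE_SIZE);
 *
 *	unmap_mapping_range(mapping, holebegin, 0, 1);
 *	truncate_inode_pages(mapping, newsize);
 *	unmap_mapping_range(mapping, holebegin, 0, 1);
 */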
3609
3610 /*
3611  * Restore a potential device exclusive pte to a working pte entry.
3612  */
3613 static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
3614 {
3615 struct page *page = vmf->page;
3616 struct vm_area_struct *vma = vmf->vma;
3617 struct mmu_notifier_range range;
3618
3619 if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags))
3620 return VM_FAULT_RETRY;
3621 mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0, vma,
3622 vma->vm_mm, vmf->address & PAGE_MASK,
3623 (vmf->address & PAGE_MASK) + PAGE_SIZE, NULL);
3624 mmu_notifier_invalidate_range_start(&range);
3625
3626 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
3627 &vmf->ptl);
3628 if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
3629 restore_exclusive_pte(vma, page, vmf->address, vmf->pte);
3630
3631 pte_unmap_unlock(vmf->pte, vmf->ptl);
3632 unlock_page(page);
3633
3634 mmu_notifier_invalidate_range_end(&range);
3635 return 0;
3636 }
3637
3638 static inline bool should_try_to_free_swap(struct page *page,
3639 struct vm_area_struct *vma,
3640 unsigned int fault_flags)
3641 {
3642 if (!PageSwapCache(page))
3643 return false;
3644 if (mem_cgroup_swap_full(page) || (vma->vm_flags & VM_LOCKED) ||
3645 PageMlocked(page))
3646 return true;
3647
3648
3649
3650
3651
3652
3653 return (fault_flags & FAULT_FLAG_WRITE) && !PageKsm(page) &&
3654 page_count(page) == 2;
3655 }
3656
3657 static vm_fault_t pte_marker_clear(struct vm_fault *vmf)
3658 {
3659 vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
3660 vmf->address, &vmf->ptl);
3661
3662
3663
3664
3665 if (is_pte_marker(*vmf->pte))
3666 pte_clear(vmf->vma->vm_mm, vmf->address, vmf->pte);
3667 pte_unmap_unlock(vmf->pte, vmf->ptl);
3668 return 0;
3669 }
3670
3671
3672
3673
3674
3675 static vm_fault_t pte_marker_handle_uffd_wp(struct vm_fault *vmf)
3676 {
3677
3678
3679
3680
3681
3682
3683 if (unlikely(!userfaultfd_wp(vmf->vma) || vma_is_anonymous(vmf->vma)))
3684 return pte_marker_clear(vmf);
3685
3686
3687 return do_fault(vmf);
3688 }
3689
3690 static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
3691 {
3692 swp_entry_t entry = pte_to_swp_entry(vmf->orig_pte);
3693 unsigned long marker = pte_marker_get(entry);
3694
3695
3696
3697
3698
3699
3700 if (WARN_ON_ONCE(vma_is_anonymous(vmf->vma) || !marker))
3701 return VM_FAULT_SIGBUS;
3702
3703 if (pte_marker_entry_uffd_wp(entry))
3704 return pte_marker_handle_uffd_wp(vmf);
3705
3706
3707 return VM_FAULT_SIGBUS;
3708 }
3709
3710 /*
3711  * We enter with non-exclusive mmap_lock (to exclude vma changes,
3712  * but allow concurrent faults), and pte mapped but not yet locked.
3713  * We return with pte unmapped and unlocked.
3714  *
3715  * We return with the mmap_lock locked or unlocked in the same cases
3716  * as does filemap_fault().
3717  */
3718 vm_fault_t do_swap_page(struct vm_fault *vmf)
3719 {
3720 struct vm_area_struct *vma = vmf->vma;
3721 struct page *page = NULL, *swapcache;
3722 struct swap_info_struct *si = NULL;
3723 rmap_t rmap_flags = RMAP_NONE;
3724 bool exclusive = false;
3725 swp_entry_t entry;
3726 pte_t pte;
3727 int locked;
3728 vm_fault_t ret = 0;
3729 void *shadow = NULL;
3730
3731 if (!pte_unmap_same(vmf))
3732 goto out;
3733
3734 entry = pte_to_swp_entry(vmf->orig_pte);
3735 if (unlikely(non_swap_entry(entry))) {
3736 if (is_migration_entry(entry)) {
3737 migration_entry_wait(vma->vm_mm, vmf->pmd,
3738 vmf->address);
3739 } else if (is_device_exclusive_entry(entry)) {
3740 vmf->page = pfn_swap_entry_to_page(entry);
3741 ret = remove_device_exclusive_entry(vmf);
3742 } else if (is_device_private_entry(entry)) {
3743 vmf->page = pfn_swap_entry_to_page(entry);
3744 ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
3745 } else if (is_hwpoison_entry(entry)) {
3746 ret = VM_FAULT_HWPOISON;
3747 } else if (is_swapin_error_entry(entry)) {
3748 ret = VM_FAULT_SIGBUS;
3749 } else if (is_pte_marker_entry(entry)) {
3750 ret = handle_pte_marker(vmf);
3751 } else {
3752 print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL);
3753 ret = VM_FAULT_SIGBUS;
3754 }
3755 goto out;
3756 }
3757
3758
3759 si = get_swap_device(entry);
3760 if (unlikely(!si))
3761 goto out;
3762
3763 page = lookup_swap_cache(entry, vma, vmf->address);
3764 swapcache = page;
3765
3766 if (!page) {
3767 if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
3768 __swap_count(entry) == 1) {
3769
3770 page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
3771 vmf->address);
3772 if (page) {
3773 __SetPageLocked(page);
3774 __SetPageSwapBacked(page);
3775
3776 if (mem_cgroup_swapin_charge_page(page,
3777 vma->vm_mm, GFP_KERNEL, entry)) {
3778 ret = VM_FAULT_OOM;
3779 goto out_page;
3780 }
3781 mem_cgroup_swapin_uncharge_swap(entry);
3782
3783 shadow = get_shadow_from_swap_cache(entry);
3784 if (shadow)
3785 workingset_refault(page_folio(page),
3786 shadow);
3787
3788 lru_cache_add(page);
3789
3790
3791 set_page_private(page, entry.val);
3792 swap_readpage(page, true, NULL);
3793 set_page_private(page, 0);
3794 }
3795 } else {
3796 page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
3797 vmf);
3798 swapcache = page;
3799 }
3800
3801 if (!page) {
3802
3803
3804
3805
3806 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
3807 vmf->address, &vmf->ptl);
3808 if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
3809 ret = VM_FAULT_OOM;
3810 goto unlock;
3811 }
3812
3813
3814 ret = VM_FAULT_MAJOR;
3815 count_vm_event(PGMAJFAULT);
3816 count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
3817 } else if (PageHWPoison(page)) {
3818
3819
3820
3821
3822 ret = VM_FAULT_HWPOISON;
3823 goto out_release;
3824 }
3825
3826 locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags);
3827
3828 if (!locked) {
3829 ret |= VM_FAULT_RETRY;
3830 goto out_release;
3831 }
3832
3833 if (swapcache) {
3834
3835
3836
3837
3838
3839
3840
3841 if (unlikely(!PageSwapCache(page) ||
3842 page_private(page) != entry.val))
3843 goto out_page;
3844
3845
3846
3847
3848
3849
3850 page = ksm_might_need_to_copy(page, vma, vmf->address);
3851 if (unlikely(!page)) {
3852 ret = VM_FAULT_OOM;
3853 page = swapcache;
3854 goto out_page;
3855 }
3856
3857
3858
3859
3860
3861
3862
3863 if ((vmf->flags & FAULT_FLAG_WRITE) && page == swapcache &&
3864 !PageKsm(page) && !PageLRU(page))
3865 lru_add_drain();
3866 }
3867
3868 cgroup_throttle_swaprate(page, GFP_KERNEL);
3869
3870
3871
3872
3873 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
3874 &vmf->ptl);
3875 if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte)))
3876 goto out_nomap;
3877
3878 if (unlikely(!PageUptodate(page))) {
3879 ret = VM_FAULT_SIGBUS;
3880 goto out_nomap;
3881 }
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891 BUG_ON(!PageAnon(page) && PageMappedToDisk(page));
3892 BUG_ON(PageAnon(page) && PageAnonExclusive(page));
3893
3894
3895
3896
3897
3898 if (!PageKsm(page)) {
3899
3900
3901
3902
3903 exclusive = pte_swp_exclusive(vmf->orig_pte);
3904 if (page != swapcache) {
3905
3906
3907
3908
3909 exclusive = true;
3910 } else if (exclusive && PageWriteback(page) &&
3911 data_race(si->flags & SWP_STABLE_WRITES)) {
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930 exclusive = false;
3931 }
3932 }
3933
3934
3935
3936
3937
3938
3939 swap_free(entry);
3940 if (should_try_to_free_swap(page, vma, vmf->flags))
3941 try_to_free_swap(page);
3942
3943 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
3944 dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);
3945 pte = mk_pte(page, vma->vm_page_prot);
3946
3947
3948
3949
3950
3951
3952
3953 if (!PageKsm(page) && (exclusive || page_count(page) == 1)) {
3954 if (vmf->flags & FAULT_FLAG_WRITE) {
3955 pte = maybe_mkwrite(pte_mkdirty(pte), vma);
3956 vmf->flags &= ~FAULT_FLAG_WRITE;
3957 ret |= VM_FAULT_WRITE;
3958 }
3959 rmap_flags |= RMAP_EXCLUSIVE;
3960 }
3961 flush_icache_page(vma, page);
3962 if (pte_swp_soft_dirty(vmf->orig_pte))
3963 pte = pte_mksoft_dirty(pte);
3964 if (pte_swp_uffd_wp(vmf->orig_pte)) {
3965 pte = pte_mkuffd_wp(pte);
3966 pte = pte_wrprotect(pte);
3967 }
3968 vmf->orig_pte = pte;
3969
3970
3971 if (unlikely(page != swapcache && swapcache)) {
3972 page_add_new_anon_rmap(page, vma, vmf->address);
3973 lru_cache_add_inactive_or_unevictable(page, vma);
3974 } else {
3975 page_add_anon_rmap(page, vma, vmf->address, rmap_flags);
3976 }
3977
3978 VM_BUG_ON(!PageAnon(page) || (pte_write(pte) && !PageAnonExclusive(page)));
3979 set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
3980 arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
3981
3982 unlock_page(page);
3983 if (page != swapcache && swapcache) {
3984
3985
3986
3987
3988
3989
3990
3991
3992 unlock_page(swapcache);
3993 put_page(swapcache);
3994 }
3995
3996 if (vmf->flags & FAULT_FLAG_WRITE) {
3997 ret |= do_wp_page(vmf);
3998 if (ret & VM_FAULT_ERROR)
3999 ret &= VM_FAULT_ERROR;
4000 goto out;
4001 }
4002
4003
4004 update_mmu_cache(vma, vmf->address, vmf->pte);
4005 unlock:
4006 pte_unmap_unlock(vmf->pte, vmf->ptl);
4007 out:
4008 if (si)
4009 put_swap_device(si);
4010 return ret;
4011 out_nomap:
4012 pte_unmap_unlock(vmf->pte, vmf->ptl);
4013 out_page:
4014 unlock_page(page);
4015 out_release:
4016 put_page(page);
4017 if (page != swapcache && swapcache) {
4018 unlock_page(swapcache);
4019 put_page(swapcache);
4020 }
4021 if (si)
4022 put_swap_device(si);
4023 return ret;
4024 }
4025
4026 /*
4027  * We enter with non-exclusive mmap_lock (to exclude vma changes,
4028  * but allow concurrent faults), and pte mapped but not yet locked.
4029  * We return with mmap_lock still held, but pte unmapped and unlocked.
4030  */
4031 static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
4032 {
4033 struct vm_area_struct *vma = vmf->vma;
4034 struct page *page;
4035 vm_fault_t ret = 0;
4036 pte_t entry;
4037
4038
4039 if (vma->vm_flags & VM_SHARED)
4040 return VM_FAULT_SIGBUS;
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052 if (pte_alloc(vma->vm_mm, vmf->pmd))
4053 return VM_FAULT_OOM;
4054
4055
4056 if (unlikely(pmd_trans_unstable(vmf->pmd)))
4057 return 0;
4058
4059
4060 if (!(vmf->flags & FAULT_FLAG_WRITE) &&
4061 !mm_forbids_zeropage(vma->vm_mm)) {
4062 entry = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address),
4063 vma->vm_page_prot));
4064 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
4065 vmf->address, &vmf->ptl);
4066 if (!pte_none(*vmf->pte)) {
4067 update_mmu_tlb(vma, vmf->address, vmf->pte);
4068 goto unlock;
4069 }
4070 ret = check_stable_address_space(vma->vm_mm);
4071 if (ret)
4072 goto unlock;
4073
4074 if (userfaultfd_missing(vma)) {
4075 pte_unmap_unlock(vmf->pte, vmf->ptl);
4076 return handle_userfault(vmf, VM_UFFD_MISSING);
4077 }
4078 goto setpte;
4079 }
4080
4081
4082 if (unlikely(anon_vma_prepare(vma)))
4083 goto oom;
4084 page = alloc_zeroed_user_highpage_movable(vma, vmf->address);
4085 if (!page)
4086 goto oom;
4087
4088 if (mem_cgroup_charge(page_folio(page), vma->vm_mm, GFP_KERNEL))
4089 goto oom_free_page;
4090 cgroup_throttle_swaprate(page, GFP_KERNEL);
4091
4092
4093
4094
4095
4096
4097 __SetPageUptodate(page);
4098
4099 entry = mk_pte(page, vma->vm_page_prot);
4100 entry = pte_sw_mkyoung(entry);
4101 if (vma->vm_flags & VM_WRITE)
4102 entry = pte_mkwrite(pte_mkdirty(entry));
4103
4104 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
4105 &vmf->ptl);
4106 if (!pte_none(*vmf->pte)) {
4107 update_mmu_cache(vma, vmf->address, vmf->pte);
4108 goto release;
4109 }
4110
4111 ret = check_stable_address_space(vma->vm_mm);
4112 if (ret)
4113 goto release;
4114
4115
4116 if (userfaultfd_missing(vma)) {
4117 pte_unmap_unlock(vmf->pte, vmf->ptl);
4118 put_page(page);
4119 return handle_userfault(vmf, VM_UFFD_MISSING);
4120 }
4121
4122 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
4123 page_add_new_anon_rmap(page, vma, vmf->address);
4124 lru_cache_add_inactive_or_unevictable(page, vma);
4125 setpte:
4126 set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
4127
4128
4129 update_mmu_cache(vma, vmf->address, vmf->pte);
4130 unlock:
4131 pte_unmap_unlock(vmf->pte, vmf->ptl);
4132 return ret;
4133 release:
4134 put_page(page);
4135 goto unlock;
4136 oom_free_page:
4137 put_page(page);
4138 oom:
4139 return VM_FAULT_OOM;
4140 }
4141
4142 /*
4143  * The mmap_lock must have been held on entry, and may have been
4144  * released depending on flags and vma->vm_ops->fault() return value.
4145  * See filemap_fault() and __folio_lock_or_retry().
4146  */
4147 static vm_fault_t __do_fault(struct vm_fault *vmf)
4148 {
4149 struct vm_area_struct *vma = vmf->vma;
4150 vm_fault_t ret;
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167 if (pmd_none(*vmf->pmd) && !vmf->prealloc_pte) {
4168 vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
4169 if (!vmf->prealloc_pte)
4170 return VM_FAULT_OOM;
4171 }
4172
4173 ret = vma->vm_ops->fault(vmf);
4174 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY |
4175 VM_FAULT_DONE_COW)))
4176 return ret;
4177
4178 if (unlikely(PageHWPoison(vmf->page))) {
4179 struct page *page = vmf->page;
4180 vm_fault_t poisonret = VM_FAULT_HWPOISON;
4181 if (ret & VM_FAULT_LOCKED) {
4182 if (page_mapped(page))
4183 unmap_mapping_pages(page_mapping(page),
4184 page->index, 1, false);
4185
4186 if (invalidate_inode_page(page))
4187 poisonret = VM_FAULT_NOPAGE;
4188 unlock_page(page);
4189 }
4190 put_page(page);
4191 vmf->page = NULL;
4192 return poisonret;
4193 }
4194
4195 if (unlikely(!(ret & VM_FAULT_LOCKED)))
4196 lock_page(vmf->page);
4197 else
4198 VM_BUG_ON_PAGE(!PageLocked(vmf->page), vmf->page);
4199
4200 return ret;
4201 }
4202
4203 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
4204 static void deposit_prealloc_pte(struct vm_fault *vmf)
4205 {
4206 struct vm_area_struct *vma = vmf->vma;
4207
4208 pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
4209
4210
4211
4212
4213 mm_inc_nr_ptes(vma->vm_mm);
4214 vmf->prealloc_pte = NULL;
4215 }
4216
4217 vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
4218 {
4219 struct vm_area_struct *vma = vmf->vma;
4220 bool write = vmf->flags & FAULT_FLAG_WRITE;
4221 unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
4222 pmd_t entry;
4223 int i;
4224 vm_fault_t ret = VM_FAULT_FALLBACK;
4225
4226 if (!transhuge_vma_suitable(vma, haddr))
4227 return ret;
4228
4229 page = compound_head(page);
4230 if (compound_order(page) != HPAGE_PMD_ORDER)
4231 return ret;
4232
4233
4234
4235
4236
4237
4238
4239 if (unlikely(PageHasHWPoisoned(page)))
4240 return ret;
4241
4242
4243
4244
4245
4246 if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) {
4247 vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
4248 if (!vmf->prealloc_pte)
4249 return VM_FAULT_OOM;
4250 }
4251
4252 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
4253 if (unlikely(!pmd_none(*vmf->pmd)))
4254 goto out;
4255
4256 for (i = 0; i < HPAGE_PMD_NR; i++)
4257 flush_icache_page(vma, page + i);
4258
4259 entry = mk_huge_pmd(page, vma->vm_page_prot);
4260 if (write)
4261 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
4262
4263 add_mm_counter(vma->vm_mm, mm_counter_file(page), HPAGE_PMD_NR);
4264 page_add_file_rmap(page, vma, true);
4265
4266
4267
4268
4269 if (arch_needs_pgtable_deposit())
4270 deposit_prealloc_pte(vmf);
4271
4272 set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
4273
4274 update_mmu_cache_pmd(vma, haddr, vmf->pmd);
4275
4276
4277 ret = 0;
4278 count_vm_event(THP_FILE_MAPPED);
4279 out:
4280 spin_unlock(vmf->ptl);
4281 return ret;
4282 }
4283 #else
4284 vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
4285 {
4286 return VM_FAULT_FALLBACK;
4287 }
4288 #endif
4289
4290 void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
4291 {
4292 struct vm_area_struct *vma = vmf->vma;
4293 bool uffd_wp = pte_marker_uffd_wp(vmf->orig_pte);
4294 bool write = vmf->flags & FAULT_FLAG_WRITE;
4295 bool prefault = vmf->address != addr;
4296 pte_t entry;
4297
4298 flush_icache_page(vma, page);
4299 entry = mk_pte(page, vma->vm_page_prot);
4300
4301 if (prefault && arch_wants_old_prefaulted_pte())
4302 entry = pte_mkold(entry);
4303 else
4304 entry = pte_sw_mkyoung(entry);
4305
4306 if (write)
4307 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
4308 if (unlikely(uffd_wp))
4309 entry = pte_mkuffd_wp(pte_wrprotect(entry));
4310
4311 if (write && !(vma->vm_flags & VM_SHARED)) {
4312 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
4313 page_add_new_anon_rmap(page, vma, addr);
4314 lru_cache_add_inactive_or_unevictable(page, vma);
4315 } else {
4316 inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
4317 page_add_file_rmap(page, vma, false);
4318 }
4319 set_pte_at(vma->vm_mm, addr, vmf->pte, entry);
4320 }
4321
4322 static bool vmf_pte_changed(struct vm_fault *vmf)
4323 {
4324 if (vmf->flags & FAULT_FLAG_ORIG_PTE_VALID)
4325 return !pte_same(*vmf->pte, vmf->orig_pte);
4326
4327 return !pte_none(*vmf->pte);
4328 }
4329
4330 /**
4331  * finish_fault - finish page fault once we have prepared the page to fault
4332  *
4333  * @vmf: structure describing the fault
4334  *
4335  * This function handles all that is needed to finish a page fault once the
4336  * page to fault in is prepared.  It handles locking of PTEs, inserts the PTE
4337  * for the given page, adds the reverse page mapping and updates the relevant
4338  * memory counters.
4339  *
4340  * The function expects the page to be locked and on success it consumes a
4341  * reference of a page being mapped (for the PTE which maps it).
4342  *
4343  * Return: %0 on success, %VM_FAULT_ code in case of error.
4344  */
4345 vm_fault_t finish_fault(struct vm_fault *vmf)
4346 {
4347 struct vm_area_struct *vma = vmf->vma;
4348 struct page *page;
4349 vm_fault_t ret;
4350
4351
4352 if ((vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED))
4353 page = vmf->cow_page;
4354 else
4355 page = vmf->page;
4356
4357
4358
4359
4360
4361 if (!(vma->vm_flags & VM_SHARED)) {
4362 ret = check_stable_address_space(vma->vm_mm);
4363 if (ret)
4364 return ret;
4365 }
4366
4367 if (pmd_none(*vmf->pmd)) {
4368 if (PageTransCompound(page)) {
4369 ret = do_set_pmd(vmf, page);
4370 if (ret != VM_FAULT_FALLBACK)
4371 return ret;
4372 }
4373
4374 if (vmf->prealloc_pte)
4375 pmd_install(vma->vm_mm, vmf->pmd, &vmf->prealloc_pte);
4376 else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd)))
4377 return VM_FAULT_OOM;
4378 }
4379
4380
4381
4382
4383
4384 if (pmd_devmap_trans_unstable(vmf->pmd))
4385 return VM_FAULT_NOPAGE;
4386
4387 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
4388 vmf->address, &vmf->ptl);
4389
4390
4391 if (likely(!vmf_pte_changed(vmf))) {
4392 do_set_pte(vmf, page, vmf->address);
4393
4394
4395 update_mmu_cache(vma, vmf->address, vmf->pte);
4396
4397 ret = 0;
4398 } else {
4399 update_mmu_tlb(vma, vmf->address, vmf->pte);
4400 ret = VM_FAULT_NOPAGE;
4401 }
4402
4403 pte_unmap_unlock(vmf->pte, vmf->ptl);
4404 return ret;
4405 }
4406
4407 static unsigned long fault_around_bytes __read_mostly =
4408 rounddown_pow_of_two(65536);
4409
4410 #ifdef CONFIG_DEBUG_FS
4411 static int fault_around_bytes_get(void *data, u64 *val)
4412 {
4413 *val = fault_around_bytes;
4414 return 0;
4415 }
4416
4417
4418
4419
4420
4421 static int fault_around_bytes_set(void *data, u64 val)
4422 {
4423 if (val / PAGE_SIZE > PTRS_PER_PTE)
4424 return -EINVAL;
4425 if (val > PAGE_SIZE)
4426 fault_around_bytes = rounddown_pow_of_two(val);
4427 else
4428 fault_around_bytes = PAGE_SIZE;
4429 return 0;
4430 }
4431 DEFINE_DEBUGFS_ATTRIBUTE(fault_around_bytes_fops,
4432 fault_around_bytes_get, fault_around_bytes_set, "%llu\n");
4433
4434 static int __init fault_around_debugfs(void)
4435 {
4436 debugfs_create_file_unsafe("fault_around_bytes", 0644, NULL, NULL,
4437 &fault_around_bytes_fops);
4438 return 0;
4439 }
4440 late_initcall(fault_around_debugfs);
4441 #endif
4442
4443 /*
4444  * do_fault_around() tries to map a few pages around the fault address.  The
4445  * hope is that those pages will be needed soon and this will lower the
4446  * number of faults to handle.
4447  *
4448  * It uses vm_ops->map_pages() to map the pages, which skips a page if it's
4449  * not ready to be mapped: not up-to-date, locked, etc.
4450  *
4451  * This function doesn't cross VMA or page table boundaries, in order to
4452  * call map_pages() and take a PTE lock only once.
4453  *
4454  * fault_around_bytes defines how many bytes we'll try to map.
4455  * do_fault_around() expects it to be set to a power of two less than or
4456  * equal to PTRS_PER_PTE * PAGE_SIZE; it can be tuned at runtime through
4457  * the fault_around_bytes debugfs file when CONFIG_DEBUG_FS is enabled.
4458  *
4459  * The virtual address of the area that we map is naturally aligned to
4460  * fault_around_bytes rounded down to the machine page size (and therefore
4461  * is page-aligned).
4462  */
4463 static vm_fault_t do_fault_around(struct vm_fault *vmf)
4464 {
4465 unsigned long address = vmf->address, nr_pages, mask;
4466 pgoff_t start_pgoff = vmf->pgoff;
4467 pgoff_t end_pgoff;
4468 int off;
4469
4470 nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT;
4471 mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;
4472
4473 address = max(address & mask, vmf->vma->vm_start);
4474 off = ((vmf->address - address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
4475 start_pgoff -= off;
4476
4477
4478
4479
4480
4481 end_pgoff = start_pgoff -
4482 ((address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
4483 PTRS_PER_PTE - 1;
4484 end_pgoff = min3(end_pgoff, vma_pages(vmf->vma) + vmf->vma->vm_pgoff - 1,
4485 start_pgoff + nr_pages - 1);
4486
4487 if (pmd_none(*vmf->pmd)) {
4488 vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm);
4489 if (!vmf->prealloc_pte)
4490 return VM_FAULT_OOM;
4491 }
4492
4493 return vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff);
4494 }
4495
4496
4497 static inline bool should_fault_around(struct vm_fault *vmf)
4498 {
4499
4500 if (!vmf->vma->vm_ops->map_pages)
4501 return false;
4502
4503 if (uffd_disable_fault_around(vmf->vma))
4504 return false;
4505
4506 return fault_around_bytes >> PAGE_SHIFT > 1;
4507 }
4508
4509 static vm_fault_t do_read_fault(struct vm_fault *vmf)
4510 {
4511 vm_fault_t ret = 0;
4512
4513
4514
4515
4516
4517
4518 if (should_fault_around(vmf)) {
4519 ret = do_fault_around(vmf);
4520 if (ret)
4521 return ret;
4522 }
4523
4524 ret = __do_fault(vmf);
4525 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
4526 return ret;
4527
4528 ret |= finish_fault(vmf);
4529 unlock_page(vmf->page);
4530 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
4531 put_page(vmf->page);
4532 return ret;
4533 }
4534
4535 static vm_fault_t do_cow_fault(struct vm_fault *vmf)
4536 {
4537 struct vm_area_struct *vma = vmf->vma;
4538 vm_fault_t ret;
4539
4540 if (unlikely(anon_vma_prepare(vma)))
4541 return VM_FAULT_OOM;
4542
4543 vmf->cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address);
4544 if (!vmf->cow_page)
4545 return VM_FAULT_OOM;
4546
4547 if (mem_cgroup_charge(page_folio(vmf->cow_page), vma->vm_mm,
4548 GFP_KERNEL)) {
4549 put_page(vmf->cow_page);
4550 return VM_FAULT_OOM;
4551 }
4552 cgroup_throttle_swaprate(vmf->cow_page, GFP_KERNEL);
4553
4554 ret = __do_fault(vmf);
4555 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
4556 goto uncharge_out;
4557 if (ret & VM_FAULT_DONE_COW)
4558 return ret;
4559
4560 copy_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma);
4561 __SetPageUptodate(vmf->cow_page);
4562
4563 ret |= finish_fault(vmf);
4564 unlock_page(vmf->page);
4565 put_page(vmf->page);
4566 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
4567 goto uncharge_out;
4568 return ret;
4569 uncharge_out:
4570 put_page(vmf->cow_page);
4571 return ret;
4572 }
4573
4574 static vm_fault_t do_shared_fault(struct vm_fault *vmf)
4575 {
4576 struct vm_area_struct *vma = vmf->vma;
4577 vm_fault_t ret, tmp;
4578
4579 ret = __do_fault(vmf);
4580 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
4581 return ret;
4582
4583
4584
4585
4586
4587 if (vma->vm_ops->page_mkwrite) {
4588 unlock_page(vmf->page);
4589 tmp = do_page_mkwrite(vmf);
4590 if (unlikely(!tmp ||
4591 (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
4592 put_page(vmf->page);
4593 return tmp;
4594 }
4595 }
4596
4597 ret |= finish_fault(vmf);
4598 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
4599 VM_FAULT_RETRY))) {
4600 unlock_page(vmf->page);
4601 put_page(vmf->page);
4602 return ret;
4603 }
4604
4605 ret |= fault_dirty_shared_page(vmf);
4606 return ret;
4607 }
4608
4609 /*
4610  * We enter with non-exclusive mmap_lock (to exclude vma changes,
4611  * but allow concurrent faults).
4612  * The mmap_lock may have been released depending on flags and our
4613  * return value.  See filemap_fault() and __folio_lock_or_retry().
4614  * If mmap_lock is released, vma may become invalid (for example
4615  * by another thread calling munmap()).
4616  */
4617 static vm_fault_t do_fault(struct vm_fault *vmf)
4618 {
4619 struct vm_area_struct *vma = vmf->vma;
4620 struct mm_struct *vm_mm = vma->vm_mm;
4621 vm_fault_t ret;
4622
4623
4624
4625
4626 if (!vma->vm_ops->fault) {
4627
4628
4629
4630
4631 if (unlikely(!pmd_present(*vmf->pmd)))
4632 ret = VM_FAULT_SIGBUS;
4633 else {
4634 vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm,
4635 vmf->pmd,
4636 vmf->address,
4637 &vmf->ptl);
4638
4639
4640
4641
4642
4643
4644
4645 if (unlikely(pte_none(*vmf->pte)))
4646 ret = VM_FAULT_SIGBUS;
4647 else
4648 ret = VM_FAULT_NOPAGE;
4649
4650 pte_unmap_unlock(vmf->pte, vmf->ptl);
4651 }
4652 } else if (!(vmf->flags & FAULT_FLAG_WRITE))
4653 ret = do_read_fault(vmf);
4654 else if (!(vma->vm_flags & VM_SHARED))
4655 ret = do_cow_fault(vmf);
4656 else
4657 ret = do_shared_fault(vmf);
4658
4659
4660 if (vmf->prealloc_pte) {
4661 pte_free(vm_mm, vmf->prealloc_pte);
4662 vmf->prealloc_pte = NULL;
4663 }
4664 return ret;
4665 }
4666
4667 int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
4668 unsigned long addr, int page_nid, int *flags)
4669 {
4670 get_page(page);
4671
4672 count_vm_numa_event(NUMA_HINT_FAULTS);
4673 if (page_nid == numa_node_id()) {
4674 count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
4675 *flags |= TNF_FAULT_LOCAL;
4676 }
4677
4678 return mpol_misplaced(page, vma, addr);
4679 }
4680
4681 static vm_fault_t do_numa_page(struct vm_fault *vmf)
4682 {
4683 struct vm_area_struct *vma = vmf->vma;
4684 struct page *page = NULL;
4685 int page_nid = NUMA_NO_NODE;
4686 int last_cpupid;
4687 int target_nid;
4688 pte_t pte, old_pte;
4689 bool was_writable = pte_savedwrite(vmf->orig_pte);
4690 int flags = 0;
4691
4692
4693
4694
4695
4696
4697 vmf->ptl = pte_lockptr(vma->vm_mm, vmf->pmd);
4698 spin_lock(vmf->ptl);
4699 if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) {
4700 pte_unmap_unlock(vmf->pte, vmf->ptl);
4701 goto out;
4702 }
4703
4704
4705 old_pte = ptep_get(vmf->pte);
4706 pte = pte_modify(old_pte, vma->vm_page_prot);
4707
4708 page = vm_normal_page(vma, vmf->address, pte);
4709 if (!page || is_zone_device_page(page))
4710 goto out_map;
4711
4712
4713 if (PageCompound(page))
4714 goto out_map;
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724 if (!was_writable)
4725 flags |= TNF_NO_GROUP;
4726
4727
4728
4729
4730
4731 if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED))
4732 flags |= TNF_SHARED;
4733
4734 last_cpupid = page_cpupid_last(page);
4735 page_nid = page_to_nid(page);
4736 target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid,
4737 &flags);
4738 if (target_nid == NUMA_NO_NODE) {
4739 put_page(page);
4740 goto out_map;
4741 }
4742 pte_unmap_unlock(vmf->pte, vmf->ptl);
4743
4744
4745 if (migrate_misplaced_page(page, vma, target_nid)) {
4746 page_nid = target_nid;
4747 flags |= TNF_MIGRATED;
4748 } else {
4749 flags |= TNF_MIGRATE_FAIL;
4750 vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
4751 spin_lock(vmf->ptl);
4752 if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) {
4753 pte_unmap_unlock(vmf->pte, vmf->ptl);
4754 goto out;
4755 }
4756 goto out_map;
4757 }
4758
4759 out:
4760 if (page_nid != NUMA_NO_NODE)
4761 task_numa_fault(last_cpupid, page_nid, 1, flags);
4762 return 0;
4763 out_map:
4764
4765
4766
4767
4768 old_pte = ptep_modify_prot_start(vma, vmf->address, vmf->pte);
4769 pte = pte_modify(old_pte, vma->vm_page_prot);
4770 pte = pte_mkyoung(pte);
4771 if (was_writable)
4772 pte = pte_mkwrite(pte);
4773 ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte);
4774 update_mmu_cache(vma, vmf->address, vmf->pte);
4775 pte_unmap_unlock(vmf->pte, vmf->ptl);
4776 goto out;
4777 }
4778
4779 static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
4780 {
4781 if (vma_is_anonymous(vmf->vma))
4782 return do_huge_pmd_anonymous_page(vmf);
4783 if (vmf->vma->vm_ops->huge_fault)
4784 return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
4785 return VM_FAULT_FALLBACK;
4786 }
4787
4788
4789 static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf)
4790 {
4791 const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
4792
4793 if (vma_is_anonymous(vmf->vma)) {
4794 if (likely(!unshare) &&
4795 userfaultfd_huge_pmd_wp(vmf->vma, vmf->orig_pmd))
4796 return handle_userfault(vmf, VM_UFFD_WP);
4797 return do_huge_pmd_wp_page(vmf);
4798 }
4799 if (vmf->vma->vm_ops->huge_fault) {
4800 vm_fault_t ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
4801
4802 if (!(ret & VM_FAULT_FALLBACK))
4803 return ret;
4804 }
4805
4806
4807 __split_huge_pmd(vmf->vma, vmf->pmd, vmf->address, false, NULL);
4808
4809 return VM_FAULT_FALLBACK;
4810 }
4811
4812 static vm_fault_t create_huge_pud(struct vm_fault *vmf)
4813 {
4814 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
4815 defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
4816
4817 if (vma_is_anonymous(vmf->vma))
4818 return VM_FAULT_FALLBACK;
4819 if (vmf->vma->vm_ops->huge_fault)
4820 return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
4821 #endif
4822 return VM_FAULT_FALLBACK;
4823 }
4824
4825 static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
4826 {
4827 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
4828 defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
4829
4830 if (vma_is_anonymous(vmf->vma))
4831 goto split;
4832 if (vmf->vma->vm_ops->huge_fault) {
4833 vm_fault_t ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
4834
4835 if (!(ret & VM_FAULT_FALLBACK))
4836 return ret;
4837 }
4838 split:
4839
4840 __split_huge_pud(vmf->vma, vmf->pud, vmf->address);
4841 #endif
4842 return VM_FAULT_FALLBACK;
4843 }
4844
4845 /*
4846  * These routines also need to handle stuff like marking pages dirty
4847  * and/or accessed for architectures that don't do it in hardware (most
4848  * RISC architectures).  The early dirtying is also good on the i386.
4849  *
4850  * There is also a hook called "update_mmu_cache()" that architectures
4851  * with external mmu caches can use to update those (ie the Sparc or
4852  * PowerPC hashed page tables that act as extended TLBs).
4853  *
4854  * We enter with non-exclusive mmap_lock (to exclude vma changes, but allow
4855  * concurrent faults).
4856  *
4857  * The mmap_lock may have been released depending on flags and our return
4858  * value.  See filemap_fault() and __folio_lock_or_retry().
4859  */
4860 static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
4861 {
4862 pte_t entry;
4863
4864 if (unlikely(pmd_none(*vmf->pmd))) {
4865
4866
4867
4868
4869
4870
4871 vmf->pte = NULL;
4872 vmf->flags &= ~FAULT_FLAG_ORIG_PTE_VALID;
4873 } else {
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886 if (pmd_devmap_trans_unstable(vmf->pmd))
4887 return 0;
4888
4889
4890
4891
4892
4893
4894 vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
4895 vmf->orig_pte = *vmf->pte;
4896 vmf->flags |= FAULT_FLAG_ORIG_PTE_VALID;
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906 barrier();
4907 if (pte_none(vmf->orig_pte)) {
4908 pte_unmap(vmf->pte);
4909 vmf->pte = NULL;
4910 }
4911 }
4912
4913 if (!vmf->pte) {
4914 if (vma_is_anonymous(vmf->vma))
4915 return do_anonymous_page(vmf);
4916 else
4917 return do_fault(vmf);
4918 }
4919
4920 if (!pte_present(vmf->orig_pte))
4921 return do_swap_page(vmf);
4922
4923 if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma))
4924 return do_numa_page(vmf);
4925
4926 vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
4927 spin_lock(vmf->ptl);
4928 entry = vmf->orig_pte;
4929 if (unlikely(!pte_same(*vmf->pte, entry))) {
4930 update_mmu_tlb(vmf->vma, vmf->address, vmf->pte);
4931 goto unlock;
4932 }
4933 if (vmf->flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) {
4934 if (!pte_write(entry))
4935 return do_wp_page(vmf);
4936 else if (likely(vmf->flags & FAULT_FLAG_WRITE))
4937 entry = pte_mkdirty(entry);
4938 }
4939 entry = pte_mkyoung(entry);
4940 if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry,
4941 vmf->flags & FAULT_FLAG_WRITE)) {
4942 update_mmu_cache(vmf->vma, vmf->address, vmf->pte);
4943 } else {
4944
4945 if (vmf->flags & FAULT_FLAG_TRIED)
4946 goto unlock;
4947
4948
4949
4950
4951
4952
4953 if (vmf->flags & FAULT_FLAG_WRITE)
4954 flush_tlb_fix_spurious_fault(vmf->vma, vmf->address);
4955 }
4956 unlock:
4957 pte_unmap_unlock(vmf->pte, vmf->ptl);
4958 return 0;
4959 }
4960
4961 /*
4962  * By the time we get here, we already hold the mm semaphore.
4963  *
4964  * The mmap_lock may have been released depending on flags and our
4965  * return value.  See filemap_fault() and __folio_lock_or_retry().
4966  */
4967 static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
4968 unsigned long address, unsigned int flags)
4969 {
4970 struct vm_fault vmf = {
4971 .vma = vma,
4972 .address = address & PAGE_MASK,
4973 .real_address = address,
4974 .flags = flags,
4975 .pgoff = linear_page_index(vma, address),
4976 .gfp_mask = __get_fault_gfp_mask(vma),
4977 };
4978 struct mm_struct *mm = vma->vm_mm;
4979 unsigned long vm_flags = vma->vm_flags;
4980 pgd_t *pgd;
4981 p4d_t *p4d;
4982 vm_fault_t ret;
4983
4984 pgd = pgd_offset(mm, address);
4985 p4d = p4d_alloc(mm, pgd, address);
4986 if (!p4d)
4987 return VM_FAULT_OOM;
4988
4989 vmf.pud = pud_alloc(mm, p4d, address);
4990 if (!vmf.pud)
4991 return VM_FAULT_OOM;
4992 retry_pud:
4993 if (pud_none(*vmf.pud) &&
4994 hugepage_vma_check(vma, vm_flags, false, true)) {
4995 ret = create_huge_pud(&vmf);
4996 if (!(ret & VM_FAULT_FALLBACK))
4997 return ret;
4998 } else {
4999 pud_t orig_pud = *vmf.pud;
5000
5001 barrier();
5002 if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) {
5003
5004
5005
5006
5007
5008 if ((flags & FAULT_FLAG_WRITE) && !pud_write(orig_pud)) {
5009 ret = wp_huge_pud(&vmf, orig_pud);
5010 if (!(ret & VM_FAULT_FALLBACK))
5011 return ret;
5012 } else {
5013 huge_pud_set_accessed(&vmf, orig_pud);
5014 return 0;
5015 }
5016 }
5017 }
5018
5019 vmf.pmd = pmd_alloc(mm, vmf.pud, address);
5020 if (!vmf.pmd)
5021 return VM_FAULT_OOM;
5022
5023
5024 if (pud_trans_unstable(vmf.pud))
5025 goto retry_pud;
5026
5027 if (pmd_none(*vmf.pmd) &&
5028 hugepage_vma_check(vma, vm_flags, false, true)) {
5029 ret = create_huge_pmd(&vmf);
5030 if (!(ret & VM_FAULT_FALLBACK))
5031 return ret;
5032 } else {
5033 vmf.orig_pmd = *vmf.pmd;
5034
5035 barrier();
5036 if (unlikely(is_swap_pmd(vmf.orig_pmd))) {
5037 VM_BUG_ON(thp_migration_supported() &&
5038 !is_pmd_migration_entry(vmf.orig_pmd));
5039 if (is_pmd_migration_entry(vmf.orig_pmd))
5040 pmd_migration_entry_wait(mm, vmf.pmd);
5041 return 0;
5042 }
5043 if (pmd_trans_huge(vmf.orig_pmd) || pmd_devmap(vmf.orig_pmd)) {
5044 if (pmd_protnone(vmf.orig_pmd) && vma_is_accessible(vma))
5045 return do_huge_pmd_numa_page(&vmf);
5046
5047 if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) &&
5048 !pmd_write(vmf.orig_pmd)) {
5049 ret = wp_huge_pmd(&vmf);
5050 if (!(ret & VM_FAULT_FALLBACK))
5051 return ret;
5052 } else {
5053 huge_pmd_set_accessed(&vmf);
5054 return 0;
5055 }
5056 }
5057 }
5058
5059 return handle_pte_fault(&vmf);
5060 }
5061
5062 /**
5063  * mm_account_fault - Do page fault accounting
5064  *
5065  * @regs: the pt_regs struct pointer.  When set to NULL, will skip accounting
5066  *        of perf event counters, but we'll still do the per-task accounting
5067  *        to the task who triggered this page fault.
5068  * @address: the faulted address.
5069  * @flags: the fault flags.
5070  * @ret: the fault retcode.
5071  *
5072  * This will take care of most of the page fault accounting.  Meanwhile, it
5073  * will also include the PERF_COUNT_SW_PAGE_FAULTS_[MAJ|MIN] perf counter
5074  * updates.  However, note that the handling of PERF_COUNT_SW_PAGE_FAULTS
5075  * should still be in per-arch page fault handlers at the entry of page fault.
5076  */
5077 static inline void mm_account_fault(struct pt_regs *regs,
5078 unsigned long address, unsigned int flags,
5079 vm_fault_t ret)
5080 {
5081 bool major;
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094 if (ret & (VM_FAULT_ERROR | VM_FAULT_RETRY))
5095 return;
5096
5097
5098
5099
5100
5101
5102 major = (ret & VM_FAULT_MAJOR) || (flags & FAULT_FLAG_TRIED);
5103
5104 if (major)
5105 current->maj_flt++;
5106 else
5107 current->min_flt++;
5108
5109
5110
5111
5112
5113
5114 if (!regs)
5115 return;
5116
5117 if (major)
5118 perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address);
5119 else
5120 perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address);
5121 }
5122
5123 /*
5124  * By the time we get here, we already hold the mm semaphore.
5125  *
5126  * The mmap_lock may have been released depending on flags and our
5127  * return value.  See filemap_fault() and __folio_lock_or_retry().
5128  */
5129 vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
5130 unsigned int flags, struct pt_regs *regs)
5131 {
5132 vm_fault_t ret;
5133
5134 __set_current_state(TASK_RUNNING);
5135
5136 count_vm_event(PGFAULT);
5137 count_memcg_event_mm(vma->vm_mm, PGFAULT);
5138
5139
5140 check_sync_rss_stat(current);
5141
5142 if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
5143 flags & FAULT_FLAG_INSTRUCTION,
5144 flags & FAULT_FLAG_REMOTE))
5145 return VM_FAULT_SIGSEGV;
5146
5147
5148
5149
5150
5151 if (flags & FAULT_FLAG_USER)
5152 mem_cgroup_enter_user_fault();
5153
5154 if (unlikely(is_vm_hugetlb_page(vma)))
5155 ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
5156 else
5157 ret = __handle_mm_fault(vma, address, flags);
5158
5159 if (flags & FAULT_FLAG_USER) {
5160 mem_cgroup_exit_user_fault();
5161
5162
5163
5164
5165
5166
5167 if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
5168 mem_cgroup_oom_synchronize(false);
5169 }
5170
5171 mm_account_fault(regs, address, flags, ret);
5172
5173 return ret;
5174 }
5175 EXPORT_SYMBOL_GPL(handle_mm_fault);
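
/*
 * Usage sketch (illustrative, heavily abridged): an architecture's page fault
 * handler, after looking up the vma under mmap_lock and checking access
 * rights, hands the fault to handle_mm_fault() and honours VM_FAULT_RETRY
 * (in which case the mmap_lock has already been dropped for us).  Signal and
 * error delivery are omitted; "retry", "mm", "vma", "address", "flags" and
 * "regs" are the handler's own locals.
 *
 *	fault = handle_mm_fault(vma, address, flags, regs);
 *	if (fault & VM_FAULT_RETRY) {
 *		flags |= FAULT_FLAG_TRIED;
 *		goto retry;
 *	}
 *	mmap_read_unlock(mm);
 */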
5176
5177 #ifndef __PAGETABLE_P4D_FOLDED
5178
5179
5180
5181
5182 int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
5183 {
5184 p4d_t *new = p4d_alloc_one(mm, address);
5185 if (!new)
5186 return -ENOMEM;
5187
5188 spin_lock(&mm->page_table_lock);
5189 if (pgd_present(*pgd)) {
5190 p4d_free(mm, new);
5191 } else {
5192 smp_wmb();
5193 pgd_populate(mm, pgd, new);
5194 }
5195 spin_unlock(&mm->page_table_lock);
5196 return 0;
5197 }
5198 #endif
5199
5200 #ifndef __PAGETABLE_PUD_FOLDED
5201
5202
5203
5204
5205 int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address)
5206 {
5207 pud_t *new = pud_alloc_one(mm, address);
5208 if (!new)
5209 return -ENOMEM;
5210
5211 spin_lock(&mm->page_table_lock);
5212 if (!p4d_present(*p4d)) {
5213 mm_inc_nr_puds(mm);
5214 smp_wmb();
5215 p4d_populate(mm, p4d, new);
5216 } else
5217 pud_free(mm, new);
5218 spin_unlock(&mm->page_table_lock);
5219 return 0;
5220 }
5221 #endif
5222
5223 #ifndef __PAGETABLE_PMD_FOLDED
5224
5225
5226
5227
5228 int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
5229 {
5230 spinlock_t *ptl;
5231 pmd_t *new = pmd_alloc_one(mm, address);
5232 if (!new)
5233 return -ENOMEM;
5234
5235 ptl = pud_lock(mm, pud);
5236 if (!pud_present(*pud)) {
5237 mm_inc_nr_pmds(mm);
5238 smp_wmb();
5239 pud_populate(mm, pud, new);
5240 } else {
5241 pmd_free(mm, new);
5242 }
5243 spin_unlock(ptl);
5244 return 0;
5245 }
5246 #endif
5247
5248 /**
5249  * follow_pte - look up PTE at a user virtual address
5250  * @mm: the mm_struct of the target address space
5251  * @address: user virtual address
5252  * @ptepp: location to store found PTE
5253  * @ptlp: location to store the lock for the PTE
5254  *
5255  * On a successful return, the pointer to the PTE is stored in @ptepp;
5256  * the corresponding lock is taken and its location is stored in @ptlp.
5257  * The contents of the PTE are only stable until @ptlp is released;
5258  * any further use, if any, must be protected against invalidation
5259  * with MMU notifiers.
5260  *
5261  * Only IO mappings and raw PFN mappings are allowed.  The mmap semaphore
5262  * should be taken for read.
5263  *
5264  * KVM uses this function.  While it is arguably less bad than follow_pfn(),
5265  * it is not a good general-purpose API.
5266  *
5267  * Return: zero on success, -ve otherwise.
5268  */
5269 int follow_pte(struct mm_struct *mm, unsigned long address,
5270 pte_t **ptepp, spinlock_t **ptlp)
5271 {
5272 pgd_t *pgd;
5273 p4d_t *p4d;
5274 pud_t *pud;
5275 pmd_t *pmd;
5276 pte_t *ptep;
5277
5278 pgd = pgd_offset(mm, address);
5279 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
5280 goto out;
5281
5282 p4d = p4d_offset(pgd, address);
5283 if (p4d_none(*p4d) || unlikely(p4d_bad(*p4d)))
5284 goto out;
5285
5286 pud = pud_offset(p4d, address);
5287 if (pud_none(*pud) || unlikely(pud_bad(*pud)))
5288 goto out;
5289
5290 pmd = pmd_offset(pud, address);
5291 VM_BUG_ON(pmd_trans_huge(*pmd));
5292
5293 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
5294 goto out;
5295
5296 ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
5297 if (!pte_present(*ptep))
5298 goto unlock;
5299 *ptepp = ptep;
5300 return 0;
5301 unlock:
5302 pte_unmap_unlock(ptep, *ptlp);
5303 out:
5304 return -EINVAL;
5305 }
5306 EXPORT_SYMBOL_GPL(follow_pte);
5307
5308 /**
5309  * follow_pfn - look up PFN at a user virtual address
5310  * @vma: memory mapping
5311  * @address: user virtual address
5312  * @pfn: location to store found PFN
5313  *
5314  * Only IO mappings and raw PFN mappings are allowed.
5315  *
5316  * This function does not allow the caller to read the permissions
5317  * of the PTE, so it is easy to misuse.
5318  *
5319  * Return: zero and the pfn at @pfn on success, -ve otherwise.
5320  */
5321 int follow_pfn(struct vm_area_struct *vma, unsigned long address,
5322 unsigned long *pfn)
5323 {
5324 int ret = -EINVAL;
5325 spinlock_t *ptl;
5326 pte_t *ptep;
5327
5328 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
5329 return ret;
5330
5331 ret = follow_pte(vma->vm_mm, address, &ptep, &ptl);
5332 if (ret)
5333 return ret;
5334 *pfn = pte_pfn(*ptep);
5335 pte_unmap_unlock(ptep, ptl);
5336 return 0;
5337 }
5338 EXPORT_SYMBOL(follow_pfn);
5339
5340 #ifdef CONFIG_HAVE_IOREMAP_PROT
5341 int follow_phys(struct vm_area_struct *vma,
5342 unsigned long address, unsigned int flags,
5343 unsigned long *prot, resource_size_t *phys)
5344 {
5345 int ret = -EINVAL;
5346 pte_t *ptep, pte;
5347 spinlock_t *ptl;
5348
5349 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
5350 goto out;
5351
5352 if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
5353 goto out;
5354 pte = *ptep;
5355
5356 if ((flags & FOLL_WRITE) && !pte_write(pte))
5357 goto unlock;
5358
5359 *prot = pgprot_val(pte_pgprot(pte));
5360 *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;
5361
5362 ret = 0;
5363 unlock:
5364 pte_unmap_unlock(ptep, ptl);
5365 out:
5366 return ret;
5367 }
5368
5369 /**
5370  * generic_access_phys - generic implementation for iomem mmap access
5371  * @vma: the vma to access
5372  * @addr: userspace address, not relative offset within @vma
5373  * @buf: buffer to read or write
5374  * @len: length of transfer
5375  * @write: set to FOLL_WRITE when writing, otherwise reading
5376  *
5377  * This is a generic implementation for &vm_operations_struct.access for an
5378  * iomem mapping.  This callback is used by access_process_vm() when the
5379  * @vma is not page based.
5380  */
5381 int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
5382 void *buf, int len, int write)
5383 {
5384 resource_size_t phys_addr;
5385 unsigned long prot = 0;
5386 void __iomem *maddr;
5387 pte_t *ptep, pte;
5388 spinlock_t *ptl;
5389 int offset = offset_in_page(addr);
5390 int ret = -EINVAL;
5391
5392 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
5393 return -EINVAL;
5394
5395 retry:
5396 if (follow_pte(vma->vm_mm, addr, &ptep, &ptl))
5397 return -EINVAL;
5398 pte = *ptep;
5399 pte_unmap_unlock(ptep, ptl);
5400
5401 prot = pgprot_val(pte_pgprot(pte));
5402 phys_addr = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;
5403
5404 if ((write & FOLL_WRITE) && !pte_write(pte))
5405 return -EINVAL;
5406
5407 maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot);
5408 if (!maddr)
5409 return -ENOMEM;
5410
5411 if (follow_pte(vma->vm_mm, addr, &ptep, &ptl))
5412 goto out_unmap;
5413
5414 if (!pte_same(pte, *ptep)) {
5415 pte_unmap_unlock(ptep, ptl);
5416 iounmap(maddr);
5417
5418 goto retry;
5419 }
5420
5421 if (write)
5422 memcpy_toio(maddr + offset, buf, len);
5423 else
5424 memcpy_fromio(buf, maddr + offset, len);
5425 ret = len;
5426 pte_unmap_unlock(ptep, ptl);
5427 out_unmap:
5428 iounmap(maddr);
5429
5430 return ret;
5431 }
5432 EXPORT_SYMBOL_GPL(generic_access_phys);
5433 #endif
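/*
 * Sketch of a typical user: a driver that maps iomem with remap_pfn_range()
 * can point its vm_operations_struct ->access hook at generic_access_phys()
 * so that access_process_vm() (and therefore ptrace/gdb) can reach the
 * mapping.  example_phys_vm_ops and example_mmap() are hypothetical names.
 */
#ifdef CONFIG_HAVE_IOREMAP_PROT
static const struct vm_operations_struct example_phys_vm_ops = {
	.access = generic_access_phys,
};

static int example_mmap(struct file *file, struct vm_area_struct *vma)
{
	vma->vm_ops = &example_phys_vm_ops;
	/* remap_pfn_range() marks the VMA VM_IO | VM_PFNMAP */
	return remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
			       vma->vm_end - vma->vm_start,
			       vma->vm_page_prot);
}
#endif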
5434
5435 /*
5436  * Access another process' address space as given in mm.
5437  */
5438 int __access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf,
5439 int len, unsigned int gup_flags)
5440 {
5441 struct vm_area_struct *vma;
5442 void *old_buf = buf;
5443 int write = gup_flags & FOLL_WRITE;
5444
5445 if (mmap_read_lock_killable(mm))
5446 return 0;
5447
5448 /* ignore errors, just check how much was successfully transferred */
5449 while (len) {
5450 int bytes, ret, offset;
5451 void *maddr;
5452 struct page *page = NULL;
5453
5454 ret = get_user_pages_remote(mm, addr, 1,
5455 gup_flags, &page, &vma, NULL);
5456 if (ret <= 0) {
5457 #ifndef CONFIG_HAVE_IOREMAP_PROT
5458 break;
5459 #else
5460 /*
5461  * Check if this is a VM_IO | VM_PFNMAP VMA, which
5462  * only allows access via the vma's ->access hook.
5463  */
5464 vma = vma_lookup(mm, addr);
5465 if (!vma)
5466 break;
5467 if (vma->vm_ops && vma->vm_ops->access)
5468 ret = vma->vm_ops->access(vma, addr, buf,
5469 len, write);
5470 if (ret <= 0)
5471 break;
5472 bytes = ret;
5473 #endif
5474 } else {
5475 bytes = len;
5476 offset = addr & (PAGE_SIZE-1);
5477 if (bytes > PAGE_SIZE-offset)
5478 bytes = PAGE_SIZE-offset;
5479
5480 maddr = kmap(page);
5481 if (write) {
5482 copy_to_user_page(vma, page, addr,
5483 maddr + offset, buf, bytes);
5484 set_page_dirty_lock(page);
5485 } else {
5486 copy_from_user_page(vma, page, addr,
5487 buf, maddr + offset, bytes);
5488 }
5489 kunmap(page);
5490 put_page(page);
5491 }
5492 len -= bytes;
5493 buf += bytes;
5494 addr += bytes;
5495 }
5496 mmap_read_unlock(mm);
5497
5498 return buf - old_buf;
5499 }
5500 
5501 /**
5502  * access_remote_vm - access another process' address space
5503  * @mm:		the mm_struct of the target address space
5504  * @addr:	start address to access
5505  * @buf:	source or destination buffer
5506  * @len:	number of bytes to transfer
5507  * @gup_flags:	flags modifying lookup behaviour
5508  *
5509  * The caller must hold a reference on @mm.
5510  *
5511  * Return: number of bytes copied from source to destination.
5512  */
5513 int access_remote_vm(struct mm_struct *mm, unsigned long addr,
5514 void *buf, int len, unsigned int gup_flags)
5515 {
5516 return __access_remote_vm(mm, addr, buf, len, gup_flags);
5517 }
5518
5519 /*
5520  * Access another process' address space.
5521  * The source/target buffer must be kernel space; do not walk the
5522  * page tables directly, use get_user_pages().
5523  */
5524 int access_process_vm(struct task_struct *tsk, unsigned long addr,
5525 void *buf, int len, unsigned int gup_flags)
5526 {
5527 struct mm_struct *mm;
5528 int ret;
5529
5530 mm = get_task_mm(tsk);
5531 if (!mm)
5532 return 0;
5533
5534 ret = __access_remote_vm(mm, addr, buf, len, gup_flags);
5535
5536 mmput(mm);
5537
5538 return ret;
5539 }
5540 EXPORT_SYMBOL_GPL(access_process_vm);
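/*
 * Sketch of a typical caller: reading a small buffer out of another task's
 * address space, much like ptrace() or /proc/<pid>/cmdline do.
 * example_peek_task() is a hypothetical helper for illustration.
 */
static int example_peek_task(struct task_struct *tsk, unsigned long addr,
			     void *buf, int len)
{
	/* returns the number of bytes actually copied, 0 on failure */
	return access_process_vm(tsk, addr, buf, len, FOLL_FORCE);
}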
5541
5542 /*
5543  * Print the name of a VMA.
5544  */
5545 void print_vma_addr(char *prefix, unsigned long ip)
5546 {
5547 struct mm_struct *mm = current->mm;
5548 struct vm_area_struct *vma;
5549
5550 /*
5551  * we might be running from an atomic context so we cannot sleep
5552  */
5553 if (!mmap_read_trylock(mm))
5554 return;
5555
5556 vma = find_vma(mm, ip);
5557 if (vma && vma->vm_file) {
5558 struct file *f = vma->vm_file;
5559 char *buf = (char *)__get_free_page(GFP_NOWAIT);
5560 if (buf) {
5561 char *p;
5562
5563 p = file_path(f, buf, PAGE_SIZE);
5564 if (IS_ERR(p))
5565 p = "?";
5566 printk("%s%s[%lx+%lx]", prefix, kbasename(p),
5567 vma->vm_start,
5568 vma->vm_end - vma->vm_start);
5569 free_page((unsigned long)buf);
5570 }
5571 }
5572 mmap_read_unlock(mm);
5573 }
5574
5575 #if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP)
5576 void __might_fault(const char *file, int line)
5577 {
5578 if (pagefault_disabled())
5579 return;
5580 __might_sleep(file, line);
5581 #if defined(CONFIG_DEBUG_ATOMIC_SLEEP)
5582 if (current->mm)
5583 might_lock_read(&current->mm->mmap_lock);
5584 #endif
5585 }
5586 EXPORT_SYMBOL(__might_fault);
5587 #endif
5588
5589 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
5590 /*
5591  * Process all subpages of the specified huge page with the specified
5592  * operation.  The target subpage will be processed last to keep its
5593  * cache lines hot.
5594  */
5595 static inline void process_huge_page(
5596 unsigned long addr_hint, unsigned int pages_per_huge_page,
5597 void (*process_subpage)(unsigned long addr, int idx, void *arg),
5598 void *arg)
5599 {
5600 int i, n, base, l;
5601 unsigned long addr = addr_hint &
5602 ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
5603
5604 /* Process target subpage last to keep its cache lines hot */
5605 might_sleep();
5606 n = (addr_hint - addr) / PAGE_SIZE;
5607 if (2 * n <= pages_per_huge_page) {
5608 /* If target subpage in first half of huge page */
5609 base = 0;
5610 l = n;
5611 /* Process subpages at the end of huge page */
5612 for (i = pages_per_huge_page - 1; i >= 2 * n; i--) {
5613 cond_resched();
5614 process_subpage(addr + i * PAGE_SIZE, i, arg);
5615 }
5616 } else {
5617 /* If target subpage in second half of huge page */
5618 base = pages_per_huge_page - 2 * (pages_per_huge_page - n);
5619 l = pages_per_huge_page - n;
5620 /* Process subpages at the beginning of huge page */
5621 for (i = 0; i < base; i++) {
5622 cond_resched();
5623 process_subpage(addr + i * PAGE_SIZE, i, arg);
5624 }
5625 }
5626
5627 /*
5628  * Process the remaining subpages left-right, converging on the target subpage.
5629  */
5630 for (i = 0; i < l; i++) {
5631 int left_idx = base + i;
5632 int right_idx = base + 2 * l - 1 - i;
5633
5634 cond_resched();
5635 process_subpage(addr + left_idx * PAGE_SIZE, left_idx, arg);
5636 cond_resched();
5637 process_subpage(addr + right_idx * PAGE_SIZE, right_idx, arg);
5638 }
5639 }
5640
5641 static void clear_gigantic_page(struct page *page,
5642 unsigned long addr,
5643 unsigned int pages_per_huge_page)
5644 {
5645 int i;
5646 struct page *p = page;
5647
5648 might_sleep();
5649 for (i = 0; i < pages_per_huge_page;
5650 i++, p = mem_map_next(p, page, i)) {
5651 cond_resched();
5652 clear_user_highpage(p, addr + i * PAGE_SIZE);
5653 }
5654 }
5655
5656 static void clear_subpage(unsigned long addr, int idx, void *arg)
5657 {
5658 struct page *page = arg;
5659
5660 clear_user_highpage(page + idx, addr);
5661 }
5662
5663 void clear_huge_page(struct page *page,
5664 unsigned long addr_hint, unsigned int pages_per_huge_page)
5665 {
5666 unsigned long addr = addr_hint &
5667 ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
5668
5669 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
5670 clear_gigantic_page(page, addr, pages_per_huge_page);
5671 return;
5672 }
5673
5674 process_huge_page(addr_hint, pages_per_huge_page, clear_subpage, page);
5675 }
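/*
 * Sketch of a typical caller: huge page fault paths pass the faulting
 * address as addr_hint so the subpage actually being touched is cleared
 * last and stays cache-hot (the THP fault path does this with
 * HPAGE_PMD_NR subpages).  example_clear_new_huge_page() is hypothetical.
 */
static void example_clear_new_huge_page(struct page *page,
					unsigned long fault_address,
					unsigned int nr_subpages)
{
	/*
	 * With fault_address inside subpage n of the huge page,
	 * process_huge_page() clears the other subpages first and
	 * subpage n last, so its cache lines are still hot when the
	 * faulting instruction is restarted.
	 */
	clear_huge_page(page, fault_address, nr_subpages);
}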
5676
5677 static void copy_user_gigantic_page(struct page *dst, struct page *src,
5678 unsigned long addr,
5679 struct vm_area_struct *vma,
5680 unsigned int pages_per_huge_page)
5681 {
5682 int i;
5683 struct page *dst_base = dst;
5684 struct page *src_base = src;
5685
5686 for (i = 0; i < pages_per_huge_page; ) {
5687 cond_resched();
5688 copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
5689
5690 i++;
5691 dst = mem_map_next(dst, dst_base, i);
5692 src = mem_map_next(src, src_base, i);
5693 }
5694 }
5695
5696 struct copy_subpage_arg {
5697 struct page *dst;
5698 struct page *src;
5699 struct vm_area_struct *vma;
5700 };
5701
5702 static void copy_subpage(unsigned long addr, int idx, void *arg)
5703 {
5704 struct copy_subpage_arg *copy_arg = arg;
5705
5706 copy_user_highpage(copy_arg->dst + idx, copy_arg->src + idx,
5707 addr, copy_arg->vma);
5708 }
5709
5710 void copy_user_huge_page(struct page *dst, struct page *src,
5711 unsigned long addr_hint, struct vm_area_struct *vma,
5712 unsigned int pages_per_huge_page)
5713 {
5714 unsigned long addr = addr_hint &
5715 ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
5716 struct copy_subpage_arg arg = {
5717 .dst = dst,
5718 .src = src,
5719 .vma = vma,
5720 };
5721
5722 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
5723 copy_user_gigantic_page(dst, src, addr, vma,
5724 pages_per_huge_page);
5725 return;
5726 }
5727
5728 process_huge_page(addr_hint, pages_per_huge_page, copy_subpage, &arg);
5729 }
5730
5731 long copy_huge_page_from_user(struct page *dst_page,
5732 const void __user *usr_src,
5733 unsigned int pages_per_huge_page,
5734 bool allow_pagefault)
5735 {
5736 void *page_kaddr;
5737 unsigned long i, rc = 0;
5738 unsigned long ret_val = pages_per_huge_page * PAGE_SIZE;
5739 struct page *subpage = dst_page;
5740
5741 for (i = 0; i < pages_per_huge_page;
5742 i++, subpage = mem_map_next(subpage, dst_page, i)) {
5743 if (allow_pagefault)
5744 page_kaddr = kmap(subpage);
5745 else
5746 page_kaddr = kmap_atomic(subpage);
5747 rc = copy_from_user(page_kaddr,
5748 usr_src + i * PAGE_SIZE, PAGE_SIZE);
5749 if (allow_pagefault)
5750 kunmap(subpage);
5751 else
5752 kunmap_atomic(page_kaddr);
5753
5754 ret_val -= (PAGE_SIZE - rc);
5755 if (rc)
5756 break;
5757
5758 flush_dcache_page(subpage);
5759
5760 cond_resched();
5761 }
5762 return ret_val;
5763 }
5764 #endif
5765
5766 #if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS
5767
5768 static struct kmem_cache *page_ptl_cachep;
5769
5770 void __init ptlock_cache_init(void)
5771 {
5772 page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0,
5773 SLAB_PANIC, NULL);
5774 }
5775
5776 bool ptlock_alloc(struct page *page)
5777 {
5778 spinlock_t *ptl;
5779
5780 ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL);
5781 if (!ptl)
5782 return false;
5783 page->ptl = ptl;
5784 return true;
5785 }
5786
5787 void ptlock_free(struct page *page)
5788 {
5789 kmem_cache_free(page_ptl_cachep, page->ptl);
5790 }
5791 #endif
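/*
 * Sketch of how the split-PTL helpers above are normally reached: an
 * architecture's PTE page allocator calls pgtable_pte_page_ctor(), which
 * ends up in ptlock_alloc() when the spinlock does not fit in struct page.
 * example_pte_alloc_one() is a hypothetical, simplified allocator.
 */
static struct page *example_pte_alloc_one(struct mm_struct *mm)
{
	struct page *page = alloc_page(GFP_KERNEL | __GFP_ZERO);

	if (!page)
		return NULL;
	if (!pgtable_pte_page_ctor(page)) {	/* may call ptlock_alloc() */
		__free_page(page);
		return NULL;
	}
	return page;
}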