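// SPDX-License-Identifier: GPL-2.0
/*
 *	linux/mm/madvise.c
 *
 * madvise(2), process_madvise(2) and the MADV_* behaviour handlers.
 */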
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/mempolicy.h>
#include <linux/page-isolation.h>
#include <linux/page_idle.h>
#include <linux/userfaultfd_k.h>
#include <linux/hugetlb.h>
#include <linux/falloc.h>
#include <linux/fadvise.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/mm_inline.h>
#include <linux/string.h>
#include <linux/uio.h>
#include <linux/ksm.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/pagewalk.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/shmem_fs.h>
#include <linux/mmu_notifier.h>

#include <asm/tlb.h>

#include "internal.h"
#include "swap.h"

struct madvise_walk_private {
	struct mmu_gather *tlb;
	bool pageout;
};

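/*
 * Any behaviour which results in changes to the vma->vm_flags needs to
 * take mmap_lock for writing. Others, which simply traverse vmas, need
 * to only take it for reading.
 */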
static int madvise_need_mmap_write(int behavior)
{
	switch (behavior) {
	case MADV_REMOVE:
	case MADV_WILLNEED:
	case MADV_DONTNEED:
	case MADV_DONTNEED_LOCKED:
	case MADV_COLD:
	case MADV_PAGEOUT:
	case MADV_FREE:
	case MADV_POPULATE_READ:
	case MADV_POPULATE_WRITE:
		return 0;
	default:
		/* be safe, default to 1. list exceptions explicitly */
		return 1;
	}
}

#ifdef CONFIG_ANON_VMA_NAME
struct anon_vma_name *anon_vma_name_alloc(const char *name)
{
	struct anon_vma_name *anon_name;
	size_t count;

	/* Add 1 for NUL terminator at the end of the anon_name->name */
	count = strlen(name) + 1;
	anon_name = kmalloc(struct_size(anon_name, name, count), GFP_KERNEL);
	if (anon_name) {
		kref_init(&anon_name->kref);
		memcpy(anon_name->name, name, count);
	}

	return anon_name;
}

void anon_vma_name_free(struct kref *kref)
{
	struct anon_vma_name *anon_name =
			container_of(kref, struct anon_vma_name, kref);
	kfree(anon_name);
}

struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma)
{
	mmap_assert_locked(vma->vm_mm);

	if (vma->vm_file)
		return NULL;

	return vma->anon_name;
}

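/* mmap_lock should be write-locked */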
static int replace_anon_vma_name(struct vm_area_struct *vma,
				 struct anon_vma_name *anon_name)
{
	struct anon_vma_name *orig_name = anon_vma_name(vma);

	if (!anon_name) {
		vma->anon_name = NULL;
		anon_vma_name_put(orig_name);
		return 0;
	}

	if (anon_vma_name_eq(orig_name, anon_name))
		return 0;

	vma->anon_name = anon_vma_name_reuse(anon_name);
	anon_vma_name_put(orig_name);

	return 0;
}
#else
static int replace_anon_vma_name(struct vm_area_struct *vma,
				 struct anon_vma_name *anon_name)
{
	if (anon_name)
		return -EINVAL;

	return 0;
}
#endif

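/*
 * Update the vm_flags and/or anon name on a region of a vma, splitting
 * it or merging it with adjacent vmas as necessary.  Must be called
 * with mmap_lock held for writing; *prev is set to the resulting vma.
 */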
static int madvise_update_vma(struct vm_area_struct *vma,
			      struct vm_area_struct **prev, unsigned long start,
			      unsigned long end, unsigned long new_flags,
			      struct anon_vma_name *anon_name)
{
	struct mm_struct *mm = vma->vm_mm;
	int error;
	pgoff_t pgoff;

	if (new_flags == vma->vm_flags && anon_vma_name_eq(anon_vma_name(vma), anon_name)) {
		*prev = vma;
		return 0;
	}

	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
	*prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
			  vma->vm_file, pgoff, vma_policy(vma),
			  vma->vm_userfaultfd_ctx, anon_name);
	if (*prev) {
		vma = *prev;
		goto success;
	}

	*prev = vma;

	if (start != vma->vm_start) {
		if (unlikely(mm->map_count >= sysctl_max_map_count))
			return -ENOMEM;
		error = __split_vma(mm, vma, start, 1);
		if (error)
			return error;
	}

	if (end != vma->vm_end) {
		if (unlikely(mm->map_count >= sysctl_max_map_count))
			return -ENOMEM;
		error = __split_vma(mm, vma, end, 0);
		if (error)
			return error;
	}

success:
	/*
	 * vm_flags is protected by the mmap_lock held in write mode.
	 */
	vma->vm_flags = new_flags;
	if (!vma->vm_file) {
		error = replace_anon_vma_name(vma, anon_name);
		if (error)
			return error;
	}

	return 0;
}

#ifdef CONFIG_SWAP
static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
				 unsigned long end, struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->private;
	unsigned long index;
	struct swap_iocb *splug = NULL;

	if (pmd_none_or_trans_huge_or_clear_bad(pmd))
		return 0;

	for (index = start; index != end; index += PAGE_SIZE) {
		pte_t pte;
		swp_entry_t entry;
		struct page *page;
		spinlock_t *ptl;
		pte_t *ptep;

		ptep = pte_offset_map_lock(vma->vm_mm, pmd, index, &ptl);
		pte = *ptep;
		pte_unmap_unlock(ptep, ptl);

		if (!is_swap_pte(pte))
			continue;
		entry = pte_to_swp_entry(pte);
		if (unlikely(non_swap_entry(entry)))
			continue;

		page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
					     vma, index, false, &splug);
		if (page)
			put_page(page);
	}
	swap_read_unplug(splug);

	return 0;
}

static const struct mm_walk_ops swapin_walk_ops = {
	.pmd_entry = swapin_walk_pmd_entry,
};

static void force_shm_swapin_readahead(struct vm_area_struct *vma,
		unsigned long start, unsigned long end,
		struct address_space *mapping)
{
	XA_STATE(xas, &mapping->i_pages, linear_page_index(vma, start));
	pgoff_t end_index = linear_page_index(vma, end + PAGE_SIZE - 1);
	struct page *page;
	struct swap_iocb *splug = NULL;

	rcu_read_lock();
	xas_for_each(&xas, page, end_index) {
		swp_entry_t swap;

		if (!xa_is_value(page))
			continue;
		swap = radix_to_swp_entry(page);
		/* There might be swapin error entries in shmem mapping. */
		if (non_swap_entry(swap))
			continue;
		xas_pause(&xas);
		rcu_read_unlock();

		page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
					     NULL, 0, false, &splug);
		if (page)
			put_page(page);

		rcu_read_lock();
	}
	rcu_read_unlock();
	swap_read_unplug(splug);

	lru_add_drain();	/* Push any new pages onto the LRU now */
}
#endif

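/*
 * Schedule all required I/O operations.  Do not wait for completion.
 */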
static long madvise_willneed(struct vm_area_struct *vma,
			     struct vm_area_struct **prev,
			     unsigned long start, unsigned long end)
{
	struct mm_struct *mm = vma->vm_mm;
	struct file *file = vma->vm_file;
	loff_t offset;

	*prev = vma;
#ifdef CONFIG_SWAP
	if (!file) {
		walk_page_range(vma->vm_mm, start, end, &swapin_walk_ops, vma);
		lru_add_drain();	/* Push any new pages onto the LRU now */
		return 0;
	}

	if (shmem_mapping(file->f_mapping)) {
		force_shm_swapin_readahead(vma, start, end,
					   file->f_mapping);
		return 0;
	}
#else
	if (!file)
		return -EBADF;
#endif

	if (IS_DAX(file_inode(file))) {
		/* no bad return value, but ignore advice */
		return 0;
	}

	/*
	 * Filesystem's fadvise may need to take various locks.  We need to
	 * explicitly grab a reference because the vma (and hence the
	 * vma's reference to the file) can go away as soon as we drop
	 * mmap_lock.
	 */
	*prev = NULL;	/* tell sys_madvise we drop mmap_lock */
	get_file(file);
	offset = (loff_t)(start - vma->vm_start)
			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
	mmap_read_unlock(mm);
	vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED);
	fput(file);
	mmap_read_lock(mm);
	return 0;
}

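/*
 * Page table walker for MADV_COLD and MADV_PAGEOUT: clear the young/referenced
 * state of the mapped pages and either deactivate them (cold) or try to
 * reclaim them right away (pageout).
 */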
static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
				unsigned long addr, unsigned long end,
				struct mm_walk *walk)
{
	struct madvise_walk_private *private = walk->private;
	struct mmu_gather *tlb = private->tlb;
	bool pageout = private->pageout;
	struct mm_struct *mm = tlb->mm;
	struct vm_area_struct *vma = walk->vma;
	pte_t *orig_pte, *pte, ptent;
	spinlock_t *ptl;
	struct page *page = NULL;
	LIST_HEAD(page_list);

	if (fatal_signal_pending(current))
		return -EINTR;

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	if (pmd_trans_huge(*pmd)) {
		pmd_t orig_pmd;
		unsigned long next = pmd_addr_end(addr, end);

		tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
		ptl = pmd_trans_huge_lock(pmd, vma);
		if (!ptl)
			return 0;

		orig_pmd = *pmd;
		if (is_huge_zero_pmd(orig_pmd))
			goto huge_unlock;

		if (unlikely(!pmd_present(orig_pmd))) {
			VM_BUG_ON(thp_migration_supported() &&
					!is_pmd_migration_entry(orig_pmd));
			goto huge_unlock;
		}

		page = pmd_page(orig_pmd);

		/* Do not interfere with other mappings of this page */
		if (page_mapcount(page) != 1)
			goto huge_unlock;

		if (next - addr != HPAGE_PMD_SIZE) {
			int err;

			get_page(page);
			spin_unlock(ptl);
			lock_page(page);
			err = split_huge_page(page);
			unlock_page(page);
			put_page(page);
			if (!err)
				goto regular_page;
			return 0;
		}

		if (pmd_young(orig_pmd)) {
			pmdp_invalidate(vma, addr, pmd);
			orig_pmd = pmd_mkold(orig_pmd);

			set_pmd_at(mm, addr, pmd, orig_pmd);
			tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
		}

		ClearPageReferenced(page);
		test_and_clear_page_young(page);
		if (pageout) {
			if (!isolate_lru_page(page)) {
				if (PageUnevictable(page))
					putback_lru_page(page);
				else
					list_add(&page->lru, &page_list);
			}
		} else
			deactivate_page(page);
huge_unlock:
		spin_unlock(ptl);
		if (pageout)
			reclaim_pages(&page_list);
		return 0;
	}

regular_page:
	if (pmd_trans_unstable(pmd))
		return 0;
#endif
	tlb_change_page_size(tlb, PAGE_SIZE);
	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	flush_tlb_batched_pending(mm);
	arch_enter_lazy_mmu_mode();
	for (; addr < end; pte++, addr += PAGE_SIZE) {
		ptent = *pte;

		if (pte_none(ptent))
			continue;

		if (!pte_present(ptent))
			continue;

		page = vm_normal_page(vma, addr, ptent);
		if (!page || is_zone_device_page(page))
			continue;

		/*
		 * Creating a THP page is expensive so split it only if we
		 * are sure it's worth. Split it if we are only owner.
		 */
		if (PageTransCompound(page)) {
			if (page_mapcount(page) != 1)
				break;
			get_page(page);
			if (!trylock_page(page)) {
				put_page(page);
				break;
			}
			pte_unmap_unlock(orig_pte, ptl);
			if (split_huge_page(page)) {
				unlock_page(page);
				put_page(page);
				orig_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
				break;
			}
			unlock_page(page);
			put_page(page);
			orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
			pte--;
			addr -= PAGE_SIZE;
			continue;
		}

		/*
		 * Do not interfere with other mappings of this page and
		 * non-LRU page.
		 */
		if (!PageLRU(page) || page_mapcount(page) != 1)
			continue;

		VM_BUG_ON_PAGE(PageTransCompound(page), page);

		if (pte_young(ptent)) {
			ptent = ptep_get_and_clear_full(mm, addr, pte,
							tlb->fullmm);
			ptent = pte_mkold(ptent);
			set_pte_at(mm, addr, pte, ptent);
			tlb_remove_tlb_entry(tlb, pte, addr);
		}

		/*
		 * We are deactivating a page for accelerating reclaiming.
		 * VM couldn't reclaim the page unless we clear PG_young.
		 * As a side effect, it makes confuse idle-page tracking
		 * because they will miss recent referenced history.
		 */
		ClearPageReferenced(page);
		test_and_clear_page_young(page);
		if (pageout) {
			if (!isolate_lru_page(page)) {
				if (PageUnevictable(page))
					putback_lru_page(page);
				else
					list_add(&page->lru, &page_list);
			}
		} else
			deactivate_page(page);
	}

	arch_leave_lazy_mmu_mode();
	pte_unmap_unlock(orig_pte, ptl);
	if (pageout)
		reclaim_pages(&page_list);
	cond_resched();

	return 0;
}

static const struct mm_walk_ops cold_walk_ops = {
	.pmd_entry = madvise_cold_or_pageout_pte_range,
};

static void madvise_cold_page_range(struct mmu_gather *tlb,
			     struct vm_area_struct *vma,
			     unsigned long addr, unsigned long end)
{
	struct madvise_walk_private walk_private = {
		.pageout = false,
		.tlb = tlb,
	};

	tlb_start_vma(tlb, vma);
	walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
	tlb_end_vma(tlb, vma);
}

static inline bool can_madv_lru_vma(struct vm_area_struct *vma)
{
	return !(vma->vm_flags & (VM_LOCKED|VM_PFNMAP|VM_HUGETLB));
}

static long madvise_cold(struct vm_area_struct *vma,
			struct vm_area_struct **prev,
			unsigned long start_addr, unsigned long end_addr)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_gather tlb;

	*prev = vma;
	if (!can_madv_lru_vma(vma))
		return -EINVAL;

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm);
	madvise_cold_page_range(&tlb, vma, start_addr, end_addr);
	tlb_finish_mmu(&tlb);

	return 0;
}

static void madvise_pageout_page_range(struct mmu_gather *tlb,
			     struct vm_area_struct *vma,
			     unsigned long addr, unsigned long end)
{
	struct madvise_walk_private walk_private = {
		.pageout = true,
		.tlb = tlb,
	};

	tlb_start_vma(tlb, vma);
	walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
	tlb_end_vma(tlb, vma);
}

static inline bool can_do_pageout(struct vm_area_struct *vma)
{
	if (vma_is_anonymous(vma))
		return true;
	if (!vma->vm_file)
		return false;
	/*
	 * paging out pagecache only for non-anonymous mappings that correspond
	 * to the files the calling process could (usually) open for writing.
	 * Otherwise we'd be including shared non-exclusive mappings, which
	 * opens a side channel.
	 */
	return inode_owner_or_capable(&init_user_ns,
				      file_inode(vma->vm_file)) ||
	       file_permission(vma->vm_file, MAY_WRITE) == 0;
}

static long madvise_pageout(struct vm_area_struct *vma,
			struct vm_area_struct **prev,
			unsigned long start_addr, unsigned long end_addr)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_gather tlb;

	*prev = vma;
	if (!can_madv_lru_vma(vma))
		return -EINVAL;

	if (!can_do_pageout(vma))
		return 0;

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm);
	madvise_pageout_page_range(&tlb, vma, start_addr, end_addr);
	tlb_finish_mmu(&tlb);

	return 0;
}

static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
				unsigned long end, struct mm_walk *walk)
{
	struct mmu_gather *tlb = walk->private;
	struct mm_struct *mm = tlb->mm;
	struct vm_area_struct *vma = walk->vma;
	spinlock_t *ptl;
	pte_t *orig_pte, *pte, ptent;
	struct page *page;
	int nr_swap = 0;
	unsigned long next;

	next = pmd_addr_end(addr, end);
	if (pmd_trans_huge(*pmd))
		if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
			goto next;

	if (pmd_trans_unstable(pmd))
		return 0;

	tlb_change_page_size(tlb, PAGE_SIZE);
	orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	flush_tlb_batched_pending(mm);
	arch_enter_lazy_mmu_mode();
	for (; addr != end; pte++, addr += PAGE_SIZE) {
		ptent = *pte;

		if (pte_none(ptent))
			continue;
		/*
		 * If the pte has swp_entry, just clear page table to
		 * prevent swap-in which is more expensive rather than
		 * (page allocation + zeroing).
		 */
		if (!pte_present(ptent)) {
			swp_entry_t entry;

			entry = pte_to_swp_entry(ptent);
			if (!non_swap_entry(entry)) {
				nr_swap--;
				free_swap_and_cache(entry);
				pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
			} else if (is_hwpoison_entry(entry) ||
				   is_swapin_error_entry(entry)) {
				pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
			}
			continue;
		}

		page = vm_normal_page(vma, addr, ptent);
		if (!page || is_zone_device_page(page))
			continue;

		/*
		 * If pmd isn't transhuge but the page is THP and
		 * is owned by only this process, split it and
		 * deactivate all pages.
		 */
		if (PageTransCompound(page)) {
			if (page_mapcount(page) != 1)
				goto out;
			get_page(page);
			if (!trylock_page(page)) {
				put_page(page);
				goto out;
			}
			pte_unmap_unlock(orig_pte, ptl);
			if (split_huge_page(page)) {
				unlock_page(page);
				put_page(page);
				orig_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
				goto out;
			}
			unlock_page(page);
			put_page(page);
			orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
			pte--;
			addr -= PAGE_SIZE;
			continue;
		}

		VM_BUG_ON_PAGE(PageTransCompound(page), page);

		if (PageSwapCache(page) || PageDirty(page)) {
			if (!trylock_page(page))
				continue;
			/*
			 * If page is shared with others, we mustn't clear
			 * the page's dirty flag.
			 */
			if (page_mapcount(page) != 1) {
				unlock_page(page);
				continue;
			}

			if (PageSwapCache(page) && !try_to_free_swap(page)) {
				unlock_page(page);
				continue;
			}

			ClearPageDirty(page);
			unlock_page(page);
		}

		if (pte_young(ptent) || pte_dirty(ptent)) {
			/*
			 * Some of architecture(ex, PPC) don't update TLB
			 * with set_pte_at and tlb_remove_tlb_entry so for
			 * the portability, remap the pte with old|clean
			 * after pte clearing.
			 */
			ptent = ptep_get_and_clear_full(mm, addr, pte,
							tlb->fullmm);

			ptent = pte_mkold(ptent);
			ptent = pte_mkclean(ptent);
			set_pte_at(mm, addr, pte, ptent);
			tlb_remove_tlb_entry(tlb, pte, addr);
		}
		mark_page_lazyfree(page);
	}
out:
	if (nr_swap) {
		if (current->mm == mm)
			sync_mm_rss(mm);

		add_mm_counter(mm, MM_SWAPENTS, nr_swap);
	}
	arch_leave_lazy_mmu_mode();
	pte_unmap_unlock(orig_pte, ptl);
	cond_resched();
next:
	return 0;
}

static const struct mm_walk_ops madvise_free_walk_ops = {
	.pmd_entry = madvise_free_pte_range,
};

static int madvise_free_single_vma(struct vm_area_struct *vma,
			unsigned long start_addr, unsigned long end_addr)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_notifier_range range;
	struct mmu_gather tlb;

	/* MADV_FREE works for only anon vma at the moment */
	if (!vma_is_anonymous(vma))
		return -EINVAL;

	range.start = max(vma->vm_start, start_addr);
	if (range.start >= vma->vm_end)
		return -EINVAL;
	range.end = min(vma->vm_end, end_addr);
	if (range.end <= vma->vm_start)
		return -EINVAL;
	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
				range.start, range.end);

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm);
	update_hiwater_rss(mm);

	mmu_notifier_invalidate_range_start(&range);
	tlb_start_vma(&tlb, vma);
	walk_page_range(vma->vm_mm, range.start, range.end,
			&madvise_free_walk_ops, &tlb);
	tlb_end_vma(&tlb, vma);
	mmu_notifier_invalidate_range_end(&range);
	tlb_finish_mmu(&tlb);

	return 0;
}

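/*
 * Application no longer needs these pages.  If the pages are dirty,
 * it's OK to just throw them away.  The pages are unmapped and any
 * associated swap is freed, so later accesses fault in fresh
 * (zero-filled or re-read) pages.
 */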
static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
					unsigned long start, unsigned long end)
{
	zap_page_range(vma, start, end - start);
	return 0;
}

static bool madvise_dontneed_free_valid_vma(struct vm_area_struct *vma,
					    unsigned long start,
					    unsigned long *end,
					    int behavior)
{
	if (!is_vm_hugetlb_page(vma)) {
		unsigned int forbidden = VM_PFNMAP;

		if (behavior != MADV_DONTNEED_LOCKED)
			forbidden |= VM_LOCKED;

		return !(vma->vm_flags & forbidden);
	}

	if (behavior != MADV_DONTNEED && behavior != MADV_DONTNEED_LOCKED)
		return false;
	if (start & ~huge_page_mask(hstate_vma(vma)))
		return false;

	*end = ALIGN(*end, huge_page_size(hstate_vma(vma)));
	return true;
}

static long madvise_dontneed_free(struct vm_area_struct *vma,
				  struct vm_area_struct **prev,
				  unsigned long start, unsigned long end,
				  int behavior)
{
	struct mm_struct *mm = vma->vm_mm;

	*prev = vma;
	if (!madvise_dontneed_free_valid_vma(vma, start, &end, behavior))
		return -EINVAL;

	if (!userfaultfd_remove(vma, start, end)) {
		*prev = NULL; /* mmap_lock has been dropped, prev is stale */

		mmap_read_lock(mm);
		vma = find_vma(mm, start);
		if (!vma)
			return -ENOMEM;
		if (start < vma->vm_start) {
			/*
			 * This "vma" under revalidation is the one
			 * with the lowest vma->vm_start where start
			 * is also < vma->vm_end. If start <
			 * vma->vm_start it means a hole materialized
			 * in the user address space within the
			 * virtual range passed to MADV_DONTNEED
			 * or MADV_FREE.
			 */
			return -ENOMEM;
		}
		/*
		 * Potential end adjustment for hugetlb vma is OK as
		 * the check below keeps end within vma.
		 */
		if (!madvise_dontneed_free_valid_vma(vma, start, &end,
						     behavior))
			return -EINVAL;
		if (end > vma->vm_end) {
			/*
			 * Don't fail if end > vma->vm_end. If the old
			 * vma was split while the mmap_lock was
			 * released, another vma may now cover part of
			 * the original range; there is no guarantee
			 * the memory was left unchanged by concurrent
			 * operations, so simply clamp the range to the
			 * revalidated vma.
			 */
			end = vma->vm_end;
		}
		VM_WARN_ON(start >= end);
	}

	if (behavior == MADV_DONTNEED || behavior == MADV_DONTNEED_LOCKED)
		return madvise_dontneed_single_vma(vma, start, end);
	else if (behavior == MADV_FREE)
		return madvise_free_single_vma(vma, start, end);
	else
		return -EINVAL;
}

static long madvise_populate(struct vm_area_struct *vma,
			     struct vm_area_struct **prev,
			     unsigned long start, unsigned long end,
			     int behavior)
{
	const bool write = behavior == MADV_POPULATE_WRITE;
	struct mm_struct *mm = vma->vm_mm;
	unsigned long tmp_end;
	int locked = 1;
	long pages;

	*prev = vma;

	while (start < end) {
		/*
		 * We might have temporarily dropped the lock. For example,
		 * our VMA might have been split.
		 */
		if (!vma || start >= vma->vm_end) {
			vma = vma_lookup(mm, start);
			if (!vma)
				return -ENOMEM;
		}

		tmp_end = min_t(unsigned long, end, vma->vm_end);
		/* Populate (prefault) page tables readable/writable. */
		pages = faultin_vma_page_range(vma, start, tmp_end, write,
					       &locked);
		if (!locked) {
			mmap_read_lock(mm);
			locked = 1;
			*prev = NULL;
			vma = NULL;
		}
		if (pages < 0) {
			switch (pages) {
			case -EINTR:
				return -EINTR;
			case -EINVAL: /* Incompatible mappings / permissions. */
				return -EINVAL;
			case -EHWPOISON:
				return -EHWPOISON;
			case -EFAULT: /* VM_FAULT_SIGBUS or VM_FAULT_SIGSEGV */
				return -EFAULT;
			default:
				pr_warn_once("%s: unhandled return value: %ld\n",
					     __func__, pages);
				fallthrough;
			case -ENOMEM:
				return -ENOMEM;
			}
		}
		start += pages * PAGE_SIZE;
	}
	return 0;
}

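/*
 * Application wants to free up the pages and associated backing store.
 * This is effectively punching a hole into the middle of a file.
 */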
static long madvise_remove(struct vm_area_struct *vma,
				struct vm_area_struct **prev,
				unsigned long start, unsigned long end)
{
	loff_t offset;
	int error;
	struct file *f;
	struct mm_struct *mm = vma->vm_mm;

	*prev = NULL;	/* tell sys_madvise we drop mmap_lock */

	if (vma->vm_flags & VM_LOCKED)
		return -EINVAL;

	f = vma->vm_file;

	if (!f || !f->f_mapping || !f->f_mapping->host) {
		return -EINVAL;
	}

	if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
		return -EACCES;

	offset = (loff_t)(start - vma->vm_start)
			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);

	/*
	 * Filesystem's fallocate may need to take i_rwsem.  We need to
	 * explicitly grab a reference because the vma (and hence the
	 * vma's reference to the file) can go away as soon as we drop
	 * mmap_lock.
	 */
	get_file(f);
	if (userfaultfd_remove(vma, start, end)) {
		/* mmap_lock was not released by userfaultfd_remove() */
		mmap_read_unlock(mm);
	}
	error = vfs_fallocate(f,
				FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
				offset, end - start);
	fput(f);
	mmap_read_lock(mm);
	return error;
}

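/*
 * Apply an madvise behavior to a region of a vma.  madvise_update_vma
 * will handle splitting a vm area into separate areas, each area with its own
 * behavior.
 */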
static int madvise_vma_behavior(struct vm_area_struct *vma,
				struct vm_area_struct **prev,
				unsigned long start, unsigned long end,
				unsigned long behavior)
{
	int error;
	struct anon_vma_name *anon_name;
	unsigned long new_flags = vma->vm_flags;

	switch (behavior) {
	case MADV_REMOVE:
		return madvise_remove(vma, prev, start, end);
	case MADV_WILLNEED:
		return madvise_willneed(vma, prev, start, end);
	case MADV_COLD:
		return madvise_cold(vma, prev, start, end);
	case MADV_PAGEOUT:
		return madvise_pageout(vma, prev, start, end);
	case MADV_FREE:
	case MADV_DONTNEED:
	case MADV_DONTNEED_LOCKED:
		return madvise_dontneed_free(vma, prev, start, end, behavior);
	case MADV_POPULATE_READ:
	case MADV_POPULATE_WRITE:
		return madvise_populate(vma, prev, start, end, behavior);
	case MADV_NORMAL:
		new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
		break;
	case MADV_SEQUENTIAL:
		new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
		break;
	case MADV_RANDOM:
		new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
		break;
	case MADV_DONTFORK:
		new_flags |= VM_DONTCOPY;
		break;
	case MADV_DOFORK:
		if (vma->vm_flags & VM_IO)
			return -EINVAL;
		new_flags &= ~VM_DONTCOPY;
		break;
	case MADV_WIPEONFORK:
		/* MADV_WIPEONFORK is only supported on anonymous memory. */
		if (vma->vm_file || vma->vm_flags & VM_SHARED)
			return -EINVAL;
		new_flags |= VM_WIPEONFORK;
		break;
	case MADV_KEEPONFORK:
		new_flags &= ~VM_WIPEONFORK;
		break;
	case MADV_DONTDUMP:
		new_flags |= VM_DONTDUMP;
		break;
	case MADV_DODUMP:
		if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL)
			return -EINVAL;
		new_flags &= ~VM_DONTDUMP;
		break;
	case MADV_MERGEABLE:
	case MADV_UNMERGEABLE:
		error = ksm_madvise(vma, start, end, behavior, &new_flags);
		if (error)
			goto out;
		break;
	case MADV_HUGEPAGE:
	case MADV_NOHUGEPAGE:
		error = hugepage_madvise(vma, &new_flags, behavior);
		if (error)
			goto out;
		break;
	}

	anon_name = anon_vma_name(vma);
	anon_vma_name_get(anon_name);
	error = madvise_update_vma(vma, prev, start, end, new_flags,
				   anon_name);
	anon_vma_name_put(anon_name);

out:
	/*
	 * madvise() returns EAGAIN if kernel resources, such as
	 * slab, are temporarily unavailable.
	 */
	if (error == -ENOMEM)
		error = -EAGAIN;
	return error;
}


#ifdef CONFIG_MEMORY_FAILURE
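/*
 * Error injection support for memory error handling.
 */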
static int madvise_inject_error(int behavior,
		unsigned long start, unsigned long end)
{
	unsigned long size;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	for (; start < end; start += size) {
		unsigned long pfn;
		struct page *page;
		int ret;

		ret = get_user_pages_fast(start, 1, 0, &page);
		if (ret != 1)
			return ret;
		pfn = page_to_pfn(page);

		/*
		 * When soft offlining hugepages, after migrating the page
		 * we dissolve it, therefore in the second loop "page" will
		 * no longer be a compound page.
		 */
		size = page_size(compound_head(page));

		if (behavior == MADV_SOFT_OFFLINE) {
			pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n",
				 pfn, start);
			ret = soft_offline_page(pfn, MF_COUNT_INCREASED);
		} else {
			pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n",
				 pfn, start);
			ret = memory_failure(pfn, MF_COUNT_INCREASED | MF_SW_SIMULATED);
			if (ret == -EOPNOTSUPP)
				ret = 0;
		}

		if (ret)
			return ret;
	}

	return 0;
}
#endif

static bool
madvise_behavior_valid(int behavior)
{
	switch (behavior) {
	case MADV_DOFORK:
	case MADV_DONTFORK:
	case MADV_NORMAL:
	case MADV_SEQUENTIAL:
	case MADV_RANDOM:
	case MADV_REMOVE:
	case MADV_WILLNEED:
	case MADV_DONTNEED:
	case MADV_DONTNEED_LOCKED:
	case MADV_FREE:
	case MADV_COLD:
	case MADV_PAGEOUT:
	case MADV_POPULATE_READ:
	case MADV_POPULATE_WRITE:
#ifdef CONFIG_KSM
	case MADV_MERGEABLE:
	case MADV_UNMERGEABLE:
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	case MADV_HUGEPAGE:
	case MADV_NOHUGEPAGE:
#endif
	case MADV_DONTDUMP:
	case MADV_DODUMP:
	case MADV_WIPEONFORK:
	case MADV_KEEPONFORK:
#ifdef CONFIG_MEMORY_FAILURE
	case MADV_SOFT_OFFLINE:
	case MADV_HWPOISON:
#endif
		return true;

	default:
		return false;
	}
}

static bool
process_madvise_behavior_valid(int behavior)
{
	switch (behavior) {
	case MADV_COLD:
	case MADV_PAGEOUT:
	case MADV_WILLNEED:
		return true;
	default:
		return false;
	}
}

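/*
 * Walk the vmas in range [start,end), and call the visit function on each one.
 * The visit function will get start and end parameters that cover the overlap
 * between the current vma and the original range.  Any unmapped regions in the
 * original range will result in this function returning -ENOMEM while still
 * calling the visit function on all of the existing vmas in the range.
 * Must be called with the mmap_lock held for reading or writing.
 */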
static
int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
		      unsigned long end, unsigned long arg,
		      int (*visit)(struct vm_area_struct *vma,
				   struct vm_area_struct **prev, unsigned long start,
				   unsigned long end, unsigned long arg))
{
	struct vm_area_struct *vma;
	struct vm_area_struct *prev;
	unsigned long tmp;
	int unmapped_error = 0;

	/*
	 * If the interval [start,end) covers some unmapped address
	 * ranges, just ignore them, but return -ENOMEM at the end.
	 * - different from the way of handling in mlock etc.
	 */
	vma = find_vma_prev(mm, start, &prev);
	if (vma && start > vma->vm_start)
		prev = vma;

	for (;;) {
		int error;

		/* Still start < end. */
		if (!vma)
			return -ENOMEM;

		/* Here start < (end|vma->vm_end). */
		if (start < vma->vm_start) {
			unmapped_error = -ENOMEM;
			start = vma->vm_start;
			if (start >= end)
				break;
		}

		/* Here vma->vm_start <= start < (end|vma->vm_end) */
		tmp = vma->vm_end;
		if (end < tmp)
			tmp = end;

		/* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
		error = visit(vma, &prev, start, tmp, arg);
		if (error)
			return error;
		start = tmp;
		if (prev && start < prev->vm_end)
			start = prev->vm_end;
		if (start >= end)
			break;
		if (prev)
			vma = prev->vm_next;
		else	/* madvise_remove dropped mmap_lock */
			vma = find_vma(mm, start);
	}

	return unmapped_error;
}


#ifdef CONFIG_ANON_VMA_NAME
static int madvise_vma_anon_name(struct vm_area_struct *vma,
				 struct vm_area_struct **prev,
				 unsigned long start, unsigned long end,
				 unsigned long anon_name)
{
	int error;

	/* Only anonymous mappings can be named */
	if (vma->vm_file)
		return -EBADF;

	error = madvise_update_vma(vma, prev, start, end, vma->vm_flags,
				   (struct anon_vma_name *)anon_name);

	/*
	 * madvise() returns EAGAIN if kernel resources, such as
	 * slab, are temporarily unavailable.
	 */
	if (error == -ENOMEM)
		error = -EAGAIN;
	return error;
}

int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
			  unsigned long len_in, struct anon_vma_name *anon_name)
{
	unsigned long end;
	unsigned long len;

	if (start & ~PAGE_MASK)
		return -EINVAL;
	len = (len_in + ~PAGE_MASK) & PAGE_MASK;

	/* Check to see whether len was rounded up from small -ve to zero */
	if (len_in && !len)
		return -EINVAL;

	end = start + len;
	if (end < start)
		return -EINVAL;

	if (end == start)
		return 0;

	return madvise_walk_vmas(mm, start, end, (unsigned long)anon_name,
				 madvise_vma_anon_name);
}
#endif /* CONFIG_ANON_VMA_NAME */
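/*
 * do_madvise - give advice about use of memory in the range [start, start+len)
 *
 * Applies the MADV_* hint in @behavior to every vma covering the range.
 * Returns 0 on success; -EINVAL for a bad behavior, misaligned start or
 * overflowing length; -ENOMEM when part of the range is not mapped;
 * -EINTR if a killable wait for the mmap_lock was interrupted; or the
 * error reported by the individual behavior handler.
 */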
int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior)
{
	unsigned long end;
	int error;
	int write;
	size_t len;
	struct blk_plug plug;

	start = untagged_addr(start);

	if (!madvise_behavior_valid(behavior))
		return -EINVAL;

	if (!PAGE_ALIGNED(start))
		return -EINVAL;
	len = PAGE_ALIGN(len_in);

	/* Check to see whether len was rounded up from small -ve to zero */
	if (len_in && !len)
		return -EINVAL;

	end = start + len;
	if (end < start)
		return -EINVAL;

	if (end == start)
		return 0;

#ifdef CONFIG_MEMORY_FAILURE
	if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
		return madvise_inject_error(behavior, start, start + len_in);
#endif

	write = madvise_need_mmap_write(behavior);
	if (write) {
		if (mmap_write_lock_killable(mm))
			return -EINTR;
	} else {
		mmap_read_lock(mm);
	}

	blk_start_plug(&plug);
	error = madvise_walk_vmas(mm, start, end, behavior,
			madvise_vma_behavior);
	blk_finish_plug(&plug);
	if (write)
		mmap_write_unlock(mm);
	else
		mmap_read_unlock(mm);

	return error;
}

SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
{
	return do_madvise(current->mm, start, len_in, behavior);
}

SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
		size_t, vlen, int, behavior, unsigned int, flags)
{
	ssize_t ret;
	struct iovec iovstack[UIO_FASTIOV], iovec;
	struct iovec *iov = iovstack;
	struct iov_iter iter;
	struct task_struct *task;
	struct mm_struct *mm;
	size_t total_len;
	unsigned int f_flags;

	if (flags != 0) {
		ret = -EINVAL;
		goto out;
	}

	ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
	if (ret < 0)
		goto out;

	task = pidfd_get_task(pidfd, &f_flags);
	if (IS_ERR(task)) {
		ret = PTR_ERR(task);
		goto free_iov;
	}

	if (!process_madvise_behavior_valid(behavior)) {
		ret = -EINVAL;
		goto release_task;
	}

	/* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */
	mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
	if (IS_ERR_OR_NULL(mm)) {
		ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
		goto release_task;
	}

	/*
	 * Require CAP_SYS_NICE for influencing process performance. Note that
	 * only non-destructive hints are currently supported.
	 */
	if (!capable(CAP_SYS_NICE)) {
		ret = -EPERM;
		goto release_mm;
	}

	total_len = iov_iter_count(&iter);

	while (iov_iter_count(&iter)) {
		iovec = iov_iter_iovec(&iter);
		ret = do_madvise(mm, (unsigned long)iovec.iov_base,
					iovec.iov_len, behavior);
		if (ret < 0)
			break;
		iov_iter_advance(&iter, iovec.iov_len);
	}

	ret = (total_len - iov_iter_count(&iter)) ? : ret;

release_mm:
	mmput(mm);
release_task:
	put_task_struct(task);
free_iov:
	kfree(iov);
out:
	return ret;
}