0001 // SPDX-License-Identifier: GPL-2.0
0002 /*
0003  *  linux/mm/madvise.c
0004  *
0005  * Copyright (C) 1999  Linus Torvalds
0006  * Copyright (C) 2002  Christoph Hellwig
0007  */
0008 
0009 #include <linux/mman.h>
0010 #include <linux/pagemap.h>
0011 #include <linux/syscalls.h>
0012 #include <linux/mempolicy.h>
0013 #include <linux/page-isolation.h>
0014 #include <linux/page_idle.h>
0015 #include <linux/userfaultfd_k.h>
0016 #include <linux/hugetlb.h>
0017 #include <linux/falloc.h>
0018 #include <linux/fadvise.h>
0019 #include <linux/sched.h>
0020 #include <linux/sched/mm.h>
0021 #include <linux/mm_inline.h>
0022 #include <linux/string.h>
0023 #include <linux/uio.h>
0024 #include <linux/ksm.h>
0025 #include <linux/fs.h>
0026 #include <linux/file.h>
0027 #include <linux/blkdev.h>
0028 #include <linux/backing-dev.h>
0029 #include <linux/pagewalk.h>
0030 #include <linux/swap.h>
0031 #include <linux/swapops.h>
0032 #include <linux/shmem_fs.h>
0033 #include <linux/mmu_notifier.h>
0034 
0035 #include <asm/tlb.h>
0036 
0037 #include "internal.h"
0038 #include "swap.h"
0039 
0040 struct madvise_walk_private {
0041     struct mmu_gather *tlb;
0042     bool pageout;
0043 };
0044 
0045 /*
0046  * Any behaviour which results in changes to the vma->vm_flags needs to
0047  * take mmap_lock for writing. Others, which simply traverse vmas, need
0048  * to only take it for reading.
0049  */
0050 static int madvise_need_mmap_write(int behavior)
0051 {
0052     switch (behavior) {
0053     case MADV_REMOVE:
0054     case MADV_WILLNEED:
0055     case MADV_DONTNEED:
0056     case MADV_DONTNEED_LOCKED:
0057     case MADV_COLD:
0058     case MADV_PAGEOUT:
0059     case MADV_FREE:
0060     case MADV_POPULATE_READ:
0061     case MADV_POPULATE_WRITE:
0062         return 0;
0063     default:
0064         /* be safe, default to 1. list exceptions explicitly */
0065         return 1;
0066     }
0067 }
0068 
0069 #ifdef CONFIG_ANON_VMA_NAME
0070 struct anon_vma_name *anon_vma_name_alloc(const char *name)
0071 {
0072     struct anon_vma_name *anon_name;
0073     size_t count;
0074 
0075     /* Add 1 for NUL terminator at the end of the anon_name->name */
0076     count = strlen(name) + 1;
0077     anon_name = kmalloc(struct_size(anon_name, name, count), GFP_KERNEL);
0078     if (anon_name) {
0079         kref_init(&anon_name->kref);
0080         memcpy(anon_name->name, name, count);
0081     }
0082 
0083     return anon_name;
0084 }
0085 
0086 void anon_vma_name_free(struct kref *kref)
0087 {
0088     struct anon_vma_name *anon_name =
0089             container_of(kref, struct anon_vma_name, kref);
0090     kfree(anon_name);
0091 }
0092 
0093 struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma)
0094 {
0095     mmap_assert_locked(vma->vm_mm);
0096 
0097     if (vma->vm_file)
0098         return NULL;
0099 
0100     return vma->anon_name;
0101 }
0102 
0103 /* mmap_lock should be write-locked */
0104 static int replace_anon_vma_name(struct vm_area_struct *vma,
0105                  struct anon_vma_name *anon_name)
0106 {
0107     struct anon_vma_name *orig_name = anon_vma_name(vma);
0108 
0109     if (!anon_name) {
0110         vma->anon_name = NULL;
0111         anon_vma_name_put(orig_name);
0112         return 0;
0113     }
0114 
0115     if (anon_vma_name_eq(orig_name, anon_name))
0116         return 0;
0117 
0118     vma->anon_name = anon_vma_name_reuse(anon_name);
0119     anon_vma_name_put(orig_name);
0120 
0121     return 0;
0122 }
0123 #else /* CONFIG_ANON_VMA_NAME */
0124 static int replace_anon_vma_name(struct vm_area_struct *vma,
0125                  struct anon_vma_name *anon_name)
0126 {
0127     if (anon_name)
0128         return -EINVAL;
0129 
0130     return 0;
0131 }
0132 #endif /* CONFIG_ANON_VMA_NAME */
0133 /*
0134  * Update the vm_flags on a region of a vma, splitting or merging it as
0135  * necessary.  Must be called with mmap_lock held for writing.  The caller
0136  * should ensure anon_name stability by raising its refcount even when
0137  * anon_name belongs to a valid vma, because this function might free that vma.
0138  */
0139 static int madvise_update_vma(struct vm_area_struct *vma,
0140                   struct vm_area_struct **prev, unsigned long start,
0141                   unsigned long end, unsigned long new_flags,
0142                   struct anon_vma_name *anon_name)
0143 {
0144     struct mm_struct *mm = vma->vm_mm;
0145     int error;
0146     pgoff_t pgoff;
0147 
0148     if (new_flags == vma->vm_flags && anon_vma_name_eq(anon_vma_name(vma), anon_name)) {
0149         *prev = vma;
0150         return 0;
0151     }
0152 
0153     pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
0154     *prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
0155               vma->vm_file, pgoff, vma_policy(vma),
0156               vma->vm_userfaultfd_ctx, anon_name);
0157     if (*prev) {
0158         vma = *prev;
0159         goto success;
0160     }
0161 
0162     *prev = vma;
0163 
0164     if (start != vma->vm_start) {
0165         if (unlikely(mm->map_count >= sysctl_max_map_count))
0166             return -ENOMEM;
0167         error = __split_vma(mm, vma, start, 1);
0168         if (error)
0169             return error;
0170     }
0171 
0172     if (end != vma->vm_end) {
0173         if (unlikely(mm->map_count >= sysctl_max_map_count))
0174             return -ENOMEM;
0175         error = __split_vma(mm, vma, end, 0);
0176         if (error)
0177             return error;
0178     }
0179 
0180 success:
0181     /*
0182      * vm_flags is protected by the mmap_lock held in write mode.
0183      */
0184     vma->vm_flags = new_flags;
0185     if (!vma->vm_file) {
0186         error = replace_anon_vma_name(vma, anon_name);
0187         if (error)
0188             return error;
0189     }
0190 
0191     return 0;
0192 }
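
/*
 * Example: a minimal userspace sketch of the split behaviour handled by
 * madvise_update_vma().  Applying a flag-changing hint to the middle page of
 * a three-page anonymous mapping leaves three separate VMAs visible in
 * /proc/self/maps, because the middle region no longer shares vm_flags
 * (here VM_DONTCOPY) with its neighbours.  Error handling is elided; all
 * identifiers are standard libc/UAPI names.
 *
 *	#include <sys/mman.h>
 *	#include <unistd.h>
 *
 *	long page = sysconf(_SC_PAGESIZE);
 *	char *p = mmap(NULL, 3 * page, PROT_READ | PROT_WRITE,
 *		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	madvise(p + page, page, MADV_DONTFORK);
 */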
0193 
0194 #ifdef CONFIG_SWAP
0195 static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
0196     unsigned long end, struct mm_walk *walk)
0197 {
0198     struct vm_area_struct *vma = walk->private;
0199     unsigned long index;
0200     struct swap_iocb *splug = NULL;
0201 
0202     if (pmd_none_or_trans_huge_or_clear_bad(pmd))
0203         return 0;
0204 
0205     for (index = start; index != end; index += PAGE_SIZE) {
0206         pte_t pte;
0207         swp_entry_t entry;
0208         struct page *page;
0209         spinlock_t *ptl;
0210         pte_t *ptep;
0211 
0212         ptep = pte_offset_map_lock(vma->vm_mm, pmd, index, &ptl);
0213         pte = *ptep;
0214         pte_unmap_unlock(ptep, ptl);
0215 
0216         if (!is_swap_pte(pte))
0217             continue;
0218         entry = pte_to_swp_entry(pte);
0219         if (unlikely(non_swap_entry(entry)))
0220             continue;
0221 
0222         page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
0223                          vma, index, false, &splug);
0224         if (page)
0225             put_page(page);
0226     }
0227     swap_read_unplug(splug);
0228 
0229     return 0;
0230 }
0231 
0232 static const struct mm_walk_ops swapin_walk_ops = {
0233     .pmd_entry      = swapin_walk_pmd_entry,
0234 };
0235 
0236 static void force_shm_swapin_readahead(struct vm_area_struct *vma,
0237         unsigned long start, unsigned long end,
0238         struct address_space *mapping)
0239 {
0240     XA_STATE(xas, &mapping->i_pages, linear_page_index(vma, start));
0241     pgoff_t end_index = linear_page_index(vma, end + PAGE_SIZE - 1);
0242     struct page *page;
0243     struct swap_iocb *splug = NULL;
0244 
0245     rcu_read_lock();
0246     xas_for_each(&xas, page, end_index) {
0247         swp_entry_t swap;
0248 
0249         if (!xa_is_value(page))
0250             continue;
0251         swap = radix_to_swp_entry(page);
0252         /* There might be swapin error entries in shmem mapping. */
0253         if (non_swap_entry(swap))
0254             continue;
0255         xas_pause(&xas);
0256         rcu_read_unlock();
0257 
0258         page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
0259                          NULL, 0, false, &splug);
0260         if (page)
0261             put_page(page);
0262 
0263         rcu_read_lock();
0264     }
0265     rcu_read_unlock();
0266     swap_read_unplug(splug);
0267 
0268     lru_add_drain();    /* Push any new pages onto the LRU now */
0269 }
0270 #endif      /* CONFIG_SWAP */
0271 
0272 /*
0273  * Schedule all required I/O operations.  Do not wait for completion.
0274  */
0275 static long madvise_willneed(struct vm_area_struct *vma,
0276                  struct vm_area_struct **prev,
0277                  unsigned long start, unsigned long end)
0278 {
0279     struct mm_struct *mm = vma->vm_mm;
0280     struct file *file = vma->vm_file;
0281     loff_t offset;
0282 
0283     *prev = vma;
0284 #ifdef CONFIG_SWAP
0285     if (!file) {
0286         walk_page_range(vma->vm_mm, start, end, &swapin_walk_ops, vma);
0287         lru_add_drain(); /* Push any new pages onto the LRU now */
0288         return 0;
0289     }
0290 
0291     if (shmem_mapping(file->f_mapping)) {
0292         force_shm_swapin_readahead(vma, start, end,
0293                     file->f_mapping);
0294         return 0;
0295     }
0296 #else
0297     if (!file)
0298         return -EBADF;
0299 #endif
0300 
0301     if (IS_DAX(file_inode(file))) {
0302         /* no bad return value, but ignore advice */
0303         return 0;
0304     }
0305 
0306     /*
0307      * Filesystem's fadvise may need to take various locks.  We need to
0308      * explicitly grab a reference because the vma (and hence the
0309      * vma's reference to the file) can go away as soon as we drop
0310      * mmap_lock.
0311      */
0312     *prev = NULL;   /* tell sys_madvise we drop mmap_lock */
0313     get_file(file);
0314     offset = (loff_t)(start - vma->vm_start)
0315             + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
0316     mmap_read_unlock(mm);
0317     vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED);
0318     fput(file);
0319     mmap_read_lock(mm);
0320     return 0;
0321 }
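
/*
 * Example: a minimal userspace sketch of MADV_WILLNEED on a file mapping.
 * The kernel schedules readahead for the range (via the fadvise path above)
 * and returns without waiting for the I/O.  "data.bin" and the 1 MiB length
 * are assumptions; len must not exceed the file size.  Error handling is
 * elided.
 *
 *	#include <fcntl.h>
 *	#include <sys/mman.h>
 *
 *	size_t len = 1 << 20;
 *	int fd = open("data.bin", O_RDONLY);
 *	char *p = mmap(NULL, len, PROT_READ, MAP_PRIVATE, fd, 0);
 *	madvise(p, len, MADV_WILLNEED);
 */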
0322 
0323 static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
0324                 unsigned long addr, unsigned long end,
0325                 struct mm_walk *walk)
0326 {
0327     struct madvise_walk_private *private = walk->private;
0328     struct mmu_gather *tlb = private->tlb;
0329     bool pageout = private->pageout;
0330     struct mm_struct *mm = tlb->mm;
0331     struct vm_area_struct *vma = walk->vma;
0332     pte_t *orig_pte, *pte, ptent;
0333     spinlock_t *ptl;
0334     struct page *page = NULL;
0335     LIST_HEAD(page_list);
0336 
0337     if (fatal_signal_pending(current))
0338         return -EINTR;
0339 
0340 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
0341     if (pmd_trans_huge(*pmd)) {
0342         pmd_t orig_pmd;
0343         unsigned long next = pmd_addr_end(addr, end);
0344 
0345         tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
0346         ptl = pmd_trans_huge_lock(pmd, vma);
0347         if (!ptl)
0348             return 0;
0349 
0350         orig_pmd = *pmd;
0351         if (is_huge_zero_pmd(orig_pmd))
0352             goto huge_unlock;
0353 
0354         if (unlikely(!pmd_present(orig_pmd))) {
0355             VM_BUG_ON(thp_migration_supported() &&
0356                     !is_pmd_migration_entry(orig_pmd));
0357             goto huge_unlock;
0358         }
0359 
0360         page = pmd_page(orig_pmd);
0361 
0362         /* Do not interfere with other mappings of this page */
0363         if (page_mapcount(page) != 1)
0364             goto huge_unlock;
0365 
0366         if (next - addr != HPAGE_PMD_SIZE) {
0367             int err;
0368 
0369             get_page(page);
0370             spin_unlock(ptl);
0371             lock_page(page);
0372             err = split_huge_page(page);
0373             unlock_page(page);
0374             put_page(page);
0375             if (!err)
0376                 goto regular_page;
0377             return 0;
0378         }
0379 
0380         if (pmd_young(orig_pmd)) {
0381             pmdp_invalidate(vma, addr, pmd);
0382             orig_pmd = pmd_mkold(orig_pmd);
0383 
0384             set_pmd_at(mm, addr, pmd, orig_pmd);
0385             tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
0386         }
0387 
0388         ClearPageReferenced(page);
0389         test_and_clear_page_young(page);
0390         if (pageout) {
0391             if (!isolate_lru_page(page)) {
0392                 if (PageUnevictable(page))
0393                     putback_lru_page(page);
0394                 else
0395                     list_add(&page->lru, &page_list);
0396             }
0397         } else
0398             deactivate_page(page);
0399 huge_unlock:
0400         spin_unlock(ptl);
0401         if (pageout)
0402             reclaim_pages(&page_list);
0403         return 0;
0404     }
0405 
0406 regular_page:
0407     if (pmd_trans_unstable(pmd))
0408         return 0;
0409 #endif
0410     tlb_change_page_size(tlb, PAGE_SIZE);
0411     orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
0412     flush_tlb_batched_pending(mm);
0413     arch_enter_lazy_mmu_mode();
0414     for (; addr < end; pte++, addr += PAGE_SIZE) {
0415         ptent = *pte;
0416 
0417         if (pte_none(ptent))
0418             continue;
0419 
0420         if (!pte_present(ptent))
0421             continue;
0422 
0423         page = vm_normal_page(vma, addr, ptent);
0424         if (!page || is_zone_device_page(page))
0425             continue;
0426 
0427         /*
0428          * Creating a THP page is expensive so split it only if we
0429          * are sure it's worth it. Split it if we are the only owner.
0430          */
0431         if (PageTransCompound(page)) {
0432             if (page_mapcount(page) != 1)
0433                 break;
0434             get_page(page);
0435             if (!trylock_page(page)) {
0436                 put_page(page);
0437                 break;
0438             }
0439             pte_unmap_unlock(orig_pte, ptl);
0440             if (split_huge_page(page)) {
0441                 unlock_page(page);
0442                 put_page(page);
0443                 orig_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
0444                 break;
0445             }
0446             unlock_page(page);
0447             put_page(page);
0448             orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
0449             pte--;
0450             addr -= PAGE_SIZE;
0451             continue;
0452         }
0453 
0454         /*
0455          * Do not interfere with other mappings of this page, and
0456          * skip non-LRU pages.
0457          */
0458         if (!PageLRU(page) || page_mapcount(page) != 1)
0459             continue;
0460 
0461         VM_BUG_ON_PAGE(PageTransCompound(page), page);
0462 
0463         if (pte_young(ptent)) {
0464             ptent = ptep_get_and_clear_full(mm, addr, pte,
0465                             tlb->fullmm);
0466             ptent = pte_mkold(ptent);
0467             set_pte_at(mm, addr, pte, ptent);
0468             tlb_remove_tlb_entry(tlb, pte, addr);
0469         }
0470 
0471         /*
0472          * We are deactivating a page to accelerate its reclaim.
0473          * The VM cannot reclaim the page unless we clear PG_young.
0474          * As a side effect, this confuses idle-page tracking,
0475          * which will miss the recent reference history.
0476          */
0477         ClearPageReferenced(page);
0478         test_and_clear_page_young(page);
0479         if (pageout) {
0480             if (!isolate_lru_page(page)) {
0481                 if (PageUnevictable(page))
0482                     putback_lru_page(page);
0483                 else
0484                     list_add(&page->lru, &page_list);
0485             }
0486         } else
0487             deactivate_page(page);
0488     }
0489 
0490     arch_leave_lazy_mmu_mode();
0491     pte_unmap_unlock(orig_pte, ptl);
0492     if (pageout)
0493         reclaim_pages(&page_list);
0494     cond_resched();
0495 
0496     return 0;
0497 }
0498 
0499 static const struct mm_walk_ops cold_walk_ops = {
0500     .pmd_entry = madvise_cold_or_pageout_pte_range,
0501 };
0502 
0503 static void madvise_cold_page_range(struct mmu_gather *tlb,
0504                  struct vm_area_struct *vma,
0505                  unsigned long addr, unsigned long end)
0506 {
0507     struct madvise_walk_private walk_private = {
0508         .pageout = false,
0509         .tlb = tlb,
0510     };
0511 
0512     tlb_start_vma(tlb, vma);
0513     walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
0514     tlb_end_vma(tlb, vma);
0515 }
0516 
0517 static inline bool can_madv_lru_vma(struct vm_area_struct *vma)
0518 {
0519     return !(vma->vm_flags & (VM_LOCKED|VM_PFNMAP|VM_HUGETLB));
0520 }
0521 
0522 static long madvise_cold(struct vm_area_struct *vma,
0523             struct vm_area_struct **prev,
0524             unsigned long start_addr, unsigned long end_addr)
0525 {
0526     struct mm_struct *mm = vma->vm_mm;
0527     struct mmu_gather tlb;
0528 
0529     *prev = vma;
0530     if (!can_madv_lru_vma(vma))
0531         return -EINVAL;
0532 
0533     lru_add_drain();
0534     tlb_gather_mmu(&tlb, mm);
0535     madvise_cold_page_range(&tlb, vma, start_addr, end_addr);
0536     tlb_finish_mmu(&tlb);
0537 
0538     return 0;
0539 }
0540 
0541 static void madvise_pageout_page_range(struct mmu_gather *tlb,
0542                  struct vm_area_struct *vma,
0543                  unsigned long addr, unsigned long end)
0544 {
0545     struct madvise_walk_private walk_private = {
0546         .pageout = true,
0547         .tlb = tlb,
0548     };
0549 
0550     tlb_start_vma(tlb, vma);
0551     walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
0552     tlb_end_vma(tlb, vma);
0553 }
0554 
0555 static inline bool can_do_pageout(struct vm_area_struct *vma)
0556 {
0557     if (vma_is_anonymous(vma))
0558         return true;
0559     if (!vma->vm_file)
0560         return false;
0561     /*
0562      * Page out the pagecache only for non-anonymous mappings that correspond
0563      * to files the calling process could (if it tried) open for writing;
0564      * otherwise we'd be including shared non-exclusive mappings, which
0565      * opens a side channel.
0566      */
0567     return inode_owner_or_capable(&init_user_ns,
0568                       file_inode(vma->vm_file)) ||
0569            file_permission(vma->vm_file, MAY_WRITE) == 0;
0570 }
0571 
0572 static long madvise_pageout(struct vm_area_struct *vma,
0573             struct vm_area_struct **prev,
0574             unsigned long start_addr, unsigned long end_addr)
0575 {
0576     struct mm_struct *mm = vma->vm_mm;
0577     struct mmu_gather tlb;
0578 
0579     *prev = vma;
0580     if (!can_madv_lru_vma(vma))
0581         return -EINVAL;
0582 
0583     if (!can_do_pageout(vma))
0584         return 0;
0585 
0586     lru_add_drain();
0587     tlb_gather_mmu(&tlb, mm);
0588     madvise_pageout_page_range(&tlb, vma, start_addr, end_addr);
0589     tlb_finish_mmu(&tlb);
0590 
0591     return 0;
0592 }
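
/*
 * Example: a minimal userspace sketch contrasting MADV_COLD and
 * MADV_PAGEOUT.  MADV_COLD only deactivates the pages so they are reclaimed
 * first under memory pressure; MADV_PAGEOUT reclaims them right away,
 * writing dirty anonymous pages to swap.  Both are hints and leave the
 * mapping contents intact.  Normally only one of the two calls would be
 * used; error handling is elided.
 *
 *	#include <string.h>
 *	#include <sys/mman.h>
 *
 *	size_t len = 1 << 20;
 *	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	memset(buf, 0xaa, len);
 *	madvise(buf, len, MADV_COLD);
 *	madvise(buf, len, MADV_PAGEOUT);
 */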
0593 
0594 static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
0595                 unsigned long end, struct mm_walk *walk)
0596 
0597 {
0598     struct mmu_gather *tlb = walk->private;
0599     struct mm_struct *mm = tlb->mm;
0600     struct vm_area_struct *vma = walk->vma;
0601     spinlock_t *ptl;
0602     pte_t *orig_pte, *pte, ptent;
0603     struct page *page;
0604     int nr_swap = 0;
0605     unsigned long next;
0606 
0607     next = pmd_addr_end(addr, end);
0608     if (pmd_trans_huge(*pmd))
0609         if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
0610             goto next;
0611 
0612     if (pmd_trans_unstable(pmd))
0613         return 0;
0614 
0615     tlb_change_page_size(tlb, PAGE_SIZE);
0616     orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
0617     flush_tlb_batched_pending(mm);
0618     arch_enter_lazy_mmu_mode();
0619     for (; addr != end; pte++, addr += PAGE_SIZE) {
0620         ptent = *pte;
0621 
0622         if (pte_none(ptent))
0623             continue;
0624         /*
0625          * If the pte holds a swap entry, just clear the page table
0626          * entry to prevent swap-in, which is more expensive than
0627          * (page allocation + zeroing).
0628          */
0629         if (!pte_present(ptent)) {
0630             swp_entry_t entry;
0631 
0632             entry = pte_to_swp_entry(ptent);
0633             if (!non_swap_entry(entry)) {
0634                 nr_swap--;
0635                 free_swap_and_cache(entry);
0636                 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
0637             } else if (is_hwpoison_entry(entry) ||
0638                    is_swapin_error_entry(entry)) {
0639                 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
0640             }
0641             continue;
0642         }
0643 
0644         page = vm_normal_page(vma, addr, ptent);
0645         if (!page || is_zone_device_page(page))
0646             continue;
0647 
0648         /*
0649          * If the pmd isn't transhuge but the page is a THP
0650          * owned only by this process, split it and
0651          * deactivate all of its pages.
0652          */
0653         if (PageTransCompound(page)) {
0654             if (page_mapcount(page) != 1)
0655                 goto out;
0656             get_page(page);
0657             if (!trylock_page(page)) {
0658                 put_page(page);
0659                 goto out;
0660             }
0661             pte_unmap_unlock(orig_pte, ptl);
0662             if (split_huge_page(page)) {
0663                 unlock_page(page);
0664                 put_page(page);
0665                 orig_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
0666                 goto out;
0667             }
0668             unlock_page(page);
0669             put_page(page);
0670             orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
0671             pte--;
0672             addr -= PAGE_SIZE;
0673             continue;
0674         }
0675 
0676         VM_BUG_ON_PAGE(PageTransCompound(page), page);
0677 
0678         if (PageSwapCache(page) || PageDirty(page)) {
0679             if (!trylock_page(page))
0680                 continue;
0681             /*
0682              * If the page is shared with others, we cannot
0683              * clear its PG_dirty bit.
0684              */
0685             if (page_mapcount(page) != 1) {
0686                 unlock_page(page);
0687                 continue;
0688             }
0689 
0690             if (PageSwapCache(page) && !try_to_free_swap(page)) {
0691                 unlock_page(page);
0692                 continue;
0693             }
0694 
0695             ClearPageDirty(page);
0696             unlock_page(page);
0697         }
0698 
0699         if (pte_young(ptent) || pte_dirty(ptent)) {
0700             /*
0701              * Some architectures (e.g. PPC) don't update the TLB
0702              * with set_pte_at() and tlb_remove_tlb_entry(), so for
0703              * portability, re-install the pte as old and clean
0704              * after clearing it.
0705              */
0706             ptent = ptep_get_and_clear_full(mm, addr, pte,
0707                             tlb->fullmm);
0708 
0709             ptent = pte_mkold(ptent);
0710             ptent = pte_mkclean(ptent);
0711             set_pte_at(mm, addr, pte, ptent);
0712             tlb_remove_tlb_entry(tlb, pte, addr);
0713         }
0714         mark_page_lazyfree(page);
0715     }
0716 out:
0717     if (nr_swap) {
0718         if (current->mm == mm)
0719             sync_mm_rss(mm);
0720 
0721         add_mm_counter(mm, MM_SWAPENTS, nr_swap);
0722     }
0723     arch_leave_lazy_mmu_mode();
0724     pte_unmap_unlock(orig_pte, ptl);
0725     cond_resched();
0726 next:
0727     return 0;
0728 }
0729 
0730 static const struct mm_walk_ops madvise_free_walk_ops = {
0731     .pmd_entry      = madvise_free_pte_range,
0732 };
0733 
0734 static int madvise_free_single_vma(struct vm_area_struct *vma,
0735             unsigned long start_addr, unsigned long end_addr)
0736 {
0737     struct mm_struct *mm = vma->vm_mm;
0738     struct mmu_notifier_range range;
0739     struct mmu_gather tlb;
0740 
0741     /* MADV_FREE works for only anon vma at the moment */
0742     if (!vma_is_anonymous(vma))
0743         return -EINVAL;
0744 
0745     range.start = max(vma->vm_start, start_addr);
0746     if (range.start >= vma->vm_end)
0747         return -EINVAL;
0748     range.end = min(vma->vm_end, end_addr);
0749     if (range.end <= vma->vm_start)
0750         return -EINVAL;
0751     mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
0752                 range.start, range.end);
0753 
0754     lru_add_drain();
0755     tlb_gather_mmu(&tlb, mm);
0756     update_hiwater_rss(mm);
0757 
0758     mmu_notifier_invalidate_range_start(&range);
0759     tlb_start_vma(&tlb, vma);
0760     walk_page_range(vma->vm_mm, range.start, range.end,
0761             &madvise_free_walk_ops, &tlb);
0762     tlb_end_vma(&tlb, vma);
0763     mmu_notifier_invalidate_range_end(&range);
0764     tlb_finish_mmu(&tlb);
0765 
0766     return 0;
0767 }
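
/*
 * Example: a minimal userspace sketch of MADV_FREE on anonymous memory.
 * After the call the kernel may reclaim the pages lazily under memory
 * pressure without writing them to swap; writing to a page again cancels
 * the lazy free for that page.  Error handling is elided.
 *
 *	#include <string.h>
 *	#include <sys/mman.h>
 *
 *	size_t len = 1 << 20;
 *	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	memset(p, 0xaa, len);
 *	madvise(p, len, MADV_FREE);
 *	p[0] = 1;
 */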
0768 
0769 /*
0770  * Application no longer needs these pages.  If the pages are dirty,
0771  * it's OK to just throw them away.  The app will be more careful about
0772  * data it wants to keep.  Be sure to free swap resources too.  The
0773  * zap_page_range call sets things up for shrink_active_list to actually free
0774  * these pages later if no one else has touched them in the meantime,
0775  * although we could add these pages to a global reuse list for
0776  * shrink_active_list to pick up before reclaiming other pages.
0777  *
0778  * NB: This interface discards data rather than pushes it out to swap,
0779  * as some implementations do.  This has performance implications for
0780  * applications like large transactional databases which want to discard
0781  * pages in anonymous maps after committing to backing store the data
0782  * that was kept in them.  There is no reason to write this data out to
0783  * the swap area if the application is discarding it.
0784  *
0785  * An interface that causes the system to free clean pages and flush
0786  * dirty pages is already available as msync(MS_INVALIDATE).
0787  */
0788 static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
0789                     unsigned long start, unsigned long end)
0790 {
0791     zap_page_range(vma, start, end - start);
0792     return 0;
0793 }
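
/*
 * Example: a minimal userspace sketch of MADV_DONTNEED on a private
 * anonymous mapping.  The pages are discarded immediately and the next
 * access observes zero-filled pages rather than the old contents.  Error
 * handling is elided.
 *
 *	#include <assert.h>
 *	#include <sys/mman.h>
 *
 *	size_t len = 1 << 20;
 *	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	p[0] = 42;
 *	madvise(p, len, MADV_DONTNEED);
 *	assert(p[0] == 0);
 */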
0794 
0795 static bool madvise_dontneed_free_valid_vma(struct vm_area_struct *vma,
0796                         unsigned long start,
0797                         unsigned long *end,
0798                         int behavior)
0799 {
0800     if (!is_vm_hugetlb_page(vma)) {
0801         unsigned int forbidden = VM_PFNMAP;
0802 
0803         if (behavior != MADV_DONTNEED_LOCKED)
0804             forbidden |= VM_LOCKED;
0805 
0806         return !(vma->vm_flags & forbidden);
0807     }
0808 
0809     if (behavior != MADV_DONTNEED && behavior != MADV_DONTNEED_LOCKED)
0810         return false;
0811     if (start & ~huge_page_mask(hstate_vma(vma)))
0812         return false;
0813 
0814     *end = ALIGN(*end, huge_page_size(hstate_vma(vma)));
0815     return true;
0816 }
0817 
0818 static long madvise_dontneed_free(struct vm_area_struct *vma,
0819                   struct vm_area_struct **prev,
0820                   unsigned long start, unsigned long end,
0821                   int behavior)
0822 {
0823     struct mm_struct *mm = vma->vm_mm;
0824 
0825     *prev = vma;
0826     if (!madvise_dontneed_free_valid_vma(vma, start, &end, behavior))
0827         return -EINVAL;
0828 
0829     if (!userfaultfd_remove(vma, start, end)) {
0830         *prev = NULL; /* mmap_lock has been dropped, prev is stale */
0831 
0832         mmap_read_lock(mm);
0833         vma = find_vma(mm, start);
0834         if (!vma)
0835             return -ENOMEM;
0836         if (start < vma->vm_start) {
0837             /*
0838              * This "vma" under revalidation is the one
0839              * with the lowest vma->vm_start where start
0840              * is also < vma->vm_end. If start <
0841              * vma->vm_start it means a hole materialized
0842              * in the user address space within the
0843              * virtual range passed to MADV_DONTNEED
0844              * or MADV_FREE.
0845              */
0846             return -ENOMEM;
0847         }
0848         /*
0849          * Potential end adjustment for hugetlb vma is OK as
0850          * the check below keeps end within vma.
0851          */
0852         if (!madvise_dontneed_free_valid_vma(vma, start, &end,
0853                              behavior))
0854             return -EINVAL;
0855         if (end > vma->vm_end) {
0856             /*
0857              * Don't fail if end > vma->vm_end. If the old
0858              * vma was split while the mmap_lock was
0859              * released, the effect of that concurrent
0860              * operation need not leave madvise() with
0861              * an undefined result. There may be an
0862              * adjacent next vma that we'll walk
0863              * next. userfaultfd_remove() will generate an
0864              * UFFD_EVENT_REMOVE repetition on the
0865              * end-vma->vm_end range, but the manager can
0866              * handle a repetition fine.
0867              */
0868             end = vma->vm_end;
0869         }
0870         VM_WARN_ON(start >= end);
0871     }
0872 
0873     if (behavior == MADV_DONTNEED || behavior == MADV_DONTNEED_LOCKED)
0874         return madvise_dontneed_single_vma(vma, start, end);
0875     else if (behavior == MADV_FREE)
0876         return madvise_free_single_vma(vma, start, end);
0877     else
0878         return -EINVAL;
0879 }
0880 
0881 static long madvise_populate(struct vm_area_struct *vma,
0882                  struct vm_area_struct **prev,
0883                  unsigned long start, unsigned long end,
0884                  int behavior)
0885 {
0886     const bool write = behavior == MADV_POPULATE_WRITE;
0887     struct mm_struct *mm = vma->vm_mm;
0888     unsigned long tmp_end;
0889     int locked = 1;
0890     long pages;
0891 
0892     *prev = vma;
0893 
0894     while (start < end) {
0895         /*
0896          * We might have temporarily dropped the lock. For example,
0897          * our VMA might have been split.
0898          */
0899         if (!vma || start >= vma->vm_end) {
0900             vma = vma_lookup(mm, start);
0901             if (!vma)
0902                 return -ENOMEM;
0903         }
0904 
0905         tmp_end = min_t(unsigned long, end, vma->vm_end);
0906         /* Populate (prefault) page tables readable/writable. */
0907         pages = faultin_vma_page_range(vma, start, tmp_end, write,
0908                            &locked);
0909         if (!locked) {
0910             mmap_read_lock(mm);
0911             locked = 1;
0912             *prev = NULL;
0913             vma = NULL;
0914         }
0915         if (pages < 0) {
0916             switch (pages) {
0917             case -EINTR:
0918                 return -EINTR;
0919             case -EINVAL: /* Incompatible mappings / permissions. */
0920                 return -EINVAL;
0921             case -EHWPOISON:
0922                 return -EHWPOISON;
0923             case -EFAULT: /* VM_FAULT_SIGBUS or VM_FAULT_SIGSEGV */
0924                 return -EFAULT;
0925             default:
0926                 pr_warn_once("%s: unhandled return value: %ld\n",
0927                          __func__, pages);
0928                 fallthrough;
0929             case -ENOMEM:
0930                 return -ENOMEM;
0931             }
0932         }
0933         start += pages * PAGE_SIZE;
0934     }
0935     return 0;
0936 }
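
/*
 * Example: a minimal userspace sketch of MADV_POPULATE_WRITE.  The call
 * prefaults the page tables writable (triggering any needed write faults)
 * so that later stores do not take page faults; unlike mlock() it does not
 * pin the pages.  MADV_POPULATE_READ is the read-only counterpart.
 * Requires Linux 5.14 or newer; error handling is elided.
 *
 *	#include <sys/mman.h>
 *
 *	size_t len = 1 << 20;
 *	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	madvise(p, len, MADV_POPULATE_WRITE);
 */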
0937 
0938 /*
0939  * Application wants to free up the pages and associated backing store.
0940  * This is effectively punching a hole into the middle of a file.
0941  */
0942 static long madvise_remove(struct vm_area_struct *vma,
0943                 struct vm_area_struct **prev,
0944                 unsigned long start, unsigned long end)
0945 {
0946     loff_t offset;
0947     int error;
0948     struct file *f;
0949     struct mm_struct *mm = vma->vm_mm;
0950 
0951     *prev = NULL;   /* tell sys_madvise we drop mmap_lock */
0952 
0953     if (vma->vm_flags & VM_LOCKED)
0954         return -EINVAL;
0955 
0956     f = vma->vm_file;
0957 
0958     if (!f || !f->f_mapping || !f->f_mapping->host)
0959         return -EINVAL;
0961 
0962     if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
0963         return -EACCES;
0964 
0965     offset = (loff_t)(start - vma->vm_start)
0966             + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
0967 
0968     /*
0969      * Filesystem's fallocate may need to take i_rwsem.  We need to
0970      * explicitly grab a reference because the vma (and hence the
0971      * vma's reference to the file) can go away as soon as we drop
0972      * mmap_lock.
0973      */
0974     get_file(f);
0975     if (userfaultfd_remove(vma, start, end)) {
0976         /* mmap_lock was not released by userfaultfd_remove() */
0977         mmap_read_unlock(mm);
0978     }
0979     error = vfs_fallocate(f,
0980                 FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
0981                 offset, end - start);
0982     fput(f);
0983     mmap_read_lock(mm);
0984     return error;
0985 }
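
/*
 * Example: a minimal userspace sketch of MADV_REMOVE on a shared file
 * mapping.  The range is punched out of the backing file (the fallocate
 * hole-punch path above), so the mapping must be MAP_SHARED and writable
 * and the filesystem must support hole punching.  "/dev/shm/scratch" is a
 * hypothetical tmpfs file; error handling is elided.
 *
 *	#include <fcntl.h>
 *	#include <sys/mman.h>
 *	#include <unistd.h>
 *
 *	size_t len = 1 << 20;
 *	int fd = open("/dev/shm/scratch", O_RDWR | O_CREAT, 0600);
 *	ftruncate(fd, len);
 *	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *	madvise(p, len, MADV_REMOVE);
 */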
0986 
0987 /*
0988  * Apply an madvise behavior to a region of a vma.  madvise_update_vma
0989  * will handle splitting a vm area into separate areas, each area with its own
0990  * behavior.
0991  */
0992 static int madvise_vma_behavior(struct vm_area_struct *vma,
0993                 struct vm_area_struct **prev,
0994                 unsigned long start, unsigned long end,
0995                 unsigned long behavior)
0996 {
0997     int error;
0998     struct anon_vma_name *anon_name;
0999     unsigned long new_flags = vma->vm_flags;
1000 
1001     switch (behavior) {
1002     case MADV_REMOVE:
1003         return madvise_remove(vma, prev, start, end);
1004     case MADV_WILLNEED:
1005         return madvise_willneed(vma, prev, start, end);
1006     case MADV_COLD:
1007         return madvise_cold(vma, prev, start, end);
1008     case MADV_PAGEOUT:
1009         return madvise_pageout(vma, prev, start, end);
1010     case MADV_FREE:
1011     case MADV_DONTNEED:
1012     case MADV_DONTNEED_LOCKED:
1013         return madvise_dontneed_free(vma, prev, start, end, behavior);
1014     case MADV_POPULATE_READ:
1015     case MADV_POPULATE_WRITE:
1016         return madvise_populate(vma, prev, start, end, behavior);
1017     case MADV_NORMAL:
1018         new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
1019         break;
1020     case MADV_SEQUENTIAL:
1021         new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
1022         break;
1023     case MADV_RANDOM:
1024         new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
1025         break;
1026     case MADV_DONTFORK:
1027         new_flags |= VM_DONTCOPY;
1028         break;
1029     case MADV_DOFORK:
1030         if (vma->vm_flags & VM_IO)
1031             return -EINVAL;
1032         new_flags &= ~VM_DONTCOPY;
1033         break;
1034     case MADV_WIPEONFORK:
1035         /* MADV_WIPEONFORK is only supported on anonymous memory. */
1036         if (vma->vm_file || vma->vm_flags & VM_SHARED)
1037             return -EINVAL;
1038         new_flags |= VM_WIPEONFORK;
1039         break;
1040     case MADV_KEEPONFORK:
1041         new_flags &= ~VM_WIPEONFORK;
1042         break;
1043     case MADV_DONTDUMP:
1044         new_flags |= VM_DONTDUMP;
1045         break;
1046     case MADV_DODUMP:
1047         if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL)
1048             return -EINVAL;
1049         new_flags &= ~VM_DONTDUMP;
1050         break;
1051     case MADV_MERGEABLE:
1052     case MADV_UNMERGEABLE:
1053         error = ksm_madvise(vma, start, end, behavior, &new_flags);
1054         if (error)
1055             goto out;
1056         break;
1057     case MADV_HUGEPAGE:
1058     case MADV_NOHUGEPAGE:
1059         error = hugepage_madvise(vma, &new_flags, behavior);
1060         if (error)
1061             goto out;
1062         break;
1063     }
1064 
1065     anon_name = anon_vma_name(vma);
1066     anon_vma_name_get(anon_name);
1067     error = madvise_update_vma(vma, prev, start, end, new_flags,
1068                    anon_name);
1069     anon_vma_name_put(anon_name);
1070 
1071 out:
1072     /*
1073      * madvise() returns EAGAIN if kernel resources, such as
1074      * slab, are temporarily unavailable.
1075      */
1076     if (error == -ENOMEM)
1077         error = -EAGAIN;
1078     return error;
1079 }
1080 
1081 #ifdef CONFIG_MEMORY_FAILURE
1082 /*
1083  * Error injection support for memory error handling.
1084  */
1085 static int madvise_inject_error(int behavior,
1086         unsigned long start, unsigned long end)
1087 {
1088     unsigned long size;
1089 
1090     if (!capable(CAP_SYS_ADMIN))
1091         return -EPERM;
1092 
1093 
1094     for (; start < end; start += size) {
1095         unsigned long pfn;
1096         struct page *page;
1097         int ret;
1098 
1099         ret = get_user_pages_fast(start, 1, 0, &page);
1100         if (ret != 1)
1101             return ret;
1102         pfn = page_to_pfn(page);
1103 
1104         /*
1105          * When soft offlining hugepages, after migrating the page
1106          * we dissolve it, so on the next loop iteration "page" will
1107          * no longer be a compound page.
1108          */
1109         size = page_size(compound_head(page));
1110 
1111         if (behavior == MADV_SOFT_OFFLINE) {
1112             pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n",
1113                  pfn, start);
1114             ret = soft_offline_page(pfn, MF_COUNT_INCREASED);
1115         } else {
1116             pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n",
1117                  pfn, start);
1118             ret = memory_failure(pfn, MF_COUNT_INCREASED | MF_SW_SIMULATED);
1119             if (ret == -EOPNOTSUPP)
1120                 ret = 0;
1121         }
1122 
1123         if (ret)
1124             return ret;
1125     }
1126 
1127     return 0;
1128 }
1129 #endif
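
/*
 * Example: a minimal userspace sketch of error injection with MADV_HWPOISON.
 * This is a testing facility: it requires CAP_SYS_ADMIN and a kernel built
 * with CONFIG_MEMORY_FAILURE, and it really marks the page as poisoned, so
 * touching it afterwards raises SIGBUS.  Error handling is elided.
 *
 *	#include <sys/mman.h>
 *	#include <unistd.h>
 *
 *	long page = sysconf(_SC_PAGESIZE);
 *	char *p = mmap(NULL, page, PROT_READ | PROT_WRITE,
 *		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	p[0] = 1;
 *	madvise(p, page, MADV_HWPOISON);
 */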
1130 
1131 static bool
1132 madvise_behavior_valid(int behavior)
1133 {
1134     switch (behavior) {
1135     case MADV_DOFORK:
1136     case MADV_DONTFORK:
1137     case MADV_NORMAL:
1138     case MADV_SEQUENTIAL:
1139     case MADV_RANDOM:
1140     case MADV_REMOVE:
1141     case MADV_WILLNEED:
1142     case MADV_DONTNEED:
1143     case MADV_DONTNEED_LOCKED:
1144     case MADV_FREE:
1145     case MADV_COLD:
1146     case MADV_PAGEOUT:
1147     case MADV_POPULATE_READ:
1148     case MADV_POPULATE_WRITE:
1149 #ifdef CONFIG_KSM
1150     case MADV_MERGEABLE:
1151     case MADV_UNMERGEABLE:
1152 #endif
1153 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
1154     case MADV_HUGEPAGE:
1155     case MADV_NOHUGEPAGE:
1156 #endif
1157     case MADV_DONTDUMP:
1158     case MADV_DODUMP:
1159     case MADV_WIPEONFORK:
1160     case MADV_KEEPONFORK:
1161 #ifdef CONFIG_MEMORY_FAILURE
1162     case MADV_SOFT_OFFLINE:
1163     case MADV_HWPOISON:
1164 #endif
1165         return true;
1166 
1167     default:
1168         return false;
1169     }
1170 }
1171 
1172 static bool
1173 process_madvise_behavior_valid(int behavior)
1174 {
1175     switch (behavior) {
1176     case MADV_COLD:
1177     case MADV_PAGEOUT:
1178     case MADV_WILLNEED:
1179         return true;
1180     default:
1181         return false;
1182     }
1183 }
1184 
1185 /*
1186  * Walk the vmas in range [start,end), and call the visit function on each one.
1187  * The visit function will get start and end parameters that cover the overlap
1188  * between the current vma and the original range.  Any unmapped regions in the
1189  * original range will result in this function returning -ENOMEM while still
1190  * calling the visit function on all of the existing vmas in the range.
1191  * Must be called with the mmap_lock held for reading or writing.
1192  */
1193 static
1194 int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
1195               unsigned long end, unsigned long arg,
1196               int (*visit)(struct vm_area_struct *vma,
1197                    struct vm_area_struct **prev, unsigned long start,
1198                    unsigned long end, unsigned long arg))
1199 {
1200     struct vm_area_struct *vma;
1201     struct vm_area_struct *prev;
1202     unsigned long tmp;
1203     int unmapped_error = 0;
1204 
1205     /*
1206      * If the interval [start,end) covers some unmapped address
1207      * ranges, just ignore them, but return -ENOMEM at the end.
1208      *   (this differs from the handling in mlock etc.)
1209      */
1210     vma = find_vma_prev(mm, start, &prev);
1211     if (vma && start > vma->vm_start)
1212         prev = vma;
1213 
1214     for (;;) {
1215         int error;
1216 
1217         /* Still start < end. */
1218         if (!vma)
1219             return -ENOMEM;
1220 
1221         /* Here start < (end|vma->vm_end). */
1222         if (start < vma->vm_start) {
1223             unmapped_error = -ENOMEM;
1224             start = vma->vm_start;
1225             if (start >= end)
1226                 break;
1227         }
1228 
1229         /* Here vma->vm_start <= start < (end|vma->vm_end) */
1230         tmp = vma->vm_end;
1231         if (end < tmp)
1232             tmp = end;
1233 
1234         /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
1235         error = visit(vma, &prev, start, tmp, arg);
1236         if (error)
1237             return error;
1238         start = tmp;
1239         if (prev && start < prev->vm_end)
1240             start = prev->vm_end;
1241         if (start >= end)
1242             break;
1243         if (prev)
1244             vma = prev->vm_next;
1245         else    /* madvise_remove dropped mmap_lock */
1246             vma = find_vma(mm, start);
1247     }
1248 
1249     return unmapped_error;
1250 }
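
/*
 * Example: how the -ENOMEM-for-holes rule above looks from userspace.  If
 * the advised range spans an unmapped gap, madvise() still applies the hint
 * to every mapped VMA in the range but fails with ENOMEM.  Fragment only:
 * p0 is assumed to be the start of two single-page mappings separated by
 * one unmapped page, and page_size is the system page size.
 *
 *	#include <assert.h>
 *	#include <errno.h>
 *	#include <sys/mman.h>
 *
 *	int ret = madvise(p0, 3 * page_size, MADV_DONTNEED);
 *	assert(ret == -1 && errno == ENOMEM);
 */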
1251 
1252 #ifdef CONFIG_ANON_VMA_NAME
1253 static int madvise_vma_anon_name(struct vm_area_struct *vma,
1254                  struct vm_area_struct **prev,
1255                  unsigned long start, unsigned long end,
1256                  unsigned long anon_name)
1257 {
1258     int error;
1259 
1260     /* Only anonymous mappings can be named */
1261     if (vma->vm_file)
1262         return -EBADF;
1263 
1264     error = madvise_update_vma(vma, prev, start, end, vma->vm_flags,
1265                    (struct anon_vma_name *)anon_name);
1266 
1267     /*
1268      * madvise() returns EAGAIN if kernel resources, such as
1269      * slab, are temporarily unavailable.
1270      */
1271     if (error == -ENOMEM)
1272         error = -EAGAIN;
1273     return error;
1274 }
1275 
1276 int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
1277               unsigned long len_in, struct anon_vma_name *anon_name)
1278 {
1279     unsigned long end;
1280     unsigned long len;
1281 
1282     if (start & ~PAGE_MASK)
1283         return -EINVAL;
1284     len = (len_in + ~PAGE_MASK) & PAGE_MASK;
1285 
1286     /* Check to see whether len was rounded up from small -ve to zero */
1287     if (len_in && !len)
1288         return -EINVAL;
1289 
1290     end = start + len;
1291     if (end < start)
1292         return -EINVAL;
1293 
1294     if (end == start)
1295         return 0;
1296 
1297     return madvise_walk_vmas(mm, start, end, (unsigned long)anon_name,
1298                  madvise_vma_anon_name);
1299 }
1300 #endif /* CONFIG_ANON_VMA_NAME */
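
/*
 * Example: naming an anonymous mapping from userspace.  madvise_set_anon_name()
 * is reached via prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, ...) rather than
 * madvise(2); the name then shows up as "[anon:heap buffer]" in
 * /proc/<pid>/maps.  Requires CONFIG_ANON_VMA_NAME (Linux 5.17 or newer);
 * error handling is elided.
 *
 *	#include <linux/prctl.h>
 *	#include <sys/mman.h>
 *	#include <sys/prctl.h>
 *
 *	size_t len = 1 << 20;
 *	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME,
 *	      (unsigned long)p, len, "heap buffer");
 */
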
1301 /*
1302  * The madvise(2) system call.
1303  *
1304  * Applications can use madvise() to advise the kernel how it should
1305  * handle paging I/O in this VM area.  The idea is to help the kernel
1306  * use appropriate read-ahead and caching techniques.  The information
1307  * provided is advisory only, and can be safely disregarded by the
1308  * kernel without affecting the correct operation of the application.
1309  *
1310  * behavior values:
1311  *  MADV_NORMAL - the default behavior is to read clusters.  This
1312  *      results in some read-ahead and read-behind.
1313  *  MADV_RANDOM - the system should read the minimum amount of data
1314  *      on any access, since it is unlikely that the appli-
1315  *      cation will need more than what it asks for.
1316  *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
1317  *      once, so they can be aggressively read ahead, and
1318  *      can be freed soon after they are accessed.
1319  *  MADV_WILLNEED - the application is notifying the system to read
1320  *      some pages ahead.
1321  *  MADV_DONTNEED - the application is finished with the given range,
1322  *      so the kernel can free resources associated with it.
1323  *  MADV_FREE - the application marks pages in the given range as lazy free,
1324  *      where actual purges are postponed until memory pressure happens.
1325  *  MADV_REMOVE - the application wants to free up the given range of
1326  *      pages and associated backing store.
1327  *  MADV_DONTFORK - omit this area from child's address space when forking:
1328  *      typically, to avoid COWing pages pinned by get_user_pages().
1329  *  MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
1330  *  MADV_WIPEONFORK - present the child process with zero-filled memory in this
1331  *              range after a fork.
1332  *  MADV_KEEPONFORK - undo the effect of MADV_WIPEONFORK
1333  *  MADV_HWPOISON - trigger memory error handler as if the given memory range
1334  *      were corrupted by unrecoverable hardware memory failure.
1335  *  MADV_SOFT_OFFLINE - try to soft-offline the given range of memory.
1336  *  MADV_MERGEABLE - the application recommends that KSM try to merge pages in
1337  *      this area with pages of identical content from other such areas.
1338  *  MADV_UNMERGEABLE- cancel MADV_MERGEABLE: no longer merge pages with others.
1339  *  MADV_HUGEPAGE - the application wants to back the given range by transparent
1340  *      huge pages in the future. Existing pages might be coalesced and
1341  *      new pages might be allocated as THP.
1342  *  MADV_NOHUGEPAGE - mark the given range as not worth being backed by
1343  *      transparent huge pages so the existing pages will not be
1344  *      coalesced into THP and new pages will not be allocated as THP.
1345  *  MADV_DONTDUMP - the application wants to prevent pages in the given range
1346  *      from being included in its core dump.
1347  *  MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
1348  *  MADV_COLD - the application is not expected to use this memory soon,
1349  *      deactivate pages in this range so that they can be reclaimed
1350  *      easily if memory pressure happens.
1351  *  MADV_PAGEOUT - the application is not expected to use this memory soon,
1352  *      page out the pages in this range immediately.
1353  *  MADV_POPULATE_READ - populate (prefault) page tables readable by
1354  *      triggering read faults if required
1355  *  MADV_POPULATE_WRITE - populate (prefault) page tables writable by
1356  *      triggering write faults if required
1357  *
1358  * return values:
1359  *  zero    - success
1360  *  -EINVAL - start + len < 0, start is not page-aligned,
1361  *      "behavior" is not a valid value, or application
1362  *      is attempting to release locked or shared pages,
1363  *      or the specified address range includes file, Huge TLB,
1364  *      MAP_SHARED or VMPFNMAP range.
1365  *  -ENOMEM - addresses in the specified range are not currently
1366  *      mapped, or are outside the AS of the process.
1367  *  -EIO    - an I/O error occurred while paging in data.
1368  *  -EBADF  - map exists, but area maps something that isn't a file.
1369  *  -EAGAIN - a kernel resource was temporarily unavailable.
1370  */
1371 int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior)
1372 {
1373     unsigned long end;
1374     int error;
1375     int write;
1376     size_t len;
1377     struct blk_plug plug;
1378 
1379     start = untagged_addr(start);
1380 
1381     if (!madvise_behavior_valid(behavior))
1382         return -EINVAL;
1383 
1384     if (!PAGE_ALIGNED(start))
1385         return -EINVAL;
1386     len = PAGE_ALIGN(len_in);
1387 
1388     /* Check to see whether len was rounded up from small -ve to zero */
1389     if (len_in && !len)
1390         return -EINVAL;
1391 
1392     end = start + len;
1393     if (end < start)
1394         return -EINVAL;
1395 
1396     if (end == start)
1397         return 0;
1398 
1399 #ifdef CONFIG_MEMORY_FAILURE
1400     if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
1401         return madvise_inject_error(behavior, start, start + len_in);
1402 #endif
1403 
1404     write = madvise_need_mmap_write(behavior);
1405     if (write) {
1406         if (mmap_write_lock_killable(mm))
1407             return -EINTR;
1408     } else {
1409         mmap_read_lock(mm);
1410     }
1411 
1412     blk_start_plug(&plug);
1413     error = madvise_walk_vmas(mm, start, end, behavior,
1414             madvise_vma_behavior);
1415     blk_finish_plug(&plug);
1416     if (write)
1417         mmap_write_unlock(mm);
1418     else
1419         mmap_read_unlock(mm);
1420 
1421     return error;
1422 }
1423 
1424 SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
1425 {
1426     return do_madvise(current->mm, start, len_in, behavior);
1427 }
1428 
1429 SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
1430         size_t, vlen, int, behavior, unsigned int, flags)
1431 {
1432     ssize_t ret;
1433     struct iovec iovstack[UIO_FASTIOV], iovec;
1434     struct iovec *iov = iovstack;
1435     struct iov_iter iter;
1436     struct task_struct *task;
1437     struct mm_struct *mm;
1438     size_t total_len;
1439     unsigned int f_flags;
1440 
1441     if (flags != 0) {
1442         ret = -EINVAL;
1443         goto out;
1444     }
1445 
1446     ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
1447     if (ret < 0)
1448         goto out;
1449 
1450     task = pidfd_get_task(pidfd, &f_flags);
1451     if (IS_ERR(task)) {
1452         ret = PTR_ERR(task);
1453         goto free_iov;
1454     }
1455 
1456     if (!process_madvise_behavior_valid(behavior)) {
1457         ret = -EINVAL;
1458         goto release_task;
1459     }
1460 
1461     /* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */
1462     mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
1463     if (IS_ERR_OR_NULL(mm)) {
1464         ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
1465         goto release_task;
1466     }
1467 
1468     /*
1469      * Require CAP_SYS_NICE for influencing process performance. Note that
1470      * only non-destructive hints are currently supported.
1471      */
1472     if (!capable(CAP_SYS_NICE)) {
1473         ret = -EPERM;
1474         goto release_mm;
1475     }
1476 
1477     total_len = iov_iter_count(&iter);
1478 
1479     while (iov_iter_count(&iter)) {
1480         iovec = iov_iter_iovec(&iter);
1481         ret = do_madvise(mm, (unsigned long)iovec.iov_base,
1482                     iovec.iov_len, behavior);
1483         if (ret < 0)
1484             break;
1485         iov_iter_advance(&iter, iovec.iov_len);
1486     }
1487 
1488     ret = (total_len - iov_iter_count(&iter)) ? : ret;
1489 
1490 release_mm:
1491     mmput(mm);
1492 release_task:
1493     put_task_struct(task);
1494 free_iov:
1495     kfree(iov);
1496 out:
1497     return ret;
1498 }
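
/*
 * Example: a minimal userspace sketch of process_madvise(2) giving a
 * MADV_COLD hint to a range in another process.  The caller needs a pidfd,
 * PTRACE_MODE_READ access and CAP_SYS_NICE, as checked above.  Raw syscalls
 * are used since libc wrappers may be missing; target_pid, addr and len are
 * placeholders for a real target, and the headers are assumed to define
 * SYS_pidfd_open and SYS_process_madvise.  Error handling is elided.
 *
 *	#include <sys/mman.h>
 *	#include <sys/syscall.h>
 *	#include <sys/uio.h>
 *	#include <unistd.h>
 *
 *	int pidfd = syscall(SYS_pidfd_open, target_pid, 0);
 *	struct iovec iov = { .iov_base = (void *)addr, .iov_len = len };
 *	ssize_t n = syscall(SYS_process_madvise, pidfd, &iov, 1,
 *			    MADV_COLD, 0);
 */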