/*
 *  mm/mprotect.c
 *
 *  (C) Copyright 1994 Linus Torvalds
 *  (C) Copyright 2002 Christoph Hellwig
 *
 *  Address space accounting code   <alan@lxorguk.ukuu.org.uk>
 *  (C) Copyright 2002 Red Hat Inc, All Rights Reserved
 */

#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/shm.h>
#include <linux/mman.h>
#include <linux/fs.h>
#include <linux/highmem.h>
#include <linux/security.h>
#include <linux/mempolicy.h>
#include <linux/personality.h>
#include <linux/syscalls.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/mmu_notifier.h>
#include <linux/migrate.h>
#include <linux/perf_event.h>
#include <linux/pkeys.h>
#include <linux/ksm.h>
#include <linux/uaccess.h>
#include <asm/pgtable.h>
#include <asm/cacheflush.h>
#include <asm/mmu_context.h>
#include <asm/tlbflush.h>

#include "internal.h"

/*
 * For a prot_numa update we only hold mmap_sem for read so there is a
 * potential race with faulting where a pmd was temporarily none. This
 * function checks for a transhuge pmd under the appropriate lock. It
 * returns a pte if it was successfully locked or NULL if it raced with
 * a transhuge insertion.
 */
static pte_t *lock_pte_protection(struct vm_area_struct *vma, pmd_t *pmd,
            unsigned long addr, int prot_numa, spinlock_t **ptl)
{
    pte_t *pte;
    spinlock_t *pmdl;

    /* !prot_numa is protected by mmap_sem held for write */
    if (!prot_numa)
        return pte_offset_map_lock(vma->vm_mm, pmd, addr, ptl);

    pmdl = pmd_lock(vma->vm_mm, pmd);
    if (unlikely(pmd_trans_huge(*pmd) || pmd_none(*pmd))) {
        spin_unlock(pmdl);
        return NULL;
    }

    pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, ptl);
    spin_unlock(pmdl);
    return pte;
}

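/*
 * Update the protection bits of every pte mapped under @pmd in the range
 * [addr, end).  Returns the number of entries that were actually changed,
 * so callers can skip the TLB flush when nothing was touched.
 */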
static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
        unsigned long addr, unsigned long end, pgprot_t newprot,
        int dirty_accountable, int prot_numa)
{
    struct mm_struct *mm = vma->vm_mm;
    pte_t *pte, oldpte;
    spinlock_t *ptl;
    unsigned long pages = 0;
    int target_node = NUMA_NO_NODE;

    pte = lock_pte_protection(vma, pmd, addr, prot_numa, &ptl);
    if (!pte)
        return 0;

    /* Get target node for single threaded private VMAs */
    if (prot_numa && !(vma->vm_flags & VM_SHARED) &&
        atomic_read(&vma->vm_mm->mm_users) == 1)
        target_node = numa_node_id();

    arch_enter_lazy_mmu_mode();
    do {
        oldpte = *pte;
        if (pte_present(oldpte)) {
            pte_t ptent;
            bool preserve_write = prot_numa && pte_write(oldpte);

            /*
             * Avoid trapping faults against the zero or KSM
             * pages. See similar comment in change_huge_pmd.
             */
            if (prot_numa) {
                struct page *page;

                page = vm_normal_page(vma, addr, oldpte);
                if (!page || PageKsm(page))
                    continue;

                /* Avoid TLB flush if possible */
                if (pte_protnone(oldpte))
                    continue;

                /*
                 * Don't mess with PTEs if page is already on the node
                 * a single-threaded process is running on.
                 */
                if (target_node == page_to_nid(page))
                    continue;
            }

            ptent = ptep_modify_prot_start(mm, addr, pte);
            ptent = pte_modify(ptent, newprot);
            if (preserve_write)
                ptent = pte_mkwrite(ptent);

            /* Avoid taking write faults for known dirty pages */
            if (dirty_accountable && pte_dirty(ptent) &&
                    (pte_soft_dirty(ptent) ||
                     !(vma->vm_flags & VM_SOFTDIRTY))) {
                ptent = pte_mkwrite(ptent);
            }
            ptep_modify_prot_commit(mm, addr, pte, ptent);
            pages++;
        } else if (IS_ENABLED(CONFIG_MIGRATION)) {
            swp_entry_t entry = pte_to_swp_entry(oldpte);

            if (is_write_migration_entry(entry)) {
                pte_t newpte;
                /*
                 * A protection check is difficult so
                 * just be safe and disable write
                 */
                make_migration_entry_read(&entry);
                newpte = swp_entry_to_pte(entry);
                if (pte_swp_soft_dirty(oldpte))
                    newpte = pte_swp_mksoft_dirty(newpte);
                set_pte_at(mm, addr, pte, newpte);

                pages++;
            }
        }
    } while (pte++, addr += PAGE_SIZE, addr != end);
    arch_leave_lazy_mmu_mode();
    pte_unmap_unlock(pte - 1, ptl);

    return pages;
}

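/*
 * Walk the pmds under @pud for [addr, end).  Transparent huge pmds that
 * cover the whole range are updated in place by change_huge_pmd(); huge
 * pmds that only partially overlap the range are split first, and regular
 * pmds fall through to change_pte_range().
 */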
static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
        pud_t *pud, unsigned long addr, unsigned long end,
        pgprot_t newprot, int dirty_accountable, int prot_numa)
{
    pmd_t *pmd;
    struct mm_struct *mm = vma->vm_mm;
    unsigned long next;
    unsigned long pages = 0;
    unsigned long nr_huge_updates = 0;
    unsigned long mni_start = 0;

    pmd = pmd_offset(pud, addr);
    do {
        unsigned long this_pages;

        next = pmd_addr_end(addr, end);
        if (!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd)
                && pmd_none_or_clear_bad(pmd))
            continue;

        /* invoke the mmu notifier if the pmd is populated */
        if (!mni_start) {
            mni_start = addr;
            mmu_notifier_invalidate_range_start(mm, mni_start, end);
        }

        if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
            if (next - addr != HPAGE_PMD_SIZE) {
                __split_huge_pmd(vma, pmd, addr, false, NULL);
                if (pmd_trans_unstable(pmd))
                    continue;
            } else {
                int nr_ptes = change_huge_pmd(vma, pmd, addr,
                        newprot, prot_numa);

                if (nr_ptes) {
                    if (nr_ptes == HPAGE_PMD_NR) {
                        pages += HPAGE_PMD_NR;
                        nr_huge_updates++;
                    }

                    /* huge pmd was handled */
                    continue;
                }
            }
            /* fall through, the trans huge pmd just split */
        }
        this_pages = change_pte_range(vma, pmd, addr, next, newprot,
                 dirty_accountable, prot_numa);
        pages += this_pages;
    } while (pmd++, addr = next, addr != end);

    if (mni_start)
        mmu_notifier_invalidate_range_end(mm, mni_start, end);

    if (nr_huge_updates)
        count_vm_numa_events(NUMA_HUGE_PTE_UPDATES, nr_huge_updates);
    return pages;
}

static inline unsigned long change_pud_range(struct vm_area_struct *vma,
        pgd_t *pgd, unsigned long addr, unsigned long end,
        pgprot_t newprot, int dirty_accountable, int prot_numa)
{
    pud_t *pud;
    unsigned long next;
    unsigned long pages = 0;

    pud = pud_offset(pgd, addr);
    do {
        next = pud_addr_end(addr, end);
        if (pud_none_or_clear_bad(pud))
            continue;
        pages += change_pmd_range(vma, pud, addr, next, newprot,
                 dirty_accountable, prot_numa);
    } while (pud++, addr = next, addr != end);

    return pages;
}

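/*
 * Top level of the page-table walk: flush the cache range up front, mark a
 * TLB flush as pending for the duration of the walk, and only issue
 * flush_tlb_range() at the end if some entry was actually modified.
 */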
static unsigned long change_protection_range(struct vm_area_struct *vma,
        unsigned long addr, unsigned long end, pgprot_t newprot,
        int dirty_accountable, int prot_numa)
{
    struct mm_struct *mm = vma->vm_mm;
    pgd_t *pgd;
    unsigned long next;
    unsigned long start = addr;
    unsigned long pages = 0;

    BUG_ON(addr >= end);
    pgd = pgd_offset(mm, addr);
    flush_cache_range(vma, addr, end);
    set_tlb_flush_pending(mm);
    do {
        next = pgd_addr_end(addr, end);
        if (pgd_none_or_clear_bad(pgd))
            continue;
        pages += change_pud_range(vma, pgd, addr, next, newprot,
                 dirty_accountable, prot_numa);
    } while (pgd++, addr = next, addr != end);

    /* Only flush the TLB if we actually modified any entries: */
    if (pages)
        flush_tlb_range(vma, start, end);
    clear_tlb_flush_pending(mm);

    return pages;
}

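/*
 * Entry point shared by mprotect_fixup() and the NUMA-balancing path:
 * hugetlb VMAs are handed to hugetlb_change_protection(), everything else
 * goes through the regular page-table walk above.
 */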
unsigned long change_protection(struct vm_area_struct *vma, unsigned long start,
               unsigned long end, pgprot_t newprot,
               int dirty_accountable, int prot_numa)
{
    unsigned long pages;

    if (is_vm_hugetlb_page(vma))
        pages = hugetlb_change_protection(vma, start, end, newprot);
    else
        pages = change_protection_range(vma, start, end, newprot, dirty_accountable, prot_numa);

    return pages;
}

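/*
 * Apply @newflags to the [start, end) slice of @vma: charge the memory
 * accounting when a private mapping becomes writable, merge with adjacent
 * VMAs or split @vma as needed, then rewrite the page-table entries.
 * The caller must hold mmap_sem for write.
 */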
int
mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
    unsigned long start, unsigned long end, unsigned long newflags)
{
    struct mm_struct *mm = vma->vm_mm;
    unsigned long oldflags = vma->vm_flags;
    long nrpages = (end - start) >> PAGE_SHIFT;
    unsigned long charged = 0;
    pgoff_t pgoff;
    int error;
    int dirty_accountable = 0;

    if (newflags == oldflags) {
        *pprev = vma;
        return 0;
    }

    /*
     * If we make a private mapping writable we increase our commit;
     * but (without finer accounting) cannot reduce our commit if we
     * make it unwritable again. hugetlb mappings were accounted for
     * even if read-only, so there is no need to account for them here.
     */
    if (newflags & VM_WRITE) {
        /* Check space limits when area turns into data. */
        if (!may_expand_vm(mm, newflags, nrpages) &&
                may_expand_vm(mm, oldflags, nrpages))
            return -ENOMEM;
        if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_HUGETLB|
                        VM_SHARED|VM_NORESERVE))) {
            charged = nrpages;
            if (security_vm_enough_memory_mm(mm, charged))
                return -ENOMEM;
            newflags |= VM_ACCOUNT;
        }
    }

    /*
     * First try to merge with previous and/or next vma.
     */
    pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
    *pprev = vma_merge(mm, *pprev, start, end, newflags,
               vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
               vma->vm_userfaultfd_ctx);
    if (*pprev) {
        vma = *pprev;
        VM_WARN_ON((vma->vm_flags ^ newflags) & ~VM_SOFTDIRTY);
        goto success;
    }

    *pprev = vma;

    if (start != vma->vm_start) {
        error = split_vma(mm, vma, start, 1);
        if (error)
            goto fail;
    }

    if (end != vma->vm_end) {
        error = split_vma(mm, vma, end, 0);
        if (error)
            goto fail;
    }

success:
    /*
     * vm_flags and vm_page_prot are protected by the mmap_sem
     * held in write mode.
     */
    vma->vm_flags = newflags;
    dirty_accountable = vma_wants_writenotify(vma, vma->vm_page_prot);
    vma_set_page_prot(vma);

    change_protection(vma, start, end, vma->vm_page_prot,
              dirty_accountable, 0);

    /*
     * Private VM_LOCKED VMA becoming writable: trigger COW to avoid major
     * fault on access.
     */
    if ((oldflags & (VM_WRITE | VM_SHARED | VM_LOCKED)) == VM_LOCKED &&
            (newflags & VM_WRITE)) {
        populate_vma_page_range(vma, start, end, NULL);
    }

    vm_stat_account(mm, oldflags, -nrpages);
    vm_stat_account(mm, newflags, nrpages);
    perf_event_mmap(vma);
    return 0;

fail:
    vm_unacct_memory(charged);
    return error;
}

/*
 * pkey==-1 when doing a legacy mprotect()
 */
static int do_mprotect_pkey(unsigned long start, size_t len,
        unsigned long prot, int pkey)
{
    unsigned long nstart, end, tmp, reqprot;
    struct vm_area_struct *vma, *prev;
    int error = -EINVAL;
    const int grows = prot & (PROT_GROWSDOWN|PROT_GROWSUP);
    const bool rier = (current->personality & READ_IMPLIES_EXEC) &&
                (prot & PROT_READ);

    prot &= ~(PROT_GROWSDOWN|PROT_GROWSUP);
    if (grows == (PROT_GROWSDOWN|PROT_GROWSUP)) /* can't be both */
        return -EINVAL;

    if (start & ~PAGE_MASK)
        return -EINVAL;
    if (!len)
        return 0;
    len = PAGE_ALIGN(len);
    end = start + len;
    if (end <= start)
        return -ENOMEM;
    if (!arch_validate_prot(prot))
        return -EINVAL;

    reqprot = prot;

    if (down_write_killable(&current->mm->mmap_sem))
        return -EINTR;

    /*
     * If userspace did not allocate the pkey, do not let
     * them use it here.
     */
    error = -EINVAL;
    if ((pkey != -1) && !mm_pkey_is_allocated(current->mm, pkey))
        goto out;

    vma = find_vma(current->mm, start);
    error = -ENOMEM;
    if (!vma)
        goto out;
    prev = vma->vm_prev;
    if (unlikely(grows & PROT_GROWSDOWN)) {
        if (vma->vm_start >= end)
            goto out;
        start = vma->vm_start;
        error = -EINVAL;
        if (!(vma->vm_flags & VM_GROWSDOWN))
            goto out;
    } else {
        if (vma->vm_start > start)
            goto out;
        if (unlikely(grows & PROT_GROWSUP)) {
            end = vma->vm_end;
            error = -EINVAL;
            if (!(vma->vm_flags & VM_GROWSUP))
                goto out;
        }
    }
    if (start > vma->vm_start)
        prev = vma;

    for (nstart = start ; ; ) {
        unsigned long mask_off_old_flags;
        unsigned long newflags;
        int new_vma_pkey;

        /* Here we know that vma->vm_start <= nstart < vma->vm_end. */

        /* Does the application expect PROT_READ to imply PROT_EXEC? */
        if (rier && (vma->vm_flags & VM_MAYEXEC))
            prot |= PROT_EXEC;

        /*
         * Each mprotect() call explicitly passes r/w/x permissions.
         * If a permission is not passed to mprotect(), it must be
         * cleared from the VMA.
         */
        mask_off_old_flags = VM_READ | VM_WRITE | VM_EXEC |
                    ARCH_VM_PKEY_FLAGS;

        new_vma_pkey = arch_override_mprotect_pkey(vma, prot, pkey);
        newflags = calc_vm_prot_bits(prot, new_vma_pkey);
        newflags |= (vma->vm_flags & ~mask_off_old_flags);

        /*
         * The VM_MAY% flags sit four bits above the matching VM_% flags,
         * so newflags >> 4 lines VM_MAY% up with VM_%.  Reject any
         * requested permission that the VMA may never be granted.
         */
        if ((newflags & ~(newflags >> 4)) & (VM_READ | VM_WRITE | VM_EXEC)) {
            error = -EACCES;
            goto out;
        }

        error = security_file_mprotect(vma, reqprot, prot);
        if (error)
            goto out;

        tmp = vma->vm_end;
        if (tmp > end)
            tmp = end;
        error = mprotect_fixup(vma, &prev, nstart, tmp, newflags);
        if (error)
            goto out;
        nstart = tmp;

        if (nstart < prev->vm_end)
            nstart = prev->vm_end;
        if (nstart >= end)
            goto out;

        vma = prev->vm_next;
        if (!vma || vma->vm_start != nstart) {
            error = -ENOMEM;
            goto out;
        }
        prot = reqprot;
    }
out:
    up_write(&current->mm->mmap_sem);
    return error;
}

SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
        unsigned long, prot)
{
    return do_mprotect_pkey(start, len, prot, -1);
}
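
/*
 * Illustrative userspace use of the mprotect() syscall above (a hedged
 * sketch for the reader, not part of this file): map an anonymous page
 * read/write, then drop the write permission so further stores fault.
 *
 *    #include <stdio.h>
 *    #include <unistd.h>
 *    #include <sys/mman.h>
 *
 *    int main(void)
 *    {
 *        long pagesz = sysconf(_SC_PAGESIZE);
 *        char *p = mmap(NULL, pagesz, PROT_READ | PROT_WRITE,
 *                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *
 *        if (p == MAP_FAILED)
 *            return 1;
 *        p[0] = 1;                   // writable here
 *        if (mprotect(p, pagesz, PROT_READ) == -1) {
 *            perror("mprotect");
 *            return 1;
 *        }
 *        // p[0] = 2; would now raise SIGSEGV
 *        return 0;
 *    }
 */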

#ifdef CONFIG_ARCH_HAS_PKEYS

SYSCALL_DEFINE4(pkey_mprotect, unsigned long, start, size_t, len,
        unsigned long, prot, int, pkey)
{
    return do_mprotect_pkey(start, len, prot, pkey);
}

SYSCALL_DEFINE2(pkey_alloc, unsigned long, flags, unsigned long, init_val)
{
    int pkey;
    int ret;

    /* No flags supported yet. */
    if (flags)
        return -EINVAL;
    /* check for unsupported init values */
    if (init_val & ~PKEY_ACCESS_MASK)
        return -EINVAL;

    down_write(&current->mm->mmap_sem);
    pkey = mm_pkey_alloc(current->mm);

    ret = -ENOSPC;
    if (pkey == -1)
        goto out;

    ret = arch_set_user_pkey_access(current, pkey, init_val);
    if (ret) {
        mm_pkey_free(current->mm, pkey);
        goto out;
    }
    ret = pkey;
out:
    up_write(&current->mm->mmap_sem);
    return ret;
}
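
/*
 * Illustrative userspace use of the pkey syscalls above (a hedged sketch,
 * assuming the glibc >= 2.27 wrappers pkey_alloc()/pkey_mprotect()/
 * pkey_free() from <sys/mman.h>): allocate a key that denies writes and
 * attach it to an existing mapping.
 *
 *    #define _GNU_SOURCE
 *    #include <stddef.h>
 *    #include <sys/mman.h>
 *
 *    int deny_writes_with_pkey(void *addr, size_t len)
 *    {
 *        int pkey = pkey_alloc(0, PKEY_DISABLE_WRITE);
 *
 *        if (pkey == -1)
 *            return -1;
 *        if (pkey_mprotect(addr, len, PROT_READ | PROT_WRITE, pkey)) {
 *            pkey_free(pkey);
 *            return -1;
 *        }
 *        return pkey;    // caller calls pkey_free() when done
 *    }
 */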

SYSCALL_DEFINE1(pkey_free, int, pkey)
{
    int ret;

    down_write(&current->mm->mmap_sem);
    ret = mm_pkey_free(current->mm, pkey);
    up_write(&current->mm->mmap_sem);

    /*
     * We could provide warnings or errors if any VMA still
     * has the pkey set here.
     */
    return ret;
}

#endif /* CONFIG_ARCH_HAS_PKEYS */