/*
 *  mm/userfaultfd.c
 *
 *  Copyright (C) 2015  Red Hat, Inc.
 *
 *  This work is licensed under the terms of the GNU GPL, version 2. See
 *  the COPYING file in the top-level directory.
 */

#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/userfaultfd_k.h>
#include <linux/mmu_notifier.h>
#include <asm/tlbflush.h>
#include "internal.h"

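/*
 * Allocate a new anonymous page, fill it from userland memory at
 * src_addr and map it at dst_addr in dst_mm. The copy is done with
 * copy_from_user() under kmap_atomic(); if that atomic copy faults,
 * the allocated page is handed back to the caller through *pagep so
 * the copy can be retried with mmap_sem released.
 */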
static int mcopy_atomic_pte(struct mm_struct *dst_mm,
                            pmd_t *dst_pmd,
                            struct vm_area_struct *dst_vma,
                            unsigned long dst_addr,
                            unsigned long src_addr,
                            struct page **pagep)
{
    struct mem_cgroup *memcg;
    pte_t _dst_pte, *dst_pte;
    spinlock_t *ptl;
    void *page_kaddr;
    int ret;
    struct page *page;

    if (!*pagep) {
        ret = -ENOMEM;
        page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, dst_vma, dst_addr);
        if (!page)
            goto out;

        page_kaddr = kmap_atomic(page);
        ret = copy_from_user(page_kaddr,
                             (const void __user *) src_addr,
                             PAGE_SIZE);
        kunmap_atomic(page_kaddr);

        /* fallback to copy_from_user outside mmap_sem */
        if (unlikely(ret)) {
            ret = -EFAULT;
            *pagep = page;
            /* don't free the page */
            goto out;
        }
    } else {
        page = *pagep;
        *pagep = NULL;
    }

    /*
     * The memory barrier inside __SetPageUptodate makes sure that
     * preceding stores to the page contents become visible before
     * the set_pte_at() write.
     */
    __SetPageUptodate(page);

    ret = -ENOMEM;
    if (mem_cgroup_try_charge(page, dst_mm, GFP_KERNEL, &memcg, false))
        goto out_release;

    _dst_pte = mk_pte(page, dst_vma->vm_page_prot);
    if (dst_vma->vm_flags & VM_WRITE)
        _dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte));

    ret = -EEXIST;
    dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
    if (!pte_none(*dst_pte))
        goto out_release_uncharge_unlock;

    inc_mm_counter(dst_mm, MM_ANONPAGES);
    page_add_new_anon_rmap(page, dst_vma, dst_addr, false);
    mem_cgroup_commit_charge(page, memcg, false, false);
    lru_cache_add_active_or_unevictable(page, dst_vma);

    set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);

    /* No need to invalidate - it was non-present before */
    update_mmu_cache(dst_vma, dst_addr, dst_pte);

    pte_unmap_unlock(dst_pte, ptl);
    ret = 0;
out:
    return ret;
out_release_uncharge_unlock:
    pte_unmap_unlock(dst_pte, ptl);
    mem_cgroup_cancel_charge(page, memcg, false);
out_release:
    put_page(page);
    goto out;
}

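/*
 * Map the zero page at dst_addr by installing a pte_special()
 * zero-pfn pte, provided no pte is already present there.
 */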
static int mfill_zeropage_pte(struct mm_struct *dst_mm,
                              pmd_t *dst_pmd,
                              struct vm_area_struct *dst_vma,
                              unsigned long dst_addr)
{
    pte_t _dst_pte, *dst_pte;
    spinlock_t *ptl;
    int ret;

    _dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr),
                                     dst_vma->vm_page_prot));
    ret = -EEXIST;
    dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
    if (!pte_none(*dst_pte))
        goto out_unlock;
    set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
    /* No need to invalidate - it was non-present before */
    update_mmu_cache(dst_vma, dst_addr, dst_pte);
    ret = 0;
out_unlock:
    pte_unmap_unlock(dst_pte, ptl);
    return ret;
}

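/*
 * Walk (allocating as needed) the page tables down to the pmd that
 * covers 'address'. Returns NULL if a pud or pmd could not be
 * allocated.
 */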
static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
{
    pgd_t *pgd;
    pud_t *pud;
    pmd_t *pmd = NULL;

    pgd = pgd_offset(mm, address);
    pud = pud_alloc(mm, pgd, address);
    if (pud)
        /*
         * Note that pmd_alloc() is not called here because the
         * pmd is known to be missing: the *pmd may already be
         * established, and it may even be a trans_huge_pmd.
         */
        pmd = pmd_alloc(mm, pud, address);
    return pmd;
}

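/*
 * Common engine for mcopy_atomic() and mfill_zeropage(): fill
 * [dst_start, dst_start + len) of dst_mm one page at a time, either
 * by copying from src_start or by mapping the zero page, depending
 * on 'zeropage'. Returns the number of bytes filled, or a negative
 * errno if nothing could be filled.
 */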
static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
                                              unsigned long dst_start,
                                              unsigned long src_start,
                                              unsigned long len,
                                              bool zeropage)
{
    struct vm_area_struct *dst_vma;
    ssize_t err;
    pmd_t *dst_pmd;
    unsigned long src_addr, dst_addr;
    long copied;
    struct page *page;

    /*
     * Sanitize the command parameters:
     */
    BUG_ON(dst_start & ~PAGE_MASK);
    BUG_ON(len & ~PAGE_MASK);

    /* Does the address range wrap, or is the span zero-sized? */
    BUG_ON(src_start + len <= src_start);
    BUG_ON(dst_start + len <= dst_start);

    src_addr = src_start;
    dst_addr = dst_start;
    copied = 0;
    page = NULL;
retry:
    down_read(&dst_mm->mmap_sem);

    /*
     * Make sure the vma is not shared, and that the dst range is
     * both valid and fully within a single existing vma.
     */
    err = -EINVAL;
    dst_vma = find_vma(dst_mm, dst_start);
    if (!dst_vma || (dst_vma->vm_flags & VM_SHARED))
        goto out_unlock;
    if (dst_start < dst_vma->vm_start ||
        dst_start + len > dst_vma->vm_end)
        goto out_unlock;

    /*
     * Be strict and only allow __mcopy_atomic on userfaultfd
     * registered ranges to prevent userland errors going
     * unnoticed. As far as the VM consistency is concerned, it
     * would be perfectly safe to remove this check, but there's
     * no useful usage for __mcopy_atomic outside of userfaultfd
     * registered ranges. This is after all why these are ioctls
     * belonging to the userfaultfd and not syscalls.
     */
    if (!dst_vma->vm_userfaultfd_ctx.ctx)
        goto out_unlock;

    /*
     * FIXME: only allow copying on anonymous vmas, tmpfs should
     * be added.
     */
    if (dst_vma->vm_ops)
        goto out_unlock;

    /*
     * Ensure the dst_vma has an anon_vma, or this page would get
     * a NULL anon_vma when moved into the dst_vma.
     */
    err = -ENOMEM;
    if (unlikely(anon_vma_prepare(dst_vma)))
        goto out_unlock;

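    /* Fill the destination range one page at a time. */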
    while (src_addr < src_start + len) {
        pmd_t dst_pmdval;

        BUG_ON(dst_addr >= dst_start + len);

        dst_pmd = mm_alloc_pmd(dst_mm, dst_addr);
        if (unlikely(!dst_pmd)) {
            err = -ENOMEM;
            break;
        }

        dst_pmdval = pmd_read_atomic(dst_pmd);
        /*
         * If the dst_pmd is mapped as THP, don't override it
         * and just be strict.
         */
        if (unlikely(pmd_trans_huge(dst_pmdval))) {
            err = -EEXIST;
            break;
        }
        if (unlikely(pmd_none(dst_pmdval)) &&
            unlikely(__pte_alloc(dst_mm, dst_pmd, dst_addr))) {
            err = -ENOMEM;
            break;
        }
        /* If a huge pmd materialized from under us, fail */
        if (unlikely(pmd_trans_huge(*dst_pmd))) {
            err = -EFAULT;
            break;
        }

        BUG_ON(pmd_none(*dst_pmd));
        BUG_ON(pmd_trans_huge(*dst_pmd));

        if (!zeropage)
            err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma,
                                   dst_addr, src_addr, &page);
        else
            err = mfill_zeropage_pte(dst_mm, dst_pmd, dst_vma,
                                     dst_addr);

        cond_resched();

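        /*
         * -EFAULT from mcopy_atomic_pte() means the atomic
         * copy_from_user() faulted on the source page: drop
         * mmap_sem, redo the copy with a sleeping copy_from_user()
         * into the page we kept hold of, then retake the lock and
         * retry this destination address.
         */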
        if (unlikely(err == -EFAULT)) {
            void *page_kaddr;

            up_read(&dst_mm->mmap_sem);
            BUG_ON(!page);

            page_kaddr = kmap(page);
            err = copy_from_user(page_kaddr,
                                 (const void __user *) src_addr,
                                 PAGE_SIZE);
            kunmap(page);
            if (unlikely(err)) {
                err = -EFAULT;
                goto out;
            }
            goto retry;
        } else
            BUG_ON(page);

        if (!err) {
            dst_addr += PAGE_SIZE;
            src_addr += PAGE_SIZE;
            copied += PAGE_SIZE;

            if (fatal_signal_pending(current))
                err = -EINTR;
        }
        if (err)
            break;
    }

out_unlock:
    up_read(&dst_mm->mmap_sem);
out:
    if (page)
        put_page(page);
    BUG_ON(copied < 0);
    BUG_ON(err > 0);
    BUG_ON(!copied && !err);
    return copied ? copied : err;
}

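/*
 * Entry points: mcopy_atomic() backs the userfaultfd UFFDIO_COPY
 * ioctl and mfill_zeropage() backs UFFDIO_ZEROPAGE, both invoked
 * from fs/userfaultfd.c.
 */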
ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
                     unsigned long src_start, unsigned long len)
{
    return __mcopy_atomic(dst_mm, dst_start, src_start, len, false);
}

ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start,
                       unsigned long len)
{
    return __mcopy_atomic(dst_mm, start, 0, len, true);
}
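
/*
 * Illustrative userspace sketch (not part of mm/userfaultfd.c): the entry
 * points above are exercised through the userfaultfd file descriptor's
 * ioctls. A minimal example that registers an anonymous region and then
 * fills its missing pages with UFFDIO_COPY could look like the code below.
 * It assumes a 4 KiB page size, omits all error handling and the usual
 * monitor thread that read()s fault events, and the names 'region' and
 * 'src' are made up for the example.
 */
#include <fcntl.h>
#include <linux/userfaultfd.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
    /* Open a userfaultfd and handshake on the API version. */
    long uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
    struct uffdio_api api = { .api = UFFD_API };
    ioctl(uffd, UFFDIO_API, &api);

    /* Anonymous region whose missing-page faults we will fill ourselves. */
    size_t len = 4096;
    char *region = mmap(NULL, len, PROT_READ | PROT_WRITE,
                        MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

    struct uffdio_register reg = {
        .range = { .start = (unsigned long) region, .len = len },
        .mode = UFFDIO_REGISTER_MODE_MISSING,
    };
    ioctl(uffd, UFFDIO_REGISTER, &reg);

    /* Source buffer holding the data to install atomically. */
    static char src[4096];
    memset(src, 'x', sizeof(src));

    /*
     * UFFDIO_COPY reaches mcopy_atomic() above: it allocates a page,
     * copies 'src' into it and maps it at 'region' in one step, so a
     * faulting thread never observes a partially filled page.
     */
    struct uffdio_copy copy = {
        .dst = (unsigned long) region,
        .src = (unsigned long) src,
        .len = len,
        .mode = 0,
    };
    ioctl(uffd, UFFDIO_COPY, &copy);

    return 0;
}

/*
 * UFFDIO_ZEROPAGE takes a struct uffdio_zeropage instead and reaches
 * mfill_zeropage() above, mapping the shared zero page rather than
 * copying data.
 */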