/*
 *  mm/mremap.c
 *
 *  (C) Copyright 1996 Linus Torvalds
 *
 *  Address space accounting code   <alan@lxorguk.ukuu.org.uk>
 *  (C) Copyright 2002 Red Hat Inc, All Rights Reserved
 */

#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/shm.h>
#include <linux/ksm.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/capability.h>
#include <linux/fs.h>
#include <linux/swapops.h>
#include <linux/highmem.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/mmu_notifier.h>
#include <linux/uaccess.h>
#include <linux/mm-arch-hooks.h>

#include <asm/cacheflush.h>
#include <asm/tlbflush.h>

#include "internal.h"

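/*
 * Walk the source page tables down to the PMD level.  Returns the PMD
 * entry covering @addr, or NULL if any intermediate level is missing
 * or bad (in which case there is nothing to move for this range).
 */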
static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
{
    pgd_t *pgd;
    pud_t *pud;
    pmd_t *pmd;

    pgd = pgd_offset(mm, addr);
    if (pgd_none_or_clear_bad(pgd))
        return NULL;

    pud = pud_offset(pgd, addr);
    if (pud_none_or_clear_bad(pud))
        return NULL;

    pmd = pmd_offset(pud, addr);
    if (pmd_none(*pmd))
        return NULL;

    return pmd;
}

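/*
 * Allocate (if not already present) the PUD and PMD for the destination
 * address.  Returns the PMD to copy into, or NULL on allocation failure.
 * The destination PMD must not already be a transparent huge PMD.
 */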
static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
                unsigned long addr)
{
    pgd_t *pgd;
    pud_t *pud;
    pmd_t *pmd;

    pgd = pgd_offset(mm, addr);
    pud = pud_alloc(mm, pgd, addr);
    if (!pud)
        return NULL;

    pmd = pmd_alloc(mm, pud, addr);
    if (!pmd)
        return NULL;

    VM_BUG_ON(pmd_trans_huge(*pmd));

    return pmd;
}

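/*
 * Paired helpers that take/release the rmap locks (i_mmap_rwsem for a
 * file mapping, the anon_vma lock for anonymous memory) around a page
 * table move, so that rmap walkers observe either the old or the new
 * ptes and never miss both.
 */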
static void take_rmap_locks(struct vm_area_struct *vma)
{
    if (vma->vm_file)
        i_mmap_lock_write(vma->vm_file->f_mapping);
    if (vma->anon_vma)
        anon_vma_lock_write(vma->anon_vma);
}

static void drop_rmap_locks(struct vm_area_struct *vma)
{
    if (vma->anon_vma)
        anon_vma_unlock_write(vma->anon_vma);
    if (vma->vm_file)
        i_mmap_unlock_write(vma->vm_file->f_mapping);
}

static pte_t move_soft_dirty_pte(pte_t pte)
{
    /*
     * Set soft dirty bit so we can notice
     * in userspace the ptes were moved.
     */
#ifdef CONFIG_MEM_SOFT_DIRTY
    if (pte_present(pte))
        pte = pte_mksoft_dirty(pte);
    else if (is_swap_pte(pte))
        pte = pte_swp_mksoft_dirty(pte);
#endif
    return pte;
}

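/*
 * Move the ptes within a single PMD from the old range to the new one,
 * holding both pte locks.  A dirty present pte forces an immediate TLB
 * flush before the old lock is dropped; otherwise the caller is asked,
 * via *need_flush, to flush the old range later.
 */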
static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
        unsigned long old_addr, unsigned long old_end,
        struct vm_area_struct *new_vma, pmd_t *new_pmd,
        unsigned long new_addr, bool need_rmap_locks, bool *need_flush)
{
    struct mm_struct *mm = vma->vm_mm;
    pte_t *old_pte, *new_pte, pte;
    spinlock_t *old_ptl, *new_ptl;
    bool force_flush = false;
    unsigned long len = old_end - old_addr;

    /*
     * When need_rmap_locks is true, we take the i_mmap_rwsem and anon_vma
     * locks to ensure that rmap will always observe either the old or the
     * new ptes. This is the easiest way to avoid races with
     * truncate_pagecache(), page migration, etc...
     *
     * When need_rmap_locks is false, we use other ways to avoid
     * such races:
     *
     * - During exec() shift_arg_pages(), we use a specially tagged vma
     *   which rmap call sites look for using is_vma_temporary_stack().
     *
     * - During mremap(), new_vma is often known to be placed after vma
     *   in rmap traversal order. This ensures rmap will always observe
     *   either the old pte, or the new pte, or both (the page table locks
     *   serialize access to individual ptes, but only rmap traversal
     *   order guarantees that we won't miss both the old and new ptes).
     */
    if (need_rmap_locks)
        take_rmap_locks(vma);

    /*
     * We don't have to worry about the ordering of src and dst
     * pte locks because exclusive mmap_sem prevents deadlock.
     */
    old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl);
    new_pte = pte_offset_map(new_pmd, new_addr);
    new_ptl = pte_lockptr(mm, new_pmd);
    if (new_ptl != old_ptl)
        spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
    arch_enter_lazy_mmu_mode();

    for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
                   new_pte++, new_addr += PAGE_SIZE) {
        if (pte_none(*old_pte))
            continue;

        pte = ptep_get_and_clear(mm, old_addr, old_pte);
        /*
         * If we are remapping a dirty PTE, make sure
         * to flush TLB before we drop the PTL for the
         * old PTE or we may race with page_mkclean().
         *
         * This check has to be done after we removed the
         * old PTE from page tables or another thread may
         * dirty it after the check and before the removal.
         */
        if (pte_present(pte) && pte_dirty(pte))
            force_flush = true;
        pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr);
        pte = move_soft_dirty_pte(pte);
        set_pte_at(mm, new_addr, new_pte, pte);
    }

    arch_leave_lazy_mmu_mode();
    if (new_ptl != old_ptl)
        spin_unlock(new_ptl);
    pte_unmap(new_pte - 1);
    if (force_flush)
        flush_tlb_range(vma, old_end - len, old_end);
    else
        *need_flush = true;
    pte_unmap_unlock(old_pte - 1, old_ptl);
    if (need_rmap_locks)
        drop_rmap_locks(vma);
}

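/* Cap each move_ptes() batch so the cond_resched() below runs often enough */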
#define LATENCY_LIMIT   (64 * PAGE_SIZE)

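/*
 * Walk the old range PMD by PMD: move transparent huge PMDs wholesale
 * when a full HPAGE_PMD_SIZE extent lines up, otherwise split them and
 * fall back to move_ptes().  Returns how many bytes were actually
 * moved; a short return tells move_vma() to roll the move back.
 */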
unsigned long move_page_tables(struct vm_area_struct *vma,
        unsigned long old_addr, struct vm_area_struct *new_vma,
        unsigned long new_addr, unsigned long len,
        bool need_rmap_locks)
{
    unsigned long extent, next, old_end;
    pmd_t *old_pmd, *new_pmd;
    bool need_flush = false;
    unsigned long mmun_start;   /* For mmu_notifiers */
    unsigned long mmun_end;     /* For mmu_notifiers */

    old_end = old_addr + len;
    flush_cache_range(vma, old_addr, old_end);

    mmun_start = old_addr;
    mmun_end   = old_end;
    mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);

    for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
        cond_resched();
        next = (old_addr + PMD_SIZE) & PMD_MASK;
        /* even if next overflowed, extent below will be ok */
        extent = next - old_addr;
        if (extent > old_end - old_addr)
            extent = old_end - old_addr;
        old_pmd = get_old_pmd(vma->vm_mm, old_addr);
        if (!old_pmd)
            continue;
        new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr);
        if (!new_pmd)
            break;
        if (pmd_trans_huge(*old_pmd)) {
            if (extent == HPAGE_PMD_SIZE) {
                bool moved;
                /* See comment in move_ptes() */
                if (need_rmap_locks)
                    take_rmap_locks(vma);
                moved = move_huge_pmd(vma, old_addr, new_addr,
                            old_end, old_pmd, new_pmd,
                            &need_flush);
                if (need_rmap_locks)
                    drop_rmap_locks(vma);
                if (moved)
                    continue;
            }
            split_huge_pmd(vma, old_pmd, old_addr);
            if (pmd_trans_unstable(old_pmd))
                continue;
        }
        if (pte_alloc(new_vma->vm_mm, new_pmd, new_addr))
            break;
        next = (new_addr + PMD_SIZE) & PMD_MASK;
        if (extent > next - new_addr)
            extent = next - new_addr;
        if (extent > LATENCY_LIMIT)
            extent = LATENCY_LIMIT;
        move_ptes(vma, old_pmd, old_addr, old_addr + extent, new_vma,
              new_pmd, new_addr, need_rmap_locks, &need_flush);
    }
    if (need_flush)
        flush_tlb_range(vma, old_end-len, old_addr);

    mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);

    return len + old_addr - old_end;    /* how much done */
}

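/*
 * The heart of a moving mremap(): duplicate the vma at new_addr, move
 * its page tables across, then unmap the old range, fixing up memory
 * accounting, locked_vm and the VM_ACCOUNT flag on any leftover pieces.
 * On failure the page tables are moved back and the new area is
 * unmapped instead.
 */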
static unsigned long move_vma(struct vm_area_struct *vma,
        unsigned long old_addr, unsigned long old_len,
        unsigned long new_len, unsigned long new_addr, bool *locked)
{
    struct mm_struct *mm = vma->vm_mm;
    struct vm_area_struct *new_vma;
    unsigned long vm_flags = vma->vm_flags;
    unsigned long new_pgoff;
    unsigned long moved_len;
    unsigned long excess = 0;
    unsigned long hiwater_vm;
    int split = 0;
    int err;
    bool need_rmap_locks;

    /*
     * We'd prefer to avoid failure later on in do_munmap:
     * which may split one vma into three before unmapping.
     */
    if (mm->map_count >= sysctl_max_map_count - 3)
        return -ENOMEM;

    /*
     * Advise KSM to break any KSM pages in the area to be moved:
     * it would be confusing if they were to turn up at the new
     * location, where they happen to coincide with different KSM
     * pages recently unmapped.  But leave vma->vm_flags as it was,
     * so KSM can come around to merge on vma and new_vma afterwards.
     */
    err = ksm_madvise(vma, old_addr, old_addr + old_len,
                        MADV_UNMERGEABLE, &vm_flags);
    if (err)
        return err;

    new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
    new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff,
               &need_rmap_locks);
    if (!new_vma)
        return -ENOMEM;

    moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len,
                     need_rmap_locks);
    if (moved_len < old_len) {
        err = -ENOMEM;
    } else if (vma->vm_ops && vma->vm_ops->mremap) {
        err = vma->vm_ops->mremap(new_vma);
    }

    if (unlikely(err)) {
        /*
         * On error, move entries back from new area to old,
         * which will succeed since page tables still there,
         * and then proceed to unmap new area instead of old.
         */
        move_page_tables(new_vma, new_addr, vma, old_addr, moved_len,
                 true);
        vma = new_vma;
        old_len = new_len;
        old_addr = new_addr;
        new_addr = err;
    } else {
        arch_remap(mm, old_addr, old_addr + old_len,
               new_addr, new_addr + new_len);
    }

    /* Conceal VM_ACCOUNT so old reservation is not undone */
    if (vm_flags & VM_ACCOUNT) {
        vma->vm_flags &= ~VM_ACCOUNT;
        excess = vma->vm_end - vma->vm_start - old_len;
        if (old_addr > vma->vm_start &&
            old_addr + old_len < vma->vm_end)
            split = 1;
    }

    /*
     * If we failed to move page tables we still do total_vm increment
     * since do_munmap() will decrement it by old_len == new_len.
     *
     * Since total_vm is about to be raised artificially high for a
     * moment, we need to restore high watermark afterwards: if stats
     * are taken meanwhile, total_vm and hiwater_vm appear too high.
     * If this were a serious issue, we'd add a flag to do_munmap().
     */
    hiwater_vm = mm->hiwater_vm;
    vm_stat_account(mm, vma->vm_flags, new_len >> PAGE_SHIFT);

    /* Tell pfnmap has moved from this vma */
    if (unlikely(vma->vm_flags & VM_PFNMAP))
        untrack_pfn_moved(vma);

    if (do_munmap(mm, old_addr, old_len) < 0) {
        /* OOM: unable to split vma, just get accounts right */
        vm_unacct_memory(excess >> PAGE_SHIFT);
        excess = 0;
    }
    mm->hiwater_vm = hiwater_vm;

    /* Restore VM_ACCOUNT if one or two pieces of vma left */
    if (excess) {
        vma->vm_flags |= VM_ACCOUNT;
        if (split)
            vma->vm_next->vm_flags |= VM_ACCOUNT;
    }

    if (vm_flags & VM_LOCKED) {
        mm->locked_vm += new_len >> PAGE_SHIFT;
        *locked = true;
    }

    return new_addr;
}

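/*
 * Look up and sanity-check the vma at @addr for a resize to @new_len:
 * reject hugetlb mappings, ranges spanning vma boundaries, pgoff
 * overflow, VM_DONTEXPAND and VM_PFNMAP vmas, and enforce the
 * RLIMIT_MEMLOCK and address space limits.  For a VM_ACCOUNT mapping
 * the number of newly charged pages is returned via @p.
 */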
static struct vm_area_struct *vma_to_resize(unsigned long addr,
    unsigned long old_len, unsigned long new_len, unsigned long *p)
{
    struct mm_struct *mm = current->mm;
    struct vm_area_struct *vma = find_vma(mm, addr);
    unsigned long pgoff;

    if (!vma || vma->vm_start > addr)
        return ERR_PTR(-EFAULT);

    if (is_vm_hugetlb_page(vma))
        return ERR_PTR(-EINVAL);

    /* We can't remap across vm area boundaries */
    if (old_len > vma->vm_end - addr)
        return ERR_PTR(-EFAULT);

    if (new_len == old_len)
        return vma;

    /* Need to be careful about a growing mapping */
    pgoff = (addr - vma->vm_start) >> PAGE_SHIFT;
    pgoff += vma->vm_pgoff;
    if (pgoff + (new_len >> PAGE_SHIFT) < pgoff)
        return ERR_PTR(-EINVAL);

    if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))
        return ERR_PTR(-EFAULT);

    if (vma->vm_flags & VM_LOCKED) {
        unsigned long locked, lock_limit;
        locked = mm->locked_vm << PAGE_SHIFT;
        lock_limit = rlimit(RLIMIT_MEMLOCK);
        locked += new_len - old_len;
        if (locked > lock_limit && !capable(CAP_IPC_LOCK))
            return ERR_PTR(-EAGAIN);
    }

    if (!may_expand_vm(mm, vma->vm_flags,
                (new_len - old_len) >> PAGE_SHIFT))
        return ERR_PTR(-ENOMEM);

    if (vma->vm_flags & VM_ACCOUNT) {
        unsigned long charged = (new_len - old_len) >> PAGE_SHIFT;
        if (security_vm_enough_memory_mm(mm, charged))
            return ERR_PTR(-ENOMEM);
        *p = charged;
    }

    return vma;
}

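/*
 * Handle MREMAP_FIXED: validate the destination, make sure the old and
 * new ranges do not overlap, unmap whatever currently occupies new_addr
 * (and any tail of the source being shrunk), then move the vma there.
 */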
static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
        unsigned long new_addr, unsigned long new_len, bool *locked)
{
    struct mm_struct *mm = current->mm;
    struct vm_area_struct *vma;
    unsigned long ret = -EINVAL;
    unsigned long charged = 0;
    unsigned long map_flags;

    if (offset_in_page(new_addr))
        goto out;

    if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len)
        goto out;

    /* Ensure the old/new locations do not overlap */
    if (addr + old_len > new_addr && new_addr + new_len > addr)
        goto out;

    ret = do_munmap(mm, new_addr, new_len);
    if (ret)
        goto out;

    if (old_len >= new_len) {
        ret = do_munmap(mm, addr+new_len, old_len - new_len);
        if (ret && old_len != new_len)
            goto out;
        old_len = new_len;
    }

    vma = vma_to_resize(addr, old_len, new_len, &charged);
    if (IS_ERR(vma)) {
        ret = PTR_ERR(vma);
        goto out;
    }

    map_flags = MAP_FIXED;
    if (vma->vm_flags & VM_MAYSHARE)
        map_flags |= MAP_SHARED;

    ret = get_unmapped_area(vma->vm_file, new_addr, new_len, vma->vm_pgoff +
                ((addr - vma->vm_start) >> PAGE_SHIFT),
                map_flags);
    if (offset_in_page(ret))
        goto out1;

    ret = move_vma(vma, addr, old_len, new_len, new_addr, locked);
    if (!(offset_in_page(ret)))
        goto out;
out1:
    vm_unacct_memory(charged);

out:
    return ret;
}

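/*
 * Can this vma grow in place by @delta bytes?  The extension must not
 * overflow, run into the next vma, or be rejected by get_unmapped_area().
 */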
static int vma_expandable(struct vm_area_struct *vma, unsigned long delta)
{
    unsigned long end = vma->vm_end + delta;
    if (end < vma->vm_end) /* overflow */
        return 0;
    if (vma->vm_next && vma->vm_next->vm_start < end) /* intersection */
        return 0;
    if (get_unmapped_area(NULL, vma->vm_start, end - vma->vm_start,
                  0, MAP_FIXED) & ~PAGE_MASK)
        return 0;
    return 1;
}

/*
 * Expand (or shrink) an existing mapping, potentially moving it at the
 * same time (controlled by the MREMAP_MAYMOVE flag and available VM space)
 *
 * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise
 * This option implies MREMAP_MAYMOVE.
 */
SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
        unsigned long, new_len, unsigned long, flags,
        unsigned long, new_addr)
{
    struct mm_struct *mm = current->mm;
    struct vm_area_struct *vma;
    unsigned long ret = -EINVAL;
    unsigned long charged = 0;
    bool locked = false;

    if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
        return ret;

    if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE))
        return ret;

    if (offset_in_page(addr))
        return ret;

    old_len = PAGE_ALIGN(old_len);
    new_len = PAGE_ALIGN(new_len);

    /*
     * We allow a zero old-len as a special case
     * for DOS-emu "duplicate shm area" thing. But
     * a zero new-len is nonsensical.
     */
    if (!new_len)
        return ret;

    if (down_write_killable(&current->mm->mmap_sem))
        return -EINTR;

    if (flags & MREMAP_FIXED) {
        ret = mremap_to(addr, old_len, new_addr, new_len,
                &locked);
        goto out;
    }

    /*
     * Always allow a shrinking remap: that just unmaps
     * the unnecessary pages..
     * do_munmap does all the needed commit accounting
     */
    if (old_len >= new_len) {
        ret = do_munmap(mm, addr+new_len, old_len - new_len);
        if (ret && old_len != new_len)
            goto out;
        ret = addr;
        goto out;
    }

    /*
     * Ok, we need to grow..
     */
    vma = vma_to_resize(addr, old_len, new_len, &charged);
    if (IS_ERR(vma)) {
        ret = PTR_ERR(vma);
        goto out;
    }

    /* old_len exactly to the end of the area..
     */
    if (old_len == vma->vm_end - addr) {
        /* can we just expand the current mapping? */
        if (vma_expandable(vma, new_len - old_len)) {
            int pages = (new_len - old_len) >> PAGE_SHIFT;

            if (vma_adjust(vma, vma->vm_start, addr + new_len,
                       vma->vm_pgoff, NULL)) {
                ret = -ENOMEM;
                goto out;
            }

            vm_stat_account(mm, vma->vm_flags, pages);
            if (vma->vm_flags & VM_LOCKED) {
                mm->locked_vm += pages;
                locked = true;
                new_addr = addr;
            }
            ret = addr;
            goto out;
        }
    }

    /*
     * We weren't able to just expand or shrink the area,
     * we need to create a new one and move it..
     */
    ret = -ENOMEM;
    if (flags & MREMAP_MAYMOVE) {
        unsigned long map_flags = 0;
        if (vma->vm_flags & VM_MAYSHARE)
            map_flags |= MAP_SHARED;

        new_addr = get_unmapped_area(vma->vm_file, 0, new_len,
                    vma->vm_pgoff +
                    ((addr - vma->vm_start) >> PAGE_SHIFT),
                    map_flags);
        if (offset_in_page(new_addr)) {
            ret = new_addr;
            goto out;
        }

        ret = move_vma(vma, addr, old_len, new_len, new_addr, &locked);
    }
out:
    if (offset_in_page(ret)) {
        vm_unacct_memory(charged);
        locked = 0;
    }
    up_write(&current->mm->mmap_sem);
    if (locked && new_len > old_len)
        mm_populate(new_addr + old_len, new_len - old_len);
    return ret;
}
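
A minimal userspace sketch (not part of mm/mremap.c) of how this syscall is typically reached through the C library's mremap() wrapper; the buffer sizes and data used here are illustrative only:

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
    size_t old_len = 4096, new_len = 8192;   /* illustrative sizes */

    /* Map one page of anonymous memory and put some data in it. */
    char *p = mmap(NULL, old_len, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (p == MAP_FAILED)
        return 1;
    strcpy(p, "hello");

    /*
     * Grow the mapping; MREMAP_MAYMOVE lets the kernel relocate it
     * (the move_vma()/move_page_tables() path above) if it cannot
     * be expanded in place.
     */
    char *q = mremap(p, old_len, new_len, MREMAP_MAYMOVE);
    if (q == MAP_FAILED)
        return 1;

    printf("moved=%d data=%s\n", q != p, q);
    munmap(q, new_len);
    return 0;
}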