Back to home page

LXR

 
 

    


0001 /*
0002  *  linux/mm/nommu.c
0003  *
0004  *  Replacement code for mm functions to support CPU's that don't
0005  *  have any form of memory management unit (thus no virtual memory).
0006  *
0007  *  See Documentation/nommu-mmap.txt
0008  *
0009  *  Copyright (c) 2004-2008 David Howells <dhowells@redhat.com>
0010  *  Copyright (c) 2000-2003 David McCullough <davidm@snapgear.com>
0011  *  Copyright (c) 2000-2001 D Jeff Dionne <jeff@uClinux.org>
0012  *  Copyright (c) 2002      Greg Ungerer <gerg@snapgear.com>
0013  *  Copyright (c) 2007-2010 Paul Mundt <lethal@linux-sh.org>
0014  */
0015 
0016 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
0017 
0018 #include <linux/export.h>
0019 #include <linux/mm.h>
0020 #include <linux/vmacache.h>
0021 #include <linux/mman.h>
0022 #include <linux/swap.h>
0023 #include <linux/file.h>
0024 #include <linux/highmem.h>
0025 #include <linux/pagemap.h>
0026 #include <linux/slab.h>
0027 #include <linux/vmalloc.h>
0028 #include <linux/blkdev.h>
0029 #include <linux/backing-dev.h>
0030 #include <linux/compiler.h>
0031 #include <linux/mount.h>
0032 #include <linux/personality.h>
0033 #include <linux/security.h>
0034 #include <linux/syscalls.h>
0035 #include <linux/audit.h>
0036 #include <linux/printk.h>
0037 
0038 #include <linux/uaccess.h>
0039 #include <asm/tlb.h>
0040 #include <asm/tlbflush.h>
0041 #include <asm/mmu_context.h>
0042 #include "internal.h"
0043 
0044 void *high_memory;
0045 EXPORT_SYMBOL(high_memory);
0046 struct page *mem_map;
0047 unsigned long max_mapnr;
0048 EXPORT_SYMBOL(max_mapnr);
0049 unsigned long highest_memmap_pfn;
0050 int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS;
0051 int heap_stack_gap = 0;
0052 
0053 atomic_long_t mmap_pages_allocated;
0054 
0055 EXPORT_SYMBOL(mem_map);
0056 
0057 /* list of mapped, potentially shareable regions */
0058 static struct kmem_cache *vm_region_jar;
0059 struct rb_root nommu_region_tree = RB_ROOT;
0060 DECLARE_RWSEM(nommu_region_sem);
0061 
0062 const struct vm_operations_struct generic_file_vm_ops = {
0063 };
0064 
0065 /*
0066  * Return the total memory allocated for this pointer, not
0067  * just what the caller asked for.
0068  *
0069  * Doesn't have to be accurate, i.e. may have races.
0070  */
0071 unsigned int kobjsize(const void *objp)
0072 {
0073     struct page *page;
0074 
0075     /*
0076      * If the object we have should not have ksize performed on it,
0077      * return size of 0
0078      */
0079     if (!objp || !virt_addr_valid(objp))
0080         return 0;
0081 
0082     page = virt_to_head_page(objp);
0083 
0084     /*
0085      * If the allocator sets PageSlab, we know the pointer came from
0086      * kmalloc().
0087      */
0088     if (PageSlab(page))
0089         return ksize(objp);
0090 
0091     /*
0092      * If it's not a compound page, see if we have a matching VMA
0093      * region. This test is intentionally done in reverse order,
0094      * so if there's no VMA, we still fall through and hand back
0095      * PAGE_SIZE for 0-order pages.
0096      */
0097     if (!PageCompound(page)) {
0098         struct vm_area_struct *vma;
0099 
0100         vma = find_vma(current->mm, (unsigned long)objp);
0101         if (vma)
0102             return vma->vm_end - vma->vm_start;
0103     }
0104 
0105     /*
0106      * The ksize() function is only guaranteed to work for pointers
0107      * returned by kmalloc(). So handle arbitrary pointers here.
0108      */
0109     return PAGE_SIZE << compound_order(page);
0110 }
0111 
0112 static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
0113               unsigned long start, unsigned long nr_pages,
0114               unsigned int foll_flags, struct page **pages,
0115               struct vm_area_struct **vmas, int *nonblocking)
0116 {
0117     struct vm_area_struct *vma;
0118     unsigned long vm_flags;
0119     int i;
0120 
0121     /* calculate required read or write permissions.
0122      * If FOLL_FORCE is set, we only require the "MAY" flags.
0123      */
0124     vm_flags  = (foll_flags & FOLL_WRITE) ?
0125             (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
0126     vm_flags &= (foll_flags & FOLL_FORCE) ?
0127             (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
0128 
0129     for (i = 0; i < nr_pages; i++) {
0130         vma = find_vma(mm, start);
0131         if (!vma)
0132             goto finish_or_fault;
0133 
0134         /* protect what we can, including chardevs */
0135         if ((vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
0136             !(vm_flags & vma->vm_flags))
0137             goto finish_or_fault;
0138 
0139         if (pages) {
0140             pages[i] = virt_to_page(start);
0141             if (pages[i])
0142                 get_page(pages[i]);
0143         }
0144         if (vmas)
0145             vmas[i] = vma;
0146         start = (start + PAGE_SIZE) & PAGE_MASK;
0147     }
0148 
0149     return i;
0150 
0151 finish_or_fault:
0152     return i ? : -EFAULT;
0153 }
0154 
0155 /*
0156  * get a list of pages in an address range belonging to the specified process
0157  * and indicate the VMA that covers each page
0158  * - this is potentially dodgy as we may end incrementing the page count of a
0159  *   slab page or a secondary page from a compound page
0160  * - don't permit access to VMAs that don't support it, such as I/O mappings
0161  */
0162 long get_user_pages(unsigned long start, unsigned long nr_pages,
0163             unsigned int gup_flags, struct page **pages,
0164             struct vm_area_struct **vmas)
0165 {
0166     return __get_user_pages(current, current->mm, start, nr_pages,
0167                 gup_flags, pages, vmas, NULL);
0168 }
0169 EXPORT_SYMBOL(get_user_pages);
0170 
0171 long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
0172                 unsigned int gup_flags, struct page **pages,
0173                 int *locked)
0174 {
0175     return get_user_pages(start, nr_pages, gup_flags, pages, NULL);
0176 }
0177 EXPORT_SYMBOL(get_user_pages_locked);
0178 
0179 static long __get_user_pages_unlocked(struct task_struct *tsk,
0180             struct mm_struct *mm, unsigned long start,
0181             unsigned long nr_pages, struct page **pages,
0182             unsigned int gup_flags)
0183 {
0184     long ret;
0185     down_read(&mm->mmap_sem);
0186     ret = __get_user_pages(tsk, mm, start, nr_pages, gup_flags, pages,
0187                 NULL, NULL);
0188     up_read(&mm->mmap_sem);
0189     return ret;
0190 }
0191 
0192 long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
0193                  struct page **pages, unsigned int gup_flags)
0194 {
0195     return __get_user_pages_unlocked(current, current->mm, start, nr_pages,
0196                      pages, gup_flags);
0197 }
0198 EXPORT_SYMBOL(get_user_pages_unlocked);
0199 
0200 /**
0201  * follow_pfn - look up PFN at a user virtual address
0202  * @vma: memory mapping
0203  * @address: user virtual address
0204  * @pfn: location to store found PFN
0205  *
0206  * Only IO mappings and raw PFN mappings are allowed.
0207  *
0208  * Returns zero and the pfn at @pfn on success, -ve otherwise.
0209  */
0210 int follow_pfn(struct vm_area_struct *vma, unsigned long address,
0211     unsigned long *pfn)
0212 {
0213     if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
0214         return -EINVAL;
0215 
0216     *pfn = address >> PAGE_SHIFT;
0217     return 0;
0218 }
0219 EXPORT_SYMBOL(follow_pfn);
0220 
0221 LIST_HEAD(vmap_area_list);
0222 
0223 void vfree(const void *addr)
0224 {
0225     kfree(addr);
0226 }
0227 EXPORT_SYMBOL(vfree);
0228 
0229 void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
0230 {
0231     /*
0232      *  You can't specify __GFP_HIGHMEM with kmalloc() since kmalloc()
0233      * returns only a logical address.
0234      */
0235     return kmalloc(size, (gfp_mask | __GFP_COMP) & ~__GFP_HIGHMEM);
0236 }
0237 EXPORT_SYMBOL(__vmalloc);
0238 
0239 void *vmalloc_user(unsigned long size)
0240 {
0241     void *ret;
0242 
0243     ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
0244             PAGE_KERNEL);
0245     if (ret) {
0246         struct vm_area_struct *vma;
0247 
0248         down_write(&current->mm->mmap_sem);
0249         vma = find_vma(current->mm, (unsigned long)ret);
0250         if (vma)
0251             vma->vm_flags |= VM_USERMAP;
0252         up_write(&current->mm->mmap_sem);
0253     }
0254 
0255     return ret;
0256 }
0257 EXPORT_SYMBOL(vmalloc_user);
0258 
0259 struct page *vmalloc_to_page(const void *addr)
0260 {
0261     return virt_to_page(addr);
0262 }
0263 EXPORT_SYMBOL(vmalloc_to_page);
0264 
0265 unsigned long vmalloc_to_pfn(const void *addr)
0266 {
0267     return page_to_pfn(virt_to_page(addr));
0268 }
0269 EXPORT_SYMBOL(vmalloc_to_pfn);
0270 
0271 long vread(char *buf, char *addr, unsigned long count)
0272 {
0273     /* Don't allow overflow */
0274     if ((unsigned long) buf + count < count)
0275         count = -(unsigned long) buf;
0276 
0277     memcpy(buf, addr, count);
0278     return count;
0279 }
0280 
0281 long vwrite(char *buf, char *addr, unsigned long count)
0282 {
0283     /* Don't allow overflow */
0284     if ((unsigned long) addr + count < count)
0285         count = -(unsigned long) addr;
0286 
0287     memcpy(addr, buf, count);
0288     return count;
0289 }
0290 
0291 /*
0292  *  vmalloc  -  allocate virtually contiguous memory
0293  *
0294  *  @size:      allocation size
0295  *
0296  *  Allocate enough pages to cover @size from the page level
0297  *  allocator and map them into contiguous kernel virtual space.
0298  *
0299  *  For tight control over page level allocator and protection flags
0300  *  use __vmalloc() instead.
0301  */
0302 void *vmalloc(unsigned long size)
0303 {
0304        return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL);
0305 }
0306 EXPORT_SYMBOL(vmalloc);
0307 
0308 /*
0309  *  vzalloc - allocate virtually contiguous memory with zero fill
0310  *
0311  *  @size:      allocation size
0312  *
0313  *  Allocate enough pages to cover @size from the page level
0314  *  allocator and map them into contiguous kernel virtual space.
0315  *  The memory allocated is set to zero.
0316  *
0317  *  For tight control over page level allocator and protection flags
0318  *  use __vmalloc() instead.
0319  */
0320 void *vzalloc(unsigned long size)
0321 {
0322     return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
0323             PAGE_KERNEL);
0324 }
0325 EXPORT_SYMBOL(vzalloc);
0326 
0327 /**
0328  * vmalloc_node - allocate memory on a specific node
0329  * @size:   allocation size
0330  * @node:   numa node
0331  *
0332  * Allocate enough pages to cover @size from the page level
0333  * allocator and map them into contiguous kernel virtual space.
0334  *
0335  * For tight control over page level allocator and protection flags
0336  * use __vmalloc() instead.
0337  */
0338 void *vmalloc_node(unsigned long size, int node)
0339 {
0340     return vmalloc(size);
0341 }
0342 EXPORT_SYMBOL(vmalloc_node);
0343 
0344 /**
0345  * vzalloc_node - allocate memory on a specific node with zero fill
0346  * @size:   allocation size
0347  * @node:   numa node
0348  *
0349  * Allocate enough pages to cover @size from the page level
0350  * allocator and map them into contiguous kernel virtual space.
0351  * The memory allocated is set to zero.
0352  *
0353  * For tight control over page level allocator and protection flags
0354  * use __vmalloc() instead.
0355  */
0356 void *vzalloc_node(unsigned long size, int node)
0357 {
0358     return vzalloc(size);
0359 }
0360 EXPORT_SYMBOL(vzalloc_node);
0361 
0362 #ifndef PAGE_KERNEL_EXEC
0363 # define PAGE_KERNEL_EXEC PAGE_KERNEL
0364 #endif
0365 
0366 /**
0367  *  vmalloc_exec  -  allocate virtually contiguous, executable memory
0368  *  @size:      allocation size
0369  *
0370  *  Kernel-internal function to allocate enough pages to cover @size
0371  *  the page level allocator and map them into contiguous and
0372  *  executable kernel virtual space.
0373  *
0374  *  For tight control over page level allocator and protection flags
0375  *  use __vmalloc() instead.
0376  */
0377 
0378 void *vmalloc_exec(unsigned long size)
0379 {
0380     return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC);
0381 }
0382 
0383 /**
0384  * vmalloc_32  -  allocate virtually contiguous memory (32bit addressable)
0385  *  @size:      allocation size
0386  *
0387  *  Allocate enough 32bit PA addressable pages to cover @size from the
0388  *  page level allocator and map them into contiguous kernel virtual space.
0389  */
0390 void *vmalloc_32(unsigned long size)
0391 {
0392     return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL);
0393 }
0394 EXPORT_SYMBOL(vmalloc_32);
0395 
0396 /**
0397  * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory
0398  *  @size:      allocation size
0399  *
0400  * The resulting memory area is 32bit addressable and zeroed so it can be
0401  * mapped to userspace without leaking data.
0402  *
0403  * VM_USERMAP is set on the corresponding VMA so that subsequent calls to
0404  * remap_vmalloc_range() are permissible.
0405  */
0406 void *vmalloc_32_user(unsigned long size)
0407 {
0408     /*
0409      * We'll have to sort out the ZONE_DMA bits for 64-bit,
0410      * but for now this can simply use vmalloc_user() directly.
0411      */
0412     return vmalloc_user(size);
0413 }
0414 EXPORT_SYMBOL(vmalloc_32_user);
0415 
0416 void *vmap(struct page **pages, unsigned int count, unsigned long flags, pgprot_t prot)
0417 {
0418     BUG();
0419     return NULL;
0420 }
0421 EXPORT_SYMBOL(vmap);
0422 
0423 void vunmap(const void *addr)
0424 {
0425     BUG();
0426 }
0427 EXPORT_SYMBOL(vunmap);
0428 
0429 void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot)
0430 {
0431     BUG();
0432     return NULL;
0433 }
0434 EXPORT_SYMBOL(vm_map_ram);
0435 
0436 void vm_unmap_ram(const void *mem, unsigned int count)
0437 {
0438     BUG();
0439 }
0440 EXPORT_SYMBOL(vm_unmap_ram);
0441 
0442 void vm_unmap_aliases(void)
0443 {
0444 }
0445 EXPORT_SYMBOL_GPL(vm_unmap_aliases);
0446 
0447 /*
0448  * Implement a stub for vmalloc_sync_all() if the architecture chose not to
0449  * have one.
0450  */
0451 void __weak vmalloc_sync_all(void)
0452 {
0453 }
0454 
0455 /**
0456  *  alloc_vm_area - allocate a range of kernel address space
0457  *  @size:      size of the area
0458  *
0459  *  Returns:    NULL on failure, vm_struct on success
0460  *
0461  *  This function reserves a range of kernel address space, and
0462  *  allocates pagetables to map that range.  No actual mappings
0463  *  are created.  If the kernel address space is not shared
0464  *  between processes, it syncs the pagetable across all
0465  *  processes.
0466  */
0467 struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes)
0468 {
0469     BUG();
0470     return NULL;
0471 }
0472 EXPORT_SYMBOL_GPL(alloc_vm_area);
0473 
0474 void free_vm_area(struct vm_struct *area)
0475 {
0476     BUG();
0477 }
0478 EXPORT_SYMBOL_GPL(free_vm_area);
0479 
0480 int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
0481            struct page *page)
0482 {
0483     return -EINVAL;
0484 }
0485 EXPORT_SYMBOL(vm_insert_page);
0486 
0487 /*
0488  *  sys_brk() for the most part doesn't need the global kernel
0489  *  lock, except when an application is doing something nasty
0490  *  like trying to un-brk an area that has already been mapped
0491  *  to a regular file.  in this case, the unmapping will need
0492  *  to invoke file system routines that need the global lock.
0493  */
0494 SYSCALL_DEFINE1(brk, unsigned long, brk)
0495 {
0496     struct mm_struct *mm = current->mm;
0497 
0498     if (brk < mm->start_brk || brk > mm->context.end_brk)
0499         return mm->brk;
0500 
0501     if (mm->brk == brk)
0502         return mm->brk;
0503 
0504     /*
0505      * Always allow shrinking brk
0506      */
0507     if (brk <= mm->brk) {
0508         mm->brk = brk;
0509         return brk;
0510     }
0511 
0512     /*
0513      * Ok, looks good - let it rip.
0514      */
0515     flush_icache_range(mm->brk, brk);
0516     return mm->brk = brk;
0517 }
0518 
0519 /*
0520  * initialise the VMA and region record slabs
0521  */
0522 void __init mmap_init(void)
0523 {
0524     int ret;
0525 
0526     ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL);
0527     VM_BUG_ON(ret);
0528     vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC|SLAB_ACCOUNT);
0529 }
0530 
0531 /*
0532  * validate the region tree
0533  * - the caller must hold the region lock
0534  */
0535 #ifdef CONFIG_DEBUG_NOMMU_REGIONS
0536 static noinline void validate_nommu_regions(void)
0537 {
0538     struct vm_region *region, *last;
0539     struct rb_node *p, *lastp;
0540 
0541     lastp = rb_first(&nommu_region_tree);
0542     if (!lastp)
0543         return;
0544 
0545     last = rb_entry(lastp, struct vm_region, vm_rb);
0546     BUG_ON(last->vm_end <= last->vm_start);
0547     BUG_ON(last->vm_top < last->vm_end);
0548 
0549     while ((p = rb_next(lastp))) {
0550         region = rb_entry(p, struct vm_region, vm_rb);
0551         last = rb_entry(lastp, struct vm_region, vm_rb);
0552 
0553         BUG_ON(region->vm_end <= region->vm_start);
0554         BUG_ON(region->vm_top < region->vm_end);
0555         BUG_ON(region->vm_start < last->vm_top);
0556 
0557         lastp = p;
0558     }
0559 }
0560 #else
0561 static void validate_nommu_regions(void)
0562 {
0563 }
0564 #endif
0565 
0566 /*
0567  * add a region into the global tree
0568  */
0569 static void add_nommu_region(struct vm_region *region)
0570 {
0571     struct vm_region *pregion;
0572     struct rb_node **p, *parent;
0573 
0574     validate_nommu_regions();
0575 
0576     parent = NULL;
0577     p = &nommu_region_tree.rb_node;
0578     while (*p) {
0579         parent = *p;
0580         pregion = rb_entry(parent, struct vm_region, vm_rb);
0581         if (region->vm_start < pregion->vm_start)
0582             p = &(*p)->rb_left;
0583         else if (region->vm_start > pregion->vm_start)
0584             p = &(*p)->rb_right;
0585         else if (pregion == region)
0586             return;
0587         else
0588             BUG();
0589     }
0590 
0591     rb_link_node(&region->vm_rb, parent, p);
0592     rb_insert_color(&region->vm_rb, &nommu_region_tree);
0593 
0594     validate_nommu_regions();
0595 }
0596 
0597 /*
0598  * delete a region from the global tree
0599  */
0600 static void delete_nommu_region(struct vm_region *region)
0601 {
0602     BUG_ON(!nommu_region_tree.rb_node);
0603 
0604     validate_nommu_regions();
0605     rb_erase(&region->vm_rb, &nommu_region_tree);
0606     validate_nommu_regions();
0607 }
0608 
0609 /*
0610  * free a contiguous series of pages
0611  */
0612 static void free_page_series(unsigned long from, unsigned long to)
0613 {
0614     for (; from < to; from += PAGE_SIZE) {
0615         struct page *page = virt_to_page(from);
0616 
0617         atomic_long_dec(&mmap_pages_allocated);
0618         put_page(page);
0619     }
0620 }
0621 
0622 /*
0623  * release a reference to a region
0624  * - the caller must hold the region semaphore for writing, which this releases
0625  * - the region may not have been added to the tree yet, in which case vm_top
0626  *   will equal vm_start
0627  */
0628 static void __put_nommu_region(struct vm_region *region)
0629     __releases(nommu_region_sem)
0630 {
0631     BUG_ON(!nommu_region_tree.rb_node);
0632 
0633     if (--region->vm_usage == 0) {
0634         if (region->vm_top > region->vm_start)
0635             delete_nommu_region(region);
0636         up_write(&nommu_region_sem);
0637 
0638         if (region->vm_file)
0639             fput(region->vm_file);
0640 
0641         /* IO memory and memory shared directly out of the pagecache
0642          * from ramfs/tmpfs mustn't be released here */
0643         if (region->vm_flags & VM_MAPPED_COPY)
0644             free_page_series(region->vm_start, region->vm_top);
0645         kmem_cache_free(vm_region_jar, region);
0646     } else {
0647         up_write(&nommu_region_sem);
0648     }
0649 }
0650 
0651 /*
0652  * release a reference to a region
0653  */
0654 static void put_nommu_region(struct vm_region *region)
0655 {
0656     down_write(&nommu_region_sem);
0657     __put_nommu_region(region);
0658 }
0659 
0660 /*
0661  * update protection on a vma
0662  */
0663 static void protect_vma(struct vm_area_struct *vma, unsigned long flags)
0664 {
0665 #ifdef CONFIG_MPU
0666     struct mm_struct *mm = vma->vm_mm;
0667     long start = vma->vm_start & PAGE_MASK;
0668     while (start < vma->vm_end) {
0669         protect_page(mm, start, flags);
0670         start += PAGE_SIZE;
0671     }
0672     update_protections(mm);
0673 #endif
0674 }
0675 
0676 /*
0677  * add a VMA into a process's mm_struct in the appropriate place in the list
0678  * and tree and add to the address space's page tree also if not an anonymous
0679  * page
0680  * - should be called with mm->mmap_sem held writelocked
0681  */
0682 static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
0683 {
0684     struct vm_area_struct *pvma, *prev;
0685     struct address_space *mapping;
0686     struct rb_node **p, *parent, *rb_prev;
0687 
0688     BUG_ON(!vma->vm_region);
0689 
0690     mm->map_count++;
0691     vma->vm_mm = mm;
0692 
0693     protect_vma(vma, vma->vm_flags);
0694 
0695     /* add the VMA to the mapping */
0696     if (vma->vm_file) {
0697         mapping = vma->vm_file->f_mapping;
0698 
0699         i_mmap_lock_write(mapping);
0700         flush_dcache_mmap_lock(mapping);
0701         vma_interval_tree_insert(vma, &mapping->i_mmap);
0702         flush_dcache_mmap_unlock(mapping);
0703         i_mmap_unlock_write(mapping);
0704     }
0705 
0706     /* add the VMA to the tree */
0707     parent = rb_prev = NULL;
0708     p = &mm->mm_rb.rb_node;
0709     while (*p) {
0710         parent = *p;
0711         pvma = rb_entry(parent, struct vm_area_struct, vm_rb);
0712 
0713         /* sort by: start addr, end addr, VMA struct addr in that order
0714          * (the latter is necessary as we may get identical VMAs) */
0715         if (vma->vm_start < pvma->vm_start)
0716             p = &(*p)->rb_left;
0717         else if (vma->vm_start > pvma->vm_start) {
0718             rb_prev = parent;
0719             p = &(*p)->rb_right;
0720         } else if (vma->vm_end < pvma->vm_end)
0721             p = &(*p)->rb_left;
0722         else if (vma->vm_end > pvma->vm_end) {
0723             rb_prev = parent;
0724             p = &(*p)->rb_right;
0725         } else if (vma < pvma)
0726             p = &(*p)->rb_left;
0727         else if (vma > pvma) {
0728             rb_prev = parent;
0729             p = &(*p)->rb_right;
0730         } else
0731             BUG();
0732     }
0733 
0734     rb_link_node(&vma->vm_rb, parent, p);
0735     rb_insert_color(&vma->vm_rb, &mm->mm_rb);
0736 
0737     /* add VMA to the VMA list also */
0738     prev = NULL;
0739     if (rb_prev)
0740         prev = rb_entry(rb_prev, struct vm_area_struct, vm_rb);
0741 
0742     __vma_link_list(mm, vma, prev, parent);
0743 }
0744 
0745 /*
0746  * delete a VMA from its owning mm_struct and address space
0747  */
0748 static void delete_vma_from_mm(struct vm_area_struct *vma)
0749 {
0750     int i;
0751     struct address_space *mapping;
0752     struct mm_struct *mm = vma->vm_mm;
0753     struct task_struct *curr = current;
0754 
0755     protect_vma(vma, 0);
0756 
0757     mm->map_count--;
0758     for (i = 0; i < VMACACHE_SIZE; i++) {
0759         /* if the vma is cached, invalidate the entire cache */
0760         if (curr->vmacache[i] == vma) {
0761             vmacache_invalidate(mm);
0762             break;
0763         }
0764     }
0765 
0766     /* remove the VMA from the mapping */
0767     if (vma->vm_file) {
0768         mapping = vma->vm_file->f_mapping;
0769 
0770         i_mmap_lock_write(mapping);
0771         flush_dcache_mmap_lock(mapping);
0772         vma_interval_tree_remove(vma, &mapping->i_mmap);
0773         flush_dcache_mmap_unlock(mapping);
0774         i_mmap_unlock_write(mapping);
0775     }
0776 
0777     /* remove from the MM's tree and list */
0778     rb_erase(&vma->vm_rb, &mm->mm_rb);
0779 
0780     if (vma->vm_prev)
0781         vma->vm_prev->vm_next = vma->vm_next;
0782     else
0783         mm->mmap = vma->vm_next;
0784 
0785     if (vma->vm_next)
0786         vma->vm_next->vm_prev = vma->vm_prev;
0787 }
0788 
0789 /*
0790  * destroy a VMA record
0791  */
0792 static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma)
0793 {
0794     if (vma->vm_ops && vma->vm_ops->close)
0795         vma->vm_ops->close(vma);
0796     if (vma->vm_file)
0797         fput(vma->vm_file);
0798     put_nommu_region(vma->vm_region);
0799     kmem_cache_free(vm_area_cachep, vma);
0800 }
0801 
0802 /*
0803  * look up the first VMA in which addr resides, NULL if none
0804  * - should be called with mm->mmap_sem at least held readlocked
0805  */
0806 struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
0807 {
0808     struct vm_area_struct *vma;
0809 
0810     /* check the cache first */
0811     vma = vmacache_find(mm, addr);
0812     if (likely(vma))
0813         return vma;
0814 
0815     /* trawl the list (there may be multiple mappings in which addr
0816      * resides) */
0817     for (vma = mm->mmap; vma; vma = vma->vm_next) {
0818         if (vma->vm_start > addr)
0819             return NULL;
0820         if (vma->vm_end > addr) {
0821             vmacache_update(addr, vma);
0822             return vma;
0823         }
0824     }
0825 
0826     return NULL;
0827 }
0828 EXPORT_SYMBOL(find_vma);
0829 
0830 /*
0831  * find a VMA
0832  * - we don't extend stack VMAs under NOMMU conditions
0833  */
0834 struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr)
0835 {
0836     return find_vma(mm, addr);
0837 }
0838 
0839 /*
0840  * expand a stack to a given address
0841  * - not supported under NOMMU conditions
0842  */
0843 int expand_stack(struct vm_area_struct *vma, unsigned long address)
0844 {
0845     return -ENOMEM;
0846 }
0847 
0848 /*
0849  * look up the first VMA exactly that exactly matches addr
0850  * - should be called with mm->mmap_sem at least held readlocked
0851  */
0852 static struct vm_area_struct *find_vma_exact(struct mm_struct *mm,
0853                          unsigned long addr,
0854                          unsigned long len)
0855 {
0856     struct vm_area_struct *vma;
0857     unsigned long end = addr + len;
0858 
0859     /* check the cache first */
0860     vma = vmacache_find_exact(mm, addr, end);
0861     if (vma)
0862         return vma;
0863 
0864     /* trawl the list (there may be multiple mappings in which addr
0865      * resides) */
0866     for (vma = mm->mmap; vma; vma = vma->vm_next) {
0867         if (vma->vm_start < addr)
0868             continue;
0869         if (vma->vm_start > addr)
0870             return NULL;
0871         if (vma->vm_end == end) {
0872             vmacache_update(addr, vma);
0873             return vma;
0874         }
0875     }
0876 
0877     return NULL;
0878 }
0879 
0880 /*
0881  * determine whether a mapping should be permitted and, if so, what sort of
0882  * mapping we're capable of supporting
0883  */
0884 static int validate_mmap_request(struct file *file,
0885                  unsigned long addr,
0886                  unsigned long len,
0887                  unsigned long prot,
0888                  unsigned long flags,
0889                  unsigned long pgoff,
0890                  unsigned long *_capabilities)
0891 {
0892     unsigned long capabilities, rlen;
0893     int ret;
0894 
0895     /* do the simple checks first */
0896     if (flags & MAP_FIXED)
0897         return -EINVAL;
0898 
0899     if ((flags & MAP_TYPE) != MAP_PRIVATE &&
0900         (flags & MAP_TYPE) != MAP_SHARED)
0901         return -EINVAL;
0902 
0903     if (!len)
0904         return -EINVAL;
0905 
0906     /* Careful about overflows.. */
0907     rlen = PAGE_ALIGN(len);
0908     if (!rlen || rlen > TASK_SIZE)
0909         return -ENOMEM;
0910 
0911     /* offset overflow? */
0912     if ((pgoff + (rlen >> PAGE_SHIFT)) < pgoff)
0913         return -EOVERFLOW;
0914 
0915     if (file) {
0916         /* files must support mmap */
0917         if (!file->f_op->mmap)
0918             return -ENODEV;
0919 
0920         /* work out if what we've got could possibly be shared
0921          * - we support chardevs that provide their own "memory"
0922          * - we support files/blockdevs that are memory backed
0923          */
0924         if (file->f_op->mmap_capabilities) {
0925             capabilities = file->f_op->mmap_capabilities(file);
0926         } else {
0927             /* no explicit capabilities set, so assume some
0928              * defaults */
0929             switch (file_inode(file)->i_mode & S_IFMT) {
0930             case S_IFREG:
0931             case S_IFBLK:
0932                 capabilities = NOMMU_MAP_COPY;
0933                 break;
0934 
0935             case S_IFCHR:
0936                 capabilities =
0937                     NOMMU_MAP_DIRECT |
0938                     NOMMU_MAP_READ |
0939                     NOMMU_MAP_WRITE;
0940                 break;
0941 
0942             default:
0943                 return -EINVAL;
0944             }
0945         }
0946 
0947         /* eliminate any capabilities that we can't support on this
0948          * device */
0949         if (!file->f_op->get_unmapped_area)
0950             capabilities &= ~NOMMU_MAP_DIRECT;
0951         if (!(file->f_mode & FMODE_CAN_READ))
0952             capabilities &= ~NOMMU_MAP_COPY;
0953 
0954         /* The file shall have been opened with read permission. */
0955         if (!(file->f_mode & FMODE_READ))
0956             return -EACCES;
0957 
0958         if (flags & MAP_SHARED) {
0959             /* do checks for writing, appending and locking */
0960             if ((prot & PROT_WRITE) &&
0961                 !(file->f_mode & FMODE_WRITE))
0962                 return -EACCES;
0963 
0964             if (IS_APPEND(file_inode(file)) &&
0965                 (file->f_mode & FMODE_WRITE))
0966                 return -EACCES;
0967 
0968             if (locks_verify_locked(file))
0969                 return -EAGAIN;
0970 
0971             if (!(capabilities & NOMMU_MAP_DIRECT))
0972                 return -ENODEV;
0973 
0974             /* we mustn't privatise shared mappings */
0975             capabilities &= ~NOMMU_MAP_COPY;
0976         } else {
0977             /* we're going to read the file into private memory we
0978              * allocate */
0979             if (!(capabilities & NOMMU_MAP_COPY))
0980                 return -ENODEV;
0981 
0982             /* we don't permit a private writable mapping to be
0983              * shared with the backing device */
0984             if (prot & PROT_WRITE)
0985                 capabilities &= ~NOMMU_MAP_DIRECT;
0986         }
0987 
0988         if (capabilities & NOMMU_MAP_DIRECT) {
0989             if (((prot & PROT_READ)  && !(capabilities & NOMMU_MAP_READ))  ||
0990                 ((prot & PROT_WRITE) && !(capabilities & NOMMU_MAP_WRITE)) ||
0991                 ((prot & PROT_EXEC)  && !(capabilities & NOMMU_MAP_EXEC))
0992                 ) {
0993                 capabilities &= ~NOMMU_MAP_DIRECT;
0994                 if (flags & MAP_SHARED) {
0995                     pr_warn("MAP_SHARED not completely supported on !MMU\n");
0996                     return -EINVAL;
0997                 }
0998             }
0999         }
1000 
1001         /* handle executable mappings and implied executable
1002          * mappings */
1003         if (path_noexec(&file->f_path)) {
1004             if (prot & PROT_EXEC)
1005                 return -EPERM;
1006         } else if ((prot & PROT_READ) && !(prot & PROT_EXEC)) {
1007             /* handle implication of PROT_EXEC by PROT_READ */
1008             if (current->personality & READ_IMPLIES_EXEC) {
1009                 if (capabilities & NOMMU_MAP_EXEC)
1010                     prot |= PROT_EXEC;
1011             }
1012         } else if ((prot & PROT_READ) &&
1013              (prot & PROT_EXEC) &&
1014              !(capabilities & NOMMU_MAP_EXEC)
1015              ) {
1016             /* backing file is not executable, try to copy */
1017             capabilities &= ~NOMMU_MAP_DIRECT;
1018         }
1019     } else {
1020         /* anonymous mappings are always memory backed and can be
1021          * privately mapped
1022          */
1023         capabilities = NOMMU_MAP_COPY;
1024 
1025         /* handle PROT_EXEC implication by PROT_READ */
1026         if ((prot & PROT_READ) &&
1027             (current->personality & READ_IMPLIES_EXEC))
1028             prot |= PROT_EXEC;
1029     }
1030 
1031     /* allow the security API to have its say */
1032     ret = security_mmap_addr(addr);
1033     if (ret < 0)
1034         return ret;
1035 
1036     /* looks okay */
1037     *_capabilities = capabilities;
1038     return 0;
1039 }
1040 
1041 /*
1042  * we've determined that we can make the mapping, now translate what we
1043  * now know into VMA flags
1044  */
1045 static unsigned long determine_vm_flags(struct file *file,
1046                     unsigned long prot,
1047                     unsigned long flags,
1048                     unsigned long capabilities)
1049 {
1050     unsigned long vm_flags;
1051 
1052     vm_flags = calc_vm_prot_bits(prot, 0) | calc_vm_flag_bits(flags);
1053     /* vm_flags |= mm->def_flags; */
1054 
1055     if (!(capabilities & NOMMU_MAP_DIRECT)) {
1056         /* attempt to share read-only copies of mapped file chunks */
1057         vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
1058         if (file && !(prot & PROT_WRITE))
1059             vm_flags |= VM_MAYSHARE;
1060     } else {
1061         /* overlay a shareable mapping on the backing device or inode
1062          * if possible - used for chardevs, ramfs/tmpfs/shmfs and
1063          * romfs/cramfs */
1064         vm_flags |= VM_MAYSHARE | (capabilities & NOMMU_VMFLAGS);
1065         if (flags & MAP_SHARED)
1066             vm_flags |= VM_SHARED;
1067     }
1068 
1069     /* refuse to let anyone share private mappings with this process if
1070      * it's being traced - otherwise breakpoints set in it may interfere
1071      * with another untraced process
1072      */
1073     if ((flags & MAP_PRIVATE) && current->ptrace)
1074         vm_flags &= ~VM_MAYSHARE;
1075 
1076     return vm_flags;
1077 }
1078 
1079 /*
1080  * set up a shared mapping on a file (the driver or filesystem provides and
1081  * pins the storage)
1082  */
1083 static int do_mmap_shared_file(struct vm_area_struct *vma)
1084 {
1085     int ret;
1086 
1087     ret = vma->vm_file->f_op->mmap(vma->vm_file, vma);
1088     if (ret == 0) {
1089         vma->vm_region->vm_top = vma->vm_region->vm_end;
1090         return 0;
1091     }
1092     if (ret != -ENOSYS)
1093         return ret;
1094 
1095     /* getting -ENOSYS indicates that direct mmap isn't possible (as
1096      * opposed to tried but failed) so we can only give a suitable error as
1097      * it's not possible to make a private copy if MAP_SHARED was given */
1098     return -ENODEV;
1099 }
1100 
1101 /*
1102  * set up a private mapping or an anonymous shared mapping
1103  */
1104 static int do_mmap_private(struct vm_area_struct *vma,
1105                struct vm_region *region,
1106                unsigned long len,
1107                unsigned long capabilities)
1108 {
1109     unsigned long total, point;
1110     void *base;
1111     int ret, order;
1112 
1113     /* invoke the file's mapping function so that it can keep track of
1114      * shared mappings on devices or memory
1115      * - VM_MAYSHARE will be set if it may attempt to share
1116      */
1117     if (capabilities & NOMMU_MAP_DIRECT) {
1118         ret = vma->vm_file->f_op->mmap(vma->vm_file, vma);
1119         if (ret == 0) {
1120             /* shouldn't return success if we're not sharing */
1121             BUG_ON(!(vma->vm_flags & VM_MAYSHARE));
1122             vma->vm_region->vm_top = vma->vm_region->vm_end;
1123             return 0;
1124         }
1125         if (ret != -ENOSYS)
1126             return ret;
1127 
1128         /* getting an ENOSYS error indicates that direct mmap isn't
1129          * possible (as opposed to tried but failed) so we'll try to
1130          * make a private copy of the data and map that instead */
1131     }
1132 
1133 
1134     /* allocate some memory to hold the mapping
1135      * - note that this may not return a page-aligned address if the object
1136      *   we're allocating is smaller than a page
1137      */
1138     order = get_order(len);
1139     total = 1 << order;
1140     point = len >> PAGE_SHIFT;
1141 
1142     /* we don't want to allocate a power-of-2 sized page set */
1143     if (sysctl_nr_trim_pages && total - point >= sysctl_nr_trim_pages)
1144         total = point;
1145 
1146     base = alloc_pages_exact(total << PAGE_SHIFT, GFP_KERNEL);
1147     if (!base)
1148         goto enomem;
1149 
1150     atomic_long_add(total, &mmap_pages_allocated);
1151 
1152     region->vm_flags = vma->vm_flags |= VM_MAPPED_COPY;
1153     region->vm_start = (unsigned long) base;
1154     region->vm_end   = region->vm_start + len;
1155     region->vm_top   = region->vm_start + (total << PAGE_SHIFT);
1156 
1157     vma->vm_start = region->vm_start;
1158     vma->vm_end   = region->vm_start + len;
1159 
1160     if (vma->vm_file) {
1161         /* read the contents of a file into the copy */
1162         mm_segment_t old_fs;
1163         loff_t fpos;
1164 
1165         fpos = vma->vm_pgoff;
1166         fpos <<= PAGE_SHIFT;
1167 
1168         old_fs = get_fs();
1169         set_fs(KERNEL_DS);
1170         ret = __vfs_read(vma->vm_file, base, len, &fpos);
1171         set_fs(old_fs);
1172 
1173         if (ret < 0)
1174             goto error_free;
1175 
1176         /* clear the last little bit */
1177         if (ret < len)
1178             memset(base + ret, 0, len - ret);
1179 
1180     }
1181 
1182     return 0;
1183 
1184 error_free:
1185     free_page_series(region->vm_start, region->vm_top);
1186     region->vm_start = vma->vm_start = 0;
1187     region->vm_end   = vma->vm_end = 0;
1188     region->vm_top   = 0;
1189     return ret;
1190 
1191 enomem:
1192     pr_err("Allocation of length %lu from process %d (%s) failed\n",
1193            len, current->pid, current->comm);
1194     show_free_areas(0);
1195     return -ENOMEM;
1196 }
1197 
1198 /*
1199  * handle mapping creation for uClinux
1200  */
1201 unsigned long do_mmap(struct file *file,
1202             unsigned long addr,
1203             unsigned long len,
1204             unsigned long prot,
1205             unsigned long flags,
1206             vm_flags_t vm_flags,
1207             unsigned long pgoff,
1208             unsigned long *populate)
1209 {
1210     struct vm_area_struct *vma;
1211     struct vm_region *region;
1212     struct rb_node *rb;
1213     unsigned long capabilities, result;
1214     int ret;
1215 
1216     *populate = 0;
1217 
1218     /* decide whether we should attempt the mapping, and if so what sort of
1219      * mapping */
1220     ret = validate_mmap_request(file, addr, len, prot, flags, pgoff,
1221                     &capabilities);
1222     if (ret < 0)
1223         return ret;
1224 
1225     /* we ignore the address hint */
1226     addr = 0;
1227     len = PAGE_ALIGN(len);
1228 
1229     /* we've determined that we can make the mapping, now translate what we
1230      * now know into VMA flags */
1231     vm_flags |= determine_vm_flags(file, prot, flags, capabilities);
1232 
1233     /* we're going to need to record the mapping */
1234     region = kmem_cache_zalloc(vm_region_jar, GFP_KERNEL);
1235     if (!region)
1236         goto error_getting_region;
1237 
1238     vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
1239     if (!vma)
1240         goto error_getting_vma;
1241 
1242     region->vm_usage = 1;
1243     region->vm_flags = vm_flags;
1244     region->vm_pgoff = pgoff;
1245 
1246     INIT_LIST_HEAD(&vma->anon_vma_chain);
1247     vma->vm_flags = vm_flags;
1248     vma->vm_pgoff = pgoff;
1249 
1250     if (file) {
1251         region->vm_file = get_file(file);
1252         vma->vm_file = get_file(file);
1253     }
1254 
1255     down_write(&nommu_region_sem);
1256 
1257     /* if we want to share, we need to check for regions created by other
1258      * mmap() calls that overlap with our proposed mapping
1259      * - we can only share with a superset match on most regular files
1260      * - shared mappings on character devices and memory backed files are
1261      *   permitted to overlap inexactly as far as we are concerned for in
1262      *   these cases, sharing is handled in the driver or filesystem rather
1263      *   than here
1264      */
1265     if (vm_flags & VM_MAYSHARE) {
1266         struct vm_region *pregion;
1267         unsigned long pglen, rpglen, pgend, rpgend, start;
1268 
1269         pglen = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
1270         pgend = pgoff + pglen;
1271 
1272         for (rb = rb_first(&nommu_region_tree); rb; rb = rb_next(rb)) {
1273             pregion = rb_entry(rb, struct vm_region, vm_rb);
1274 
1275             if (!(pregion->vm_flags & VM_MAYSHARE))
1276                 continue;
1277 
1278             /* search for overlapping mappings on the same file */
1279             if (file_inode(pregion->vm_file) !=
1280                 file_inode(file))
1281                 continue;
1282 
1283             if (pregion->vm_pgoff >= pgend)
1284                 continue;
1285 
1286             rpglen = pregion->vm_end - pregion->vm_start;
1287             rpglen = (rpglen + PAGE_SIZE - 1) >> PAGE_SHIFT;
1288             rpgend = pregion->vm_pgoff + rpglen;
1289             if (pgoff >= rpgend)
1290                 continue;
1291 
1292             /* handle inexactly overlapping matches between
1293              * mappings */
1294             if ((pregion->vm_pgoff != pgoff || rpglen != pglen) &&
1295                 !(pgoff >= pregion->vm_pgoff && pgend <= rpgend)) {
1296                 /* new mapping is not a subset of the region */
1297                 if (!(capabilities & NOMMU_MAP_DIRECT))
1298                     goto sharing_violation;
1299                 continue;
1300             }
1301 
1302             /* we've found a region we can share */
1303             pregion->vm_usage++;
1304             vma->vm_region = pregion;
1305             start = pregion->vm_start;
1306             start += (pgoff - pregion->vm_pgoff) << PAGE_SHIFT;
1307             vma->vm_start = start;
1308             vma->vm_end = start + len;
1309 
1310             if (pregion->vm_flags & VM_MAPPED_COPY)
1311                 vma->vm_flags |= VM_MAPPED_COPY;
1312             else {
1313                 ret = do_mmap_shared_file(vma);
1314                 if (ret < 0) {
1315                     vma->vm_region = NULL;
1316                     vma->vm_start = 0;
1317                     vma->vm_end = 0;
1318                     pregion->vm_usage--;
1319                     pregion = NULL;
1320                     goto error_just_free;
1321                 }
1322             }
1323             fput(region->vm_file);
1324             kmem_cache_free(vm_region_jar, region);
1325             region = pregion;
1326             result = start;
1327             goto share;
1328         }
1329 
1330         /* obtain the address at which to make a shared mapping
1331          * - this is the hook for quasi-memory character devices to
1332          *   tell us the location of a shared mapping
1333          */
1334         if (capabilities & NOMMU_MAP_DIRECT) {
1335             addr = file->f_op->get_unmapped_area(file, addr, len,
1336                                  pgoff, flags);
1337             if (IS_ERR_VALUE(addr)) {
1338                 ret = addr;
1339                 if (ret != -ENOSYS)
1340                     goto error_just_free;
1341 
1342                 /* the driver refused to tell us where to site
1343                  * the mapping so we'll have to attempt to copy
1344                  * it */
1345                 ret = -ENODEV;
1346                 if (!(capabilities & NOMMU_MAP_COPY))
1347                     goto error_just_free;
1348 
1349                 capabilities &= ~NOMMU_MAP_DIRECT;
1350             } else {
1351                 vma->vm_start = region->vm_start = addr;
1352                 vma->vm_end = region->vm_end = addr + len;
1353             }
1354         }
1355     }
1356 
1357     vma->vm_region = region;
1358 
1359     /* set up the mapping
1360      * - the region is filled in if NOMMU_MAP_DIRECT is still set
1361      */
1362     if (file && vma->vm_flags & VM_SHARED)
1363         ret = do_mmap_shared_file(vma);
1364     else
1365         ret = do_mmap_private(vma, region, len, capabilities);
1366     if (ret < 0)
1367         goto error_just_free;
1368     add_nommu_region(region);
1369 
1370     /* clear anonymous mappings that don't ask for uninitialized data */
1371     if (!vma->vm_file && !(flags & MAP_UNINITIALIZED))
1372         memset((void *)region->vm_start, 0,
1373                region->vm_end - region->vm_start);
1374 
1375     /* okay... we have a mapping; now we have to register it */
1376     result = vma->vm_start;
1377 
1378     current->mm->total_vm += len >> PAGE_SHIFT;
1379 
1380 share:
1381     add_vma_to_mm(current->mm, vma);
1382 
1383     /* we flush the region from the icache only when the first executable
1384      * mapping of it is made  */
1385     if (vma->vm_flags & VM_EXEC && !region->vm_icache_flushed) {
1386         flush_icache_range(region->vm_start, region->vm_end);
1387         region->vm_icache_flushed = true;
1388     }
1389 
1390     up_write(&nommu_region_sem);
1391 
1392     return result;
1393 
1394 error_just_free:
1395     up_write(&nommu_region_sem);
1396 error:
1397     if (region->vm_file)
1398         fput(region->vm_file);
1399     kmem_cache_free(vm_region_jar, region);
1400     if (vma->vm_file)
1401         fput(vma->vm_file);
1402     kmem_cache_free(vm_area_cachep, vma);
1403     return ret;
1404 
1405 sharing_violation:
1406     up_write(&nommu_region_sem);
1407     pr_warn("Attempt to share mismatched mappings\n");
1408     ret = -EINVAL;
1409     goto error;
1410 
1411 error_getting_vma:
1412     kmem_cache_free(vm_region_jar, region);
1413     pr_warn("Allocation of vma for %lu byte allocation from process %d failed\n",
1414             len, current->pid);
1415     show_free_areas(0);
1416     return -ENOMEM;
1417 
1418 error_getting_region:
1419     pr_warn("Allocation of vm region for %lu byte allocation from process %d failed\n",
1420             len, current->pid);
1421     show_free_areas(0);
1422     return -ENOMEM;
1423 }
1424 
1425 SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
1426         unsigned long, prot, unsigned long, flags,
1427         unsigned long, fd, unsigned long, pgoff)
1428 {
1429     struct file *file = NULL;
1430     unsigned long retval = -EBADF;
1431 
1432     audit_mmap_fd(fd, flags);
1433     if (!(flags & MAP_ANONYMOUS)) {
1434         file = fget(fd);
1435         if (!file)
1436             goto out;
1437     }
1438 
1439     flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
1440 
1441     retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
1442 
1443     if (file)
1444         fput(file);
1445 out:
1446     return retval;
1447 }
1448 
1449 #ifdef __ARCH_WANT_SYS_OLD_MMAP
1450 struct mmap_arg_struct {
1451     unsigned long addr;
1452     unsigned long len;
1453     unsigned long prot;
1454     unsigned long flags;
1455     unsigned long fd;
1456     unsigned long offset;
1457 };
1458 
1459 SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
1460 {
1461     struct mmap_arg_struct a;
1462 
1463     if (copy_from_user(&a, arg, sizeof(a)))
1464         return -EFAULT;
1465     if (offset_in_page(a.offset))
1466         return -EINVAL;
1467 
1468     return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd,
1469                   a.offset >> PAGE_SHIFT);
1470 }
1471 #endif /* __ARCH_WANT_SYS_OLD_MMAP */
1472 
1473 /*
1474  * split a vma into two pieces at address 'addr', a new vma is allocated either
1475  * for the first part or the tail.
1476  */
1477 int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
1478           unsigned long addr, int new_below)
1479 {
1480     struct vm_area_struct *new;
1481     struct vm_region *region;
1482     unsigned long npages;
1483 
1484     /* we're only permitted to split anonymous regions (these should have
1485      * only a single usage on the region) */
1486     if (vma->vm_file)
1487         return -ENOMEM;
1488 
1489     if (mm->map_count >= sysctl_max_map_count)
1490         return -ENOMEM;
1491 
1492     region = kmem_cache_alloc(vm_region_jar, GFP_KERNEL);
1493     if (!region)
1494         return -ENOMEM;
1495 
1496     new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
1497     if (!new) {
1498         kmem_cache_free(vm_region_jar, region);
1499         return -ENOMEM;
1500     }
1501 
1502     /* most fields are the same, copy all, and then fixup */
1503     *new = *vma;
1504     *region = *vma->vm_region;
1505     new->vm_region = region;
1506 
1507     npages = (addr - vma->vm_start) >> PAGE_SHIFT;
1508 
1509     if (new_below) {
1510         region->vm_top = region->vm_end = new->vm_end = addr;
1511     } else {
1512         region->vm_start = new->vm_start = addr;
1513         region->vm_pgoff = new->vm_pgoff += npages;
1514     }
1515 
1516     if (new->vm_ops && new->vm_ops->open)
1517         new->vm_ops->open(new);
1518 
1519     delete_vma_from_mm(vma);
1520     down_write(&nommu_region_sem);
1521     delete_nommu_region(vma->vm_region);
1522     if (new_below) {
1523         vma->vm_region->vm_start = vma->vm_start = addr;
1524         vma->vm_region->vm_pgoff = vma->vm_pgoff += npages;
1525     } else {
1526         vma->vm_region->vm_end = vma->vm_end = addr;
1527         vma->vm_region->vm_top = addr;
1528     }
1529     add_nommu_region(vma->vm_region);
1530     add_nommu_region(new->vm_region);
1531     up_write(&nommu_region_sem);
1532     add_vma_to_mm(mm, vma);
1533     add_vma_to_mm(mm, new);
1534     return 0;
1535 }
1536 
1537 /*
1538  * shrink a VMA by removing the specified chunk from either the beginning or
1539  * the end
1540  */
1541 static int shrink_vma(struct mm_struct *mm,
1542               struct vm_area_struct *vma,
1543               unsigned long from, unsigned long to)
1544 {
1545     struct vm_region *region;
1546 
1547     /* adjust the VMA's pointers, which may reposition it in the MM's tree
1548      * and list */
1549     delete_vma_from_mm(vma);
1550     if (from > vma->vm_start)
1551         vma->vm_end = from;
1552     else
1553         vma->vm_start = to;
1554     add_vma_to_mm(mm, vma);
1555 
1556     /* cut the backing region down to size */
1557     region = vma->vm_region;
1558     BUG_ON(region->vm_usage != 1);
1559 
1560     down_write(&nommu_region_sem);
1561     delete_nommu_region(region);
1562     if (from > region->vm_start) {
1563         to = region->vm_top;
1564         region->vm_top = region->vm_end = from;
1565     } else {
1566         region->vm_start = to;
1567     }
1568     add_nommu_region(region);
1569     up_write(&nommu_region_sem);
1570 
1571     free_page_series(from, to);
1572     return 0;
1573 }
1574 
1575 /*
1576  * release a mapping
1577  * - under NOMMU conditions the chunk to be unmapped must be backed by a single
1578  *   VMA, though it need not cover the whole VMA
1579  */
1580 int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
1581 {
1582     struct vm_area_struct *vma;
1583     unsigned long end;
1584     int ret;
1585 
1586     len = PAGE_ALIGN(len);
1587     if (len == 0)
1588         return -EINVAL;
1589 
1590     end = start + len;
1591 
1592     /* find the first potentially overlapping VMA */
1593     vma = find_vma(mm, start);
1594     if (!vma) {
1595         static int limit;
1596         if (limit < 5) {
1597             pr_warn("munmap of memory not mmapped by process %d (%s): 0x%lx-0x%lx\n",
1598                     current->pid, current->comm,
1599                     start, start + len - 1);
1600             limit++;
1601         }
1602         return -EINVAL;
1603     }
1604 
1605     /* we're allowed to split an anonymous VMA but not a file-backed one */
1606     if (vma->vm_file) {
1607         do {
1608             if (start > vma->vm_start)
1609                 return -EINVAL;
1610             if (end == vma->vm_end)
1611                 goto erase_whole_vma;
1612             vma = vma->vm_next;
1613         } while (vma);
1614         return -EINVAL;
1615     } else {
1616         /* the chunk must be a subset of the VMA found */
1617         if (start == vma->vm_start && end == vma->vm_end)
1618             goto erase_whole_vma;
1619         if (start < vma->vm_start || end > vma->vm_end)
1620             return -EINVAL;
1621         if (offset_in_page(start))
1622             return -EINVAL;
1623         if (end != vma->vm_end && offset_in_page(end))
1624             return -EINVAL;
1625         if (start != vma->vm_start && end != vma->vm_end) {
1626             ret = split_vma(mm, vma, start, 1);
1627             if (ret < 0)
1628                 return ret;
1629         }
1630         return shrink_vma(mm, vma, start, end);
1631     }
1632 
1633 erase_whole_vma:
1634     delete_vma_from_mm(vma);
1635     delete_vma(mm, vma);
1636     return 0;
1637 }
1638 EXPORT_SYMBOL(do_munmap);
1639 
1640 int vm_munmap(unsigned long addr, size_t len)
1641 {
1642     struct mm_struct *mm = current->mm;
1643     int ret;
1644 
1645     down_write(&mm->mmap_sem);
1646     ret = do_munmap(mm, addr, len);
1647     up_write(&mm->mmap_sem);
1648     return ret;
1649 }
1650 EXPORT_SYMBOL(vm_munmap);
1651 
1652 SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
1653 {
1654     return vm_munmap(addr, len);
1655 }
1656 
1657 /*
1658  * release all the mappings made in a process's VM space
1659  */
1660 void exit_mmap(struct mm_struct *mm)
1661 {
1662     struct vm_area_struct *vma;
1663 
1664     if (!mm)
1665         return;
1666 
1667     mm->total_vm = 0;
1668 
1669     while ((vma = mm->mmap)) {
1670         mm->mmap = vma->vm_next;
1671         delete_vma_from_mm(vma);
1672         delete_vma(mm, vma);
1673         cond_resched();
1674     }
1675 }
1676 
1677 int vm_brk(unsigned long addr, unsigned long len)
1678 {
1679     return -ENOMEM;
1680 }
1681 
1682 /*
1683  * expand (or shrink) an existing mapping, potentially moving it at the same
1684  * time (controlled by the MREMAP_MAYMOVE flag and available VM space)
1685  *
1686  * under NOMMU conditions, we only permit changing a mapping's size, and only
1687  * as long as it stays within the region allocated by do_mmap_private() and the
1688  * block is not shareable
1689  *
1690  * MREMAP_FIXED is not supported under NOMMU conditions
1691  */
1692 static unsigned long do_mremap(unsigned long addr,
1693             unsigned long old_len, unsigned long new_len,
1694             unsigned long flags, unsigned long new_addr)
1695 {
1696     struct vm_area_struct *vma;
1697 
1698     /* insanity checks first */
1699     old_len = PAGE_ALIGN(old_len);
1700     new_len = PAGE_ALIGN(new_len);
1701     if (old_len == 0 || new_len == 0)
1702         return (unsigned long) -EINVAL;
1703 
1704     if (offset_in_page(addr))
1705         return -EINVAL;
1706 
1707     if (flags & MREMAP_FIXED && new_addr != addr)
1708         return (unsigned long) -EINVAL;
1709 
1710     vma = find_vma_exact(current->mm, addr, old_len);
1711     if (!vma)
1712         return (unsigned long) -EINVAL;
1713 
1714     if (vma->vm_end != vma->vm_start + old_len)
1715         return (unsigned long) -EFAULT;
1716 
1717     if (vma->vm_flags & VM_MAYSHARE)
1718         return (unsigned long) -EPERM;
1719 
1720     if (new_len > vma->vm_region->vm_end - vma->vm_region->vm_start)
1721         return (unsigned long) -ENOMEM;
1722 
1723     /* all checks complete - do it */
1724     vma->vm_end = vma->vm_start + new_len;
1725     return vma->vm_start;
1726 }
1727 
1728 SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
1729         unsigned long, new_len, unsigned long, flags,
1730         unsigned long, new_addr)
1731 {
1732     unsigned long ret;
1733 
1734     down_write(&current->mm->mmap_sem);
1735     ret = do_mremap(addr, old_len, new_len, flags, new_addr);
1736     up_write(&current->mm->mmap_sem);
1737     return ret;
1738 }
1739 
1740 struct page *follow_page_mask(struct vm_area_struct *vma,
1741                   unsigned long address, unsigned int flags,
1742                   unsigned int *page_mask)
1743 {
1744     *page_mask = 0;
1745     return NULL;
1746 }
1747 
1748 int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
1749         unsigned long pfn, unsigned long size, pgprot_t prot)
1750 {
1751     if (addr != (pfn << PAGE_SHIFT))
1752         return -EINVAL;
1753 
1754     vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
1755     return 0;
1756 }
1757 EXPORT_SYMBOL(remap_pfn_range);
1758 
1759 int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len)
1760 {
1761     unsigned long pfn = start >> PAGE_SHIFT;
1762     unsigned long vm_len = vma->vm_end - vma->vm_start;
1763 
1764     pfn += vma->vm_pgoff;
1765     return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot);
1766 }
1767 EXPORT_SYMBOL(vm_iomap_memory);
1768 
1769 int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
1770             unsigned long pgoff)
1771 {
1772     unsigned int size = vma->vm_end - vma->vm_start;
1773 
1774     if (!(vma->vm_flags & VM_USERMAP))
1775         return -EINVAL;
1776 
1777     vma->vm_start = (unsigned long)(addr + (pgoff << PAGE_SHIFT));
1778     vma->vm_end = vma->vm_start + size;
1779 
1780     return 0;
1781 }
1782 EXPORT_SYMBOL(remap_vmalloc_range);
1783 
1784 unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr,
1785     unsigned long len, unsigned long pgoff, unsigned long flags)
1786 {
1787     return -ENOMEM;
1788 }
1789 
1790 void unmap_mapping_range(struct address_space *mapping,
1791              loff_t const holebegin, loff_t const holelen,
1792              int even_cows)
1793 {
1794 }
1795 EXPORT_SYMBOL(unmap_mapping_range);
1796 
1797 int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1798 {
1799     BUG();
1800     return 0;
1801 }
1802 EXPORT_SYMBOL(filemap_fault);
1803 
1804 void filemap_map_pages(struct vm_fault *vmf,
1805         pgoff_t start_pgoff, pgoff_t end_pgoff)
1806 {
1807     BUG();
1808 }
1809 EXPORT_SYMBOL(filemap_map_pages);
1810 
1811 int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
1812         unsigned long addr, void *buf, int len, unsigned int gup_flags)
1813 {
1814     struct vm_area_struct *vma;
1815     int write = gup_flags & FOLL_WRITE;
1816 
1817     down_read(&mm->mmap_sem);
1818 
1819     /* the access must start within one of the target process's mappings */
1820     vma = find_vma(mm, addr);
1821     if (vma) {
1822         /* don't overrun this mapping */
1823         if (addr + len >= vma->vm_end)
1824             len = vma->vm_end - addr;
1825 
1826         /* only read or write mappings where it is permitted */
1827         if (write && vma->vm_flags & VM_MAYWRITE)
1828             copy_to_user_page(vma, NULL, addr,
1829                      (void *) addr, buf, len);
1830         else if (!write && vma->vm_flags & VM_MAYREAD)
1831             copy_from_user_page(vma, NULL, addr,
1832                         buf, (void *) addr, len);
1833         else
1834             len = 0;
1835     } else {
1836         len = 0;
1837     }
1838 
1839     up_read(&mm->mmap_sem);
1840 
1841     return len;
1842 }
1843 
1844 /**
1845  * @access_remote_vm - access another process' address space
1846  * @mm:     the mm_struct of the target address space
1847  * @addr:   start address to access
1848  * @buf:    source or destination buffer
1849  * @len:    number of bytes to transfer
1850  * @gup_flags:  flags modifying lookup behaviour
1851  *
1852  * The caller must hold a reference on @mm.
1853  */
1854 int access_remote_vm(struct mm_struct *mm, unsigned long addr,
1855         void *buf, int len, unsigned int gup_flags)
1856 {
1857     return __access_remote_vm(NULL, mm, addr, buf, len, gup_flags);
1858 }
1859 
1860 /*
1861  * Access another process' address space.
1862  * - source/target buffer must be kernel space
1863  */
1864 int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len,
1865         unsigned int gup_flags)
1866 {
1867     struct mm_struct *mm;
1868 
1869     if (addr + len < addr)
1870         return 0;
1871 
1872     mm = get_task_mm(tsk);
1873     if (!mm)
1874         return 0;
1875 
1876     len = __access_remote_vm(tsk, mm, addr, buf, len, gup_flags);
1877 
1878     mmput(mm);
1879     return len;
1880 }
1881 EXPORT_SYMBOL_GPL(access_process_vm);
1882 
1883 /**
1884  * nommu_shrink_inode_mappings - Shrink the shared mappings on an inode
1885  * @inode: The inode to check
1886  * @size: The current filesize of the inode
1887  * @newsize: The proposed filesize of the inode
1888  *
1889  * Check the shared mappings on an inode on behalf of a shrinking truncate to
1890  * make sure that that any outstanding VMAs aren't broken and then shrink the
1891  * vm_regions that extend that beyond so that do_mmap_pgoff() doesn't
1892  * automatically grant mappings that are too large.
1893  */
1894 int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
1895                 size_t newsize)
1896 {
1897     struct vm_area_struct *vma;
1898     struct vm_region *region;
1899     pgoff_t low, high;
1900     size_t r_size, r_top;
1901 
1902     low = newsize >> PAGE_SHIFT;
1903     high = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
1904 
1905     down_write(&nommu_region_sem);
1906     i_mmap_lock_read(inode->i_mapping);
1907 
1908     /* search for VMAs that fall within the dead zone */
1909     vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, low, high) {
1910         /* found one - only interested if it's shared out of the page
1911          * cache */
1912         if (vma->vm_flags & VM_SHARED) {
1913             i_mmap_unlock_read(inode->i_mapping);
1914             up_write(&nommu_region_sem);
1915             return -ETXTBSY; /* not quite true, but near enough */
1916         }
1917     }
1918 
1919     /* reduce any regions that overlap the dead zone - if in existence,
1920      * these will be pointed to by VMAs that don't overlap the dead zone
1921      *
1922      * we don't check for any regions that start beyond the EOF as there
1923      * shouldn't be any
1924      */
1925     vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, 0, ULONG_MAX) {
1926         if (!(vma->vm_flags & VM_SHARED))
1927             continue;
1928 
1929         region = vma->vm_region;
1930         r_size = region->vm_top - region->vm_start;
1931         r_top = (region->vm_pgoff << PAGE_SHIFT) + r_size;
1932 
1933         if (r_top > newsize) {
1934             region->vm_top -= r_top - newsize;
1935             if (region->vm_end > region->vm_top)
1936                 region->vm_end = region->vm_top;
1937         }
1938     }
1939 
1940     i_mmap_unlock_read(inode->i_mapping);
1941     up_write(&nommu_region_sem);
1942     return 0;
1943 }
1944 
1945 /*
1946  * Initialise sysctl_user_reserve_kbytes.
1947  *
1948  * This is intended to prevent a user from starting a single memory hogging
1949  * process, such that they cannot recover (kill the hog) in OVERCOMMIT_NEVER
1950  * mode.
1951  *
1952  * The default value is min(3% of free memory, 128MB)
1953  * 128MB is enough to recover with sshd/login, bash, and top/kill.
1954  */
1955 static int __meminit init_user_reserve(void)
1956 {
1957     unsigned long free_kbytes;
1958 
1959     free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
1960 
1961     sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17);
1962     return 0;
1963 }
1964 subsys_initcall(init_user_reserve);
1965 
1966 /*
1967  * Initialise sysctl_admin_reserve_kbytes.
1968  *
1969  * The purpose of sysctl_admin_reserve_kbytes is to allow the sys admin
1970  * to log in and kill a memory hogging process.
1971  *
1972  * Systems with more than 256MB will reserve 8MB, enough to recover
1973  * with sshd, bash, and top in OVERCOMMIT_GUESS. Smaller systems will
1974  * only reserve 3% of free pages by default.
1975  */
1976 static int __meminit init_admin_reserve(void)
1977 {
1978     unsigned long free_kbytes;
1979 
1980     free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
1981 
1982     sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13);
1983     return 0;
1984 }
1985 subsys_initcall(init_admin_reserve);