0001 /*
0002  *  linux/mm/mlock.c
0003  *
0004  *  (C) Copyright 1995 Linus Torvalds
0005  *  (C) Copyright 2002 Christoph Hellwig
0006  */
0007 
0008 #include <linux/capability.h>
0009 #include <linux/mman.h>
0010 #include <linux/mm.h>
0011 #include <linux/swap.h>
0012 #include <linux/swapops.h>
0013 #include <linux/pagemap.h>
0014 #include <linux/pagevec.h>
0015 #include <linux/mempolicy.h>
0016 #include <linux/syscalls.h>
0017 #include <linux/sched.h>
0018 #include <linux/export.h>
0019 #include <linux/rmap.h>
0020 #include <linux/mmzone.h>
0021 #include <linux/hugetlb.h>
0022 #include <linux/memcontrol.h>
0023 #include <linux/mm_inline.h>
0024 
0025 #include "internal.h"
0026 
0027 bool can_do_mlock(void)
0028 {
0029     if (rlimit(RLIMIT_MEMLOCK) != 0)
0030         return true;
0031     if (capable(CAP_IPC_LOCK))
0032         return true;
0033     return false;
0034 }
0035 EXPORT_SYMBOL(can_do_mlock);
0036 
0037 /*
0038  * Mlocked pages are marked with PageMlocked() flag for efficient testing
0039  * in vmscan and, possibly, the fault path; and to support semi-accurate
0040  * statistics.
0041  *
0042  * An mlocked page [PageMlocked(page)] is unevictable.  As such, it will
0043  * be placed on the LRU "unevictable" list, rather than the [in]active lists.
0044  * The unevictable list is an LRU sibling list to the [in]active lists.
0045  * PageUnevictable is set to indicate the unevictable state.
0046  *
0047  * When lazy mlocking via vmscan, it is important to ensure that the
0048  * vma's VM_LOCKED status is not concurrently being modified, otherwise we
0049  * may have mlocked a page that is being munlocked. So lazy mlock must take
0050  * the mmap_sem for read, and verify that the vma really is locked
0051  * (see mm/rmap.c).
0052  */
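
/*
 * Illustrative sketch of the lifecycle described above (assumes the common
 * mlock()/munlock() sequence; details vary by caller):
 *
 *   mlock(addr, len)
 *     -> mlock_fixup() sets VM_LOCKED on the vma(s)
 *     -> __mm_populate() faults pages in; mlock_vma_page() sets PageMlocked
 *        and the pages end up on the unevictable LRU
 *
 *   munlock(addr, len) / munmap() / exit()
 *     -> munlock_vma_pages_range() clears VM_LOCKED
 *     -> munlock_vma_page() / __munlock_pagevec() clear PageMlocked and use
 *        try_to_munlock() to check whether another VM_LOCKED vma still maps
 *        the page before it may become evictable again
 */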
0053 
0054 /*
0055  *  LRU accounting for clear_page_mlock()
0056  */
0057 void clear_page_mlock(struct page *page)
0058 {
0059     if (!TestClearPageMlocked(page))
0060         return;
0061 
0062     mod_zone_page_state(page_zone(page), NR_MLOCK,
0063                 -hpage_nr_pages(page));
0064     count_vm_event(UNEVICTABLE_PGCLEARED);
0065     if (!isolate_lru_page(page)) {
0066         putback_lru_page(page);
0067     } else {
0068         /*
0069          * We lost the race. The page already moved to the evictable list.
0070          */
0071         if (PageUnevictable(page))
0072             count_vm_event(UNEVICTABLE_PGSTRANDED);
0073     }
0074 }
0075 
0076 /*
0077  * Mark page as mlocked if not already.
0078  * If page on LRU, isolate and putback to move to unevictable list.
0079  */
0080 void mlock_vma_page(struct page *page)
0081 {
0082     /* Serialize with page migration */
0083     BUG_ON(!PageLocked(page));
0084 
0085     VM_BUG_ON_PAGE(PageTail(page), page);
0086     VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page);
0087 
0088     if (!TestSetPageMlocked(page)) {
0089         mod_zone_page_state(page_zone(page), NR_MLOCK,
0090                     hpage_nr_pages(page));
0091         count_vm_event(UNEVICTABLE_PGMLOCKED);
0092         if (!isolate_lru_page(page))
0093             putback_lru_page(page);
0094     }
0095 }
0096 
0097 /*
0098  * Isolate a page from LRU with optional get_page() pin.
0099  * Assumes lru_lock already held and page already pinned.
0100  */
0101 static bool __munlock_isolate_lru_page(struct page *page, bool getpage)
0102 {
0103     if (PageLRU(page)) {
0104         struct lruvec *lruvec;
0105 
0106         lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
0107         if (getpage)
0108             get_page(page);
0109         ClearPageLRU(page);
0110         del_page_from_lru_list(page, lruvec, page_lru(page));
0111         return true;
0112     }
0113 
0114     return false;
0115 }
0116 
0117 /*
0118  * Finish munlock after successful page isolation
0119  *
0120  * Page must be locked. This is a wrapper for try_to_munlock()
0121  * and putback_lru_page() with munlock accounting.
0122  */
0123 static void __munlock_isolated_page(struct page *page)
0124 {
0125     int ret = SWAP_AGAIN;
0126 
0127     /*
0128      * Optimization: if the page was mapped just once, that's our mapping
0129      * and we don't need to check all the other vmas.
0130      */
0131     if (page_mapcount(page) > 1)
0132         ret = try_to_munlock(page);
0133 
0134     /* Did try_to_munlock() succeed or punt? */
0135     if (ret != SWAP_MLOCK)
0136         count_vm_event(UNEVICTABLE_PGMUNLOCKED);
0137 
0138     putback_lru_page(page);
0139 }
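
/*
 * Note on the check above (descriptive, based on this kernel's rmap code):
 * try_to_munlock() walks the page's rmap and returns SWAP_MLOCK when it finds
 * another VM_LOCKED vma still mapping the page (re-mlocking it on the spot);
 * only the other return values count as a successful munlock here.
 */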
0140 
0141 /*
0142  * Accounting for page isolation fail during munlock
0143  *
0144  * Performs accounting when page isolation fails in munlock. There is nothing
0145  * else to do because it means some other task has already removed the page
0146  * from the LRU. putback_lru_page() will take care of removing the page from
0147  * the unevictable list, if necessary. vmscan [page_referenced()] will move
0148  * the page back to the unevictable list if some other vma has it mlocked.
0149  */
0150 static void __munlock_isolation_failed(struct page *page)
0151 {
0152     if (PageUnevictable(page))
0153         __count_vm_event(UNEVICTABLE_PGSTRANDED);
0154     else
0155         __count_vm_event(UNEVICTABLE_PGMUNLOCKED);
0156 }
0157 
0158 /**
0159  * munlock_vma_page - munlock a vma page
0160  * @page: page to be unlocked, either a normal page or THP page head
0161  *
0162  * returns the size of the page as a page mask (0 for normal page,
0163  *         HPAGE_PMD_NR - 1 for THP head page)
0164  *
0165  * called from munlock()/munmap() path with page supposedly on the LRU.
0166  * When we munlock a page, because the vma where we found the page is being
0167  * munlock()ed or munmap()ed, we want to check whether other vmas hold the
0168  * page locked so that we can leave it on the unevictable lru list and not
0169  * bother vmscan with it.  However, to walk the page's rmap list in
0170  * try_to_munlock() we must isolate the page from the LRU.  If some other
0171  * task has removed the page from the LRU, we won't be able to do that.
0172  * So we clear the PageMlocked as we might not get another chance.  If we
0173  * can't isolate the page, we leave it for putback_lru_page() and vmscan
0174  * [page_referenced()/try_to_unmap()] to deal with.
0175  */
0176 unsigned int munlock_vma_page(struct page *page)
0177 {
0178     int nr_pages;
0179     struct zone *zone = page_zone(page);
0180 
0181     /* For try_to_munlock() and to serialize with page migration */
0182     BUG_ON(!PageLocked(page));
0183 
0184     VM_BUG_ON_PAGE(PageTail(page), page);
0185 
0186     /*
0187      * Serialize with any parallel __split_huge_page_refcount() which
0188      * might otherwise copy PageMlocked to part of the tail pages before
0189      * we clear it in the head page. It also stabilizes hpage_nr_pages().
0190      */
0191     spin_lock_irq(zone_lru_lock(zone));
0192 
0193     if (!TestClearPageMlocked(page)) {
0194         /* Potentially a PTE-mapped THP: do not skip the remaining PTEs */
0195         nr_pages = 1;
0196         goto unlock_out;
0197     }
0198 
0199     nr_pages = hpage_nr_pages(page);
0200     __mod_zone_page_state(zone, NR_MLOCK, -nr_pages);
0201 
0202     if (__munlock_isolate_lru_page(page, true)) {
0203         spin_unlock_irq(zone_lru_lock(zone));
0204         __munlock_isolated_page(page);
0205         goto out;
0206     }
0207     __munlock_isolation_failed(page);
0208 
0209 unlock_out:
0210     spin_unlock_irq(zone_lru_lock(zone));
0211 
0212 out:
0213     return nr_pages - 1;
0214 }
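
/*
 * Worked example for the return value above (illustrative; assumes 4KB base
 * pages and 2MB THP, i.e. HPAGE_PMD_NR == 512):
 *
 *   normal page:           nr_pages = 1,   returns 0   -> caller advances by
 *                                                         one page
 *   intact THP head page:  nr_pages = 512, returns 511 -> caller advances by
 *                                                         512 pages (see
 *                                                         page_increm below)
 */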
0215 
0216 /*
0217  * convert get_user_pages() return value to posix mlock() error
0218  */
0219 static int __mlock_posix_error_return(long retval)
0220 {
0221     if (retval == -EFAULT)
0222         retval = -ENOMEM;
0223     else if (retval == -ENOMEM)
0224         retval = -EAGAIN;
0225     return retval;
0226 }
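
/*
 * Background for the mapping above (descriptive): POSIX has mlock() fail with
 * ENOMEM when part of the range is not mapped and with EAGAIN when the pages
 * could not be locked, while __mm_populate()/get_user_pages() report those
 * conditions as -EFAULT and -ENOMEM respectively.
 */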
0227 
0228 /*
0229  * Prepare page for fast batched LRU putback via __putback_lru_fast()
0230  *
0231  * The fast path is available only for evictable pages with a single mapping.
0232  * Then we can bypass the per-cpu pvec and get better performance.
0233  * When mapcount > 1 we need try_to_munlock(), which can fail.
0234  * When !page_evictable(), we need the full redo logic of putback_lru_page to
0235  * avoid leaving an evictable page on the unevictable list.
0236  *
0237  * In case of success, @page is added to @pvec and @pgrescued is incremented
0238  * in case that the page was previously unevictable. @page is also unlocked.
0239  */
0240 static bool __putback_lru_fast_prepare(struct page *page, struct pagevec *pvec,
0241         int *pgrescued)
0242 {
0243     VM_BUG_ON_PAGE(PageLRU(page), page);
0244     VM_BUG_ON_PAGE(!PageLocked(page), page);
0245 
0246     if (page_mapcount(page) <= 1 && page_evictable(page)) {
0247         pagevec_add(pvec, page);
0248         if (TestClearPageUnevictable(page))
0249             (*pgrescued)++;
0250         unlock_page(page);
0251         return true;
0252     }
0253 
0254     return false;
0255 }
0256 
0257 /*
0258  * Putback multiple evictable pages to the LRU
0259  *
0260  * Batched putback of evictable pages that bypasses the per-cpu pvec. Some of
0261  * the pages might have meanwhile become unevictable but that is OK.
0262  */
0263 static void __putback_lru_fast(struct pagevec *pvec, int pgrescued)
0264 {
0265     count_vm_events(UNEVICTABLE_PGMUNLOCKED, pagevec_count(pvec));
0266     /*
0267      * __pagevec_lru_add() calls release_pages() so we don't call
0268      * put_page() explicitly
0269      */
0270     __pagevec_lru_add(pvec);
0271     count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
0272 }
0273 
0274 /*
0275  * Munlock a batch of pages from the same zone
0276  *
0277  * The work is split into two main phases. The first phase clears the Mlocked flag
0278  * and attempts to isolate the pages, all under a single zone lru lock.
0279  * The second phase finishes the munlock only for pages where isolation
0280  * succeeded.
0281  *
0282  * Note that the pagevec may be modified during the process.
0283  */
0284 static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
0285 {
0286     int i;
0287     int nr = pagevec_count(pvec);
0288     int delta_munlocked;
0289     struct pagevec pvec_putback;
0290     int pgrescued = 0;
0291 
0292     pagevec_init(&pvec_putback, 0);
0293 
0294     /* Phase 1: page isolation */
0295     spin_lock_irq(zone_lru_lock(zone));
0296     for (i = 0; i < nr; i++) {
0297         struct page *page = pvec->pages[i];
0298 
0299         if (TestClearPageMlocked(page)) {
0300             /*
0301              * We already have pin from follow_page_mask()
0302              * so we can spare the get_page() here.
0303              */
0304             if (__munlock_isolate_lru_page(page, false))
0305                 continue;
0306             else
0307                 __munlock_isolation_failed(page);
0308         }
0309 
0310         /*
0311          * We won't be munlocking this page in the next phase
0312          * but we still need to release the follow_page_mask()
0313          * pin. We cannot do it under lru_lock however. If it's
0314          * the last pin, __page_cache_release() would deadlock.
0315          */
0316         pagevec_add(&pvec_putback, pvec->pages[i]);
0317         pvec->pages[i] = NULL;
0318     }
0319     delta_munlocked = -nr + pagevec_count(&pvec_putback);
0320     __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked);
0321     spin_unlock_irq(zone_lru_lock(zone));
0322 
0323     /* Now we can release pins of pages that we are not munlocking */
0324     pagevec_release(&pvec_putback);
0325 
0326     /* Phase 2: page munlock */
0327     for (i = 0; i < nr; i++) {
0328         struct page *page = pvec->pages[i];
0329 
0330         if (page) {
0331             lock_page(page);
0332             if (!__putback_lru_fast_prepare(page, &pvec_putback,
0333                     &pgrescued)) {
0334                 /*
0335                  * Slow path. We don't want to lose the last
0336                  * pin before unlock_page()
0337                  */
0338                 get_page(page); /* for putback_lru_page() */
0339                 __munlock_isolated_page(page);
0340                 unlock_page(page);
0341                 put_page(page); /* from follow_page_mask() */
0342             }
0343         }
0344     }
0345 
0346     /*
0347      * Phase 3: page putback for pages that qualified for the fast path
0348      * This will also call put_page() to return pin from follow_page_mask()
0349      */
0350     if (pagevec_count(&pvec_putback))
0351         __putback_lru_fast(&pvec_putback, pgrescued);
0352 }
0353 
0354 /*
0355  * Fill up pagevec for __munlock_pagevec using pte walk
0356  *
0357  * The function expects that the struct page corresponding to @start address is
0358  * a non-THP page already pinned and in the @pvec, and that it belongs to @zone.
0359  *
0360  * The rest of @pvec is filled by subsequent pages within the same pmd and same
0361  * zone, as long as the ptes are present and vm_normal_page() succeeds. These
0362  * pages also get pinned.
0363  *
0364  * Returns the address of the next page that should be scanned. This equals
0365  * @start + PAGE_SIZE when no page could be added by the pte walk.
0366  */
0367 static unsigned long __munlock_pagevec_fill(struct pagevec *pvec,
0368         struct vm_area_struct *vma, int zoneid, unsigned long start,
0369         unsigned long end)
0370 {
0371     pte_t *pte;
0372     spinlock_t *ptl;
0373 
0374     /*
0375      * Initialize pte walk starting at the already pinned page where we
0376      * are sure that there is a pte, as it was pinned under the same
0377      * mmap_sem write op.
0378      */
0379     pte = get_locked_pte(vma->vm_mm, start, &ptl);
0380     /* Make sure we do not cross the page table boundary */
0381     end = pgd_addr_end(start, end);
0382     end = pud_addr_end(start, end);
0383     end = pmd_addr_end(start, end);
0384 
0385     /* The page next to the pinned page is the first we will try to get */
0386     start += PAGE_SIZE;
0387     while (start < end) {
0388         struct page *page = NULL;
0389         pte++;
0390         if (pte_present(*pte))
0391             page = vm_normal_page(vma, start, *pte);
0392         /*
0393          * Break if page could not be obtained or the page's node+zone does not
0394          * match
0395          */
0396         if (!page || page_zone_id(page) != zoneid)
0397             break;
0398 
0399         /*
0400          * Do not use pagevec for PTE-mapped THP,
0401          * munlock_vma_pages_range() will handle them.
0402          */
0403         if (PageTransCompound(page))
0404             break;
0405 
0406         get_page(page);
0407         /*
0408          * Increase the address that will be returned *before* the
0409          * eventual break due to pvec becoming full by adding the page
0410          */
0411         start += PAGE_SIZE;
0412         if (pagevec_add(pvec, page) == 0)
0413             break;
0414     }
0415     pte_unmap_unlock(pte, ptl);
0416     return start;
0417 }
0418 
0419 /*
0420  * munlock_vma_pages_range() - munlock all pages in the vma range.
0421  * @vma - vma containing range to be munlock()ed.
0422  * @start - start address in @vma of the range
0423  * @end - end of range in @vma.
0424  *
0425  *  For mremap(), munmap() and exit().
0426  *
0427  * Called with @vma VM_LOCKED.
0428  *
0429  * Returns with VM_LOCKED cleared.  Callers must be prepared to
0430  * deal with this.
0431  *
0432  * We don't save and restore VM_LOCKED here because pages are
0433  * still on the lru.  In the unmap path, pages might otherwise be scanned
0434  * by reclaim and re-mlocked by try_to_{munlock|unmap} before we unmap
0435  * and free them, which would result in freeing mlocked pages.
0436  */
0437 void munlock_vma_pages_range(struct vm_area_struct *vma,
0438                  unsigned long start, unsigned long end)
0439 {
0440     vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
0441 
0442     while (start < end) {
0443         struct page *page;
0444         unsigned int page_mask;
0445         unsigned long page_increm;
0446         struct pagevec pvec;
0447         struct zone *zone;
0448         int zoneid;
0449 
0450         pagevec_init(&pvec, 0);
0451         /*
0452          * Although FOLL_DUMP is intended for get_dump_page(),
0453          * it just so happens that its special treatment of the
0454          * ZERO_PAGE (returning an error instead of doing get_page)
0455          * suits munlock very well (and if somehow an abnormal page
0456          * has sneaked into the range, we won't oops here: great).
0457          */
0458         page = follow_page_mask(vma, start, FOLL_GET | FOLL_DUMP,
0459                 &page_mask);
0460 
0461         if (page && !IS_ERR(page)) {
0462             if (PageTransTail(page)) {
0463                 VM_BUG_ON_PAGE(PageMlocked(page), page);
0464                 put_page(page); /* follow_page_mask() */
0465             } else if (PageTransHuge(page)) {
0466                 lock_page(page);
0467                 /*
0468                  * Any THP page found by follow_page_mask() may
0469                  * have gotten split before reaching
0470                  * munlock_vma_page(), so we need to recompute
0471                  * the page_mask here.
0472                  */
0473                 page_mask = munlock_vma_page(page);
0474                 unlock_page(page);
0475                 put_page(page); /* follow_page_mask() */
0476             } else {
0477                 /*
0478                  * Non-huge pages are handled in batches via
0479                  * pagevec. The pin from follow_page_mask()
0480                  * prevents them from being collapsed into a THP.
0481                  */
0482                 pagevec_add(&pvec, page);
0483                 zone = page_zone(page);
0484                 zoneid = page_zone_id(page);
0485 
0486                 /*
0487                  * Try to fill the rest of pagevec using fast
0488                  * pte walk. This will also update start to
0489                  * the next page to process. Then munlock the
0490                  * pagevec.
0491                  */
0492                 start = __munlock_pagevec_fill(&pvec, vma,
0493                         zoneid, start, end);
0494                 __munlock_pagevec(&pvec, zone);
0495                 goto next;
0496             }
0497         }
0498         page_increm = 1 + page_mask;
0499         start += page_increm * PAGE_SIZE;
0500 next:
0501         cond_resched();
0502     }
0503 }
0504 
0505 /*
0506  * mlock_fixup  - handle mlock[all]/munlock[all] requests.
0507  *
0508  * Filters out "special" vmas -- VM_LOCKED never gets set for these, and
0509  * munlock is a no-op.  However, for some special vmas, we go ahead and
0510  * populate the ptes.
0511  *
0512  * For vmas that pass the filters, merge/split as appropriate.
0513  */
0514 static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
0515     unsigned long start, unsigned long end, vm_flags_t newflags)
0516 {
0517     struct mm_struct *mm = vma->vm_mm;
0518     pgoff_t pgoff;
0519     int nr_pages;
0520     int ret = 0;
0521     int lock = !!(newflags & VM_LOCKED);
0522     vm_flags_t old_flags = vma->vm_flags;
0523 
0524     if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) ||
0525         is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm))
0526         /* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */
0527         goto out;
0528 
0529     pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
0530     *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
0531               vma->vm_file, pgoff, vma_policy(vma),
0532               vma->vm_userfaultfd_ctx);
0533     if (*prev) {
0534         vma = *prev;
0535         goto success;
0536     }
0537 
0538     if (start != vma->vm_start) {
0539         ret = split_vma(mm, vma, start, 1);
0540         if (ret)
0541             goto out;
0542     }
0543 
0544     if (end != vma->vm_end) {
0545         ret = split_vma(mm, vma, end, 0);
0546         if (ret)
0547             goto out;
0548     }
0549 
0550 success:
0551     /*
0552      * Keep track of amount of locked VM.
0553      */
0554     nr_pages = (end - start) >> PAGE_SHIFT;
0555     if (!lock)
0556         nr_pages = -nr_pages;
0557     else if (old_flags & VM_LOCKED)
0558         nr_pages = 0;
0559     mm->locked_vm += nr_pages;
0560 
0561     /*
0562      * vm_flags is protected by the mmap_sem held in write mode.
0563      * It's okay if try_to_unmap_one unmaps a page just after we
0564      * set VM_LOCKED; populate_vma_page_range() will bring it back.
0565      */
0566 
0567     if (lock)
0568         vma->vm_flags = newflags;
0569     else
0570         munlock_vma_pages_range(vma, start, end);
0571 
0572 out:
0573     *prev = vma;
0574     return ret;
0575 }
0576 
0577 static int apply_vma_lock_flags(unsigned long start, size_t len,
0578                 vm_flags_t flags)
0579 {
0580     unsigned long nstart, end, tmp;
0581     struct vm_area_struct * vma, * prev;
0582     int error;
0583 
0584     VM_BUG_ON(offset_in_page(start));
0585     VM_BUG_ON(len != PAGE_ALIGN(len));
0586     end = start + len;
0587     if (end < start)
0588         return -EINVAL;
0589     if (end == start)
0590         return 0;
0591     vma = find_vma(current->mm, start);
0592     if (!vma || vma->vm_start > start)
0593         return -ENOMEM;
0594 
0595     prev = vma->vm_prev;
0596     if (start > vma->vm_start)
0597         prev = vma;
0598 
0599     for (nstart = start ; ; ) {
0600         vm_flags_t newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
0601 
0602         newflags |= flags;
0603 
0604         /* Here we know that  vma->vm_start <= nstart < vma->vm_end. */
0605         tmp = vma->vm_end;
0606         if (tmp > end)
0607             tmp = end;
0608         error = mlock_fixup(vma, &prev, nstart, tmp, newflags);
0609         if (error)
0610             break;
0611         nstart = tmp;
0612         if (nstart < prev->vm_end)
0613             nstart = prev->vm_end;
0614         if (nstart >= end)
0615             break;
0616 
0617         vma = prev->vm_next;
0618         if (!vma || vma->vm_start != nstart) {
0619             error = -ENOMEM;
0620             break;
0621         }
0622     }
0623     return error;
0624 }
0625 
0626 /*
0627  * Go through the vma areas and sum the size of the mlocked
0628  * vma pages, as the return value.
0629  * Note that the deferred memory locking case (mlock2(,,MLOCK_ONFAULT))
0630  * is also counted.
0631  * Return value: count of previously mlocked pages
0632  */
0633 static int count_mm_mlocked_page_nr(struct mm_struct *mm,
0634         unsigned long start, size_t len)
0635 {
0636     struct vm_area_struct *vma;
0637     int count = 0;
0638 
0639     if (mm == NULL)
0640         mm = current->mm;
0641 
0642     vma = find_vma(mm, start);
0643     if (vma == NULL)
0644         vma = mm->mmap;
0645 
0646     for (; vma ; vma = vma->vm_next) {
0647         if (start >= vma->vm_end)
0648             continue;
0649         if (start + len <=  vma->vm_start)
0650             break;
0651         if (vma->vm_flags & VM_LOCKED) {
0652             if (start > vma->vm_start)
0653                 count -= (start - vma->vm_start);
0654             if (start + len < vma->vm_end) {
0655                 count += start + len - vma->vm_start;
0656                 break;
0657             }
0658             count += vma->vm_end - vma->vm_start;
0659         }
0660     }
0661 
0662     return count >> PAGE_SHIFT;
0663 }
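
/*
 * Worked example (illustrative, 4KB pages): one VM_LOCKED vma covers
 * [0x1000, 0x9000) and the request is start = 0x3000, len = 0x4000:
 *
 *   count -= 0x3000 - 0x1000;              count = -0x2000
 *   count += 0x3000 + 0x4000 - 0x1000;     count =  0x4000
 *
 * i.e. the 0x4000-byte overlap, so 4 pages are reported as already mlocked.
 */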
0664 
0665 static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t flags)
0666 {
0667     unsigned long locked;
0668     unsigned long lock_limit;
0669     int error = -ENOMEM;
0670 
0671     if (!can_do_mlock())
0672         return -EPERM;
0673 
0674     lru_add_drain_all();    /* flush pagevec */
0675 
0676     len = PAGE_ALIGN(len + (offset_in_page(start)));
0677     start &= PAGE_MASK;
0678 
0679     lock_limit = rlimit(RLIMIT_MEMLOCK);
0680     lock_limit >>= PAGE_SHIFT;
0681     locked = len >> PAGE_SHIFT;
0682 
0683     if (down_write_killable(&current->mm->mmap_sem))
0684         return -EINTR;
0685 
0686     locked += current->mm->locked_vm;
0687     if ((locked > lock_limit) && (!capable(CAP_IPC_LOCK))) {
0688         /*
0689          * It is possible that the requested region intersects with
0690          * previously mlocked areas; that overlap is already accounted in
0691          * "mm->locked_vm" and should not be counted again towards the new
0692          * mlock increment. So check and adjust the locked count if necessary.
0693          */
0694         locked -= count_mm_mlocked_page_nr(current->mm,
0695                 start, len);
0696     }
0697 
0698     /* check against resource limits */
0699     if ((locked <= lock_limit) || capable(CAP_IPC_LOCK))
0700         error = apply_vma_lock_flags(start, len, flags);
0701 
0702     up_write(&current->mm->mmap_sem);
0703     if (error)
0704         return error;
0705 
0706     error = __mm_populate(start, len, 0);
0707     if (error)
0708         return __mlock_posix_error_return(error);
0709     return 0;
0710 }
0711 
0712 SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
0713 {
0714     return do_mlock(start, len, VM_LOCKED);
0715 }
0716 
0717 SYSCALL_DEFINE3(mlock2, unsigned long, start, size_t, len, int, flags)
0718 {
0719     vm_flags_t vm_flags = VM_LOCKED;
0720 
0721     if (flags & ~MLOCK_ONFAULT)
0722         return -EINVAL;
0723 
0724     if (flags & MLOCK_ONFAULT)
0725         vm_flags |= VM_LOCKONFAULT;
0726 
0727     return do_mlock(start, len, vm_flags);
0728 }
0729 
0730 SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len)
0731 {
0732     int ret;
0733 
0734     len = PAGE_ALIGN(len + (offset_in_page(start)));
0735     start &= PAGE_MASK;
0736 
0737     if (down_write_killable(&current->mm->mmap_sem))
0738         return -EINTR;
0739     ret = apply_vma_lock_flags(start, len, 0);
0740     up_write(&current->mm->mmap_sem);
0741 
0742     return ret;
0743 }
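
/*
 * Userspace usage sketch for the syscalls above (illustrative; on older C
 * libraries mlock2() may have to be invoked via syscall(__NR_mlock2, ...)):
 *
 *   #include <sys/mman.h>
 *   #include <stdlib.h>
 *
 *   size_t len = 1 << 20;
 *   void *buf = aligned_alloc(4096, len);
 *
 *   mlock(buf, len);                   lock and fault the whole range in now
 *   munlock(buf, len);                 drop VM_LOCKED again
 *   mlock2(buf, len, MLOCK_ONFAULT);   lock pages only as they are faulted
 */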
0744 
0745 /*
0746  * Take the MCL_* flags passed into mlockall (or 0 if called from munlockall)
0747  * and translate into the appropriate modifications to mm->def_flags and/or the
0748  * flags for all current VMAs.
0749  *
0750  * There are a couple of subtleties with this.  If mlockall() is called multiple
0751  * times with different flags, the values do not necessarily stack.  If mlockall
0752  * is called once including the MCL_FUTURE flag and then a second time without
0753  * it, VM_LOCKED and VM_LOCKONFAULT will be cleared from mm->def_flags.
0754  */
0755 static int apply_mlockall_flags(int flags)
0756 {
0757     struct vm_area_struct * vma, * prev = NULL;
0758     vm_flags_t to_add = 0;
0759 
0760     current->mm->def_flags &= VM_LOCKED_CLEAR_MASK;
0761     if (flags & MCL_FUTURE) {
0762         current->mm->def_flags |= VM_LOCKED;
0763 
0764         if (flags & MCL_ONFAULT)
0765             current->mm->def_flags |= VM_LOCKONFAULT;
0766 
0767         if (!(flags & MCL_CURRENT))
0768             goto out;
0769     }
0770 
0771     if (flags & MCL_CURRENT) {
0772         to_add |= VM_LOCKED;
0773         if (flags & MCL_ONFAULT)
0774             to_add |= VM_LOCKONFAULT;
0775     }
0776 
0777     for (vma = current->mm->mmap; vma ; vma = prev->vm_next) {
0778         vm_flags_t newflags;
0779 
0780         newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
0781         newflags |= to_add;
0782 
0783         /* Ignore errors */
0784         mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags);
0785         cond_resched_rcu_qs();
0786     }
0787 out:
0788     return 0;
0789 }
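
/*
 * Illustrative example of the "flags do not stack" behaviour documented above
 * apply_mlockall_flags():
 *
 *   mlockall(MCL_CURRENT | MCL_FUTURE);   VM_LOCKED set in mm->def_flags and
 *                                         applied to all current vmas
 *   mlockall(MCL_CURRENT);                MCL_FUTURE is absent, so VM_LOCKED
 *                                         (and VM_LOCKONFAULT) are cleared
 *                                         from mm->def_flags again; existing
 *                                         vmas remain VM_LOCKED
 */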
0790 
0791 SYSCALL_DEFINE1(mlockall, int, flags)
0792 {
0793     unsigned long lock_limit;
0794     int ret;
0795 
0796     if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT)))
0797         return -EINVAL;
0798 
0799     if (!can_do_mlock())
0800         return -EPERM;
0801 
0802     if (flags & MCL_CURRENT)
0803         lru_add_drain_all();    /* flush pagevec */
0804 
0805     lock_limit = rlimit(RLIMIT_MEMLOCK);
0806     lock_limit >>= PAGE_SHIFT;
0807 
0808     if (down_write_killable(&current->mm->mmap_sem))
0809         return -EINTR;
0810 
0811     ret = -ENOMEM;
0812     if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) ||
0813         capable(CAP_IPC_LOCK))
0814         ret = apply_mlockall_flags(flags);
0815     up_write(&current->mm->mmap_sem);
0816     if (!ret && (flags & MCL_CURRENT))
0817         mm_populate(0, TASK_SIZE);
0818 
0819     return ret;
0820 }
0821 
0822 SYSCALL_DEFINE0(munlockall)
0823 {
0824     int ret;
0825 
0826     if (down_write_killable(&current->mm->mmap_sem))
0827         return -EINTR;
0828     ret = apply_mlockall_flags(0);
0829     up_write(&current->mm->mmap_sem);
0830     return ret;
0831 }
0832 
0833 /*
0834  * Objects whose lifetime differs from that of processes (SHM_LOCK and SHM_HUGETLB
0835  * shm segments) get accounted against the user_struct instead.
0836  */
0837 static DEFINE_SPINLOCK(shmlock_user_lock);
0838 
0839 int user_shm_lock(size_t size, struct user_struct *user)
0840 {
0841     unsigned long lock_limit, locked;
0842     int allowed = 0;
0843 
0844     locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
0845     lock_limit = rlimit(RLIMIT_MEMLOCK);
0846     if (lock_limit == RLIM_INFINITY)
0847         allowed = 1;
0848     lock_limit >>= PAGE_SHIFT;
0849     spin_lock(&shmlock_user_lock);
0850     if (!allowed &&
0851         locked + user->locked_shm > lock_limit && !capable(CAP_IPC_LOCK))
0852         goto out;
0853     get_uid(user);
0854     user->locked_shm += locked;
0855     allowed = 1;
0856 out:
0857     spin_unlock(&shmlock_user_lock);
0858     return allowed;
0859 }
0860 
0861 void user_shm_unlock(size_t size, struct user_struct *user)
0862 {
0863     spin_lock(&shmlock_user_lock);
0864     user->locked_shm -= (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
0865     spin_unlock(&shmlock_user_lock);
0866     free_uid(user);
0867 }