0001 /*
0002  *  linux/mm/swap.c
0003  *
0004  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
0005  */
0006 
0007 /*
0008  * This file contains the default values for the operation of the
0009  * Linux VM subsystem. Fine-tuning documentation can be found in
0010  * Documentation/sysctl/vm.txt.
0011  * Started 18.12.91
0012  * Swap aging added 23.2.95, Stephen Tweedie.
0013  * Buffermem limits added 12.3.98, Rik van Riel.
0014  */
0015 
0016 #include <linux/mm.h>
0017 #include <linux/sched.h>
0018 #include <linux/kernel_stat.h>
0019 #include <linux/swap.h>
0020 #include <linux/mman.h>
0021 #include <linux/pagemap.h>
0022 #include <linux/pagevec.h>
0023 #include <linux/init.h>
0024 #include <linux/export.h>
0025 #include <linux/mm_inline.h>
0026 #include <linux/percpu_counter.h>
0027 #include <linux/memremap.h>
0028 #include <linux/percpu.h>
0029 #include <linux/cpu.h>
0030 #include <linux/notifier.h>
0031 #include <linux/backing-dev.h>
0032 #include <linux/memcontrol.h>
0033 #include <linux/gfp.h>
0034 #include <linux/uio.h>
0035 #include <linux/hugetlb.h>
0036 #include <linux/page_idle.h>
0037 
0038 #include "internal.h"
0039 
0040 #define CREATE_TRACE_POINTS
0041 #include <trace/events/pagemap.h>
0042 
0043 /* How many pages do we try to swap or page in/out together? */
0044 int page_cluster;
0045 
0046 static DEFINE_PER_CPU(struct pagevec, lru_add_pvec);
0047 static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
0048 static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs);
0049 static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs);
0050 #ifdef CONFIG_SMP
0051 static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);
0052 #endif
0053 
0054 /*
0055  * This path almost never happens for VM activity - pages are normally
0056  * freed via pagevecs.  But it gets used by networking.
0057  */
0058 static void __page_cache_release(struct page *page)
0059 {
0060     if (PageLRU(page)) {
0061         struct zone *zone = page_zone(page);
0062         struct lruvec *lruvec;
0063         unsigned long flags;
0064 
0065         spin_lock_irqsave(zone_lru_lock(zone), flags);
0066         lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
0067         VM_BUG_ON_PAGE(!PageLRU(page), page);
0068         __ClearPageLRU(page);
0069         del_page_from_lru_list(page, lruvec, page_off_lru(page));
0070         spin_unlock_irqrestore(zone_lru_lock(zone), flags);
0071     }
0072     __ClearPageWaiters(page);
0073     mem_cgroup_uncharge(page);
0074 }
0075 
0076 static void __put_single_page(struct page *page)
0077 {
0078     __page_cache_release(page);
0079     free_hot_cold_page(page, false);
0080 }
0081 
0082 static void __put_compound_page(struct page *page)
0083 {
0084     compound_page_dtor *dtor;
0085 
0086     /*
0087      * __page_cache_release() is supposed to be called for thp, not for
0088      * hugetlb. This is because a hugetlb page never has PageLRU set
0089      * (it is never placed on any LRU list) and no memcg routines should
0090      * be called for hugetlb (it has a separate hugetlb_cgroup).
0091      */
0092     if (!PageHuge(page))
0093         __page_cache_release(page);
0094     dtor = get_compound_page_dtor(page);
0095     (*dtor)(page);
0096 }
0097 
0098 void __put_page(struct page *page)
0099 {
0100     if (unlikely(PageCompound(page)))
0101         __put_compound_page(page);
0102     else
0103         __put_single_page(page);
0104 }
0105 EXPORT_SYMBOL(__put_page);
0106 
0107 /**
0108  * put_pages_list() - release a list of pages
0109  * @pages: list of pages threaded on page->lru
0110  *
0111  * Release a list of pages which are strung together on page->lru.  Currently
0112  * used by read_cache_pages() and related error recovery code.
0113  */
0114 void put_pages_list(struct list_head *pages)
0115 {
0116     while (!list_empty(pages)) {
0117         struct page *victim;
0118 
0119         victim = list_entry(pages->prev, struct page, lru);
0120         list_del(&victim->lru);
0121         put_page(victim);
0122     }
0123 }
0124 EXPORT_SYMBOL(put_pages_list);
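
/*
 * Illustrative sketch (not part of the original file): readahead-style
 * code builds a list of pages threaded on page->lru and hands whatever is
 * left over to put_pages_list().  example_drop_list() and its use of
 * alloc_page() are hypothetical.
 */
static void example_drop_list(void)
{
    LIST_HEAD(pages);
    struct page *page = alloc_page(GFP_KERNEL);

    if (page)
        list_add(&page->lru, &pages);
    /* ... add more pages, consume some of them ... */
    put_pages_list(&pages);    /* drops one reference on each remaining page */
}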
0125 
0126 /*
0127  * get_kernel_pages() - pin kernel pages in memory
0128  * @kiov:   An array of struct kvec structures
0129  * @nr_segs:    number of segments to pin
0130  * @write:  pinning for read/write, currently ignored
0131  * @pages:  array that receives pointers to the pages pinned.
0132  *      Should be at least nr_segs long.
0133  *
0134  * Returns number of pages pinned. This may be fewer than the number
0135  * requested. If nr_segs is 0 or negative, returns 0. If no pages
0136  * were pinned, returns -errno. Each page returned must be released
0137  * with a put_page() call when it is finished with.
0138  */
0139 int get_kernel_pages(const struct kvec *kiov, int nr_segs, int write,
0140         struct page **pages)
0141 {
0142     int seg;
0143 
0144     for (seg = 0; seg < nr_segs; seg++) {
0145         if (WARN_ON(kiov[seg].iov_len != PAGE_SIZE))
0146             return seg;
0147 
0148         pages[seg] = kmap_to_page(kiov[seg].iov_base);
0149         get_page(pages[seg]);
0150     }
0151 
0152     return seg;
0153 }
0154 EXPORT_SYMBOL_GPL(get_kernel_pages);
0155 
0156 /*
0157  * get_kernel_page() - pin a kernel page in memory
0158  * @start:  starting kernel address
0159  * @write:  pinning for read/write, currently ignored
0160  * @pages:  array that receives pointer to the page pinned.
0161  *      Must have room for at least one page pointer.
0162  *
0163  * Returns 1 if page is pinned. If the page was not pinned, returns
0164  * -errno. The page returned must be released with a put_page() call
0165  * when it is finished with.
0166  */
0167 int get_kernel_page(unsigned long start, int write, struct page **pages)
0168 {
0169     const struct kvec kiov = {
0170         .iov_base = (void *)start,
0171         .iov_len = PAGE_SIZE
0172     };
0173 
0174     return get_kernel_pages(&kiov, 1, write, pages);
0175 }
0176 EXPORT_SYMBOL_GPL(get_kernel_page);
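
/*
 * Illustrative sketch (not part of the original file): pinning the page
 * backing a page-aligned kernel buffer and releasing it again.  The helper
 * name example_pin_kernel_buffer() is hypothetical.
 */
static int example_pin_kernel_buffer(void *buf)    /* buf must be page-aligned */
{
    struct page *page;
    int ret = get_kernel_page((unsigned long)buf, 0, &page);

    if (ret < 1)    /* defensive: the documented failure case */
        return ret ? ret : -EFAULT;
    /* ... hand the page to a bio, scatterlist, etc. ... */
    put_page(page);
    return 0;
}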
0177 
0178 static void pagevec_lru_move_fn(struct pagevec *pvec,
0179     void (*move_fn)(struct page *page, struct lruvec *lruvec, void *arg),
0180     void *arg)
0181 {
0182     int i;
0183     struct pglist_data *pgdat = NULL;
0184     struct lruvec *lruvec;
0185     unsigned long flags = 0;
0186 
0187     for (i = 0; i < pagevec_count(pvec); i++) {
0188         struct page *page = pvec->pages[i];
0189         struct pglist_data *pagepgdat = page_pgdat(page);
0190 
0191         if (pagepgdat != pgdat) {
0192             if (pgdat)
0193                 spin_unlock_irqrestore(&pgdat->lru_lock, flags);
0194             pgdat = pagepgdat;
0195             spin_lock_irqsave(&pgdat->lru_lock, flags);
0196         }
0197 
0198         lruvec = mem_cgroup_page_lruvec(page, pgdat);
0199         (*move_fn)(page, lruvec, arg);
0200     }
0201     if (pgdat)
0202         spin_unlock_irqrestore(&pgdat->lru_lock, flags);
0203     release_pages(pvec->pages, pvec->nr, pvec->cold);
0204     pagevec_reinit(pvec);
0205 }
0206 
0207 static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec,
0208                  void *arg)
0209 {
0210     int *pgmoved = arg;
0211 
0212     if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
0213         enum lru_list lru = page_lru_base_type(page);
0214         list_move_tail(&page->lru, &lruvec->lists[lru]);
0215         (*pgmoved)++;
0216     }
0217 }
0218 
0219 /*
0220  * pagevec_move_tail() must be called with IRQ disabled.
0221  * Otherwise this may cause nasty races.
0222  */
0223 static void pagevec_move_tail(struct pagevec *pvec)
0224 {
0225     int pgmoved = 0;
0226 
0227     pagevec_lru_move_fn(pvec, pagevec_move_tail_fn, &pgmoved);
0228     __count_vm_events(PGROTATED, pgmoved);
0229 }
0230 
0231 /*
0232  * Writeback is about to end against a page which has been marked for immediate
0233  * reclaim.  If it still appears to be reclaimable, move it to the tail of the
0234  * inactive list.
0235  */
0236 void rotate_reclaimable_page(struct page *page)
0237 {
0238     if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) &&
0239         !PageUnevictable(page) && PageLRU(page)) {
0240         struct pagevec *pvec;
0241         unsigned long flags;
0242 
0243         get_page(page);
0244         local_irq_save(flags);
0245         pvec = this_cpu_ptr(&lru_rotate_pvecs);
0246         if (!pagevec_add(pvec, page) || PageCompound(page))
0247             pagevec_move_tail(pvec);
0248         local_irq_restore(flags);
0249     }
0250 }
0251 
0252 static void update_page_reclaim_stat(struct lruvec *lruvec,
0253                      int file, int rotated)
0254 {
0255     struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
0256 
0257     reclaim_stat->recent_scanned[file]++;
0258     if (rotated)
0259         reclaim_stat->recent_rotated[file]++;
0260 }
0261 
0262 static void __activate_page(struct page *page, struct lruvec *lruvec,
0263                 void *arg)
0264 {
0265     if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
0266         int file = page_is_file_cache(page);
0267         int lru = page_lru_base_type(page);
0268 
0269         del_page_from_lru_list(page, lruvec, lru);
0270         SetPageActive(page);
0271         lru += LRU_ACTIVE;
0272         add_page_to_lru_list(page, lruvec, lru);
0273         trace_mm_lru_activate(page);
0274 
0275         __count_vm_event(PGACTIVATE);
0276         update_page_reclaim_stat(lruvec, file, 1);
0277     }
0278 }
0279 
0280 #ifdef CONFIG_SMP
0281 static void activate_page_drain(int cpu)
0282 {
0283     struct pagevec *pvec = &per_cpu(activate_page_pvecs, cpu);
0284 
0285     if (pagevec_count(pvec))
0286         pagevec_lru_move_fn(pvec, __activate_page, NULL);
0287 }
0288 
0289 static bool need_activate_page_drain(int cpu)
0290 {
0291     return pagevec_count(&per_cpu(activate_page_pvecs, cpu)) != 0;
0292 }
0293 
0294 void activate_page(struct page *page)
0295 {
0296     page = compound_head(page);
0297     if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
0298         struct pagevec *pvec = &get_cpu_var(activate_page_pvecs);
0299 
0300         get_page(page);
0301         if (!pagevec_add(pvec, page) || PageCompound(page))
0302             pagevec_lru_move_fn(pvec, __activate_page, NULL);
0303         put_cpu_var(activate_page_pvecs);
0304     }
0305 }
0306 
0307 #else
0308 static inline void activate_page_drain(int cpu)
0309 {
0310 }
0311 
0312 static bool need_activate_page_drain(int cpu)
0313 {
0314     return false;
0315 }
0316 
0317 void activate_page(struct page *page)
0318 {
0319     struct zone *zone = page_zone(page);
0320 
0321     page = compound_head(page);
0322     spin_lock_irq(zone_lru_lock(zone));
0323     __activate_page(page, mem_cgroup_page_lruvec(page, zone->zone_pgdat), NULL);
0324     spin_unlock_irq(zone_lru_lock(zone));
0325 }
0326 #endif
0327 
0328 static void __lru_cache_activate_page(struct page *page)
0329 {
0330     struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
0331     int i;
0332 
0333     /*
0334      * Search backwards on the optimistic assumption that the page being
0335      * activated has just been added to this pagevec. Note that only
0336      * the local pagevec is examined as a !PageLRU page could be in the
0337      * process of being released, reclaimed, migrated or on a remote
0338      * pagevec that is currently being drained. Furthermore, marking
0339      * a remote pagevec's page PageActive potentially hits a race where
0340      * a page is marked PageActive just after it is added to the inactive
0341      * list causing accounting errors and BUG_ON checks to trigger.
0342      */
0343     for (i = pagevec_count(pvec) - 1; i >= 0; i--) {
0344         struct page *pagevec_page = pvec->pages[i];
0345 
0346         if (pagevec_page == page) {
0347             SetPageActive(page);
0348             break;
0349         }
0350     }
0351 
0352     put_cpu_var(lru_add_pvec);
0353 }
0354 
0355 /*
0356  * Mark a page as having seen activity.
0357  *
0358  * inactive,unreferenced    ->  inactive,referenced
0359  * inactive,referenced      ->  active,unreferenced
0360  * active,unreferenced      ->  active,referenced
0361  *
0362  * When a newly allocated page is not yet visible, so safe for non-atomic ops,
0363  * __SetPageReferenced(page) may be substituted for mark_page_accessed(page).
0364  */
0365 void mark_page_accessed(struct page *page)
0366 {
0367     page = compound_head(page);
0368     if (!PageActive(page) && !PageUnevictable(page) &&
0369             PageReferenced(page)) {
0370 
0371         /*
0372          * If the page is on the LRU, queue it for activation via
0373          * activate_page_pvecs. Otherwise, assume the page is on a
0374          * pagevec, mark it active and it'll be moved to the active
0375          * LRU on the next drain.
0376          */
0377         if (PageLRU(page))
0378             activate_page(page);
0379         else
0380             __lru_cache_activate_page(page);
0381         ClearPageReferenced(page);
0382         if (page_is_file_cache(page))
0383             workingset_activation(page);
0384     } else if (!PageReferenced(page)) {
0385         SetPageReferenced(page);
0386     }
0387     if (page_is_idle(page))
0388         clear_page_idle(page);
0389 }
0390 EXPORT_SYMBOL(mark_page_accessed);
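
/*
 * Illustrative sketch (not part of the original file): a read path that
 * finds a page in the page cache and records the access so the page can
 * climb the inactive -> active ladder described above.  The helper name
 * example_touch_page() is hypothetical.
 */
static void example_touch_page(struct address_space *mapping, pgoff_t index)
{
    struct page *page = find_get_page(mapping, index);

    if (page) {
        /* ... copy data out of the page ... */
        mark_page_accessed(page);
        put_page(page);
    }
}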
0391 
0392 static void __lru_cache_add(struct page *page)
0393 {
0394     struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
0395 
0396     get_page(page);
0397     if (!pagevec_add(pvec, page) || PageCompound(page))
0398         __pagevec_lru_add(pvec);
0399     put_cpu_var(lru_add_pvec);
0400 }
0401 
0402 /**
0403  * lru_cache_add_anon - add a page to the page lists
0404  * @page: the page to add
0405  */
0406 void lru_cache_add_anon(struct page *page)
0407 {
0408     if (PageActive(page))
0409         ClearPageActive(page);
0410     __lru_cache_add(page);
0411 }
0412 
0413 void lru_cache_add_file(struct page *page)
0414 {
0415     if (PageActive(page))
0416         ClearPageActive(page);
0417     __lru_cache_add(page);
0418 }
0419 EXPORT_SYMBOL(lru_cache_add_file);
0420 
0421 /**
0422  * lru_cache_add - add a page to a page list
0423  * @page: the page to be added to the LRU.
0424  *
0425  * Queue the page for addition to the LRU via pagevec. The decision on whether
0426  * to add the page to the [in]active [file|anon] list is deferred until the
0427  * pagevec is drained. This gives the caller of lru_cache_add() a chance to
0428  * have the page added to the active list using mark_page_accessed().
0429  */
0430 void lru_cache_add(struct page *page)
0431 {
0432     VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page);
0433     VM_BUG_ON_PAGE(PageLRU(page), page);
0434     __lru_cache_add(page);
0435 }
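
/*
 * Illustrative sketch (not part of the original file): a caller inserts a
 * freshly allocated page into the page cache and then queues it for the
 * LRU, which is roughly what add_to_page_cache_lru() does for filesystems.
 * example_add_and_lru() is a hypothetical wrapper.
 */
static int example_add_and_lru(struct page *page, struct address_space *mapping,
                   pgoff_t index, gfp_t gfp)
{
    int err = add_to_page_cache(page, mapping, index, gfp);

    if (!err)
        lru_cache_add(page);
    return err;
}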
0436 
0437 /**
0438  * add_page_to_unevictable_list - add a page to the unevictable list
0439  * @page:  the page to be added to the unevictable list
0440  *
0441  * Add page directly to its zone's unevictable list.  To avoid races with
0442  * tasks that might be making the page evictable, through eg. munlock,
0443  * munmap or exit, while it's not on the lru, we want to add the page
0444  * while it's locked or otherwise "invisible" to other tasks.  This is
0445  * difficult to do when using the pagevec cache, so bypass that.
0446  */
0447 void add_page_to_unevictable_list(struct page *page)
0448 {
0449     struct pglist_data *pgdat = page_pgdat(page);
0450     struct lruvec *lruvec;
0451 
0452     spin_lock_irq(&pgdat->lru_lock);
0453     lruvec = mem_cgroup_page_lruvec(page, pgdat);
0454     ClearPageActive(page);
0455     SetPageUnevictable(page);
0456     SetPageLRU(page);
0457     add_page_to_lru_list(page, lruvec, LRU_UNEVICTABLE);
0458     spin_unlock_irq(&pgdat->lru_lock);
0459 }
0460 
0461 /**
0462  * lru_cache_add_active_or_unevictable
0463  * @page:  the page to be added to LRU
0464  * @vma:   vma in which page is mapped for determining reclaimability
0465  *
0466  * Place @page on the active or unevictable LRU list, depending on its
0467  * evictability.  Note that if the page is not evictable, it goes
0468  * directly back onto its zone's unevictable list, it does NOT use a
0469  * per cpu pagevec.
0470  */
0471 void lru_cache_add_active_or_unevictable(struct page *page,
0472                      struct vm_area_struct *vma)
0473 {
0474     VM_BUG_ON_PAGE(PageLRU(page), page);
0475 
0476     if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) {
0477         SetPageActive(page);
0478         lru_cache_add(page);
0479         return;
0480     }
0481 
0482     if (!TestSetPageMlocked(page)) {
0483         /*
0484          * We use the irq-unsafe __mod_zone_page_state() because this
0485          * counter is not modified from interrupt context, and the pte
0486          * lock is held (a spinlock), which implies preemption is disabled.
0487          */
0488         __mod_zone_page_state(page_zone(page), NR_MLOCK,
0489                     hpage_nr_pages(page));
0490         count_vm_event(UNEVICTABLE_PGMLOCKED);
0491     }
0492     add_page_to_unevictable_list(page);
0493 }
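
/*
 * Illustrative sketch (not part of the original file): the anonymous fault
 * paths in mm/memory.c map a new page into a VMA and then call this helper,
 * roughly as below.  example_finish_anon_fault() is a hypothetical wrapper.
 */
static void example_finish_anon_fault(struct page *page,
                      struct vm_area_struct *vma,
                      unsigned long address)
{
    page_add_new_anon_rmap(page, vma, address, false);
    lru_cache_add_active_or_unevictable(page, vma);
}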
0494 
0495 /*
0496  * If the page can not be invalidated, it is moved to the
0497  * inactive list to speed up its reclaim.  It is moved to the
0498  * head of the list, rather than the tail, to give the flusher
0499  * threads some time to write it out, as this is much more
0500  * effective than the single-page writeout from reclaim.
0501  *
0502  * If the page isn't mapped but is dirty or under writeback, it
0503  * can be reclaimed asap using PG_reclaim.
0504  *
0505  * 1. active, mapped page -> none
0506  * 2. active, dirty/writeback page -> inactive, head, PG_reclaim
0507  * 3. inactive, mapped page -> none
0508  * 4. inactive, dirty/writeback page -> inactive, head, PG_reclaim
0509  * 5. inactive, clean -> inactive, tail
0510  * 6. Others -> none
0511  *
0512  * In case 4 the page is moved to the head of the inactive list because
0513  * the VM expects the flusher threads to write it out, which is much more
0514  * effective than the single-page writeout done from reclaim.
0515  */
0516 static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec,
0517                   void *arg)
0518 {
0519     int lru, file;
0520     bool active;
0521 
0522     if (!PageLRU(page))
0523         return;
0524 
0525     if (PageUnevictable(page))
0526         return;
0527 
0528     /* Some processes are using the page */
0529     if (page_mapped(page))
0530         return;
0531 
0532     active = PageActive(page);
0533     file = page_is_file_cache(page);
0534     lru = page_lru_base_type(page);
0535 
0536     del_page_from_lru_list(page, lruvec, lru + active);
0537     ClearPageActive(page);
0538     ClearPageReferenced(page);
0539     add_page_to_lru_list(page, lruvec, lru);
0540 
0541     if (PageWriteback(page) || PageDirty(page)) {
0542         /*
0543          * PG_reclaim can race with end_page_writeback().
0544          * That can confuse readahead, but the race window
0545          * is _really_ small and the problem is non-critical.
0546          */
0547         SetPageReclaim(page);
0548     } else {
0549         /*
0550          * Writeback on the page has already completed,
0551          * so move the page to the tail of the inactive list.
0552          */
0553         list_move_tail(&page->lru, &lruvec->lists[lru]);
0554         __count_vm_event(PGROTATED);
0555     }
0556 
0557     if (active)
0558         __count_vm_event(PGDEACTIVATE);
0559     update_page_reclaim_stat(lruvec, file, 0);
0560 }
0561 
0562 
0563 static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec,
0564                 void *arg)
0565 {
0566     if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
0567         int file = page_is_file_cache(page);
0568         int lru = page_lru_base_type(page);
0569 
0570         del_page_from_lru_list(page, lruvec, lru + LRU_ACTIVE);
0571         ClearPageActive(page);
0572         ClearPageReferenced(page);
0573         add_page_to_lru_list(page, lruvec, lru);
0574 
0575         __count_vm_event(PGDEACTIVATE);
0576         update_page_reclaim_stat(lruvec, file, 0);
0577     }
0578 }
0579 
0580 /*
0581  * Drain pages out of the cpu's pagevecs.
0582  * Either "cpu" is the current CPU, and preemption has already been
0583  * disabled; or "cpu" is being hot-unplugged, and is already dead.
0584  */
0585 void lru_add_drain_cpu(int cpu)
0586 {
0587     struct pagevec *pvec = &per_cpu(lru_add_pvec, cpu);
0588 
0589     if (pagevec_count(pvec))
0590         __pagevec_lru_add(pvec);
0591 
0592     pvec = &per_cpu(lru_rotate_pvecs, cpu);
0593     if (pagevec_count(pvec)) {
0594         unsigned long flags;
0595 
0596         /* No harm done if a racing interrupt already did this */
0597         local_irq_save(flags);
0598         pagevec_move_tail(pvec);
0599         local_irq_restore(flags);
0600     }
0601 
0602     pvec = &per_cpu(lru_deactivate_file_pvecs, cpu);
0603     if (pagevec_count(pvec))
0604         pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
0605 
0606     pvec = &per_cpu(lru_deactivate_pvecs, cpu);
0607     if (pagevec_count(pvec))
0608         pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
0609 
0610     activate_page_drain(cpu);
0611 }
0612 
0613 /**
0614  * deactivate_file_page - forcefully deactivate a file page
0615  * @page: page to deactivate
0616  *
0617  * This function hints the VM that @page is a good reclaim candidate,
0618  * for example if its invalidation fails due to the page being dirty
0619  * or under writeback.
0620  */
0621 void deactivate_file_page(struct page *page)
0622 {
0623     /*
0624      * In a workload with many unevictable pages (such as one using
0625      * mprotect heavily), deactivating them to accelerate reclaim is pointless.
0626      */
0627     if (PageUnevictable(page))
0628         return;
0629 
0630     if (likely(get_page_unless_zero(page))) {
0631         struct pagevec *pvec = &get_cpu_var(lru_deactivate_file_pvecs);
0632 
0633         if (!pagevec_add(pvec, page) || PageCompound(page))
0634             pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
0635         put_cpu_var(lru_deactivate_file_pvecs);
0636     }
0637 }
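
/*
 * Illustrative sketch (not part of the original file): invalidation code
 * such as invalidate_mapping_pages() in mm/truncate.c uses this hint when
 * a page cannot be dropped.  example_try_invalidate() is hypothetical.
 */
static void example_try_invalidate(struct page *page)
{
    /*
     * invalidate_inode_page() returns 0 when the page could not be
     * invalidated (e.g. it is dirty or under writeback); in that case
     * at least ask the VM to reclaim it soon.
     */
    if (!invalidate_inode_page(page))
        deactivate_file_page(page);
}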
0638 
0639 /**
0640  * deactivate_page - deactivate a page
0641  * @page: page to deactivate
0642  *
0643  * deactivate_page() moves @page to the inactive list if @page was on the active
0644  * list and was not an unevictable page.  This is done to accelerate the reclaim
0645  * of @page.
0646  */
0647 void deactivate_page(struct page *page)
0648 {
0649     if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
0650         struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs);
0651 
0652         get_page(page);
0653         if (!pagevec_add(pvec, page) || PageCompound(page))
0654             pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
0655         put_cpu_var(lru_deactivate_pvecs);
0656     }
0657 }
0658 
0659 void lru_add_drain(void)
0660 {
0661     lru_add_drain_cpu(get_cpu());
0662     put_cpu();
0663 }
0664 
0665 static void lru_add_drain_per_cpu(struct work_struct *dummy)
0666 {
0667     lru_add_drain();
0668 }
0669 
0670 static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
0671 
0672 /*
0673  * lru_add_drain_wq is used to do lru_add_drain_all() from a WQ_MEM_RECLAIM
0674  * workqueue, aiding in getting memory freed.
0675  */
0676 static struct workqueue_struct *lru_add_drain_wq;
0677 
0678 static int __init lru_init(void)
0679 {
0680     lru_add_drain_wq = alloc_workqueue("lru-add-drain", WQ_MEM_RECLAIM, 0);
0681 
0682     if (WARN(!lru_add_drain_wq,
0683         "Failed to create workqueue lru_add_drain_wq"))
0684         return -ENOMEM;
0685 
0686     return 0;
0687 }
0688 early_initcall(lru_init);
0689 
0690 void lru_add_drain_all(void)
0691 {
0692     static DEFINE_MUTEX(lock);
0693     static struct cpumask has_work;
0694     int cpu;
0695 
0696     mutex_lock(&lock);
0697     get_online_cpus();
0698     cpumask_clear(&has_work);
0699 
0700     for_each_online_cpu(cpu) {
0701         struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
0702 
0703         if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
0704             pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
0705             pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) ||
0706             pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
0707             need_activate_page_drain(cpu)) {
0708             INIT_WORK(work, lru_add_drain_per_cpu);
0709             queue_work_on(cpu, lru_add_drain_wq, work);
0710             cpumask_set_cpu(cpu, &has_work);
0711         }
0712     }
0713 
0714     for_each_cpu(cpu, &has_work)
0715         flush_work(&per_cpu(lru_add_drain_work, cpu));
0716 
0717     put_online_cpus();
0718     mutex_unlock(&lock);
0719 }
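
/*
 * Illustrative sketch (not part of the original file): callers that are
 * about to isolate or migrate pages typically flush every CPU's pagevecs
 * first so that all pages are actually on an LRU list and can be found
 * there.  example_prepare_for_isolation() is a hypothetical wrapper.
 */
static void example_prepare_for_isolation(void)
{
    /* Push deferred pages out of all per-cpu pagevecs onto the LRU lists. */
    lru_add_drain_all();
    /* ... proceed with isolate_lru_page()/migration ... */
}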
0720 
0721 /**
0722  * release_pages - batched put_page()
0723  * @pages: array of pages to release
0724  * @nr: number of pages
0725  * @cold: whether the pages are cache cold
0726  *
0727  * Decrement the reference count on all the pages in @pages.  If it
0728  * fell to zero, remove the page from the LRU and free it.
0729  */
0730 void release_pages(struct page **pages, int nr, bool cold)
0731 {
0732     int i;
0733     LIST_HEAD(pages_to_free);
0734     struct pglist_data *locked_pgdat = NULL;
0735     struct lruvec *lruvec;
0736     unsigned long uninitialized_var(flags);
0737     unsigned int uninitialized_var(lock_batch);
0738 
0739     for (i = 0; i < nr; i++) {
0740         struct page *page = pages[i];
0741 
0742         /*
0743          * Make sure the IRQ-safe lock-holding time does not get
0744          * excessive with a continuous string of pages from the
0745          * same pgdat. The lock is held only if pgdat != NULL.
0746          */
0747         if (locked_pgdat && ++lock_batch == SWAP_CLUSTER_MAX) {
0748             spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags);
0749             locked_pgdat = NULL;
0750         }
0751 
0752         if (is_huge_zero_page(page))
0753             continue;
0754 
0755         page = compound_head(page);
0756         if (!put_page_testzero(page))
0757             continue;
0758 
0759         if (PageCompound(page)) {
0760             if (locked_pgdat) {
0761                 spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags);
0762                 locked_pgdat = NULL;
0763             }
0764             __put_compound_page(page);
0765             continue;
0766         }
0767 
0768         if (PageLRU(page)) {
0769             struct pglist_data *pgdat = page_pgdat(page);
0770 
0771             if (pgdat != locked_pgdat) {
0772                 if (locked_pgdat)
0773                     spin_unlock_irqrestore(&locked_pgdat->lru_lock,
0774                                     flags);
0775                 lock_batch = 0;
0776                 locked_pgdat = pgdat;
0777                 spin_lock_irqsave(&locked_pgdat->lru_lock, flags);
0778             }
0779 
0780             lruvec = mem_cgroup_page_lruvec(page, locked_pgdat);
0781             VM_BUG_ON_PAGE(!PageLRU(page), page);
0782             __ClearPageLRU(page);
0783             del_page_from_lru_list(page, lruvec, page_off_lru(page));
0784         }
0785 
0786         /* Clear Active bit in case of parallel mark_page_accessed */
0787         __ClearPageActive(page);
0788         __ClearPageWaiters(page);
0789 
0790         list_add(&page->lru, &pages_to_free);
0791     }
0792     if (locked_pgdat)
0793         spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags);
0794 
0795     mem_cgroup_uncharge_list(&pages_to_free);
0796     free_hot_cold_page_list(&pages_to_free, cold);
0797 }
0798 EXPORT_SYMBOL(release_pages);
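
/*
 * Illustrative sketch (not part of the original file): a caller that has
 * pinned an array of pages (for instance with get_kernel_pages() above)
 * can drop all of its references in one batch.  example_release_all() is
 * hypothetical; "false" means the pages are not cache-cold.
 */
static void example_release_all(struct page **pages, int nr)
{
    release_pages(pages, nr, false);
}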
0799 
0800 /*
0801  * The pages which we're about to release may be in the deferred lru-addition
0802  * queues.  That would prevent them from really being freed right now.  That's
0803  * OK from a correctness point of view but is inefficient - those pages may be
0804  * cache-warm and we want to give them back to the page allocator ASAP.
0805  *
0806  * So __pagevec_release() will drain those queues here.  __pagevec_lru_add()
0807  * calls release_pages() directly to avoid
0808  * mutual recursion.
0809  */
0810 void __pagevec_release(struct pagevec *pvec)
0811 {
0812     lru_add_drain();
0813     release_pages(pvec->pages, pagevec_count(pvec), pvec->cold);
0814     pagevec_reinit(pvec);
0815 }
0816 EXPORT_SYMBOL(__pagevec_release);
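
/*
 * Illustrative sketch (not part of the original file): the usual pattern
 * is to collect page references in a pagevec and release them in batches
 * via pagevec_release(), the linux/pagevec.h wrapper around
 * __pagevec_release().  example_drop_pages() is a hypothetical helper.
 */
static void example_drop_pages(struct page **pages, int nr)
{
    struct pagevec pvec;
    int i;

    pagevec_init(&pvec, 0);        /* 0: pages are not cache-cold */
    for (i = 0; i < nr; i++) {
        if (!pagevec_add(&pvec, pages[i]))
            pagevec_release(&pvec);    /* pagevec became full, flush it */
    }
    pagevec_release(&pvec);        /* flush the remainder (no-op if empty) */
}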
0817 
0818 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
0819 /* used by __split_huge_page_refcount() */
0820 void lru_add_page_tail(struct page *page, struct page *page_tail,
0821                struct lruvec *lruvec, struct list_head *list)
0822 {
0823     const int file = 0;
0824 
0825     VM_BUG_ON_PAGE(!PageHead(page), page);
0826     VM_BUG_ON_PAGE(PageCompound(page_tail), page);
0827     VM_BUG_ON_PAGE(PageLRU(page_tail), page);
0828     VM_BUG_ON(NR_CPUS != 1 &&
0829           !spin_is_locked(&lruvec_pgdat(lruvec)->lru_lock));
0830 
0831     if (!list)
0832         SetPageLRU(page_tail);
0833 
0834     if (likely(PageLRU(page)))
0835         list_add_tail(&page_tail->lru, &page->lru);
0836     else if (list) {
0837         /* page reclaim is reclaiming a huge page */
0838         get_page(page_tail);
0839         list_add_tail(&page_tail->lru, list);
0840     } else {
0841         struct list_head *list_head;
0842         /*
0843          * Head page has not yet been counted, as an hpage,
0844          * so we must account for each subpage individually.
0845          *
0846          * Use the standard add function to put page_tail on the list,
0847          * but then correct its position so they all end up in order.
0848          */
0849         add_page_to_lru_list(page_tail, lruvec, page_lru(page_tail));
0850         list_head = page_tail->lru.prev;
0851         list_move_tail(&page_tail->lru, list_head);
0852     }
0853 
0854     if (!PageUnevictable(page))
0855         update_page_reclaim_stat(lruvec, file, PageActive(page_tail));
0856 }
0857 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
0858 
0859 static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
0860                  void *arg)
0861 {
0862     int file = page_is_file_cache(page);
0863     int active = PageActive(page);
0864     enum lru_list lru = page_lru(page);
0865 
0866     VM_BUG_ON_PAGE(PageLRU(page), page);
0867 
0868     SetPageLRU(page);
0869     add_page_to_lru_list(page, lruvec, lru);
0870     update_page_reclaim_stat(lruvec, file, active);
0871     trace_mm_lru_insertion(page, lru);
0872 }
0873 
0874 /*
0875  * Add the passed pages to the LRU, then drop the caller's refcount
0876  * on them.  Reinitialises the caller's pagevec.
0877  */
0878 void __pagevec_lru_add(struct pagevec *pvec)
0879 {
0880     pagevec_lru_move_fn(pvec, __pagevec_lru_add_fn, NULL);
0881 }
0882 EXPORT_SYMBOL(__pagevec_lru_add);
0883 
0884 /**
0885  * pagevec_lookup_entries - gang pagecache lookup
0886  * @pvec:   Where the resulting entries are placed
0887  * @mapping:    The address_space to search
0888  * @start:  The starting entry index
0889  * @nr_pages:   The maximum number of entries
0890  * @indices:    The cache indices corresponding to the entries in @pvec
0891  *
0892  * pagevec_lookup_entries() will search for and return a group of up
0893  * to @nr_pages pages and shadow entries in the mapping.  All
0894  * entries are placed in @pvec.  pagevec_lookup_entries() takes a
0895  * reference against actual pages in @pvec.
0896  *
0897  * The search returns a group of mapping-contiguous entries with
0898  * ascending indexes.  There may be holes in the indices due to
0899  * not-present entries.
0900  *
0901  * pagevec_lookup_entries() returns the number of entries which were
0902  * found.
0903  */
0904 unsigned pagevec_lookup_entries(struct pagevec *pvec,
0905                 struct address_space *mapping,
0906                 pgoff_t start, unsigned nr_pages,
0907                 pgoff_t *indices)
0908 {
0909     pvec->nr = find_get_entries(mapping, start, nr_pages,
0910                     pvec->pages, indices);
0911     return pagevec_count(pvec);
0912 }
0913 
0914 /**
0915  * pagevec_remove_exceptionals - pagevec exceptionals pruning
0916  * @pvec:   The pagevec to prune
0917  *
0918  * pagevec_lookup_entries() fills both pages and exceptional radix
0919  * tree entries into the pagevec.  This function prunes all
0920  * exceptionals from @pvec without leaving holes, so that it can be
0921  * passed on to page-only pagevec operations.
0922  */
0923 void pagevec_remove_exceptionals(struct pagevec *pvec)
0924 {
0925     int i, j;
0926 
0927     for (i = 0, j = 0; i < pagevec_count(pvec); i++) {
0928         struct page *page = pvec->pages[i];
0929         if (!radix_tree_exceptional_entry(page))
0930             pvec->pages[j++] = page;
0931     }
0932     pvec->nr = j;
0933 }
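
/*
 * Illustrative sketch (not part of the original file): truncation-style
 * code in mm/truncate.c looks up a batch of entries, records where to
 * continue, prunes the exceptional (shadow) entries and then works on the
 * remaining real pages.  example_scan_mapping() is a hypothetical, highly
 * simplified version of that loop.
 */
static void example_scan_mapping(struct address_space *mapping)
{
    struct pagevec pvec;
    pgoff_t indices[PAGEVEC_SIZE];
    pgoff_t index = 0;

    pagevec_init(&pvec, 0);
    while (pagevec_lookup_entries(&pvec, mapping, index,
                      PAGEVEC_SIZE, indices)) {
        /* Advance past the last entry before pruning exceptionals. */
        index = indices[pagevec_count(&pvec) - 1] + 1;
        pagevec_remove_exceptionals(&pvec);
        /* ... operate on pvec.pages[0 .. pagevec_count(&pvec) - 1] ... */
        pagevec_release(&pvec);
        cond_resched();
    }
}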
0934 
0935 /**
0936  * pagevec_lookup - gang pagecache lookup
0937  * @pvec:   Where the resulting pages are placed
0938  * @mapping:    The address_space to search
0939  * @start:  The starting page index
0940  * @nr_pages:   The maximum number of pages
0941  *
0942  * pagevec_lookup() will search for and return a group of up to @nr_pages pages
0943  * in the mapping.  The pages are placed in @pvec.  pagevec_lookup() takes a
0944  * reference against the pages in @pvec.
0945  *
0946  * The search returns a group of mapping-contiguous pages with ascending
0947  * indexes.  There may be holes in the indices due to not-present pages.
0948  *
0949  * pagevec_lookup() returns the number of pages which were found.
0950  */
0951 unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
0952         pgoff_t start, unsigned nr_pages)
0953 {
0954     pvec->nr = find_get_pages(mapping, start, nr_pages, pvec->pages);
0955     return pagevec_count(pvec);
0956 }
0957 EXPORT_SYMBOL(pagevec_lookup);
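
/*
 * Illustrative sketch (not part of the original file): walking every page
 * of a mapping in batches.  example_walk_mapping() is a hypothetical
 * helper.
 */
static void example_walk_mapping(struct address_space *mapping)
{
    struct pagevec pvec;
    pgoff_t index = 0;

    pagevec_init(&pvec, 0);
    while (pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE)) {
        int i;

        for (i = 0; i < pagevec_count(&pvec); i++) {
            struct page *page = pvec.pages[i];

            index = page->index + 1;    /* continue after this page */
            /* ... inspect or process the page ... */
        }
        pagevec_release(&pvec);
    }
}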
0958 
0959 unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping,
0960         pgoff_t *index, int tag, unsigned nr_pages)
0961 {
0962     pvec->nr = find_get_pages_tag(mapping, index, tag,
0963                     nr_pages, pvec->pages);
0964     return pagevec_count(pvec);
0965 }
0966 EXPORT_SYMBOL(pagevec_lookup_tag);
0967 
0968 /*
0969  * Perform any setup for the swap system
0970  */
0971 void __init swap_setup(void)
0972 {
0973     unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT);
0974 #ifdef CONFIG_SWAP
0975     int i;
0976 
0977     for (i = 0; i < MAX_SWAPFILES; i++)
0978         spin_lock_init(&swapper_spaces[i].tree_lock);
0979 #endif
0980 
0981     /* Use a smaller cluster for small-memory machines */
0982     if (megs < 16)
0983         page_cluster = 2;
0984     else
0985         page_cluster = 3;
0986     /*
0987      * Right now other parts of the system mean that we
0988      * _really_ don't want to cluster much more than this.
0989      */
0990 }
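
/*
 * Note (not part of the original file): page_cluster is consumed by the
 * swap readahead code, which reads up to 1 << page_cluster pages around a
 * faulting entry, so the values above correspond to 4- and 8-page clusters.
 */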