0001 // SPDX-License-Identifier: GPL-2.0
0002 /*
0003  *  linux/mm/swap_state.c
0004  *
0005  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
0006  *  Swap reorganised 29.12.95, Stephen Tweedie
0007  *
0008  *  Rewritten to use page cache, (C) 1998 Stephen Tweedie
0009  */
0010 #include <linux/mm.h>
0011 #include <linux/gfp.h>
0012 #include <linux/kernel_stat.h>
0013 #include <linux/swap.h>
0014 #include <linux/swapops.h>
0015 #include <linux/init.h>
0016 #include <linux/pagemap.h>
0017 #include <linux/backing-dev.h>
0018 #include <linux/blkdev.h>
0019 #include <linux/pagevec.h>
0020 #include <linux/migrate.h>
0021 #include <linux/vmalloc.h>
0022 #include <linux/swap_slots.h>
0023 #include <linux/huge_mm.h>
0024 #include <linux/shmem_fs.h>
0025 #include "internal.h"
0026 #include "swap.h"
0027 
0028 /*
0029  * swapper_space is a fiction, retained to simplify the path through
0030  * vmscan's shrink_page_list.
0031  */
0032 static const struct address_space_operations swap_aops = {
0033     .writepage  = swap_writepage,
0034     .dirty_folio    = noop_dirty_folio,
0035 #ifdef CONFIG_MIGRATION
0036     .migrate_folio  = migrate_folio,
0037 #endif
0038 };
0039 
0040 struct address_space *swapper_spaces[MAX_SWAPFILES] __read_mostly;
0041 static unsigned int nr_swapper_spaces[MAX_SWAPFILES] __read_mostly;
0042 static bool enable_vma_readahead __read_mostly = true;
0043 
0044 #define SWAP_RA_WIN_SHIFT   (PAGE_SHIFT / 2)
0045 #define SWAP_RA_HITS_MASK   ((1UL << SWAP_RA_WIN_SHIFT) - 1)
0046 #define SWAP_RA_HITS_MAX    SWAP_RA_HITS_MASK
0047 #define SWAP_RA_WIN_MASK    (~PAGE_MASK & ~SWAP_RA_HITS_MASK)
0048 
0049 #define SWAP_RA_HITS(v)     ((v) & SWAP_RA_HITS_MASK)
0050 #define SWAP_RA_WIN(v)      (((v) & SWAP_RA_WIN_MASK) >> SWAP_RA_WIN_SHIFT)
0051 #define SWAP_RA_ADDR(v)     ((v) & PAGE_MASK)
0052 
0053 #define SWAP_RA_VAL(addr, win, hits)                \
0054     (((addr) & PAGE_MASK) |                 \
0055      (((win) << SWAP_RA_WIN_SHIFT) & SWAP_RA_WIN_MASK) |    \
0056      ((hits) & SWAP_RA_HITS_MASK))
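
     /*
      * Illustrative example of the encoding above (assuming PAGE_SHIFT == 12,
      * so SWAP_RA_WIN_SHIFT == 6): bits 0-5 of the value hold the readahead
      * hit count, bits 6-11 hold the window size, and the remaining high bits
      * hold the page-aligned fault address.  SWAP_RA_VAL(addr, 8, 3) thus
      * packs win 8 (0x200) and hits 3 together with addr into the single
      * long stored in vma->swap_readahead_info.
      */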
0057 
0058 /* Initial readahead hit count is 4, to start up with a small window */
0059 #define GET_SWAP_RA_VAL(vma)                    \
0060     (atomic_long_read(&(vma)->swap_readahead_info) ? : 4)
0061 
0062 static atomic_t swapin_readahead_hits = ATOMIC_INIT(4);
0063 
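     /* Print swap cache and swap space usage to the kernel log. */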
0064 void show_swap_cache_info(void)
0065 {
0066     printk("%lu pages in swap cache\n", total_swapcache_pages());
0067     printk("Free swap  = %ldkB\n",
0068         get_nr_swap_pages() << (PAGE_SHIFT - 10));
0069     printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10));
0070 }
0071 
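     /*
      * Return the shadow (workingset) entry stored at @entry's slot in the
      * swap cache, or NULL if the slot is empty or holds a real page.
      */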
0072 void *get_shadow_from_swap_cache(swp_entry_t entry)
0073 {
0074     struct address_space *address_space = swap_address_space(entry);
0075     pgoff_t idx = swp_offset(entry);
0076     struct page *page;
0077 
0078     page = xa_load(&address_space->i_pages, idx);
0079     if (xa_is_value(page))
0080         return page;
0081     return NULL;
0082 }
0083 
0084 /*
0085  * add_to_swap_cache resembles filemap_add_folio on swapper_space,
0086  * but sets SwapCache flag and private instead of mapping and index.
0087  */
0088 int add_to_swap_cache(struct page *page, swp_entry_t entry,
0089             gfp_t gfp, void **shadowp)
0090 {
0091     struct address_space *address_space = swap_address_space(entry);
0092     pgoff_t idx = swp_offset(entry);
0093     XA_STATE_ORDER(xas, &address_space->i_pages, idx, compound_order(page));
0094     unsigned long i, nr = thp_nr_pages(page);
0095     void *old;
0096 
0097     VM_BUG_ON_PAGE(!PageLocked(page), page);
0098     VM_BUG_ON_PAGE(PageSwapCache(page), page);
0099     VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
0100 
0101     page_ref_add(page, nr);
0102     SetPageSwapCache(page);
0103 
0104     do {
0105         xas_lock_irq(&xas);
0106         xas_create_range(&xas);
0107         if (xas_error(&xas))
0108             goto unlock;
0109         for (i = 0; i < nr; i++) {
0110             VM_BUG_ON_PAGE(xas.xa_index != idx + i, page);
0111             old = xas_load(&xas);
0112             if (xa_is_value(old)) {
0113                 if (shadowp)
0114                     *shadowp = old;
0115             }
0116             set_page_private(page + i, entry.val + i);
0117             xas_store(&xas, page);
0118             xas_next(&xas);
0119         }
0120         address_space->nrpages += nr;
0121         __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr);
0122         __mod_lruvec_page_state(page, NR_SWAPCACHE, nr);
0123 unlock:
0124         xas_unlock_irq(&xas);
0125     } while (xas_nomem(&xas, gfp));
0126 
0127     if (!xas_error(&xas))
0128         return 0;
0129 
0130     ClearPageSwapCache(page);
0131     page_ref_sub(page, nr);
0132     return xas_error(&xas);
0133 }
0134 
0135 /*
0136  * This must be called only on folios that have
0137  * been verified to be in the swap cache.
0138  */
0139 void __delete_from_swap_cache(struct folio *folio,
0140             swp_entry_t entry, void *shadow)
0141 {
0142     struct address_space *address_space = swap_address_space(entry);
0143     int i;
0144     long nr = folio_nr_pages(folio);
0145     pgoff_t idx = swp_offset(entry);
0146     XA_STATE(xas, &address_space->i_pages, idx);
0147 
0148     VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
0149     VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio);
0150     VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio);
0151 
0152     for (i = 0; i < nr; i++) {
0153         void *entry = xas_store(&xas, shadow);
0154         VM_BUG_ON_PAGE(entry != folio, entry);
0155         set_page_private(folio_page(folio, i), 0);
0156         xas_next(&xas);
0157     }
0158     folio_clear_swapcache(folio);
0159     address_space->nrpages -= nr;
0160     __node_stat_mod_folio(folio, NR_FILE_PAGES, -nr);
0161     __lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr);
0162 }
0163 
0164 /**
0165  * add_to_swap - allocate swap space for a folio
0166  * @folio: folio we want to move to swap
0167  *
0168  * Allocate swap space for the folio and add the folio to the
0169  * swap cache.
0170  *
0171  * Context: Caller needs to hold the folio lock.
0172  * Return: Whether the folio was added to the swap cache.
0173  */
0174 bool add_to_swap(struct folio *folio)
0175 {
0176     swp_entry_t entry;
0177     int err;
0178 
0179     VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
0180     VM_BUG_ON_FOLIO(!folio_test_uptodate(folio), folio);
0181 
0182     entry = folio_alloc_swap(folio);
0183     if (!entry.val)
0184         return false;
0185 
0186     /*
0187      * XArray node allocations from PF_MEMALLOC contexts could
0188      * completely exhaust the page allocator. __GFP_NOMEMALLOC
0189      * prevents allocating from the emergency reserves.
0190      *
0191      * TODO: this could cause a theoretical memory reclaim
0192      * deadlock in the swap out path.
0193      */
0194     /*
0195      * Add it to the swap cache.
0196      */
0197     err = add_to_swap_cache(&folio->page, entry,
0198             __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN, NULL);
0199     if (err)
0200         /*
0201          * add_to_swap_cache() doesn't return -EEXIST, so we can safely
0202          * clear SWAP_HAS_CACHE flag.
0203          */
0204         goto fail;
0205     /*
0206      * Normally the folio will be dirtied during unmap because its
0207      * pte should be dirty. A special case is an MADV_FREE page: its
0208      * pte may have the dirty bit cleared while the folio's SwapBacked
0209      * flag is still set, because clearing the dirty bit and the
0210      * SwapBacked flag is not protected by any lock. For such a folio,
0211      * unmap will not set the dirty bit, so folio reclaim will not
0212      * write the folio out. This can cause data corruption when the
0213      * folio is swapped in later. Always setting the dirty flag on
0214      * the folio solves the problem.
0215      */
0216     folio_mark_dirty(folio);
0217 
0218     return true;
0219 
0220 fail:
0221     put_swap_page(&folio->page, entry);
0222     return false;
0223 }
0224 
0225 /*
0226  * This must be called only on folios that have
0227  * been verified to be in the swap cache and locked.
0228  * It will never put the folio into the free list;
0229  * the caller has a reference on the folio.
0230  */
0231 void delete_from_swap_cache(struct folio *folio)
0232 {
0233     swp_entry_t entry = folio_swap_entry(folio);
0234     struct address_space *address_space = swap_address_space(entry);
0235 
0236     xa_lock_irq(&address_space->i_pages);
0237     __delete_from_swap_cache(folio, entry, NULL);
0238     xa_unlock_irq(&address_space->i_pages);
0239 
0240     put_swap_page(&folio->page, entry);
0241     folio_ref_sub(folio, folio_nr_pages(folio));
0242 }
0243 
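     /*
      * Remove any shadow (workingset) entries left in the swap cache of
      * swap device @type over the offset range [@begin, @end], walking one
      * swap address space chunk (SWAP_ADDRESS_SPACE_PAGES) at a time.
      */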
0244 void clear_shadow_from_swap_cache(int type, unsigned long begin,
0245                 unsigned long end)
0246 {
0247     unsigned long curr = begin;
0248     void *old;
0249 
0250     for (;;) {
0251         swp_entry_t entry = swp_entry(type, curr);
0252         struct address_space *address_space = swap_address_space(entry);
0253         XA_STATE(xas, &address_space->i_pages, curr);
0254 
0255         xa_lock_irq(&address_space->i_pages);
0256         xas_for_each(&xas, old, end) {
0257             if (!xa_is_value(old))
0258                 continue;
0259             xas_store(&xas, NULL);
0260         }
0261         xa_unlock_irq(&address_space->i_pages);
0262 
0263         /* search the next swapcache until we meet end */
0264         curr >>= SWAP_ADDRESS_SPACE_SHIFT;
0265         curr++;
0266         curr <<= SWAP_ADDRESS_SPACE_SHIFT;
0267         if (curr > end)
0268             break;
0269     }
0270 }
0271 
0272 /*
0273  * If we are the only user, then try to free up the swap cache.
0274  *
0275  * It's OK to check for PageSwapCache without the page lock
0276  * here because we are going to recheck inside
0277  * try_to_free_swap() _with_ the lock.
0278  *                  - Marcelo
0279  */
0280 void free_swap_cache(struct page *page)
0281 {
0282     if (PageSwapCache(page) && !page_mapped(page) && trylock_page(page)) {
0283         try_to_free_swap(page);
0284         unlock_page(page);
0285     }
0286 }
0287 
0288 /*
0289  * Drop a reference to the page, also freeing any swap cache associated
0290  * with this page if it is the last user of the page.
0291  */
0292 void free_page_and_swap_cache(struct page *page)
0293 {
0294     free_swap_cache(page);
0295     if (!is_huge_zero_page(page))
0296         put_page(page);
0297 }
0298 
0299 /*
0300  * Passed an array of pages, drop them all from swapcache and then release
0301  * them.  They are removed from the LRU and freed if this is their last use.
0302  */
0303 void free_pages_and_swap_cache(struct page **pages, int nr)
0304 {
0305     struct page **pagep = pages;
0306     int i;
0307 
0308     lru_add_drain();
0309     for (i = 0; i < nr; i++)
0310         free_swap_cache(pagep[i]);
0311     release_pages(pagep, nr);
0312 }
0313 
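     /*
      * VMA-based readahead is used unless it has been disabled via sysfs or
      * a rotational swap device is in use (nr_rotate_swap != 0).
      */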
0314 static inline bool swap_use_vma_readahead(void)
0315 {
0316     return READ_ONCE(enable_vma_readahead) && !atomic_read(&nr_rotate_swap);
0317 }
0318 
0319 /*
0320  * Look up a swap entry in the swap cache. A found page is returned
0321  * unlocked and with its refcount incremented - we rely on the kernel's
0322  * locking to keep page table operations atomic even if we drop the page
0323  * lock before returning.
0324  */
0325 struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma,
0326                    unsigned long addr)
0327 {
0328     struct page *page;
0329     struct swap_info_struct *si;
0330 
0331     si = get_swap_device(entry);
0332     if (!si)
0333         return NULL;
0334     page = find_get_page(swap_address_space(entry), swp_offset(entry));
0335     put_swap_device(si);
0336 
0337     if (page) {
0338         bool vma_ra = swap_use_vma_readahead();
0339         bool readahead;
0340 
0341         /*
0342          * At the moment, we don't support PG_readahead for anon THP
0343          * so let's bail out rather than confusing the readahead stat.
0344          */
0345         if (unlikely(PageTransCompound(page)))
0346             return page;
0347 
0348         readahead = TestClearPageReadahead(page);
0349         if (vma && vma_ra) {
0350             unsigned long ra_val;
0351             int win, hits;
0352 
0353             ra_val = GET_SWAP_RA_VAL(vma);
0354             win = SWAP_RA_WIN(ra_val);
0355             hits = SWAP_RA_HITS(ra_val);
0356             if (readahead)
0357                 hits = min_t(int, hits + 1, SWAP_RA_HITS_MAX);
0358             atomic_long_set(&vma->swap_readahead_info,
0359                     SWAP_RA_VAL(addr, win, hits));
0360         }
0361 
0362         if (readahead) {
0363             count_vm_event(SWAP_RA_HIT);
0364             if (!vma || !vma_ra)
0365                 atomic_inc(&swapin_readahead_hits);
0366         }
0367     }
0368 
0369     return page;
0370 }
0371 
0372 /**
0373  * find_get_incore_page - Find and get a page from the page or swap caches.
0374  * @mapping: The address_space to search.
0375  * @index: The page cache index.
0376  *
0377  * This differs from find_get_page() in that it will also look for the
0378  * page in the swap cache.
0379  *
0380  * Return: The found page or %NULL.
0381  */
0382 struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index)
0383 {
0384     swp_entry_t swp;
0385     struct swap_info_struct *si;
0386     struct page *page = pagecache_get_page(mapping, index,
0387                         FGP_ENTRY | FGP_HEAD, 0);
0388 
0389     if (!page)
0390         return page;
0391     if (!xa_is_value(page))
0392         return find_subpage(page, index);
0393     if (!shmem_mapping(mapping))
0394         return NULL;
0395 
0396     swp = radix_to_swp_entry(page);
0397     /* There might be swapin error entries in shmem mapping. */
0398     if (non_swap_entry(swp))
0399         return NULL;
0400     /* Prevent swapoff from happening to us */
0401     si = get_swap_device(swp);
0402     if (!si)
0403         return NULL;
0404     page = find_get_page(swap_address_space(swp), swp_offset(swp));
0405     put_swap_device(si);
0406     return page;
0407 }
0408 
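     /*
      * Look up @entry in the swap cache and, if it is not there, allocate a
      * new page, claim the entry with swapcache_prepare() (SWAP_HAS_CACHE),
      * charge it, insert it into the swap cache and the LRU, and return it
      * locked.  *@new_page_allocated tells the caller whether it must start
      * the read itself.  Returns NULL if the swap entry is no longer in use
      * or if the allocation or charge fails.
      */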
0409 struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
0410             struct vm_area_struct *vma, unsigned long addr,
0411             bool *new_page_allocated)
0412 {
0413     struct swap_info_struct *si;
0414     struct page *page;
0415     void *shadow = NULL;
0416 
0417     *new_page_allocated = false;
0418 
0419     for (;;) {
0420         int err;
0421         /*
0422          * First check the swap cache.  Since this is normally
0423          * called after lookup_swap_cache() failed, re-calling
0424          * that would confuse statistics.
0425          */
0426         si = get_swap_device(entry);
0427         if (!si)
0428             return NULL;
0429         page = find_get_page(swap_address_space(entry),
0430                      swp_offset(entry));
0431         put_swap_device(si);
0432         if (page)
0433             return page;
0434 
0435         /*
0436          * Just skip readahead for an unused swap slot.
0437          * During swapoff, when swap_slot_cache is disabled,
0438          * we have to handle the race between putting the
0439          * swap entry into the swap cache and marking the swap
0440          * slot as SWAP_HAS_CACHE.  That is done later in this
0441          * function, or else swapoff will be aborted if we return NULL.
0442          */
0443         if (!__swp_swapcount(entry) && swap_slot_cache_enabled)
0444             return NULL;
0445 
0446         /*
0447          * Get a new page to read into from swap.  Allocate it now,
0448          * before marking swap_map SWAP_HAS_CACHE, when -EEXIST will
0449          * cause any racers to loop around until we add it to cache.
0450          */
0451         page = alloc_page_vma(gfp_mask, vma, addr);
0452         if (!page)
0453             return NULL;
0454 
0455         /*
0456          * Swap entry may have been freed since our caller observed it.
0457          */
0458         err = swapcache_prepare(entry);
0459         if (!err)
0460             break;
0461 
0462         put_page(page);
0463         if (err != -EEXIST)
0464             return NULL;
0465 
0466         /*
0467          * We might race against __delete_from_swap_cache(), and
0468          * stumble across a swap_map entry whose SWAP_HAS_CACHE
0469          * has not yet been cleared.  Or race against another
0470          * __read_swap_cache_async(), which has set SWAP_HAS_CACHE
0471          * in swap_map, but not yet added its page to swap cache.
0472          */
0473         schedule_timeout_uninterruptible(1);
0474     }
0475 
0476     /*
0477      * The swap entry is ours to swap in. Prepare the new page.
0478      */
0479 
0480     __SetPageLocked(page);
0481     __SetPageSwapBacked(page);
0482 
0483     if (mem_cgroup_swapin_charge_page(page, NULL, gfp_mask, entry))
0484         goto fail_unlock;
0485 
0486     /* May fail (-ENOMEM) if XArray node allocation failed. */
0487     if (add_to_swap_cache(page, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow))
0488         goto fail_unlock;
0489 
0490     mem_cgroup_swapin_uncharge_swap(entry);
0491 
0492     if (shadow)
0493         workingset_refault(page_folio(page), shadow);
0494 
0495     /* Caller will initiate read into locked page */
0496     lru_cache_add(page);
0497     *new_page_allocated = true;
0498     return page;
0499 
0500 fail_unlock:
0501     put_swap_page(page, entry);
0502     unlock_page(page);
0503     put_page(page);
0504     return NULL;
0505 }
0506 
0507 /*
0508  * Locate a page of swap in physical memory, reserving swap cache space
0509  * and reading the disk if it is not already cached.
0510  * A failure return means that either the page allocation failed or that
0511  * the swap entry is no longer in use.
0512  */
0513 struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
0514                    struct vm_area_struct *vma,
0515                    unsigned long addr, bool do_poll,
0516                    struct swap_iocb **plug)
0517 {
0518     bool page_was_allocated;
0519     struct page *retpage = __read_swap_cache_async(entry, gfp_mask,
0520             vma, addr, &page_was_allocated);
0521 
0522     if (page_was_allocated)
0523         swap_readpage(retpage, do_poll, plug);
0524 
0525     return retpage;
0526 }
0527 
0528 static unsigned int __swapin_nr_pages(unsigned long prev_offset,
0529                       unsigned long offset,
0530                       int hits,
0531                       int max_pages,
0532                       int prev_win)
0533 {
0534     unsigned int pages, last_ra;
0535 
0536     /*
0537      * This heuristic has been found to work well on both sequential and
0538      * random loads, swapping to hard disk or to SSD: please don't ask
0539      * what the "+ 2" means, it just happens to work well, that's all.
0540      */
0541     pages = hits + 2;
0542     if (pages == 2) {
0543         /*
0544          * We can have no readahead hits to judge by: but must not get
0545          * stuck here forever, so check for an adjacent offset instead
0546          * (and don't even bother to check whether swap type is same).
0547          */
0548         if (offset != prev_offset + 1 && offset != prev_offset - 1)
0549             pages = 1;
0550     } else {
0551         unsigned int roundup = 4;
0552         while (roundup < pages)
0553             roundup <<= 1;
0554         pages = roundup;
0555     }
0556 
0557     if (pages > max_pages)
0558         pages = max_pages;
0559 
0560     /* Don't shrink readahead too fast */
0561     last_ra = prev_win / 2;
0562     if (pages < last_ra)
0563         pages = last_ra;
0564 
0565     return pages;
0566 }
0567 
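     /*
      * Worked example for __swapin_nr_pages() above (illustrative only):
      * with hits == 5 the initial estimate is 5 + 2 = 7 pages, rounded up
      * to the next power of two, 8; the result is then capped at max_pages
      * and never drops below half of the previous window.  With no hits
      * and a non-adjacent offset, the window collapses to a single page.
      */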
0568 static unsigned long swapin_nr_pages(unsigned long offset)
0569 {
0570     static unsigned long prev_offset;
0571     unsigned int hits, pages, max_pages;
0572     static atomic_t last_readahead_pages;
0573 
0574     max_pages = 1 << READ_ONCE(page_cluster);
0575     if (max_pages <= 1)
0576         return 1;
0577 
0578     hits = atomic_xchg(&swapin_readahead_hits, 0);
0579     pages = __swapin_nr_pages(READ_ONCE(prev_offset), offset, hits,
0580                   max_pages,
0581                   atomic_read(&last_readahead_pages));
0582     if (!hits)
0583         WRITE_ONCE(prev_offset, offset);
0584     atomic_set(&last_readahead_pages, pages);
0585 
0586     return pages;
0587 }
0588 
0589 /**
0590  * swap_cluster_readahead - swap in pages in hope we need them soon
0591  * @entry: swap entry of this memory
0592  * @gfp_mask: memory allocation flags
0593  * @vmf: fault information
0594  *
0595  * Returns the struct page for entry and addr, after queueing swapin.
0596  *
0597  * Primitive swap readahead code. We simply read an aligned block of
0598  * (1 << page_cluster) entries in the swap area. This method is chosen
0599  * because it doesn't cost us any seek time.  We also make sure to queue
0600  * the 'original' request together with the readahead ones...
0601  *
0602  * This has been extended to use the NUMA policies from the mm triggering
0603  * the readahead.
0604  *
0605  * Caller must hold read mmap_lock if vmf->vma is not NULL.
0606  */
0607 struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
0608                 struct vm_fault *vmf)
0609 {
0610     struct page *page;
0611     unsigned long entry_offset = swp_offset(entry);
0612     unsigned long offset = entry_offset;
0613     unsigned long start_offset, end_offset;
0614     unsigned long mask;
0615     struct swap_info_struct *si = swp_swap_info(entry);
0616     struct blk_plug plug;
0617     struct swap_iocb *splug = NULL;
0618     bool do_poll = true, page_allocated;
0619     struct vm_area_struct *vma = vmf->vma;
0620     unsigned long addr = vmf->address;
0621 
0622     mask = swapin_nr_pages(offset) - 1;
0623     if (!mask)
0624         goto skip;
0625 
0626     do_poll = false;
0627     /* Read a page_cluster sized and aligned cluster around offset. */
0628     start_offset = offset & ~mask;
0629     end_offset = offset | mask;
0630     if (!start_offset)  /* First page is swap header. */
0631         start_offset++;
0632     if (end_offset >= si->max)
0633         end_offset = si->max - 1;
0634 
0635     blk_start_plug(&plug);
0636     for (offset = start_offset; offset <= end_offset ; offset++) {
0637         /* Ok, do the async read-ahead now */
0638         page = __read_swap_cache_async(
0639             swp_entry(swp_type(entry), offset),
0640             gfp_mask, vma, addr, &page_allocated);
0641         if (!page)
0642             continue;
0643         if (page_allocated) {
0644             swap_readpage(page, false, &splug);
0645             if (offset != entry_offset) {
0646                 SetPageReadahead(page);
0647                 count_vm_event(SWAP_RA);
0648             }
0649         }
0650         put_page(page);
0651     }
0652     blk_finish_plug(&plug);
0653     swap_read_unplug(splug);
0654 
0655     lru_add_drain();    /* Push any new pages onto the LRU now */
0656 skip:
0657     /* The page was likely read above, so no need for plugging here */
0658     return read_swap_cache_async(entry, gfp_mask, vma, addr, do_poll, NULL);
0659 }
0660 
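     /*
      * Allocate and initialise one swap cache address_space for every
      * SWAP_ADDRESS_SPACE_PAGES pages of swap device @type, spreading the
      * swap cache over several XArrays to reduce lock contention.
      */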
0661 int init_swap_address_space(unsigned int type, unsigned long nr_pages)
0662 {
0663     struct address_space *spaces, *space;
0664     unsigned int i, nr;
0665 
0666     nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES);
0667     spaces = kvcalloc(nr, sizeof(struct address_space), GFP_KERNEL);
0668     if (!spaces)
0669         return -ENOMEM;
0670     for (i = 0; i < nr; i++) {
0671         space = spaces + i;
0672         xa_init_flags(&space->i_pages, XA_FLAGS_LOCK_IRQ);
0673         atomic_set(&space->i_mmap_writable, 0);
0674         space->a_ops = &swap_aops;
0675         /* swap cache doesn't use writeback related tags */
0676         mapping_set_no_writeback_tags(space);
0677     }
0678     nr_swapper_spaces[type] = nr;
0679     swapper_spaces[type] = spaces;
0680 
0681     return 0;
0682 }
0683 
0684 void exit_swap_address_space(unsigned int type)
0685 {
0686     int i;
0687     struct address_space *spaces = swapper_spaces[type];
0688 
0689     for (i = 0; i < nr_swapper_spaces[type]; i++)
0690         VM_WARN_ON_ONCE(!mapping_empty(&spaces[i]));
0691     kvfree(spaces);
0692     nr_swapper_spaces[type] = 0;
0693     swapper_spaces[type] = NULL;
0694 }
0695 
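     /*
      * Clamp the readahead pfn window [@lpfn, @rpfn) so that it stays within
      * both the VMA and the PMD containing the faulting address @faddr.
      */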
0696 static inline void swap_ra_clamp_pfn(struct vm_area_struct *vma,
0697                      unsigned long faddr,
0698                      unsigned long lpfn,
0699                      unsigned long rpfn,
0700                      unsigned long *start,
0701                      unsigned long *end)
0702 {
0703     *start = max3(lpfn, PFN_DOWN(vma->vm_start),
0704               PFN_DOWN(faddr & PMD_MASK));
0705     *end = min3(rpfn, PFN_DOWN(vma->vm_end),
0706             PFN_DOWN((faddr & PMD_MASK) + PMD_SIZE));
0707 }
0708 
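     /*
      * Compute the VMA-based readahead window for the fault described by
      * @vmf: derive the window size from the previous hit/window state,
      * clamp it to the VMA and the faulting PMD, and record the covered
      * PTEs in @ra_info (copying them on 32-bit, where the page table may
      * be unmapped before they are used).
      */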
0709 static void swap_ra_info(struct vm_fault *vmf,
0710             struct vma_swap_readahead *ra_info)
0711 {
0712     struct vm_area_struct *vma = vmf->vma;
0713     unsigned long ra_val;
0714     unsigned long faddr, pfn, fpfn;
0715     unsigned long start, end;
0716     pte_t *pte, *orig_pte;
0717     unsigned int max_win, hits, prev_win, win, left;
0718 #ifndef CONFIG_64BIT
0719     pte_t *tpte;
0720 #endif
0721 
0722     max_win = 1 << min_t(unsigned int, READ_ONCE(page_cluster),
0723                  SWAP_RA_ORDER_CEILING);
0724     if (max_win == 1) {
0725         ra_info->win = 1;
0726         return;
0727     }
0728 
0729     faddr = vmf->address;
0730     orig_pte = pte = pte_offset_map(vmf->pmd, faddr);
0731 
0732     fpfn = PFN_DOWN(faddr);
0733     ra_val = GET_SWAP_RA_VAL(vma);
0734     pfn = PFN_DOWN(SWAP_RA_ADDR(ra_val));
0735     prev_win = SWAP_RA_WIN(ra_val);
0736     hits = SWAP_RA_HITS(ra_val);
0737     ra_info->win = win = __swapin_nr_pages(pfn, fpfn, hits,
0738                            max_win, prev_win);
0739     atomic_long_set(&vma->swap_readahead_info,
0740             SWAP_RA_VAL(faddr, win, 0));
0741 
0742     if (win == 1) {
0743         pte_unmap(orig_pte);
0744         return;
0745     }
0746 
0747     /* Copy the PTEs because the page table may be unmapped */
0748     if (fpfn == pfn + 1)
0749         swap_ra_clamp_pfn(vma, faddr, fpfn, fpfn + win, &start, &end);
0750     else if (pfn == fpfn + 1)
0751         swap_ra_clamp_pfn(vma, faddr, fpfn - win + 1, fpfn + 1,
0752                   &start, &end);
0753     else {
0754         left = (win - 1) / 2;
0755         swap_ra_clamp_pfn(vma, faddr, fpfn - left, fpfn + win - left,
0756                   &start, &end);
0757     }
0758     ra_info->nr_pte = end - start;
0759     ra_info->offset = fpfn - start;
0760     pte -= ra_info->offset;
0761 #ifdef CONFIG_64BIT
0762     ra_info->ptes = pte;
0763 #else
0764     tpte = ra_info->ptes;
0765     for (pfn = start; pfn != end; pfn++)
0766         *tpte++ = *pte++;
0767 #endif
0768     pte_unmap(orig_pte);
0769 }
0770 
0771 /**
0772  * swap_vma_readahead - swap in pages in hope we need them soon
0773  * @fentry: swap entry of this memory
0774  * @gfp_mask: memory allocation flags
0775  * @vmf: fault information
0776  *
0777  * Returns the struct page for entry and addr, after queueing swapin.
0778  *
0779  * Primitive swap readahead code. We simply read in a few pages whose
0780  * virtual addresses are around the fault address in the same vma.
0781  *
0782  * Caller must hold read mmap_lock if vmf->vma is not NULL.
0783  *
0784  */
0785 static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
0786                        struct vm_fault *vmf)
0787 {
0788     struct blk_plug plug;
0789     struct swap_iocb *splug = NULL;
0790     struct vm_area_struct *vma = vmf->vma;
0791     struct page *page;
0792     pte_t *pte, pentry;
0793     swp_entry_t entry;
0794     unsigned int i;
0795     bool page_allocated;
0796     struct vma_swap_readahead ra_info = {
0797         .win = 1,
0798     };
0799 
0800     swap_ra_info(vmf, &ra_info);
0801     if (ra_info.win == 1)
0802         goto skip;
0803 
0804     blk_start_plug(&plug);
0805     for (i = 0, pte = ra_info.ptes; i < ra_info.nr_pte;
0806          i++, pte++) {
0807         pentry = *pte;
0808         if (!is_swap_pte(pentry))
0809             continue;
0810         entry = pte_to_swp_entry(pentry);
0811         if (unlikely(non_swap_entry(entry)))
0812             continue;
0813         page = __read_swap_cache_async(entry, gfp_mask, vma,
0814                            vmf->address, &page_allocated);
0815         if (!page)
0816             continue;
0817         if (page_allocated) {
0818             swap_readpage(page, false, &splug);
0819             if (i != ra_info.offset) {
0820                 SetPageReadahead(page);
0821                 count_vm_event(SWAP_RA);
0822             }
0823         }
0824         put_page(page);
0825     }
0826     blk_finish_plug(&plug);
0827     swap_read_unplug(splug);
0828     lru_add_drain();
0829 skip:
0830     /* The page was likely read above, so no need for plugging here */
0831     return read_swap_cache_async(fentry, gfp_mask, vma, vmf->address,
0832                      ra_info.win == 1, NULL);
0833 }
0834 
0835 /**
0836  * swapin_readahead - swap in pages in hope we need them soon
0837  * @entry: swap entry of this memory
0838  * @gfp_mask: memory allocation flags
0839  * @vmf: fault information
0840  *
0841  * Returns the struct page for entry and addr, after queueing swapin.
0842  *
0843  * It's the main entry point for swap readahead. Depending on configuration,
0844  * it reads ahead using either cluster-based (i.e. physical disk based) or
0845  * VMA-based (i.e. virtual addresses around the fault address) readahead.
0846  */
0847 struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
0848                 struct vm_fault *vmf)
0849 {
0850     return swap_use_vma_readahead() ?
0851             swap_vma_readahead(entry, gfp_mask, vmf) :
0852             swap_cluster_readahead(entry, gfp_mask, vmf);
0853 }
0854 
0855 #ifdef CONFIG_SYSFS
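     /*
      * /sys/kernel/mm/swap/vma_ra_enabled: runtime switch between VMA-based
      * readahead ("true", the default) and cluster-based readahead ("false").
      * Cluster readahead is still used while rotational swap devices are in
      * use, regardless of this setting.
      */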
0856 static ssize_t vma_ra_enabled_show(struct kobject *kobj,
0857                      struct kobj_attribute *attr, char *buf)
0858 {
0859     return sysfs_emit(buf, "%s\n",
0860               enable_vma_readahead ? "true" : "false");
0861 }
0862 static ssize_t vma_ra_enabled_store(struct kobject *kobj,
0863                       struct kobj_attribute *attr,
0864                       const char *buf, size_t count)
0865 {
0866     ssize_t ret;
0867 
0868     ret = kstrtobool(buf, &enable_vma_readahead);
0869     if (ret)
0870         return ret;
0871 
0872     return count;
0873 }
0874 static struct kobj_attribute vma_ra_enabled_attr = __ATTR_RW(vma_ra_enabled);
0875 
0876 static struct attribute *swap_attrs[] = {
0877     &vma_ra_enabled_attr.attr,
0878     NULL,
0879 };
0880 
0881 static const struct attribute_group swap_attr_group = {
0882     .attrs = swap_attrs,
0883 };
0884 
0885 static int __init swap_init_sysfs(void)
0886 {
0887     int err;
0888     struct kobject *swap_kobj;
0889 
0890     swap_kobj = kobject_create_and_add("swap", mm_kobj);
0891     if (!swap_kobj) {
0892         pr_err("failed to create swap kobject\n");
0893         return -ENOMEM;
0894     }
0895     err = sysfs_create_group(swap_kobj, &swap_attr_group);
0896     if (err) {
0897         pr_err("failed to register swap group\n");
0898         goto delete_obj;
0899     }
0900     return 0;
0901 
0902 delete_obj:
0903     kobject_put(swap_kobj);
0904     return err;
0905 }
0906 subsys_initcall(swap_init_sysfs);
0907 #endif