0001 /*
0002  * linux/mm/compaction.c
0003  *
0004  * Memory compaction for the reduction of external fragmentation. Note that
0005  * this heavily depends upon page migration to do all the real heavy
0006  * lifting.
0007  *
0008  * Copyright IBM Corp. 2007-2010 Mel Gorman <mel@csn.ul.ie>
0009  */
0010 #include <linux/cpu.h>
0011 #include <linux/swap.h>
0012 #include <linux/migrate.h>
0013 #include <linux/compaction.h>
0014 #include <linux/mm_inline.h>
0015 #include <linux/backing-dev.h>
0016 #include <linux/sysctl.h>
0017 #include <linux/sysfs.h>
0018 #include <linux/page-isolation.h>
0019 #include <linux/kasan.h>
0020 #include <linux/kthread.h>
0021 #include <linux/freezer.h>
0022 #include <linux/page_owner.h>
0023 #include "internal.h"
0024 
0025 #ifdef CONFIG_COMPACTION
0026 static inline void count_compact_event(enum vm_event_item item)
0027 {
0028     count_vm_event(item);
0029 }
0030 
0031 static inline void count_compact_events(enum vm_event_item item, long delta)
0032 {
0033     count_vm_events(item, delta);
0034 }
0035 #else
0036 #define count_compact_event(item) do { } while (0)
0037 #define count_compact_events(item, delta) do { } while (0)
0038 #endif
0039 
0040 #if defined CONFIG_COMPACTION || defined CONFIG_CMA
0041 
0042 #define CREATE_TRACE_POINTS
0043 #include <trace/events/compaction.h>
0044 
0045 #define block_start_pfn(pfn, order) round_down(pfn, 1UL << (order))
0046 #define block_end_pfn(pfn, order)   ALIGN((pfn) + 1, 1UL << (order))
0047 #define pageblock_start_pfn(pfn)    block_start_pfn(pfn, pageblock_order)
0048 #define pageblock_end_pfn(pfn)      block_end_pfn(pfn, pageblock_order)
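
/*
 * For illustration, assuming pageblock_order == 9 (512 pages, the usual
 * value with 4KB base pages and 2MB huge pages): for pfn == 1000,
 * block_start_pfn(1000, 9) == round_down(1000, 512) == 512 and
 * block_end_pfn(1000, 9) == ALIGN(1001, 512) == 1024. The "+ 1" in
 * block_end_pfn() maps a pfn sitting exactly on a block boundary to the
 * next boundary, so the [start, end) range always covers the pfn itself.
 */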
0049 
0050 static unsigned long release_freepages(struct list_head *freelist)
0051 {
0052     struct page *page, *next;
0053     unsigned long high_pfn = 0;
0054 
0055     list_for_each_entry_safe(page, next, freelist, lru) {
0056         unsigned long pfn = page_to_pfn(page);
0057         list_del(&page->lru);
0058         __free_page(page);
0059         if (pfn > high_pfn)
0060             high_pfn = pfn;
0061     }
0062 
0063     return high_pfn;
0064 }
0065 
0066 static void map_pages(struct list_head *list)
0067 {
0068     unsigned int i, order, nr_pages;
0069     struct page *page, *next;
0070     LIST_HEAD(tmp_list);
0071 
0072     list_for_each_entry_safe(page, next, list, lru) {
0073         list_del(&page->lru);
0074 
0075         order = page_private(page);
0076         nr_pages = 1 << order;
0077 
0078         post_alloc_hook(page, order, __GFP_MOVABLE);
0079         if (order)
0080             split_page(page, order);
0081 
0082         for (i = 0; i < nr_pages; i++) {
0083             list_add(&page->lru, &tmp_list);
0084             page++;
0085         }
0086     }
0087 
0088     list_splice(&tmp_list, list);
0089 }
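
/*
 * Example of the round trip (illustrative): if isolate_freepages_block()
 * below isolates a single order-2 buddy page, it stores 2 in
 * page_private(); map_pages() then calls post_alloc_hook(page, 2,
 * __GFP_MOVABLE), split_page()s it and puts the resulting four order-0
 * pages back on the list, ready to be handed out one at a time by
 * compaction_alloc().
 */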
0090 
0091 static inline bool migrate_async_suitable(int migratetype)
0092 {
0093     return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE;
0094 }
0095 
0096 #ifdef CONFIG_COMPACTION
0097 
0098 int PageMovable(struct page *page)
0099 {
0100     struct address_space *mapping;
0101 
0102     VM_BUG_ON_PAGE(!PageLocked(page), page);
0103     if (!__PageMovable(page))
0104         return 0;
0105 
0106     mapping = page_mapping(page);
0107     if (mapping && mapping->a_ops && mapping->a_ops->isolate_page)
0108         return 1;
0109 
0110     return 0;
0111 }
0112 EXPORT_SYMBOL(PageMovable);
0113 
0114 void __SetPageMovable(struct page *page, struct address_space *mapping)
0115 {
0116     VM_BUG_ON_PAGE(!PageLocked(page), page);
0117     VM_BUG_ON_PAGE((unsigned long)mapping & PAGE_MAPPING_MOVABLE, page);
0118     page->mapping = (void *)((unsigned long)mapping | PAGE_MAPPING_MOVABLE);
0119 }
0120 EXPORT_SYMBOL(__SetPageMovable);
0121 
0122 void __ClearPageMovable(struct page *page)
0123 {
0124     VM_BUG_ON_PAGE(!PageLocked(page), page);
0125     VM_BUG_ON_PAGE(!PageMovable(page), page);
0126     /*
0127      * Clear the registered address_space value while keeping the
0128      * PAGE_MAPPING_MOVABLE flag, so that the VM can detect a page that the
0129      * driver released after isolation and will not try to put it back.
0130      */
0131     page->mapping = (void *)((unsigned long)page->mapping &
0132                 PAGE_MAPPING_MOVABLE);
0133 }
0134 EXPORT_SYMBOL(__ClearPageMovable);
0135 
0136 /* Do not skip compaction more than 64 times */
0137 #define COMPACT_MAX_DEFER_SHIFT 6
0138 
0139 /*
0140  * Compaction is deferred when compaction fails to result in a page
0141  * allocation success. 1 << compact_defer_shift compactions are then skipped,
0142  * with compact_defer_shift capped at COMPACT_MAX_DEFER_SHIFT.
0143  */
0144 void defer_compaction(struct zone *zone, int order)
0145 {
0146     zone->compact_considered = 0;
0147     zone->compact_defer_shift++;
0148 
0149     if (order < zone->compact_order_failed)
0150         zone->compact_order_failed = order;
0151 
0152     if (zone->compact_defer_shift > COMPACT_MAX_DEFER_SHIFT)
0153         zone->compact_defer_shift = COMPACT_MAX_DEFER_SHIFT;
0154 
0155     trace_mm_compaction_defer_compaction(zone, order);
0156 }
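
/*
 * Illustration of the back-off: after the first failure (assuming
 * compact_defer_shift started at 0) the shift becomes 1, so
 * compaction_deferred() below returns true until compact_considered
 * reaches 1 << 1 == 2; each further failure doubles the window
 * (2, 4, 8, ...), saturating at 1 << COMPACT_MAX_DEFER_SHIFT == 64
 * considered requests before compaction is retried.
 */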
0157 
0158 /* Returns true if compaction should be skipped this time */
0159 bool compaction_deferred(struct zone *zone, int order)
0160 {
0161     unsigned long defer_limit = 1UL << zone->compact_defer_shift;
0162 
0163     if (order < zone->compact_order_failed)
0164         return false;
0165 
0166     /* Avoid possible overflow */
0167     if (++zone->compact_considered > defer_limit)
0168         zone->compact_considered = defer_limit;
0169 
0170     if (zone->compact_considered >= defer_limit)
0171         return false;
0172 
0173     trace_mm_compaction_deferred(zone, order);
0174 
0175     return true;
0176 }
0177 
0178 /*
0179  * Update defer tracking counters after successful compaction of given order,
0180  * which means an allocation either succeeded (alloc_success == true) or is
0181  * expected to succeed.
0182  */
0183 void compaction_defer_reset(struct zone *zone, int order,
0184         bool alloc_success)
0185 {
0186     if (alloc_success) {
0187         zone->compact_considered = 0;
0188         zone->compact_defer_shift = 0;
0189     }
0190     if (order >= zone->compact_order_failed)
0191         zone->compact_order_failed = order + 1;
0192 
0193     trace_mm_compaction_defer_reset(zone, order);
0194 }
0195 
0196 /* Returns true if restarting compaction after many failures */
0197 bool compaction_restarting(struct zone *zone, int order)
0198 {
0199     if (order < zone->compact_order_failed)
0200         return false;
0201 
0202     return zone->compact_defer_shift == COMPACT_MAX_DEFER_SHIFT &&
0203         zone->compact_considered >= 1UL << zone->compact_defer_shift;
0204 }
0205 
0206 /* Returns true if the pageblock should be scanned for pages to isolate. */
0207 static inline bool isolation_suitable(struct compact_control *cc,
0208                     struct page *page)
0209 {
0210     if (cc->ignore_skip_hint)
0211         return true;
0212 
0213     return !get_pageblock_skip(page);
0214 }
0215 
0216 static void reset_cached_positions(struct zone *zone)
0217 {
0218     zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn;
0219     zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn;
0220     zone->compact_cached_free_pfn =
0221                 pageblock_start_pfn(zone_end_pfn(zone) - 1);
0222 }
0223 
0224 /*
0225  * This function is called to clear all cached information on pageblocks that
0226  * should be skipped for page isolation when the migrate and free page scanner
0227  * meet.
0228  */
0229 static void __reset_isolation_suitable(struct zone *zone)
0230 {
0231     unsigned long start_pfn = zone->zone_start_pfn;
0232     unsigned long end_pfn = zone_end_pfn(zone);
0233     unsigned long pfn;
0234 
0235     zone->compact_blockskip_flush = false;
0236 
0237     /* Walk the zone and mark every pageblock as suitable for isolation */
0238     for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
0239         struct page *page;
0240 
0241         cond_resched();
0242 
0243         if (!pfn_valid(pfn))
0244             continue;
0245 
0246         page = pfn_to_page(pfn);
0247         if (zone != page_zone(page))
0248             continue;
0249 
0250         clear_pageblock_skip(page);
0251     }
0252 
0253     reset_cached_positions(zone);
0254 }
0255 
0256 void reset_isolation_suitable(pg_data_t *pgdat)
0257 {
0258     int zoneid;
0259 
0260     for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
0261         struct zone *zone = &pgdat->node_zones[zoneid];
0262         if (!populated_zone(zone))
0263             continue;
0264 
0265         /* Only flush if a full compaction finished recently */
0266         if (zone->compact_blockskip_flush)
0267             __reset_isolation_suitable(zone);
0268     }
0269 }
0270 
0271 /*
0272  * If no pages were isolated then mark this pageblock to be skipped in the
0273  * future. The information is later cleared by __reset_isolation_suitable().
0274  */
0275 static void update_pageblock_skip(struct compact_control *cc,
0276             struct page *page, unsigned long nr_isolated,
0277             bool migrate_scanner)
0278 {
0279     struct zone *zone = cc->zone;
0280     unsigned long pfn;
0281 
0282     if (cc->ignore_skip_hint)
0283         return;
0284 
0285     if (!page)
0286         return;
0287 
0288     if (nr_isolated)
0289         return;
0290 
0291     set_pageblock_skip(page);
0292 
0293     pfn = page_to_pfn(page);
0294 
0295     /* Update where async and sync compaction should restart */
0296     if (migrate_scanner) {
0297         if (pfn > zone->compact_cached_migrate_pfn[0])
0298             zone->compact_cached_migrate_pfn[0] = pfn;
0299         if (cc->mode != MIGRATE_ASYNC &&
0300             pfn > zone->compact_cached_migrate_pfn[1])
0301             zone->compact_cached_migrate_pfn[1] = pfn;
0302     } else {
0303         if (pfn < zone->compact_cached_free_pfn)
0304             zone->compact_cached_free_pfn = pfn;
0305     }
0306 }
0307 #else
0308 static inline bool isolation_suitable(struct compact_control *cc,
0309                     struct page *page)
0310 {
0311     return true;
0312 }
0313 
0314 static void update_pageblock_skip(struct compact_control *cc,
0315             struct page *page, unsigned long nr_isolated,
0316             bool migrate_scanner)
0317 {
0318 }
0319 #endif /* CONFIG_COMPACTION */
0320 
0321 /*
0322  * Compaction requires the taking of some coarse locks that are potentially
0323  * very heavily contended. For async compaction, back out if the lock cannot
0324  * be taken immediately. For sync compaction, spin on the lock if needed.
0325  *
0326  * Returns true if the lock is held
0327  * Returns false if the lock is not held and compaction should abort
0328  */
0329 static bool compact_trylock_irqsave(spinlock_t *lock, unsigned long *flags,
0330                         struct compact_control *cc)
0331 {
0332     if (cc->mode == MIGRATE_ASYNC) {
0333         if (!spin_trylock_irqsave(lock, *flags)) {
0334             cc->contended = true;
0335             return false;
0336         }
0337     } else {
0338         spin_lock_irqsave(lock, *flags);
0339     }
0340 
0341     return true;
0342 }
0343 
0344 /*
0345  * Compaction requires the taking of some coarse locks that are potentially
0346  * very heavily contended. The lock should be periodically unlocked to avoid
0347  * having disabled IRQs for a long time, even when there is nobody waiting on
0348  * the lock. It might also be that allowing the IRQs will result in
0349  * need_resched() becoming true. If scheduling is needed, async compaction
0350  * aborts. Sync compaction schedules.
0351  * Either compaction type will also abort if a fatal signal is pending.
0352  * In either case if the lock was locked, it is dropped and not regained.
0353  *
0354  * Returns true if compaction should abort due to fatal signal pending, or
0355  *      async compaction due to need_resched()
0356  * Returns false when compaction can continue (sync compaction might have
0357  *      scheduled)
0358  */
0359 static bool compact_unlock_should_abort(spinlock_t *lock,
0360         unsigned long flags, bool *locked, struct compact_control *cc)
0361 {
0362     if (*locked) {
0363         spin_unlock_irqrestore(lock, flags);
0364         *locked = false;
0365     }
0366 
0367     if (fatal_signal_pending(current)) {
0368         cc->contended = true;
0369         return true;
0370     }
0371 
0372     if (need_resched()) {
0373         if (cc->mode == MIGRATE_ASYNC) {
0374             cc->contended = true;
0375             return true;
0376         }
0377         cond_resched();
0378     }
0379 
0380     return false;
0381 }
0382 
0383 /*
0384  * Aside from avoiding lock contention, compaction also periodically checks
0385  * need_resched() and either schedules in sync compaction or aborts async
0386  * compaction. This is similar to what compact_unlock_should_abort() does, but
0387  * is used where no lock is concerned.
0388  *
0389  * Returns false when no scheduling was needed, or sync compaction scheduled.
0390  * Returns true when async compaction should abort.
0391  */
0392 static inline bool compact_should_abort(struct compact_control *cc)
0393 {
0394     /* async compaction aborts if contended */
0395     if (need_resched()) {
0396         if (cc->mode == MIGRATE_ASYNC) {
0397             cc->contended = true;
0398             return true;
0399         }
0400 
0401         cond_resched();
0402     }
0403 
0404     return false;
0405 }
0406 
0407 /*
0408  * Isolate free pages onto a private freelist. If @strict is true, abort and
0409  * return 0 on any invalid PFN or non-free page inside the pageblock
0410  * (even though some pages may still end up isolated).
0411  */
0412 static unsigned long isolate_freepages_block(struct compact_control *cc,
0413                 unsigned long *start_pfn,
0414                 unsigned long end_pfn,
0415                 struct list_head *freelist,
0416                 bool strict)
0417 {
0418     int nr_scanned = 0, total_isolated = 0;
0419     struct page *cursor, *valid_page = NULL;
0420     unsigned long flags = 0;
0421     bool locked = false;
0422     unsigned long blockpfn = *start_pfn;
0423     unsigned int order;
0424 
0425     cursor = pfn_to_page(blockpfn);
0426 
0427     /* Isolate free pages. */
0428     for (; blockpfn < end_pfn; blockpfn++, cursor++) {
0429         int isolated;
0430         struct page *page = cursor;
0431 
0432         /*
0433          * Periodically drop the lock (if held) regardless of its
0434          * contention, to give IRQs a chance to run. Abort if a fatal
0435          * signal is pending or async compaction detects need_resched().
0436          */
0437         if (!(blockpfn % SWAP_CLUSTER_MAX)
0438             && compact_unlock_should_abort(&cc->zone->lock, flags,
0439                                 &locked, cc))
0440             break;
0441 
0442         nr_scanned++;
0443         if (!pfn_valid_within(blockpfn))
0444             goto isolate_fail;
0445 
0446         if (!valid_page)
0447             valid_page = page;
0448 
0449         /*
0450          * For compound pages such as THP and hugetlbfs, we can save
0451          * potentially a lot of iterations if we skip them at once.
0452          * The check is racy, but we can consider only valid values
0453          * and the only danger is skipping too much.
0454          */
0455         if (PageCompound(page)) {
0456             unsigned int comp_order = compound_order(page);
0457 
0458             if (likely(comp_order < MAX_ORDER)) {
0459                 blockpfn += (1UL << comp_order) - 1;
0460                 cursor += (1UL << comp_order) - 1;
0461             }
0462 
0463             goto isolate_fail;
0464         }
0465 
0466         if (!PageBuddy(page))
0467             goto isolate_fail;
0468 
0469         /*
0470          * If we already hold the lock, we can skip some rechecking.
0471          * Note that if we hold the lock now, checked_pageblock was
0472          * already set in some previous iteration (or strict is true),
0473          * so it is correct to skip the suitable migration target
0474          * recheck as well.
0475          */
0476         if (!locked) {
0477             /*
0478              * The zone lock must be held to isolate freepages.
0479              * Unfortunately this is a very coarse lock and can be
0480              * heavily contended if there are parallel allocations
0481              * or parallel compactions. For async compaction we do not
0482              * spin on the lock; in either case we acquire the lock as
0483              * late as possible.
0484              */
0485             locked = compact_trylock_irqsave(&cc->zone->lock,
0486                                 &flags, cc);
0487             if (!locked)
0488                 break;
0489 
0490             /* Recheck this is a buddy page under lock */
0491             if (!PageBuddy(page))
0492                 goto isolate_fail;
0493         }
0494 
0495         /* Found a free page, will break it into order-0 pages */
0496         order = page_order(page);
0497         isolated = __isolate_free_page(page, order);
0498         if (!isolated)
0499             break;
0500         set_page_private(page, order);
0501 
0502         total_isolated += isolated;
0503         cc->nr_freepages += isolated;
0504         list_add_tail(&page->lru, freelist);
0505 
0506         if (!strict && cc->nr_migratepages <= cc->nr_freepages) {
0507             blockpfn += isolated;
0508             break;
0509         }
0510         /* Advance to the end of split page */
0511         blockpfn += isolated - 1;
0512         cursor += isolated - 1;
0513         continue;
0514 
0515 isolate_fail:
0516         if (strict)
0517             break;
0518         else
0519             continue;
0520 
0521     }
0522 
0523     if (locked)
0524         spin_unlock_irqrestore(&cc->zone->lock, flags);
0525 
0526     /*
0527      * There is a tiny chance that we have read bogus compound_order(),
0528      * so be careful to not go outside of the pageblock.
0529      */
0530     if (unlikely(blockpfn > end_pfn))
0531         blockpfn = end_pfn;
0532 
0533     trace_mm_compaction_isolate_freepages(*start_pfn, blockpfn,
0534                     nr_scanned, total_isolated);
0535 
0536     /* Record how far we have got within the block */
0537     *start_pfn = blockpfn;
0538 
0539     /*
0540      * If strict isolation is requested by CMA then check that all the
0541      * pages requested were isolated. If there were any failures, 0 is
0542      * returned and CMA will fail.
0543      */
0544     if (strict && blockpfn < end_pfn)
0545         total_isolated = 0;
0546 
0547     /* Update the pageblock-skip if the whole pageblock was scanned */
0548     if (blockpfn == end_pfn)
0549         update_pageblock_skip(cc, valid_page, total_isolated, false);
0550 
0551     count_compact_events(COMPACTFREE_SCANNED, nr_scanned);
0552     if (total_isolated)
0553         count_compact_events(COMPACTISOLATED, total_isolated);
0554     return total_isolated;
0555 }
0556 
0557 /**
0558  * isolate_freepages_range() - isolate free pages.
0559  * @start_pfn: The first PFN to start isolating.
0560  * @end_pfn:   The one-past-last PFN.
0561  *
0562  * Non-free pages, invalid PFNs, or zone boundaries within the
0563  * [start_pfn, end_pfn) range are considered errors and cause the function
0564  * to undo its actions and return zero.
0565  *
0566  * Otherwise, the function returns the one-past-the-last PFN of the isolated
0567  * pages (which may be greater than end_pfn if the end fell in the middle of
0568  * a free page).
0569  */
0570 unsigned long
0571 isolate_freepages_range(struct compact_control *cc,
0572             unsigned long start_pfn, unsigned long end_pfn)
0573 {
0574     unsigned long isolated, pfn, block_start_pfn, block_end_pfn;
0575     LIST_HEAD(freelist);
0576 
0577     pfn = start_pfn;
0578     block_start_pfn = pageblock_start_pfn(pfn);
0579     if (block_start_pfn < cc->zone->zone_start_pfn)
0580         block_start_pfn = cc->zone->zone_start_pfn;
0581     block_end_pfn = pageblock_end_pfn(pfn);
0582 
0583     for (; pfn < end_pfn; pfn += isolated,
0584                 block_start_pfn = block_end_pfn,
0585                 block_end_pfn += pageblock_nr_pages) {
0586         /* Protect pfn from changing by isolate_freepages_block */
0587         unsigned long isolate_start_pfn = pfn;
0588 
0589         block_end_pfn = min(block_end_pfn, end_pfn);
0590 
0591         /*
0592          * pfn could pass block_end_pfn if the isolated free page
0593          * is larger than pageblock order. In this case, adjust the
0594          * scanning range to the correct pageblock.
0595          */
0596         if (pfn >= block_end_pfn) {
0597             block_start_pfn = pageblock_start_pfn(pfn);
0598             block_end_pfn = pageblock_end_pfn(pfn);
0599             block_end_pfn = min(block_end_pfn, end_pfn);
0600         }
0601 
0602         if (!pageblock_pfn_to_page(block_start_pfn,
0603                     block_end_pfn, cc->zone))
0604             break;
0605 
0606         isolated = isolate_freepages_block(cc, &isolate_start_pfn,
0607                         block_end_pfn, &freelist, true);
0608 
0609         /*
0610          * In strict mode, isolate_freepages_block() returns 0 if
0611          * there are any holes in the block (ie. invalid PFNs or
0612          * non-free pages).
0613          */
0614         if (!isolated)
0615             break;
0616 
0617         /*
0618          * If we managed to isolate pages, it is always (1 << n) *
0619          * pageblock_nr_pages for some non-negative n.  (Max order
0620          * page may span two pageblocks).
0621          */
0622     }
0623 
0624     /* __isolate_free_page() does not map the pages */
0625     map_pages(&freelist);
0626 
0627     if (pfn < end_pfn) {
0628         /* Loop terminated early, cleanup. */
0629         release_freepages(&freelist);
0630         return 0;
0631     }
0632 
0633     /* We don't use freelists for anything. */
0634     return pfn;
0635 }
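
/*
 * Sketch of the typical caller (the CMA path, assuming this kernel's
 * alloc_contig_range() in mm/page_alloc.c): the already-migrated range
 * is grabbed with something like
 *
 *	outer_end = isolate_freepages_range(&cc, outer_start, end);
 *	if (!outer_end)
 *		ret = -EBUSY;
 *
 * and any unused head/tail is later returned with free_contig_range().
 */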
0636 
0637 /* Similar to reclaim, but different enough that they don't share logic */
0638 static bool too_many_isolated(struct zone *zone)
0639 {
0640     unsigned long active, inactive, isolated;
0641 
0642     inactive = node_page_state(zone->zone_pgdat, NR_INACTIVE_FILE) +
0643             node_page_state(zone->zone_pgdat, NR_INACTIVE_ANON);
0644     active = node_page_state(zone->zone_pgdat, NR_ACTIVE_FILE) +
0645             node_page_state(zone->zone_pgdat, NR_ACTIVE_ANON);
0646     isolated = node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE) +
0647             node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON);
0648 
0649     return isolated > (inactive + active) / 2;
0650 }
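
/*
 * Worked example: with 100000 inactive and 300000 active LRU pages on the
 * node, too_many_isolated() returns true once more than
 * (100000 + 300000) / 2 == 200000 pages are currently isolated; the
 * migrate scanner then waits (or aborts, for async compaction) in
 * isolate_migratepages_block() below.
 */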
0651 
0652 /**
0653  * isolate_migratepages_block() - isolate all migrate-able pages within
0654  *                a single pageblock
0655  * @cc:     Compaction control structure.
0656  * @low_pfn:    The first PFN to isolate
0657  * @end_pfn:    The one-past-the-last PFN to isolate, within the same pageblock
0658  * @isolate_mode: Isolation mode to be used.
0659  *
0660  * Isolate all pages that can be migrated from the range specified by
0661  * [low_pfn, end_pfn). The range is expected to be within the same pageblock.
0662  * Returns zero if there is a fatal signal pending, otherwise the PFN of the
0663  * first page that was not scanned (which may be less than, equal to, or
0664  * greater than end_pfn).
0665  *
0666  * The pages are isolated on cc->migratepages list (not required to be empty),
0667  * and cc->nr_migratepages is updated accordingly. The cc->migrate_pfn field
0668  * is neither read nor updated.
0669  */
0670 static unsigned long
0671 isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
0672             unsigned long end_pfn, isolate_mode_t isolate_mode)
0673 {
0674     struct zone *zone = cc->zone;
0675     unsigned long nr_scanned = 0, nr_isolated = 0;
0676     struct lruvec *lruvec;
0677     unsigned long flags = 0;
0678     bool locked = false;
0679     struct page *page = NULL, *valid_page = NULL;
0680     unsigned long start_pfn = low_pfn;
0681     bool skip_on_failure = false;
0682     unsigned long next_skip_pfn = 0;
0683 
0684     /*
0685      * Ensure that there are not too many pages isolated from the LRU
0686      * list by either parallel reclaimers or compaction. If there are,
0687      * delay for some time until fewer pages are isolated
0688      */
0689     while (unlikely(too_many_isolated(zone))) {
0690         /* async migration should just abort */
0691         if (cc->mode == MIGRATE_ASYNC)
0692             return 0;
0693 
0694         congestion_wait(BLK_RW_ASYNC, HZ/10);
0695 
0696         if (fatal_signal_pending(current))
0697             return 0;
0698     }
0699 
0700     if (compact_should_abort(cc))
0701         return 0;
0702 
0703     if (cc->direct_compaction && (cc->mode == MIGRATE_ASYNC)) {
0704         skip_on_failure = true;
0705         next_skip_pfn = block_end_pfn(low_pfn, cc->order);
0706     }
0707 
0708     /* Time to isolate some pages for migration */
0709     for (; low_pfn < end_pfn; low_pfn++) {
0710 
0711         if (skip_on_failure && low_pfn >= next_skip_pfn) {
0712             /*
0713              * We have isolated all migration candidates in the
0714              * previous order-aligned block, and did not skip it due
0715              * to failure. We should migrate the pages now and
0716              * hopefully succeed compaction.
0717              */
0718             if (nr_isolated)
0719                 break;
0720 
0721             /*
0722              * We failed to isolate in the previous order-aligned
0723              * block. Set the new boundary to the end of the
0724              * current block. Note we can't simply increase
0725              * next_skip_pfn by 1 << order, as low_pfn might have
0726              * been incremented by a higher number due to skipping
0727              * a compound or a high-order buddy page in the
0728              * previous loop iteration.
0729              */
0730             next_skip_pfn = block_end_pfn(low_pfn, cc->order);
0731         }
0732 
0733         /*
0734          * Periodically drop the lock (if held) regardless of its
0735          * contention, to give IRQs a chance to run. Abort async
0736          * compaction if contended.
0737          */
0738         if (!(low_pfn % SWAP_CLUSTER_MAX)
0739             && compact_unlock_should_abort(zone_lru_lock(zone), flags,
0740                                 &locked, cc))
0741             break;
0742 
0743         if (!pfn_valid_within(low_pfn))
0744             goto isolate_fail;
0745         nr_scanned++;
0746 
0747         page = pfn_to_page(low_pfn);
0748 
0749         if (!valid_page)
0750             valid_page = page;
0751 
0752         /*
0753          * Skip if free. We read page order here without zone lock
0754          * which is generally unsafe, but the race window is small and
0755          * the worst thing that can happen is that we skip some
0756          * potential isolation targets.
0757          */
0758         if (PageBuddy(page)) {
0759             unsigned long freepage_order = page_order_unsafe(page);
0760 
0761             /*
0762              * Without lock, we cannot be sure that what we got is
0763              * a valid page order. Consider only values in the
0764              * valid order range to prevent low_pfn overflow.
0765              */
0766             if (freepage_order > 0 && freepage_order < MAX_ORDER)
0767                 low_pfn += (1UL << freepage_order) - 1;
0768             continue;
0769         }
0770 
0771         /*
0772          * Regardless of being on LRU, compound pages such as THP and
0773          * hugetlbfs are not to be compacted. We can potentially save
0774          * a lot of iterations if we skip them at once. The check is
0775          * racy, but we can consider only valid values and the only
0776          * danger is skipping too much.
0777          */
0778         if (PageCompound(page)) {
0779             unsigned int comp_order = compound_order(page);
0780 
0781             if (likely(comp_order < MAX_ORDER))
0782                 low_pfn += (1UL << comp_order) - 1;
0783 
0784             goto isolate_fail;
0785         }
0786 
0787         /*
0788          * Check may be lockless but that's ok as we recheck later.
0789          * It's possible to migrate LRU and non-lru movable pages.
0790          * Skip any other type of page
0791          */
0792         if (!PageLRU(page)) {
0793             /*
0794              * __PageMovable can return false positive so we need
0795              * to verify it under page_lock.
0796              */
0797             if (unlikely(__PageMovable(page)) &&
0798                     !PageIsolated(page)) {
0799                 if (locked) {
0800                     spin_unlock_irqrestore(zone_lru_lock(zone),
0801                                     flags);
0802                     locked = false;
0803                 }
0804 
0805                 if (isolate_movable_page(page, isolate_mode))
0806                     goto isolate_success;
0807             }
0808 
0809             goto isolate_fail;
0810         }
0811 
0812         /*
0813          * Migration will fail if an anonymous page is pinned in memory,
0814          * so avoid taking lru_lock and isolating it unnecessarily in an
0815          * admittedly racy check.
0816          */
0817         if (!page_mapping(page) &&
0818             page_count(page) > page_mapcount(page))
0819             goto isolate_fail;
0820 
0821         /*
0822          * Only allow to migrate anonymous pages in GFP_NOFS context
0823          * because those do not depend on fs locks.
0824          */
0825         if (!(cc->gfp_mask & __GFP_FS) && page_mapping(page))
0826             goto isolate_fail;
0827 
0828         /* If we already hold the lock, we can skip some rechecking */
0829         if (!locked) {
0830             locked = compact_trylock_irqsave(zone_lru_lock(zone),
0831                                 &flags, cc);
0832             if (!locked)
0833                 break;
0834 
0835             /* Recheck PageLRU and PageCompound under lock */
0836             if (!PageLRU(page))
0837                 goto isolate_fail;
0838 
0839             /*
0840              * The page became compound since the non-locked check,
0841              * and it's on the LRU. It can only be a THP so the order
0842              * is safe to read and it's 0 for tail pages.
0843              */
0844             if (unlikely(PageCompound(page))) {
0845                 low_pfn += (1UL << compound_order(page)) - 1;
0846                 goto isolate_fail;
0847             }
0848         }
0849 
0850         lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
0851 
0852         /* Try isolate the page */
0853         if (__isolate_lru_page(page, isolate_mode) != 0)
0854             goto isolate_fail;
0855 
0856         VM_BUG_ON_PAGE(PageCompound(page), page);
0857 
0858         /* Successfully isolated */
0859         del_page_from_lru_list(page, lruvec, page_lru(page));
0860         inc_node_page_state(page,
0861                 NR_ISOLATED_ANON + page_is_file_cache(page));
0862 
0863 isolate_success:
0864         list_add(&page->lru, &cc->migratepages);
0865         cc->nr_migratepages++;
0866         nr_isolated++;
0867 
0868         /*
0869          * Record where we could have freed pages by migration and not
0870          * yet flushed them to buddy allocator.
0871          * - this is the lowest pfn that was isolated and will likely
0872          * then be freed by migration.
0873          */
0874         if (!cc->last_migrated_pfn)
0875             cc->last_migrated_pfn = low_pfn;
0876 
0877         /* Avoid isolating too much */
0878         if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) {
0879             ++low_pfn;
0880             break;
0881         }
0882 
0883         continue;
0884 isolate_fail:
0885         if (!skip_on_failure)
0886             continue;
0887 
0888         /*
0889          * We have isolated some pages, but then failed. Release them
0890          * instead of migrating, as we cannot form the cc->order buddy
0891          * page anyway.
0892          */
0893         if (nr_isolated) {
0894             if (locked) {
0895                 spin_unlock_irqrestore(zone_lru_lock(zone), flags);
0896                 locked = false;
0897             }
0898             putback_movable_pages(&cc->migratepages);
0899             cc->nr_migratepages = 0;
0900             cc->last_migrated_pfn = 0;
0901             nr_isolated = 0;
0902         }
0903 
0904         if (low_pfn < next_skip_pfn) {
0905             low_pfn = next_skip_pfn - 1;
0906             /*
0907              * The check near the loop beginning would have updated
0908              * next_skip_pfn too, but this is a bit simpler.
0909              */
0910             next_skip_pfn += 1UL << cc->order;
0911         }
0912     }
0913 
0914     /*
0915      * The PageBuddy() check could have potentially brought us outside
0916      * the range to be scanned.
0917      */
0918     if (unlikely(low_pfn > end_pfn))
0919         low_pfn = end_pfn;
0920 
0921     if (locked)
0922         spin_unlock_irqrestore(zone_lru_lock(zone), flags);
0923 
0924     /*
0925      * Update the pageblock-skip information and cached scanner pfn,
0926      * if the whole pageblock was scanned without isolating any page.
0927      */
0928     if (low_pfn == end_pfn)
0929         update_pageblock_skip(cc, valid_page, nr_isolated, true);
0930 
0931     trace_mm_compaction_isolate_migratepages(start_pfn, low_pfn,
0932                         nr_scanned, nr_isolated);
0933 
0934     count_compact_events(COMPACTMIGRATE_SCANNED, nr_scanned);
0935     if (nr_isolated)
0936         count_compact_events(COMPACTISOLATED, nr_isolated);
0937 
0938     return low_pfn;
0939 }
0940 
0941 /**
0942  * isolate_migratepages_range() - isolate migrate-able pages in a PFN range
0943  * @cc:        Compaction control structure.
0944  * @start_pfn: The first PFN to start isolating.
0945  * @end_pfn:   The one-past-last PFN.
0946  *
0947  * Returns zero if isolation fails fatally due to e.g. pending signal.
0948  * Otherwise, function returns one-past-the-last PFN of isolated page
0949  * (which may be greater than end_pfn if the end fell in the middle of a THP page).
0950  */
0951 unsigned long
0952 isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
0953                             unsigned long end_pfn)
0954 {
0955     unsigned long pfn, block_start_pfn, block_end_pfn;
0956 
0957     /* Scan block by block. First and last block may be incomplete */
0958     pfn = start_pfn;
0959     block_start_pfn = pageblock_start_pfn(pfn);
0960     if (block_start_pfn < cc->zone->zone_start_pfn)
0961         block_start_pfn = cc->zone->zone_start_pfn;
0962     block_end_pfn = pageblock_end_pfn(pfn);
0963 
0964     for (; pfn < end_pfn; pfn = block_end_pfn,
0965                 block_start_pfn = block_end_pfn,
0966                 block_end_pfn += pageblock_nr_pages) {
0967 
0968         block_end_pfn = min(block_end_pfn, end_pfn);
0969 
0970         if (!pageblock_pfn_to_page(block_start_pfn,
0971                     block_end_pfn, cc->zone))
0972             continue;
0973 
0974         pfn = isolate_migratepages_block(cc, pfn, block_end_pfn,
0975                             ISOLATE_UNEVICTABLE);
0976 
0977         if (!pfn)
0978             break;
0979 
0980         if (cc->nr_migratepages == COMPACT_CLUSTER_MAX)
0981             break;
0982     }
0983 
0984     return pfn;
0985 }
0986 
0987 #endif /* CONFIG_COMPACTION || CONFIG_CMA */
0988 #ifdef CONFIG_COMPACTION
0989 
0990 /* Returns true if the page is within a block suitable for migrating pages to */
0991 static bool suitable_migration_target(struct compact_control *cc,
0992                             struct page *page)
0993 {
0994     if (cc->ignore_block_suitable)
0995         return true;
0996 
0997     /* If the page is a large free page, then disallow migration */
0998     if (PageBuddy(page)) {
0999         /*
1000          * We are checking page_order without zone->lock taken. But
1001          * the only small danger is that we skip a potentially suitable
1002          * pageblock, so it's not worth checking the order for a valid range.
1003          */
1004         if (page_order_unsafe(page) >= pageblock_order)
1005             return false;
1006     }
1007 
1008     /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
1009     if (migrate_async_suitable(get_pageblock_migratetype(page)))
1010         return true;
1011 
1012     /* Otherwise skip the block */
1013     return false;
1014 }
1015 
1016 /*
1017  * Test whether the free scanner has reached the same or lower pageblock than
1018  * the migration scanner, and compaction should thus terminate.
1019  */
1020 static inline bool compact_scanners_met(struct compact_control *cc)
1021 {
1022     return (cc->free_pfn >> pageblock_order)
1023         <= (cc->migrate_pfn >> pageblock_order);
1024 }
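
/*
 * Example, again assuming pageblock_order == 9: free_pfn == 0x40000 and
 * migrate_pfn == 0x401ff both shift down to pageblock 0x200, so the
 * scanners are considered to have met and this compaction run finishes.
 */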
1025 
1026 /*
1027  * Based on information in the current compact_control, find blocks
1028  * suitable for isolating free pages from and then isolate them.
1029  */
1030 static void isolate_freepages(struct compact_control *cc)
1031 {
1032     struct zone *zone = cc->zone;
1033     struct page *page;
1034     unsigned long block_start_pfn;  /* start of current pageblock */
1035     unsigned long isolate_start_pfn; /* exact pfn we start at */
1036     unsigned long block_end_pfn;    /* end of current pageblock */
1037     unsigned long low_pfn;       /* lowest pfn scanner is able to scan */
1038     struct list_head *freelist = &cc->freepages;
1039 
1040     /*
1041      * Initialise the free scanner. The starting point is where we last
1042      * successfully isolated from, zone-cached value, or the end of the
1043      * zone when isolating for the first time. For looping we also need
1044      * this pfn aligned down to the pageblock boundary, because we do
1045      * block_start_pfn -= pageblock_nr_pages in the for loop.
1046      * For the ending point, take care when isolating in the last
1047      * pageblock of a zone which ends in the middle of a pageblock.
1048      * The low boundary is the end of the pageblock the migration scanner
1049      * is using.
1050      */
1051     isolate_start_pfn = cc->free_pfn;
1052     block_start_pfn = pageblock_start_pfn(cc->free_pfn);
1053     block_end_pfn = min(block_start_pfn + pageblock_nr_pages,
1054                         zone_end_pfn(zone));
1055     low_pfn = pageblock_end_pfn(cc->migrate_pfn);
1056 
1057     /*
1058      * Isolate free pages until enough are available to migrate the
1059      * pages on cc->migratepages. We stop searching if the migrate
1060      * and free page scanners meet or enough free pages are isolated.
1061      */
1062     for (; block_start_pfn >= low_pfn;
1063                 block_end_pfn = block_start_pfn,
1064                 block_start_pfn -= pageblock_nr_pages,
1065                 isolate_start_pfn = block_start_pfn) {
1066         /*
1067          * This can iterate a massively long zone without finding any
1068          * suitable migration targets, so periodically check if we need
1069          * to schedule, or even abort async compaction.
1070          */
1071         if (!(block_start_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))
1072                         && compact_should_abort(cc))
1073             break;
1074 
1075         page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn,
1076                                     zone);
1077         if (!page)
1078             continue;
1079 
1080         /* Check the block is suitable for migration */
1081         if (!suitable_migration_target(cc, page))
1082             continue;
1083 
1084         /* If isolation recently failed, do not retry */
1085         if (!isolation_suitable(cc, page))
1086             continue;
1087 
1088         /* Found a block suitable for isolating free pages from. */
1089         isolate_freepages_block(cc, &isolate_start_pfn, block_end_pfn,
1090                     freelist, false);
1091 
1092         /*
1093          * If we isolated enough freepages, or aborted due to lock
1094          * contention, terminate.
1095          */
1096         if ((cc->nr_freepages >= cc->nr_migratepages)
1097                             || cc->contended) {
1098             if (isolate_start_pfn >= block_end_pfn) {
1099                 /*
1100                  * Restart at previous pageblock if more
1101                  * freepages can be isolated next time.
1102                  */
1103                 isolate_start_pfn =
1104                     block_start_pfn - pageblock_nr_pages;
1105             }
1106             break;
1107         } else if (isolate_start_pfn < block_end_pfn) {
1108             /*
1109              * If isolation failed early, do not continue
1110              * needlessly.
1111              */
1112             break;
1113         }
1114     }
1115 
1116     /* __isolate_free_page() does not map the pages */
1117     map_pages(freelist);
1118 
1119     /*
1120      * Record where the free scanner will restart next time. Either we
1121      * broke from the loop and set isolate_start_pfn based on the last
1122      * call to isolate_freepages_block(), or we met the migration scanner
1123      * and the loop terminated due to isolate_start_pfn < low_pfn
1124      */
1125     cc->free_pfn = isolate_start_pfn;
1126 }
1127 
1128 /*
1129  * This is a migrate-callback that "allocates" freepages by taking pages
1130  * from the isolated freelists in the block we are migrating to.
1131  */
1132 static struct page *compaction_alloc(struct page *migratepage,
1133                     unsigned long data,
1134                     int **result)
1135 {
1136     struct compact_control *cc = (struct compact_control *)data;
1137     struct page *freepage;
1138 
1139     /*
1140      * Isolate free pages if necessary, and if we are not aborting due to
1141      * contention.
1142      */
1143     if (list_empty(&cc->freepages)) {
1144         if (!cc->contended)
1145             isolate_freepages(cc);
1146 
1147         if (list_empty(&cc->freepages))
1148             return NULL;
1149     }
1150 
1151     freepage = list_entry(cc->freepages.next, struct page, lru);
1152     list_del(&freepage->lru);
1153     cc->nr_freepages--;
1154 
1155     return freepage;
1156 }
1157 
1158 /*
1159  * This is a migrate-callback that "frees" freepages back to the isolated
1160  * freelist.  All pages on the freelist are from the same zone, so there is no
1161  * special handling needed for NUMA.
1162  */
1163 static void compaction_free(struct page *page, unsigned long data)
1164 {
1165     struct compact_control *cc = (struct compact_control *)data;
1166 
1167     list_add(&page->lru, &cc->freepages);
1168     cc->nr_freepages++;
1169 }
1170 
1171 /* possible outcome of isolate_migratepages */
1172 typedef enum {
1173     ISOLATE_ABORT,      /* Abort compaction now */
1174     ISOLATE_NONE,       /* No pages isolated, continue scanning */
1175     ISOLATE_SUCCESS,    /* Pages isolated, migrate */
1176 } isolate_migrate_t;
1177 
1178 /*
1179  * Allow userspace to control policy on scanning the unevictable LRU for
1180  * compactable pages.
1181  */
1182 int sysctl_compact_unevictable_allowed __read_mostly = 1;
1183 
1184 /*
1185  * Isolate all pages that can be migrated from the first suitable block,
1186  * starting at the block pointed to by the migrate scanner pfn within
1187  * compact_control.
1188  */
1189 static isolate_migrate_t isolate_migratepages(struct zone *zone,
1190                     struct compact_control *cc)
1191 {
1192     unsigned long block_start_pfn;
1193     unsigned long block_end_pfn;
1194     unsigned long low_pfn;
1195     struct page *page;
1196     const isolate_mode_t isolate_mode =
1197         (sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) |
1198         (cc->mode != MIGRATE_SYNC ? ISOLATE_ASYNC_MIGRATE : 0);
1199 
1200     /*
1201      * Start at where we last stopped, or beginning of the zone as
1202      * initialized by compact_zone()
1203      */
1204     low_pfn = cc->migrate_pfn;
1205     block_start_pfn = pageblock_start_pfn(low_pfn);
1206     if (block_start_pfn < zone->zone_start_pfn)
1207         block_start_pfn = zone->zone_start_pfn;
1208 
1209     /* Only scan within a pageblock boundary */
1210     block_end_pfn = pageblock_end_pfn(low_pfn);
1211 
1212     /*
1213      * Iterate over whole pageblocks until we find the first suitable.
1214      * Do not cross the free scanner.
1215      */
1216     for (; block_end_pfn <= cc->free_pfn;
1217             low_pfn = block_end_pfn,
1218             block_start_pfn = block_end_pfn,
1219             block_end_pfn += pageblock_nr_pages) {
1220 
1221         /*
1222          * This can potentially iterate a massively long zone with
1223          * many pageblocks unsuitable, so periodically check if we
1224          * need to schedule, or even abort async compaction.
1225          */
1226         if (!(low_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))
1227                         && compact_should_abort(cc))
1228             break;
1229 
1230         page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn,
1231                                     zone);
1232         if (!page)
1233             continue;
1234 
1235         /* If isolation recently failed, do not retry */
1236         if (!isolation_suitable(cc, page))
1237             continue;
1238 
1239         /*
1240          * For async compaction, also only scan in MOVABLE blocks.
1241          * Async compaction is optimistic to see if the minimum amount
1242          * of work satisfies the allocation.
1243          */
1244         if (cc->mode == MIGRATE_ASYNC &&
1245             !migrate_async_suitable(get_pageblock_migratetype(page)))
1246             continue;
1247 
1248         /* Perform the isolation */
1249         low_pfn = isolate_migratepages_block(cc, low_pfn,
1250                         block_end_pfn, isolate_mode);
1251 
1252         if (!low_pfn || cc->contended)
1253             return ISOLATE_ABORT;
1254 
1255         /*
1256          * Either we isolated something and proceed with migration, or
1257          * we failed and compact_zone() should decide whether to
1258          * continue or not.
1259          */
1260         break;
1261     }
1262 
1263     /* Record where migration scanner will be restarted. */
1264     cc->migrate_pfn = low_pfn;
1265 
1266     return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE;
1267 }
1268 
1269 /*
1270  * order == -1 is expected when compacting via
1271  * /proc/sys/vm/compact_memory
1272  */
1273 static inline bool is_via_compact_memory(int order)
1274 {
1275     return order == -1;
1276 }
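
/*
 * Usage example: "echo 1 > /proc/sys/vm/compact_memory" compacts every
 * populated zone with cc->order == -1 (via the compact_memory sysctl
 * handler further down in this file); __compact_finished() and
 * __compaction_suitable() below then return COMPACT_CONTINUE regardless
 * of watermarks, so the whole zone is compacted.
 */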
1277 
1278 static enum compact_result __compact_finished(struct zone *zone, struct compact_control *cc,
1279                 const int migratetype)
1280 {
1281     unsigned int order;
1282     unsigned long watermark;
1283 
1284     if (cc->contended || fatal_signal_pending(current))
1285         return COMPACT_CONTENDED;
1286 
1287     /* Compaction run completes if the migrate and free scanner meet */
1288     if (compact_scanners_met(cc)) {
1289         /* Let the next compaction start anew. */
1290         reset_cached_positions(zone);
1291 
1292         /*
1293          * Mark that the PG_migrate_skip information should be cleared
1294          * by kswapd when it goes to sleep. kcompactd does not set the
1295          * flag itself as the decision to clear it should be directly
1296          * based on an allocation request.
1297          */
1298         if (cc->direct_compaction)
1299             zone->compact_blockskip_flush = true;
1300 
1301         if (cc->whole_zone)
1302             return COMPACT_COMPLETE;
1303         else
1304             return COMPACT_PARTIAL_SKIPPED;
1305     }
1306 
1307     if (is_via_compact_memory(cc->order))
1308         return COMPACT_CONTINUE;
1309 
1310     /* Compaction run is not finished if the watermark is not met */
1311     watermark = zone->watermark[cc->alloc_flags & ALLOC_WMARK_MASK];
1312 
1313     if (!zone_watermark_ok(zone, cc->order, watermark, cc->classzone_idx,
1314                             cc->alloc_flags))
1315         return COMPACT_CONTINUE;
1316 
1317     /* Direct compactor: Is a suitable page free? */
1318     for (order = cc->order; order < MAX_ORDER; order++) {
1319         struct free_area *area = &zone->free_area[order];
1320         bool can_steal;
1321 
1322         /* Job done if page is free of the right migratetype */
1323         if (!list_empty(&area->free_list[migratetype]))
1324             return COMPACT_SUCCESS;
1325 
1326 #ifdef CONFIG_CMA
1327         /* MIGRATE_MOVABLE can fallback on MIGRATE_CMA */
1328         if (migratetype == MIGRATE_MOVABLE &&
1329             !list_empty(&area->free_list[MIGRATE_CMA]))
1330             return COMPACT_SUCCESS;
1331 #endif
1332         /*
1333          * Job done if allocation would steal freepages from
1334          * other migratetype buddy lists.
1335          */
1336         if (find_suitable_fallback(area, order, migratetype,
1337                         true, &can_steal) != -1)
1338             return COMPACT_SUCCESS;
1339     }
1340 
1341     return COMPACT_NO_SUITABLE_PAGE;
1342 }
1343 
1344 static enum compact_result compact_finished(struct zone *zone,
1345             struct compact_control *cc,
1346             const int migratetype)
1347 {
1348     int ret;
1349 
1350     ret = __compact_finished(zone, cc, migratetype);
1351     trace_mm_compaction_finished(zone, cc->order, ret);
1352     if (ret == COMPACT_NO_SUITABLE_PAGE)
1353         ret = COMPACT_CONTINUE;
1354 
1355     return ret;
1356 }
1357 
1358 /*
1359  * compaction_suitable: Is this suitable to run compaction on this zone now?
1360  * Returns
1361  *   COMPACT_SKIPPED  - If there are too few free pages for compaction
1362  *   COMPACT_SUCCESS  - If the allocation would succeed without compaction
1363  *   COMPACT_CONTINUE - If compaction should run now
1364  */
1365 static enum compact_result __compaction_suitable(struct zone *zone, int order,
1366                     unsigned int alloc_flags,
1367                     int classzone_idx,
1368                     unsigned long wmark_target)
1369 {
1370     unsigned long watermark;
1371 
1372     if (is_via_compact_memory(order))
1373         return COMPACT_CONTINUE;
1374 
1375     watermark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
1376     /*
1377      * If watermarks for high-order allocation are already met, there
1378      * should be no need for compaction at all.
1379      */
1380     if (zone_watermark_ok(zone, order, watermark, classzone_idx,
1381                                 alloc_flags))
1382         return COMPACT_SUCCESS;
1383 
1384     /*
1385      * Watermarks for order-0 must be met for compaction to be able to
1386      * isolate free pages for migration targets. This means that the
1387      * watermark and alloc_flags have to match, or be more pessimistic than
1388      * the check in __isolate_free_page(). We don't use the direct
1389      * compactor's alloc_flags, as they are not relevant for freepage
1390      * isolation. We however do use the direct compactor's classzone_idx to
1391      * skip over zones where lowmem reserves would prevent allocation even
1392      * if compaction succeeds.
1393      * For costly orders, we require low watermark instead of min for
1394      * compaction to proceed to increase its chances.
1395      * ALLOC_CMA is used, as pages in CMA pageblocks are considered
1396      * suitable migration targets
1397      */
1398     watermark = (order > PAGE_ALLOC_COSTLY_ORDER) ?
1399                 low_wmark_pages(zone) : min_wmark_pages(zone);
1400     watermark += compact_gap(order);
1401     if (!__zone_watermark_ok(zone, 0, watermark, classzone_idx,
1402                         ALLOC_CMA, wmark_target))
1403         return COMPACT_SKIPPED;
1404 
1405     return COMPACT_CONTINUE;
1406 }
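
/*
 * Worked example: for an order-3 request (not above PAGE_ALLOC_COSTLY_ORDER,
 * which is 3), the zone must have min_wmark_pages(zone) + compact_gap(3)
 * free base pages, where compact_gap() (mm/internal.h) adds roughly twice
 * the request size (16 pages here) as room for the freepages the free
 * scanner must isolate; an order-4 request counts as costly and is checked
 * against the low watermark instead.
 */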
1407 
1408 enum compact_result compaction_suitable(struct zone *zone, int order,
1409                     unsigned int alloc_flags,
1410                     int classzone_idx)
1411 {
1412     enum compact_result ret;
1413     int fragindex;
1414 
1415     ret = __compaction_suitable(zone, order, alloc_flags, classzone_idx,
1416                     zone_page_state(zone, NR_FREE_PAGES));
1417     /*
1418      * fragmentation index determines if allocation failures are due to
1419      * low memory or external fragmentation
1420      *
1421      * index of -1000 would imply allocations might succeed depending on
1422      * watermarks, but we already failed the high-order watermark check
1423      * index towards 0 implies failure is due to lack of memory
1424      * index towards 1000 implies failure is due to fragmentation
1425      *
1426      * Only compact if a failure would be due to fragmentation. Also
1427      * ignore fragindex for non-costly orders where the alternative to
1428      * a successful reclaim/compaction is OOM. Fragindex and the
1429      * vm.extfrag_threshold sysctl is meant as a heuristic to prevent
1430      * excessive compaction for costly orders, but it should not be at the
1431      * expense of system stability.
1432      */
1433     if (ret == COMPACT_CONTINUE && (order > PAGE_ALLOC_COSTLY_ORDER)) {
1434         fragindex = fragmentation_index(zone, order);
1435         if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
1436             ret = COMPACT_NOT_SUITABLE_ZONE;
1437     }
1438 
1439     trace_mm_compaction_suitable(zone, order, ret);
1440     if (ret == COMPACT_NOT_SUITABLE_ZONE)
1441         ret = COMPACT_SKIPPED;
1442 
1443     return ret;
1444 }
1445 
1446 bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
1447         int alloc_flags)
1448 {
1449     struct zone *zone;
1450     struct zoneref *z;
1451 
1452     /*
1453      * Make sure at least one zone would pass __compaction_suitable if we continue
1454      * retrying the reclaim.
1455      */
1456     for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
1457                     ac->nodemask) {
1458         unsigned long available;
1459         enum compact_result compact_result;
1460 
1461         /*
1462          * Do not consider all the reclaimable memory because we do not
1463          * want to thrash just for a single high order allocation which
1464          * is not guaranteed to appear even if __compaction_suitable
1465          * is happy about the watermark check.
1466          */
1467         available = zone_reclaimable_pages(zone) / order;
1468         available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
1469         compact_result = __compaction_suitable(zone, order, alloc_flags,
1470                 ac_classzone_idx(ac), available);
1471         if (compact_result != COMPACT_SKIPPED)
1472             return true;
1473     }
1474 
1475     return false;
1476 }
1477 
1478 static enum compact_result compact_zone(struct zone *zone, struct compact_control *cc)
1479 {
1480     enum compact_result ret;
1481     unsigned long start_pfn = zone->zone_start_pfn;
1482     unsigned long end_pfn = zone_end_pfn(zone);
1483     const int migratetype = gfpflags_to_migratetype(cc->gfp_mask);
1484     const bool sync = cc->mode != MIGRATE_ASYNC;
1485 
1486     ret = compaction_suitable(zone, cc->order, cc->alloc_flags,
1487                             cc->classzone_idx);
1488     /* Compaction is likely to fail */
1489     if (ret == COMPACT_SUCCESS || ret == COMPACT_SKIPPED)
1490         return ret;
1491 
1492     /* huh, compaction_suitable is returning something unexpected */
1493     VM_BUG_ON(ret != COMPACT_CONTINUE);
1494 
1495     /*
1496      * Clear pageblock skip if there were failures recently and compaction
1497      * is about to be retried after being deferred.
1498      */
1499     if (compaction_restarting(zone, cc->order))
1500         __reset_isolation_suitable(zone);
1501 
1502     /*
1503      * Set up to move all movable pages to the end of the zone. Use cached
1504      * information on where the scanners should start (unless we explicitly
1505      * want to compact the whole zone), but check that it is initialised
1506      * by ensuring the values are within zone boundaries.
1507      */
1508     if (cc->whole_zone) {
1509         cc->migrate_pfn = start_pfn;
1510         cc->free_pfn = pageblock_start_pfn(end_pfn - 1);
1511     } else {
1512         cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync];
1513         cc->free_pfn = zone->compact_cached_free_pfn;
1514         if (cc->free_pfn < start_pfn || cc->free_pfn >= end_pfn) {
1515             cc->free_pfn = pageblock_start_pfn(end_pfn - 1);
1516             zone->compact_cached_free_pfn = cc->free_pfn;
1517         }
1518         if (cc->migrate_pfn < start_pfn || cc->migrate_pfn >= end_pfn) {
1519             cc->migrate_pfn = start_pfn;
1520             zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
1521             zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
1522         }
1523 
1524         if (cc->migrate_pfn == start_pfn)
1525             cc->whole_zone = true;
1526     }
1527 
1528     cc->last_migrated_pfn = 0;
1529 
1530     trace_mm_compaction_begin(start_pfn, cc->migrate_pfn,
1531                 cc->free_pfn, end_pfn, sync);
1532 
1533     migrate_prep_local();
1534 
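	/*
	 * The loop below is the classic two-scanner scheme: the migration
	 * scanner (cc->migrate_pfn) walks forward from the start of the
	 * zone isolating movable pages, while the free scanner
	 * (cc->free_pfn) walks backwards from the end of the zone
	 * collecting free pages as migration targets. compact_finished()
	 * ends the loop once the two scanners meet or a suitable page of
	 * cc->order becomes available.
	 */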
1535     while ((ret = compact_finished(zone, cc, migratetype)) ==
1536                         COMPACT_CONTINUE) {
1537         int err;
1538 
1539         switch (isolate_migratepages(zone, cc)) {
1540         case ISOLATE_ABORT:
1541             ret = COMPACT_CONTENDED;
1542             putback_movable_pages(&cc->migratepages);
1543             cc->nr_migratepages = 0;
1544             goto out;
1545         case ISOLATE_NONE:
1546             /*
1547              * We haven't isolated and migrated anything, but
1548              * there might still be unflushed migrations from
1549              * the previous cc->order aligned block.
1550              */
1551             goto check_drain;
1552         case ISOLATE_SUCCESS:
1553             ;
1554         }
1555 
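		/*
		 * migrate_pages() consumes the pages isolated onto
		 * cc->migratepages, using compaction_alloc() and
		 * compaction_free() (defined earlier in this file) as the
		 * target-page allocation and release callbacks, so the
		 * destination pages come from the free scanner's
		 * cc->freepages list.
		 */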
1556         err = migrate_pages(&cc->migratepages, compaction_alloc,
1557                 compaction_free, (unsigned long)cc, cc->mode,
1558                 MR_COMPACTION);
1559 
1560         trace_mm_compaction_migratepages(cc->nr_migratepages, err,
1561                             &cc->migratepages);
1562 
1563         /* All pages were either migrated or will be released */
1564         cc->nr_migratepages = 0;
1565         if (err) {
1566             putback_movable_pages(&cc->migratepages);
1567             /*
1568              * migrate_pages() may return -ENOMEM when scanners meet
1569              * and we want compact_finished() to detect it
1570              */
1571             if (err == -ENOMEM && !compact_scanners_met(cc)) {
1572                 ret = COMPACT_CONTENDED;
1573                 goto out;
1574             }
1575             /*
1576              * We failed to migrate at least one page in the current
1577              * order-aligned block, so skip the rest of it.
1578              */
1579             if (cc->direct_compaction &&
1580                         (cc->mode == MIGRATE_ASYNC)) {
1581                 cc->migrate_pfn = block_end_pfn(
1582                         cc->migrate_pfn - 1, cc->order);
1583                 /* Draining pcplists is useless in this case */
1584                 cc->last_migrated_pfn = 0;
1585 
1586             }
1587         }
1588 
1589 check_drain:
1590         /*
1591          * Has the migration scanner moved away from the previous
1592          * cc->order aligned block where we migrated from? If yes,
1593          * flush the pages that were freed, so that they can merge and
1594          * compact_finished() can detect immediately if allocation
1595          * would succeed.
1596          */
1597         if (cc->order > 0 && cc->last_migrated_pfn) {
1598             int cpu;
1599             unsigned long current_block_start =
1600                 block_start_pfn(cc->migrate_pfn, cc->order);
1601 
1602             if (cc->last_migrated_pfn < current_block_start) {
1603                 cpu = get_cpu();
1604                 lru_add_drain_cpu(cpu);
1605                 drain_local_pages(zone);
1606                 put_cpu();
1607                 /* No more flushing until we migrate again */
1608                 cc->last_migrated_pfn = 0;
1609             }
1610         }
1611 
1612     }
1613 
1614 out:
1615     /*
1616      * Release free pages and update where the free scanner should restart,
1617      * so we don't leave any returned pages behind in the next attempt.
1618      */
1619     if (cc->nr_freepages > 0) {
1620         unsigned long free_pfn = release_freepages(&cc->freepages);
1621 
1622         cc->nr_freepages = 0;
1623         VM_BUG_ON(free_pfn == 0);
1624         /* The cached pfn is always the first in a pageblock */
1625         free_pfn = pageblock_start_pfn(free_pfn);
1626         /*
1627          * Only go back, not forward. The cached pfn might already have
1628          * been reset to the zone end in compact_finished().
1629          */
1630         if (free_pfn > zone->compact_cached_free_pfn)
1631             zone->compact_cached_free_pfn = free_pfn;
1632     }
1633 
1634     trace_mm_compaction_end(start_pfn, cc->migrate_pfn,
1635                 cc->free_pfn, end_pfn, sync, ret);
1636 
1637     return ret;
1638 }
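/*
 * Roughly, the callers below interpret compact_zone()'s return value as:
 *   COMPACT_SUCCESS         - a page of cc->order is (or already was)
 *                             expected to be allocatable
 *   COMPACT_COMPLETE        - the whole zone was scanned without success
 *   COMPACT_PARTIAL_SKIPPED - only part of the zone was scanned (a direct
 *                             compaction that did not start at the zone
 *                             boundaries), so a later full scan may still help
 *   COMPACT_CONTENDED       - aborted early due to contention, need_resched()
 *                             or a fatal signal
 *   COMPACT_SKIPPED         - not attempted, e.g. too few free base pages or
 *                             the fragmentation index suggests reclaim would
 *                             help more
 */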
1639 
1640 static enum compact_result compact_zone_order(struct zone *zone, int order,
1641         gfp_t gfp_mask, enum compact_priority prio,
1642         unsigned int alloc_flags, int classzone_idx)
1643 {
1644     enum compact_result ret;
1645     struct compact_control cc = {
1646         .nr_freepages = 0,
1647         .nr_migratepages = 0,
1648         .order = order,
1649         .gfp_mask = gfp_mask,
1650         .zone = zone,
1651         .mode = (prio == COMPACT_PRIO_ASYNC) ?
1652                     MIGRATE_ASYNC : MIGRATE_SYNC_LIGHT,
1653         .alloc_flags = alloc_flags,
1654         .classzone_idx = classzone_idx,
1655         .direct_compaction = true,
1656         .whole_zone = (prio == MIN_COMPACT_PRIORITY),
1657         .ignore_skip_hint = (prio == MIN_COMPACT_PRIORITY),
1658         .ignore_block_suitable = (prio == MIN_COMPACT_PRIORITY)
1659     };
1660     INIT_LIST_HEAD(&cc.freepages);
1661     INIT_LIST_HEAD(&cc.migratepages);
1662 
1663     ret = compact_zone(zone, &cc);
1664 
1665     VM_BUG_ON(!list_empty(&cc.freepages));
1666     VM_BUG_ON(!list_empty(&cc.migratepages));
1667 
1668     return ret;
1669 }
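/*
 * The priority is what varies between the allocator's successive attempts:
 * COMPACT_PRIO_ASYNC selects MIGRATE_ASYNC mode while the other priorities
 * use MIGRATE_SYNC_LIGHT, and only the lowest priority
 * (MIN_COMPACT_PRIORITY) sets whole_zone, ignore_skip_hint and
 * ignore_block_suitable, turning the last-resort attempt into a full-zone
 * scan with as few shortcuts as possible.
 */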
1670 
1671 int sysctl_extfrag_threshold = 500;
1672 
1673 /**
1674  * try_to_compact_pages - Direct compact to satisfy a high-order allocation
1675  * @gfp_mask: The GFP mask of the current allocation
1676  * @order: The order of the current allocation
1677  * @alloc_flags: The allocation flags of the current allocation
1678  * @ac: The context of current allocation
1679  * @prio: Determines how hard direct compaction should try to succeed
1680  *
1681  * This is the main entry point for direct page compaction.
1682  */
1683 enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
1684         unsigned int alloc_flags, const struct alloc_context *ac,
1685         enum compact_priority prio)
1686 {
1687     int may_perform_io = gfp_mask & __GFP_IO;
1688     struct zoneref *z;
1689     struct zone *zone;
1690     enum compact_result rc = COMPACT_SKIPPED;
1691 
1692     /*
1693      * Check if the GFP flags allow compaction - GFP_NOIO is a really
1694      * tricky context because migration might require IO.
1695      */
1696     if (!may_perform_io)
1697         return COMPACT_SKIPPED;
1698 
1699     trace_mm_compaction_try_to_compact_pages(order, gfp_mask, prio);
1700 
1701     /* Compact each zone in the list */
1702     for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
1703                                 ac->nodemask) {
1704         enum compact_result status;
1705 
1706         if (prio > MIN_COMPACT_PRIORITY
1707                     && compaction_deferred(zone, order)) {
1708             rc = max_t(enum compact_result, COMPACT_DEFERRED, rc);
1709             continue;
1710         }
1711 
1712         status = compact_zone_order(zone, order, gfp_mask, prio,
1713                     alloc_flags, ac_classzone_idx(ac));
1714         rc = max(status, rc);
1715 
1716         /* The allocation should succeed, stop compacting */
1717         if (status == COMPACT_SUCCESS) {
1718             /*
1719              * We think the allocation will succeed in this zone,
1720              * but it is not certain, hence the false. The caller
1721              * will repeat this with true if allocation indeed
1722              * succeeds in this zone.
1723              */
1724             compaction_defer_reset(zone, order, false);
1725 
1726             break;
1727         }
1728 
1729         if (prio != COMPACT_PRIO_ASYNC && (status == COMPACT_COMPLETE ||
1730                     status == COMPACT_PARTIAL_SKIPPED))
1731             /*
1732              * We think that allocation won't succeed in this zone
1733              * so we defer compaction there. If it ends up
1734              * succeeding after all, it will be reset.
1735              */
1736             defer_compaction(zone, order);
1737 
1738         /*
1739          * We might have stopped compacting due to need_resched() in
1740          * async compaction, or because a fatal signal was detected. In that
1741          * case, do not try further zones.
1742          */
1743         if ((prio == COMPACT_PRIO_ASYNC && need_resched())
1744                     || fatal_signal_pending(current))
1745             break;
1746     }
1747 
1748     return rc;
1749 }
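/*
 * In this kernel generation try_to_compact_pages() is reached only from the
 * allocator slow path (__alloc_pages_direct_compact() in mm/page_alloc.c),
 * which is why the deferral bookkeeping above can assume a direct,
 * allocation-driven context.
 */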
1750 
1751 
1752 /* Compact all zones within a node */
1753 static void compact_node(int nid)
1754 {
1755     pg_data_t *pgdat = NODE_DATA(nid);
1756     int zoneid;
1757     struct zone *zone;
1758     struct compact_control cc = {
1759         .order = -1,
1760         .mode = MIGRATE_SYNC,
1761         .ignore_skip_hint = true,
1762         .whole_zone = true,
1763         .gfp_mask = GFP_KERNEL,
1764     };
1765 
1766 
1767     for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
1768 
1769         zone = &pgdat->node_zones[zoneid];
1770         if (!populated_zone(zone))
1771             continue;
1772 
1773         cc.nr_freepages = 0;
1774         cc.nr_migratepages = 0;
1775         cc.zone = zone;
1776         INIT_LIST_HEAD(&cc.freepages);
1777         INIT_LIST_HEAD(&cc.migratepages);
1778 
1779         compact_zone(zone, &cc);
1780 
1781         VM_BUG_ON(!list_empty(&cc.freepages));
1782         VM_BUG_ON(!list_empty(&cc.migratepages));
1783     }
1784 }
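/*
 * Note that .order == -1 is the explicit "compact everything" request:
 * compact_finished() recognises it (via is_via_compact_memory()) and keeps
 * compacting until the scanners meet instead of stopping as soon as a
 * single high-order page becomes available.
 */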
1785 
1786 /* Compact all nodes in the system */
1787 static void compact_nodes(void)
1788 {
1789     int nid;
1790 
1791     /* Flush pending updates to the LRU lists */
1792     lru_add_drain_all();
1793 
1794     for_each_online_node(nid)
1795         compact_node(nid);
1796 }
1797 
1798 /* The written value is actually unused, all memory is compacted */
1799 int sysctl_compact_memory;
1800 
1801 /*
1802  * This is the entry point for compacting all nodes via
1803  * /proc/sys/vm/compact_memory
1804  */
1805 int sysctl_compaction_handler(struct ctl_table *table, int write,
1806             void __user *buffer, size_t *length, loff_t *ppos)
1807 {
1808     if (write)
1809         compact_nodes();
1810 
1811     return 0;
1812 }
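/*
 * A minimal userspace sketch of exercising the handler above; it assumes the
 * usual /proc/sys/vm/compact_memory path and sufficient privileges, and is
 * illustrative only (hence the #if 0).
 */
#if 0
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* Any written value triggers compaction of every online node. */
	int fd = open("/proc/sys/vm/compact_memory", O_WRONLY);

	if (fd < 0) {
		perror("open /proc/sys/vm/compact_memory");
		return 1;
	}
	if (write(fd, "1", 1) != 1)
		perror("write");
	close(fd);
	return 0;
}
#endif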
1813 
1814 int sysctl_extfrag_handler(struct ctl_table *table, int write,
1815             void __user *buffer, size_t *length, loff_t *ppos)
1816 {
1817     proc_dointvec_minmax(table, write, buffer, length, ppos);
1818 
1819     return 0;
1820 }
1821 
1822 #if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
1823 static ssize_t sysfs_compact_node(struct device *dev,
1824             struct device_attribute *attr,
1825             const char *buf, size_t count)
1826 {
1827     int nid = dev->id;
1828 
1829     if (nid >= 0 && nid < nr_node_ids && node_online(nid)) {
1830         /* Flush pending updates to the LRU lists */
1831         lru_add_drain_all();
1832 
1833         compact_node(nid);
1834     }
1835 
1836     return count;
1837 }
1838 static DEVICE_ATTR(compact, S_IWUSR, NULL, sysfs_compact_node);
1839 
1840 int compaction_register_node(struct node *node)
1841 {
1842     return device_create_file(&node->dev, &dev_attr_compact);
1843 }
1844 
1845 void compaction_unregister_node(struct node *node)
1846 {
1847     return device_remove_file(&node->dev, &dev_attr_compact);
1848 }
1849 #endif /* CONFIG_SYSFS && CONFIG_NUMA */
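/*
 * The attribute created above shows up as
 * /sys/devices/system/node/node<N>/compact (assuming the standard sysfs
 * layout for node devices); writing anything to it compacts just that
 * node, e.g.:
 *
 *	echo 1 > /sys/devices/system/node/node0/compact
 */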
1850 
1851 static inline bool kcompactd_work_requested(pg_data_t *pgdat)
1852 {
1853     return pgdat->kcompactd_max_order > 0 || kthread_should_stop();
1854 }
1855 
1856 static bool kcompactd_node_suitable(pg_data_t *pgdat)
1857 {
1858     int zoneid;
1859     struct zone *zone;
1860     enum zone_type classzone_idx = pgdat->kcompactd_classzone_idx;
1861 
1862     for (zoneid = 0; zoneid <= classzone_idx; zoneid++) {
1863         zone = &pgdat->node_zones[zoneid];
1864 
1865         if (!populated_zone(zone))
1866             continue;
1867 
1868         if (compaction_suitable(zone, pgdat->kcompactd_max_order, 0,
1869                     classzone_idx) == COMPACT_CONTINUE)
1870             return true;
1871     }
1872 
1873     return false;
1874 }
1875 
1876 static void kcompactd_do_work(pg_data_t *pgdat)
1877 {
1878     /*
1879      * With no special task, compact all zones so that a page of the requested
1880      * order is allocatable.
1881      */
1882     int zoneid;
1883     struct zone *zone;
1884     struct compact_control cc = {
1885         .order = pgdat->kcompactd_max_order,
1886         .classzone_idx = pgdat->kcompactd_classzone_idx,
1887         .mode = MIGRATE_SYNC_LIGHT,
1888         .ignore_skip_hint = true,
1889         .gfp_mask = GFP_KERNEL,
1890 
1891     };
1892     trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order,
1893                             cc.classzone_idx);
1894     count_vm_event(KCOMPACTD_WAKE);
1895 
1896     for (zoneid = 0; zoneid <= cc.classzone_idx; zoneid++) {
1897         int status;
1898 
1899         zone = &pgdat->node_zones[zoneid];
1900         if (!populated_zone(zone))
1901             continue;
1902 
1903         if (compaction_deferred(zone, cc.order))
1904             continue;
1905 
1906         if (compaction_suitable(zone, cc.order, 0, zoneid) !=
1907                             COMPACT_CONTINUE)
1908             continue;
1909 
1910         cc.nr_freepages = 0;
1911         cc.nr_migratepages = 0;
1912         cc.zone = zone;
1913         INIT_LIST_HEAD(&cc.freepages);
1914         INIT_LIST_HEAD(&cc.migratepages);
1915 
1916         if (kthread_should_stop())
1917             return;
1918         status = compact_zone(zone, &cc);
1919 
1920         if (status == COMPACT_SUCCESS) {
1921             compaction_defer_reset(zone, cc.order, false);
1922         } else if (status == COMPACT_PARTIAL_SKIPPED || status == COMPACT_COMPLETE) {
1923             /*
1924              * We use sync migration mode here, so we defer like
1925              * sync direct compaction does.
1926              */
1927             defer_compaction(zone, cc.order);
1928         }
1929 
1930         VM_BUG_ON(!list_empty(&cc.freepages));
1931         VM_BUG_ON(!list_empty(&cc.migratepages));
1932     }
1933 
1934     /*
1935      * Regardless of success, we are done until woken up next. But remember
1936      * the requested order/classzone_idx in case they were higher/tighter than
1937      * our current ones.
1938      */
1939     if (pgdat->kcompactd_max_order <= cc.order)
1940         pgdat->kcompactd_max_order = 0;
1941     if (pgdat->kcompactd_classzone_idx >= cc.classzone_idx)
1942         pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1;
1943 }
1944 
1945 void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx)
1946 {
1947     if (!order)
1948         return;
1949 
1950     if (pgdat->kcompactd_max_order < order)
1951         pgdat->kcompactd_max_order = order;
1952 
1953     if (pgdat->kcompactd_classzone_idx > classzone_idx)
1954         pgdat->kcompactd_classzone_idx = classzone_idx;
1955 
1956     if (!waitqueue_active(&pgdat->kcompactd_wait))
1957         return;
1958 
1959     if (!kcompactd_node_suitable(pgdat))
1960         return;
1961 
1962     trace_mm_compaction_wakeup_kcompactd(pgdat->node_id, order,
1963                             classzone_idx);
1964     wake_up_interruptible(&pgdat->kcompactd_wait);
1965 }
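/*
 * wakeup_kcompactd() is expected to be called with a non-zero order from
 * kswapd's reclaim path (mm/vmscan.c in this kernel generation), typically
 * once the order-0 watermarks look healthy again, so the remaining
 * high-order work is handed off to the per-node kcompactd thread woken here.
 */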
1966 
1967 /*
1968  * The background compaction daemon, started as a kernel thread
1969  * from the init process.
1970  */
1971 static int kcompactd(void *p)
1972 {
1973     pg_data_t *pgdat = (pg_data_t*)p;
1974     struct task_struct *tsk = current;
1975 
1976     const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
1977 
1978     if (!cpumask_empty(cpumask))
1979         set_cpus_allowed_ptr(tsk, cpumask);
1980 
1981     set_freezable();
1982 
1983     pgdat->kcompactd_max_order = 0;
1984     pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1;
1985 
1986     while (!kthread_should_stop()) {
1987         trace_mm_compaction_kcompactd_sleep(pgdat->node_id);
1988         wait_event_freezable(pgdat->kcompactd_wait,
1989                 kcompactd_work_requested(pgdat));
1990 
1991         kcompactd_do_work(pgdat);
1992     }
1993 
1994     return 0;
1995 }
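/*
 * The wait above is freezable so kcompactd does not block system suspend,
 * and kcompactd_work_requested() also returns true on kthread_should_stop(),
 * so the thread wakes up promptly when kcompactd_stop() asks it to exit.
 */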
1996 
1997 /*
1998  * This kcompactd start function will be called by init and node-hot-add.
1999  * On node-hot-add, kcompactd will be moved to the proper cpus if cpus are hot-added.
2000  */
2001 int kcompactd_run(int nid)
2002 {
2003     pg_data_t *pgdat = NODE_DATA(nid);
2004     int ret = 0;
2005 
2006     if (pgdat->kcompactd)
2007         return 0;
2008 
2009     pgdat->kcompactd = kthread_run(kcompactd, pgdat, "kcompactd%d", nid);
2010     if (IS_ERR(pgdat->kcompactd)) {
2011         pr_err("Failed to start kcompactd on node %d\n", nid);
2012         ret = PTR_ERR(pgdat->kcompactd);
2013         pgdat->kcompactd = NULL;
2014     }
2015     return ret;
2016 }
2017 
2018 /*
2019  * Called by memory hotplug when all memory in a node is offlined. Caller must
2020  * hold mem_hotplug_begin/end().
2021  */
2022 void kcompactd_stop(int nid)
2023 {
2024     struct task_struct *kcompactd = NODE_DATA(nid)->kcompactd;
2025 
2026     if (kcompactd) {
2027         kthread_stop(kcompactd);
2028         NODE_DATA(nid)->kcompactd = NULL;
2029     }
2030 }
2031 
2032 /*
2033  * It's optimal to keep kcompactd threads on the same CPUs as their node's
2034  * memory, but that is not required for correctness. So if the last cpu in
2035  * a node goes offline, kcompactd is allowed to run anywhere; when the first
2036  * cpu of that node comes back online, its cpu binding is restored.
2037  */
2038 static int kcompactd_cpu_online(unsigned int cpu)
2039 {
2040     int nid;
2041 
2042     for_each_node_state(nid, N_MEMORY) {
2043         pg_data_t *pgdat = NODE_DATA(nid);
2044         const struct cpumask *mask;
2045 
2046         mask = cpumask_of_node(pgdat->node_id);
2047 
2048         if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
2049             /* One of our CPUs online: restore mask */
2050             set_cpus_allowed_ptr(pgdat->kcompactd, mask);
2051     }
2052     return 0;
2053 }
2054 
2055 static int __init kcompactd_init(void)
2056 {
2057     int nid;
2058     int ret;
2059 
2060     ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
2061                     "mm/compaction:online",
2062                     kcompactd_cpu_online, NULL);
2063     if (ret < 0) {
2064         pr_err("kcompactd: failed to register hotplug callbacks.\n");
2065         return ret;
2066     }
2067 
2068     for_each_node_state(nid, N_MEMORY)
2069         kcompactd_run(nid);
2070     return 0;
2071 }
2072 subsys_initcall(kcompactd_init)
2073 
2074 #endif /* CONFIG_COMPACTION */