0001 // SPDX-License-Identifier: GPL-2.0
0002 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
0003 
0004 #include <linux/mm.h>
0005 #include <linux/sched.h>
0006 #include <linux/sched/mm.h>
0007 #include <linux/sched/coredump.h>
0008 #include <linux/mmu_notifier.h>
0009 #include <linux/rmap.h>
0010 #include <linux/swap.h>
0011 #include <linux/mm_inline.h>
0012 #include <linux/kthread.h>
0013 #include <linux/khugepaged.h>
0014 #include <linux/freezer.h>
0015 #include <linux/mman.h>
0016 #include <linux/hashtable.h>
0017 #include <linux/userfaultfd_k.h>
0018 #include <linux/page_idle.h>
0019 #include <linux/page_table_check.h>
0020 #include <linux/swapops.h>
0021 #include <linux/shmem_fs.h>
0022 
0023 #include <asm/tlb.h>
0024 #include <asm/pgalloc.h>
0025 #include "internal.h"
0026 
0027 enum scan_result {
0028     SCAN_FAIL,
0029     SCAN_SUCCEED,
0030     SCAN_PMD_NULL,
0031     SCAN_EXCEED_NONE_PTE,
0032     SCAN_EXCEED_SWAP_PTE,
0033     SCAN_EXCEED_SHARED_PTE,
0034     SCAN_PTE_NON_PRESENT,
0035     SCAN_PTE_UFFD_WP,
0036     SCAN_PAGE_RO,
0037     SCAN_LACK_REFERENCED_PAGE,
0038     SCAN_PAGE_NULL,
0039     SCAN_SCAN_ABORT,
0040     SCAN_PAGE_COUNT,
0041     SCAN_PAGE_LRU,
0042     SCAN_PAGE_LOCK,
0043     SCAN_PAGE_ANON,
0044     SCAN_PAGE_COMPOUND,
0045     SCAN_ANY_PROCESS,
0046     SCAN_VMA_NULL,
0047     SCAN_VMA_CHECK,
0048     SCAN_ADDRESS_RANGE,
0049     SCAN_DEL_PAGE_LRU,
0050     SCAN_ALLOC_HUGE_PAGE_FAIL,
0051     SCAN_CGROUP_CHARGE_FAIL,
0052     SCAN_TRUNCATED,
0053     SCAN_PAGE_HAS_PRIVATE,
0054 };
0055 
0056 #define CREATE_TRACE_POINTS
0057 #include <trace/events/huge_memory.h>
0058 
0059 static struct task_struct *khugepaged_thread __read_mostly;
0060 static DEFINE_MUTEX(khugepaged_mutex);
0061 
0062 /* default scan 8*512 pte (or vmas) every 30 seconds */
0063 static unsigned int khugepaged_pages_to_scan __read_mostly;
0064 static unsigned int khugepaged_pages_collapsed;
0065 static unsigned int khugepaged_full_scans;
0066 static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000;
0067 /* during fragmentation poll the hugepage allocator once every minute */
0068 static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000;
0069 static unsigned long khugepaged_sleep_expire;
0070 static DEFINE_SPINLOCK(khugepaged_mm_lock);
0071 static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
0072 /*
0073  * By default, collapse hugepages if there is at least one pte mapped,
0074  * as would have happened if the vma had been large enough at page-fault
0075  * time.
0076  */
0077 static unsigned int khugepaged_max_ptes_none __read_mostly;
0078 static unsigned int khugepaged_max_ptes_swap __read_mostly;
0079 static unsigned int khugepaged_max_ptes_shared __read_mostly;
0080 
0081 #define MM_SLOTS_HASH_BITS 10
0082 static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
0083 
0084 static struct kmem_cache *mm_slot_cache __read_mostly;
0085 
0086 #define MAX_PTE_MAPPED_THP 8
0087 
0088 /**
0089  * struct mm_slot - hash lookup from mm to mm_slot
0090  * @hash: hash collision list
0091  * @mm_node: khugepaged scan list headed in khugepaged_scan.mm_head
0092  * @mm: the mm that this information is valid for
0093  * @nr_pte_mapped_thp: number of pte mapped THP
0094  * @pte_mapped_thp: address array corresponding pte mapped THP
0095  */
0096 struct mm_slot {
0097     struct hlist_node hash;
0098     struct list_head mm_node;
0099     struct mm_struct *mm;
0100 
0101     /* pte-mapped THP in this mm */
0102     int nr_pte_mapped_thp;
0103     unsigned long pte_mapped_thp[MAX_PTE_MAPPED_THP];
0104 };
0105 
0106 /**
0107  * struct khugepaged_scan - cursor for scanning
0108  * @mm_head: the head of the mm list to scan
0109  * @mm_slot: the current mm_slot we are scanning
0110  * @address: the next address inside that to be scanned
0111  *
0112  * There is only the one khugepaged_scan instance of this cursor structure.
0113  */
0114 struct khugepaged_scan {
0115     struct list_head mm_head;
0116     struct mm_slot *mm_slot;
0117     unsigned long address;
0118 };
0119 
0120 static struct khugepaged_scan khugepaged_scan = {
0121     .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
0122 };
0123 
0124 #ifdef CONFIG_SYSFS
0125 static ssize_t scan_sleep_millisecs_show(struct kobject *kobj,
0126                      struct kobj_attribute *attr,
0127                      char *buf)
0128 {
0129     return sysfs_emit(buf, "%u\n", khugepaged_scan_sleep_millisecs);
0130 }
0131 
0132 static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
0133                       struct kobj_attribute *attr,
0134                       const char *buf, size_t count)
0135 {
0136     unsigned int msecs;
0137     int err;
0138 
0139     err = kstrtouint(buf, 10, &msecs);
0140     if (err)
0141         return -EINVAL;
0142 
0143     khugepaged_scan_sleep_millisecs = msecs;
0144     khugepaged_sleep_expire = 0;
0145     wake_up_interruptible(&khugepaged_wait);
0146 
0147     return count;
0148 }
0149 static struct kobj_attribute scan_sleep_millisecs_attr =
0150     __ATTR_RW(scan_sleep_millisecs);
0151 
0152 static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj,
0153                       struct kobj_attribute *attr,
0154                       char *buf)
0155 {
0156     return sysfs_emit(buf, "%u\n", khugepaged_alloc_sleep_millisecs);
0157 }
0158 
0159 static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj,
0160                        struct kobj_attribute *attr,
0161                        const char *buf, size_t count)
0162 {
0163     unsigned int msecs;
0164     int err;
0165 
0166     err = kstrtouint(buf, 10, &msecs);
0167     if (err)
0168         return -EINVAL;
0169 
0170     khugepaged_alloc_sleep_millisecs = msecs;
0171     khugepaged_sleep_expire = 0;
0172     wake_up_interruptible(&khugepaged_wait);
0173 
0174     return count;
0175 }
0176 static struct kobj_attribute alloc_sleep_millisecs_attr =
0177     __ATTR_RW(alloc_sleep_millisecs);
0178 
0179 static ssize_t pages_to_scan_show(struct kobject *kobj,
0180                   struct kobj_attribute *attr,
0181                   char *buf)
0182 {
0183     return sysfs_emit(buf, "%u\n", khugepaged_pages_to_scan);
0184 }
0185 static ssize_t pages_to_scan_store(struct kobject *kobj,
0186                    struct kobj_attribute *attr,
0187                    const char *buf, size_t count)
0188 {
0189     unsigned int pages;
0190     int err;
0191 
0192     err = kstrtouint(buf, 10, &pages);
0193     if (err || !pages)
0194         return -EINVAL;
0195 
0196     khugepaged_pages_to_scan = pages;
0197 
0198     return count;
0199 }
0200 static struct kobj_attribute pages_to_scan_attr =
0201     __ATTR_RW(pages_to_scan);
0202 
0203 static ssize_t pages_collapsed_show(struct kobject *kobj,
0204                     struct kobj_attribute *attr,
0205                     char *buf)
0206 {
0207     return sysfs_emit(buf, "%u\n", khugepaged_pages_collapsed);
0208 }
0209 static struct kobj_attribute pages_collapsed_attr =
0210     __ATTR_RO(pages_collapsed);
0211 
0212 static ssize_t full_scans_show(struct kobject *kobj,
0213                    struct kobj_attribute *attr,
0214                    char *buf)
0215 {
0216     return sysfs_emit(buf, "%u\n", khugepaged_full_scans);
0217 }
0218 static struct kobj_attribute full_scans_attr =
0219     __ATTR_RO(full_scans);
0220 
0221 static ssize_t defrag_show(struct kobject *kobj,
0222                struct kobj_attribute *attr, char *buf)
0223 {
0224     return single_hugepage_flag_show(kobj, attr, buf,
0225                      TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
0226 }
0227 static ssize_t defrag_store(struct kobject *kobj,
0228                 struct kobj_attribute *attr,
0229                 const char *buf, size_t count)
0230 {
0231     return single_hugepage_flag_store(kobj, attr, buf, count,
0232                  TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
0233 }
0234 static struct kobj_attribute khugepaged_defrag_attr =
0235     __ATTR_RW(defrag);
0236 
0237 /*
0238  * max_ptes_none controls if khugepaged should collapse hugepages over
0239  * any unmapped ptes in turn potentially increasing the memory
0240  * footprint of the vmas. When max_ptes_none is 0 khugepaged will not
0241  * reduce the available free memory in the system as it
0242  * runs. Increasing max_ptes_none will instead potentially reduce the
0243  * free memory in the system during the khugepaged scan.
0244  */
0245 static ssize_t max_ptes_none_show(struct kobject *kobj,
0246                   struct kobj_attribute *attr,
0247                   char *buf)
0248 {
0249     return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_none);
0250 }
0251 static ssize_t max_ptes_none_store(struct kobject *kobj,
0252                    struct kobj_attribute *attr,
0253                    const char *buf, size_t count)
0254 {
0255     int err;
0256     unsigned long max_ptes_none;
0257 
0258     err = kstrtoul(buf, 10, &max_ptes_none);
0259     if (err || max_ptes_none > HPAGE_PMD_NR - 1)
0260         return -EINVAL;
0261 
0262     khugepaged_max_ptes_none = max_ptes_none;
0263 
0264     return count;
0265 }
0266 static struct kobj_attribute khugepaged_max_ptes_none_attr =
0267     __ATTR_RW(max_ptes_none);
0268 
0269 static ssize_t max_ptes_swap_show(struct kobject *kobj,
0270                   struct kobj_attribute *attr,
0271                   char *buf)
0272 {
0273     return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_swap);
0274 }
0275 
0276 static ssize_t max_ptes_swap_store(struct kobject *kobj,
0277                    struct kobj_attribute *attr,
0278                    const char *buf, size_t count)
0279 {
0280     int err;
0281     unsigned long max_ptes_swap;
0282 
0283     err  = kstrtoul(buf, 10, &max_ptes_swap);
0284     if (err || max_ptes_swap > HPAGE_PMD_NR - 1)
0285         return -EINVAL;
0286 
0287     khugepaged_max_ptes_swap = max_ptes_swap;
0288 
0289     return count;
0290 }
0291 
0292 static struct kobj_attribute khugepaged_max_ptes_swap_attr =
0293     __ATTR_RW(max_ptes_swap);
0294 
0295 static ssize_t max_ptes_shared_show(struct kobject *kobj,
0296                     struct kobj_attribute *attr,
0297                     char *buf)
0298 {
0299     return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_shared);
0300 }
0301 
0302 static ssize_t max_ptes_shared_store(struct kobject *kobj,
0303                      struct kobj_attribute *attr,
0304                      const char *buf, size_t count)
0305 {
0306     int err;
0307     unsigned long max_ptes_shared;
0308 
0309     err  = kstrtoul(buf, 10, &max_ptes_shared);
0310     if (err || max_ptes_shared > HPAGE_PMD_NR - 1)
0311         return -EINVAL;
0312 
0313     khugepaged_max_ptes_shared = max_ptes_shared;
0314 
0315     return count;
0316 }
0317 
0318 static struct kobj_attribute khugepaged_max_ptes_shared_attr =
0319     __ATTR_RW(max_ptes_shared);
0320 
0321 static struct attribute *khugepaged_attr[] = {
0322     &khugepaged_defrag_attr.attr,
0323     &khugepaged_max_ptes_none_attr.attr,
0324     &khugepaged_max_ptes_swap_attr.attr,
0325     &khugepaged_max_ptes_shared_attr.attr,
0326     &pages_to_scan_attr.attr,
0327     &pages_collapsed_attr.attr,
0328     &full_scans_attr.attr,
0329     &scan_sleep_millisecs_attr.attr,
0330     &alloc_sleep_millisecs_attr.attr,
0331     NULL,
0332 };
0333 
0334 struct attribute_group khugepaged_attr_group = {
0335     .attrs = khugepaged_attr,
0336     .name = "khugepaged",
0337 };
0338 #endif /* CONFIG_SYSFS */
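/*
 * Example: with CONFIG_SYSFS the attribute group above is exposed under
 * /sys/kernel/mm/transparent_hugepage/khugepaged/ (assuming the usual sysfs
 * mount at /sys).  A minimal user-space sketch that lowers max_ptes_none so
 * khugepaged only collapses fully populated PMD ranges:
 *
 *     #include <stdio.h>
 *
 *     int main(void)
 *     {
 *         FILE *f = fopen("/sys/kernel/mm/transparent_hugepage/"
 *                         "khugepaged/max_ptes_none", "w");
 *
 *         if (!f)
 *             return 1;
 *         fprintf(f, "0\n");    // 0 = no unmapped/zero ptes tolerated
 *         return fclose(f) != 0;
 *     }
 */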
0339 
0340 int hugepage_madvise(struct vm_area_struct *vma,
0341              unsigned long *vm_flags, int advice)
0342 {
0343     switch (advice) {
0344     case MADV_HUGEPAGE:
0345 #ifdef CONFIG_S390
0346         /*
0347          * qemu blindly sets MADV_HUGEPAGE on all allocations, but s390
0348          * can't handle this properly after s390_enable_sie, so we simply
0349          * ignore the madvise to prevent qemu from causing a SIGSEGV.
0350          */
0351         if (mm_has_pgste(vma->vm_mm))
0352             return 0;
0353 #endif
0354         *vm_flags &= ~VM_NOHUGEPAGE;
0355         *vm_flags |= VM_HUGEPAGE;
0356         /*
0357          * If the vma becomes suitable for khugepaged to scan,
0358          * register it here without waiting for a page fault that
0359          * may not happen any time soon.
0360          */
0361         khugepaged_enter_vma(vma, *vm_flags);
0362         break;
0363     case MADV_NOHUGEPAGE:
0364         *vm_flags &= ~VM_HUGEPAGE;
0365         *vm_flags |= VM_NOHUGEPAGE;
0366         /*
0367          * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning
0368          * this vma even if we leave the mm registered in khugepaged if
0369          * it got registered before VM_NOHUGEPAGE was set.
0370          */
0371         break;
0372     }
0373 
0374     return 0;
0375 }
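/*
 * Example: hugepage_madvise() above is reached via the madvise(2) syscall.
 * A minimal user-space sketch (assuming a 2 MiB PMD size) that opts an
 * anonymous mapping in to khugepaged:
 *
 *     #include <sys/mman.h>
 *     #include <stddef.h>
 *
 *     int main(void)
 *     {
 *         size_t len = 2UL << 20;    // one PMD-sized region
 *         void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *                        MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *
 *         if (p == MAP_FAILED)
 *             return 1;
 *         // sets VM_HUGEPAGE and registers the mm with khugepaged
 *         return madvise(p, len, MADV_HUGEPAGE) != 0;
 *     }
 */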
0376 
0377 int __init khugepaged_init(void)
0378 {
0379     mm_slot_cache = kmem_cache_create("khugepaged_mm_slot",
0380                       sizeof(struct mm_slot),
0381                       __alignof__(struct mm_slot), 0, NULL);
0382     if (!mm_slot_cache)
0383         return -ENOMEM;
0384 
0385     khugepaged_pages_to_scan = HPAGE_PMD_NR * 8;
0386     khugepaged_max_ptes_none = HPAGE_PMD_NR - 1;
0387     khugepaged_max_ptes_swap = HPAGE_PMD_NR / 8;
0388     khugepaged_max_ptes_shared = HPAGE_PMD_NR / 2;
0389 
0390     return 0;
0391 }
0392 
0393 void __init khugepaged_destroy(void)
0394 {
0395     kmem_cache_destroy(mm_slot_cache);
0396 }
0397 
0398 static inline struct mm_slot *alloc_mm_slot(void)
0399 {
0400     if (!mm_slot_cache) /* initialization failed */
0401         return NULL;
0402     return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
0403 }
0404 
0405 static inline void free_mm_slot(struct mm_slot *mm_slot)
0406 {
0407     kmem_cache_free(mm_slot_cache, mm_slot);
0408 }
0409 
0410 static struct mm_slot *get_mm_slot(struct mm_struct *mm)
0411 {
0412     struct mm_slot *mm_slot;
0413 
0414     hash_for_each_possible(mm_slots_hash, mm_slot, hash, (unsigned long)mm)
0415         if (mm == mm_slot->mm)
0416             return mm_slot;
0417 
0418     return NULL;
0419 }
0420 
0421 static void insert_to_mm_slots_hash(struct mm_struct *mm,
0422                     struct mm_slot *mm_slot)
0423 {
0424     mm_slot->mm = mm;
0425     hash_add(mm_slots_hash, &mm_slot->hash, (long)mm);
0426 }
0427 
0428 static inline int khugepaged_test_exit(struct mm_struct *mm)
0429 {
0430     return atomic_read(&mm->mm_users) == 0;
0431 }
0432 
0433 void __khugepaged_enter(struct mm_struct *mm)
0434 {
0435     struct mm_slot *mm_slot;
0436     int wakeup;
0437 
0438     mm_slot = alloc_mm_slot();
0439     if (!mm_slot)
0440         return;
0441 
0442     /* __khugepaged_exit() must not run from under us */
0443     VM_BUG_ON_MM(khugepaged_test_exit(mm), mm);
0444     if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) {
0445         free_mm_slot(mm_slot);
0446         return;
0447     }
0448 
0449     spin_lock(&khugepaged_mm_lock);
0450     insert_to_mm_slots_hash(mm, mm_slot);
0451     /*
0452      * Insert just behind the scanning cursor, to let the area settle
0453      * down a little.
0454      */
0455     wakeup = list_empty(&khugepaged_scan.mm_head);
0456     list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head);
0457     spin_unlock(&khugepaged_mm_lock);
0458 
0459     mmgrab(mm);
0460     if (wakeup)
0461         wake_up_interruptible(&khugepaged_wait);
0462 }
0463 
0464 void khugepaged_enter_vma(struct vm_area_struct *vma,
0465               unsigned long vm_flags)
0466 {
0467     if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) &&
0468         hugepage_flags_enabled()) {
0469         if (hugepage_vma_check(vma, vm_flags, false, false))
0470             __khugepaged_enter(vma->vm_mm);
0471     }
0472 }
0473 
0474 void __khugepaged_exit(struct mm_struct *mm)
0475 {
0476     struct mm_slot *mm_slot;
0477     int free = 0;
0478 
0479     spin_lock(&khugepaged_mm_lock);
0480     mm_slot = get_mm_slot(mm);
0481     if (mm_slot && khugepaged_scan.mm_slot != mm_slot) {
0482         hash_del(&mm_slot->hash);
0483         list_del(&mm_slot->mm_node);
0484         free = 1;
0485     }
0486     spin_unlock(&khugepaged_mm_lock);
0487 
0488     if (free) {
0489         clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
0490         free_mm_slot(mm_slot);
0491         mmdrop(mm);
0492     } else if (mm_slot) {
0493         /*
0494          * This is required to serialize against
0495          * khugepaged_test_exit() (which is guaranteed to run
0496          * under mmap sem read mode). Stop here (after we
0497          * return all pagetables will be destroyed) until
0498          * khugepaged has finished working on the pagetables
0499          * under the mmap_lock.
0500          */
0501         mmap_write_lock(mm);
0502         mmap_write_unlock(mm);
0503     }
0504 }
0505 
0506 static void release_pte_page(struct page *page)
0507 {
0508     mod_node_page_state(page_pgdat(page),
0509             NR_ISOLATED_ANON + page_is_file_lru(page),
0510             -compound_nr(page));
0511     unlock_page(page);
0512     putback_lru_page(page);
0513 }
0514 
0515 static void release_pte_pages(pte_t *pte, pte_t *_pte,
0516         struct list_head *compound_pagelist)
0517 {
0518     struct page *page, *tmp;
0519 
0520     while (--_pte >= pte) {
0521         pte_t pteval = *_pte;
0522 
0523         page = pte_page(pteval);
0524         if (!pte_none(pteval) && !is_zero_pfn(pte_pfn(pteval)) &&
0525                 !PageCompound(page))
0526             release_pte_page(page);
0527     }
0528 
0529     list_for_each_entry_safe(page, tmp, compound_pagelist, lru) {
0530         list_del(&page->lru);
0531         release_pte_page(page);
0532     }
0533 }
0534 
0535 static bool is_refcount_suitable(struct page *page)
0536 {
0537     int expected_refcount;
0538 
0539     expected_refcount = total_mapcount(page);
0540     if (PageSwapCache(page))
0541         expected_refcount += compound_nr(page);
0542 
0543     return page_count(page) == expected_refcount;
0544 }
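/*
 * For example, a base (non-compound) page mapped by two processes has
 * total_mapcount() == 2; if it also sits in the swap cache, that adds
 * compound_nr() == 1 expected reference, so the page only passes this
 * check when page_count() == 3.  Any higher count implies a GUP or other
 * external pin.
 */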
0545 
0546 static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
0547                     unsigned long address,
0548                     pte_t *pte,
0549                     struct list_head *compound_pagelist)
0550 {
0551     struct page *page = NULL;
0552     pte_t *_pte;
0553     int none_or_zero = 0, shared = 0, result = 0, referenced = 0;
0554     bool writable = false;
0555 
0556     for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
0557          _pte++, address += PAGE_SIZE) {
0558         pte_t pteval = *_pte;
0559         if (pte_none(pteval) || (pte_present(pteval) &&
0560                 is_zero_pfn(pte_pfn(pteval)))) {
0561             if (!userfaultfd_armed(vma) &&
0562                 ++none_or_zero <= khugepaged_max_ptes_none) {
0563                 continue;
0564             } else {
0565                 result = SCAN_EXCEED_NONE_PTE;
0566                 count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
0567                 goto out;
0568             }
0569         }
0570         if (!pte_present(pteval)) {
0571             result = SCAN_PTE_NON_PRESENT;
0572             goto out;
0573         }
0574         page = vm_normal_page(vma, address, pteval);
0575         if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
0576             result = SCAN_PAGE_NULL;
0577             goto out;
0578         }
0579 
0580         VM_BUG_ON_PAGE(!PageAnon(page), page);
0581 
0582         if (page_mapcount(page) > 1 &&
0583                 ++shared > khugepaged_max_ptes_shared) {
0584             result = SCAN_EXCEED_SHARED_PTE;
0585             count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
0586             goto out;
0587         }
0588 
0589         if (PageCompound(page)) {
0590             struct page *p;
0591             page = compound_head(page);
0592 
0593             /*
0594              * Check if we have dealt with the compound page
0595              * already
0596              */
0597             list_for_each_entry(p, compound_pagelist, lru) {
0598                 if (page == p)
0599                     goto next;
0600             }
0601         }
0602 
0603         /*
0604          * We can do it before isolate_lru_page because the
0605          * page can't be freed from under us. NOTE: PG_lock
0606          * is needed to serialize against split_huge_page
0607          * when invoked from the VM.
0608          */
0609         if (!trylock_page(page)) {
0610             result = SCAN_PAGE_LOCK;
0611             goto out;
0612         }
0613 
0614         /*
0615          * Check if the page has any GUP (or other external) pins.
0616          *
0617          * The page table that maps the page has been already unlinked
0618          * from the page table tree and this process cannot get
0619          * an additional pin on the page.
0620          *
0621          * New pins can come later if the page is shared across fork,
0622          * but not from this process. The other process cannot write to
0623          * the page, only trigger CoW.
0624          */
0625         if (!is_refcount_suitable(page)) {
0626             unlock_page(page);
0627             result = SCAN_PAGE_COUNT;
0628             goto out;
0629         }
0630 
0631         /*
0632          * Isolate the page to avoid collapsing a hugepage
0633          * currently in use by the VM.
0634          */
0635         if (isolate_lru_page(page)) {
0636             unlock_page(page);
0637             result = SCAN_DEL_PAGE_LRU;
0638             goto out;
0639         }
0640         mod_node_page_state(page_pgdat(page),
0641                 NR_ISOLATED_ANON + page_is_file_lru(page),
0642                 compound_nr(page));
0643         VM_BUG_ON_PAGE(!PageLocked(page), page);
0644         VM_BUG_ON_PAGE(PageLRU(page), page);
0645 
0646         if (PageCompound(page))
0647             list_add_tail(&page->lru, compound_pagelist);
0648 next:
0649         /* There should be enough young ptes to collapse the page */
0650         if (pte_young(pteval) ||
0651             page_is_young(page) || PageReferenced(page) ||
0652             mmu_notifier_test_young(vma->vm_mm, address))
0653             referenced++;
0654 
0655         if (pte_write(pteval))
0656             writable = true;
0657     }
0658 
0659     if (unlikely(!writable)) {
0660         result = SCAN_PAGE_RO;
0661     } else if (unlikely(!referenced)) {
0662         result = SCAN_LACK_REFERENCED_PAGE;
0663     } else {
0664         result = SCAN_SUCCEED;
0665         trace_mm_collapse_huge_page_isolate(page, none_or_zero,
0666                             referenced, writable, result);
0667         return 1;
0668     }
0669 out:
0670     release_pte_pages(pte, _pte, compound_pagelist);
0671     trace_mm_collapse_huge_page_isolate(page, none_or_zero,
0672                         referenced, writable, result);
0673     return 0;
0674 }
0675 
0676 static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
0677                       struct vm_area_struct *vma,
0678                       unsigned long address,
0679                       spinlock_t *ptl,
0680                       struct list_head *compound_pagelist)
0681 {
0682     struct page *src_page, *tmp;
0683     pte_t *_pte;
0684     for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
0685                 _pte++, page++, address += PAGE_SIZE) {
0686         pte_t pteval = *_pte;
0687 
0688         if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
0689             clear_user_highpage(page, address);
0690             add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
0691             if (is_zero_pfn(pte_pfn(pteval))) {
0692                 /*
0693                  * ptl mostly unnecessary.
0694                  */
0695                 spin_lock(ptl);
0696                 ptep_clear(vma->vm_mm, address, _pte);
0697                 spin_unlock(ptl);
0698             }
0699         } else {
0700             src_page = pte_page(pteval);
0701             copy_user_highpage(page, src_page, address, vma);
0702             if (!PageCompound(src_page))
0703                 release_pte_page(src_page);
0704             /*
0705              * ptl mostly unnecessary, but preempt has to
0706              * be disabled to update the per-cpu stats
0707              * inside page_remove_rmap().
0708              */
0709             spin_lock(ptl);
0710             ptep_clear(vma->vm_mm, address, _pte);
0711             page_remove_rmap(src_page, vma, false);
0712             spin_unlock(ptl);
0713             free_page_and_swap_cache(src_page);
0714         }
0715     }
0716 
0717     list_for_each_entry_safe(src_page, tmp, compound_pagelist, lru) {
0718         list_del(&src_page->lru);
0719         mod_node_page_state(page_pgdat(src_page),
0720                     NR_ISOLATED_ANON + page_is_file_lru(src_page),
0721                     -compound_nr(src_page));
0722         unlock_page(src_page);
0723         free_swap_cache(src_page);
0724         putback_lru_page(src_page);
0725     }
0726 }
0727 
0728 static void khugepaged_alloc_sleep(void)
0729 {
0730     DEFINE_WAIT(wait);
0731 
0732     add_wait_queue(&khugepaged_wait, &wait);
0733     freezable_schedule_timeout_interruptible(
0734         msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
0735     remove_wait_queue(&khugepaged_wait, &wait);
0736 }
0737 
0738 static int khugepaged_node_load[MAX_NUMNODES];
0739 
0740 static bool khugepaged_scan_abort(int nid)
0741 {
0742     int i;
0743 
0744     /*
0745      * If node_reclaim_mode is disabled, then no extra effort is made to
0746      * allocate memory locally.
0747      */
0748     if (!node_reclaim_enabled())
0749         return false;
0750 
0751     /* If there is a count for this node already, it must be acceptable */
0752     if (khugepaged_node_load[nid])
0753         return false;
0754 
0755     for (i = 0; i < MAX_NUMNODES; i++) {
0756         if (!khugepaged_node_load[i])
0757             continue;
0758         if (node_distance(nid, i) > node_reclaim_distance)
0759             return true;
0760     }
0761     return false;
0762 }
0763 
0764 #define khugepaged_defrag()                 \
0765     (transparent_hugepage_flags &               \
0766      (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG))
0767 
0768 /* Defrag for khugepaged will enter direct reclaim/compaction if necessary */
0769 static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void)
0770 {
0771     return khugepaged_defrag() ? GFP_TRANSHUGE : GFP_TRANSHUGE_LIGHT;
0772 }
0773 
0774 #ifdef CONFIG_NUMA
0775 static int khugepaged_find_target_node(void)
0776 {
0777     static int last_khugepaged_target_node = NUMA_NO_NODE;
0778     int nid, target_node = 0, max_value = 0;
0779 
0780     /* find first node with max normal pages hit */
0781     for (nid = 0; nid < MAX_NUMNODES; nid++)
0782         if (khugepaged_node_load[nid] > max_value) {
0783             max_value = khugepaged_node_load[nid];
0784             target_node = nid;
0785         }
0786 
0787     /* do some balancing if several nodes have the same hit record */
0788     if (target_node <= last_khugepaged_target_node)
0789         for (nid = last_khugepaged_target_node + 1; nid < MAX_NUMNODES;
0790                 nid++)
0791             if (max_value == khugepaged_node_load[nid]) {
0792                 target_node = nid;
0793                 break;
0794             }
0795 
0796     last_khugepaged_target_node = target_node;
0797     return target_node;
0798 }
0799 
0800 static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
0801 {
0802     if (IS_ERR(*hpage)) {
0803         if (!*wait)
0804             return false;
0805 
0806         *wait = false;
0807         *hpage = NULL;
0808         khugepaged_alloc_sleep();
0809     } else if (*hpage) {
0810         put_page(*hpage);
0811         *hpage = NULL;
0812     }
0813 
0814     return true;
0815 }
0816 
0817 static struct page *
0818 khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
0819 {
0820     VM_BUG_ON_PAGE(*hpage, *hpage);
0821 
0822     *hpage = __alloc_pages_node(node, gfp, HPAGE_PMD_ORDER);
0823     if (unlikely(!*hpage)) {
0824         count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
0825         *hpage = ERR_PTR(-ENOMEM);
0826         return NULL;
0827     }
0828 
0829     prep_transhuge_page(*hpage);
0830     count_vm_event(THP_COLLAPSE_ALLOC);
0831     return *hpage;
0832 }
0833 #else
0834 static int khugepaged_find_target_node(void)
0835 {
0836     return 0;
0837 }
0838 
0839 static inline struct page *alloc_khugepaged_hugepage(void)
0840 {
0841     struct page *page;
0842 
0843     page = alloc_pages(alloc_hugepage_khugepaged_gfpmask(),
0844                HPAGE_PMD_ORDER);
0845     if (page)
0846         prep_transhuge_page(page);
0847     return page;
0848 }
0849 
0850 static struct page *khugepaged_alloc_hugepage(bool *wait)
0851 {
0852     struct page *hpage;
0853 
0854     do {
0855         hpage = alloc_khugepaged_hugepage();
0856         if (!hpage) {
0857             count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
0858             if (!*wait)
0859                 return NULL;
0860 
0861             *wait = false;
0862             khugepaged_alloc_sleep();
0863         } else
0864             count_vm_event(THP_COLLAPSE_ALLOC);
0865     } while (unlikely(!hpage) && likely(hugepage_flags_enabled()));
0866 
0867     return hpage;
0868 }
0869 
0870 static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
0871 {
0872     /*
0873      * If the hpage allocated earlier was briefly exposed in page cache
0874      * before collapse_file() failed, it is possible that racing lookups
0875      * have not yet completed, and would then be unpleasantly surprised by
0876      * finding the hpage reused for the same mapping at a different offset.
0877      * Just release the previous allocation if there is any danger of that.
0878      */
0879     if (*hpage && page_count(*hpage) > 1) {
0880         put_page(*hpage);
0881         *hpage = NULL;
0882     }
0883 
0884     if (!*hpage)
0885         *hpage = khugepaged_alloc_hugepage(wait);
0886 
0887     if (unlikely(!*hpage))
0888         return false;
0889 
0890     return true;
0891 }
0892 
0893 static struct page *
0894 khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
0895 {
0896     VM_BUG_ON(!*hpage);
0897 
0898     return  *hpage;
0899 }
0900 #endif
0901 
0902 /*
0903  * If the mmap_lock was temporarily dropped, revalidate the vma
0904  * after taking the mmap_lock again.
0905  * Return 0 on success, otherwise return a non-zero
0906  * value (scan code).
0907  */
0908 
0909 static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
0910         struct vm_area_struct **vmap)
0911 {
0912     struct vm_area_struct *vma;
0913 
0914     if (unlikely(khugepaged_test_exit(mm)))
0915         return SCAN_ANY_PROCESS;
0916 
0917     *vmap = vma = find_vma(mm, address);
0918     if (!vma)
0919         return SCAN_VMA_NULL;
0920 
0921     if (!transhuge_vma_suitable(vma, address))
0922         return SCAN_ADDRESS_RANGE;
0923     if (!hugepage_vma_check(vma, vma->vm_flags, false, false))
0924         return SCAN_VMA_CHECK;
0925     /*
0926      * Anon VMA expected, the address may be unmapped then
0927      * remapped to a file after khugepaged re-acquired the mmap_lock.
0928      *
0929      * hugepage_vma_check may return true for qualified file
0930      * vmas.
0931      */
0932     if (!vma->anon_vma || !vma_is_anonymous(vma))
0933         return SCAN_VMA_CHECK;
0934     return 0;
0935 }
0936 
0937 /*
0938  * Bring missing pages in from swap, to complete THP collapse.
0939  * Only done if khugepaged_scan_pmd believes it is worthwhile.
0940  *
0941  * Called and returns without pte mapped or spinlocks held.
0942  * Note that if false is returned, mmap_lock will be released.
0943  */
0944 
0945 static bool __collapse_huge_page_swapin(struct mm_struct *mm,
0946                     struct vm_area_struct *vma,
0947                     unsigned long haddr, pmd_t *pmd,
0948                     int referenced)
0949 {
0950     int swapped_in = 0;
0951     vm_fault_t ret = 0;
0952     unsigned long address, end = haddr + (HPAGE_PMD_NR * PAGE_SIZE);
0953 
0954     for (address = haddr; address < end; address += PAGE_SIZE) {
0955         struct vm_fault vmf = {
0956             .vma = vma,
0957             .address = address,
0958             .pgoff = linear_page_index(vma, haddr),
0959             .flags = FAULT_FLAG_ALLOW_RETRY,
0960             .pmd = pmd,
0961         };
0962 
0963         vmf.pte = pte_offset_map(pmd, address);
0964         vmf.orig_pte = *vmf.pte;
0965         if (!is_swap_pte(vmf.orig_pte)) {
0966             pte_unmap(vmf.pte);
0967             continue;
0968         }
0969         ret = do_swap_page(&vmf);
0970 
0971         /*
0972          * do_swap_page returns VM_FAULT_RETRY with released mmap_lock.
0973          * Note we treat VM_FAULT_RETRY as VM_FAULT_ERROR here because
0974          * we do not retry here and the swap entry will remain in the
0975          * pagetable, resulting in later failure.
0976          */
0977         if (ret & VM_FAULT_RETRY) {
0978             trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
0979             return false;
0980         }
0981         if (ret & VM_FAULT_ERROR) {
0982             mmap_read_unlock(mm);
0983             trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
0984             return false;
0985         }
0986         swapped_in++;
0987     }
0988 
0989     /* Drain LRU add pagevec to remove extra pin on the swapped in pages */
0990     if (swapped_in)
0991         lru_add_drain();
0992 
0993     trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 1);
0994     return true;
0995 }
0996 
0997 static void collapse_huge_page(struct mm_struct *mm,
0998                    unsigned long address,
0999                    struct page **hpage,
1000                    int node, int referenced, int unmapped)
1001 {
1002     LIST_HEAD(compound_pagelist);
1003     pmd_t *pmd, _pmd;
1004     pte_t *pte;
1005     pgtable_t pgtable;
1006     struct page *new_page;
1007     spinlock_t *pmd_ptl, *pte_ptl;
1008     int isolated = 0, result = 0;
1009     struct vm_area_struct *vma;
1010     struct mmu_notifier_range range;
1011     gfp_t gfp;
1012 
1013     VM_BUG_ON(address & ~HPAGE_PMD_MASK);
1014 
1015     /* Only allocate from the target node */
1016     gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE;
1017 
1018     /*
1019      * Before allocating the hugepage, release the mmap_lock read lock.
1020      * The allocation can take potentially a long time if it involves
1021      * sync compaction, and we do not need to hold the mmap_lock during
1022      * that. We will recheck the vma after taking it again in write mode.
1023      */
1024     mmap_read_unlock(mm);
1025     new_page = khugepaged_alloc_page(hpage, gfp, node);
1026     if (!new_page) {
1027         result = SCAN_ALLOC_HUGE_PAGE_FAIL;
1028         goto out_nolock;
1029     }
1030 
1031     if (unlikely(mem_cgroup_charge(page_folio(new_page), mm, gfp))) {
1032         result = SCAN_CGROUP_CHARGE_FAIL;
1033         goto out_nolock;
1034     }
1035     count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC);
1036 
1037     mmap_read_lock(mm);
1038     result = hugepage_vma_revalidate(mm, address, &vma);
1039     if (result) {
1040         mmap_read_unlock(mm);
1041         goto out_nolock;
1042     }
1043 
1044     pmd = mm_find_pmd(mm, address);
1045     if (!pmd) {
1046         result = SCAN_PMD_NULL;
1047         mmap_read_unlock(mm);
1048         goto out_nolock;
1049     }
1050 
1051     /*
1052      * __collapse_huge_page_swapin will return with mmap_lock released
1053      * when it fails. So we jump to out_nolock directly in that case.
1054      * Continuing to collapse causes inconsistency.
1055      */
1056     if (unmapped && !__collapse_huge_page_swapin(mm, vma, address,
1057                              pmd, referenced)) {
1058         goto out_nolock;
1059     }
1060 
1061     mmap_read_unlock(mm);
1062     /*
1063      * Prevent all access to pagetables with the exception of
1064      * gup_fast later handled by the ptep_clear_flush and the VM
1065      * handled by the anon_vma lock + PG_lock.
1066      */
1067     mmap_write_lock(mm);
1068     result = hugepage_vma_revalidate(mm, address, &vma);
1069     if (result)
1070         goto out_up_write;
1071     /* check if the pmd is still valid */
1072     if (mm_find_pmd(mm, address) != pmd)
1073         goto out_up_write;
1074 
1075     anon_vma_lock_write(vma->anon_vma);
1076 
1077     mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm,
1078                 address, address + HPAGE_PMD_SIZE);
1079     mmu_notifier_invalidate_range_start(&range);
1080 
1081     pte = pte_offset_map(pmd, address);
1082     pte_ptl = pte_lockptr(mm, pmd);
1083 
1084     pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
1085     /*
1086      * This removes any huge TLB entry from the CPU so we won't allow
1087      * huge and small TLB entries for the same virtual address to
1088      * avoid the risk of CPU bugs in that area.
1089      *
1090      * Parallel fast GUP is fine since fast GUP will back off when
1091      * it detects PMD is changed.
1092      */
1093     _pmd = pmdp_collapse_flush(vma, address, pmd);
1094     spin_unlock(pmd_ptl);
1095     mmu_notifier_invalidate_range_end(&range);
1096 
1097     spin_lock(pte_ptl);
1098     isolated = __collapse_huge_page_isolate(vma, address, pte,
1099             &compound_pagelist);
1100     spin_unlock(pte_ptl);
1101 
1102     if (unlikely(!isolated)) {
1103         pte_unmap(pte);
1104         spin_lock(pmd_ptl);
1105         BUG_ON(!pmd_none(*pmd));
1106         /*
1107          * We can only use set_pmd_at when establishing
1108          * hugepmds and never for establishing regular pmds that
1109          * point to regular pagetables. Use pmd_populate for that.
1110          */
1111         pmd_populate(mm, pmd, pmd_pgtable(_pmd));
1112         spin_unlock(pmd_ptl);
1113         anon_vma_unlock_write(vma->anon_vma);
1114         result = SCAN_FAIL;
1115         goto out_up_write;
1116     }
1117 
1118     /*
1119      * All pages are isolated and locked so anon_vma rmap
1120      * can't run anymore.
1121      */
1122     anon_vma_unlock_write(vma->anon_vma);
1123 
1124     __collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl,
1125             &compound_pagelist);
1126     pte_unmap(pte);
1127     /*
1128      * spin_lock() below is not the equivalent of smp_wmb(), but
1129      * the smp_wmb() inside __SetPageUptodate() can be reused to
1130      * avoid the copy_huge_page writes becoming visible after
1131      * the set_pmd_at() write.
1132      */
1133     __SetPageUptodate(new_page);
1134     pgtable = pmd_pgtable(_pmd);
1135 
1136     _pmd = mk_huge_pmd(new_page, vma->vm_page_prot);
1137     _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
1138 
1139     spin_lock(pmd_ptl);
1140     BUG_ON(!pmd_none(*pmd));
1141     page_add_new_anon_rmap(new_page, vma, address);
1142     lru_cache_add_inactive_or_unevictable(new_page, vma);
1143     pgtable_trans_huge_deposit(mm, pmd, pgtable);
1144     set_pmd_at(mm, address, pmd, _pmd);
1145     update_mmu_cache_pmd(vma, address, pmd);
1146     spin_unlock(pmd_ptl);
1147 
1148     *hpage = NULL;
1149 
1150     khugepaged_pages_collapsed++;
1151     result = SCAN_SUCCEED;
1152 out_up_write:
1153     mmap_write_unlock(mm);
1154 out_nolock:
1155     if (!IS_ERR_OR_NULL(*hpage))
1156         mem_cgroup_uncharge(page_folio(*hpage));
1157     trace_mm_collapse_huge_page(mm, isolated, result);
1158     return;
1159 }
1160 
1161 static int khugepaged_scan_pmd(struct mm_struct *mm,
1162                    struct vm_area_struct *vma,
1163                    unsigned long address,
1164                    struct page **hpage)
1165 {
1166     pmd_t *pmd;
1167     pte_t *pte, *_pte;
1168     int ret = 0, result = 0, referenced = 0;
1169     int none_or_zero = 0, shared = 0;
1170     struct page *page = NULL;
1171     unsigned long _address;
1172     spinlock_t *ptl;
1173     int node = NUMA_NO_NODE, unmapped = 0;
1174     bool writable = false;
1175 
1176     VM_BUG_ON(address & ~HPAGE_PMD_MASK);
1177 
1178     pmd = mm_find_pmd(mm, address);
1179     if (!pmd) {
1180         result = SCAN_PMD_NULL;
1181         goto out;
1182     }
1183 
1184     memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
1185     pte = pte_offset_map_lock(mm, pmd, address, &ptl);
1186     for (_address = address, _pte = pte; _pte < pte + HPAGE_PMD_NR;
1187          _pte++, _address += PAGE_SIZE) {
1188         pte_t pteval = *_pte;
1189         if (is_swap_pte(pteval)) {
1190             if (++unmapped <= khugepaged_max_ptes_swap) {
1191                 /*
1192                  * Always be strict with uffd-wp
1193                  * enabled swap entries.  Please see
1194                  * comment below for pte_uffd_wp().
1195                  */
1196                 if (pte_swp_uffd_wp(pteval)) {
1197                     result = SCAN_PTE_UFFD_WP;
1198                     goto out_unmap;
1199                 }
1200                 continue;
1201             } else {
1202                 result = SCAN_EXCEED_SWAP_PTE;
1203                 count_vm_event(THP_SCAN_EXCEED_SWAP_PTE);
1204                 goto out_unmap;
1205             }
1206         }
1207         if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
1208             if (!userfaultfd_armed(vma) &&
1209                 ++none_or_zero <= khugepaged_max_ptes_none) {
1210                 continue;
1211             } else {
1212                 result = SCAN_EXCEED_NONE_PTE;
1213                 count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
1214                 goto out_unmap;
1215             }
1216         }
1217         if (pte_uffd_wp(pteval)) {
1218             /*
1219              * Don't collapse the page if any of the small
1220              * PTEs are armed with uffd write protection.
1221              * Here we can also mark the new huge pmd as
1222              * write protected if any of the small ones is
1223              * marked but that could bring unknown
1224              * userfault messages that fall outside of
1225              * the registered range.  So, just be simple.
1226              */
1227             result = SCAN_PTE_UFFD_WP;
1228             goto out_unmap;
1229         }
1230         if (pte_write(pteval))
1231             writable = true;
1232 
1233         page = vm_normal_page(vma, _address, pteval);
1234         if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
1235             result = SCAN_PAGE_NULL;
1236             goto out_unmap;
1237         }
1238 
1239         if (page_mapcount(page) > 1 &&
1240                 ++shared > khugepaged_max_ptes_shared) {
1241             result = SCAN_EXCEED_SHARED_PTE;
1242             count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
1243             goto out_unmap;
1244         }
1245 
1246         page = compound_head(page);
1247 
1248         /*
1249          * Record which node the original page is from and save this
1250          * information to khugepaged_node_load[].
1251          * Khugepaged will allocate the hugepage from the node with the
1252          * most hits.
1253          */
1254         node = page_to_nid(page);
1255         if (khugepaged_scan_abort(node)) {
1256             result = SCAN_SCAN_ABORT;
1257             goto out_unmap;
1258         }
1259         khugepaged_node_load[node]++;
1260         if (!PageLRU(page)) {
1261             result = SCAN_PAGE_LRU;
1262             goto out_unmap;
1263         }
1264         if (PageLocked(page)) {
1265             result = SCAN_PAGE_LOCK;
1266             goto out_unmap;
1267         }
1268         if (!PageAnon(page)) {
1269             result = SCAN_PAGE_ANON;
1270             goto out_unmap;
1271         }
1272 
1273         /*
1274          * Check if the page has any GUP (or other external) pins.
1275          *
1276          * Here the check is racy: it may see total_mapcount > refcount
1277          * in some cases.
1278          * For example, take one process with one forked child process.
1279          * The parent has the PMD split due to MADV_DONTNEED, then
1280          * the child tries to unmap the whole PMD, but khugepaged
1281          * may be scanning the parent between the child clearing the
1282          * PageDoubleMap flag and decrementing the mapcount.  So
1283          * khugepaged may see total_mapcount > refcount.
1284          *
1285          * But such a case is ephemeral; we can always retry the collapse
1286          * later.  However, it may report a false positive if the page
1287          * has excessive GUP pins (i.e. 512).  Since the same check
1288          * will be done again later, the risk seems low.
1289          */
1290         if (!is_refcount_suitable(page)) {
1291             result = SCAN_PAGE_COUNT;
1292             goto out_unmap;
1293         }
1294         if (pte_young(pteval) ||
1295             page_is_young(page) || PageReferenced(page) ||
1296             mmu_notifier_test_young(vma->vm_mm, address))
1297             referenced++;
1298     }
1299     if (!writable) {
1300         result = SCAN_PAGE_RO;
1301     } else if (!referenced || (unmapped && referenced < HPAGE_PMD_NR/2)) {
1302         result = SCAN_LACK_REFERENCED_PAGE;
1303     } else {
1304         result = SCAN_SUCCEED;
1305         ret = 1;
1306     }
1307 out_unmap:
1308     pte_unmap_unlock(pte, ptl);
1309     if (ret) {
1310         node = khugepaged_find_target_node();
1311         /* collapse_huge_page will return with the mmap_lock released */
1312         collapse_huge_page(mm, address, hpage, node,
1313                 referenced, unmapped);
1314     }
1315 out:
1316     trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced,
1317                      none_or_zero, result, unmapped);
1318     return ret;
1319 }
1320 
1321 static void collect_mm_slot(struct mm_slot *mm_slot)
1322 {
1323     struct mm_struct *mm = mm_slot->mm;
1324 
1325     lockdep_assert_held(&khugepaged_mm_lock);
1326 
1327     if (khugepaged_test_exit(mm)) {
1328         /* free mm_slot */
1329         hash_del(&mm_slot->hash);
1330         list_del(&mm_slot->mm_node);
1331 
1332         /*
1333          * Not strictly needed because the mm exited already.
1334          *
1335          * clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
1336          */
1337 
1338         /* khugepaged_mm_lock actually not necessary for the below */
1339         free_mm_slot(mm_slot);
1340         mmdrop(mm);
1341     }
1342 }
1343 
1344 #ifdef CONFIG_SHMEM
1345 /*
1346  * Notify khugepaged that the given addr of the mm is a pte-mapped THP. Then
1347  * khugepaged should try to collapse the page table.
1348  */
1349 static void khugepaged_add_pte_mapped_thp(struct mm_struct *mm,
1350                       unsigned long addr)
1351 {
1352     struct mm_slot *mm_slot;
1353 
1354     VM_BUG_ON(addr & ~HPAGE_PMD_MASK);
1355 
1356     spin_lock(&khugepaged_mm_lock);
1357     mm_slot = get_mm_slot(mm);
1358     if (likely(mm_slot && mm_slot->nr_pte_mapped_thp < MAX_PTE_MAPPED_THP))
1359         mm_slot->pte_mapped_thp[mm_slot->nr_pte_mapped_thp++] = addr;
1360     spin_unlock(&khugepaged_mm_lock);
1361 }
1362 
1363 static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
1364                   unsigned long addr, pmd_t *pmdp)
1365 {
1366     spinlock_t *ptl;
1367     pmd_t pmd;
1368 
1369     mmap_assert_write_locked(mm);
1370     ptl = pmd_lock(vma->vm_mm, pmdp);
1371     pmd = pmdp_collapse_flush(vma, addr, pmdp);
1372     spin_unlock(ptl);
1373     mm_dec_nr_ptes(mm);
1374     page_table_check_pte_clear_range(mm, addr, pmd);
1375     pte_free(mm, pmd_pgtable(pmd));
1376 }
1377 
1378 /**
1379  * collapse_pte_mapped_thp - Try to collapse a pte-mapped THP for mm at
1380  * address haddr.
1381  *
1382  * @mm: process address space where collapse happens
1383  * @addr: THP collapse address
1384  *
1385  * This function checks whether all the PTEs in the PMD are pointing to the
1386  * right THP. If so, retract the page table so the THP can refault in
1387  * as pmd-mapped.
1388  */
1389 void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
1390 {
1391     unsigned long haddr = addr & HPAGE_PMD_MASK;
1392     struct vm_area_struct *vma = find_vma(mm, haddr);
1393     struct page *hpage;
1394     pte_t *start_pte, *pte;
1395     pmd_t *pmd;
1396     spinlock_t *ptl;
1397     int count = 0;
1398     int i;
1399 
1400     if (!vma || !vma->vm_file ||
1401         !range_in_vma(vma, haddr, haddr + HPAGE_PMD_SIZE))
1402         return;
1403 
1404     /*
1405      * This vm_flags may not have VM_HUGEPAGE if the page was not
1406      * collapsed by this mm. But we can still collapse if the page is
1407      * a valid THP. Add the extra VM_HUGEPAGE so hugepage_vma_check()
1408      * will not fail the vma for missing VM_HUGEPAGE.
1409      */
1410     if (!hugepage_vma_check(vma, vma->vm_flags | VM_HUGEPAGE, false, false))
1411         return;
1412 
1413     /* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */
1414     if (userfaultfd_wp(vma))
1415         return;
1416 
1417     hpage = find_lock_page(vma->vm_file->f_mapping,
1418                    linear_page_index(vma, haddr));
1419     if (!hpage)
1420         return;
1421 
1422     if (!PageHead(hpage))
1423         goto drop_hpage;
1424 
1425     pmd = mm_find_pmd(mm, haddr);
1426     if (!pmd)
1427         goto drop_hpage;
1428 
1429     start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
1430 
1431     /* step 1: check all mapped PTEs are to the right huge page */
1432     for (i = 0, addr = haddr, pte = start_pte;
1433          i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
1434         struct page *page;
1435 
1436         /* empty pte, skip */
1437         if (pte_none(*pte))
1438             continue;
1439 
1440         /* page swapped out, abort */
1441         if (!pte_present(*pte))
1442             goto abort;
1443 
1444         page = vm_normal_page(vma, addr, *pte);
1445         if (WARN_ON_ONCE(page && is_zone_device_page(page)))
1446             page = NULL;
1447         /*
1448          * Note that uprobe, debugger, or MAP_PRIVATE may change the
1449          * page table, but the new page will not be a subpage of hpage.
1450          */
1451         if (hpage + i != page)
1452             goto abort;
1453         count++;
1454     }
1455 
1456     /* step 2: adjust rmap */
1457     for (i = 0, addr = haddr, pte = start_pte;
1458          i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
1459         struct page *page;
1460 
1461         if (pte_none(*pte))
1462             continue;
1463         page = vm_normal_page(vma, addr, *pte);
1464         if (WARN_ON_ONCE(page && is_zone_device_page(page)))
1465             goto abort;
1466         page_remove_rmap(page, vma, false);
1467     }
1468 
1469     pte_unmap_unlock(start_pte, ptl);
1470 
1471     /* step 3: set proper refcount and mm_counters. */
1472     if (count) {
1473         page_ref_sub(hpage, count);
1474         add_mm_counter(vma->vm_mm, mm_counter_file(hpage), -count);
1475     }
1476 
1477     /* step 4: collapse pmd */
1478     collapse_and_free_pmd(mm, vma, haddr, pmd);
1479 drop_hpage:
1480     unlock_page(hpage);
1481     put_page(hpage);
1482     return;
1483 
1484 abort:
1485     pte_unmap_unlock(start_pte, ptl);
1486     goto drop_hpage;
1487 }
1488 
1489 static void khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
1490 {
1491     struct mm_struct *mm = mm_slot->mm;
1492     int i;
1493 
1494     if (likely(mm_slot->nr_pte_mapped_thp == 0))
1495         return;
1496 
1497     if (!mmap_write_trylock(mm))
1498         return;
1499 
1500     if (unlikely(khugepaged_test_exit(mm)))
1501         goto out;
1502 
1503     for (i = 0; i < mm_slot->nr_pte_mapped_thp; i++)
1504         collapse_pte_mapped_thp(mm, mm_slot->pte_mapped_thp[i]);
1505 
1506 out:
1507     mm_slot->nr_pte_mapped_thp = 0;
1508     mmap_write_unlock(mm);
1509 }
1510 
1511 static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
1512 {
1513     struct vm_area_struct *vma;
1514     struct mm_struct *mm;
1515     unsigned long addr;
1516     pmd_t *pmd;
1517 
1518     i_mmap_lock_write(mapping);
1519     vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
1520         /*
1521          * Check vma->anon_vma to exclude MAP_PRIVATE mappings that
1522          * got written to. These VMAs are likely not worth the cost of
1523          * mmap_write_lock(mm), as the PMD-mapping is likely to be split
1524          * later.
1525          *
1526          * Note that the vma->anon_vma check is racy: it can be set up after
1527          * the check but before we take the mmap_lock by the fault path.
1528          * But the page lock would prevent establishing any new ptes of the
1529          * page, so we are safe.
1530          *
1531          * An alternative would be to drop the check, but check that the
1532          * page table is clear before calling pmdp_collapse_flush() under
1533          * ptl. That has a higher chance of recovering a THP for the VMA,
1534          * but has a higher cost too.
1535          */
1536         if (vma->anon_vma)
1537             continue;
1538         addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
1539         if (addr & ~HPAGE_PMD_MASK)
1540             continue;
1541         if (vma->vm_end < addr + HPAGE_PMD_SIZE)
1542             continue;
1543         mm = vma->vm_mm;
1544         pmd = mm_find_pmd(mm, addr);
1545         if (!pmd)
1546             continue;
1547         /*
1548          * We need exclusive mmap_lock to retract page table.
1549          *
1550          * We use trylock due to lock inversion: we need to acquire
1551          * mmap_lock while holding page lock. Fault path does it in
1552          * reverse order. Trylock is a way to avoid deadlock.
1553          */
1554         if (mmap_write_trylock(mm)) {
1555             /*
1556              * When a vma is registered with uffd-wp, we can't
1557              * recycle the pmd pgtable because there can be pte
1558              * markers installed.  Only skip it, so the rest of the
1559              * mm/vma can still have the same file mapped hugely; however,
1560              * it'll always be mapped with small pages for uffd-wp
1561              * registered ranges.
1562              */
1563             if (!khugepaged_test_exit(mm) && !userfaultfd_wp(vma))
1564                 collapse_and_free_pmd(mm, vma, addr, pmd);
1565             mmap_write_unlock(mm);
1566         } else {
1567             /* Try again later */
1568             khugepaged_add_pte_mapped_thp(mm, addr);
1569         }
1570     }
1571     i_mmap_unlock_write(mapping);
1572 }
1573 
1574 /**
1575  * collapse_file - collapse filemap/tmpfs/shmem pages into a huge one.
1576  *
1577  * @mm: process address space where collapse happens
1578  * @file: file that the collapse operates on
1579  * @start: collapse start address
1580  * @hpage: new allocated huge page for collapse
1581  * @node: node the new huge page is allocated from
1582  *
1583  * Basic scheme is simple, details are more complex:
1584  *  - allocate and lock a new huge page;
1585  *  - scan page cache replacing old pages with the new one
1586  *    + swap/gup in pages if necessary;
1587  *    + fill in gaps;
1588  *    + keep old pages around in case rollback is required;
1589  *  - if replacing succeeds:
1590  *    + copy data over;
1591  *    + free old pages;
1592  *    + unlock huge page;
1593  *  - if replacing failed:
1594  *    + put all pages back and unfreeze them;
1595  *    + restore gaps in the page cache;
1596  *    + unlock and free huge page;
1597  */
1598 static void collapse_file(struct mm_struct *mm,
1599         struct file *file, pgoff_t start,
1600         struct page **hpage, int node)
1601 {
1602     struct address_space *mapping = file->f_mapping;
1603     gfp_t gfp;
1604     struct page *new_page;
1605     pgoff_t index, end = start + HPAGE_PMD_NR;
1606     LIST_HEAD(pagelist);
1607     XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER);
1608     int nr_none = 0, result = SCAN_SUCCEED;
1609     bool is_shmem = shmem_file(file);
1610     int nr;
1611 
1612     VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem);
1613     VM_BUG_ON(start & (HPAGE_PMD_NR - 1));
1614 
1615     /* Only allocate from the target node */
1616     gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE;
1617 
1618     new_page = khugepaged_alloc_page(hpage, gfp, node);
1619     if (!new_page) {
1620         result = SCAN_ALLOC_HUGE_PAGE_FAIL;
1621         goto out;
1622     }
1623 
1624     if (unlikely(mem_cgroup_charge(page_folio(new_page), mm, gfp))) {
1625         result = SCAN_CGROUP_CHARGE_FAIL;
1626         goto out;
1627     }
1628     count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC);
1629 
1630     /*
1631      * Ensure we have slots for all the pages in the range.  This is
1632      * almost certainly a no-op because most of the pages must be present.
1633      */
1634     do {
1635         xas_lock_irq(&xas);
1636         xas_create_range(&xas);
1637         if (!xas_error(&xas))
1638             break;
1639         xas_unlock_irq(&xas);
1640         if (!xas_nomem(&xas, GFP_KERNEL)) {
1641             result = SCAN_FAIL;
1642             goto out;
1643         }
1644     } while (1);
1645 
1646     __SetPageLocked(new_page);
1647     if (is_shmem)
1648         __SetPageSwapBacked(new_page);
1649     new_page->index = start;
1650     new_page->mapping = mapping;
1651 
1652     /*
1653      * At this point the new_page is locked and not up-to-date.
1654      * It's safe to insert it into the page cache, because nobody would
1655      * be able to map it or use it in another way until we unlock it.
1656      */
1657 
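         /*
          * Walk the range one index at a time: look up the existing page
          * (or hole/swap entry), lock and isolate it, freeze its
          * references, and replace its page cache slot with new_page.
          * Any failure bails out to the rollback paths below.
          */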
1658     xas_set(&xas, start);
1659     for (index = start; index < end; index++) {
1660         struct page *page = xas_next(&xas);
1661 
1662         VM_BUG_ON(index != xas.xa_index);
1663         if (is_shmem) {
1664             if (!page) {
1665                 /*
1666                  * Stop if extent has been truncated or
1667                  * hole-punched, and is now completely
1668                  * empty.
1669                  */
1670                 if (index == start) {
1671                     if (!xas_next_entry(&xas, end - 1)) {
1672                         result = SCAN_TRUNCATED;
1673                         goto xa_locked;
1674                     }
1675                     xas_set(&xas, index);
1676                 }
1677                 if (!shmem_charge(mapping->host, 1)) {
1678                     result = SCAN_FAIL;
1679                     goto xa_locked;
1680                 }
1681                 xas_store(&xas, new_page);
1682                 nr_none++;
1683                 continue;
1684             }
1685 
1686             if (xa_is_value(page) || !PageUptodate(page)) {
1687                 xas_unlock_irq(&xas);
1688                 /* swap in or instantiate fallocated page */
1689                 if (shmem_getpage(mapping->host, index, &page,
1690                           SGP_NOALLOC)) {
1691                     result = SCAN_FAIL;
1692                     goto xa_unlocked;
1693                 }
1694             } else if (trylock_page(page)) {
1695                 get_page(page);
1696                 xas_unlock_irq(&xas);
1697             } else {
1698                 result = SCAN_PAGE_LOCK;
1699                 goto xa_locked;
1700             }
1701         } else {    /* !is_shmem */
1702             if (!page || xa_is_value(page)) {
1703                 xas_unlock_irq(&xas);
1704                 page_cache_sync_readahead(mapping, &file->f_ra,
1705                               file, index,
1706                               end - index);
1707                 /* drain pagevecs to help isolate_lru_page() */
1708                 lru_add_drain();
1709                 page = find_lock_page(mapping, index);
1710                 if (unlikely(page == NULL)) {
1711                     result = SCAN_FAIL;
1712                     goto xa_unlocked;
1713                 }
1714             } else if (PageDirty(page)) {
1715                 /*
1716                  * khugepaged only works on read-only fd,
1717                  * so this page is dirty because it hasn't
1718                  * been flushed since first write. There
1719                  * won't be new dirty pages.
1720                  *
1721                  * Trigger async flush here and hope the
1722                  * writeback is done when khugepaged
1723                  * revisits this page.
1724                  *
1725                  * This is a one-off situation. We are not
1726                  * forcing writeback in a loop.
1727                  */
1728                 xas_unlock_irq(&xas);
1729                 filemap_flush(mapping);
1730                 result = SCAN_FAIL;
1731                 goto xa_unlocked;
1732             } else if (PageWriteback(page)) {
1733                 xas_unlock_irq(&xas);
1734                 result = SCAN_FAIL;
1735                 goto xa_unlocked;
1736             } else if (trylock_page(page)) {
1737                 get_page(page);
1738                 xas_unlock_irq(&xas);
1739             } else {
1740                 result = SCAN_PAGE_LOCK;
1741                 goto xa_locked;
1742             }
1743         }
1744 
1745         /*
1746          * The page must be locked, so we can drop the i_pages lock
1747          * without racing with truncate.
1748          */
1749         VM_BUG_ON_PAGE(!PageLocked(page), page);
1750 
1751         /* make sure the page is up to date */
1752         if (unlikely(!PageUptodate(page))) {
1753             result = SCAN_FAIL;
1754             goto out_unlock;
1755         }
1756 
1757         /*
1758          * If file was truncated then extended, or hole-punched, before
1759          * we locked the first page, then a THP might be there already.
1760          */
1761         if (PageTransCompound(page)) {
1762             result = SCAN_PAGE_COMPOUND;
1763             goto out_unlock;
1764         }
1765 
1766         if (page_mapping(page) != mapping) {
1767             result = SCAN_TRUNCATED;
1768             goto out_unlock;
1769         }
1770 
1771         if (!is_shmem && (PageDirty(page) ||
1772                   PageWriteback(page))) {
1773             /*
1774              * khugepaged only works on read-only fd, so this
1775              * page is dirty because it hasn't been flushed
1776              * since first write.
1777              */
1778             result = SCAN_FAIL;
1779             goto out_unlock;
1780         }
1781 
1782         if (isolate_lru_page(page)) {
1783             result = SCAN_DEL_PAGE_LRU;
1784             goto out_unlock;
1785         }
1786 
1787         if (page_has_private(page) &&
1788             !try_to_release_page(page, GFP_KERNEL)) {
1789             result = SCAN_PAGE_HAS_PRIVATE;
1790             putback_lru_page(page);
1791             goto out_unlock;
1792         }
1793 
1794         if (page_mapped(page))
1795             try_to_unmap(page_folio(page),
1796                     TTU_IGNORE_MLOCK | TTU_BATCH_FLUSH);
1797 
1798         xas_lock_irq(&xas);
1799         xas_set(&xas, index);
1800 
1801         VM_BUG_ON_PAGE(page != xas_load(&xas), page);
1802 
1803         /*
1804          * The page is expected to have page_count() == 3:
1805          *  - we hold a pin on it;
1806          *  - one reference from page cache;
1807          *  - one from isolate_lru_page;
1808          */
1809         if (!page_ref_freeze(page, 3)) {
1810             result = SCAN_PAGE_COUNT;
1811             xas_unlock_irq(&xas);
1812             putback_lru_page(page);
1813             goto out_unlock;
1814         }
1815 
1816         /*
1817          * Add the page to the list to be able to undo the collapse if
1818          * something goes wrong.
1819          */
1820         list_add_tail(&page->lru, &pagelist);
1821 
1822         /* Finally, replace with the new page. */
1823         xas_store(&xas, new_page);
1824         continue;
1825 out_unlock:
1826         unlock_page(page);
1827         put_page(page);
1828         goto xa_unlocked;
1829     }
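         /*
          * Reaching here means every slot in the range now points at
          * new_page and the old pages sit frozen on the local pagelist;
          * the i_pages lock is still held, so account the new THP before
          * joining the slots into one multi-index entry.
          */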
1830     nr = thp_nr_pages(new_page);
1831 
1832     if (is_shmem)
1833         __mod_lruvec_page_state(new_page, NR_SHMEM_THPS, nr);
1834     else {
1835         __mod_lruvec_page_state(new_page, NR_FILE_THPS, nr);
1836         filemap_nr_thps_inc(mapping);
1837         /*
1838          * Paired with smp_mb() in do_dentry_open() to ensure
1839          * i_writecount is up to date and the update to nr_thps is
1840          * visible. Ensures the page cache will be truncated if the
1841          * file is opened writable.
1842          */
1843         smp_mb();
1844         if (inode_is_open_for_write(mapping->host)) {
1845             result = SCAN_FAIL;
1846             __mod_lruvec_page_state(new_page, NR_FILE_THPS, -nr);
1847             filemap_nr_thps_dec(mapping);
1848             goto xa_locked;
1849         }
1850     }
1851 
1852     if (nr_none) {
1853         __mod_lruvec_page_state(new_page, NR_FILE_PAGES, nr_none);
1854         /* nr_none is always 0 for non-shmem. */
1855         __mod_lruvec_page_state(new_page, NR_SHMEM, nr_none);
1856     }
1857 
1858     /* Join all the small entries into a single multi-index entry */
1859     xas_set_order(&xas, start, HPAGE_PMD_ORDER);
1860     xas_store(&xas, new_page);
1861 xa_locked:
1862     xas_unlock_irq(&xas);
1863 xa_unlocked:
1864 
1865     /*
1866      * If collapse is successful, flush must be done now before copying.
1867      * If collapse is unsuccessful, does flush actually need to be done?
1868      * Do it anyway, to clear the state.
1869      */
1870     try_to_unmap_flush();
1871 
1872     if (result == SCAN_SUCCEED) {
1873         struct page *page, *tmp;
1874 
1875         /*
1876          * Replacing old pages with new one has succeeded, now we
1877          * need to copy the content and free the old pages.
1878          */
1879         index = start;
1880         list_for_each_entry_safe(page, tmp, &pagelist, lru) {
1881             while (index < page->index) {
1882                 clear_highpage(new_page + (index % HPAGE_PMD_NR));
1883                 index++;
1884             }
1885             copy_highpage(new_page + (page->index % HPAGE_PMD_NR),
1886                     page);
1887             list_del(&page->lru);
1888             page->mapping = NULL;
1889             page_ref_unfreeze(page, 1);
1890             ClearPageActive(page);
1891             ClearPageUnevictable(page);
1892             unlock_page(page);
1893             put_page(page);
1894             index++;
1895         }
1896         while (index < end) {
1897             clear_highpage(new_page + (index % HPAGE_PMD_NR));
1898             index++;
1899         }
1900 
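             /*
              * The huge page now holds a copy of every old page, with
              * holes zero-filled.  Publish it: mark it uptodate, take
              * the remaining references for the page cache entries,
              * dirty it for shmem, and put it on the LRU.
              */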
1901         SetPageUptodate(new_page);
1902         page_ref_add(new_page, HPAGE_PMD_NR - 1);
1903         if (is_shmem)
1904             set_page_dirty(new_page);
1905         lru_cache_add(new_page);
1906 
1907         /*
1908          * Remove pte page tables, so we can re-fault the page as huge.
1909          */
1910         retract_page_tables(mapping, start);
1911         *hpage = NULL;
1912 
1913         khugepaged_pages_collapsed++;
1914     } else {
1915         struct page *page;
1916 
1917         /* Something went wrong: roll back page cache changes */
1918         xas_lock_irq(&xas);
1919         if (nr_none) {
1920             mapping->nrpages -= nr_none;
1921             shmem_uncharge(mapping->host, nr_none);
1922         }
1923 
1924         xas_set(&xas, start);
1925         xas_for_each(&xas, page, end - 1) {
1926             page = list_first_entry_or_null(&pagelist,
1927                     struct page, lru);
1928             if (!page || xas.xa_index < page->index) {
1929                 if (!nr_none)
1930                     break;
1931                 nr_none--;
1932                 /* Put holes back where they were */
1933                 xas_store(&xas, NULL);
1934                 continue;
1935             }
1936 
1937             VM_BUG_ON_PAGE(page->index != xas.xa_index, page);
1938 
1939             /* Unfreeze the page. */
1940             list_del(&page->lru);
1941             page_ref_unfreeze(page, 2);
1942             xas_store(&xas, page);
1943             xas_pause(&xas);
1944             xas_unlock_irq(&xas);
1945             unlock_page(page);
1946             putback_lru_page(page);
1947             xas_lock_irq(&xas);
1948         }
1949         VM_BUG_ON(nr_none);
1950         xas_unlock_irq(&xas);
1951 
1952         new_page->mapping = NULL;
1953     }
1954 
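         /*
          * Whether new_page was published above or the rollback detached
          * it, it is still locked from __SetPageLocked(); drop the lock.
          */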
1955     unlock_page(new_page);
1956 out:
1957     VM_BUG_ON(!list_empty(&pagelist));
1958     if (!IS_ERR_OR_NULL(*hpage))
1959         mem_cgroup_uncharge(page_folio(*hpage));
1960     /* TODO: tracepoints */
1961 }
1962 
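     /*
      * Scan one PMD-sized window of the file's page cache under RCU and
      * decide whether it is worth collapsing: bail out on excess swap
      * entries or holes, existing compound pages, off-LRU pages or
      * unexpected extra references.  If the window qualifies, pick a
      * target node and call collapse_file().
      */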
1963 static void khugepaged_scan_file(struct mm_struct *mm,
1964         struct file *file, pgoff_t start, struct page **hpage)
1965 {
1966     struct page *page = NULL;
1967     struct address_space *mapping = file->f_mapping;
1968     XA_STATE(xas, &mapping->i_pages, start);
1969     int present, swap;
1970     int node = NUMA_NO_NODE;
1971     int result = SCAN_SUCCEED;
1972 
1973     present = 0;
1974     swap = 0;
1975     memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
1976     rcu_read_lock();
1977     xas_for_each(&xas, page, start + HPAGE_PMD_NR - 1) {
1978         if (xas_retry(&xas, page))
1979             continue;
1980 
1981         if (xa_is_value(page)) {
1982             if (++swap > khugepaged_max_ptes_swap) {
1983                 result = SCAN_EXCEED_SWAP_PTE;
1984                 count_vm_event(THP_SCAN_EXCEED_SWAP_PTE);
1985                 break;
1986             }
1987             continue;
1988         }
1989 
1990         /*
1991          * XXX: khugepaged should compact smaller compound pages
1992          * into a PMD-sized page
1993          */
1994         if (PageTransCompound(page)) {
1995             result = SCAN_PAGE_COMPOUND;
1996             break;
1997         }
1998 
1999         node = page_to_nid(page);
2000         if (khugepaged_scan_abort(node)) {
2001             result = SCAN_SCAN_ABORT;
2002             break;
2003         }
2004         khugepaged_node_load[node]++;
2005 
2006         if (!PageLRU(page)) {
2007             result = SCAN_PAGE_LRU;
2008             break;
2009         }
2010 
2011         if (page_count(page) !=
2012             1 + page_mapcount(page) + page_has_private(page)) {
2013             result = SCAN_PAGE_COUNT;
2014             break;
2015         }
2016 
2017         /*
2018          * We probably should check if the page is referenced here, but
2019          * nobody would transfer pte_young() to PageReferenced() for us.
2020          * And rmap walk here is just too costly...
2021          */
2022 
2023         present++;
2024 
2025         if (need_resched()) {
2026             xas_pause(&xas);
2027             cond_resched_rcu();
2028         }
2029     }
2030     rcu_read_unlock();
2031 
2032     if (result == SCAN_SUCCEED) {
2033         if (present < HPAGE_PMD_NR - khugepaged_max_ptes_none) {
2034             result = SCAN_EXCEED_NONE_PTE;
2035             count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
2036         } else {
2037             node = khugepaged_find_target_node();
2038             collapse_file(mm, file, start, hpage, node);
2039         }
2040     }
2041 
2042     /* TODO: tracepoints */
2043 }
2044 #else
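     /*
      * Without file-backed collapse support compiled in, the file
      * scanning path must never be reached (hence BUILD_BUG()) and there
      * are no pte-mapped THPs to collapse.
      */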
2045 static void khugepaged_scan_file(struct mm_struct *mm,
2046         struct file *file, pgoff_t start, struct page **hpage)
2047 {
2048     BUILD_BUG();
2049 }
2050 
2051 static void khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
2052 {
2053 }
2054 #endif
2055 
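     /*
      * Scan up to @pages pte entries starting from the mm and address
      * recorded in khugepaged_scan, moving on to the next mm_slot once
      * the current mm is exiting or fully scanned.  khugepaged_mm_lock
      * is dropped for the actual scanning and re-taken before returning
      * the progress made.
      */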
2056 static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
2057                         struct page **hpage)
2058     __releases(&khugepaged_mm_lock)
2059     __acquires(&khugepaged_mm_lock)
2060 {
2061     struct mm_slot *mm_slot;
2062     struct mm_struct *mm;
2063     struct vm_area_struct *vma;
2064     int progress = 0;
2065 
2066     VM_BUG_ON(!pages);
2067     lockdep_assert_held(&khugepaged_mm_lock);
2068 
2069     if (khugepaged_scan.mm_slot)
2070         mm_slot = khugepaged_scan.mm_slot;
2071     else {
2072         mm_slot = list_entry(khugepaged_scan.mm_head.next,
2073                      struct mm_slot, mm_node);
2074         khugepaged_scan.address = 0;
2075         khugepaged_scan.mm_slot = mm_slot;
2076     }
2077     spin_unlock(&khugepaged_mm_lock);
2078     khugepaged_collapse_pte_mapped_thps(mm_slot);
2079 
2080     mm = mm_slot->mm;
2081     /*
2082      * Don't wait for semaphore (to avoid long wait times).  Just move to
2083      * the next mm on the list.
2084      */
2085     vma = NULL;
2086     if (unlikely(!mmap_read_trylock(mm)))
2087         goto breakouterloop_mmap_lock;
2088     if (likely(!khugepaged_test_exit(mm)))
2089         vma = find_vma(mm, khugepaged_scan.address);
2090 
2091     progress++;
2092     for (; vma; vma = vma->vm_next) {
2093         unsigned long hstart, hend;
2094 
2095         cond_resched();
2096         if (unlikely(khugepaged_test_exit(mm))) {
2097             progress++;
2098             break;
2099         }
2100         if (!hugepage_vma_check(vma, vma->vm_flags, false, false)) {
2101 skip:
2102             progress++;
2103             continue;
2104         }
2105         hstart = round_up(vma->vm_start, HPAGE_PMD_SIZE);
2106         hend = round_down(vma->vm_end, HPAGE_PMD_SIZE);
2107         if (khugepaged_scan.address > hend)
2108             goto skip;
2109         if (khugepaged_scan.address < hstart)
2110             khugepaged_scan.address = hstart;
2111         VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
2112 
2113         while (khugepaged_scan.address < hend) {
2114             int ret;
2115             cond_resched();
2116             if (unlikely(khugepaged_test_exit(mm)))
2117                 goto breakouterloop;
2118 
2119             VM_BUG_ON(khugepaged_scan.address < hstart ||
2120                   khugepaged_scan.address + HPAGE_PMD_SIZE >
2121                   hend);
2122             if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) {
2123                 struct file *file = get_file(vma->vm_file);
2124                 pgoff_t pgoff = linear_page_index(vma,
2125                         khugepaged_scan.address);
2126 
2127                 mmap_read_unlock(mm);
2128                 ret = 1;
2129                 khugepaged_scan_file(mm, file, pgoff, hpage);
2130                 fput(file);
2131             } else {
2132                 ret = khugepaged_scan_pmd(mm, vma,
2133                         khugepaged_scan.address,
2134                         hpage);
2135             }
2136             /* move to next address */
2137             khugepaged_scan.address += HPAGE_PMD_SIZE;
2138             progress += HPAGE_PMD_NR;
2139             if (ret)
2140                 /* we released mmap_lock so break loop */
2141                 goto breakouterloop_mmap_lock;
2142             if (progress >= pages)
2143                 goto breakouterloop;
2144         }
2145     }
2146 breakouterloop:
2147     mmap_read_unlock(mm); /* exit_mmap will destroy ptes after this */
2148 breakouterloop_mmap_lock:
2149 
2150     spin_lock(&khugepaged_mm_lock);
2151     VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot);
2152     /*
2153      * Release the current mm_slot if this mm is about to die, or
2154      * if we scanned all vmas of this mm.
2155      */
2156     if (khugepaged_test_exit(mm) || !vma) {
2157         /*
2158          * Make sure that if mm_users is reaching zero while
2159          * khugepaged runs here, khugepaged_exit will find
2160          * mm_slot not pointing to the exiting mm.
2161          */
2162         if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) {
2163             khugepaged_scan.mm_slot = list_entry(
2164                 mm_slot->mm_node.next,
2165                 struct mm_slot, mm_node);
2166             khugepaged_scan.address = 0;
2167         } else {
2168             khugepaged_scan.mm_slot = NULL;
2169             khugepaged_full_scans++;
2170         }
2171 
2172         collect_mm_slot(mm_slot);
2173     }
2174 
2175     return progress;
2176 }
2177 
2178 static int khugepaged_has_work(void)
2179 {
2180     return !list_empty(&khugepaged_scan.mm_head) &&
2181         hugepage_flags_enabled();
2182 }
2183 
2184 static int khugepaged_wait_event(void)
2185 {
2186     return !list_empty(&khugepaged_scan.mm_head) ||
2187         kthread_should_stop();
2188 }
2189 
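     /*
      * One scan pass: drain the LRU pagevecs, then keep scanning
      * mm_slots until khugepaged_pages_to_scan pte entries have been
      * covered, the huge page preallocation fails, or the thread is
      * asked to stop or freeze.
      */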
2190 static void khugepaged_do_scan(void)
2191 {
2192     struct page *hpage = NULL;
2193     unsigned int progress = 0, pass_through_head = 0;
2194     unsigned int pages = READ_ONCE(khugepaged_pages_to_scan);
2195     bool wait = true;
2196 
2197     lru_add_drain_all();
2198 
2199     while (progress < pages) {
2200         if (!khugepaged_prealloc_page(&hpage, &wait))
2201             break;
2202 
2203         cond_resched();
2204 
2205         if (unlikely(kthread_should_stop() || try_to_freeze()))
2206             break;
2207 
2208         spin_lock(&khugepaged_mm_lock);
2209         if (!khugepaged_scan.mm_slot)
2210             pass_through_head++;
2211         if (khugepaged_has_work() &&
2212             pass_through_head < 2)
2213             progress += khugepaged_scan_mm_slot(pages - progress,
2214                                 &hpage);
2215         else
2216             progress = pages;
2217         spin_unlock(&khugepaged_mm_lock);
2218     }
2219 
2220     if (!IS_ERR_OR_NULL(hpage))
2221         put_page(hpage);
2222 }
2223 
2224 static bool khugepaged_should_wakeup(void)
2225 {
2226     return kthread_should_stop() ||
2227            time_after_eq(jiffies, khugepaged_sleep_expire);
2228 }
2229 
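     /*
      * Sleep between scan passes: a freezable timeout while work is
      * still queued, otherwise block until an mm is registered or the
      * thread should stop.
      */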
2230 static void khugepaged_wait_work(void)
2231 {
2232     if (khugepaged_has_work()) {
2233         const unsigned long scan_sleep_jiffies =
2234             msecs_to_jiffies(khugepaged_scan_sleep_millisecs);
2235 
2236         if (!scan_sleep_jiffies)
2237             return;
2238 
2239         khugepaged_sleep_expire = jiffies + scan_sleep_jiffies;
2240         wait_event_freezable_timeout(khugepaged_wait,
2241                          khugepaged_should_wakeup(),
2242                          scan_sleep_jiffies);
2243         return;
2244     }
2245 
2246     if (hugepage_flags_enabled())
2247         wait_event_freezable(khugepaged_wait, khugepaged_wait_event());
2248 }
2249 
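     /*
      * Main loop of the khugepaged kernel thread: alternate scan passes
      * with sleeps until kthread_stop(), then detach and release the
      * mm_slot that was being scanned.
      */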
2250 static int khugepaged(void *none)
2251 {
2252     struct mm_slot *mm_slot;
2253 
2254     set_freezable();
2255     set_user_nice(current, MAX_NICE);
2256 
2257     while (!kthread_should_stop()) {
2258         khugepaged_do_scan();
2259         khugepaged_wait_work();
2260     }
2261 
2262     spin_lock(&khugepaged_mm_lock);
2263     mm_slot = khugepaged_scan.mm_slot;
2264     khugepaged_scan.mm_slot = NULL;
2265     if (mm_slot)
2266         collect_mm_slot(mm_slot);
2267     spin_unlock(&khugepaged_mm_lock);
2268     return 0;
2269 }
2270 
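     /*
      * Raise min_free_kbytes so that a couple of pageblocks per
      * populated non-movable zone stay free and migratetype fallbacks
      * have room, reducing fragmentation of future huge page
      * allocations.  With hugepages disabled, fall back to the default
      * min_free_kbytes calculation.
      */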
2271 static void set_recommended_min_free_kbytes(void)
2272 {
2273     struct zone *zone;
2274     int nr_zones = 0;
2275     unsigned long recommended_min;
2276 
2277     if (!hugepage_flags_enabled()) {
2278         calculate_min_free_kbytes();
2279         goto update_wmarks;
2280     }
2281 
2282     for_each_populated_zone(zone) {
2283         /*
2284          * We don't need to worry about fragmentation of
2285          * ZONE_MOVABLE since it only has movable pages.
2286          */
2287         if (zone_idx(zone) > gfp_zone(GFP_USER))
2288             continue;
2289 
2290         nr_zones++;
2291     }
2292 
2293     /* Ensure 2 pageblocks are free to assist fragmentation avoidance */
2294     recommended_min = pageblock_nr_pages * nr_zones * 2;
2295 
2296     /*
2297      * Make sure that on average at least two pageblocks are almost free
2298      * of another type, one for a migratetype to fall back to and a
2299      * second to avoid subsequent fallbacks of other types. There are 3
2300      * MIGRATE_TYPES we care about.
2301      */
2302     recommended_min += pageblock_nr_pages * nr_zones *
2303                MIGRATE_PCPTYPES * MIGRATE_PCPTYPES;
2304 
2305     /* never allow reserving more than 5% of the lowmem */
2306     recommended_min = min(recommended_min,
2307                   (unsigned long) nr_free_buffer_pages() / 20);
2308     recommended_min <<= (PAGE_SHIFT-10);
2309 
2310     if (recommended_min > min_free_kbytes) {
2311         if (user_min_free_kbytes >= 0)
2312             pr_info("raising min_free_kbytes from %d to %lu to help transparent hugepage allocations\n",
2313                 min_free_kbytes, recommended_min);
2314 
2315         min_free_kbytes = recommended_min;
2316     }
2317 
2318 update_wmarks:
2319     setup_per_zone_wmarks();
2320 }
2321 
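     /*
      * Bring the khugepaged thread in line with the current THP enabled
      * setting: start it (and kick it if work is already queued) when
      * hugepages are enabled, stop it otherwise, then refresh the
      * min_free_kbytes recommendation.  Note that a failed kthread_run()
      * skips the watermark update via the fail label.
      */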
2322 int start_stop_khugepaged(void)
2323 {
2324     int err = 0;
2325 
2326     mutex_lock(&khugepaged_mutex);
2327     if (hugepage_flags_enabled()) {
2328         if (!khugepaged_thread)
2329             khugepaged_thread = kthread_run(khugepaged, NULL,
2330                             "khugepaged");
2331         if (IS_ERR(khugepaged_thread)) {
2332             pr_err("khugepaged: kthread_run(khugepaged) failed\n");
2333             err = PTR_ERR(khugepaged_thread);
2334             khugepaged_thread = NULL;
2335             goto fail;
2336         }
2337 
2338         if (!list_empty(&khugepaged_scan.mm_head))
2339             wake_up_interruptible(&khugepaged_wait);
2340     } else if (khugepaged_thread) {
2341         kthread_stop(khugepaged_thread);
2342         khugepaged_thread = NULL;
2343     }
2344     set_recommended_min_free_kbytes();
2345 fail:
2346     mutex_unlock(&khugepaged_mutex);
2347     return err;
2348 }
2349 
2350 void khugepaged_min_free_kbytes_update(void)
2351 {
2352     mutex_lock(&khugepaged_mutex);
2353     if (hugepage_flags_enabled() && khugepaged_thread)
2354         set_recommended_min_free_kbytes();
2355     mutex_unlock(&khugepaged_mutex);
2356 }