0001 // SPDX-License-Identifier: GPL-2.0-only
0002 /*
0003  * Memory merging support.
0004  *
0005  * This code enables dynamic sharing of identical pages found in different
0006  * memory areas, even if they are not shared by fork()
0007  *
0008  * Copyright (C) 2008-2009 Red Hat, Inc.
0009  * Authors:
0010  *  Izik Eidus
0011  *  Andrea Arcangeli
0012  *  Chris Wright
0013  *  Hugh Dickins
0014  */
0015 
0016 #include <linux/errno.h>
0017 #include <linux/mm.h>
0018 #include <linux/mm_inline.h>
0019 #include <linux/fs.h>
0020 #include <linux/mman.h>
0021 #include <linux/sched.h>
0022 #include <linux/sched/mm.h>
0023 #include <linux/sched/coredump.h>
0024 #include <linux/rwsem.h>
0025 #include <linux/pagemap.h>
0026 #include <linux/rmap.h>
0027 #include <linux/spinlock.h>
0028 #include <linux/xxhash.h>
0029 #include <linux/delay.h>
0030 #include <linux/kthread.h>
0031 #include <linux/wait.h>
0032 #include <linux/slab.h>
0033 #include <linux/rbtree.h>
0034 #include <linux/memory.h>
0035 #include <linux/mmu_notifier.h>
0036 #include <linux/swap.h>
0037 #include <linux/ksm.h>
0038 #include <linux/hashtable.h>
0039 #include <linux/freezer.h>
0040 #include <linux/oom.h>
0041 #include <linux/numa.h>
0042 
0043 #include <asm/tlbflush.h>
0044 #include "internal.h"
0045 
0046 #ifdef CONFIG_NUMA
0047 #define NUMA(x)     (x)
0048 #define DO_NUMA(x)  do { (x); } while (0)
0049 #else
0050 #define NUMA(x)     (0)
0051 #define DO_NUMA(x)  do { } while (0)
0052 #endif
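
/*
 * Illustrative note: with CONFIG_NUMA=n the NUMA() macro above collapses any
 * node id to 0 without evaluating its argument, so an expression such as the
 * one used further down,
 *
 *	rb_erase(&dup->node, root_stable_tree + NUMA(dup->nid));
 *
 * always indexes the single tree root on non-NUMA builds, while DO_NUMA()
 * compiles the NUMA-only statement away entirely.
 */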
0053 
0054 /**
0055  * DOC: Overview
0056  *
0057  * A few notes about the KSM scanning process,
0058  * to make it easier to understand the data structures below:
0059  *
0060  * In order to reduce excessive scanning, KSM sorts the memory pages by their
0061  * contents into a data structure that holds pointers to the pages' locations.
0062  *
0063  * Since the contents of the pages may change at any moment, KSM cannot just
0064  * insert the pages into a normal sorted tree and expect it to find anything.
0065  * Therefore KSM uses two data structures - the stable and the unstable tree.
0066  *
0067  * The stable tree holds pointers to all the merged pages (ksm pages), sorted
0068  * by their contents.  Because each such page is write-protected, searching on
0069  * this tree is fully assured to be working (except when pages are unmapped),
0070  * and therefore this tree is called the stable tree.
0071  *
0072  * The stable tree node includes information required for reverse
0073  * mapping from a KSM page to virtual addresses that map this page.
0074  *
0075  * In order to avoid large latencies of the rmap walks on KSM pages,
0076  * KSM maintains two types of nodes in the stable tree:
0077  *
0078  * * the regular nodes that keep the reverse mapping structures in a
0079  *   linked list
0080  * * the "chains" that link nodes ("dups") that represent the same
0081  *   write protected memory content, but each "dup" corresponds to a
0082  *   different KSM page copy of that content
0083  *
0084  * Internally, the regular nodes, "dups" and "chains" are represented
0085  * using the same struct stable_node structure.
0086  *
0087  * In addition to the stable tree, KSM uses a second data structure called the
0088  * unstable tree: this tree holds pointers to pages which have been found to
0089  * be "unchanged for a period of time".  The unstable tree sorts these pages
0090  * by their contents, but since they are not write-protected, KSM cannot rely
0091  * upon the unstable tree to work correctly - the unstable tree is liable to
0092  * be corrupted as its contents are modified, and so it is called unstable.
0093  *
0094  * KSM solves this problem by several techniques:
0095  *
0096  * 1) The unstable tree is flushed every time KSM completes scanning all
0097  *    memory areas, and then the tree is rebuilt again from the beginning.
0098  * 2) KSM will only insert into the unstable tree, pages whose hash value
0099  *    has not changed since the previous scan of all memory areas.
0100  * 3) The unstable tree is a RedBlack Tree - so its balancing is based on the
0101  *    colors of the nodes and not on their contents, assuring that even when
0102  *    the tree gets "corrupted" it won't get out of balance, so scanning time
0103  *    remains the same (also, searching and inserting nodes in an rbtree uses
0104  *    the same algorithm, so we have no overhead when we flush and rebuild).
0105  * 4) KSM never flushes the stable tree, which means that even if it were to
0106  *    take 10 attempts to find a page in the unstable tree, once it is found,
0107  *    it is secured in the stable tree.  (When we scan a new page, we first
0108  *    compare it against the stable tree, and then against the unstable tree.)
0109  *
0110  * If the merge_across_nodes tunable is unset, then KSM maintains multiple
0111  * stable trees and multiple unstable trees: one of each for each NUMA node.
0112  */
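
/*
 * For context, a minimal userspace sketch of how an area is opted in to KSM
 * scanning, assuming a kernel built with CONFIG_KSM and a writable private
 * anonymous mapping:
 *
 *	#include <sys/mman.h>
 *
 *	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	if (buf != MAP_FAILED && madvise(buf, len, MADV_MERGEABLE))
 *		perror("madvise(MADV_MERGEABLE)");
 *
 * ksmd only scans such areas while /sys/kernel/mm/ksm/run is 1 (KSM_RUN_MERGE
 * below); madvise(buf, len, MADV_UNMERGEABLE) undoes the registration.
 */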
0113 
0114 /**
0115  * struct mm_slot - ksm information per mm that is being scanned
0116  * @link: link to the mm_slots hash list
0117  * @mm_list: link into the mm_slots list, rooted in ksm_mm_head
0118  * @rmap_list: head for this mm_slot's singly-linked list of rmap_items
0119  * @mm: the mm that this information is valid for
0120  */
0121 struct mm_slot {
0122     struct hlist_node link;
0123     struct list_head mm_list;
0124     struct rmap_item *rmap_list;
0125     struct mm_struct *mm;
0126 };
0127 
0128 /**
0129  * struct ksm_scan - cursor for scanning
0130  * @mm_slot: the current mm_slot we are scanning
0131  * @address: the next address inside that to be scanned
0132  * @rmap_list: link to the next rmap to be scanned in the rmap_list
0133  * @seqnr: count of completed full scans (needed when removing unstable node)
0134  *
0135  * There is only the one ksm_scan instance of this cursor structure.
0136  */
0137 struct ksm_scan {
0138     struct mm_slot *mm_slot;
0139     unsigned long address;
0140     struct rmap_item **rmap_list;
0141     unsigned long seqnr;
0142 };
0143 
0144 /**
0145  * struct stable_node - node of the stable rbtree
0146  * @node: rb node of this ksm page in the stable tree
0147  * @head: (overlaying parent) &migrate_nodes indicates temporarily on that list
0148  * @hlist_dup: linked into the stable_node->hlist with a stable_node chain
0149  * @list: linked into migrate_nodes, pending placement in the proper node tree
0150  * @hlist: hlist head of rmap_items using this ksm page
0151  * @kpfn: page frame number of this ksm page (perhaps temporarily on wrong nid)
0152  * @chain_prune_time: time of the last full garbage collection
0153  * @rmap_hlist_len: number of rmap_item entries in hlist or STABLE_NODE_CHAIN
0154  * @nid: NUMA node id of stable tree in which linked (may not match kpfn)
0155  */
0156 struct stable_node {
0157     union {
0158         struct rb_node node;    /* when node of stable tree */
0159         struct {        /* when listed for migration */
0160             struct list_head *head;
0161             struct {
0162                 struct hlist_node hlist_dup;
0163                 struct list_head list;
0164             };
0165         };
0166     };
0167     struct hlist_head hlist;
0168     union {
0169         unsigned long kpfn;
0170         unsigned long chain_prune_time;
0171     };
0172     /*
0173      * STABLE_NODE_CHAIN can be any negative number in
0174      * rmap_hlist_len negative range, but better not -1 to be able
0175      * to reliably detect underflows.
0176      */
0177 #define STABLE_NODE_CHAIN -1024
0178     int rmap_hlist_len;
0179 #ifdef CONFIG_NUMA
0180     int nid;
0181 #endif
0182 };
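
/*
 * A rough guide to telling the three roles of struct stable_node apart (see
 * the is_stable_node_chain()/is_stable_node_dup() helpers below): a "chain"
 * has rmap_hlist_len set to STABLE_NODE_CHAIN, a "dup" hanging off a chain
 * has head set to STABLE_NODE_DUP_HEAD, and a regular node is simply linked
 * into a stable rbtree via @node with a non-negative rmap_hlist_len.
 */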
0183 
0184 /**
0185  * struct rmap_item - reverse mapping item for virtual addresses
0186  * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list
0187  * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
0188  * @nid: NUMA node id of unstable tree in which linked (may not match page)
0189  * @mm: the memory structure this rmap_item is pointing into
0190  * @address: the virtual address this rmap_item tracks (+ flags in low bits)
0191  * @oldchecksum: previous checksum of the page at that virtual address
0192  * @node: rb node of this rmap_item in the unstable tree
0193  * @head: pointer to stable_node heading this list in the stable tree
0194  * @hlist: link into hlist of rmap_items hanging off that stable_node
0195  */
0196 struct rmap_item {
0197     struct rmap_item *rmap_list;
0198     union {
0199         struct anon_vma *anon_vma;  /* when stable */
0200 #ifdef CONFIG_NUMA
0201         int nid;        /* when node of unstable tree */
0202 #endif
0203     };
0204     struct mm_struct *mm;
0205     unsigned long address;      /* + low bits used for flags below */
0206     unsigned int oldchecksum;   /* when unstable */
0207     union {
0208         struct rb_node node;    /* when node of unstable tree */
0209         struct {        /* when listed from stable tree */
0210             struct stable_node *head;
0211             struct hlist_node hlist;
0212         };
0213     };
0214 };
0215 
0216 #define SEQNR_MASK  0x0ff   /* low bits of unstable tree seqnr */
0217 #define UNSTABLE_FLAG   0x100   /* is a node of the unstable tree */
0218 #define STABLE_FLAG 0x200   /* is listed from the stable tree */
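
/*
 * Illustrative decoding of rmap_item->address, matching its uses further
 * down (for instance in remove_rmap_item_from_tree()); the local names are
 * only for the example:
 *
 *	unsigned long vaddr = rmap_item->address & PAGE_MASK;
 *	bool unstable = rmap_item->address & UNSTABLE_FLAG;
 *	bool stable   = rmap_item->address & STABLE_FLAG;
 *
 * The low SEQNR_MASK bits hold the scan seqnr at which the item was put in
 * the unstable tree, which is what the "age" computation in
 * remove_rmap_item_from_tree() relies on.
 */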
0219 
0220 /* The stable and unstable tree heads */
0221 static struct rb_root one_stable_tree[1] = { RB_ROOT };
0222 static struct rb_root one_unstable_tree[1] = { RB_ROOT };
0223 static struct rb_root *root_stable_tree = one_stable_tree;
0224 static struct rb_root *root_unstable_tree = one_unstable_tree;
0225 
0226 /* Recently migrated nodes of stable tree, pending proper placement */
0227 static LIST_HEAD(migrate_nodes);
0228 #define STABLE_NODE_DUP_HEAD ((struct list_head *)&migrate_nodes.prev)
0229 
0230 #define MM_SLOTS_HASH_BITS 10
0231 static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
0232 
0233 static struct mm_slot ksm_mm_head = {
0234     .mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list),
0235 };
0236 static struct ksm_scan ksm_scan = {
0237     .mm_slot = &ksm_mm_head,
0238 };
0239 
0240 static struct kmem_cache *rmap_item_cache;
0241 static struct kmem_cache *stable_node_cache;
0242 static struct kmem_cache *mm_slot_cache;
0243 
0244 /* The number of nodes in the stable tree */
0245 static unsigned long ksm_pages_shared;
0246 
0247 /* The number of page slots additionally sharing those nodes */
0248 static unsigned long ksm_pages_sharing;
0249 
0250 /* The number of nodes in the unstable tree */
0251 static unsigned long ksm_pages_unshared;
0252 
0253 /* The number of rmap_items in use: to calculate pages_volatile */
0254 static unsigned long ksm_rmap_items;
0255 
0256 /* The number of stable_node chains */
0257 static unsigned long ksm_stable_node_chains;
0258 
0259 /* The number of stable_node dups linked to the stable_node chains */
0260 static unsigned long ksm_stable_node_dups;
0261 
0262 /* Delay in pruning stale stable_node_dups in the stable_node_chains */
0263 static unsigned int ksm_stable_node_chains_prune_millisecs = 2000;
0264 
0265 /* Maximum number of page slots sharing a stable node */
0266 static int ksm_max_page_sharing = 256;
0267 
0268 /* Number of pages ksmd should scan in one batch */
0269 static unsigned int ksm_thread_pages_to_scan = 100;
0270 
0271 /* Milliseconds ksmd should sleep between batches */
0272 static unsigned int ksm_thread_sleep_millisecs = 20;
0273 
0274 /* Checksum of an empty (zeroed) page */
0275 static unsigned int zero_checksum __read_mostly;
0276 
0277 /* Whether to merge empty (zeroed) pages with actual zero pages */
0278 static bool ksm_use_zero_pages __read_mostly;
0279 
0280 #ifdef CONFIG_NUMA
0281 /* Zeroed when merging across nodes is not allowed */
0282 static unsigned int ksm_merge_across_nodes = 1;
0283 static int ksm_nr_node_ids = 1;
0284 #else
0285 #define ksm_merge_across_nodes  1U
0286 #define ksm_nr_node_ids     1
0287 #endif
0288 
0289 #define KSM_RUN_STOP    0
0290 #define KSM_RUN_MERGE   1
0291 #define KSM_RUN_UNMERGE 2
0292 #define KSM_RUN_OFFLINE 4
0293 static unsigned long ksm_run = KSM_RUN_STOP;
0294 static void wait_while_offlining(void);
0295 
0296 static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
0297 static DECLARE_WAIT_QUEUE_HEAD(ksm_iter_wait);
0298 static DEFINE_MUTEX(ksm_thread_mutex);
0299 static DEFINE_SPINLOCK(ksm_mmlist_lock);
0300 
0301 #define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("ksm_"#__struct,\
0302         sizeof(struct __struct), __alignof__(struct __struct),\
0303         (__flags), NULL)
0304 
0305 static int __init ksm_slab_init(void)
0306 {
0307     rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0);
0308     if (!rmap_item_cache)
0309         goto out;
0310 
0311     stable_node_cache = KSM_KMEM_CACHE(stable_node, 0);
0312     if (!stable_node_cache)
0313         goto out_free1;
0314 
0315     mm_slot_cache = KSM_KMEM_CACHE(mm_slot, 0);
0316     if (!mm_slot_cache)
0317         goto out_free2;
0318 
0319     return 0;
0320 
0321 out_free2:
0322     kmem_cache_destroy(stable_node_cache);
0323 out_free1:
0324     kmem_cache_destroy(rmap_item_cache);
0325 out:
0326     return -ENOMEM;
0327 }
0328 
0329 static void __init ksm_slab_free(void)
0330 {
0331     kmem_cache_destroy(mm_slot_cache);
0332     kmem_cache_destroy(stable_node_cache);
0333     kmem_cache_destroy(rmap_item_cache);
0334     mm_slot_cache = NULL;
0335 }
0336 
0337 static __always_inline bool is_stable_node_chain(struct stable_node *chain)
0338 {
0339     return chain->rmap_hlist_len == STABLE_NODE_CHAIN;
0340 }
0341 
0342 static __always_inline bool is_stable_node_dup(struct stable_node *dup)
0343 {
0344     return dup->head == STABLE_NODE_DUP_HEAD;
0345 }
0346 
0347 static inline void stable_node_chain_add_dup(struct stable_node *dup,
0348                          struct stable_node *chain)
0349 {
0350     VM_BUG_ON(is_stable_node_dup(dup));
0351     dup->head = STABLE_NODE_DUP_HEAD;
0352     VM_BUG_ON(!is_stable_node_chain(chain));
0353     hlist_add_head(&dup->hlist_dup, &chain->hlist);
0354     ksm_stable_node_dups++;
0355 }
0356 
0357 static inline void __stable_node_dup_del(struct stable_node *dup)
0358 {
0359     VM_BUG_ON(!is_stable_node_dup(dup));
0360     hlist_del(&dup->hlist_dup);
0361     ksm_stable_node_dups--;
0362 }
0363 
0364 static inline void stable_node_dup_del(struct stable_node *dup)
0365 {
0366     VM_BUG_ON(is_stable_node_chain(dup));
0367     if (is_stable_node_dup(dup))
0368         __stable_node_dup_del(dup);
0369     else
0370         rb_erase(&dup->node, root_stable_tree + NUMA(dup->nid));
0371 #ifdef CONFIG_DEBUG_VM
0372     dup->head = NULL;
0373 #endif
0374 }
0375 
0376 static inline struct rmap_item *alloc_rmap_item(void)
0377 {
0378     struct rmap_item *rmap_item;
0379 
0380     rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL |
0381                         __GFP_NORETRY | __GFP_NOWARN);
0382     if (rmap_item)
0383         ksm_rmap_items++;
0384     return rmap_item;
0385 }
0386 
0387 static inline void free_rmap_item(struct rmap_item *rmap_item)
0388 {
0389     ksm_rmap_items--;
0390     rmap_item->mm = NULL;   /* debug safety */
0391     kmem_cache_free(rmap_item_cache, rmap_item);
0392 }
0393 
0394 static inline struct stable_node *alloc_stable_node(void)
0395 {
0396     /*
0397      * The allocation can take too long with GFP_KERNEL when memory is under
0398      * pressure, which may lead to hung task warnings.  Adding __GFP_HIGH
0399      * grants access to memory reserves, helping to avoid this problem.
0400      */
0401     return kmem_cache_alloc(stable_node_cache, GFP_KERNEL | __GFP_HIGH);
0402 }
0403 
0404 static inline void free_stable_node(struct stable_node *stable_node)
0405 {
0406     VM_BUG_ON(stable_node->rmap_hlist_len &&
0407           !is_stable_node_chain(stable_node));
0408     kmem_cache_free(stable_node_cache, stable_node);
0409 }
0410 
0411 static inline struct mm_slot *alloc_mm_slot(void)
0412 {
0413     if (!mm_slot_cache) /* initialization failed */
0414         return NULL;
0415     return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
0416 }
0417 
0418 static inline void free_mm_slot(struct mm_slot *mm_slot)
0419 {
0420     kmem_cache_free(mm_slot_cache, mm_slot);
0421 }
0422 
0423 static struct mm_slot *get_mm_slot(struct mm_struct *mm)
0424 {
0425     struct mm_slot *slot;
0426 
0427     hash_for_each_possible(mm_slots_hash, slot, link, (unsigned long)mm)
0428         if (slot->mm == mm)
0429             return slot;
0430 
0431     return NULL;
0432 }
0433 
0434 static void insert_to_mm_slots_hash(struct mm_struct *mm,
0435                     struct mm_slot *mm_slot)
0436 {
0437     mm_slot->mm = mm;
0438     hash_add(mm_slots_hash, &mm_slot->link, (unsigned long)mm);
0439 }
0440 
0441 /*
0442  * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's
0443  * page tables after it has passed through ksm_exit() - which, if necessary,
0444  * takes mmap_lock briefly to serialize against them.  ksm_exit() does not set
0445  * a special flag: they can just back out as soon as mm_users goes to zero.
0446  * ksm_test_exit() is used throughout to make this test for exit: in some
0447  * places for correctness, in some places just to avoid unnecessary work.
0448  */
0449 static inline bool ksm_test_exit(struct mm_struct *mm)
0450 {
0451     return atomic_read(&mm->mm_users) == 0;
0452 }
0453 
0454 /*
0455  * We use break_ksm to break COW on a ksm page: it's a stripped down
0456  *
0457  *  if (get_user_pages(addr, 1, FOLL_WRITE, &page, NULL) == 1)
0458  *      put_page(page);
0459  *
0460  * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
0461  * in case the application has unmapped and remapped mm,addr meanwhile.
0462  * Could a ksm page appear anywhere else?  Actually yes, in a VM_PFNMAP
0463  * mmap of /dev/mem, where we would not want to touch it.
0464  *
0465  * FAULT_FLAG/FOLL_REMOTE are because we do this outside the context
0466  * of the process that owns 'vma'.  We also do not want to enforce
0467  * protection keys here anyway.
0468  */
0469 static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
0470 {
0471     struct page *page;
0472     vm_fault_t ret = 0;
0473 
0474     do {
0475         cond_resched();
0476         page = follow_page(vma, addr,
0477                 FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE);
0478         if (IS_ERR_OR_NULL(page) || is_zone_device_page(page))
0479             break;
0480         if (PageKsm(page))
0481             ret = handle_mm_fault(vma, addr,
0482                           FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE,
0483                           NULL);
0484         else
0485             ret = VM_FAULT_WRITE;
0486         put_page(page);
0487     } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM)));
0488     /*
0489      * We must loop because handle_mm_fault() may back out if there's
0490      * any difficulty e.g. if pte accessed bit gets updated concurrently.
0491      *
0492      * VM_FAULT_WRITE is what we have been hoping for: it indicates that
0493      * COW has been broken, even if the vma does not permit VM_WRITE;
0494      * but note that a concurrent fault might break PageKsm for us.
0495      *
0496      * VM_FAULT_SIGBUS could occur if we race with truncation of the
0497      * backing file, which also invalidates anonymous pages: that's
0498      * okay, that truncation will have unmapped the PageKsm for us.
0499      *
0500      * VM_FAULT_OOM: at the time of writing (late July 2009), setting
0501      * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the
0502      * current task has TIF_MEMDIE set, and will be OOM killed on return
0503      * to user; and ksmd, having no mm, would never be chosen for that.
0504      *
0505      * But if the mm is in a limited mem_cgroup, then the fault may fail
0506      * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and
0507      * even ksmd can fail in this way - though it's usually breaking ksm
0508      * just to undo a merge it made a moment before, so unlikely to oom.
0509      *
0510      * That's a pity: we might therefore have more kernel pages allocated
0511      * than we're counting as nodes in the stable tree; but ksm_do_scan
0512      * will retry to break_cow on each pass, so should recover the page
0513      * in due course.  The important thing is to not let VM_MERGEABLE
0514      * be cleared while any such pages might remain in the area.
0515      */
0516     return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
0517 }
0518 
0519 static struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm,
0520         unsigned long addr)
0521 {
0522     struct vm_area_struct *vma;
0523     if (ksm_test_exit(mm))
0524         return NULL;
0525     vma = vma_lookup(mm, addr);
0526     if (!vma || !(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
0527         return NULL;
0528     return vma;
0529 }
0530 
0531 static void break_cow(struct rmap_item *rmap_item)
0532 {
0533     struct mm_struct *mm = rmap_item->mm;
0534     unsigned long addr = rmap_item->address;
0535     struct vm_area_struct *vma;
0536 
0537     /*
0538      * It is not an accident that whenever we want to break COW
0539      * to undo, we also need to drop a reference to the anon_vma.
0540      */
0541     put_anon_vma(rmap_item->anon_vma);
0542 
0543     mmap_read_lock(mm);
0544     vma = find_mergeable_vma(mm, addr);
0545     if (vma)
0546         break_ksm(vma, addr);
0547     mmap_read_unlock(mm);
0548 }
0549 
0550 static struct page *get_mergeable_page(struct rmap_item *rmap_item)
0551 {
0552     struct mm_struct *mm = rmap_item->mm;
0553     unsigned long addr = rmap_item->address;
0554     struct vm_area_struct *vma;
0555     struct page *page;
0556 
0557     mmap_read_lock(mm);
0558     vma = find_mergeable_vma(mm, addr);
0559     if (!vma)
0560         goto out;
0561 
0562     page = follow_page(vma, addr, FOLL_GET);
0563     if (IS_ERR_OR_NULL(page) || is_zone_device_page(page))
0564         goto out;
0565     if (PageAnon(page)) {
0566         flush_anon_page(vma, page, addr);
0567         flush_dcache_page(page);
0568     } else {
0569         put_page(page);
0570 out:
0571         page = NULL;
0572     }
0573     mmap_read_unlock(mm);
0574     return page;
0575 }
0576 
0577 /*
0578  * This helper is used for getting right index into array of tree roots.
0579  * When merge_across_nodes knob is set to 1, there are only two rb-trees for
0580  * stable and unstable pages from all nodes with roots in index 0. Otherwise,
0581  * every node has its own stable and unstable tree.
0582  */
0583 static inline int get_kpfn_nid(unsigned long kpfn)
0584 {
0585     return ksm_merge_across_nodes ? 0 : NUMA(pfn_to_nid(kpfn));
0586 }
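
/*
 * The typical lookup pattern built on this helper, as done for example in
 * stable_tree_search() below:
 *
 *	nid  = get_kpfn_nid(page_to_pfn(page));
 *	root = root_stable_tree + nid;
 */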
0587 
0588 static struct stable_node *alloc_stable_node_chain(struct stable_node *dup,
0589                            struct rb_root *root)
0590 {
0591     struct stable_node *chain = alloc_stable_node();
0592     VM_BUG_ON(is_stable_node_chain(dup));
0593     if (likely(chain)) {
0594         INIT_HLIST_HEAD(&chain->hlist);
0595         chain->chain_prune_time = jiffies;
0596         chain->rmap_hlist_len = STABLE_NODE_CHAIN;
0597 #if defined (CONFIG_DEBUG_VM) && defined(CONFIG_NUMA)
0598         chain->nid = NUMA_NO_NODE; /* debug */
0599 #endif
0600         ksm_stable_node_chains++;
0601 
0602         /*
0603          * Put the stable node chain in the first dimension of
0604          * the stable tree and at the same time remove the old
0605          * stable node.
0606          */
0607         rb_replace_node(&dup->node, &chain->node, root);
0608 
0609         /*
0610          * Move the old stable node to the second dimension
0611          * queued in the hlist_dup. The invariant is that all
0612          * dup stable_nodes in the chain->hlist point to pages
0613          * that are write protected and have the exact same
0614          * content.
0615          */
0616         stable_node_chain_add_dup(dup, chain);
0617     }
0618     return chain;
0619 }
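
/*
 * A sketch of the resulting "two dimensional" layout: the chain replaces the
 * old dup in the rbtree, and the dups hang off the chain's hlist:
 *
 *	rb tree:      ... <-> chain (rmap_hlist_len == STABLE_NODE_CHAIN)
 *	chain->hlist:           dup0 -> dup1 -> ...
 *
 * where every dup is a distinct write-protected KSM page with identical
 * content.
 */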
0620 
0621 static inline void free_stable_node_chain(struct stable_node *chain,
0622                       struct rb_root *root)
0623 {
0624     rb_erase(&chain->node, root);
0625     free_stable_node(chain);
0626     ksm_stable_node_chains--;
0627 }
0628 
0629 static void remove_node_from_stable_tree(struct stable_node *stable_node)
0630 {
0631     struct rmap_item *rmap_item;
0632 
0633     /* check it's not STABLE_NODE_CHAIN or negative */
0634     BUG_ON(stable_node->rmap_hlist_len < 0);
0635 
0636     hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
0637         if (rmap_item->hlist.next)
0638             ksm_pages_sharing--;
0639         else
0640             ksm_pages_shared--;
0641 
0642         rmap_item->mm->ksm_merging_pages--;
0643 
0644         VM_BUG_ON(stable_node->rmap_hlist_len <= 0);
0645         stable_node->rmap_hlist_len--;
0646         put_anon_vma(rmap_item->anon_vma);
0647         rmap_item->address &= PAGE_MASK;
0648         cond_resched();
0649     }
0650 
0651     /*
0652      * We need the second aligned pointer of the migrate_nodes
0653      * list_head to stay clear from the rb_parent_color union
0654      * (aligned and different than any node) and also different
0655      * from &migrate_nodes. This will verify that future list.h changes
0656      * don't break STABLE_NODE_DUP_HEAD. Only recent gcc can handle it.
0657      */
0658     BUILD_BUG_ON(STABLE_NODE_DUP_HEAD <= &migrate_nodes);
0659     BUILD_BUG_ON(STABLE_NODE_DUP_HEAD >= &migrate_nodes + 1);
0660 
0661     if (stable_node->head == &migrate_nodes)
0662         list_del(&stable_node->list);
0663     else
0664         stable_node_dup_del(stable_node);
0665     free_stable_node(stable_node);
0666 }
0667 
0668 enum get_ksm_page_flags {
0669     GET_KSM_PAGE_NOLOCK,
0670     GET_KSM_PAGE_LOCK,
0671     GET_KSM_PAGE_TRYLOCK
0672 };
0673 
0674 /*
0675  * get_ksm_page: checks if the page indicated by the stable node
0676  * is still its ksm page, despite having held no reference to it.
0677  * In which case we can trust the content of the page, and it
0678  * returns the gotten page; but if the page has now been zapped,
0679  * remove the stale node from the stable tree and return NULL.
0680  * But beware, the stable node's page might be being migrated.
0681  *
0682  * You would expect the stable_node to hold a reference to the ksm page.
0683  * But if it increments the page's count, swapping out has to wait for
0684  * ksmd to come around again before it can free the page, which may take
0685  * seconds or even minutes: much too unresponsive.  So instead we use a
0686  * "keyhole reference": access to the ksm page from the stable node peeps
0687  * out through its keyhole to see if that page still holds the right key,
0688  * pointing back to this stable node.  This relies on freeing a PageAnon
0689  * page to reset its page->mapping to NULL, and relies on no other use of
0690  * a page to put something that might look like our key in page->mapping
0691  * while it is on its way to being freed; but that is an anomaly to bear in mind.
0692  */
0693 static struct page *get_ksm_page(struct stable_node *stable_node,
0694                  enum get_ksm_page_flags flags)
0695 {
0696     struct page *page;
0697     void *expected_mapping;
0698     unsigned long kpfn;
0699 
0700     expected_mapping = (void *)((unsigned long)stable_node |
0701                     PAGE_MAPPING_KSM);
0702 again:
0703     kpfn = READ_ONCE(stable_node->kpfn); /* Address dependency. */
0704     page = pfn_to_page(kpfn);
0705     if (READ_ONCE(page->mapping) != expected_mapping)
0706         goto stale;
0707 
0708     /*
0709      * We cannot do anything with the page while its refcount is 0.
0710      * Usually 0 means free, or tail of a higher-order page: in which
0711      * case this node is no longer referenced, and should be freed;
0712      * however, it might mean that the page is under page_ref_freeze().
0713      * The __remove_mapping() case is easy, again the node is now stale;
0714      * the same is in reuse_ksm_page() case; but if page is swapcache
0715      * in folio_migrate_mapping(), it might still be our page,
0716      * in which case it's essential to keep the node.
0717      */
0718     while (!get_page_unless_zero(page)) {
0719         /*
0720          * Another check for page->mapping != expected_mapping would
0721          * work here too.  We have chosen the !PageSwapCache test to
0722          * optimize the common case, when the page is or is about to
0723          * be freed: PageSwapCache is cleared (under spin_lock_irq)
0724          * in the ref_freeze section of __remove_mapping(); but Anon
0725          * page->mapping reset to NULL later, in free_pages_prepare().
0726          */
0727         if (!PageSwapCache(page))
0728             goto stale;
0729         cpu_relax();
0730     }
0731 
0732     if (READ_ONCE(page->mapping) != expected_mapping) {
0733         put_page(page);
0734         goto stale;
0735     }
0736 
0737     if (flags == GET_KSM_PAGE_TRYLOCK) {
0738         if (!trylock_page(page)) {
0739             put_page(page);
0740             return ERR_PTR(-EBUSY);
0741         }
0742     } else if (flags == GET_KSM_PAGE_LOCK)
0743         lock_page(page);
0744 
0745     if (flags != GET_KSM_PAGE_NOLOCK) {
0746         if (READ_ONCE(page->mapping) != expected_mapping) {
0747             unlock_page(page);
0748             put_page(page);
0749             goto stale;
0750         }
0751     }
0752     return page;
0753 
0754 stale:
0755     /*
0756      * We come here from above when page->mapping or !PageSwapCache
0757      * suggests that the node is stale; but it might be under migration.
0758      * We need smp_rmb(), matching the smp_wmb() in folio_migrate_ksm(),
0759      * before checking whether node->kpfn has been changed.
0760      */
0761     smp_rmb();
0762     if (READ_ONCE(stable_node->kpfn) != kpfn)
0763         goto again;
0764     remove_node_from_stable_tree(stable_node);
0765     return NULL;
0766 }
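
/*
 * Typical caller pattern (see remove_rmap_item_from_tree() below).  A NULL
 * return means the node was stale and has already been removed and freed,
 * so the caller must not touch stable_node again:
 *
 *	page = get_ksm_page(stable_node, GET_KSM_PAGE_LOCK);
 *	if (!page)
 *		goto out;
 *	...
 *	unlock_page(page);
 *	put_page(page);
 */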
0767 
0768 /*
0769  * Removing rmap_item from stable or unstable tree.
0770  * This function will clean the information from the stable/unstable tree.
0771  */
0772 static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
0773 {
0774     if (rmap_item->address & STABLE_FLAG) {
0775         struct stable_node *stable_node;
0776         struct page *page;
0777 
0778         stable_node = rmap_item->head;
0779         page = get_ksm_page(stable_node, GET_KSM_PAGE_LOCK);
0780         if (!page)
0781             goto out;
0782 
0783         hlist_del(&rmap_item->hlist);
0784         unlock_page(page);
0785         put_page(page);
0786 
0787         if (!hlist_empty(&stable_node->hlist))
0788             ksm_pages_sharing--;
0789         else
0790             ksm_pages_shared--;
0791 
0792         rmap_item->mm->ksm_merging_pages--;
0793 
0794         VM_BUG_ON(stable_node->rmap_hlist_len <= 0);
0795         stable_node->rmap_hlist_len--;
0796 
0797         put_anon_vma(rmap_item->anon_vma);
0798         rmap_item->head = NULL;
0799         rmap_item->address &= PAGE_MASK;
0800 
0801     } else if (rmap_item->address & UNSTABLE_FLAG) {
0802         unsigned char age;
0803         /*
0804          * Usually ksmd can and must skip the rb_erase, because
0805          * root_unstable_tree was already reset to RB_ROOT.
0806          * But be careful when an mm is exiting: do the rb_erase
0807          * if this rmap_item was inserted by this scan, rather
0808          * than left over from before.
0809          */
0810         age = (unsigned char)(ksm_scan.seqnr - rmap_item->address);
0811         BUG_ON(age > 1);
0812         if (!age)
0813             rb_erase(&rmap_item->node,
0814                  root_unstable_tree + NUMA(rmap_item->nid));
0815         ksm_pages_unshared--;
0816         rmap_item->address &= PAGE_MASK;
0817     }
0818 out:
0819     cond_resched();     /* we're called from many long loops */
0820 }
0821 
0822 static void remove_trailing_rmap_items(struct rmap_item **rmap_list)
0823 {
0824     while (*rmap_list) {
0825         struct rmap_item *rmap_item = *rmap_list;
0826         *rmap_list = rmap_item->rmap_list;
0827         remove_rmap_item_from_tree(rmap_item);
0828         free_rmap_item(rmap_item);
0829     }
0830 }
0831 
0832 /*
0833  * Though it's very tempting to unmerge rmap_items from stable tree rather
0834  * than check every pte of a given vma, the locking doesn't quite work for
0835  * that - an rmap_item is assigned to the stable tree after inserting ksm
0836  * page and upping mmap_lock.  Nor does it fit with the way we skip dup'ing
0837  * rmap_items from parent to child at fork time (so as not to waste time
0838  * if exit comes before the next scan reaches it).
0839  *
0840  * Similarly, although we'd like to remove rmap_items (so updating counts
0841  * and freeing memory) when unmerging an area, it's easier to leave that
0842  * to the next pass of ksmd - consider, for example, how ksmd might be
0843  * in cmp_and_merge_page on one of the rmap_items we would be removing.
0844  */
0845 static int unmerge_ksm_pages(struct vm_area_struct *vma,
0846                  unsigned long start, unsigned long end)
0847 {
0848     unsigned long addr;
0849     int err = 0;
0850 
0851     for (addr = start; addr < end && !err; addr += PAGE_SIZE) {
0852         if (ksm_test_exit(vma->vm_mm))
0853             break;
0854         if (signal_pending(current))
0855             err = -ERESTARTSYS;
0856         else
0857             err = break_ksm(vma, addr);
0858     }
0859     return err;
0860 }
0861 
0862 static inline struct stable_node *folio_stable_node(struct folio *folio)
0863 {
0864     return folio_test_ksm(folio) ? folio_raw_mapping(folio) : NULL;
0865 }
0866 
0867 static inline struct stable_node *page_stable_node(struct page *page)
0868 {
0869     return folio_stable_node(page_folio(page));
0870 }
0871 
0872 static inline void set_page_stable_node(struct page *page,
0873                     struct stable_node *stable_node)
0874 {
0875     VM_BUG_ON_PAGE(PageAnon(page) && PageAnonExclusive(page), page);
0876     page->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM);
0877 }
0878 
0879 #ifdef CONFIG_SYSFS
0880 /*
0881  * Only called through the sysfs control interface:
0882  */
0883 static int remove_stable_node(struct stable_node *stable_node)
0884 {
0885     struct page *page;
0886     int err;
0887 
0888     page = get_ksm_page(stable_node, GET_KSM_PAGE_LOCK);
0889     if (!page) {
0890         /*
0891          * get_ksm_page did remove_node_from_stable_tree itself.
0892          */
0893         return 0;
0894     }
0895 
0896     /*
0897      * Page could be still mapped if this races with __mmput() running in
0898      * between ksm_exit() and exit_mmap(). Just refuse to let
0899      * merge_across_nodes/max_page_sharing be switched.
0900      */
0901     err = -EBUSY;
0902     if (!page_mapped(page)) {
0903         /*
0904          * The stable node did not yet appear stale to get_ksm_page(),
0905          * since that allows for an unmapped ksm page to be recognized
0906          * right up until it is freed; but the node is safe to remove.
0907          * This page might be in a pagevec waiting to be freed,
0908          * or it might be PageSwapCache (perhaps under writeback),
0909          * or it might have been removed from swapcache a moment ago.
0910          */
0911         set_page_stable_node(page, NULL);
0912         remove_node_from_stable_tree(stable_node);
0913         err = 0;
0914     }
0915 
0916     unlock_page(page);
0917     put_page(page);
0918     return err;
0919 }
0920 
0921 static int remove_stable_node_chain(struct stable_node *stable_node,
0922                     struct rb_root *root)
0923 {
0924     struct stable_node *dup;
0925     struct hlist_node *hlist_safe;
0926 
0927     if (!is_stable_node_chain(stable_node)) {
0928         VM_BUG_ON(is_stable_node_dup(stable_node));
0929         if (remove_stable_node(stable_node))
0930             return true;
0931         else
0932             return false;
0933     }
0934 
0935     hlist_for_each_entry_safe(dup, hlist_safe,
0936                   &stable_node->hlist, hlist_dup) {
0937         VM_BUG_ON(!is_stable_node_dup(dup));
0938         if (remove_stable_node(dup))
0939             return true;
0940     }
0941     BUG_ON(!hlist_empty(&stable_node->hlist));
0942     free_stable_node_chain(stable_node, root);
0943     return false;
0944 }
0945 
0946 static int remove_all_stable_nodes(void)
0947 {
0948     struct stable_node *stable_node, *next;
0949     int nid;
0950     int err = 0;
0951 
0952     for (nid = 0; nid < ksm_nr_node_ids; nid++) {
0953         while (root_stable_tree[nid].rb_node) {
0954             stable_node = rb_entry(root_stable_tree[nid].rb_node,
0955                         struct stable_node, node);
0956             if (remove_stable_node_chain(stable_node,
0957                              root_stable_tree + nid)) {
0958                 err = -EBUSY;
0959                 break;  /* proceed to next nid */
0960             }
0961             cond_resched();
0962         }
0963     }
0964     list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) {
0965         if (remove_stable_node(stable_node))
0966             err = -EBUSY;
0967         cond_resched();
0968     }
0969     return err;
0970 }
0971 
0972 static int unmerge_and_remove_all_rmap_items(void)
0973 {
0974     struct mm_slot *mm_slot;
0975     struct mm_struct *mm;
0976     struct vm_area_struct *vma;
0977     int err = 0;
0978 
0979     spin_lock(&ksm_mmlist_lock);
0980     ksm_scan.mm_slot = list_entry(ksm_mm_head.mm_list.next,
0981                         struct mm_slot, mm_list);
0982     spin_unlock(&ksm_mmlist_lock);
0983 
0984     for (mm_slot = ksm_scan.mm_slot;
0985             mm_slot != &ksm_mm_head; mm_slot = ksm_scan.mm_slot) {
0986         mm = mm_slot->mm;
0987         mmap_read_lock(mm);
0988         for (vma = mm->mmap; vma; vma = vma->vm_next) {
0989             if (ksm_test_exit(mm))
0990                 break;
0991             if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
0992                 continue;
0993             err = unmerge_ksm_pages(vma,
0994                         vma->vm_start, vma->vm_end);
0995             if (err)
0996                 goto error;
0997         }
0998 
0999         remove_trailing_rmap_items(&mm_slot->rmap_list);
1000         mmap_read_unlock(mm);
1001 
1002         spin_lock(&ksm_mmlist_lock);
1003         ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next,
1004                         struct mm_slot, mm_list);
1005         if (ksm_test_exit(mm)) {
1006             hash_del(&mm_slot->link);
1007             list_del(&mm_slot->mm_list);
1008             spin_unlock(&ksm_mmlist_lock);
1009 
1010             free_mm_slot(mm_slot);
1011             clear_bit(MMF_VM_MERGEABLE, &mm->flags);
1012             mmdrop(mm);
1013         } else
1014             spin_unlock(&ksm_mmlist_lock);
1015     }
1016 
1017     /* Clean up stable nodes, but don't worry if some are still busy */
1018     remove_all_stable_nodes();
1019     ksm_scan.seqnr = 0;
1020     return 0;
1021 
1022 error:
1023     mmap_read_unlock(mm);
1024     spin_lock(&ksm_mmlist_lock);
1025     ksm_scan.mm_slot = &ksm_mm_head;
1026     spin_unlock(&ksm_mmlist_lock);
1027     return err;
1028 }
1029 #endif /* CONFIG_SYSFS */
1030 
1031 static u32 calc_checksum(struct page *page)
1032 {
1033     u32 checksum;
1034     void *addr = kmap_atomic(page);
1035     checksum = xxhash(addr, PAGE_SIZE, 0);
1036     kunmap_atomic(addr);
1037     return checksum;
1038 }
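
/*
 * The checksum computed here is used to detect volatility: per point 2 of
 * the DOC overview above, a page whose checksum (remembered in
 * rmap_item->oldchecksum) has changed since the last full scan is treated
 * as volatile and is not inserted into the unstable tree.
 */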
1039 
1040 static int write_protect_page(struct vm_area_struct *vma, struct page *page,
1041                   pte_t *orig_pte)
1042 {
1043     struct mm_struct *mm = vma->vm_mm;
1044     DEFINE_PAGE_VMA_WALK(pvmw, page, vma, 0, 0);
1045     int swapped;
1046     int err = -EFAULT;
1047     struct mmu_notifier_range range;
1048     bool anon_exclusive;
1049 
1050     pvmw.address = page_address_in_vma(page, vma);
1051     if (pvmw.address == -EFAULT)
1052         goto out;
1053 
1054     BUG_ON(PageTransCompound(page));
1055 
1056     mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
1057                 pvmw.address,
1058                 pvmw.address + PAGE_SIZE);
1059     mmu_notifier_invalidate_range_start(&range);
1060 
1061     if (!page_vma_mapped_walk(&pvmw))
1062         goto out_mn;
1063     if (WARN_ONCE(!pvmw.pte, "Unexpected PMD mapping?"))
1064         goto out_unlock;
1065 
1066     anon_exclusive = PageAnonExclusive(page);
1067     if (pte_write(*pvmw.pte) || pte_dirty(*pvmw.pte) ||
1068         (pte_protnone(*pvmw.pte) && pte_savedwrite(*pvmw.pte)) ||
1069         anon_exclusive || mm_tlb_flush_pending(mm)) {
1070         pte_t entry;
1071 
1072         swapped = PageSwapCache(page);
1073         flush_cache_page(vma, pvmw.address, page_to_pfn(page));
1074         /*
1075          * Ok this is tricky: when get_user_pages_fast() runs it doesn't
1076          * take any lock, therefore the check that we are going to make
1077          * with the pagecount against the mapcount is racy and
1078          * O_DIRECT can happen right after the check.
1079          * So we clear the pte and flush the tlb before the check;
1080          * this assures us that no O_DIRECT can happen after the check
1081          * or in the middle of the check.
1082          *
1083          * No need to notify as we are downgrading page table to read
1084          * only not changing it to point to a new page.
1085          *
1086          * See Documentation/mm/mmu_notifier.rst
1087          */
1088         entry = ptep_clear_flush(vma, pvmw.address, pvmw.pte);
1089         /*
1090          * Check that no O_DIRECT or similar I/O is in progress on the
1091          * page
1092          */
1093         if (page_mapcount(page) + 1 + swapped != page_count(page)) {
1094             set_pte_at(mm, pvmw.address, pvmw.pte, entry);
1095             goto out_unlock;
1096         }
1097 
1098         if (anon_exclusive && page_try_share_anon_rmap(page)) {
1099             set_pte_at(mm, pvmw.address, pvmw.pte, entry);
1100             goto out_unlock;
1101         }
1102 
1103         if (pte_dirty(entry))
1104             set_page_dirty(page);
1105 
1106         if (pte_protnone(entry))
1107             entry = pte_mkclean(pte_clear_savedwrite(entry));
1108         else
1109             entry = pte_mkclean(pte_wrprotect(entry));
1110         set_pte_at_notify(mm, pvmw.address, pvmw.pte, entry);
1111     }
1112     *orig_pte = *pvmw.pte;
1113     err = 0;
1114 
1115 out_unlock:
1116     page_vma_mapped_walk_done(&pvmw);
1117 out_mn:
1118     mmu_notifier_invalidate_range_end(&range);
1119 out:
1120     return err;
1121 }
1122 
1123 /**
1124  * replace_page - replace page in vma by new ksm page
1125  * @vma:      vma that holds the pte pointing to page
1126  * @page:     the page we are replacing by kpage
1127  * @kpage:    the ksm page we replace page by
1128  * @orig_pte: the original value of the pte
1129  *
1130  * Returns 0 on success, -EFAULT on failure.
1131  */
1132 static int replace_page(struct vm_area_struct *vma, struct page *page,
1133             struct page *kpage, pte_t orig_pte)
1134 {
1135     struct mm_struct *mm = vma->vm_mm;
1136     pmd_t *pmd;
1137     pte_t *ptep;
1138     pte_t newpte;
1139     spinlock_t *ptl;
1140     unsigned long addr;
1141     int err = -EFAULT;
1142     struct mmu_notifier_range range;
1143 
1144     addr = page_address_in_vma(page, vma);
1145     if (addr == -EFAULT)
1146         goto out;
1147 
1148     pmd = mm_find_pmd(mm, addr);
1149     if (!pmd)
1150         goto out;
1151 
1152     mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr,
1153                 addr + PAGE_SIZE);
1154     mmu_notifier_invalidate_range_start(&range);
1155 
1156     ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
1157     if (!pte_same(*ptep, orig_pte)) {
1158         pte_unmap_unlock(ptep, ptl);
1159         goto out_mn;
1160     }
1161     VM_BUG_ON_PAGE(PageAnonExclusive(page), page);
1162     VM_BUG_ON_PAGE(PageAnon(kpage) && PageAnonExclusive(kpage), kpage);
1163 
1164     /*
1165      * No need to check ksm_use_zero_pages here: we can only have a
1166      * zero_page here if ksm_use_zero_pages was enabled already.
1167      */
1168     if (!is_zero_pfn(page_to_pfn(kpage))) {
1169         get_page(kpage);
1170         page_add_anon_rmap(kpage, vma, addr, RMAP_NONE);
1171         newpte = mk_pte(kpage, vma->vm_page_prot);
1172     } else {
1173         newpte = pte_mkspecial(pfn_pte(page_to_pfn(kpage),
1174                            vma->vm_page_prot));
1175         /*
1176          * We're replacing an anonymous page with a zero page, which is
1177          * not anonymous. We need to do proper accounting otherwise we
1178          * will get wrong values in /proc, and a BUG message in dmesg
1179          * when tearing down the mm.
1180          */
1181         dec_mm_counter(mm, MM_ANONPAGES);
1182     }
1183 
1184     flush_cache_page(vma, addr, pte_pfn(*ptep));
1185     /*
1186      * No need to notify as we are replacing a read only page with another
1187      * read only page with the same content.
1188      *
1189      * See Documentation/mm/mmu_notifier.rst
1190      */
1191     ptep_clear_flush(vma, addr, ptep);
1192     set_pte_at_notify(mm, addr, ptep, newpte);
1193 
1194     page_remove_rmap(page, vma, false);
1195     if (!page_mapped(page))
1196         try_to_free_swap(page);
1197     put_page(page);
1198 
1199     pte_unmap_unlock(ptep, ptl);
1200     err = 0;
1201 out_mn:
1202     mmu_notifier_invalidate_range_end(&range);
1203 out:
1204     return err;
1205 }
1206 
1207 /*
1208  * try_to_merge_one_page - take two pages and merge them into one
1209  * @vma: the vma that holds the pte pointing to page
1210  * @page: the PageAnon page that we want to replace with kpage
1211  * @kpage: the PageKsm page that we want to map instead of page,
1212  *         or NULL the first time when we want to use page as kpage.
1213  *
1214  * This function returns 0 if the pages were merged, -EFAULT otherwise.
1215  */
1216 static int try_to_merge_one_page(struct vm_area_struct *vma,
1217                  struct page *page, struct page *kpage)
1218 {
1219     pte_t orig_pte = __pte(0);
1220     int err = -EFAULT;
1221 
1222     if (page == kpage)          /* ksm page forked */
1223         return 0;
1224 
1225     if (!PageAnon(page))
1226         goto out;
1227 
1228     /*
1229      * We need the page lock to read a stable PageSwapCache in
1230      * write_protect_page().  We use trylock_page() instead of
1231      * lock_page() because we don't want to wait here - we
1232      * prefer to continue scanning and merging different pages,
1233      * then come back to this page when it is unlocked.
1234      */
1235     if (!trylock_page(page))
1236         goto out;
1237 
1238     if (PageTransCompound(page)) {
1239         if (split_huge_page(page))
1240             goto out_unlock;
1241     }
1242 
1243     /*
1244      * If this anonymous page is mapped only here, its pte may need
1245      * to be write-protected.  If it's mapped elsewhere, all of its
1246      * ptes are necessarily already write-protected.  But in either
1247      * case, we need to lock and check page_count is not raised.
1248      */
1249     if (write_protect_page(vma, page, &orig_pte) == 0) {
1250         if (!kpage) {
1251             /*
1252              * While we hold page lock, upgrade page from
1253              * PageAnon+anon_vma to PageKsm+NULL stable_node:
1254              * stable_tree_insert() will update stable_node.
1255              */
1256             set_page_stable_node(page, NULL);
1257             mark_page_accessed(page);
1258             /*
1259              * Page reclaim just frees a clean page with no dirty
1260              * ptes: make sure that the ksm page would be swapped.
1261              */
1262             if (!PageDirty(page))
1263                 SetPageDirty(page);
1264             err = 0;
1265         } else if (pages_identical(page, kpage))
1266             err = replace_page(vma, page, kpage, orig_pte);
1267     }
1268 
1269 out_unlock:
1270     unlock_page(page);
1271 out:
1272     return err;
1273 }
1274 
1275 /*
1276  * try_to_merge_with_ksm_page - like try_to_merge_two_pages,
1277  * but no new kernel page is allocated: kpage must already be a ksm page.
1278  *
1279  * This function returns 0 if the pages were merged, -EFAULT otherwise.
1280  */
1281 static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item,
1282                       struct page *page, struct page *kpage)
1283 {
1284     struct mm_struct *mm = rmap_item->mm;
1285     struct vm_area_struct *vma;
1286     int err = -EFAULT;
1287 
1288     mmap_read_lock(mm);
1289     vma = find_mergeable_vma(mm, rmap_item->address);
1290     if (!vma)
1291         goto out;
1292 
1293     err = try_to_merge_one_page(vma, page, kpage);
1294     if (err)
1295         goto out;
1296 
1297     /* Unstable nid is in union with stable anon_vma: remove first */
1298     remove_rmap_item_from_tree(rmap_item);
1299 
1300     /* Must get reference to anon_vma while still holding mmap_lock */
1301     rmap_item->anon_vma = vma->anon_vma;
1302     get_anon_vma(vma->anon_vma);
1303 out:
1304     mmap_read_unlock(mm);
1305     return err;
1306 }
1307 
1308 /*
1309  * try_to_merge_two_pages - take two identical pages and prepare them
1310  * to be merged into one page.
1311  *
1312  * This function returns the kpage if we successfully merged two identical
1313  * pages into one ksm page, NULL otherwise.
1314  *
1315  * Note that this function upgrades page to ksm page: if one of the pages
1316  * is already a ksm page, try_to_merge_with_ksm_page should be used.
1317  */
1318 static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item,
1319                        struct page *page,
1320                        struct rmap_item *tree_rmap_item,
1321                        struct page *tree_page)
1322 {
1323     int err;
1324 
1325     err = try_to_merge_with_ksm_page(rmap_item, page, NULL);
1326     if (!err) {
1327         err = try_to_merge_with_ksm_page(tree_rmap_item,
1328                             tree_page, page);
1329         /*
1330          * If that fails, we have a ksm page with only one pte
1331          * pointing to it: so break it.
1332          */
1333         if (err)
1334             break_cow(rmap_item);
1335     }
1336     return err ? NULL : page;
1337 }
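
/*
 * To summarize the flow above: the first try_to_merge_with_ksm_page() call
 * (kpage == NULL) upgrades "page" itself into a KSM page, and the second one
 * points tree_page's pte at that new KSM page.  If only the second step
 * fails, we are left with a KSM page mapped by a single pte, which
 * break_cow() then undoes.
 */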
1338 
1339 static __always_inline
1340 bool __is_page_sharing_candidate(struct stable_node *stable_node, int offset)
1341 {
1342     VM_BUG_ON(stable_node->rmap_hlist_len < 0);
1343     /*
1344      * Check that at least one mapping still exists, otherwise
1345      * there's not much point to merge and share with this
1346      * stable_node, as the underlying tree_page of the other
1347      * sharer is going to be freed soon.
1348      */
1349     return stable_node->rmap_hlist_len &&
1350         stable_node->rmap_hlist_len + offset < ksm_max_page_sharing;
1351 }
1352 
1353 static __always_inline
1354 bool is_page_sharing_candidate(struct stable_node *stable_node)
1355 {
1356     return __is_page_sharing_candidate(stable_node, 0);
1357 }
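
/*
 * A worked example of the check above with the default ksm_max_page_sharing
 * of 256: a dup whose rmap_hlist_len is 255 still qualifies at offset 0
 * (255 + 0 < 256) but not at offset 1; stable_node_dup() below uses offset 1
 * to keep a slot free for the merge that is already underway.
 */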
1358 
1359 static struct page *stable_node_dup(struct stable_node **_stable_node_dup,
1360                     struct stable_node **_stable_node,
1361                     struct rb_root *root,
1362                     bool prune_stale_stable_nodes)
1363 {
1364     struct stable_node *dup, *found = NULL, *stable_node = *_stable_node;
1365     struct hlist_node *hlist_safe;
1366     struct page *_tree_page, *tree_page = NULL;
1367     int nr = 0;
1368     int found_rmap_hlist_len;
1369 
1370     if (!prune_stale_stable_nodes ||
1371         time_before(jiffies, stable_node->chain_prune_time +
1372             msecs_to_jiffies(
1373                 ksm_stable_node_chains_prune_millisecs)))
1374         prune_stale_stable_nodes = false;
1375     else
1376         stable_node->chain_prune_time = jiffies;
1377 
1378     hlist_for_each_entry_safe(dup, hlist_safe,
1379                   &stable_node->hlist, hlist_dup) {
1380         cond_resched();
1381         /*
1382          * We must walk all stable_node_dup to prune the stale
1383          * stable nodes during lookup.
1384          *
1385          * get_ksm_page can drop the nodes from the
1386          * stable_node->hlist if they point to freed pages
1387          * (that's why we do a _safe walk). The "dup"
1388          * stable_node parameter itself will be freed from
1389          * under us if it returns NULL.
1390          */
1391         _tree_page = get_ksm_page(dup, GET_KSM_PAGE_NOLOCK);
1392         if (!_tree_page)
1393             continue;
1394         nr += 1;
1395         if (is_page_sharing_candidate(dup)) {
1396             if (!found ||
1397                 dup->rmap_hlist_len > found_rmap_hlist_len) {
1398                 if (found)
1399                     put_page(tree_page);
1400                 found = dup;
1401                 found_rmap_hlist_len = found->rmap_hlist_len;
1402                 tree_page = _tree_page;
1403 
1404                 /* skip put_page for found dup */
1405                 if (!prune_stale_stable_nodes)
1406                     break;
1407                 continue;
1408             }
1409         }
1410         put_page(_tree_page);
1411     }
1412 
1413     if (found) {
1414         /*
1415          * nr is counting all dups in the chain only if
1416          * prune_stale_stable_nodes is true, otherwise we may
1417          * break the loop at nr == 1 even if there are
1418          * multiple entries.
1419          */
1420         if (prune_stale_stable_nodes && nr == 1) {
1421             /*
1422              * If there's not just one entry it would
1423              * corrupt memory, better BUG_ON. In KSM
1424              * context with no lock held it's not even
1425              * fatal.
1426              */
1427             BUG_ON(stable_node->hlist.first->next);
1428 
1429             /*
1430              * There's just one entry and it is below the
1431              * deduplication limit so drop the chain.
1432              */
1433             rb_replace_node(&stable_node->node, &found->node,
1434                     root);
1435             free_stable_node(stable_node);
1436             ksm_stable_node_chains--;
1437             ksm_stable_node_dups--;
1438             /*
1439              * NOTE: the caller depends on the stable_node
1440              * to be equal to stable_node_dup if the chain
1441              * was collapsed.
1442              */
1443             *_stable_node = found;
1444             /*
1445              * Just for robustness, as stable_node is
1446              * otherwise left as a stable pointer, the
1447              * compiler shall optimize it away at build
1448              * time.
1449              */
1450             stable_node = NULL;
1451         } else if (stable_node->hlist.first != &found->hlist_dup &&
1452                __is_page_sharing_candidate(found, 1)) {
1453             /*
1454              * If the found stable_node dup can accept one
1455              * more future merge (in addition to the one
1456              * that is underway) and is not at the head of
1457              * the chain, put it there so next search will
1458              * be quicker in the !prune_stale_stable_nodes
1459              * case.
1460              *
1461              * NOTE: it would be inaccurate to use nr > 1
1462              * instead of checking the hlist.first pointer
1463              * directly, because in the
1464              * prune_stale_stable_nodes case "nr" isn't
1465              * the position of the found dup in the chain,
1466              * but the total number of dups in the chain.
1467              */
1468             hlist_del(&found->hlist_dup);
1469             hlist_add_head(&found->hlist_dup,
1470                        &stable_node->hlist);
1471         }
1472     }
1473 
1474     *_stable_node_dup = found;
1475     return tree_page;
1476 }
1477 
1478 static struct stable_node *stable_node_dup_any(struct stable_node *stable_node,
1479                            struct rb_root *root)
1480 {
1481     if (!is_stable_node_chain(stable_node))
1482         return stable_node;
1483     if (hlist_empty(&stable_node->hlist)) {
1484         free_stable_node_chain(stable_node, root);
1485         return NULL;
1486     }
1487     return hlist_entry(stable_node->hlist.first,
1488                typeof(*stable_node), hlist_dup);
1489 }
1490 
1491 /*
1492  * Like for get_ksm_page, this function can free the *_stable_node and
1493  * *_stable_node_dup if the returned tree_page is NULL.
1494  *
1495  * It can also free and overwrite *_stable_node with the found
1496  * stable_node_dup if the chain is collapsed (in which case
1497  * *_stable_node will be equal to *_stable_node_dup, as if the chain
1498  * had never existed). It's up to the caller to verify that tree_page is not
1499  * NULL before dereferencing *_stable_node or *_stable_node_dup.
1500  *
1501  * *_stable_node_dup is really a second output parameter of this
1502  * function and will be overwritten in all cases, the caller doesn't
1503  * need to initialize it.
1504  */
1505 static struct page *__stable_node_chain(struct stable_node **_stable_node_dup,
1506                     struct stable_node **_stable_node,
1507                     struct rb_root *root,
1508                     bool prune_stale_stable_nodes)
1509 {
1510     struct stable_node *stable_node = *_stable_node;
1511     if (!is_stable_node_chain(stable_node)) {
1512         if (is_page_sharing_candidate(stable_node)) {
1513             *_stable_node_dup = stable_node;
1514             return get_ksm_page(stable_node, GET_KSM_PAGE_NOLOCK);
1515         }
1516         /*
1517          * _stable_node_dup set to NULL means the stable_node
1518          * reached the ksm_max_page_sharing limit.
1519          */
1520         *_stable_node_dup = NULL;
1521         return NULL;
1522     }
1523     return stable_node_dup(_stable_node_dup, _stable_node, root,
1524                    prune_stale_stable_nodes);
1525 }
1526 
1527 static __always_inline struct page *chain_prune(struct stable_node **s_n_d,
1528                         struct stable_node **s_n,
1529                         struct rb_root *root)
1530 {
1531     return __stable_node_chain(s_n_d, s_n, root, true);
1532 }
1533 
1534 static __always_inline struct page *chain(struct stable_node **s_n_d,
1535                       struct stable_node *s_n,
1536                       struct rb_root *root)
1537 {
1538     struct stable_node *old_stable_node = s_n;
1539     struct page *tree_page;
1540 
1541     tree_page = __stable_node_chain(s_n_d, &s_n, root, false);
1542     /* not pruning dups so s_n cannot have changed */
1543     VM_BUG_ON(s_n != old_stable_node);
1544     return tree_page;
1545 }
1546 
1547 /*
1548  * stable_tree_search - search for page inside the stable tree
1549  *
1550  * This function checks if there is a page inside the stable tree
1551  * with identical content to the page that we are scanning right now.
1552  *
1553  * This function returns the ksm page holding identical content if found,
1554  * NULL otherwise.
1555  */
1556 static struct page *stable_tree_search(struct page *page)
1557 {
1558     int nid;
1559     struct rb_root *root;
1560     struct rb_node **new;
1561     struct rb_node *parent;
1562     struct stable_node *stable_node, *stable_node_dup, *stable_node_any;
1563     struct stable_node *page_node;
1564 
1565     page_node = page_stable_node(page);
1566     if (page_node && page_node->head != &migrate_nodes) {
1567         /* ksm page forked */
1568         get_page(page);
1569         return page;
1570     }
1571 
1572     nid = get_kpfn_nid(page_to_pfn(page));
1573     root = root_stable_tree + nid;
1574 again:
1575     new = &root->rb_node;
1576     parent = NULL;
1577 
1578     while (*new) {
1579         struct page *tree_page;
1580         int ret;
1581 
1582         cond_resched();
1583         stable_node = rb_entry(*new, struct stable_node, node);
1584         stable_node_any = NULL;
1585         tree_page = chain_prune(&stable_node_dup, &stable_node, root);
1586         /*
1587          * NOTE: stable_node may have been freed by
1588          * chain_prune() if the returned stable_node_dup is
1589          * not NULL. stable_node_dup may have been inserted in
1590          * the rbtree instead as a regular stable_node (in
1591          * order to collapse the stable_node chain if a single
1592          * stable_node dup was found in it). In that case the
1593          * stable_node is overwritten by the callee to point
1594          * to the stable_node_dup that was collapsed in the
1595          * stable rbtree, and stable_node will be equal to
1596          * stable_node_dup as if the chain never existed.
1597          */
1598         if (!stable_node_dup) {
1599             /*
1600              * Either all stable_node dups were full in
1601              * this stable_node chain, or this chain was
1602              * empty and should be rb_erased.
1603              */
1604             stable_node_any = stable_node_dup_any(stable_node,
1605                                   root);
1606             if (!stable_node_any) {
1607                 /* rb_erase just run */
1608                 goto again;
1609             }
1610             /*
1611              * Take the page of any of the stable_node dups
1612              * in this stable_node chain to let the tree walk
1613              * continue. All KSM pages belonging to the
1614              * stable_node dups in a stable_node chain
1615              * have the same content and they're
1616              * write protected at all times, so any of them
1617              * will do to continue the walk.
1618              */
1619             tree_page = get_ksm_page(stable_node_any,
1620                          GET_KSM_PAGE_NOLOCK);
1621         }
1622         VM_BUG_ON(!stable_node_dup ^ !!stable_node_any);
1623         if (!tree_page) {
1624             /*
1625              * If we walked over a stale stable_node,
1626              * get_ksm_page() will call rb_erase() and it
1627              * may rebalance the tree from under us. So
1628              * restart the search from scratch. Returning
1629              * NULL would be safe too, but we'd generate
1630              * false negative insertions just because some
1631              * stable_node was stale.
1632              */
1633             goto again;
1634         }
1635 
1636         ret = memcmp_pages(page, tree_page);
1637         put_page(tree_page);
1638 
1639         parent = *new;
1640         if (ret < 0)
1641             new = &parent->rb_left;
1642         else if (ret > 0)
1643             new = &parent->rb_right;
1644         else {
1645             if (page_node) {
1646                 VM_BUG_ON(page_node->head != &migrate_nodes);
1647                 /*
1648                  * Test if the migrated page should be merged
1649                  * into a stable node dup. If the mapcount is
1650                  * 1 we can migrate it with another KSM page
1651                  * without adding it to the chain.
1652                  */
1653                 if (page_mapcount(page) > 1)
1654                     goto chain_append;
1655             }
1656 
1657             if (!stable_node_dup) {
1658                 /*
1659                  * If the stable_node is a chain and
1660                  * we got a payload match in memcmp
1661                  * but we cannot merge the scanned
1662                  * page in any of the existing
1663                  * stable_node dups because they're
1664                  * all full, we need to wait for the
1665                  * scanned page to find itself a match
1666                  * in the unstable tree, which will create
1667                  * a brand new KSM page to add later to
1668                  * the dups of this stable_node.
1669                  */
1670                 return NULL;
1671             }
1672 
1673             /*
1674              * Lock and unlock the stable_node's page (which
1675              * might already have been migrated) so that page
1676              * migration is sure to notice its raised count.
1677              * It would be more elegant to return stable_node
1678              * than kpage, but that involves more changes.
1679              */
1680             tree_page = get_ksm_page(stable_node_dup,
1681                          GET_KSM_PAGE_TRYLOCK);
1682 
1683             if (PTR_ERR(tree_page) == -EBUSY)
1684                 return ERR_PTR(-EBUSY);
1685 
1686             if (unlikely(!tree_page))
1687                 /*
1688                  * The tree may have been rebalanced,
1689                  * so re-evaluate parent and new.
1690                  */
1691                 goto again;
1692             unlock_page(tree_page);
1693 
1694             if (get_kpfn_nid(stable_node_dup->kpfn) !=
1695                 NUMA(stable_node_dup->nid)) {
1696                 put_page(tree_page);
1697                 goto replace;
1698             }
1699             return tree_page;
1700         }
1701     }
1702 
1703     if (!page_node)
1704         return NULL;
1705 
1706     list_del(&page_node->list);
1707     DO_NUMA(page_node->nid = nid);
1708     rb_link_node(&page_node->node, parent, new);
1709     rb_insert_color(&page_node->node, root);
1710 out:
1711     if (is_page_sharing_candidate(page_node)) {
1712         get_page(page);
1713         return page;
1714     } else
1715         return NULL;
1716 
1717 replace:
1718     /*
1719      * If stable_node was a chain and chain_prune collapsed it,
1720      * stable_node has been updated to be the new regular
1721      * stable_node. A collapse of the chain is indistinguishable
1722      * from the case there was no chain in the stable
1723      * rbtree. Otherwise stable_node is the chain and
1724      * stable_node_dup is the dup to replace.
1725      */
1726     if (stable_node_dup == stable_node) {
1727         VM_BUG_ON(is_stable_node_chain(stable_node_dup));
1728         VM_BUG_ON(is_stable_node_dup(stable_node_dup));
1729         /* there is no chain */
1730         if (page_node) {
1731             VM_BUG_ON(page_node->head != &migrate_nodes);
1732             list_del(&page_node->list);
1733             DO_NUMA(page_node->nid = nid);
1734             rb_replace_node(&stable_node_dup->node,
1735                     &page_node->node,
1736                     root);
1737             if (is_page_sharing_candidate(page_node))
1738                 get_page(page);
1739             else
1740                 page = NULL;
1741         } else {
1742             rb_erase(&stable_node_dup->node, root);
1743             page = NULL;
1744         }
1745     } else {
1746         VM_BUG_ON(!is_stable_node_chain(stable_node));
1747         __stable_node_dup_del(stable_node_dup);
1748         if (page_node) {
1749             VM_BUG_ON(page_node->head != &migrate_nodes);
1750             list_del(&page_node->list);
1751             DO_NUMA(page_node->nid = nid);
1752             stable_node_chain_add_dup(page_node, stable_node);
1753             if (is_page_sharing_candidate(page_node))
1754                 get_page(page);
1755             else
1756                 page = NULL;
1757         } else {
1758             page = NULL;
1759         }
1760     }
1761     stable_node_dup->head = &migrate_nodes;
1762     list_add(&stable_node_dup->list, stable_node_dup->head);
1763     return page;
1764 
1765 chain_append:
1766     /* stable_node_dup could be null if it reached the limit */
1767     if (!stable_node_dup)
1768         stable_node_dup = stable_node_any;
1769     /*
1770      * If stable_node was a chain and chain_prune collapsed it,
1771      * stable_node has been updated to be the new regular
1772      * stable_node. A collapse of the chain is indistinguishable
1773      * from the case there was no chain in the stable
1774      * rbtree. Otherwise stable_node is the chain and
1775      * stable_node_dup is the dup to replace.
1776      */
1777     if (stable_node_dup == stable_node) {
1778         VM_BUG_ON(is_stable_node_dup(stable_node_dup));
1779         /* chain is missing so create it */
1780         stable_node = alloc_stable_node_chain(stable_node_dup,
1781                               root);
1782         if (!stable_node)
1783             return NULL;
1784     }
1785     /*
1786      * Add this stable_node dup that was
1787      * migrated to the stable_node chain
1788      * of the current nid for this page
1789      * content.
1790      */
1791     VM_BUG_ON(!is_stable_node_dup(stable_node_dup));
1792     VM_BUG_ON(page_node->head != &migrate_nodes);
1793     list_del(&page_node->list);
1794     DO_NUMA(page_node->nid = nid);
1795     stable_node_chain_add_dup(page_node, stable_node);
1796     goto out;
1797 }
1798 
1799 /*
1800  * stable_tree_insert - insert stable tree node pointing to new ksm page
1801  * into the stable tree.
1802  *
1803  * This function returns the stable tree node just allocated on success,
1804  * NULL otherwise.
1805  */
1806 static struct stable_node *stable_tree_insert(struct page *kpage)
1807 {
1808     int nid;
1809     unsigned long kpfn;
1810     struct rb_root *root;
1811     struct rb_node **new;
1812     struct rb_node *parent;
1813     struct stable_node *stable_node, *stable_node_dup, *stable_node_any;
1814     bool need_chain = false;
1815 
1816     kpfn = page_to_pfn(kpage);
1817     nid = get_kpfn_nid(kpfn);
1818     root = root_stable_tree + nid;
1819 again:
1820     parent = NULL;
1821     new = &root->rb_node;
1822 
1823     while (*new) {
1824         struct page *tree_page;
1825         int ret;
1826 
1827         cond_resched();
1828         stable_node = rb_entry(*new, struct stable_node, node);
1829         stable_node_any = NULL;
1830         tree_page = chain(&stable_node_dup, stable_node, root);
1831         if (!stable_node_dup) {
1832             /*
1833              * Either all stable_node dups were full in
1834              * this stable_node chain, or this chain was
1835              * empty and should be rb_erased.
1836              */
1837             stable_node_any = stable_node_dup_any(stable_node,
1838                                   root);
1839             if (!stable_node_any) {
1840                 /* rb_erase just run */
1841                 goto again;
1842             }
1843             /*
1844              * Take the page of any of the stable_node dups
1845              * in this stable_node chain to let the tree walk
1846              * continue. All KSM pages belonging to the
1847              * stable_node dups in a stable_node chain
1848              * have the same content and they're
1849              * write protected at all times, so any of them
1850              * will do to continue the walk.
1851              */
1852             tree_page = get_ksm_page(stable_node_any,
1853                          GET_KSM_PAGE_NOLOCK);
1854         }
1855         VM_BUG_ON(!stable_node_dup ^ !!stable_node_any);
1856         if (!tree_page) {
1857             /*
1858              * If we walked over a stale stable_node,
1859              * get_ksm_page() will call rb_erase() and it
1860              * may rebalance the tree from under us. So
1861              * restart the search from scratch. Returning
1862              * NULL would be safe too, but we'd generate
1863              * false negative insertions just because some
1864              * stable_node was stale.
1865              */
1866             goto again;
1867         }
1868 
1869         ret = memcmp_pages(kpage, tree_page);
1870         put_page(tree_page);
1871 
1872         parent = *new;
1873         if (ret < 0)
1874             new = &parent->rb_left;
1875         else if (ret > 0)
1876             new = &parent->rb_right;
1877         else {
1878             need_chain = true;
1879             break;
1880         }
1881     }
1882 
1883     stable_node_dup = alloc_stable_node();
1884     if (!stable_node_dup)
1885         return NULL;
1886 
1887     INIT_HLIST_HEAD(&stable_node_dup->hlist);
1888     stable_node_dup->kpfn = kpfn;
1889     set_page_stable_node(kpage, stable_node_dup);
1890     stable_node_dup->rmap_hlist_len = 0;
1891     DO_NUMA(stable_node_dup->nid = nid);
1892     if (!need_chain) {
1893         rb_link_node(&stable_node_dup->node, parent, new);
1894         rb_insert_color(&stable_node_dup->node, root);
1895     } else {
1896         if (!is_stable_node_chain(stable_node)) {
1897             struct stable_node *orig = stable_node;
1898             /* chain is missing so create it */
1899             stable_node = alloc_stable_node_chain(orig, root);
1900             if (!stable_node) {
1901                 free_stable_node(stable_node_dup);
1902                 return NULL;
1903             }
1904         }
1905         stable_node_chain_add_dup(stable_node_dup, stable_node);
1906     }
1907 
1908     return stable_node_dup;
1909 }
1910 
1911 /*
1912  * unstable_tree_search_insert - search for identical page,
1913  * else insert rmap_item into the unstable tree.
1914  *
1915  * This function searches for a page in the unstable tree identical to the
1916  * page currently being scanned; and if no identical page is found in the
1917  * tree, we insert rmap_item as a new object into the unstable tree.
1918  *
1919  * This function returns a pointer to the rmap_item found to be identical
1920  * to the currently scanned page, NULL otherwise.
1921  *
1922  * This function does both searching and inserting, because they share
1923  * the same walking algorithm in an rbtree.
1924  */
1925 static
1926 struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
1927                           struct page *page,
1928                           struct page **tree_pagep)
1929 {
1930     struct rb_node **new;
1931     struct rb_root *root;
1932     struct rb_node *parent = NULL;
1933     int nid;
1934 
1935     nid = get_kpfn_nid(page_to_pfn(page));
1936     root = root_unstable_tree + nid;
1937     new = &root->rb_node;
1938 
1939     while (*new) {
1940         struct rmap_item *tree_rmap_item;
1941         struct page *tree_page;
1942         int ret;
1943 
1944         cond_resched();
1945         tree_rmap_item = rb_entry(*new, struct rmap_item, node);
1946         tree_page = get_mergeable_page(tree_rmap_item);
1947         if (!tree_page)
1948             return NULL;
1949 
1950         /*
1951          * Don't substitute a ksm page for a forked page.
1952          */
1953         if (page == tree_page) {
1954             put_page(tree_page);
1955             return NULL;
1956         }
1957 
1958         ret = memcmp_pages(page, tree_page);
1959 
1960         parent = *new;
1961         if (ret < 0) {
1962             put_page(tree_page);
1963             new = &parent->rb_left;
1964         } else if (ret > 0) {
1965             put_page(tree_page);
1966             new = &parent->rb_right;
1967         } else if (!ksm_merge_across_nodes &&
1968                page_to_nid(tree_page) != nid) {
1969             /*
1970              * If tree_page has been migrated to another NUMA node,
1971              * it will be flushed out and put in the right unstable
1972              * tree next time: only merge with it when merging across nodes.
1973              */
1974             put_page(tree_page);
1975             return NULL;
1976         } else {
1977             *tree_pagep = tree_page;
1978             return tree_rmap_item;
1979         }
1980     }
1981 
1982     rmap_item->address |= UNSTABLE_FLAG;
1983     rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK);
1984     DO_NUMA(rmap_item->nid = nid);
1985     rb_link_node(&rmap_item->node, parent, new);
1986     rb_insert_color(&rmap_item->node, root);
1987 
1988     ksm_pages_unshared++;
1989     return NULL;
1990 }
1991 
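/*
 * Illustrative sketch (editor's addition, not part of ksm.c): the
 * "search, else insert at the link the walk ended on" idiom that
 * unstable_tree_search_insert() above applies to the kernel rbtree via
 * rb_link_node()/rb_insert_color(), shown here on a plain unbalanced
 * binary search tree in userspace C.  The pointer-to-link-pointer
 * (**new) lets a single downward walk serve both lookup and insertion.
 */
#include <stdio.h>

struct node {
        int key;
        struct node *left, *right;
};

/* Return the existing node with item->key, or insert item and return NULL. */
static struct node *search_insert(struct node **root, struct node *item)
{
        struct node **new = root;

        while (*new) {
                struct node *parent = *new;

                if (item->key < parent->key)
                        new = &parent->left;
                else if (item->key > parent->key)
                        new = &parent->right;
                else
                        return parent;  /* match found, nothing inserted */
        }
        /* no match: *new is the empty link where the walk stopped */
        item->left = item->right = NULL;
        *new = item;
        return NULL;
}

int main(void)
{
        struct node *root = NULL;
        struct node a = { .key = 7 }, b = { .key = 7 };

        printf("first:  %p\n", (void *)search_insert(&root, &a)); /* NULL: inserted */
        printf("second: %d\n", search_insert(&root, &b)->key);    /* 7: found */
        return 0;
}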
1992 /*
1993  * stable_tree_append - add another rmap_item to the linked list of
1994  * rmap_items hanging off a given node of the stable tree, all sharing
1995  * the same ksm page.
1996  */
1997 static void stable_tree_append(struct rmap_item *rmap_item,
1998                    struct stable_node *stable_node,
1999                    bool max_page_sharing_bypass)
2000 {
2001     /*
2002      * rmap won't find this mapping if we don't insert the
2003      * rmap_item in the right stable_node
2004      * duplicate. page_migration could break later if rmap breaks,
2005      * so we might as well crash here. Strictly we only need to check
2006      * for rmap_hlist_len == STABLE_NODE_CHAIN, but checking for any
2007      * other negative value is just as cheap: an underflow detected
2008      * here for the first time (and not when decreasing rmap_hlist_len)
2009      * would be a sign of memory corruption in the stable_node.
2010      */
2011     BUG_ON(stable_node->rmap_hlist_len < 0);
2012 
2013     stable_node->rmap_hlist_len++;
2014     if (!max_page_sharing_bypass)
2015         /* possibly non fatal but unexpected overflow, only warn */
2016         WARN_ON_ONCE(stable_node->rmap_hlist_len >
2017                  ksm_max_page_sharing);
2018 
2019     rmap_item->head = stable_node;
2020     rmap_item->address |= STABLE_FLAG;
2021     hlist_add_head(&rmap_item->hlist, &stable_node->hlist);
2022 
2023     if (rmap_item->hlist.next)
2024         ksm_pages_sharing++;
2025     else
2026         ksm_pages_shared++;
2027 
2028     rmap_item->mm->ksm_merging_pages++;
2029 }
2030 
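/*
 * Worked example (editor's addition, not part of ksm.c): in
 * stable_tree_append() above, an rmap_item added to an empty
 * stable_node hlist has no ->next and bumps ksm_pages_shared, while
 * every further rmap_item bumps ksm_pages_sharing.  A KSM page mapped
 * from five virtual addresses therefore contributes 1 to pages_shared
 * and 4 to pages_sharing.
 */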
2031 /*
2032  * cmp_and_merge_page - first see if page can be merged into the stable tree;
2033  * if not, compare checksum to previous and if it's the same, see if page can
2034  * be inserted into the unstable tree, or merged with a page already there and
2035  * both transferred to the stable tree.
2036  *
2037  * @page: the page that we are searching an identical page for.
2038  * @rmap_item: the reverse mapping into the virtual address of this page
2039  */
2040 static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
2041 {
2042     struct mm_struct *mm = rmap_item->mm;
2043     struct rmap_item *tree_rmap_item;
2044     struct page *tree_page = NULL;
2045     struct stable_node *stable_node;
2046     struct page *kpage;
2047     unsigned int checksum;
2048     int err;
2049     bool max_page_sharing_bypass = false;
2050 
2051     stable_node = page_stable_node(page);
2052     if (stable_node) {
2053         if (stable_node->head != &migrate_nodes &&
2054             get_kpfn_nid(READ_ONCE(stable_node->kpfn)) !=
2055             NUMA(stable_node->nid)) {
2056             stable_node_dup_del(stable_node);
2057             stable_node->head = &migrate_nodes;
2058             list_add(&stable_node->list, stable_node->head);
2059         }
2060         if (stable_node->head != &migrate_nodes &&
2061             rmap_item->head == stable_node)
2062             return;
2063         /*
2064          * If it's a KSM fork, allow it to go over the sharing limit
2065          * without warnings.
2066          */
2067         if (!is_page_sharing_candidate(stable_node))
2068             max_page_sharing_bypass = true;
2069     }
2070 
2071     /* We first start with searching the page inside the stable tree */
2072     kpage = stable_tree_search(page);
2073     if (kpage == page && rmap_item->head == stable_node) {
2074         put_page(kpage);
2075         return;
2076     }
2077 
2078     remove_rmap_item_from_tree(rmap_item);
2079 
2080     if (kpage) {
2081         if (PTR_ERR(kpage) == -EBUSY)
2082             return;
2083 
2084         err = try_to_merge_with_ksm_page(rmap_item, page, kpage);
2085         if (!err) {
2086             /*
2087              * The page was successfully merged:
2088              * add its rmap_item to the stable tree.
2089              */
2090             lock_page(kpage);
2091             stable_tree_append(rmap_item, page_stable_node(kpage),
2092                        max_page_sharing_bypass);
2093             unlock_page(kpage);
2094         }
2095         put_page(kpage);
2096         return;
2097     }
2098 
2099     /*
2100      * If the hash value of the page has changed from the last time
2101      * we calculated it, this page is changing frequently: therefore we
2102      * don't want to insert it in the unstable tree, and we don't want
2103      * to waste our time searching for something identical to it there.
2104      */
2105     checksum = calc_checksum(page);
2106     if (rmap_item->oldchecksum != checksum) {
2107         rmap_item->oldchecksum = checksum;
2108         return;
2109     }
2110 
2111     /*
2112      * Same checksum as an empty page. We attempt to merge it with the
2113      * appropriate zero page if the user enabled this via sysfs.
2114      */
2115     if (ksm_use_zero_pages && (checksum == zero_checksum)) {
2116         struct vm_area_struct *vma;
2117 
2118         mmap_read_lock(mm);
2119         vma = find_mergeable_vma(mm, rmap_item->address);
2120         if (vma) {
2121             err = try_to_merge_one_page(vma, page,
2122                     ZERO_PAGE(rmap_item->address));
2123         } else {
2124             /*
2125              * If the vma is out of date, we do not need to
2126              * continue.
2127              */
2128             err = 0;
2129         }
2130         mmap_read_unlock(mm);
2131         /*
2132          * In case of failure, the page was not really empty, so we
2133          * need to continue. Otherwise we're done.
2134          */
2135         if (!err)
2136             return;
2137     }
2138     tree_rmap_item =
2139         unstable_tree_search_insert(rmap_item, page, &tree_page);
2140     if (tree_rmap_item) {
2141         bool split;
2142 
2143         kpage = try_to_merge_two_pages(rmap_item, page,
2144                         tree_rmap_item, tree_page);
2145         /*
2146          * If both pages we tried to merge belong to the same compound
2147          * page, then we actually ended up increasing the reference
2148          * count of the same compound page twice, and split_huge_page
2149          * failed.
2150          * Here we set a flag if that happened, and we use it later to
2151          * try split_huge_page again. Since we call put_page right
2152          * afterwards, the reference count will be correct and
2153          * split_huge_page should succeed.
2154          */
2155         split = PageTransCompound(page)
2156             && compound_head(page) == compound_head(tree_page);
2157         put_page(tree_page);
2158         if (kpage) {
2159             /*
2160              * The pages were successfully merged: insert new
2161              * node in the stable tree and add both rmap_items.
2162              */
2163             lock_page(kpage);
2164             stable_node = stable_tree_insert(kpage);
2165             if (stable_node) {
2166                 stable_tree_append(tree_rmap_item, stable_node,
2167                            false);
2168                 stable_tree_append(rmap_item, stable_node,
2169                            false);
2170             }
2171             unlock_page(kpage);
2172 
2173             /*
2174              * If we fail to insert the page into the stable tree,
2175              * we will have 2 virtual addresses that are pointing
2176              * to a ksm page left outside the stable tree,
2177              * in which case we need to break_cow on both.
2178              */
2179             if (!stable_node) {
2180                 break_cow(tree_rmap_item);
2181                 break_cow(rmap_item);
2182             }
2183         } else if (split) {
2184             /*
2185              * We are here if we tried to merge two pages and
2186              * failed because they both belonged to the same
2187              * compound page. We will split the page now, but no
2188              * merging will take place.
2189              * We do not want to add the cost of a full lock; if
2190              * the page is locked, it is better to skip it and
2191              * perhaps try again later.
2192              */
2193             if (!trylock_page(page))
2194                 return;
2195             split_huge_page(page);
2196             unlock_page(page);
2197         }
2198     }
2199 }
2200 
2201 static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot,
2202                         struct rmap_item **rmap_list,
2203                         unsigned long addr)
2204 {
2205     struct rmap_item *rmap_item;
2206 
2207     while (*rmap_list) {
2208         rmap_item = *rmap_list;
2209         if ((rmap_item->address & PAGE_MASK) == addr)
2210             return rmap_item;
2211         if (rmap_item->address > addr)
2212             break;
2213         *rmap_list = rmap_item->rmap_list;
2214         remove_rmap_item_from_tree(rmap_item);
2215         free_rmap_item(rmap_item);
2216     }
2217 
2218     rmap_item = alloc_rmap_item();
2219     if (rmap_item) {
2220         /* It has already been zeroed */
2221         rmap_item->mm = mm_slot->mm;
2222         rmap_item->address = addr;
2223         rmap_item->rmap_list = *rmap_list;
2224         *rmap_list = rmap_item;
2225     }
2226     return rmap_item;
2227 }
2228 
2229 static struct rmap_item *scan_get_next_rmap_item(struct page **page)
2230 {
2231     struct mm_struct *mm;
2232     struct mm_slot *slot;
2233     struct vm_area_struct *vma;
2234     struct rmap_item *rmap_item;
2235     int nid;
2236 
2237     if (list_empty(&ksm_mm_head.mm_list))
2238         return NULL;
2239 
2240     slot = ksm_scan.mm_slot;
2241     if (slot == &ksm_mm_head) {
2242         /*
2243          * A number of pages can hang around indefinitely on per-cpu
2244          * pagevecs, raised page count preventing write_protect_page
2245          * from merging them.  Though it doesn't really matter much,
2246          * it is puzzling to see some stuck in pages_volatile until
2247          * other activity jostles them out, and they also prevented
2248          * LTP's KSM test from succeeding deterministically; so drain
2249          * them here (here rather than on entry to ksm_do_scan(),
2250          * so we don't IPI too often when pages_to_scan is set low).
2251          */
2252         lru_add_drain_all();
2253 
2254         /*
2255          * Whereas stale stable_nodes on the stable_tree itself
2256          * get pruned in the regular course of stable_tree_search(),
2257          * those moved out to the migrate_nodes list can accumulate:
2258          * so prune them once before each full scan.
2259          */
2260         if (!ksm_merge_across_nodes) {
2261             struct stable_node *stable_node, *next;
2262             struct page *page;
2263 
2264             list_for_each_entry_safe(stable_node, next,
2265                          &migrate_nodes, list) {
2266                 page = get_ksm_page(stable_node,
2267                             GET_KSM_PAGE_NOLOCK);
2268                 if (page)
2269                     put_page(page);
2270                 cond_resched();
2271             }
2272         }
2273 
2274         for (nid = 0; nid < ksm_nr_node_ids; nid++)
2275             root_unstable_tree[nid] = RB_ROOT;
2276 
2277         spin_lock(&ksm_mmlist_lock);
2278         slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list);
2279         ksm_scan.mm_slot = slot;
2280         spin_unlock(&ksm_mmlist_lock);
2281         /*
2282          * Although we tested list_empty() above, a racing __ksm_exit
2283          * of the last mm on the list may have removed it since then.
2284          */
2285         if (slot == &ksm_mm_head)
2286             return NULL;
2287 next_mm:
2288         ksm_scan.address = 0;
2289         ksm_scan.rmap_list = &slot->rmap_list;
2290     }
2291 
2292     mm = slot->mm;
2293     mmap_read_lock(mm);
2294     if (ksm_test_exit(mm))
2295         vma = NULL;
2296     else
2297         vma = find_vma(mm, ksm_scan.address);
2298 
2299     for (; vma; vma = vma->vm_next) {
2300         if (!(vma->vm_flags & VM_MERGEABLE))
2301             continue;
2302         if (ksm_scan.address < vma->vm_start)
2303             ksm_scan.address = vma->vm_start;
2304         if (!vma->anon_vma)
2305             ksm_scan.address = vma->vm_end;
2306 
2307         while (ksm_scan.address < vma->vm_end) {
2308             if (ksm_test_exit(mm))
2309                 break;
2310             *page = follow_page(vma, ksm_scan.address, FOLL_GET);
2311             if (IS_ERR_OR_NULL(*page) || is_zone_device_page(*page)) {
2312                 ksm_scan.address += PAGE_SIZE;
2313                 cond_resched();
2314                 continue;
2315             }
2316             if (PageAnon(*page)) {
2317                 flush_anon_page(vma, *page, ksm_scan.address);
2318                 flush_dcache_page(*page);
2319                 rmap_item = get_next_rmap_item(slot,
2320                     ksm_scan.rmap_list, ksm_scan.address);
2321                 if (rmap_item) {
2322                     ksm_scan.rmap_list =
2323                             &rmap_item->rmap_list;
2324                     ksm_scan.address += PAGE_SIZE;
2325                 } else
2326                     put_page(*page);
2327                 mmap_read_unlock(mm);
2328                 return rmap_item;
2329             }
2330             put_page(*page);
2331             ksm_scan.address += PAGE_SIZE;
2332             cond_resched();
2333         }
2334     }
2335 
2336     if (ksm_test_exit(mm)) {
2337         ksm_scan.address = 0;
2338         ksm_scan.rmap_list = &slot->rmap_list;
2339     }
2340     /*
2341      * Nuke all the rmap_items that are above this current rmap:
2342      * because there were no VM_MERGEABLE vmas with such addresses.
2343      */
2344     remove_trailing_rmap_items(ksm_scan.rmap_list);
2345 
2346     spin_lock(&ksm_mmlist_lock);
2347     ksm_scan.mm_slot = list_entry(slot->mm_list.next,
2348                         struct mm_slot, mm_list);
2349     if (ksm_scan.address == 0) {
2350         /*
2351          * We've completed a full scan of all vmas, holding mmap_lock
2352          * throughout, and found no VM_MERGEABLE: so do the same as
2353          * __ksm_exit does to remove this mm from all our lists now.
2354          * This applies either when cleaning up after __ksm_exit
2355          * (but beware: we can reach here even before __ksm_exit),
2356          * or when all VM_MERGEABLE areas have been unmapped (and
2357          * mmap_lock then protects against race with MADV_MERGEABLE).
2358          */
2359         hash_del(&slot->link);
2360         list_del(&slot->mm_list);
2361         spin_unlock(&ksm_mmlist_lock);
2362 
2363         free_mm_slot(slot);
2364         clear_bit(MMF_VM_MERGEABLE, &mm->flags);
2365         mmap_read_unlock(mm);
2366         mmdrop(mm);
2367     } else {
2368         mmap_read_unlock(mm);
2369         /*
2370          * mmap_read_unlock(mm) first, because once
2371          * spin_unlock(&ksm_mmlist_lock) has run, the "mm" may
2372          * already have been freed under us by __ksm_exit(),
2373          * since the "mm_slot" is still hashed and
2374          * ksm_scan.mm_slot doesn't point to it anymore.
2375          */
2376         spin_unlock(&ksm_mmlist_lock);
2377     }
2378 
2379     /* Repeat until we've completed scanning the whole list */
2380     slot = ksm_scan.mm_slot;
2381     if (slot != &ksm_mm_head)
2382         goto next_mm;
2383 
2384     ksm_scan.seqnr++;
2385     return NULL;
2386 }
2387 
2388 /**
2389  * ksm_do_scan  - the ksm scanner main worker function.
2390  * @scan_npages:  number of pages we want to scan before we return.
2391  */
2392 static void ksm_do_scan(unsigned int scan_npages)
2393 {
2394     struct rmap_item *rmap_item;
2395     struct page *page;
2396 
2397     while (scan_npages-- && likely(!freezing(current))) {
2398         cond_resched();
2399         rmap_item = scan_get_next_rmap_item(&page);
2400         if (!rmap_item)
2401             return;
2402         cmp_and_merge_page(page, rmap_item);
2403         put_page(page);
2404     }
2405 }
2406 
2407 static int ksmd_should_run(void)
2408 {
2409     return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.mm_list);
2410 }
2411 
2412 static int ksm_scan_thread(void *nothing)
2413 {
2414     unsigned int sleep_ms;
2415 
2416     set_freezable();
2417     set_user_nice(current, 5);
2418 
2419     while (!kthread_should_stop()) {
2420         mutex_lock(&ksm_thread_mutex);
2421         wait_while_offlining();
2422         if (ksmd_should_run())
2423             ksm_do_scan(ksm_thread_pages_to_scan);
2424         mutex_unlock(&ksm_thread_mutex);
2425 
2426         try_to_freeze();
2427 
2428         if (ksmd_should_run()) {
2429             sleep_ms = READ_ONCE(ksm_thread_sleep_millisecs);
2430             wait_event_interruptible_timeout(ksm_iter_wait,
2431                 sleep_ms != READ_ONCE(ksm_thread_sleep_millisecs),
2432                 msecs_to_jiffies(sleep_ms));
2433         } else {
2434             wait_event_freezable(ksm_thread_wait,
2435                 ksmd_should_run() || kthread_should_stop());
2436         }
2437     }
2438     return 0;
2439 }
2440 
2441 int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
2442         unsigned long end, int advice, unsigned long *vm_flags)
2443 {
2444     struct mm_struct *mm = vma->vm_mm;
2445     int err;
2446 
2447     switch (advice) {
2448     case MADV_MERGEABLE:
2449         /*
2450          * Be somewhat over-protective for now!
2451          */
2452         if (*vm_flags & (VM_MERGEABLE | VM_SHARED  | VM_MAYSHARE   |
2453                  VM_PFNMAP    | VM_IO      | VM_DONTEXPAND |
2454                  VM_HUGETLB | VM_MIXEDMAP))
2455             return 0;       /* just ignore the advice */
2456 
2457         if (vma_is_dax(vma))
2458             return 0;
2459 
2460 #ifdef VM_SAO
2461         if (*vm_flags & VM_SAO)
2462             return 0;
2463 #endif
2464 #ifdef VM_SPARC_ADI
2465         if (*vm_flags & VM_SPARC_ADI)
2466             return 0;
2467 #endif
2468 
2469         if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
2470             err = __ksm_enter(mm);
2471             if (err)
2472                 return err;
2473         }
2474 
2475         *vm_flags |= VM_MERGEABLE;
2476         break;
2477 
2478     case MADV_UNMERGEABLE:
2479         if (!(*vm_flags & VM_MERGEABLE))
2480             return 0;       /* just ignore the advice */
2481 
2482         if (vma->anon_vma) {
2483             err = unmerge_ksm_pages(vma, start, end);
2484             if (err)
2485                 return err;
2486         }
2487 
2488         *vm_flags &= ~VM_MERGEABLE;
2489         break;
2490     }
2491 
2492     return 0;
2493 }
2494 EXPORT_SYMBOL_GPL(ksm_madvise);
2495 
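/*
 * Illustrative userspace sketch (editor's addition, not part of ksm.c):
 * madvise(MADV_MERGEABLE) on an anonymous mapping is what reaches
 * ksm_madvise() above through the madvise(2) syscall.  Merging then
 * happens asynchronously, and only while ksmd is running on a
 * CONFIG_KSM kernel (e.g. after "echo 1 > /sys/kernel/mm/ksm/run").
 */
#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
        size_t len = 64 * 4096;         /* 64 pages, assuming 4KiB pages */
        char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (buf == MAP_FAILED)
                return 1;
        memset(buf, 'x', len);          /* identical contents in every page */

        if (madvise(buf, len, MADV_MERGEABLE))  /* sets VM_MERGEABLE */
                perror("madvise(MADV_MERGEABLE)");

        /* ... use the region; ksmd may merge the duplicate pages ... */

        madvise(buf, len, MADV_UNMERGEABLE);    /* break COW, undo merging */
        munmap(buf, len);
        return 0;
}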
2496 int __ksm_enter(struct mm_struct *mm)
2497 {
2498     struct mm_slot *mm_slot;
2499     int needs_wakeup;
2500 
2501     mm_slot = alloc_mm_slot();
2502     if (!mm_slot)
2503         return -ENOMEM;
2504 
2505     /* Check ksm_run too?  Would need tighter locking */
2506     needs_wakeup = list_empty(&ksm_mm_head.mm_list);
2507 
2508     spin_lock(&ksm_mmlist_lock);
2509     insert_to_mm_slots_hash(mm, mm_slot);
2510     /*
2511      * When KSM_RUN_MERGE (or KSM_RUN_STOP),
2512      * insert just behind the scanning cursor, to let the area settle
2513      * down a little; when fork is followed by immediate exec, we don't
2514      * want ksmd to waste time setting up and tearing down an rmap_list.
2515      *
2516      * But when KSM_RUN_UNMERGE, it's important to insert ahead of its
2517      * scanning cursor, otherwise KSM pages in newly forked mms will be
2518      * missed: then we might as well insert at the end of the list.
2519      */
2520     if (ksm_run & KSM_RUN_UNMERGE)
2521         list_add_tail(&mm_slot->mm_list, &ksm_mm_head.mm_list);
2522     else
2523         list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list);
2524     spin_unlock(&ksm_mmlist_lock);
2525 
2526     set_bit(MMF_VM_MERGEABLE, &mm->flags);
2527     mmgrab(mm);
2528 
2529     if (needs_wakeup)
2530         wake_up_interruptible(&ksm_thread_wait);
2531 
2532     return 0;
2533 }
2534 
2535 void __ksm_exit(struct mm_struct *mm)
2536 {
2537     struct mm_slot *mm_slot;
2538     int easy_to_free = 0;
2539 
2540     /*
2541      * This process is exiting: if it's straightforward (as is the
2542      * case when ksmd was never running), free mm_slot immediately.
2543      * But if it's at the cursor or has rmap_items linked to it, use
2544      * mmap_lock to synchronize with any break_cows before pagetables
2545      * are freed, and leave the mm_slot on the list for ksmd to free.
2546      * Beware: ksm may already have noticed it exiting and freed the slot.
2547      */
2548 
2549     spin_lock(&ksm_mmlist_lock);
2550     mm_slot = get_mm_slot(mm);
2551     if (mm_slot && ksm_scan.mm_slot != mm_slot) {
2552         if (!mm_slot->rmap_list) {
2553             hash_del(&mm_slot->link);
2554             list_del(&mm_slot->mm_list);
2555             easy_to_free = 1;
2556         } else {
2557             list_move(&mm_slot->mm_list,
2558                   &ksm_scan.mm_slot->mm_list);
2559         }
2560     }
2561     spin_unlock(&ksm_mmlist_lock);
2562 
2563     if (easy_to_free) {
2564         free_mm_slot(mm_slot);
2565         clear_bit(MMF_VM_MERGEABLE, &mm->flags);
2566         mmdrop(mm);
2567     } else if (mm_slot) {
2568         mmap_write_lock(mm);
2569         mmap_write_unlock(mm);
2570     }
2571 }
2572 
2573 struct page *ksm_might_need_to_copy(struct page *page,
2574             struct vm_area_struct *vma, unsigned long address)
2575 {
2576     struct folio *folio = page_folio(page);
2577     struct anon_vma *anon_vma = folio_anon_vma(folio);
2578     struct page *new_page;
2579 
2580     if (PageKsm(page)) {
2581         if (page_stable_node(page) &&
2582             !(ksm_run & KSM_RUN_UNMERGE))
2583             return page;    /* no need to copy it */
2584     } else if (!anon_vma) {
2585         return page;        /* no need to copy it */
2586     } else if (page->index == linear_page_index(vma, address) &&
2587             anon_vma->root == vma->anon_vma->root) {
2588         return page;        /* still no need to copy it */
2589     }
2590     if (!PageUptodate(page))
2591         return page;        /* let do_swap_page report the error */
2592 
2593     new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
2594     if (new_page &&
2595         mem_cgroup_charge(page_folio(new_page), vma->vm_mm, GFP_KERNEL)) {
2596         put_page(new_page);
2597         new_page = NULL;
2598     }
2599     if (new_page) {
2600         copy_user_highpage(new_page, page, address, vma);
2601 
2602         SetPageDirty(new_page);
2603         __SetPageUptodate(new_page);
2604         __SetPageLocked(new_page);
2605 #ifdef CONFIG_SWAP
2606         count_vm_event(KSM_SWPIN_COPY);
2607 #endif
2608     }
2609 
2610     return new_page;
2611 }
2612 
2613 void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc)
2614 {
2615     struct stable_node *stable_node;
2616     struct rmap_item *rmap_item;
2617     int search_new_forks = 0;
2618 
2619     VM_BUG_ON_FOLIO(!folio_test_ksm(folio), folio);
2620 
2621     /*
2622      * Rely on the page lock to protect against concurrent modifications
2623      * to that page's node of the stable tree.
2624      */
2625     VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
2626 
2627     stable_node = folio_stable_node(folio);
2628     if (!stable_node)
2629         return;
2630 again:
2631     hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
2632         struct anon_vma *anon_vma = rmap_item->anon_vma;
2633         struct anon_vma_chain *vmac;
2634         struct vm_area_struct *vma;
2635 
2636         cond_resched();
2637         if (!anon_vma_trylock_read(anon_vma)) {
2638             if (rwc->try_lock) {
2639                 rwc->contended = true;
2640                 return;
2641             }
2642             anon_vma_lock_read(anon_vma);
2643         }
2644         anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
2645                            0, ULONG_MAX) {
2646             unsigned long addr;
2647 
2648             cond_resched();
2649             vma = vmac->vma;
2650 
2651             /* Ignore the stable/unstable/sqnr flags */
2652             addr = rmap_item->address & PAGE_MASK;
2653 
2654             if (addr < vma->vm_start || addr >= vma->vm_end)
2655                 continue;
2656             /*
2657              * Initially we examine only the vma which covers this
2658              * rmap_item; but later, if there is still work to do,
2659              * we examine covering vmas in other mms: in case they
2660              * were forked from the original since ksmd passed.
2661              */
2662             if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
2663                 continue;
2664 
2665             if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
2666                 continue;
2667 
2668             if (!rwc->rmap_one(folio, vma, addr, rwc->arg)) {
2669                 anon_vma_unlock_read(anon_vma);
2670                 return;
2671             }
2672             if (rwc->done && rwc->done(folio)) {
2673                 anon_vma_unlock_read(anon_vma);
2674                 return;
2675             }
2676         }
2677         anon_vma_unlock_read(anon_vma);
2678     }
2679     if (!search_new_forks++)
2680         goto again;
2681 }
2682 
2683 #ifdef CONFIG_MIGRATION
2684 void folio_migrate_ksm(struct folio *newfolio, struct folio *folio)
2685 {
2686     struct stable_node *stable_node;
2687 
2688     VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
2689     VM_BUG_ON_FOLIO(!folio_test_locked(newfolio), newfolio);
2690     VM_BUG_ON_FOLIO(newfolio->mapping != folio->mapping, newfolio);
2691 
2692     stable_node = folio_stable_node(folio);
2693     if (stable_node) {
2694         VM_BUG_ON_FOLIO(stable_node->kpfn != folio_pfn(folio), folio);
2695         stable_node->kpfn = folio_pfn(newfolio);
2696         /*
2697          * newfolio->mapping was set in advance; now we need smp_wmb()
2698          * to make sure that the new stable_node->kpfn is visible
2699          * to get_ksm_page() before it can see that folio->mapping
2700          * has gone stale (or that folio_test_swapcache has been cleared).
2701          */
2702         smp_wmb();
2703         set_page_stable_node(&folio->page, NULL);
2704     }
2705 }
2706 #endif /* CONFIG_MIGRATION */
2707 
2708 #ifdef CONFIG_MEMORY_HOTREMOVE
2709 static void wait_while_offlining(void)
2710 {
2711     while (ksm_run & KSM_RUN_OFFLINE) {
2712         mutex_unlock(&ksm_thread_mutex);
2713         wait_on_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE),
2714                 TASK_UNINTERRUPTIBLE);
2715         mutex_lock(&ksm_thread_mutex);
2716     }
2717 }
2718 
2719 static bool stable_node_dup_remove_range(struct stable_node *stable_node,
2720                      unsigned long start_pfn,
2721                      unsigned long end_pfn)
2722 {
2723     if (stable_node->kpfn >= start_pfn &&
2724         stable_node->kpfn < end_pfn) {
2725         /*
2726          * Don't get_ksm_page, page has already gone:
2727          * which is why we keep kpfn instead of page*
2728          */
2729         remove_node_from_stable_tree(stable_node);
2730         return true;
2731     }
2732     return false;
2733 }
2734 
2735 static bool stable_node_chain_remove_range(struct stable_node *stable_node,
2736                        unsigned long start_pfn,
2737                        unsigned long end_pfn,
2738                        struct rb_root *root)
2739 {
2740     struct stable_node *dup;
2741     struct hlist_node *hlist_safe;
2742 
2743     if (!is_stable_node_chain(stable_node)) {
2744         VM_BUG_ON(is_stable_node_dup(stable_node));
2745         return stable_node_dup_remove_range(stable_node, start_pfn,
2746                             end_pfn);
2747     }
2748 
2749     hlist_for_each_entry_safe(dup, hlist_safe,
2750                   &stable_node->hlist, hlist_dup) {
2751         VM_BUG_ON(!is_stable_node_dup(dup));
2752         stable_node_dup_remove_range(dup, start_pfn, end_pfn);
2753     }
2754     if (hlist_empty(&stable_node->hlist)) {
2755         free_stable_node_chain(stable_node, root);
2756         return true; /* notify caller that tree was rebalanced */
2757     } else
2758         return false;
2759 }
2760 
2761 static void ksm_check_stable_tree(unsigned long start_pfn,
2762                   unsigned long end_pfn)
2763 {
2764     struct stable_node *stable_node, *next;
2765     struct rb_node *node;
2766     int nid;
2767 
2768     for (nid = 0; nid < ksm_nr_node_ids; nid++) {
2769         node = rb_first(root_stable_tree + nid);
2770         while (node) {
2771             stable_node = rb_entry(node, struct stable_node, node);
2772             if (stable_node_chain_remove_range(stable_node,
2773                                start_pfn, end_pfn,
2774                                root_stable_tree +
2775                                nid))
2776                 node = rb_first(root_stable_tree + nid);
2777             else
2778                 node = rb_next(node);
2779             cond_resched();
2780         }
2781     }
2782     list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) {
2783         if (stable_node->kpfn >= start_pfn &&
2784             stable_node->kpfn < end_pfn)
2785             remove_node_from_stable_tree(stable_node);
2786         cond_resched();
2787     }
2788 }
2789 
2790 static int ksm_memory_callback(struct notifier_block *self,
2791                    unsigned long action, void *arg)
2792 {
2793     struct memory_notify *mn = arg;
2794 
2795     switch (action) {
2796     case MEM_GOING_OFFLINE:
2797         /*
2798          * Prevent ksm_do_scan(), unmerge_and_remove_all_rmap_items()
2799          * and remove_all_stable_nodes() while memory is going offline:
2800          * it is unsafe for them to touch the stable tree at this time.
2801          * But unmerge_ksm_pages(), rmap lookups and other entry points
2802          * which do not need the ksm_thread_mutex are all safe.
2803          */
2804         mutex_lock(&ksm_thread_mutex);
2805         ksm_run |= KSM_RUN_OFFLINE;
2806         mutex_unlock(&ksm_thread_mutex);
2807         break;
2808 
2809     case MEM_OFFLINE:
2810         /*
2811          * Most of the work is done by page migration; but there might
2812          * be a few stable_nodes left over, still pointing to struct
2813          * pages which have been offlined: prune those from the tree,
2814          * otherwise get_ksm_page() might later try to access a
2815          * non-existent struct page.
2816          */
2817         ksm_check_stable_tree(mn->start_pfn,
2818                       mn->start_pfn + mn->nr_pages);
2819         fallthrough;
2820     case MEM_CANCEL_OFFLINE:
2821         mutex_lock(&ksm_thread_mutex);
2822         ksm_run &= ~KSM_RUN_OFFLINE;
2823         mutex_unlock(&ksm_thread_mutex);
2824 
2825         smp_mb();   /* wake_up_bit advises this */
2826         wake_up_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE));
2827         break;
2828     }
2829     return NOTIFY_OK;
2830 }
2831 #else
2832 static void wait_while_offlining(void)
2833 {
2834 }
2835 #endif /* CONFIG_MEMORY_HOTREMOVE */
2836 
2837 #ifdef CONFIG_SYSFS
2838 /*
2839  * This all compiles without CONFIG_SYSFS, but is a waste of space.
2840  */
2841 
2842 #define KSM_ATTR_RO(_name) \
2843     static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
2844 #define KSM_ATTR(_name) \
2845     static struct kobj_attribute _name##_attr = __ATTR_RW(_name)
2846 
2847 static ssize_t sleep_millisecs_show(struct kobject *kobj,
2848                     struct kobj_attribute *attr, char *buf)
2849 {
2850     return sysfs_emit(buf, "%u\n", ksm_thread_sleep_millisecs);
2851 }
2852 
2853 static ssize_t sleep_millisecs_store(struct kobject *kobj,
2854                      struct kobj_attribute *attr,
2855                      const char *buf, size_t count)
2856 {
2857     unsigned int msecs;
2858     int err;
2859 
2860     err = kstrtouint(buf, 10, &msecs);
2861     if (err)
2862         return -EINVAL;
2863 
2864     ksm_thread_sleep_millisecs = msecs;
2865     wake_up_interruptible(&ksm_iter_wait);
2866 
2867     return count;
2868 }
2869 KSM_ATTR(sleep_millisecs);
2870 
2871 static ssize_t pages_to_scan_show(struct kobject *kobj,
2872                   struct kobj_attribute *attr, char *buf)
2873 {
2874     return sysfs_emit(buf, "%u\n", ksm_thread_pages_to_scan);
2875 }
2876 
2877 static ssize_t pages_to_scan_store(struct kobject *kobj,
2878                    struct kobj_attribute *attr,
2879                    const char *buf, size_t count)
2880 {
2881     unsigned int nr_pages;
2882     int err;
2883 
2884     err = kstrtouint(buf, 10, &nr_pages);
2885     if (err)
2886         return -EINVAL;
2887 
2888     ksm_thread_pages_to_scan = nr_pages;
2889 
2890     return count;
2891 }
2892 KSM_ATTR(pages_to_scan);
2893 
2894 static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr,
2895             char *buf)
2896 {
2897     return sysfs_emit(buf, "%lu\n", ksm_run);
2898 }
2899 
2900 static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
2901              const char *buf, size_t count)
2902 {
2903     unsigned int flags;
2904     int err;
2905 
2906     err = kstrtouint(buf, 10, &flags);
2907     if (err)
2908         return -EINVAL;
2909     if (flags > KSM_RUN_UNMERGE)
2910         return -EINVAL;
2911 
2912     /*
2913      * KSM_RUN_MERGE sets ksmd running, and 0 stops it running.
2914      * KSM_RUN_UNMERGE stops it running and unmerges all rmap_items,
2915      * breaking COW to free the pages_shared (but leaves mm_slots
2916      * on the list for when ksmd may be set running again).
2917      */
2918 
2919     mutex_lock(&ksm_thread_mutex);
2920     wait_while_offlining();
2921     if (ksm_run != flags) {
2922         ksm_run = flags;
2923         if (flags & KSM_RUN_UNMERGE) {
2924             set_current_oom_origin();
2925             err = unmerge_and_remove_all_rmap_items();
2926             clear_current_oom_origin();
2927             if (err) {
2928                 ksm_run = KSM_RUN_STOP;
2929                 count = err;
2930             }
2931         }
2932     }
2933     mutex_unlock(&ksm_thread_mutex);
2934 
2935     if (flags & KSM_RUN_MERGE)
2936         wake_up_interruptible(&ksm_thread_wait);
2937 
2938     return count;
2939 }
2940 KSM_ATTR(run);
2941 
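/*
 * Illustrative userspace sketch (editor's addition, not part of ksm.c):
 * the *_show()/*_store() handlers above back the files under
 * /sys/kernel/mm/ksm/.  Writing "1" to "run" starts ksmd (KSM_RUN_MERGE),
 * "0" stops it, and "2" unmerges everything; sleep_millisecs and
 * pages_to_scan pace each scanning batch.  Root is required, and
 * ksm_write() below is just a local helper, not a kernel or libc API.
 */
#include <stdio.h>

static int ksm_write(const char *knob, const char *val)
{
        char path[64];
        FILE *f;

        snprintf(path, sizeof(path), "/sys/kernel/mm/ksm/%s", knob);
        f = fopen(path, "w");
        if (!f)
                return -1;
        fputs(val, f);
        return fclose(f);
}

int main(void)
{
        ksm_write("sleep_millisecs", "20");     /* pause 20ms between batches */
        ksm_write("pages_to_scan", "1000");     /* scan 1000 pages per batch */
        return ksm_write("run", "1");           /* KSM_RUN_MERGE: wake ksmd */
}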
2942 #ifdef CONFIG_NUMA
2943 static ssize_t merge_across_nodes_show(struct kobject *kobj,
2944                        struct kobj_attribute *attr, char *buf)
2945 {
2946     return sysfs_emit(buf, "%u\n", ksm_merge_across_nodes);
2947 }
2948 
2949 static ssize_t merge_across_nodes_store(struct kobject *kobj,
2950                    struct kobj_attribute *attr,
2951                    const char *buf, size_t count)
2952 {
2953     int err;
2954     unsigned long knob;
2955 
2956     err = kstrtoul(buf, 10, &knob);
2957     if (err)
2958         return err;
2959     if (knob > 1)
2960         return -EINVAL;
2961 
2962     mutex_lock(&ksm_thread_mutex);
2963     wait_while_offlining();
2964     if (ksm_merge_across_nodes != knob) {
2965         if (ksm_pages_shared || remove_all_stable_nodes())
2966             err = -EBUSY;
2967         else if (root_stable_tree == one_stable_tree) {
2968             struct rb_root *buf;
2969             /*
2970              * This is the first time that we switch away from the
2971              * default of merging across nodes: must now allocate
2972              * a buffer to hold as many roots as may be needed.
2973              * Allocate stable and unstable together:
2974              * MAXSMP NODES_SHIFT 10 will use 16kB.
2975              */
2976             buf = kcalloc(nr_node_ids + nr_node_ids, sizeof(*buf),
2977                       GFP_KERNEL);
2978             /* RB_ROOT is just a NULL pointer, so the zeroed buffer starts empty */
2979             if (!buf)
2980                 err = -ENOMEM;
2981             else {
2982                 root_stable_tree = buf;
2983                 root_unstable_tree = buf + nr_node_ids;
2984                 /* Stable tree is empty but not the unstable */
2985                 root_unstable_tree[0] = one_unstable_tree[0];
2986             }
2987         }
2988         if (!err) {
2989             ksm_merge_across_nodes = knob;
2990             ksm_nr_node_ids = knob ? 1 : nr_node_ids;
2991         }
2992     }
2993     mutex_unlock(&ksm_thread_mutex);
2994 
2995     return err ? err : count;
2996 }
2997 KSM_ATTR(merge_across_nodes);
2998 #endif
2999 
3000 static ssize_t use_zero_pages_show(struct kobject *kobj,
3001                    struct kobj_attribute *attr, char *buf)
3002 {
3003     return sysfs_emit(buf, "%u\n", ksm_use_zero_pages);
3004 }
3005 static ssize_t use_zero_pages_store(struct kobject *kobj,
3006                    struct kobj_attribute *attr,
3007                    const char *buf, size_t count)
3008 {
3009     int err;
3010     bool value;
3011 
3012     err = kstrtobool(buf, &value);
3013     if (err)
3014         return -EINVAL;
3015 
3016     ksm_use_zero_pages = value;
3017 
3018     return count;
3019 }
3020 KSM_ATTR(use_zero_pages);
3021 
3022 static ssize_t max_page_sharing_show(struct kobject *kobj,
3023                      struct kobj_attribute *attr, char *buf)
3024 {
3025     return sysfs_emit(buf, "%u\n", ksm_max_page_sharing);
3026 }
3027 
3028 static ssize_t max_page_sharing_store(struct kobject *kobj,
3029                       struct kobj_attribute *attr,
3030                       const char *buf, size_t count)
3031 {
3032     int err;
3033     int knob;
3034 
3035     err = kstrtoint(buf, 10, &knob);
3036     if (err)
3037         return err;
3038     /*
3039      * A KSM page is created with exactly 2 sharers, so 2 is the
3040      * minimum allowed value; being a signed comparison, this also
3041      * rejects negative input.
3042      */
3043     if (knob < 2)
3044         return -EINVAL;
3045 
3046     if (READ_ONCE(ksm_max_page_sharing) == knob)
3047         return count;
3048 
3049     mutex_lock(&ksm_thread_mutex);
3050     wait_while_offlining();
3051     if (ksm_max_page_sharing != knob) {
3052         if (ksm_pages_shared || remove_all_stable_nodes())
3053             err = -EBUSY;
3054         else
3055             ksm_max_page_sharing = knob;
3056     }
3057     mutex_unlock(&ksm_thread_mutex);
3058 
3059     return err ? err : count;
3060 }
3061 KSM_ATTR(max_page_sharing);
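/*
 * Example (illustrative): max_page_sharing_store() above rejects values
 * below 2 (a new KSM page already has two sharers), treats writing back the
 * current limit as a no-op via the READ_ONCE() fast path, and otherwise
 * requires that nothing be merged, just like merge_across_nodes.  A sketch
 * that reads the current limit and doubles it, assuming sysfs at /sys and
 * ksmd already stopped and unmerged as in the previous example:
 *
 *        #include <stdio.h>
 *
 *        int main(void)
 *        {
 *                FILE *f = fopen("/sys/kernel/mm/ksm/max_page_sharing", "r+");
 *                int cur;
 *
 *                if (!f || fscanf(f, "%d", &cur) != 1)
 *                        return 1;
 *                rewind(f);
 *                fprintf(f, "%d\n", cur * 2);
 *                return fclose(f) ? 1 : 0;
 *        }
 */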
3062 
3063 static ssize_t pages_shared_show(struct kobject *kobj,
3064                  struct kobj_attribute *attr, char *buf)
3065 {
3066     return sysfs_emit(buf, "%lu\n", ksm_pages_shared);
3067 }
3068 KSM_ATTR_RO(pages_shared);
3069 
3070 static ssize_t pages_sharing_show(struct kobject *kobj,
3071                   struct kobj_attribute *attr, char *buf)
3072 {
3073     return sysfs_emit(buf, "%lu\n", ksm_pages_sharing);
3074 }
3075 KSM_ATTR_RO(pages_sharing);
3076 
3077 static ssize_t pages_unshared_show(struct kobject *kobj,
3078                    struct kobj_attribute *attr, char *buf)
3079 {
3080     return sysfs_emit(buf, "%lu\n", ksm_pages_unshared);
3081 }
3082 KSM_ATTR_RO(pages_unshared);
3083 
3084 static ssize_t pages_volatile_show(struct kobject *kobj,
3085                    struct kobj_attribute *attr, char *buf)
3086 {
3087     long ksm_pages_volatile;
3088 
3089     ksm_pages_volatile = ksm_rmap_items - ksm_pages_shared
3090                 - ksm_pages_sharing - ksm_pages_unshared;
3091     /*
3092      * This statistic is not worth any locking to compute, so it
3093      * may occasionally go negative: clamp that to zero.
3094      */
3095     if (ksm_pages_volatile < 0)
3096         ksm_pages_volatile = 0;
3097     return sysfs_emit(buf, "%ld\n", ksm_pages_volatile);
3098 }
3099 KSM_ATTR_RO(pages_volatile);
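/*
 * Example (illustrative): pages_volatile is derived rather than tracked --
 * every allocated rmap_item not counted as shared, sharing or unshared is
 * assumed to cover a page whose contents are still changing, and because
 * the counters are read without locking the result is clamped at zero.  A
 * standalone sketch of the same arithmetic:
 *
 *        long volatile_pages(long rmap_items, long shared, long sharing,
 *                            long unshared)
 *        {
 *                long v = rmap_items - shared - sharing - unshared;
 *
 *                return v < 0 ? 0 : v;
 *        }
 *
 * With, say, 10000 rmap_items, 1000 pages_shared, 6500 pages_sharing and
 * 2000 pages_unshared this reports 500 volatile pages; a racy read that
 * momentarily produced 2700 pages_unshared would yield -200 and be shown
 * as 0.
 */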
3100 
3101 static ssize_t stable_node_dups_show(struct kobject *kobj,
3102                      struct kobj_attribute *attr, char *buf)
3103 {
3104     return sysfs_emit(buf, "%lu\n", ksm_stable_node_dups);
3105 }
3106 KSM_ATTR_RO(stable_node_dups);
3107 
3108 static ssize_t stable_node_chains_show(struct kobject *kobj,
3109                        struct kobj_attribute *attr, char *buf)
3110 {
3111     return sysfs_emit(buf, "%lu\n", ksm_stable_node_chains);
3112 }
3113 KSM_ATTR_RO(stable_node_chains);
3114 
3115 static ssize_t
3116 stable_node_chains_prune_millisecs_show(struct kobject *kobj,
3117                     struct kobj_attribute *attr,
3118                     char *buf)
3119 {
3120     return sysfs_emit(buf, "%u\n", ksm_stable_node_chains_prune_millisecs);
3121 }
3122 
3123 static ssize_t
3124 stable_node_chains_prune_millisecs_store(struct kobject *kobj,
3125                      struct kobj_attribute *attr,
3126                      const char *buf, size_t count)
3127 {
3128     unsigned int msecs;
3129     int err;
3130 
3131     err = kstrtouint(buf, 10, &msecs);
3132     if (err)
3133         return -EINVAL;
3134 
3135     ksm_stable_node_chains_prune_millisecs = msecs;
3136 
3137     return count;
3138 }
3139 KSM_ATTR(stable_node_chains_prune_millisecs);
3140 
3141 static ssize_t full_scans_show(struct kobject *kobj,
3142                    struct kobj_attribute *attr, char *buf)
3143 {
3144     return sysfs_emit(buf, "%lu\n", ksm_scan.seqnr);
3145 }
3146 KSM_ATTR_RO(full_scans);
3147 
3148 static struct attribute *ksm_attrs[] = {
3149     &sleep_millisecs_attr.attr,
3150     &pages_to_scan_attr.attr,
3151     &run_attr.attr,
3152     &pages_shared_attr.attr,
3153     &pages_sharing_attr.attr,
3154     &pages_unshared_attr.attr,
3155     &pages_volatile_attr.attr,
3156     &full_scans_attr.attr,
3157 #ifdef CONFIG_NUMA
3158     &merge_across_nodes_attr.attr,
3159 #endif
3160     &max_page_sharing_attr.attr,
3161     &stable_node_chains_attr.attr,
3162     &stable_node_dups_attr.attr,
3163     &stable_node_chains_prune_millisecs_attr.attr,
3164     &use_zero_pages_attr.attr,
3165     NULL,
3166 };
3167 
3168 static const struct attribute_group ksm_attr_group = {
3169     .attrs = ksm_attrs,
3170     .name = "ksm",
3171 };
3172 #endif /* CONFIG_SYSFS */
3173 
3174 static int __init ksm_init(void)
3175 {
3176     struct task_struct *ksm_thread;
3177     int err;
3178 
3179     /* The correct value depends on page size and endianness */
3180     zero_checksum = calc_checksum(ZERO_PAGE(0));
3181     /* Default to false for backwards compatibility */
3182     ksm_use_zero_pages = false;
3183 
3184     err = ksm_slab_init();
3185     if (err)
3186         goto out;
3187 
3188     ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd");
3189     if (IS_ERR(ksm_thread)) {
3190         pr_err("ksm: creating kthread failed\n");
3191         err = PTR_ERR(ksm_thread);
3192         goto out_free;
3193     }
3194 
3195 #ifdef CONFIG_SYSFS
3196     err = sysfs_create_group(mm_kobj, &ksm_attr_group);
3197     if (err) {
3198         pr_err("ksm: register sysfs failed\n");
3199         kthread_stop(ksm_thread);
3200         goto out_free;
3201     }
3202 #else
3203     ksm_run = KSM_RUN_MERGE;    /* no way for user to start it */
3204 
3205 #endif /* CONFIG_SYSFS */
3206 
3207 #ifdef CONFIG_MEMORY_HOTREMOVE
3208     /* There is no significance to this priority 100 */
3209     hotplug_memory_notifier(ksm_memory_callback, 100);
3210 #endif
3211     return 0;
3212 
3213 out_free:
3214     ksm_slab_free();
3215 out:
3216     return err;
3217 }
3218 subsys_initcall(ksm_init);
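/*
 * Example (illustrative): ksm_init() only creates the ksmd thread (and,
 * with CONFIG_SYSFS, leaves it stopped until userspace writes to run);
 * ksmd then scans only those address ranges a process has registered with
 * madvise(MADV_MERGEABLE).  A minimal userspace sketch of such an opt-in,
 * assuming a kernel built with CONFIG_KSM:
 *
 *        #include <string.h>
 *        #include <sys/mman.h>
 *        #include <unistd.h>
 *
 *        int main(void)
 *        {
 *                long page = sysconf(_SC_PAGESIZE);
 *                size_t len = 64 * (size_t)page;
 *                char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *                                 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *
 *                if (buf == MAP_FAILED)
 *                        return 1;
 *                memset(buf, 0x5a, len);
 *                if (madvise(buf, len, MADV_MERGEABLE))
 *                        return 1;
 *                pause();
 *                return 0;
 *        }
 *
 * The 64 identical anonymous pages become merge candidates, and pause()
 * keeps the mapping alive while ksmd works; once it has scanned them,
 * pages_shared and pages_sharing under /sys/kernel/mm/ksm/ should reflect
 * the newly merged pages.
 */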