/*
 * Memory merging support.
 *
 * This code enables dynamic sharing of identical pages found in different
 * memory areas, as registered by madvise(MADV_MERGEABLE).
 */
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/fs.h>
#include <linux/mman.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/rwsem.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/spinlock.h>
#include <linux/xxhash.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/wait.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/memory.h>
#include <linux/mmu_notifier.h>
#include <linux/swap.h>
#include <linux/ksm.h>
#include <linux/hashtable.h>
#include <linux/freezer.h>
#include <linux/oom.h>
#include <linux/numa.h>

#include <asm/tlbflush.h>
#include "internal.h"

#ifdef CONFIG_NUMA
#define NUMA(x)  (x)
#define DO_NUMA(x)  do { (x); } while (0)
#else
#define NUMA(x)  (0)
#define DO_NUMA(x)  do { } while (0)
#endif
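
/*
 * A few notes about the KSM scanning process, to make it easier to
 * understand the data structures below:
 *
 * In order to reduce excessive scanning, KSM sorts the memory pages by their
 * contents into a data structure that holds pointers to the pages' locations.
 *
 * Since the contents of the pages may change at any moment, KSM cannot just
 * insert the pages into a normal sorted tree and expect it to find anything.
 * Therefore KSM uses two data structures - the stable and the unstable tree.
 *
 * The stable tree holds pointers to all the merged pages (ksm pages), sorted
 * by their contents.  Because each such page is write-protected, searching on
 * this tree is fully assured to be working (except when pages are unmapped),
 * and therefore this tree is called the stable tree.
 *
 * The stable tree node includes information required for reverse
 * mapping from a KSM page to virtual addresses that map this page.
 *
 * In order to avoid large latencies of the rmap walks on KSM pages,
 * KSM maintains two types of nodes in the stable tree:
 *
 * * the regular nodes that keep the reverse mapping structures in a
 *   linked list
 * * the "chains" that link nodes ("dups") that represent the same
 *   write protected memory content, but each "dup" corresponds to a
 *   different KSM page copy of that content
 *
 * Internally, the regular nodes, "dups" and "chains" are represented
 * using the same struct stable_node structure.
 *
 * In addition to the stable tree, KSM uses a second data structure called the
 * unstable tree: this tree holds pointers to pages which have been found to
 * be "unchanged for a period of time".  The unstable tree sorts these pages
 * by their contents, but since they are not write-protected, KSM cannot rely
 * upon the unstable tree to work correctly - the unstable tree is liable to
 * be corrupted as its contents are modified, and so it is called unstable.
 *
 * KSM solves this problem by several techniques:
 *
 * 1) The unstable tree is flushed every time KSM completes scanning all
 *    memory areas, and then the tree is rebuilt again from the beginning.
 * 2) KSM will only insert into the unstable tree, pages whose hash value
 *    has not changed since the previous scan of all memory areas.
 * 3) The unstable tree is a RedBlack Tree - so its balancing is based on the
 *    colors of the nodes and not on their contents, assuring that even when
 *    the tree gets "corrupted" it won't get out of balance, so scanning time
 *    remains the same (also, searching and inserting nodes in an rbtree uses
 *    the same algorithm, so we have no overhead when we flush and rebuild).
 * 4) KSM never flushes the stable tree, which means that even if it were to
 *    take 10 attempts to find a page in the unstable tree, once it is found,
 *    it is secured in the stable tree.  (When we scan a new page, we first
 *    compare it against the stable tree, and then against the unstable tree.)
 *
 * If the merge_across_nodes tunable is unset, then KSM maintains multiple
 * stable trees and multiple unstable trees: one of each for each NUMA node.
 */

/**
 * struct mm_slot - ksm information per mm that is being scanned
 * @link: link to the mm_slots hash list
 * @mm_list: link into the mm_slots list, rooted in ksm_mm_head
 * @rmap_list: head for this mm_slot's singly-linked list of rmap_items
 * @mm: the mm that this information is valid for
 */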
struct mm_slot {
        struct hlist_node link;
        struct list_head mm_list;
        struct rmap_item *rmap_list;
        struct mm_struct *mm;
};
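
/**
 * struct ksm_scan - cursor for scanning
 * @mm_slot: the current mm_slot we are scanning
 * @address: the next address inside that to be scanned
 * @rmap_list: link to the next rmap to be scanned in the rmap_list
 * @seqnr: count of completed full scans (needed when removing unstable node)
 *
 * There is only the one ksm_scan instance of this cursor structure.
 */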
struct ksm_scan {
        struct mm_slot *mm_slot;
        unsigned long address;
        struct rmap_item **rmap_list;
        unsigned long seqnr;
};
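
/**
 * struct stable_node - node of the stable rbtree
 * @node: rb node of this ksm page in the stable tree
 * @head: (overlaying parent) &migrate_nodes indicates temporarily on that list
 * @hlist_dup: linked into the stable_node->hlist with a stable_node chain
 * @list: linked into migrate_nodes, pending placement in the proper node tree
 * @hlist: hlist head of rmap_items using this ksm page
 * @kpfn: page frame number of this ksm page (perhaps temporarily on wrong nid)
 * @chain_prune_time: time of the last full garbage collection
 * @rmap_hlist_len: number of rmap_item entries in hlist or STABLE_NODE_CHAIN
 * @nid: NUMA node id of stable tree in which linked (may not match kpfn)
 */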
struct stable_node {
        union {
                struct rb_node node;    /* when node of stable tree */
                struct {                /* when listed for migration */
                        struct list_head *head;
                        struct {
                                struct hlist_node hlist_dup;
                                struct list_head list;
                        };
                };
        };
        struct hlist_head hlist;
        union {
                unsigned long kpfn;
                unsigned long chain_prune_time;
        };
        /*
         * STABLE_NODE_CHAIN can be any negative number in
         * rmap_hlist_len negative range, but better not -1 to be able
         * to reliably detect underflows.
         */
#define STABLE_NODE_CHAIN -1024
        int rmap_hlist_len;
#ifdef CONFIG_NUMA
        int nid;
#endif
};
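
/**
 * struct rmap_item - reverse mapping item for virtual addresses
 * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list
 * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
 * @nid: NUMA node id of unstable tree in which linked (may not match page)
 * @mm: the memory structure this rmap_item is pointing into
 * @address: the virtual address this rmap_item tracks (+ flags in low bits)
 * @oldchecksum: previous checksum of the page at that virtual address
 * @node: rb node of this rmap_item in the unstable tree
 * @head: pointer to stable_node heading this list in the stable tree
 * @hlist: link into hlist of rmap_items hanging off that stable_node
 */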
struct rmap_item {
        struct rmap_item *rmap_list;
        union {
                struct anon_vma *anon_vma;      /* when stable */
#ifdef CONFIG_NUMA
                int nid;                /* when node of unstable tree */
#endif
        };
        struct mm_struct *mm;
        unsigned long address;          /* + low bits used for flags below */
        unsigned int oldchecksum;       /* when unstable */
        union {
                struct rb_node node;    /* when node of unstable tree */
                struct {                /* when listed from stable tree */
                        struct stable_node *head;
                        struct hlist_node hlist;
                };
        };
};

#define SEQNR_MASK      0x0ff   /* low bits of unstable tree seqnr */
#define UNSTABLE_FLAG   0x100   /* is a node of the unstable tree */
#define STABLE_FLAG     0x200   /* is listed from the stable tree */

/* The stable and unstable tree heads */
static struct rb_root one_stable_tree[1] = { RB_ROOT };
static struct rb_root one_unstable_tree[1] = { RB_ROOT };
static struct rb_root *root_stable_tree = one_stable_tree;
static struct rb_root *root_unstable_tree = one_unstable_tree;

/* Recently migrated nodes of stable tree, pending proper placement */
static LIST_HEAD(migrate_nodes);
#define STABLE_NODE_DUP_HEAD ((struct list_head *)&migrate_nodes.prev)

#define MM_SLOTS_HASH_BITS 10
static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);

static struct mm_slot ksm_mm_head = {
        .mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list),
};
static struct ksm_scan ksm_scan = {
        .mm_slot = &ksm_mm_head,
};

static struct kmem_cache *rmap_item_cache;
static struct kmem_cache *stable_node_cache;
static struct kmem_cache *mm_slot_cache;

/* The number of nodes in the stable tree */
static unsigned long ksm_pages_shared;

/* The number of page slots additionally sharing those nodes */
static unsigned long ksm_pages_sharing;

/* The number of nodes in the unstable tree */
static unsigned long ksm_pages_unshared;

/* The number of rmap_items in use: to calculate pages_volatile */
static unsigned long ksm_rmap_items;

/* The number of stable_node chains */
static unsigned long ksm_stable_node_chains;

/* The number of stable_node dups linked to the stable_node chains */
static unsigned long ksm_stable_node_dups;

/* Delay in pruning stale stable_node_dups in the stable_node_chains */
static unsigned int ksm_stable_node_chains_prune_millisecs = 2000;

/* Maximum number of page slots sharing a stable node */
static int ksm_max_page_sharing = 256;

/* Number of pages ksmd should scan in one batch */
static unsigned int ksm_thread_pages_to_scan = 100;

/* Milliseconds ksmd should sleep between batches */
static unsigned int ksm_thread_sleep_millisecs = 20;

/* Checksum of an empty (zeroed) page */
static unsigned int zero_checksum __read_mostly;

/* Whether to merge empty (zeroed) pages with actual zero pages */
static bool ksm_use_zero_pages __read_mostly;

#ifdef CONFIG_NUMA
/* Zeroed when merging across nodes is not allowed */
static unsigned int ksm_merge_across_nodes = 1;
static int ksm_nr_node_ids = 1;
#else
#define ksm_merge_across_nodes 1U
#define ksm_nr_node_ids 1
#endif

#define KSM_RUN_STOP 0
#define KSM_RUN_MERGE 1
#define KSM_RUN_UNMERGE 2
#define KSM_RUN_OFFLINE 4
static unsigned long ksm_run = KSM_RUN_STOP;
static void wait_while_offlining(void);

static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
static DECLARE_WAIT_QUEUE_HEAD(ksm_iter_wait);
static DEFINE_MUTEX(ksm_thread_mutex);
static DEFINE_SPINLOCK(ksm_mmlist_lock);

#define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("ksm_"#__struct,\
                sizeof(struct __struct), __alignof__(struct __struct),\
                (__flags), NULL)

static int __init ksm_slab_init(void)
{
        rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0);
        if (!rmap_item_cache)
                goto out;

        stable_node_cache = KSM_KMEM_CACHE(stable_node, 0);
        if (!stable_node_cache)
                goto out_free1;

        mm_slot_cache = KSM_KMEM_CACHE(mm_slot, 0);
        if (!mm_slot_cache)
                goto out_free2;

        return 0;

out_free2:
        kmem_cache_destroy(stable_node_cache);
out_free1:
        kmem_cache_destroy(rmap_item_cache);
out:
        return -ENOMEM;
}

static void __init ksm_slab_free(void)
{
        kmem_cache_destroy(mm_slot_cache);
        kmem_cache_destroy(stable_node_cache);
        kmem_cache_destroy(rmap_item_cache);
        mm_slot_cache = NULL;
}

static __always_inline bool is_stable_node_chain(struct stable_node *chain)
{
        return chain->rmap_hlist_len == STABLE_NODE_CHAIN;
}

static __always_inline bool is_stable_node_dup(struct stable_node *dup)
{
        return dup->head == STABLE_NODE_DUP_HEAD;
}

static inline void stable_node_chain_add_dup(struct stable_node *dup,
                                             struct stable_node *chain)
{
        VM_BUG_ON(is_stable_node_dup(dup));
        dup->head = STABLE_NODE_DUP_HEAD;
        VM_BUG_ON(!is_stable_node_chain(chain));
        hlist_add_head(&dup->hlist_dup, &chain->hlist);
        ksm_stable_node_dups++;
}

static inline void __stable_node_dup_del(struct stable_node *dup)
{
        VM_BUG_ON(!is_stable_node_dup(dup));
        hlist_del(&dup->hlist_dup);
        ksm_stable_node_dups--;
}

static inline void stable_node_dup_del(struct stable_node *dup)
{
        VM_BUG_ON(is_stable_node_chain(dup));
        if (is_stable_node_dup(dup))
                __stable_node_dup_del(dup);
        else
                rb_erase(&dup->node, root_stable_tree + NUMA(dup->nid));
#ifdef CONFIG_DEBUG_VM
        dup->head = NULL;
#endif
}

static inline struct rmap_item *alloc_rmap_item(void)
{
        struct rmap_item *rmap_item;

        rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL |
                                                __GFP_NORETRY | __GFP_NOWARN);
        if (rmap_item)
                ksm_rmap_items++;
        return rmap_item;
}

static inline void free_rmap_item(struct rmap_item *rmap_item)
{
        ksm_rmap_items--;
        rmap_item->mm = NULL;
        kmem_cache_free(rmap_item_cache, rmap_item);
}

static inline struct stable_node *alloc_stable_node(void)
{
        /*
         * The allocation can take too long with GFP_KERNEL when memory is
         * under pressure, which may lead to hung task warnings.  Adding
         * __GFP_HIGH grants access to memory reserves, helping a stable_node
         * allocation to succeed promptly.
         */
        return kmem_cache_alloc(stable_node_cache, GFP_KERNEL | __GFP_HIGH);
}

static inline void free_stable_node(struct stable_node *stable_node)
{
        VM_BUG_ON(stable_node->rmap_hlist_len &&
                  !is_stable_node_chain(stable_node));
        kmem_cache_free(stable_node_cache, stable_node);
}

static inline struct mm_slot *alloc_mm_slot(void)
{
        if (!mm_slot_cache)
                return NULL;
        return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
}

static inline void free_mm_slot(struct mm_slot *mm_slot)
{
        kmem_cache_free(mm_slot_cache, mm_slot);
}

static struct mm_slot *get_mm_slot(struct mm_struct *mm)
{
        struct mm_slot *slot;

        hash_for_each_possible(mm_slots_hash, slot, link, (unsigned long)mm)
                if (slot->mm == mm)
                        return slot;

        return NULL;
}

static void insert_to_mm_slots_hash(struct mm_struct *mm,
                                    struct mm_slot *mm_slot)
{
        mm_slot->mm = mm;
        hash_add(mm_slots_hash, &mm_slot->link, (unsigned long)mm);
}

/*
 * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's
 * page tables after it has passed through ksm_exit(): a zero mm_users
 * count means exit_mmap() may already be freeing those page tables.
 */
static inline bool ksm_test_exit(struct mm_struct *mm)
{
        return atomic_read(&mm->mm_users) == 0;
}
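
/*
 * We use break_ksm to break COW on a ksm page: it's a stripped down
 *
 *      if (get_user_pages(addr, 1, FOLL_WRITE, &page, NULL) == 1)
 *              put_page(page);
 *
 * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
 * in case the application has unmapped and remapped mm,addr meanwhile.
 * Could a ksm page appear anywhere else?  Actually yes, in a VM_PFNMAP
 * mmap of /dev/mem, where we would not want to touch it.
 *
 * FAULT_FLAG_REMOTE/FOLL_REMOTE are because we do this outside the context
 * of the process that owns 'vma'.  We also do not want to enforce
 * protection keys here anyway.
 */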
static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
{
        struct page *page;
        vm_fault_t ret = 0;

        do {
                cond_resched();
                page = follow_page(vma, addr,
                                FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE);
                if (IS_ERR_OR_NULL(page) || is_zone_device_page(page))
                        break;
                if (PageKsm(page))
                        ret = handle_mm_fault(vma, addr,
                                              FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE,
                                              NULL);
                else
                        ret = VM_FAULT_WRITE;
                put_page(page);
        } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM)));
        /*
         * We must loop because handle_mm_fault() may back out if there's
         * any difficulty e.g. if pte accessed bit gets updated concurrently.
         *
         * VM_FAULT_WRITE is what we have been hoping for: it indicates that
         * COW has been broken, even if the vma does not permit VM_WRITE;
         * but note that a concurrent fault might break PageKsm for us.
         *
         * VM_FAULT_SIGBUS could occur if we race with truncation of the
         * backing file, which also invalidates anonymous pages: that's
         * okay, that failure is not an error.
         *
         * Only VM_FAULT_OOM (e.g. a fault failing against a mem_cgroup
         * limit) is worth reporting up to the caller as -ENOMEM.
         */
        return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
}

static struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm,
                unsigned long addr)
{
        struct vm_area_struct *vma;
        if (ksm_test_exit(mm))
                return NULL;
        vma = vma_lookup(mm, addr);
        if (!vma || !(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
                return NULL;
        return vma;
}

static void break_cow(struct rmap_item *rmap_item)
{
        struct mm_struct *mm = rmap_item->mm;
        unsigned long addr = rmap_item->address;
        struct vm_area_struct *vma;

        /*
         * It is not an accident that whenever we want to break COW
         * to undo, we also need to drop a reference to the anon_vma.
         */
        put_anon_vma(rmap_item->anon_vma);

        mmap_read_lock(mm);
        vma = find_mergeable_vma(mm, addr);
        if (vma)
                break_ksm(vma, addr);
        mmap_read_unlock(mm);
}

static struct page *get_mergeable_page(struct rmap_item *rmap_item)
{
        struct mm_struct *mm = rmap_item->mm;
        unsigned long addr = rmap_item->address;
        struct vm_area_struct *vma;
        struct page *page;

        mmap_read_lock(mm);
        vma = find_mergeable_vma(mm, addr);
        if (!vma)
                goto out;

        page = follow_page(vma, addr, FOLL_GET);
        if (IS_ERR_OR_NULL(page) || is_zone_device_page(page))
                goto out;
        if (PageAnon(page)) {
                flush_anon_page(vma, page, addr);
                flush_dcache_page(page);
        } else {
                put_page(page);
out:
                page = NULL;
        }
        mmap_read_unlock(mm);
        return page;
}

/*
 * With merge_across_nodes set (the default), all pages live in the node 0
 * trees; when it is unset, each NUMA node has its own pair of trees, and
 * a KSM page is looked up by the node id of its page frame number.
 */
static inline int get_kpfn_nid(unsigned long kpfn)
{
        return ksm_merge_across_nodes ? 0 : NUMA(pfn_to_nid(kpfn));
}

static struct stable_node *alloc_stable_node_chain(struct stable_node *dup,
                                                   struct rb_root *root)
{
        struct stable_node *chain = alloc_stable_node();
        VM_BUG_ON(is_stable_node_chain(dup));
        if (likely(chain)) {
                INIT_HLIST_HEAD(&chain->hlist);
                chain->chain_prune_time = jiffies;
                chain->rmap_hlist_len = STABLE_NODE_CHAIN;
#if defined(CONFIG_DEBUG_VM) && defined(CONFIG_NUMA)
                chain->nid = NUMA_NO_NODE;      /* debug */
#endif
                ksm_stable_node_chains++;

                /*
                 * Put the stable node chain in the first dimension of
                 * the stable tree and at the same time remove the old
                 * stable node.
                 */
                rb_replace_node(&dup->node, &chain->node, root);

                /*
                 * Move the old stable node to the second dimension
                 * queued in the hlist_dup. The invariant is that all
                 * dup stable_nodes in the chain->hlist point to pages
                 * that are write protected and have the exact same
                 * content.
                 */
                stable_node_chain_add_dup(dup, chain);
        }
        return chain;
}

static inline void free_stable_node_chain(struct stable_node *chain,
                                          struct rb_root *root)
{
        rb_erase(&chain->node, root);
        free_stable_node(chain);
        ksm_stable_node_chains--;
}

static void remove_node_from_stable_tree(struct stable_node *stable_node)
{
        struct rmap_item *rmap_item;

        /* check it's not STABLE_NODE_CHAIN or negative */
        BUG_ON(stable_node->rmap_hlist_len < 0);

        hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
                if (rmap_item->hlist.next)
                        ksm_pages_sharing--;
                else
                        ksm_pages_shared--;

                rmap_item->mm->ksm_merging_pages--;

                VM_BUG_ON(stable_node->rmap_hlist_len <= 0);
                stable_node->rmap_hlist_len--;
                put_anon_vma(rmap_item->anon_vma);
                rmap_item->address &= PAGE_MASK;
                cond_resched();
        }

        /*
         * STABLE_NODE_DUP_HEAD is the second pointer inside migrate_nodes:
         * these build-time checks guarantee it is an address strictly
         * within that object, so it can never be confused with
         * &migrate_nodes itself or with any stable tree node.
         */
        BUILD_BUG_ON(STABLE_NODE_DUP_HEAD <= &migrate_nodes);
        BUILD_BUG_ON(STABLE_NODE_DUP_HEAD >= &migrate_nodes + 1);

        if (stable_node->head == &migrate_nodes)
                list_del(&stable_node->list);
        else
                stable_node_dup_del(stable_node);
        free_stable_node(stable_node);
}

enum get_ksm_page_flags {
        GET_KSM_PAGE_NOLOCK,
        GET_KSM_PAGE_LOCK,
        GET_KSM_PAGE_TRYLOCK
};
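
/*
 * get_ksm_page: checks if the page indicated by the stable node
 * is still its ksm page, despite having held no reference to it.
 * In which case we can trust the content of the page, and it
 * returns the gotten page; but if the page has now been zapped,
 * remove the stale node from the stable tree and return NULL.
 * But beware, the stable node's page might be being migrated.
 *
 * You would expect the stable_node to hold a reference to the ksm page.
 * But if it increments the page's count, swapping out has to wait for
 * ksmd to come around again before it can free the page, which may take
 * seconds or even minutes: much too unhealthy a state to rely upon.  So
 * the stable node holds no reference at all, and instead revalidates
 * the page's mapping and refcount each time, as done below.
 */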
static struct page *get_ksm_page(struct stable_node *stable_node,
                                 enum get_ksm_page_flags flags)
{
        struct page *page;
        void *expected_mapping;
        unsigned long kpfn;

        expected_mapping = (void *)((unsigned long)stable_node |
                                        PAGE_MAPPING_KSM);
again:
        kpfn = READ_ONCE(stable_node->kpfn); /* Address dependency. */
        page = pfn_to_page(kpfn);
        if (READ_ONCE(page->mapping) != expected_mapping)
                goto stale;

        /*
         * We cannot do anything with the page while its refcount is 0.
         * Usually 0 means free, or tail of a higher-order page: in which
         * case this node is no longer referenced, and should be freed;
         * however, it might mean that the page is under page_ref_freeze().
         * The __remove_mapping() case is easy, again the node is now stale;
         * but if page is swapcache in folio_migrate_mapping(), it might
         * still be our page, in which case it's essential to keep the node.
         */
        while (!get_page_unless_zero(page)) {
                /*
                 * Another check for page->mapping != expected_mapping would
                 * work here too.  We have chosen the !PageSwapCache test to
                 * optimize the common case, when the page is or is about to
                 * be freed: PageSwapCache is cleared (under spin_lock_irq)
                 * in the ref_freeze section of __remove_mapping(); but if
                 * page is swapcache in folio_migrate_mapping(), it might
                 * still be our page, in which case it's essential to keep
                 * the node.
                 */
                if (!PageSwapCache(page))
                        goto stale;
                cpu_relax();
        }

        if (READ_ONCE(page->mapping) != expected_mapping) {
                put_page(page);
                goto stale;
        }

        if (flags == GET_KSM_PAGE_TRYLOCK) {
                if (!trylock_page(page)) {
                        put_page(page);
                        return ERR_PTR(-EBUSY);
                }
        } else if (flags == GET_KSM_PAGE_LOCK)
                lock_page(page);

        if (flags != GET_KSM_PAGE_NOLOCK) {
                if (READ_ONCE(page->mapping) != expected_mapping) {
                        unlock_page(page);
                        put_page(page);
                        goto stale;
                }
        }
        return page;

stale:
        /*
         * We come here from above when page->mapping or !PageSwapCache
         * suggests that the node is stale; but it might be under migration.
         * We need smp_rmb(), matching the smp_wmb() in folio_migrate_ksm(),
         * before checking whether node->kpfn has been changed.
         */
        smp_rmb();
        if (READ_ONCE(stable_node->kpfn) != kpfn)
                goto again;
        remove_node_from_stable_tree(stable_node);
        return NULL;
}
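
/*
 * Removing rmap_item from stable or unstable tree.
 * This function will clean the information from the stable/unstable tree.
 */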
static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
{
        if (rmap_item->address & STABLE_FLAG) {
                struct stable_node *stable_node;
                struct page *page;

                stable_node = rmap_item->head;
                page = get_ksm_page(stable_node, GET_KSM_PAGE_LOCK);
                if (!page)
                        goto out;

                hlist_del(&rmap_item->hlist);
                unlock_page(page);
                put_page(page);

                if (!hlist_empty(&stable_node->hlist))
                        ksm_pages_sharing--;
                else
                        ksm_pages_shared--;

                rmap_item->mm->ksm_merging_pages--;

                VM_BUG_ON(stable_node->rmap_hlist_len <= 0);
                stable_node->rmap_hlist_len--;

                put_anon_vma(rmap_item->anon_vma);
                rmap_item->head = NULL;
                rmap_item->address &= PAGE_MASK;

        } else if (rmap_item->address & UNSTABLE_FLAG) {
                unsigned char age;
                /*
                 * Usually ksmd can and must skip the rb_erase, because
                 * root_unstable_tree was already reset to RB_ROOT.
                 * But be careful when an mm is exiting: do the rb_erase
                 * if this rmap_item was inserted by this scan, rather
                 * than left over from before.
                 */
                age = (unsigned char)(ksm_scan.seqnr - rmap_item->address);
                BUG_ON(age > 1);
                if (!age)
                        rb_erase(&rmap_item->node,
                                 root_unstable_tree + NUMA(rmap_item->nid));
                ksm_pages_unshared--;
                rmap_item->address &= PAGE_MASK;
        }
out:
        cond_resched();         /* we're called from many long loops */
}

static void remove_trailing_rmap_items(struct rmap_item **rmap_list)
{
        while (*rmap_list) {
                struct rmap_item *rmap_item = *rmap_list;
                *rmap_list = rmap_item->rmap_list;
                remove_rmap_item_from_tree(rmap_item);
                free_rmap_item(rmap_item);
        }
}
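
/*
 * Though it's very tempting to unmerge rmap_items from stable tree rather
 * than check every pte of a given vma, the locking doesn't quite work for
 * that - an rmap_item is assigned to the stable tree after inserting ksm
 * page and upping mmap_lock.  Nor does it fit with the way we skip dup'ing
 * rmap_items from parent to child at fork time (so as not to waste time
 * if exit comes before the next scan reaches it).
 *
 * Similarly, although we'd like to remove rmap_items (so updating counts
 * and freeing memory) when unmerging an area, it's easier to leave that
 * to the next pass of ksmd - consider, for example, how ksmd might be
 * in cmp_and_merge_page on one of the rmap_items we would be removing.
 */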
static int unmerge_ksm_pages(struct vm_area_struct *vma,
                             unsigned long start, unsigned long end)
{
        unsigned long addr;
        int err = 0;

        for (addr = start; addr < end && !err; addr += PAGE_SIZE) {
                if (ksm_test_exit(vma->vm_mm))
                        break;
                if (signal_pending(current))
                        err = -ERESTARTSYS;
                else
                        err = break_ksm(vma, addr);
        }
        return err;
}

static inline struct stable_node *folio_stable_node(struct folio *folio)
{
        return folio_test_ksm(folio) ? folio_raw_mapping(folio) : NULL;
}

static inline struct stable_node *page_stable_node(struct page *page)
{
        return folio_stable_node(page_folio(page));
}

static inline void set_page_stable_node(struct page *page,
                                        struct stable_node *stable_node)
{
        VM_BUG_ON_PAGE(PageAnon(page) && PageAnonExclusive(page), page);
        page->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM);
}

#ifdef CONFIG_SYSFS
/*
 * Only called through the sysfs control interface:
 */
static int remove_stable_node(struct stable_node *stable_node)
{
        struct page *page;
        int err;

        page = get_ksm_page(stable_node, GET_KSM_PAGE_LOCK);
        if (!page) {
                /*
                 * get_ksm_page did remove_node_from_stable_tree itself.
                 */
                return 0;
        }

        /*
         * Page could be still mapped if this races with __mmput() running in
         * between ksm_exit() and exit_mmap(). Just refuse to let
         * merge_across_nodes/max_page_sharing be switched.
         */
        err = -EBUSY;
        if (!page_mapped(page)) {
                /*
                 * The stable node did not yet appear stale to get_ksm_page(),
                 * since that allows for an unmapped ksm page to be recognized
                 * right up until it is freed; but the node is safe to remove.
                 * This page might be in a pagevec waiting to be freed,
                 * or it might be PageSwapCache (perhaps under writeback),
                 * or it might have been removed from swapcache a moment ago.
                 */
                set_page_stable_node(page, NULL);
                remove_node_from_stable_tree(stable_node);
                err = 0;
        }

        unlock_page(page);
        put_page(page);
        return err;
}

static int remove_stable_node_chain(struct stable_node *stable_node,
                                    struct rb_root *root)
{
        struct stable_node *dup;
        struct hlist_node *hlist_safe;

        if (!is_stable_node_chain(stable_node)) {
                VM_BUG_ON(is_stable_node_dup(stable_node));
                if (remove_stable_node(stable_node))
                        return true;
                else
                        return false;
        }

        hlist_for_each_entry_safe(dup, hlist_safe,
                                  &stable_node->hlist, hlist_dup) {
                VM_BUG_ON(!is_stable_node_dup(dup));
                if (remove_stable_node(dup))
                        return true;
        }
        BUG_ON(!hlist_empty(&stable_node->hlist));
        free_stable_node_chain(stable_node, root);
        return false;
}

static int remove_all_stable_nodes(void)
{
        struct stable_node *stable_node, *next;
        int nid;
        int err = 0;

        for (nid = 0; nid < ksm_nr_node_ids; nid++) {
                while (root_stable_tree[nid].rb_node) {
                        stable_node = rb_entry(root_stable_tree[nid].rb_node,
                                                struct stable_node, node);
                        if (remove_stable_node_chain(stable_node,
                                                     root_stable_tree + nid)) {
                                err = -EBUSY;
                                break;  /* proceed to next nid */
                        }
                        cond_resched();
                }
        }
        list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) {
                if (remove_stable_node(stable_node))
                        err = -EBUSY;
                cond_resched();
        }
        return err;
}

static int unmerge_and_remove_all_rmap_items(void)
{
        struct mm_slot *mm_slot;
        struct mm_struct *mm;
        struct vm_area_struct *vma;
        int err = 0;

        spin_lock(&ksm_mmlist_lock);
        ksm_scan.mm_slot = list_entry(ksm_mm_head.mm_list.next,
                                                struct mm_slot, mm_list);
        spin_unlock(&ksm_mmlist_lock);

        for (mm_slot = ksm_scan.mm_slot;
                        mm_slot != &ksm_mm_head; mm_slot = ksm_scan.mm_slot) {
                mm = mm_slot->mm;
                mmap_read_lock(mm);
                for (vma = mm->mmap; vma; vma = vma->vm_next) {
                        if (ksm_test_exit(mm))
                                break;
                        if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
                                continue;
                        err = unmerge_ksm_pages(vma,
                                                vma->vm_start, vma->vm_end);
                        if (err)
                                goto error;
                }

                remove_trailing_rmap_items(&mm_slot->rmap_list);
                mmap_read_unlock(mm);

                spin_lock(&ksm_mmlist_lock);
                ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next,
                                                struct mm_slot, mm_list);
                if (ksm_test_exit(mm)) {
                        hash_del(&mm_slot->link);
                        list_del(&mm_slot->mm_list);
                        spin_unlock(&ksm_mmlist_lock);

                        free_mm_slot(mm_slot);
                        clear_bit(MMF_VM_MERGEABLE, &mm->flags);
                        mmdrop(mm);
                } else
                        spin_unlock(&ksm_mmlist_lock);
        }

        /* Clean up stable nodes, but don't worry if some are still busy */
        remove_all_stable_nodes();
        ksm_scan.seqnr = 0;
        return 0;

error:
        mmap_read_unlock(mm);
        spin_lock(&ksm_mmlist_lock);
        ksm_scan.mm_slot = &ksm_mm_head;
        spin_unlock(&ksm_mmlist_lock);
        return err;
}
#endif /* CONFIG_SYSFS */

static u32 calc_checksum(struct page *page)
{
        u32 checksum;
        void *addr = kmap_atomic(page);
        checksum = xxhash(addr, PAGE_SIZE, 0);
        kunmap_atomic(addr);
        return checksum;
}

static int write_protect_page(struct vm_area_struct *vma, struct page *page,
                              pte_t *orig_pte)
{
        struct mm_struct *mm = vma->vm_mm;
        DEFINE_PAGE_VMA_WALK(pvmw, page, vma, 0, 0);
        int swapped;
        int err = -EFAULT;
        struct mmu_notifier_range range;
        bool anon_exclusive;

        pvmw.address = page_address_in_vma(page, vma);
        if (pvmw.address == -EFAULT)
                goto out;

        BUG_ON(PageTransCompound(page));

        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
                                pvmw.address,
                                pvmw.address + PAGE_SIZE);
        mmu_notifier_invalidate_range_start(&range);

        if (!page_vma_mapped_walk(&pvmw))
                goto out_mn;
        if (WARN_ONCE(!pvmw.pte, "Unexpected PMD mapping?"))
                goto out_unlock;

        anon_exclusive = PageAnonExclusive(page);
        if (pte_write(*pvmw.pte) || pte_dirty(*pvmw.pte) ||
            (pte_protnone(*pvmw.pte) && pte_savedwrite(*pvmw.pte)) ||
            anon_exclusive || mm_tlb_flush_pending(mm)) {
                pte_t entry;

                swapped = PageSwapCache(page);
                flush_cache_page(vma, pvmw.address, page_to_pfn(page));
                /*
                 * Ok this is tricky, when get_user_pages_fast() run it doesn't
                 * take any lock, therefore the check that we are going to make
                 * with the pagecount against the mapcount is racy and
                 * O_DIRECT can happen right after the check.
                 * So we clear the pte and flush the tlb before the check
                 * this assure us that no O_DIRECT can happen after the check
                 * or in the middle of the check.
                 *
                 * No need to notify as we are downgrading page table to read
                 * only not changing it to point to a new page.
                 *
                 * See Documentation/mm/mmu_notifier.rst
                 */
                entry = ptep_clear_flush(vma, pvmw.address, pvmw.pte);
                /*
                 * Check that no O_DIRECT or similar I/O is in progress on the
                 * page
                 */
                if (page_mapcount(page) + 1 + swapped != page_count(page)) {
                        set_pte_at(mm, pvmw.address, pvmw.pte, entry);
                        goto out_unlock;
                }

                /* See page_try_share_anon_rmap(): clear PTE first. */
                if (anon_exclusive && page_try_share_anon_rmap(page)) {
                        set_pte_at(mm, pvmw.address, pvmw.pte, entry);
                        goto out_unlock;
                }

                if (pte_dirty(entry))
                        set_page_dirty(page);

                if (pte_protnone(entry))
                        entry = pte_mkclean(pte_clear_savedwrite(entry));
                else
                        entry = pte_mkclean(pte_wrprotect(entry));
                set_pte_at_notify(mm, pvmw.address, pvmw.pte, entry);
        }
        *orig_pte = *pvmw.pte;
        err = 0;

out_unlock:
        page_vma_mapped_walk_done(&pvmw);
out_mn:
        mmu_notifier_invalidate_range_end(&range);
out:
        return err;
}
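
/**
 * replace_page - replace page in vma by new ksm page
 * @vma:      vma that holds the pte pointing to page
 * @page:     the page we are replacing by kpage
 * @kpage:    the ksm page we replace page by
 * @orig_pte: the original value of the pte
 *
 * Returns 0 on success, -EFAULT on failure.
 */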
static int replace_page(struct vm_area_struct *vma, struct page *page,
                        struct page *kpage, pte_t orig_pte)
{
        struct mm_struct *mm = vma->vm_mm;
        pmd_t *pmd;
        pte_t *ptep;
        pte_t newpte;
        spinlock_t *ptl;
        unsigned long addr;
        int err = -EFAULT;
        struct mmu_notifier_range range;

        addr = page_address_in_vma(page, vma);
        if (addr == -EFAULT)
                goto out;

        pmd = mm_find_pmd(mm, addr);
        if (!pmd)
                goto out;

        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr,
                                addr + PAGE_SIZE);
        mmu_notifier_invalidate_range_start(&range);

        ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
        if (!pte_same(*ptep, orig_pte)) {
                pte_unmap_unlock(ptep, ptl);
                goto out_mn;
        }
        VM_BUG_ON_PAGE(PageAnonExclusive(page), page);
        VM_BUG_ON_PAGE(PageAnon(kpage) && PageAnonExclusive(kpage), kpage);

        /*
         * No need to check ksm_use_zero_pages here: we can't have a
         * zero_page here if ksm_use_zero_pages was not set to true.
         */
        if (!is_zero_pfn(page_to_pfn(kpage))) {
                get_page(kpage);
                page_add_anon_rmap(kpage, vma, addr, RMAP_NONE);
                newpte = mk_pte(kpage, vma->vm_page_prot);
        } else {
                newpte = pte_mkspecial(pfn_pte(page_to_pfn(kpage),
                                               vma->vm_page_prot));
                /*
                 * We're replacing an anonymous page with a zero page, which is
                 * not anonymous. We need to do proper accounting otherwise we
                 * will get wrong values in /proc, and a BUG message in dmesg
                 * when tearing down the mm.
                 */
                dec_mm_counter(mm, MM_ANONPAGES);
        }

        flush_cache_page(vma, addr, pte_pfn(*ptep));
        /*
         * No need to notify as we are replacing a read only page with another
         * read only page with the same content.
         *
         * See Documentation/mm/mmu_notifier.rst
         */
        ptep_clear_flush(vma, addr, ptep);
        set_pte_at_notify(mm, addr, ptep, newpte);

        page_remove_rmap(page, vma, false);
        if (!page_mapped(page))
                try_to_free_swap(page);
        put_page(page);

        pte_unmap_unlock(ptep, ptl);
        err = 0;
out_mn:
        mmu_notifier_invalidate_range_end(&range);
out:
        return err;
}
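
/*
 * try_to_merge_one_page - take two pages and merge them into one
 * @vma: the vma that holds the pte pointing to page
 * @page: the PageAnon page that we want to replace with kpage
 * @kpage: the PageKsm page that we want to map instead of page,
 *         or NULL the first time when we want to use page as kpage.
 *
 * This function returns 0 if the pages were merged, -EFAULT otherwise.
 */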
static int try_to_merge_one_page(struct vm_area_struct *vma,
                                 struct page *page, struct page *kpage)
{
        pte_t orig_pte = __pte(0);
        int err = -EFAULT;

        if (page == kpage)                      /* ksm page forked */
                return 0;

        if (!PageAnon(page))
                goto out;

        /*
         * We need the page lock to read a stable PageSwapCache in
         * write_protect_page().  We use trylock_page() instead of
         * lock_page() because we don't want to wait here: a locked
         * page is simply skipped, and revisited on a later scan.
         */
        if (!trylock_page(page))
                goto out;

        if (PageTransCompound(page)) {
                if (split_huge_page(page))
                        goto out_unlock;
        }

        /*
         * If this anonymous page is mapped only here, its pte may need
         * to be write-protected.  If it's mapped elsewhere, all of its
         * ptes are necessarily already write-protected.  But in either
         * case, we need to lock and check page_count is not raised.
         */
        if (write_protect_page(vma, page, &orig_pte) == 0) {
                if (!kpage) {
                        /*
                         * While we hold page lock, upgrade page from
                         * PageAnon+anon_vma to PageKsm+NULL stable_node:
                         * stable_tree_insert() will update stable_node.
                         */
                        set_page_stable_node(page, NULL);
                        mark_page_accessed(page);
                        /*
                         * Page reclaim just frees a clean page with no dirty
                         * ptes: the KSM page should be accounted dirty, but
                         * not necessarily written out to disk.
                         */
                        if (!PageDirty(page))
                                SetPageDirty(page);
                        err = 0;
                } else if (pages_identical(page, kpage))
                        err = replace_page(vma, page, kpage, orig_pte);
        }

out_unlock:
        unlock_page(page);
out:
        return err;
}
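
/*
 * try_to_merge_with_ksm_page - like try_to_merge_two_pages,
 * but no new page is allocated: kpage must already be a ksm page.
 *
 * This function returns 0 if the pages were merged, -EFAULT otherwise.
 */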
static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item,
                                      struct page *page, struct page *kpage)
{
        struct mm_struct *mm = rmap_item->mm;
        struct vm_area_struct *vma;
        int err = -EFAULT;

        mmap_read_lock(mm);
        vma = find_mergeable_vma(mm, rmap_item->address);
        if (!vma)
                goto out;

        err = try_to_merge_one_page(vma, page, kpage);
        if (err)
                goto out;

        /* Unstable nid is in union with stable anon_vma: remove first */
        remove_rmap_item_from_tree(rmap_item);

        /* Must get reference to anon_vma while still holding mmap_lock */
        rmap_item->anon_vma = vma->anon_vma;
        get_anon_vma(vma->anon_vma);
out:
        mmap_read_unlock(mm);
        return err;
}
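
/*
 * try_to_merge_two_pages - take two identical pages and prepare
 * them to be merged into one page.
 *
 * This function returns the kpage if we successfully merged two
 * identical pages into one ksm page, NULL otherwise.
 *
 * Note that this function upgrades page to ksm page: if one of the pages
 * is already a ksm page, try_to_merge_with_ksm_page should be used.
 */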
static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item,
                                           struct page *page,
                                           struct rmap_item *tree_rmap_item,
                                           struct page *tree_page)
{
        int err;

        err = try_to_merge_with_ksm_page(rmap_item, page, NULL);
        if (!err) {
                err = try_to_merge_with_ksm_page(tree_rmap_item,
                                                        tree_page, page);
                /*
                 * If that fails, we have a ksm page with only one pte
                 * pointing to it: so break it.
                 */
                if (err)
                        break_cow(rmap_item);
        }
        return err ? NULL : page;
}

static __always_inline
bool __is_page_sharing_candidate(struct stable_node *stable_node, int offset)
{
        VM_BUG_ON(stable_node->rmap_hlist_len < 0);
        /*
         * Check that at least one mapping still exists, otherwise
         * there's no much point to merge and share with this
         * stable_node, as the underlying tree_page of the other
         * sharer is going to be freed soon.
         */
        return stable_node->rmap_hlist_len &&
                stable_node->rmap_hlist_len + offset < ksm_max_page_sharing;
}

static __always_inline
bool is_page_sharing_candidate(struct stable_node *stable_node)
{
        return __is_page_sharing_candidate(stable_node, 0);
}

static struct page *stable_node_dup(struct stable_node **_stable_node_dup,
                                    struct stable_node **_stable_node,
                                    struct rb_root *root,
                                    bool prune_stale_stable_nodes)
{
        struct stable_node *dup, *found = NULL, *stable_node = *_stable_node;
        struct hlist_node *hlist_safe;
        struct page *_tree_page, *tree_page = NULL;
        int nr = 0;
        int found_rmap_hlist_len;

        if (!prune_stale_stable_nodes ||
            time_before(jiffies, stable_node->chain_prune_time +
                        msecs_to_jiffies(
                                ksm_stable_node_chains_prune_millisecs)))
                prune_stale_stable_nodes = false;
        else
                stable_node->chain_prune_time = jiffies;

        hlist_for_each_entry_safe(dup, hlist_safe,
                                  &stable_node->hlist, hlist_dup) {
                cond_resched();
                /*
                 * We must walk all stable_node_dups to prune the stale
                 * stable nodes during lookup.
                 *
                 * get_ksm_page can drop the nodes from the
                 * stable_node->hlist if they point to freed pages
                 * (that's why we do a _safe walk). The "dup"
                 * stable_node parameter itself will be freed from
                 * under us if it returns NULL.
                 */
                _tree_page = get_ksm_page(dup, GET_KSM_PAGE_NOLOCK);
                if (!_tree_page)
                        continue;
                nr += 1;
                if (is_page_sharing_candidate(dup)) {
                        if (!found ||
                            dup->rmap_hlist_len > found_rmap_hlist_len) {
                                if (found)
                                        put_page(tree_page);
                                found = dup;
                                found_rmap_hlist_len = found->rmap_hlist_len;
                                tree_page = _tree_page;

                                /* skip put_page for found dup */
                                if (!prune_stale_stable_nodes)
                                        break;
                                continue;
                        }
                }
                put_page(_tree_page);
        }

        if (found) {
                /*
                 * nr is counting all dups in the chain only if
                 * prune_stale_stable_nodes is true, otherwise we may
                 * break the loop at nr == 1 even if there are
                 * multiple entries.
                 */
                if (prune_stale_stable_nodes && nr == 1) {
                        /*
                         * If there's not just one entry it would
                         * corrupt memory, better BUG_ON. In KSM
                         * context with no lock held it's not even
                         * fatal.
                         */
                        BUG_ON(stable_node->hlist.first->next);

                        /*
                         * There's just one entry and it is below the
                         * deduplication limit so drop the chain.
                         */
                        rb_replace_node(&stable_node->node, &found->node,
                                        root);
                        free_stable_node(stable_node);
                        ksm_stable_node_chains--;
                        ksm_stable_node_dups--;
                        /*
                         * NOTE: the caller depends on the stable_node
                         * to be equal to stable_node_dup if the chain
                         * was collapsed.
                         */
                        *_stable_node = found;
                        /*
                         * Just for robustness, as stable_node is
                         * otherwise left as a stable pointer, the
                         * compiler shall optimize it away at build
                         * time.
                         */
                        stable_node = NULL;
                } else if (stable_node->hlist.first != &found->hlist_dup &&
                           __is_page_sharing_candidate(found, 1)) {
                        /*
                         * If the found stable_node dup can accept one
                         * more future merge (in addition to the one
                         * that is underway) and is not at the head of
                         * the chain, put it there so next search will
                         * be quicker in the !prune_stale_stable_nodes
                         * case.
                         *
                         * NOTE: it would be inaccurate to use nr > 1
                         * instead of checking the hlist head pointer
                         * directly, because in the
                         * prune_stale_stable_nodes case "nr" isn't
                         * the position of the found dup in the chain,
                         * but the total number of dups in the chain.
                         */
                        hlist_del(&found->hlist_dup);
                        hlist_add_head(&found->hlist_dup,
                                       &stable_node->hlist);
                }
        }

        *_stable_node_dup = found;
        return tree_page;
}

static struct stable_node *stable_node_dup_any(struct stable_node *stable_node,
                                               struct rb_root *root)
{
        if (!is_stable_node_chain(stable_node))
                return stable_node;
        if (hlist_empty(&stable_node->hlist)) {
                free_stable_node_chain(stable_node, root);
                return NULL;
        }
        return hlist_entry(stable_node->hlist.first,
                           typeof(*stable_node), hlist_dup);
}
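
/*
 * Like for get_ksm_page, this function can free the *_stable_node and
 * *_stable_node_dup if the returned tree_page is NULL.
 *
 * It can also free and overwrite *_stable_node with the found
 * stable_node_dup if the chain is collapsed (in which case
 * *_stable_node will be equal to *_stable_node_dup like if the chain
 * never existed). It's up to the caller to verify tree_page is not
 * NULL before dereferencing *_stable_node or *_stable_node_dup.
 *
 * *_stable_node and *_stable_node_dup are meaningless if tree_page is NULL.
 */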
static struct page *__stable_node_chain(struct stable_node **_stable_node_dup,
                                        struct stable_node **_stable_node,
                                        struct rb_root *root,
                                        bool prune_stale_stable_nodes)
{
        struct stable_node *stable_node = *_stable_node;
        if (!is_stable_node_chain(stable_node)) {
                if (is_page_sharing_candidate(stable_node)) {
                        *_stable_node_dup = stable_node;
                        return get_ksm_page(stable_node, GET_KSM_PAGE_NOLOCK);
                }
                /*
                 * _stable_node_dup set to NULL means the stable_node
                 * reached the ksm_max_page_sharing limit.
                 */
                *_stable_node_dup = NULL;
                return NULL;
        }
        return stable_node_dup(_stable_node_dup, _stable_node, root,
                               prune_stale_stable_nodes);
}

static __always_inline struct page *chain_prune(struct stable_node **s_n_d,
                                                struct stable_node **s_n,
                                                struct rb_root *root)
{
        return __stable_node_chain(s_n_d, s_n, root, true);
}

static __always_inline struct page *chain(struct stable_node **s_n_d,
                                          struct stable_node *s_n,
                                          struct rb_root *root)
{
        struct stable_node *old_stable_node = s_n;
        struct page *tree_page;

        tree_page = __stable_node_chain(s_n_d, &s_n, root, false);
        /* not pruning dups so s_n cannot have changed */
        VM_BUG_ON(s_n != old_stable_node);
        return tree_page;
}
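
/*
 * stable_tree_search - search for page inside the stable tree
 *
 * This function checks if there is a page inside the stable tree
 * with identical content to the page that we are scanning right now.
 *
 * This function returns the stable tree node of identical content if found,
 * NULL otherwise.
 */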
static struct page *stable_tree_search(struct page *page)
{
        int nid;
        struct rb_root *root;
        struct rb_node **new;
        struct rb_node *parent;
        struct stable_node *stable_node, *stable_node_dup, *stable_node_any;
        struct stable_node *page_node;

        page_node = page_stable_node(page);
        if (page_node && page_node->head != &migrate_nodes) {
                /* ksm page forked */
                get_page(page);
                return page;
        }

        nid = get_kpfn_nid(page_to_pfn(page));
        root = root_stable_tree + nid;
again:
        new = &root->rb_node;
        parent = NULL;

        while (*new) {
                struct page *tree_page;
                int ret;

                cond_resched();
                stable_node = rb_entry(*new, struct stable_node, node);
                stable_node_any = NULL;
                tree_page = chain_prune(&stable_node_dup, &stable_node, root);
                /*
                 * NOTE: stable_node may have been freed by
                 * chain_prune() if the returned stable_node_dup is
                 * not NULL. stable_node_dup may have been inserted in
                 * the rbtree instead of the stable_node, so only
                 * dereference these pointers as chain_prune() has
                 * left them.
                 */
                if (!stable_node_dup) {
                        /*
                         * Either all stable_node dups were full in
                         * this stable_node chain, or this chain was
                         * empty and should be rb_erased.
                         */
                        stable_node_any = stable_node_dup_any(stable_node,
                                                              root);
                        if (!stable_node_any) {
                                /* rb_erase just run */
                                goto again;
                        }
                        /*
                         * Take any of the stable_node dups page of
                         * this stable_node chain to let the tree walk
                         * continue. All KSM pages belonging to the
                         * stable_node dups in a stable_node chain
                         * have the same content and they're
                         * write protected at all times, so it doesn't
                         * matter which dup's page we compare against.
                         */
                        tree_page = get_ksm_page(stable_node_any,
                                                 GET_KSM_PAGE_NOLOCK);
                }
                VM_BUG_ON(!stable_node_dup ^ !!stable_node_any);
                if (!tree_page) {
                        /*
                         * If we walked over a stale stable_node,
                         * get_ksm_page() will call rb_erase() and it
                         * may rebalance the tree from under us. So
                         * restart the search from scratch. Returning
                         * NULL would be safe too, but we'd generate
                         * false negative insertions just because some
                         * stable_node was stale.
                         */
                        goto again;
                }

                ret = memcmp_pages(page, tree_page);
                put_page(tree_page);

                parent = *new;
                if (ret < 0)
                        new = &parent->rb_left;
                else if (ret > 0)
                        new = &parent->rb_right;
                else {
                        if (page_node) {
                                VM_BUG_ON(page_node->head != &migrate_nodes);
                                /*
                                 * Test if the migrated page should be merged
                                 * into a stable node dup. If the mapcount is
                                 * 1 we can migrate it with another KSM page
                                 * without adding it to the chain.
                                 */
                                if (page_mapcount(page) > 1)
                                        goto chain_append;
                        }

                        if (!stable_node_dup) {
                                /*
                                 * If the stable_node is a chain and
                                 * we got a payload match in memcmp
                                 * but we cannot merge the scanned
                                 * page in any of the existing
                                 * stable_node dups because they're
                                 * all full, we need to wait the
                                 * scanned page to find itself a match
                                 * in the unstable tree to create a
                                 * brand new KSM page to add later to
                                 * the dups of this stable_node.
                                 */
                                return NULL;
                        }

                        /*
                         * Lock and unlock the stable_node's page (which
                         * might already have been migrated) so that page
                         * migration is sure to notice its raised count.
                         * It would be more elegant to return stable_node
                         * than kpage, but that involves more changes.
                         */
                        tree_page = get_ksm_page(stable_node_dup,
                                                 GET_KSM_PAGE_TRYLOCK);

                        if (PTR_ERR(tree_page) == -EBUSY)
                                return ERR_PTR(-EBUSY);

                        if (unlikely(!tree_page))
                                /*
                                 * The tree may have been rebalanced,
                                 * so re-evaluate parent and new.
                                 */
                                goto again;
                        unlock_page(tree_page);

                        if (get_kpfn_nid(stable_node_dup->kpfn) !=
                            NUMA(stable_node_dup->nid)) {
                                put_page(tree_page);
                                goto replace;
                        }
                        return tree_page;
                }
        }

        if (!page_node)
                return NULL;

        list_del(&page_node->list);
        DO_NUMA(page_node->nid = nid);
        rb_link_node(&page_node->node, parent, new);
        rb_insert_color(&page_node->node, root);
out:
        if (is_page_sharing_candidate(page_node)) {
                get_page(page);
                return page;
        } else
                return NULL;

replace:
        /*
         * If stable_node was a chain and chain_prune collapsed it,
         * stable_node has been updated to be the new regular
         * stable_node. A collapse of the chain is indistinguishable
         * from the case there was no chain in the stable
         * rbtree. Otherwise stable_node is the chain and
         * stable_node_dup is the dup to replace.
         */
        if (stable_node_dup == stable_node) {
                VM_BUG_ON(is_stable_node_chain(stable_node_dup));
                VM_BUG_ON(is_stable_node_dup(stable_node_dup));
                /* there is no chain */
                if (page_node) {
                        VM_BUG_ON(page_node->head != &migrate_nodes);
                        list_del(&page_node->list);
                        DO_NUMA(page_node->nid = nid);
                        rb_replace_node(&stable_node_dup->node,
                                        &page_node->node,
                                        root);
                        if (is_page_sharing_candidate(page_node))
                                get_page(page);
                        else
                                page = NULL;
                } else {
                        rb_erase(&stable_node_dup->node, root);
                        page = NULL;
                }
        } else {
                VM_BUG_ON(!is_stable_node_chain(stable_node));
                __stable_node_dup_del(stable_node_dup);
                if (page_node) {
                        VM_BUG_ON(page_node->head != &migrate_nodes);
                        list_del(&page_node->list);
                        DO_NUMA(page_node->nid = nid);
                        stable_node_chain_add_dup(page_node, stable_node);
                        if (is_page_sharing_candidate(page_node))
                                get_page(page);
                        else
                                page = NULL;
                } else {
                        page = NULL;
                }
        }
        stable_node_dup->head = &migrate_nodes;
        list_add(&stable_node_dup->list, stable_node_dup->head);
        return page;

chain_append:
        /* stable_node_dup could be null from __stable_node_chain */
        if (!stable_node_dup)
                stable_node_dup = stable_node_any;
        /*
         * If stable_node was a chain and chain_prune collapsed it,
         * stable_node has been updated to be the new regular
         * stable_node. A collapse of the chain is indistinguishable
         * from the case there was no chain in the stable
         * rbtree. Otherwise stable_node is the chain and
         * stable_node_dup is the dup to replace.
         */
        if (stable_node_dup == stable_node) {
                VM_BUG_ON(is_stable_node_dup(stable_node_dup));
                /* chain is missing so create it */
                stable_node = alloc_stable_node_chain(stable_node_dup,
                                                      root);
                if (!stable_node)
                        return NULL;
        }
        /*
         * Add this stable_node dup that was
         * migrated to the stable_node chain
         * of the current nid for this page
         * content.
         */
        VM_BUG_ON(!is_stable_node_dup(stable_node_dup));
        VM_BUG_ON(page_node->head != &migrate_nodes);
        list_del(&page_node->list);
        DO_NUMA(page_node->nid = nid);
        stable_node_chain_add_dup(page_node, stable_node);
        goto out;
}
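
/*
 * stable_tree_insert - insert stable tree node pointing to new ksm page
 * into the stable tree.
 *
 * This function returns the stable tree node just allocated on success,
 * NULL otherwise.
 */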
static struct stable_node *stable_tree_insert(struct page *kpage)
{
        int nid;
        unsigned long kpfn;
        struct rb_root *root;
        struct rb_node **new;
        struct rb_node *parent;
        struct stable_node *stable_node, *stable_node_dup, *stable_node_any;
        bool need_chain = false;

        kpfn = page_to_pfn(kpage);
        nid = get_kpfn_nid(kpfn);
        root = root_stable_tree + nid;
again:
        parent = NULL;
        new = &root->rb_node;

        while (*new) {
                struct page *tree_page;
                int ret;

                cond_resched();
                stable_node = rb_entry(*new, struct stable_node, node);
                stable_node_any = NULL;
                tree_page = chain(&stable_node_dup, stable_node, root);
                if (!stable_node_dup) {
                        /*
                         * Either all stable_node dups were full in
                         * this stable_node chain, or this chain was
                         * empty and should be rb_erased.
                         */
                        stable_node_any = stable_node_dup_any(stable_node,
                                                              root);
                        if (!stable_node_any) {
                                /* rb_erase just run */
                                goto again;
                        }
                        /*
                         * Take any of the stable_node dups page of
                         * this stable_node chain to let the tree walk
                         * continue. All KSM pages belonging to the
                         * stable_node dups in a stable_node chain
                         * have the same content and they're
                         * write protected at all times, so it doesn't
                         * matter which dup's page we compare against.
                         */
                        tree_page = get_ksm_page(stable_node_any,
                                                 GET_KSM_PAGE_NOLOCK);
                }
                VM_BUG_ON(!stable_node_dup ^ !!stable_node_any);
                if (!tree_page) {
                        /*
                         * If we walked over a stale stable_node,
                         * get_ksm_page() will call rb_erase() and it
                         * may rebalance the tree from under us. So
                         * restart the search from scratch. Returning
                         * NULL would be safe too, but we'd generate
                         * false negative insertions just because some
                         * stable_node was stale.
                         */
                        goto again;
                }

                ret = memcmp_pages(kpage, tree_page);
                put_page(tree_page);

                parent = *new;
                if (ret < 0)
                        new = &parent->rb_left;
                else if (ret > 0)
                        new = &parent->rb_right;
                else {
                        need_chain = true;
                        break;
                }
        }

        stable_node_dup = alloc_stable_node();
        if (!stable_node_dup)
                return NULL;

        INIT_HLIST_HEAD(&stable_node_dup->hlist);
        stable_node_dup->kpfn = kpfn;
        set_page_stable_node(kpage, stable_node_dup);
        stable_node_dup->rmap_hlist_len = 0;
        DO_NUMA(stable_node_dup->nid = nid);
        if (!need_chain) {
                rb_link_node(&stable_node_dup->node, parent, new);
                rb_insert_color(&stable_node_dup->node, root);
        } else {
                if (!is_stable_node_chain(stable_node)) {
                        struct stable_node *orig = stable_node;
                        /* chain is missing so create it */
                        stable_node = alloc_stable_node_chain(orig, root);
                        if (!stable_node) {
                                free_stable_node(stable_node_dup);
                                return NULL;
                        }
                }
                stable_node_chain_add_dup(stable_node_dup, stable_node);
        }

        return stable_node_dup;
}
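
/*
 * unstable_tree_search_insert - search for identical page,
 * else insert rmap_item into the unstable tree.
 *
 * This function searches for a page in the unstable tree identical to the
 * page currently being scanned; and if no identical page is found in the
 * tree, we insert rmap_item as a new object into the unstable tree.
 *
 * This function returns a pointer to the rmap_item found to be identical
 * to the currently scanned page, NULL otherwise.
 *
 * This function does both searching and inserting, because they share
 * the same walking algorithm in an rbtree.
 */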
static
struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
                                              struct page *page,
                                              struct page **tree_pagep)
{
        struct rb_node **new;
        struct rb_root *root;
        struct rb_node *parent = NULL;
        int nid;

        nid = get_kpfn_nid(page_to_pfn(page));
        root = root_unstable_tree + nid;
        new = &root->rb_node;

        while (*new) {
                struct rmap_item *tree_rmap_item;
                struct page *tree_page;
                int ret;

                cond_resched();
                tree_rmap_item = rb_entry(*new, struct rmap_item, node);
                tree_page = get_mergeable_page(tree_rmap_item);
                if (!tree_page)
                        return NULL;

                /*
                 * Don't substitute a ksm page for a forked page.
                 */
                if (page == tree_page) {
                        put_page(tree_page);
                        return NULL;
                }

                ret = memcmp_pages(page, tree_page);

                parent = *new;
                if (ret < 0) {
                        put_page(tree_page);
                        new = &parent->rb_left;
                } else if (ret > 0) {
                        put_page(tree_page);
                        new = &parent->rb_right;
                } else if (!ksm_merge_across_nodes &&
                           page_to_nid(tree_page) != nid) {
                        /*
                         * If tree_page has been migrated to another NUMA node,
                         * it will be flushed out and put in the right unstable
                         * tree next time: only merge with it when across_nodes.
                         */
                        put_page(tree_page);
                        return NULL;
                } else {
                        *tree_pagep = tree_page;
                        return tree_rmap_item;
                }
        }

        rmap_item->address |= UNSTABLE_FLAG;
        rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK);
        DO_NUMA(rmap_item->nid = nid);
        rb_link_node(&rmap_item->node, parent, new);
        rb_insert_color(&rmap_item->node, root);

        ksm_pages_unshared++;
        return NULL;
}
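
/*
 * stable_tree_append - add another rmap_item to the linked list of
 * rmap_items hanging off a given node of the stable tree, all sharing
 * the same ksm page.
 */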
static void stable_tree_append(struct rmap_item *rmap_item,
                               struct stable_node *stable_node,
                               bool max_page_sharing_bypass)
{
        /*
         * rmap won't find this mapping if we don't insert the
         * rmap_item in the right stable_node
         * duplicate. page_migration could break later if rmap breaks,
         * so we can as well crash here. We really need to check for
         * rmap_hlist_len == STABLE_NODE_CHAIN, but we can as well check
         * for other negative values as an underflow if detected here
         * for the first time (and not when decreasing rmap_hlist_len)
         * would be sign of memory corruption in the stable_node.
         */
        BUG_ON(stable_node->rmap_hlist_len < 0);

        stable_node->rmap_hlist_len++;
        if (!max_page_sharing_bypass)
                /* possibly non fatal but unexpected overflow, only warn */
                WARN_ON_ONCE(stable_node->rmap_hlist_len >
                             ksm_max_page_sharing);

        rmap_item->head = stable_node;
        rmap_item->address |= STABLE_FLAG;
        hlist_add_head(&rmap_item->hlist, &stable_node->hlist);

        if (rmap_item->hlist.next)
                ksm_pages_sharing++;
        else
                ksm_pages_shared++;

        rmap_item->mm->ksm_merging_pages++;
}
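
/*
 * cmp_and_merge_page - first see if page can be merged into the stable tree;
 * if not, compare checksum to previous and if it's the same, see if page can
 * be inserted into the unstable tree, or merged with a page already there and
 * both transferred to the stable tree.
 *
 * @page: the page that we are searching identical page to.
 * @rmap_item: the reverse mapping into the virtual address of this page
 */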
static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
{
        struct mm_struct *mm = rmap_item->mm;
        struct rmap_item *tree_rmap_item;
        struct page *tree_page = NULL;
        struct stable_node *stable_node;
        struct page *kpage;
        unsigned int checksum;
        int err;
        bool max_page_sharing_bypass = false;

        stable_node = page_stable_node(page);
        if (stable_node) {
                if (stable_node->head != &migrate_nodes &&
                    get_kpfn_nid(READ_ONCE(stable_node->kpfn)) !=
                    NUMA(stable_node->nid)) {
                        stable_node_dup_del(stable_node);
                        stable_node->head = &migrate_nodes;
                        list_add(&stable_node->list, stable_node->head);
                }
                if (stable_node->head != &migrate_nodes &&
                    rmap_item->head == stable_node)
                        return;
                /*
                 * If it's a KSM fork, allow it to go over the sharing limit
                 * without warnings.
                 */
                if (!is_page_sharing_candidate(stable_node))
                        max_page_sharing_bypass = true;
        }

        /* We first start with searching the page inside the stable tree */
        kpage = stable_tree_search(page);
        if (kpage == page && rmap_item->head == stable_node) {
                put_page(kpage);
                return;
        }

        remove_rmap_item_from_tree(rmap_item);

        if (kpage) {
                if (PTR_ERR(kpage) == -EBUSY)
                        return;

                err = try_to_merge_with_ksm_page(rmap_item, page, kpage);
                if (!err) {
                        /*
                         * The page was successfully merged:
                         * add its rmap_item to the stable tree.
                         */
                        lock_page(kpage);
                        stable_tree_append(rmap_item, page_stable_node(kpage),
                                           max_page_sharing_bypass);
                        unlock_page(kpage);
                }
                put_page(kpage);
                return;
        }

        /*
         * If the hash value of the page has changed from the last time
         * we calculated it, this page is changing frequently: therefore we
         * don't want to insert it in the unstable tree, and we don't want
         * to waste our time searching for something identical to it there.
         */
        checksum = calc_checksum(page);
        if (rmap_item->oldchecksum != checksum) {
                rmap_item->oldchecksum = checksum;
                return;
        }

        /*
         * Same checksum as an empty page. We attempt to merge it with the
         * appropriate zero page if the user enabled this via sysfs.
         */
        if (ksm_use_zero_pages && (checksum == zero_checksum)) {
                struct vm_area_struct *vma;

                mmap_read_lock(mm);
                vma = find_mergeable_vma(mm, rmap_item->address);
                if (vma) {
                        err = try_to_merge_one_page(vma, page,
                                        ZERO_PAGE(rmap_item->address));
                } else {
                        /*
                         * If the vma is out of date, we do not need to
                         * continue.
                         */
                        err = 0;
                }
                mmap_read_unlock(mm);
                /*
                 * In case of failure, the page was not really empty, so we
                 * need to continue. Otherwise we're done.
                 */
                if (!err)
                        return;
        }
        tree_rmap_item =
                unstable_tree_search_insert(rmap_item, page, &tree_page);
        if (tree_rmap_item) {
                bool split;

                kpage = try_to_merge_two_pages(rmap_item, page,
                                                tree_rmap_item, tree_page);
                /*
                 * If both pages we tried to merge belong to the same compound
                 * page, then we actually ended up increasing the reference
                 * count of the same compound page twice, and split_huge_page
                 * failed.
                 * Here we set a flag if that happened, and we use it later to
                 * try split_huge_page again. Since we call put_page right
                 * away, the reference count will be correct and
                 * split_huge_page should succeed.
                 */
                split = PageTransCompound(page)
                        && compound_head(page) == compound_head(tree_page);
                put_page(tree_page);
                if (kpage) {
                        /*
                         * The pages were successfully merged: insert new
                         * node in the stable tree and add both rmap_items.
                         */
                        lock_page(kpage);
                        stable_node = stable_tree_insert(kpage);
                        if (stable_node) {
                                stable_tree_append(tree_rmap_item, stable_node,
                                                   false);
                                stable_tree_append(rmap_item, stable_node,
                                                   false);
                        }
                        unlock_page(kpage);

                        /*
                         * If we fail to insert the page into the stable tree,
                         * we will have 2 virtual addresses that are pointing
                         * to a ksm page left outside the stable tree,
                         * in which case we need to break_cow on both.
                         */
                        if (!stable_node) {
                                break_cow(tree_rmap_item);
                                break_cow(rmap_item);
                        }
                } else if (split) {
                        /*
                         * We are here if we tried to merge two pages and
                         * failed because they both belonged to the same
                         * compound page. We will split the page now, but no
                         * merging will take place.
                         * We do not want to add the cost of a full lock; if
                         * the page is locked, it is better to skip it and
                         * perhaps try again later.
                         */
                        if (!trylock_page(page))
                                return;
                        split_huge_page(page);
                        unlock_page(page);
                }
        }
}

static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot,
                                            struct rmap_item **rmap_list,
                                            unsigned long addr)
{
        struct rmap_item *rmap_item;

        while (*rmap_list) {
                rmap_item = *rmap_list;
                if ((rmap_item->address & PAGE_MASK) == addr)
                        return rmap_item;
                if (rmap_item->address > addr)
                        break;
                *rmap_list = rmap_item->rmap_list;
                remove_rmap_item_from_tree(rmap_item);
                free_rmap_item(rmap_item);
        }

        rmap_item = alloc_rmap_item();
        if (rmap_item) {
                /* It has already been zeroed */
                rmap_item->mm = mm_slot->mm;
                rmap_item->address = addr;
                rmap_item->rmap_list = *rmap_list;
                *rmap_list = rmap_item;
        }
        return rmap_item;
}

static struct rmap_item *scan_get_next_rmap_item(struct page **page)
{
        struct mm_struct *mm;
        struct mm_slot *slot;
        struct vm_area_struct *vma;
        struct rmap_item *rmap_item;
        int nid;

        if (list_empty(&ksm_mm_head.mm_list))
                return NULL;

        slot = ksm_scan.mm_slot;
        if (slot == &ksm_mm_head) {
                /*
                 * A number of pages can hang around indefinitely on per-cpu
                 * pagevecs, raised page count preventing write_protect_page
                 * from merging them.  Though it doesn't really matter much,
                 * it is puzzling to see some stuck in pages_volatile until
                 * other activity jostles them out, and they also prevented
                 * LTP's KSM test from succeeding deterministically; so drain
                 * them here (here rather than on entry to ksm_do_scan(),
                 * so we don't IPI too often when pages_to_scan is set low).
                 */
                lru_add_drain_all();

                /*
                 * Whereas stale stable_nodes on the stable_tree itself
                 * get pruned in the regular course of stable_tree_search(),
                 * those moved out to the migrate_nodes list can accumulate:
                 * so prune them once before each full scan.
                 */
                if (!ksm_merge_across_nodes) {
                        struct stable_node *stable_node, *next;
                        struct page *page;

                        list_for_each_entry_safe(stable_node, next,
                                                 &migrate_nodes, list) {
                                page = get_ksm_page(stable_node,
                                                    GET_KSM_PAGE_NOLOCK);
                                if (page)
                                        put_page(page);
                                cond_resched();
                        }
                }

                for (nid = 0; nid < ksm_nr_node_ids; nid++)
                        root_unstable_tree[nid] = RB_ROOT;

                spin_lock(&ksm_mmlist_lock);
                slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list);
                ksm_scan.mm_slot = slot;
                spin_unlock(&ksm_mmlist_lock);
                /*
                 * Although we tested list_empty() above, a racing __ksm_exit
                 * of the last mm on the list may have removed it since then.
                 */
                if (slot == &ksm_mm_head)
                        return NULL;
next_mm:
                ksm_scan.address = 0;
                ksm_scan.rmap_list = &slot->rmap_list;
        }

        mm = slot->mm;
        mmap_read_lock(mm);
        if (ksm_test_exit(mm))
                vma = NULL;
        else
                vma = find_vma(mm, ksm_scan.address);

        for (; vma; vma = vma->vm_next) {
                if (!(vma->vm_flags & VM_MERGEABLE))
                        continue;
                if (ksm_scan.address < vma->vm_start)
                        ksm_scan.address = vma->vm_start;
                if (!vma->anon_vma)
                        ksm_scan.address = vma->vm_end;

                while (ksm_scan.address < vma->vm_end) {
                        if (ksm_test_exit(mm))
                                break;
                        *page = follow_page(vma, ksm_scan.address, FOLL_GET);
                        if (IS_ERR_OR_NULL(*page) || is_zone_device_page(*page)) {
                                ksm_scan.address += PAGE_SIZE;
                                cond_resched();
                                continue;
                        }
                        if (PageAnon(*page)) {
                                flush_anon_page(vma, *page, ksm_scan.address);
                                flush_dcache_page(*page);
                                rmap_item = get_next_rmap_item(slot,
                                        ksm_scan.rmap_list, ksm_scan.address);
                                if (rmap_item) {
                                        ksm_scan.rmap_list =
                                                        &rmap_item->rmap_list;
                                        ksm_scan.address += PAGE_SIZE;
                                } else
                                        put_page(*page);
                                mmap_read_unlock(mm);
                                return rmap_item;
                        }
                        put_page(*page);
                        ksm_scan.address += PAGE_SIZE;
                        cond_resched();
                }
        }

        if (ksm_test_exit(mm)) {
                ksm_scan.address = 0;
                ksm_scan.rmap_list = &slot->rmap_list;
        }
        /*
         * Nuke all the rmap_items that are above this current rmap:
         * because there were no VM_MERGEABLE vmas with such addresses.
         */
        remove_trailing_rmap_items(ksm_scan.rmap_list);

        spin_lock(&ksm_mmlist_lock);
        ksm_scan.mm_slot = list_entry(slot->mm_list.next,
                                                struct mm_slot, mm_list);
        if (ksm_scan.address == 0) {
                /*
                 * We've completed a full scan of all vmas, holding mmap_lock
                 * throughout, and found no VM_MERGEABLE: so do the same as
                 * __ksm_exit does to remove this mm from all our lists now.
                 * This applies either when cleaning up after __ksm_exit
                 * (but beware: we can reach here even before __ksm_exit),
                 * or when all VM_MERGEABLE areas have been unmapped (and
                 * mmap_lock then protects against race with MADV_MERGEABLE).
                 */
                hash_del(&slot->link);
                list_del(&slot->mm_list);
                spin_unlock(&ksm_mmlist_lock);

                free_mm_slot(slot);
                clear_bit(MMF_VM_MERGEABLE, &mm->flags);
                mmap_read_unlock(mm);
                mmdrop(mm);
        } else {
                mmap_read_unlock(mm);
                /*
                 * mmap_read_unlock(mm) first because after
                 * spin_unlock(&ksm_mmlist_lock) run, the "mm" may
                 * already have been freed under us by __ksm_exit()
                 * because the "mm_slot" is still hashed and
                 * ksm_scan.mm_slot doesn't point to it anymore.
                 */
                spin_unlock(&ksm_mmlist_lock);
        }

        /* Repeat until we've completed scanning the whole list */
        slot = ksm_scan.mm_slot;
        if (slot != &ksm_mm_head)
                goto next_mm;

        ksm_scan.seqnr++;
        return NULL;
}
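
/**
 * ksm_do_scan  - the ksm scanner main worker function.
 * @scan_npages:  number of pages we want to scan before we return.
 */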
static void ksm_do_scan(unsigned int scan_npages)
{
        struct rmap_item *rmap_item;
        struct page *page;

        while (scan_npages-- && likely(!freezing(current))) {
                cond_resched();
                rmap_item = scan_get_next_rmap_item(&page);
                if (!rmap_item)
                        return;
                cmp_and_merge_page(page, rmap_item);
                put_page(page);
        }
}

static int ksmd_should_run(void)
{
        return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.mm_list);
}

static int ksm_scan_thread(void *nothing)
{
        unsigned int sleep_ms;

        set_freezable();
        set_user_nice(current, 5);

        while (!kthread_should_stop()) {
                mutex_lock(&ksm_thread_mutex);
                wait_while_offlining();
                if (ksmd_should_run())
                        ksm_do_scan(ksm_thread_pages_to_scan);
                mutex_unlock(&ksm_thread_mutex);

                try_to_freeze();

                if (ksmd_should_run()) {
                        sleep_ms = READ_ONCE(ksm_thread_sleep_millisecs);
                        wait_event_interruptible_timeout(ksm_iter_wait,
                                sleep_ms != READ_ONCE(ksm_thread_sleep_millisecs),
                                msecs_to_jiffies(sleep_ms));
                } else {
                        wait_event_freezable(ksm_thread_wait,
                                ksmd_should_run() || kthread_should_stop());
                }
        }
        return 0;
}

int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
                unsigned long end, int advice, unsigned long *vm_flags)
{
        struct mm_struct *mm = vma->vm_mm;
        int err;

        switch (advice) {
        case MADV_MERGEABLE:
                /*
                 * Be somewhat over-protective for now!
                 */
                if (*vm_flags & (VM_MERGEABLE | VM_SHARED  | VM_MAYSHARE   |
                                 VM_PFNMAP    | VM_IO      | VM_DONTEXPAND |
                                 VM_HUGETLB | VM_MIXEDMAP))
                        return 0;               /* just ignore the advice */

                if (vma_is_dax(vma))
                        return 0;

#ifdef VM_SAO
                if (*vm_flags & VM_SAO)
                        return 0;
#endif
#ifdef VM_SPARC_ADI
                if (*vm_flags & VM_SPARC_ADI)
                        return 0;
#endif

                if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
                        err = __ksm_enter(mm);
                        if (err)
                                return err;
                }

                *vm_flags |= VM_MERGEABLE;
                break;

        case MADV_UNMERGEABLE:
                if (!(*vm_flags & VM_MERGEABLE))
                        return 0;               /* just ignore the advice */

                if (vma->anon_vma) {
                        err = unmerge_ksm_pages(vma, start, end);
                        if (err)
                                return err;
                }

                *vm_flags &= ~VM_MERGEABLE;
                break;
        }

        return 0;
}
EXPORT_SYMBOL_GPL(ksm_madvise);

int __ksm_enter(struct mm_struct *mm)
{
        struct mm_slot *mm_slot;
        int needs_wakeup;

        mm_slot = alloc_mm_slot();
        if (!mm_slot)
                return -ENOMEM;

        /* Check ksm_run too?  Would need tighter locking */
        needs_wakeup = list_empty(&ksm_mm_head.mm_list);

        spin_lock(&ksm_mmlist_lock);
        insert_to_mm_slots_hash(mm, mm_slot);
        /*
         * When KSM_RUN_MERGE (or KSM_RUN_STOP),
         * insert just behind the scanning cursor, to let the area settle
         * down a little; when fork is followed by immediate exec, we don't
         * want ksmd to waste time setting up and tearing down an rmap_list.
         *
         * But when KSM_RUN_UNMERGE, it's important to insert ahead of its
         * scanning cursor, otherwise KSM pages in newly forked mms will be
         * missed: then we might as well insert at the end of the list.
         */
        if (ksm_run & KSM_RUN_UNMERGE)
                list_add_tail(&mm_slot->mm_list, &ksm_mm_head.mm_list);
        else
                list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list);
        spin_unlock(&ksm_mmlist_lock);

        set_bit(MMF_VM_MERGEABLE, &mm->flags);
        mmgrab(mm);

        if (needs_wakeup)
                wake_up_interruptible(&ksm_thread_wait);

        return 0;
}

void __ksm_exit(struct mm_struct *mm)
{
        struct mm_slot *mm_slot;
        int easy_to_free = 0;

        /*
         * This process is exiting: if it's straightforward (as is the
         * case when ksmd was never running), free mm_slot immediately.
         * But if it's at the cursor or has rmap_items linked to it, use
         * mmap_lock to synchronize with any break_cows before pagetables
         * are freed, and leave the mm_slot on the list for ksmd to free.
         * Beware: ksm may already have noticed it exiting and freed the slot.
         */
        spin_lock(&ksm_mmlist_lock);
        mm_slot = get_mm_slot(mm);
        if (mm_slot && ksm_scan.mm_slot != mm_slot) {
                if (!mm_slot->rmap_list) {
                        hash_del(&mm_slot->link);
                        list_del(&mm_slot->mm_list);
                        easy_to_free = 1;
                } else {
                        list_move(&mm_slot->mm_list,
                                  &ksm_scan.mm_slot->mm_list);
                }
        }
        spin_unlock(&ksm_mmlist_lock);

        if (easy_to_free) {
                free_mm_slot(mm_slot);
                clear_bit(MMF_VM_MERGEABLE, &mm->flags);
                mmdrop(mm);
        } else if (mm_slot) {
                mmap_write_lock(mm);
                mmap_write_unlock(mm);
        }
}

struct page *ksm_might_need_to_copy(struct page *page,
                        struct vm_area_struct *vma, unsigned long address)
{
        struct folio *folio = page_folio(page);
        struct anon_vma *anon_vma = folio_anon_vma(folio);
        struct page *new_page;

        if (PageKsm(page)) {
                if (page_stable_node(page) &&
                    !(ksm_run & KSM_RUN_UNMERGE))
                        return page;    /* no need to copy it */
        } else if (!anon_vma) {
                return page;            /* no need to copy it */
        } else if (page->index == linear_page_index(vma, address) &&
                        anon_vma->root == vma->anon_vma->root) {
                return page;            /* still no need to copy it */
        }
        if (!PageUptodate(page))
                return page;            /* let do_swap_page report the error */

        new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
        if (new_page &&
            mem_cgroup_charge(page_folio(new_page), vma->vm_mm, GFP_KERNEL)) {
                put_page(new_page);
                new_page = NULL;
        }
        if (new_page) {
                copy_user_highpage(new_page, page, address, vma);

                SetPageDirty(new_page);
                __SetPageUptodate(new_page);
                __SetPageLocked(new_page);
#ifdef CONFIG_SWAP
                count_vm_event(KSM_SWPIN_COPY);
#endif
        }

        return new_page;
}

void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc)
{
        struct stable_node *stable_node;
        struct rmap_item *rmap_item;
        int search_new_forks = 0;

        VM_BUG_ON_FOLIO(!folio_test_ksm(folio), folio);

        /*
         * Rely on the folio lock to protect against concurrent modifications
         * to that page's node of the stable tree.
         */
        VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);

        stable_node = folio_stable_node(folio);
        if (!stable_node)
                return;
again:
        hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
                struct anon_vma *anon_vma = rmap_item->anon_vma;
                struct anon_vma_chain *vmac;
                struct vm_area_struct *vma;

                cond_resched();
                if (!anon_vma_trylock_read(anon_vma)) {
                        if (rwc->try_lock) {
                                rwc->contended = true;
                                return;
                        }
                        anon_vma_lock_read(anon_vma);
                }
                anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
                                               0, ULONG_MAX) {
                        unsigned long addr;

                        cond_resched();
                        vma = vmac->vma;

                        /* Ignore the stable/unstable/sqnr flags */
                        addr = rmap_item->address & PAGE_MASK;

                        if (addr < vma->vm_start || addr >= vma->vm_end)
                                continue;
                        /*
                         * Initially we examine only the vma which covers this
                         * rmap_item; but later, if there is still work to do,
                         * we examine covering vmas in other mms: in case they
                         * were forked from the original since ksmd passed.
                         */
                        if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
                                continue;

                        if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
                                continue;

                        if (!rwc->rmap_one(folio, vma, addr, rwc->arg)) {
                                anon_vma_unlock_read(anon_vma);
                                return;
                        }
                        if (rwc->done && rwc->done(folio)) {
                                anon_vma_unlock_read(anon_vma);
                                return;
                        }
                }
                anon_vma_unlock_read(anon_vma);
        }
        if (!search_new_forks++)
                goto again;
}

#ifdef CONFIG_MIGRATION
void folio_migrate_ksm(struct folio *newfolio, struct folio *folio)
{
        struct stable_node *stable_node;

        VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
        VM_BUG_ON_FOLIO(!folio_test_locked(newfolio), newfolio);
        VM_BUG_ON_FOLIO(newfolio->mapping != folio->mapping, newfolio);

        stable_node = folio_stable_node(folio);
        if (stable_node) {
                VM_BUG_ON_FOLIO(stable_node->kpfn != folio_pfn(folio), folio);
                stable_node->kpfn = folio_pfn(newfolio);
                /*
                 * newfolio->mapping was set in advance; now we need smp_wmb(),
                 * pairing with the smp_rmb() in get_ksm_page()'s stale path,
                 * to make sure the kpfn update is visible before the old
                 * folio's mapping is zapped below.
                 */
                smp_wmb();
                set_page_stable_node(&folio->page, NULL);
        }
}
#endif /* CONFIG_MIGRATION */

#ifdef CONFIG_MEMORY_HOTREMOVE
static void wait_while_offlining(void)
{
        while (ksm_run & KSM_RUN_OFFLINE) {
                mutex_unlock(&ksm_thread_mutex);
                wait_on_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE),
                            TASK_UNINTERRUPTIBLE);
                mutex_lock(&ksm_thread_mutex);
        }
}

static bool stable_node_dup_remove_range(struct stable_node *stable_node,
                                         unsigned long start_pfn,
                                         unsigned long end_pfn)
{
        if (stable_node->kpfn >= start_pfn &&
            stable_node->kpfn < end_pfn) {
                /*
                 * Don't get_ksm_page, page has already gone:
                 * which is why we keep kpfn instead of page*
                 */
                remove_node_from_stable_tree(stable_node);
                return true;
        }
        return false;
}

static bool stable_node_chain_remove_range(struct stable_node *stable_node,
                                           unsigned long start_pfn,
                                           unsigned long end_pfn,
                                           struct rb_root *root)
{
        struct stable_node *dup;
        struct hlist_node *hlist_safe;

        if (!is_stable_node_chain(stable_node)) {
                VM_BUG_ON(is_stable_node_dup(stable_node));
                return stable_node_dup_remove_range(stable_node, start_pfn,
                                                    end_pfn);
        }

        hlist_for_each_entry_safe(dup, hlist_safe,
                                  &stable_node->hlist, hlist_dup) {
                VM_BUG_ON(!is_stable_node_dup(dup));
                stable_node_dup_remove_range(dup, start_pfn, end_pfn);
        }
        if (hlist_empty(&stable_node->hlist)) {
                free_stable_node_chain(stable_node, root);
                return true;
        } else
                return false;
}

static void ksm_check_stable_tree(unsigned long start_pfn,
                                  unsigned long end_pfn)
{
        struct stable_node *stable_node, *next;
        struct rb_node *node;
        int nid;

        for (nid = 0; nid < ksm_nr_node_ids; nid++) {
                node = rb_first(root_stable_tree + nid);
                while (node) {
                        stable_node = rb_entry(node, struct stable_node, node);
                        if (stable_node_chain_remove_range(stable_node,
                                                           start_pfn, end_pfn,
                                                           root_stable_tree +
                                                           nid))
                                node = rb_first(root_stable_tree + nid);
                        else
                                node = rb_next(node);
                        cond_resched();
                }
        }
        list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) {
                if (stable_node->kpfn >= start_pfn &&
                    stable_node->kpfn < end_pfn)
                        remove_node_from_stable_tree(stable_node);
                cond_resched();
        }
}
2789
2790 static int ksm_memory_callback(struct notifier_block *self,
2791 unsigned long action, void *arg)
2792 {
2793 struct memory_notify *mn = arg;
2794
2795 switch (action) {
2796 case MEM_GOING_OFFLINE:
2797 		/*
2798 		 * Prevent ksm_do_scan(), unmerge_and_remove_all_rmap_items()
2799 		 * and remove_all_stable_nodes() while memory is going offline:
2800 		 * it is unsafe for them to touch the stable tree at this time.
2801 		 * But unmerge_ksm_pages(), rmap lookups and other entry points
2802 		 * which do not need the ksm_thread_mutex are all safe.
2803 		 */
2804 mutex_lock(&ksm_thread_mutex);
2805 ksm_run |= KSM_RUN_OFFLINE;
2806 mutex_unlock(&ksm_thread_mutex);
2807 break;
2808
2809 case MEM_OFFLINE:
2810 		/*
2811 		 * Most of the work is done by page migration; but there might
2812 		 * be a few stable_nodes left over, still pointing to struct
2813 		 * pages which have been offlined: prune those from the tree,
2814 		 * otherwise get_ksm_page() might later try to access a
2815 		 * non-existent struct page.
2816 		 */
2817 ksm_check_stable_tree(mn->start_pfn,
2818 mn->start_pfn + mn->nr_pages);
2819 fallthrough;
2820 case MEM_CANCEL_OFFLINE:
2821 mutex_lock(&ksm_thread_mutex);
2822 ksm_run &= ~KSM_RUN_OFFLINE;
2823 mutex_unlock(&ksm_thread_mutex);
2824
2825 		smp_mb();	/* wake_up_bit advises this */
2826 wake_up_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE));
2827 break;
2828 }
2829 return NOTIFY_OK;
2830 }
2831 #else
2832 static void wait_while_offlining(void)
2833 {
2834 }
2835 #endif
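/*
 * Userspace sketch: the notifier above fires when a memory block goes
 * offline, e.g. via the standard memory-hotplug sysfs interface.  The
 * helper name and block number are illustrative only.
 */
#include <stdio.h>

int offline_memory_block(int block)
{
	char path[80];
	FILE *f;

	snprintf(path, sizeof(path),
		 "/sys/devices/system/memory/memory%d/state", block);
	f = fopen(path, "w");
	if (!f)
		return -1;
	fputs("offline", f);	/* drives MEM_GOING_OFFLINE then MEM_OFFLINE */
	return fclose(f);
}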
2836
2837 #ifdef CONFIG_SYSFS
2838 /*
2839  * This all compiles without CONFIG_SYSFS, but is a waste of space.
2840  */
2841 
2842 #define KSM_ATTR_RO(_name) \
2843 static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
2844 #define KSM_ATTR(_name) \
2845 static struct kobj_attribute _name##_attr = __ATTR_RW(_name)
2846
2847 static ssize_t sleep_millisecs_show(struct kobject *kobj,
2848 struct kobj_attribute *attr, char *buf)
2849 {
2850 return sysfs_emit(buf, "%u\n", ksm_thread_sleep_millisecs);
2851 }
2852
2853 static ssize_t sleep_millisecs_store(struct kobject *kobj,
2854 struct kobj_attribute *attr,
2855 const char *buf, size_t count)
2856 {
2857 unsigned int msecs;
2858 int err;
2859
2860 err = kstrtouint(buf, 10, &msecs);
2861 if (err)
2862 return -EINVAL;
2863
2864 ksm_thread_sleep_millisecs = msecs;
2865 wake_up_interruptible(&ksm_iter_wait);
2866
2867 return count;
2868 }
2869 KSM_ATTR(sleep_millisecs);
2870
2871 static ssize_t pages_to_scan_show(struct kobject *kobj,
2872 struct kobj_attribute *attr, char *buf)
2873 {
2874 return sysfs_emit(buf, "%u\n", ksm_thread_pages_to_scan);
2875 }
2876
2877 static ssize_t pages_to_scan_store(struct kobject *kobj,
2878 struct kobj_attribute *attr,
2879 const char *buf, size_t count)
2880 {
2881 unsigned int nr_pages;
2882 int err;
2883
2884 err = kstrtouint(buf, 10, &nr_pages);
2885 if (err)
2886 return -EINVAL;
2887
2888 ksm_thread_pages_to_scan = nr_pages;
2889
2890 return count;
2891 }
2892 KSM_ATTR(pages_to_scan);
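/*
 * Userspace sketch: the two stores above are reached by writing decimal
 * strings to /sys/kernel/mm/ksm/sleep_millisecs and .../pages_to_scan.
 * The ksm_write() helper and the values are illustrative, not a tuning
 * recommendation.
 */
#include <stdio.h>

static int ksm_write(const char *knob, const char *val)
{
	char path[128];
	FILE *f;

	snprintf(path, sizeof(path), "/sys/kernel/mm/ksm/%s", knob);
	f = fopen(path, "w");
	if (!f)
		return -1;
	fputs(val, f);
	return fclose(f);
}

int tune_ksm_example(void)
{
	if (ksm_write("sleep_millisecs", "20"))		/* wake ksmd more often */
		return -1;
	return ksm_write("pages_to_scan", "1000");	/* scan more pages per wake-up */
}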
2893
2894 static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr,
2895 char *buf)
2896 {
2897 return sysfs_emit(buf, "%lu\n", ksm_run);
2898 }
2899
2900 static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
2901 const char *buf, size_t count)
2902 {
2903 unsigned int flags;
2904 int err;
2905
2906 err = kstrtouint(buf, 10, &flags);
2907 if (err)
2908 return -EINVAL;
2909 if (flags > KSM_RUN_UNMERGE)
2910 return -EINVAL;
2911
2912 	/*
2913 	 * KSM_RUN_MERGE sets ksmd running, and 0 stops it running.
2914 	 * KSM_RUN_UNMERGE stops it running and unmerges all rmap_items,
2915 	 * breaking COW to free the pages_shared (but leaves mm_slots
2916 	 * on the list for when ksmd may be set running again).
2917 	 */
2918
2919 mutex_lock(&ksm_thread_mutex);
2920 wait_while_offlining();
2921 if (ksm_run != flags) {
2922 ksm_run = flags;
2923 if (flags & KSM_RUN_UNMERGE) {
2924 set_current_oom_origin();
2925 err = unmerge_and_remove_all_rmap_items();
2926 clear_current_oom_origin();
2927 if (err) {
2928 ksm_run = KSM_RUN_STOP;
2929 count = err;
2930 }
2931 }
2932 }
2933 mutex_unlock(&ksm_thread_mutex);
2934
2935 if (flags & KSM_RUN_MERGE)
2936 wake_up_interruptible(&ksm_thread_wait);
2937
2938 return count;
2939 }
2940 KSM_ATTR(run);
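/*
 * Userspace sketch of the run knob's contract, reusing the hypothetical
 * ksm_write() helper above: 1 (KSM_RUN_MERGE) starts ksmd, 0
 * (KSM_RUN_STOP) pauses it but leaves existing merges in place, and 2
 * (KSM_RUN_UNMERGE) stops it and breaks COW on everything it had merged,
 * which can take a while and, as run_store() shows, can fail.
 */
int ksm_start(void)   { return ksm_write("run", "1"); }
int ksm_pause(void)   { return ksm_write("run", "0"); }
int ksm_unmerge(void) { return ksm_write("run", "2"); }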
2941
2942 #ifdef CONFIG_NUMA
2943 static ssize_t merge_across_nodes_show(struct kobject *kobj,
2944 struct kobj_attribute *attr, char *buf)
2945 {
2946 return sysfs_emit(buf, "%u\n", ksm_merge_across_nodes);
2947 }
2948
2949 static ssize_t merge_across_nodes_store(struct kobject *kobj,
2950 struct kobj_attribute *attr,
2951 const char *buf, size_t count)
2952 {
2953 int err;
2954 unsigned long knob;
2955
2956 err = kstrtoul(buf, 10, &knob);
2957 if (err)
2958 return err;
2959 if (knob > 1)
2960 return -EINVAL;
2961
2962 mutex_lock(&ksm_thread_mutex);
2963 wait_while_offlining();
2964 if (ksm_merge_across_nodes != knob) {
2965 if (ksm_pages_shared || remove_all_stable_nodes())
2966 err = -EBUSY;
2967 else if (root_stable_tree == one_stable_tree) {
2968 struct rb_root *buf;
2969 			/*
2970 			 * This is the first time that we switch away from the
2971 			 * default of merging across nodes: must now allocate
2972 			 * a buffer to hold as many roots as may be needed.
2973 			 * Allocate stable and unstable together:
2974 			 * MAXSMP NODES_SHIFT 10 will use 16kB.
2975 			 */
2976 buf = kcalloc(nr_node_ids + nr_node_ids, sizeof(*buf),
2977 GFP_KERNEL);
2978 			/* kcalloc() zeroes the buffer, and a zeroed rb_root is a valid empty RB_ROOT */
2979 if (!buf)
2980 err = -ENOMEM;
2981 else {
2982 root_stable_tree = buf;
2983 root_unstable_tree = buf + nr_node_ids;
2984 				/* Stable tree is empty but not the unstable */
2985 root_unstable_tree[0] = one_unstable_tree[0];
2986 }
2987 }
2988 if (!err) {
2989 ksm_merge_across_nodes = knob;
2990 ksm_nr_node_ids = knob ? 1 : nr_node_ids;
2991 }
2992 }
2993 mutex_unlock(&ksm_thread_mutex);
2994
2995 return err ? err : count;
2996 }
2997 KSM_ATTR(merge_across_nodes);
2998 #endif
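/*
 * Userspace sketch: merge_across_nodes_store() above refuses to switch
 * while any pages are merged (-EBUSY), so a flip must unmerge first.
 * Reuses the hypothetical ksm_write() helper from the earlier sketch.
 */
int ksm_use_per_node_trees(void)
{
	if (ksm_write("run", "2"))			/* unmerge everything */
		return -1;
	if (ksm_write("merge_across_nodes", "0"))	/* one tree pair per NUMA node */
		return -1;
	return ksm_write("run", "1");			/* resume scanning */
}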
2999
3000 static ssize_t use_zero_pages_show(struct kobject *kobj,
3001 struct kobj_attribute *attr, char *buf)
3002 {
3003 return sysfs_emit(buf, "%u\n", ksm_use_zero_pages);
3004 }
3005 static ssize_t use_zero_pages_store(struct kobject *kobj,
3006 struct kobj_attribute *attr,
3007 const char *buf, size_t count)
3008 {
3009 int err;
3010 bool value;
3011
3012 err = kstrtobool(buf, &value);
3013 if (err)
3014 return -EINVAL;
3015
3016 ksm_use_zero_pages = value;
3017
3018 return count;
3019 }
3020 KSM_ATTR(use_zero_pages);
3021
3022 static ssize_t max_page_sharing_show(struct kobject *kobj,
3023 struct kobj_attribute *attr, char *buf)
3024 {
3025 return sysfs_emit(buf, "%u\n", ksm_max_page_sharing);
3026 }
3027
3028 static ssize_t max_page_sharing_store(struct kobject *kobj,
3029 struct kobj_attribute *attr,
3030 const char *buf, size_t count)
3031 {
3032 int err;
3033 int knob;
3034
3035 err = kstrtoint(buf, 10, &knob);
3036 if (err)
3037 return err;
3038 	/*
3039 	 * When a KSM page is created it is shared by 2 mappings. This
3040 	 * being a signed comparison, it implicitly verifies it's not
3041 	 * negative.
3042 	 */
3043 if (knob < 2)
3044 return -EINVAL;
3045
3046 if (READ_ONCE(ksm_max_page_sharing) == knob)
3047 return count;
3048
3049 mutex_lock(&ksm_thread_mutex);
3050 wait_while_offlining();
3051 if (ksm_max_page_sharing != knob) {
3052 if (ksm_pages_shared || remove_all_stable_nodes())
3053 err = -EBUSY;
3054 else
3055 ksm_max_page_sharing = knob;
3056 }
3057 mutex_unlock(&ksm_thread_mutex);
3058
3059 return err ? err : count;
3060 }
3061 KSM_ATTR(max_page_sharing);
3062
3063 static ssize_t pages_shared_show(struct kobject *kobj,
3064 struct kobj_attribute *attr, char *buf)
3065 {
3066 return sysfs_emit(buf, "%lu\n", ksm_pages_shared);
3067 }
3068 KSM_ATTR_RO(pages_shared);
3069
3070 static ssize_t pages_sharing_show(struct kobject *kobj,
3071 struct kobj_attribute *attr, char *buf)
3072 {
3073 return sysfs_emit(buf, "%lu\n", ksm_pages_sharing);
3074 }
3075 KSM_ATTR_RO(pages_sharing);
3076
3077 static ssize_t pages_unshared_show(struct kobject *kobj,
3078 struct kobj_attribute *attr, char *buf)
3079 {
3080 return sysfs_emit(buf, "%lu\n", ksm_pages_unshared);
3081 }
3082 KSM_ATTR_RO(pages_unshared);
3083
3084 static ssize_t pages_volatile_show(struct kobject *kobj,
3085 struct kobj_attribute *attr, char *buf)
3086 {
3087 long ksm_pages_volatile;
3088
3089 ksm_pages_volatile = ksm_rmap_items - ksm_pages_shared
3090 - ksm_pages_sharing - ksm_pages_unshared;
3091 	/*
3092 	 * It was not worth any locking to calculate that statistic,
3093 	 * but it might therefore sometimes be negative.
3094 	 */
3095 if (ksm_pages_volatile < 0)
3096 ksm_pages_volatile = 0;
3097 return sysfs_emit(buf, "%ld\n", ksm_pages_volatile);
3098 }
3099 KSM_ATTR_RO(pages_volatile);
3100
3101 static ssize_t stable_node_dups_show(struct kobject *kobj,
3102 struct kobj_attribute *attr, char *buf)
3103 {
3104 return sysfs_emit(buf, "%lu\n", ksm_stable_node_dups);
3105 }
3106 KSM_ATTR_RO(stable_node_dups);
3107
3108 static ssize_t stable_node_chains_show(struct kobject *kobj,
3109 struct kobj_attribute *attr, char *buf)
3110 {
3111 return sysfs_emit(buf, "%lu\n", ksm_stable_node_chains);
3112 }
3113 KSM_ATTR_RO(stable_node_chains);
3114
3115 static ssize_t
3116 stable_node_chains_prune_millisecs_show(struct kobject *kobj,
3117 struct kobj_attribute *attr,
3118 char *buf)
3119 {
3120 return sysfs_emit(buf, "%u\n", ksm_stable_node_chains_prune_millisecs);
3121 }
3122
3123 static ssize_t
3124 stable_node_chains_prune_millisecs_store(struct kobject *kobj,
3125 struct kobj_attribute *attr,
3126 const char *buf, size_t count)
3127 {
3128 unsigned int msecs;
3129 int err;
3130
3131 err = kstrtouint(buf, 10, &msecs);
3132 if (err)
3133 return -EINVAL;
3134
3135 ksm_stable_node_chains_prune_millisecs = msecs;
3136
3137 return count;
3138 }
3139 KSM_ATTR(stable_node_chains_prune_millisecs);
3140
3141 static ssize_t full_scans_show(struct kobject *kobj,
3142 struct kobj_attribute *attr, char *buf)
3143 {
3144 return sysfs_emit(buf, "%lu\n", ksm_scan.seqnr);
3145 }
3146 KSM_ATTR_RO(full_scans);
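/*
 * Userspace sketch: the read-only counters above are usually combined.
 * pages_sharing / pages_shared approximates how many duplicate pages
 * were freed per surviving KSM page.  File names are the real sysfs
 * ones; the ksm_read() helper is hypothetical.
 */
#include <stdio.h>

static long ksm_read(const char *knob)
{
	char path[128];
	long val = -1;
	FILE *f;

	snprintf(path, sizeof(path), "/sys/kernel/mm/ksm/%s", knob);
	f = fopen(path, "r");
	if (f) {
		if (fscanf(f, "%ld", &val) != 1)
			val = -1;
		fclose(f);
	}
	return val;
}

void print_ksm_effectiveness(void)
{
	long shared = ksm_read("pages_shared");
	long sharing = ksm_read("pages_sharing");

	if (shared > 0)
		printf("%.2f duplicates merged per KSM page\n",
		       (double)sharing / shared);
}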
3147
3148 static struct attribute *ksm_attrs[] = {
3149 &sleep_millisecs_attr.attr,
3150 &pages_to_scan_attr.attr,
3151 &run_attr.attr,
3152 &pages_shared_attr.attr,
3153 &pages_sharing_attr.attr,
3154 &pages_unshared_attr.attr,
3155 &pages_volatile_attr.attr,
3156 &full_scans_attr.attr,
3157 #ifdef CONFIG_NUMA
3158 &merge_across_nodes_attr.attr,
3159 #endif
3160 &max_page_sharing_attr.attr,
3161 &stable_node_chains_attr.attr,
3162 &stable_node_dups_attr.attr,
3163 &stable_node_chains_prune_millisecs_attr.attr,
3164 &use_zero_pages_attr.attr,
3165 NULL,
3166 };
3167
3168 static const struct attribute_group ksm_attr_group = {
3169 .attrs = ksm_attrs,
3170 .name = "ksm",
3171 };
3172 #endif
3173
3174 static int __init ksm_init(void)
3175 {
3176 struct task_struct *ksm_thread;
3177 int err;
3178
3179 	/* The correct value depends on page size and endianness */
3180 zero_checksum = calc_checksum(ZERO_PAGE(0));
3181 	/* Default to false for backwards compatibility */
3182 ksm_use_zero_pages = false;
3183
3184 err = ksm_slab_init();
3185 if (err)
3186 goto out;
3187
3188 ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd");
3189 if (IS_ERR(ksm_thread)) {
3190 pr_err("ksm: creating kthread failed\n");
3191 err = PTR_ERR(ksm_thread);
3192 goto out_free;
3193 }
3194
3195 #ifdef CONFIG_SYSFS
3196 err = sysfs_create_group(mm_kobj, &ksm_attr_group);
3197 if (err) {
3198 pr_err("ksm: register sysfs failed\n");
3199 kthread_stop(ksm_thread);
3200 goto out_free;
3201 }
3202 #else
3203 	ksm_run = KSM_RUN_MERGE;	/* no way for user to start it */
3204
3205 #endif /* CONFIG_SYSFS */
3206
3207 #ifdef CONFIG_MEMORY_HOTREMOVE
3208 	/* There is no significance to this priority 100 */
3209 hotplug_memory_notifier(ksm_memory_callback, 100);
3210 #endif
3211 return 0;
3212
3213 out_free:
3214 ksm_slab_free();
3215 out:
3216 return err;
3217 }
3218 subsys_initcall(ksm_init);
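/*
 * Userspace sketch: with ksmd started by ksm_init() above and run set to
 * 1, a process opts memory in with madvise(MADV_MERGEABLE).  A minimal,
 * hypothetical demo: two identical anonymous buffers become candidates
 * for merging into a single KSM page, visible in pages_sharing.
 */
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	size_t len = 2 * 1024 * 1024;
	char *a = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	char *b = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (a == MAP_FAILED || b == MAP_FAILED)
		return 1;
	memset(a, 0x5a, len);	/* identical contents ... */
	memset(b, 0x5a, len);	/* ... are what ksmd merges */
	if (madvise(a, len, MADV_MERGEABLE) ||
	    madvise(b, len, MADV_MERGEABLE))
		return 1;	/* kernel must have CONFIG_KSM=y */
	pause();		/* let ksmd scan; watch /sys/kernel/mm/ksm/ */
	return 0;
}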