// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/mm/memory_hotplug.c
 *
 *  Copyright (C)
 */

0008 #include <linux/stddef.h>
0009 #include <linux/mm.h>
0010 #include <linux/sched/signal.h>
0011 #include <linux/swap.h>
0012 #include <linux/interrupt.h>
0013 #include <linux/pagemap.h>
0014 #include <linux/compiler.h>
0015 #include <linux/export.h>
0016 #include <linux/pagevec.h>
0017 #include <linux/writeback.h>
0018 #include <linux/slab.h>
0019 #include <linux/sysctl.h>
0020 #include <linux/cpu.h>
0021 #include <linux/memory.h>
0022 #include <linux/memremap.h>
0023 #include <linux/memory_hotplug.h>
0024 #include <linux/vmalloc.h>
0025 #include <linux/ioport.h>
0026 #include <linux/delay.h>
0027 #include <linux/migrate.h>
0028 #include <linux/page-isolation.h>
0029 #include <linux/pfn.h>
0030 #include <linux/suspend.h>
0031 #include <linux/mm_inline.h>
0032 #include <linux/firmware-map.h>
0033 #include <linux/stop_machine.h>
0034 #include <linux/hugetlb.h>
0035 #include <linux/memblock.h>
0036 #include <linux/compaction.h>
0037 #include <linux/rmap.h>
0038 #include <linux/module.h>
0039
0040 #include <asm/tlbflush.h>
0041
0042 #include "internal.h"
0043 #include "shuffle.h"
0044
0045 #ifdef CONFIG_MHP_MEMMAP_ON_MEMORY
/*
 * memory_hotplug.memmap_on_memory parameter: when enabled, allocate the
 * memmap (struct pages) for hotplugged memory from the hotplugged memory
 * range itself, provided mhp_supports_memmap_on_memory() is satisfied.
 */
0049 static bool memmap_on_memory __ro_after_init;
0050 module_param(memmap_on_memory, bool, 0444);
0051 MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory hotplug");
0052
0053 static inline bool mhp_memmap_on_memory(void)
0054 {
0055 return memmap_on_memory;
0056 }
0057 #else
0058 static inline bool mhp_memmap_on_memory(void)
0059 {
0060 return false;
0061 }
0062 #endif
0063
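/* Policies for selecting the zone when onlining memory without an explicit zone request. */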
0064 enum {
0065 ONLINE_POLICY_CONTIG_ZONES = 0,
0066 ONLINE_POLICY_AUTO_MOVABLE,
0067 };
0068
0069 static const char * const online_policy_to_str[] = {
0070 [ONLINE_POLICY_CONTIG_ZONES] = "contig-zones",
0071 [ONLINE_POLICY_AUTO_MOVABLE] = "auto-movable",
0072 };
0073
0074 static int set_online_policy(const char *val, const struct kernel_param *kp)
0075 {
0076 int ret = sysfs_match_string(online_policy_to_str, val);
0077
0078 if (ret < 0)
0079 return ret;
0080 *((int *)kp->arg) = ret;
0081 return 0;
0082 }
0083
0084 static int get_online_policy(char *buffer, const struct kernel_param *kp)
0085 {
0086 return sprintf(buffer, "%s\n", online_policy_to_str[*((int *)kp->arg)]);
0087 }
0088
/*
 * memory_hotplug.online_policy: policy applied when onlining memory blocks
 * without an explicit zone request.
 *
 * "contig-zones": choose a zone such that zones stay contiguous (default).
 * "auto-movable": online to ZONE_MOVABLE whenever the configured
 *		   MOVABLE:KERNEL ratio (auto_movable_ratio) still allows it.
 */
0097 static int online_policy __read_mostly = ONLINE_POLICY_CONTIG_ZONES;
0098 static const struct kernel_param_ops online_policy_ops = {
0099 .set = set_online_policy,
0100 .get = get_online_policy,
0101 };
0102 module_param_cb(online_policy, &online_policy_ops, &online_policy, 0644);
0103 MODULE_PARM_DESC(online_policy,
0104 "Set the online policy (\"contig-zones\", \"auto-movable\") "
0105 "Default: \"contig-zones\"");
0106
/*
 * memory_hotplug.auto_movable_ratio: maximum MOVABLE:KERNEL memory ratio, in
 * percent, that the "auto-movable" online policy may establish. The default
 * of 301 allows onlining roughly three times as much MOVABLE as KERNEL memory.
 */
0114 static unsigned int auto_movable_ratio __read_mostly = 301;
0115 module_param(auto_movable_ratio, uint, 0644);
0116 MODULE_PARM_DESC(auto_movable_ratio,
0117 "Set the maximum ratio of MOVABLE:KERNEL memory in the system "
0118 "in percent for \"auto-movable\" online policy. Default: 301");
0119
0120
0121
0122
0123 #ifdef CONFIG_NUMA
0124 static bool auto_movable_numa_aware __read_mostly = true;
0125 module_param(auto_movable_numa_aware, bool, 0644);
0126 MODULE_PARM_DESC(auto_movable_numa_aware,
0127 "Consider numa node stats in addition to global stats in "
0128 "\"auto-movable\" online policy. Default: true");
0129 #endif
0130
/*
 * online_page_callback points to the function used to online a page.
 * Initially it is generic_online_page(); drivers can temporarily override it
 * via set_online_page_callback() and restore it with
 * restore_online_page_callback().
 */
0138 static online_page_callback_t online_page_callback = generic_online_page;
0139 static DEFINE_MUTEX(online_page_callback_lock);
0140
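/* Serializes memory hot(un)plug against readers such as get_online_mems(). */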
0141 DEFINE_STATIC_PERCPU_RWSEM(mem_hotplug_lock);
0142
0143 void get_online_mems(void)
0144 {
0145 percpu_down_read(&mem_hotplug_lock);
0146 }
0147
0148 void put_online_mems(void)
0149 {
0150 percpu_up_read(&mem_hotplug_lock);
0151 }
0152
0153 bool movable_node_enabled = false;
0154
0155 #ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE
0156 int mhp_default_online_type = MMOP_OFFLINE;
0157 #else
0158 int mhp_default_online_type = MMOP_ONLINE;
0159 #endif
0160
0161 static int __init setup_memhp_default_state(char *str)
0162 {
0163 const int online_type = mhp_online_type_from_str(str);
0164
0165 if (online_type >= 0)
0166 mhp_default_online_type = online_type;
0167
0168 return 1;
0169 }
0170 __setup("memhp_default_state=", setup_memhp_default_state);
0171
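/* Hold the CPU hotplug read lock and mem_hotplug_lock for write around hot(un)plug. */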
0172 void mem_hotplug_begin(void)
0173 {
0174 cpus_read_lock();
0175 percpu_down_write(&mem_hotplug_lock);
0176 }
0177
0178 void mem_hotplug_done(void)
0179 {
0180 percpu_up_write(&mem_hotplug_lock);
0181 cpus_read_unlock();
0182 }
0183
0184 u64 max_mem_size = U64_MAX;
0185
0186
0187 static struct resource *register_memory_resource(u64 start, u64 size,
0188 const char *resource_name)
0189 {
0190 struct resource *res;
0191 unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
0192
0193 if (strcmp(resource_name, "System RAM"))
0194 flags |= IORESOURCE_SYSRAM_DRIVER_MANAGED;
0195
0196 if (!mhp_range_allowed(start, size, true))
0197 return ERR_PTR(-E2BIG);
0198
	/*
	 * Make sure the value parsed from 'mem=' only restricts memory adding
	 * while booting, so that memory hotplug won't be impacted. Please
	 * refer to the documentation of 'mem=' for more details.
	 */
0205 if (start + size > max_mem_size && system_state < SYSTEM_RUNNING)
0206 return ERR_PTR(-E2BIG);
0207
0208
0209
0210
0211
0212
0213 res = __request_region(&iomem_resource, start, size,
0214 resource_name, flags);
0215
0216 if (!res) {
0217 pr_debug("Unable to reserve System RAM region: %016llx->%016llx\n",
0218 start, start + size);
0219 return ERR_PTR(-EEXIST);
0220 }
0221 return res;
0222 }
0223
0224 static void release_memory_resource(struct resource *res)
0225 {
0226 if (!res)
0227 return;
0228 release_resource(res);
0229 kfree(res);
0230 }
0231
0232 static int check_pfn_span(unsigned long pfn, unsigned long nr_pages)
0233 {
	/*
	 * Disallow all operations smaller than a sub-section and only
	 * allow operations smaller than a section for
	 * SPARSEMEM_VMEMMAP. Note that check_hotplug_memory_range()
	 * enforces a larger memory_block_size_bytes() granularity for
	 * memory that will be marked online, so this check should only
	 * fire for direct arch_{add,remove}_memory() users outside of
	 * add_memory_resource().
	 */
0243 unsigned long min_align;
0244
0245 if (IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP))
0246 min_align = PAGES_PER_SUBSECTION;
0247 else
0248 min_align = PAGES_PER_SECTION;
0249 if (!IS_ALIGNED(pfn | nr_pages, min_align))
0250 return -EINVAL;
0251 return 0;
0252 }
0253
/*
 * Return the page for the given pfn only if the page is online. All pfn
 * walkers which rely on fully initialized page->flags (and others) should
 * use this rather than pfn_valid() && pfn_to_page().
 */
0259 struct page *pfn_to_online_page(unsigned long pfn)
0260 {
0261 unsigned long nr = pfn_to_section_nr(pfn);
0262 struct dev_pagemap *pgmap;
0263 struct mem_section *ms;
0264
0265 if (nr >= NR_MEM_SECTIONS)
0266 return NULL;
0267
0268 ms = __nr_to_section(nr);
0269 if (!online_section(ms))
0270 return NULL;
0271
0272
0273
0274
0275
0276 if (IS_ENABLED(CONFIG_HAVE_ARCH_PFN_VALID) && !pfn_valid(pfn))
0277 return NULL;
0278
0279 if (!pfn_section_valid(ms, pfn))
0280 return NULL;
0281
0282 if (!online_device_section(ms))
0283 return pfn_to_page(pfn);
0284
0285
0286
0287
0288
0289
0290
0291 pgmap = get_dev_pagemap(pfn, NULL);
0292 put_dev_pagemap(pgmap);
0293
0294
0295 if (pgmap)
0296 return NULL;
0297
0298 return pfn_to_page(pfn);
0299 }
0300 EXPORT_SYMBOL_GPL(pfn_to_online_page);
0301
0302 int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages,
0303 struct mhp_params *params)
0304 {
0305 const unsigned long end_pfn = pfn + nr_pages;
0306 unsigned long cur_nr_pages;
0307 int err;
0308 struct vmem_altmap *altmap = params->altmap;
0309
0310 if (WARN_ON_ONCE(!pgprot_val(params->pgprot)))
0311 return -EINVAL;
0312
0313 VM_BUG_ON(!mhp_range_allowed(PFN_PHYS(pfn), nr_pages * PAGE_SIZE, false));
0314
0315 if (altmap) {
0316
0317
0318
0319 if (altmap->base_pfn != pfn
0320 || vmem_altmap_offset(altmap) > nr_pages) {
0321 pr_warn_once("memory add fail, invalid altmap\n");
0322 return -EINVAL;
0323 }
0324 altmap->alloc = 0;
0325 }
0326
0327 if (check_pfn_span(pfn, nr_pages)) {
		WARN(1, "Misaligned %s start: %#lx end: %#lx\n", __func__, pfn, pfn + nr_pages - 1);
0329 return -EINVAL;
0330 }
0331
0332 for (; pfn < end_pfn; pfn += cur_nr_pages) {
0333
0334 cur_nr_pages = min(end_pfn - pfn,
0335 SECTION_ALIGN_UP(pfn + 1) - pfn);
0336 err = sparse_add_section(nid, pfn, cur_nr_pages, altmap,
0337 params->pgmap);
0338 if (err)
0339 break;
0340 cond_resched();
0341 }
0342 vmemmap_populate_print_last();
0343 return err;
0344 }
0345
0346
0347 static unsigned long find_smallest_section_pfn(int nid, struct zone *zone,
0348 unsigned long start_pfn,
0349 unsigned long end_pfn)
0350 {
0351 for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SUBSECTION) {
0352 if (unlikely(!pfn_to_online_page(start_pfn)))
0353 continue;
0354
0355 if (unlikely(pfn_to_nid(start_pfn) != nid))
0356 continue;
0357
0358 if (zone != page_zone(pfn_to_page(start_pfn)))
0359 continue;
0360
0361 return start_pfn;
0362 }
0363
0364 return 0;
0365 }
0366
0367
0368 static unsigned long find_biggest_section_pfn(int nid, struct zone *zone,
0369 unsigned long start_pfn,
0370 unsigned long end_pfn)
0371 {
0372 unsigned long pfn;
0373
0374
0375 pfn = end_pfn - 1;
0376 for (; pfn >= start_pfn; pfn -= PAGES_PER_SUBSECTION) {
0377 if (unlikely(!pfn_to_online_page(pfn)))
0378 continue;
0379
0380 if (unlikely(pfn_to_nid(pfn) != nid))
0381 continue;
0382
0383 if (zone != page_zone(pfn_to_page(pfn)))
0384 continue;
0385
0386 return pfn;
0387 }
0388
0389 return 0;
0390 }
0391
0392 static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
0393 unsigned long end_pfn)
0394 {
0395 unsigned long pfn;
0396 int nid = zone_to_nid(zone);
0397
0398 if (zone->zone_start_pfn == start_pfn) {
0399
0400
0401
0402
0403
0404
0405 pfn = find_smallest_section_pfn(nid, zone, end_pfn,
0406 zone_end_pfn(zone));
0407 if (pfn) {
0408 zone->spanned_pages = zone_end_pfn(zone) - pfn;
0409 zone->zone_start_pfn = pfn;
0410 } else {
0411 zone->zone_start_pfn = 0;
0412 zone->spanned_pages = 0;
0413 }
0414 } else if (zone_end_pfn(zone) == end_pfn) {
0415
0416
0417
0418
0419
0420
0421 pfn = find_biggest_section_pfn(nid, zone, zone->zone_start_pfn,
0422 start_pfn);
0423 if (pfn)
0424 zone->spanned_pages = pfn - zone->zone_start_pfn + 1;
0425 else {
0426 zone->zone_start_pfn = 0;
0427 zone->spanned_pages = 0;
0428 }
0429 }
0430 }
0431
0432 static void update_pgdat_span(struct pglist_data *pgdat)
0433 {
0434 unsigned long node_start_pfn = 0, node_end_pfn = 0;
0435 struct zone *zone;
0436
0437 for (zone = pgdat->node_zones;
0438 zone < pgdat->node_zones + MAX_NR_ZONES; zone++) {
0439 unsigned long end_pfn = zone_end_pfn(zone);
0440
0441
0442 if (!zone->spanned_pages)
0443 continue;
0444 if (!node_end_pfn) {
0445 node_start_pfn = zone->zone_start_pfn;
0446 node_end_pfn = end_pfn;
0447 continue;
0448 }
0449
0450 if (end_pfn > node_end_pfn)
0451 node_end_pfn = end_pfn;
0452 if (zone->zone_start_pfn < node_start_pfn)
0453 node_start_pfn = zone->zone_start_pfn;
0454 }
0455
0456 pgdat->node_start_pfn = node_start_pfn;
0457 pgdat->node_spanned_pages = node_end_pfn - node_start_pfn;
0458 }
0459
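/*
 * Undo move_pfn_range_to_zone(): poison the memmap of the range and shrink
 * the zone/node spans accordingly (except for ZONE_DEVICE, which is never
 * shrunk).
 */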
0460 void __ref remove_pfn_range_from_zone(struct zone *zone,
0461 unsigned long start_pfn,
0462 unsigned long nr_pages)
0463 {
0464 const unsigned long end_pfn = start_pfn + nr_pages;
0465 struct pglist_data *pgdat = zone->zone_pgdat;
0466 unsigned long pfn, cur_nr_pages;
0467
0468
0469 for (pfn = start_pfn; pfn < end_pfn; pfn += cur_nr_pages) {
0470 cond_resched();
0471
0472
0473 cur_nr_pages =
0474 min(end_pfn - pfn, SECTION_ALIGN_UP(pfn + 1) - pfn);
0475 page_init_poison(pfn_to_page(pfn),
0476 sizeof(struct page) * cur_nr_pages);
0477 }
0478
0479
0480
0481
0482
0483
0484 if (zone_is_zone_device(zone))
0485 return;
0486
0487 clear_zone_contiguous(zone);
0488
0489 shrink_zone_span(zone, start_pfn, start_pfn + nr_pages);
0490 update_pgdat_span(pgdat);
0491
0492 set_zone_contiguous(zone);
0493 }
0494
0495 static void __remove_section(unsigned long pfn, unsigned long nr_pages,
0496 unsigned long map_offset,
0497 struct vmem_altmap *altmap)
0498 {
0499 struct mem_section *ms = __pfn_to_section(pfn);
0500
0501 if (WARN_ON_ONCE(!valid_section(ms)))
0502 return;
0503
0504 sparse_remove_section(ms, pfn, nr_pages, map_offset, altmap);
0505 }
/**
 * __remove_pages() - remove sections of pages
 * @pfn: starting pageframe (must be aligned to start of a section)
 * @nr_pages: number of pages to remove (must be a multiple of section size)
 * @altmap: alternative device page map or %NULL if default memmap is used
 *
 * Generic helper function to remove section mappings and sysfs entries
 * for the section of the memory we are removing. Caller needs to make
 * sure that pages are marked reserved and zones are adjusted properly by
 * calling offline_pages().
 */
0518 void __remove_pages(unsigned long pfn, unsigned long nr_pages,
0519 struct vmem_altmap *altmap)
0520 {
0521 const unsigned long end_pfn = pfn + nr_pages;
0522 unsigned long cur_nr_pages;
0523 unsigned long map_offset = 0;
0524
0525 map_offset = vmem_altmap_offset(altmap);
0526
0527 if (check_pfn_span(pfn, nr_pages)) {
		WARN(1, "Misaligned %s start: %#lx end: %#lx\n", __func__, pfn, pfn + nr_pages - 1);
0529 return;
0530 }
0531
0532 for (; pfn < end_pfn; pfn += cur_nr_pages) {
0533 cond_resched();
0534
0535 cur_nr_pages = min(end_pfn - pfn,
0536 SECTION_ALIGN_UP(pfn + 1) - pfn);
0537 __remove_section(pfn, cur_nr_pages, map_offset, altmap);
0538 map_offset = 0;
0539 }
0540 }
0541
0542 int set_online_page_callback(online_page_callback_t callback)
0543 {
0544 int rc = -EINVAL;
0545
0546 get_online_mems();
0547 mutex_lock(&online_page_callback_lock);
0548
0549 if (online_page_callback == generic_online_page) {
0550 online_page_callback = callback;
0551 rc = 0;
0552 }
0553
0554 mutex_unlock(&online_page_callback_lock);
0555 put_online_mems();
0556
0557 return rc;
0558 }
0559 EXPORT_SYMBOL_GPL(set_online_page_callback);
0560
0561 int restore_online_page_callback(online_page_callback_t callback)
0562 {
0563 int rc = -EINVAL;
0564
0565 get_online_mems();
0566 mutex_lock(&online_page_callback_lock);
0567
0568 if (online_page_callback == callback) {
0569 online_page_callback = generic_online_page;
0570 rc = 0;
0571 }
0572
0573 mutex_unlock(&online_page_callback_lock);
0574 put_online_mems();
0575
0576 return rc;
0577 }
0578 EXPORT_SYMBOL_GPL(restore_online_page_callback);
0579
0580 void generic_online_page(struct page *page, unsigned int order)
0581 {
0582
0583
0584
0585
0586
0587 debug_pagealloc_map_pages(page, 1 << order);
0588 __free_pages_core(page, order);
0589 totalram_pages_add(1UL << order);
0590 }
0591 EXPORT_SYMBOL_GPL(generic_online_page);
0592
0593 static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages)
0594 {
0595 const unsigned long end_pfn = start_pfn + nr_pages;
0596 unsigned long pfn;
0597
	/*
	 * Online the pages in MAX_ORDER-1 aligned chunks. The callback might
	 * decide to not expose all pages to the buddy (e.g., expose them
	 * later). We account all pages as being online and belonging to this
	 * zone ("present").
	 * When using memmap_on_memory, the range might not be aligned to
	 * MAX_ORDER_NR_PAGES, only pageblock aligned; __ffs() detects this and
	 * the first chunk to online will then be pageblock_nr_pages.
	 */
0607 for (pfn = start_pfn; pfn < end_pfn;) {
0608 int order = min(MAX_ORDER - 1UL, __ffs(pfn));
0609
0610 (*online_page_callback)(pfn_to_page(pfn), order);
0611 pfn += (1UL << order);
0612 }
0613
0614
0615 online_mem_sections(start_pfn, end_pfn);
0616 }
0617
0618
0619 static void node_states_check_changes_online(unsigned long nr_pages,
0620 struct zone *zone, struct memory_notify *arg)
0621 {
0622 int nid = zone_to_nid(zone);
0623
0624 arg->status_change_nid = NUMA_NO_NODE;
0625 arg->status_change_nid_normal = NUMA_NO_NODE;
0626
0627 if (!node_state(nid, N_MEMORY))
0628 arg->status_change_nid = nid;
0629 if (zone_idx(zone) <= ZONE_NORMAL && !node_state(nid, N_NORMAL_MEMORY))
0630 arg->status_change_nid_normal = nid;
0631 }
0632
0633 static void node_states_set_node(int node, struct memory_notify *arg)
0634 {
0635 if (arg->status_change_nid_normal >= 0)
0636 node_set_state(node, N_NORMAL_MEMORY);
0637
0638 if (arg->status_change_nid >= 0)
0639 node_set_state(node, N_MEMORY);
0640 }
0641
0642 static void __meminit resize_zone_range(struct zone *zone, unsigned long start_pfn,
0643 unsigned long nr_pages)
0644 {
0645 unsigned long old_end_pfn = zone_end_pfn(zone);
0646
0647 if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn)
0648 zone->zone_start_pfn = start_pfn;
0649
0650 zone->spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - zone->zone_start_pfn;
0651 }
0652
0653 static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned long start_pfn,
0654 unsigned long nr_pages)
0655 {
0656 unsigned long old_end_pfn = pgdat_end_pfn(pgdat);
0657
0658 if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn)
0659 pgdat->node_start_pfn = start_pfn;
0660
0661 pgdat->node_spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - pgdat->node_start_pfn;
0662
0663 }
0664
0665 #ifdef CONFIG_ZONE_DEVICE
0666 static void section_taint_zone_device(unsigned long pfn)
0667 {
0668 struct mem_section *ms = __pfn_to_section(pfn);
0669
0670 ms->section_mem_map |= SECTION_TAINT_ZONE_DEVICE;
0671 }
0672 #else
0673 static inline void section_taint_zone_device(unsigned long pfn)
0674 {
0675 }
0676 #endif
0677
/*
 * Associate the pfn range with the given zone, initializing the memmaps
 * and resizing the pgdat/zone data to span the added pages. After this
 * call, all affected pages are PG_reserved.
 *
 * All aligned pageblocks are initialized to the specified migratetype
 * (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related
 * zone stats (e.g., nr_isolate_pageblock) are touched.
 */
0687 void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
0688 unsigned long nr_pages,
0689 struct vmem_altmap *altmap, int migratetype)
0690 {
0691 struct pglist_data *pgdat = zone->zone_pgdat;
0692 int nid = pgdat->node_id;
0693
0694 clear_zone_contiguous(zone);
0695
0696 if (zone_is_empty(zone))
0697 init_currently_empty_zone(zone, start_pfn, nr_pages);
0698 resize_zone_range(zone, start_pfn, nr_pages);
0699 resize_pgdat_range(pgdat, start_pfn, nr_pages);
0700
0701
0702
0703
0704
0705
0706
0707 if (zone_is_zone_device(zone)) {
0708 if (!IS_ALIGNED(start_pfn, PAGES_PER_SECTION))
0709 section_taint_zone_device(start_pfn);
0710 if (!IS_ALIGNED(start_pfn + nr_pages, PAGES_PER_SECTION))
0711 section_taint_zone_device(start_pfn + nr_pages);
0712 }
0713
0714
0715
0716
0717
0718
0719
0720 memmap_init_range(nr_pages, nid, zone_idx(zone), start_pfn, 0,
0721 MEMINIT_HOTPLUG, altmap, migratetype);
0722
0723 set_zone_contiguous(zone);
0724 }
0725
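/* Global/per-node memory statistics gathered for the "auto-movable" online policy. */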
0726 struct auto_movable_stats {
0727 unsigned long kernel_early_pages;
0728 unsigned long movable_pages;
0729 };
0730
0731 static void auto_movable_stats_account_zone(struct auto_movable_stats *stats,
0732 struct zone *zone)
0733 {
0734 if (zone_idx(zone) == ZONE_MOVABLE) {
0735 stats->movable_pages += zone->present_pages;
0736 } else {
0737 stats->kernel_early_pages += zone->present_early_pages;
0738 #ifdef CONFIG_CMA
0739
0740
0741
0742
0743 stats->movable_pages += zone->cma_pages;
0744 stats->kernel_early_pages -= zone->cma_pages;
0745 #endif
0746 }
0747 }
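
/* Per-memory-group statistics used by the "auto-movable" online policy. */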
0748 struct auto_movable_group_stats {
0749 unsigned long movable_pages;
0750 unsigned long req_kernel_early_pages;
0751 };
0752
0753 static int auto_movable_stats_account_group(struct memory_group *group,
0754 void *arg)
0755 {
0756 const int ratio = READ_ONCE(auto_movable_ratio);
0757 struct auto_movable_group_stats *stats = arg;
0758 long pages;
0759
0760
0761
0762
0763
0764 if (!ratio)
0765 return 0;
0766
0767
0768
0769
0770
0771 pages = group->present_movable_pages * 100 / ratio;
0772 pages -= group->present_kernel_pages;
0773
0774 if (pages > 0)
0775 stats->req_kernel_early_pages += pages;
0776 stats->movable_pages += group->present_movable_pages;
0777 return 0;
0778 }
0779
0780 static bool auto_movable_can_online_movable(int nid, struct memory_group *group,
0781 unsigned long nr_pages)
0782 {
0783 unsigned long kernel_early_pages, movable_pages;
0784 struct auto_movable_group_stats group_stats = {};
0785 struct auto_movable_stats stats = {};
0786 pg_data_t *pgdat = NODE_DATA(nid);
0787 struct zone *zone;
0788 int i;
0789
0790
0791 if (nid == NUMA_NO_NODE) {
0792
0793 for_each_populated_zone(zone)
0794 auto_movable_stats_account_zone(&stats, zone);
0795 } else {
0796 for (i = 0; i < MAX_NR_ZONES; i++) {
0797 zone = pgdat->node_zones + i;
0798 if (populated_zone(zone))
0799 auto_movable_stats_account_zone(&stats, zone);
0800 }
0801 }
0802
0803 kernel_early_pages = stats.kernel_early_pages;
0804 movable_pages = stats.movable_pages;
0805
0806
0807
0808
0809
0810
0811 walk_dynamic_memory_groups(nid, auto_movable_stats_account_group,
0812 group, &group_stats);
0813 if (kernel_early_pages <= group_stats.req_kernel_early_pages)
0814 return false;
0815 kernel_early_pages -= group_stats.req_kernel_early_pages;
0816 movable_pages -= group_stats.movable_pages;
0817
0818 if (group && group->is_dynamic)
0819 kernel_early_pages += group->present_kernel_pages;
0820
0821
0822
0823
0824
0825 movable_pages += nr_pages;
0826 return movable_pages <= (auto_movable_ratio * kernel_early_pages) / 100;
0827 }
0828
/*
 * Returns a default kernel memory zone for the given pfn range.
 * If no kernel zone covers this pfn range it will automatically go
 * to the ZONE_NORMAL.
 */
0834 static struct zone *default_kernel_zone_for_pfn(int nid, unsigned long start_pfn,
0835 unsigned long nr_pages)
0836 {
0837 struct pglist_data *pgdat = NODE_DATA(nid);
0838 int zid;
0839
0840 for (zid = 0; zid < ZONE_NORMAL; zid++) {
0841 struct zone *zone = &pgdat->node_zones[zid];
0842
0843 if (zone_intersects(zone, start_pfn, nr_pages))
0844 return zone;
0845 }
0846
0847 return &pgdat->node_zones[ZONE_NORMAL];
0848 }
0849
/*
 * Determine to which zone to online memory dynamically based on user
 * configuration and system stats ("auto-movable" online policy). We care
 * about keeping a configurable MOVABLE:KERNEL ratio (auto_movable_ratio):
 * ZONE_MOVABLE cannot hold kernel allocations, so onlining too much memory
 * MOVABLE would leave too little room for kernel data structures.
 *
 * With memory groups, the stats of the corresponding group are taken into
 * account, so that all memory belonging to a single device (e.g., a DIMM or
 * a virtio-mem device) ends up in a consistent zone:
 * - Static memory groups (e.g., a DIMM): online to ZONE_MOVABLE only if the
 *   group contains no kernel-zone memory yet.
 * - Dynamic memory groups (e.g., virtio-mem, dax/kmem): decide in unit_pages
 *   granularity, onlining a unit to ZONE_MOVABLE only if all of its
 *   already-online memory is MOVABLE and the ratio still allows it.
 *
 * When auto_movable_numa_aware is set, the ratio check is additionally
 * applied to the node receiving the memory, not only to global stats.
 */
0900 static struct zone *auto_movable_zone_for_pfn(int nid,
0901 struct memory_group *group,
0902 unsigned long pfn,
0903 unsigned long nr_pages)
0904 {
0905 unsigned long online_pages = 0, max_pages, end_pfn;
0906 struct page *page;
0907
0908 if (!auto_movable_ratio)
0909 goto kernel_zone;
0910
0911 if (group && !group->is_dynamic) {
0912 max_pages = group->s.max_pages;
0913 online_pages = group->present_movable_pages;
0914
0915
0916 if (group->present_kernel_pages)
0917 goto kernel_zone;
0918 } else if (!group || group->d.unit_pages == nr_pages) {
0919 max_pages = nr_pages;
0920 } else {
0921 max_pages = group->d.unit_pages;
0922
0923
0924
0925
0926
0927
0928 pfn = ALIGN_DOWN(pfn, group->d.unit_pages);
0929 end_pfn = pfn + group->d.unit_pages;
0930 for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
0931 page = pfn_to_online_page(pfn);
0932 if (!page)
0933 continue;
0934
0935 if (!is_zone_movable_page(page))
0936 goto kernel_zone;
0937 online_pages += PAGES_PER_SECTION;
0938 }
0939 }
0940
0941
0942
0943
0944
0945
0946 nr_pages = max_pages - online_pages;
0947 if (!auto_movable_can_online_movable(NUMA_NO_NODE, group, nr_pages))
0948 goto kernel_zone;
0949
0950 #ifdef CONFIG_NUMA
0951 if (auto_movable_numa_aware &&
0952 !auto_movable_can_online_movable(nid, group, nr_pages))
0953 goto kernel_zone;
0954 #endif
0955
0956 return &NODE_DATA(nid)->node_zones[ZONE_MOVABLE];
0957 kernel_zone:
0958 return default_kernel_zone_for_pfn(nid, pfn, nr_pages);
0959 }
0960
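/*
 * "contig-zones" policy: pick the kernel zone or ZONE_MOVABLE depending on
 * which existing zone the pfn range already intersects.
 */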
0961 static inline struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn,
0962 unsigned long nr_pages)
0963 {
0964 struct zone *kernel_zone = default_kernel_zone_for_pfn(nid, start_pfn,
0965 nr_pages);
0966 struct zone *movable_zone = &NODE_DATA(nid)->node_zones[ZONE_MOVABLE];
0967 bool in_kernel = zone_intersects(kernel_zone, start_pfn, nr_pages);
0968 bool in_movable = zone_intersects(movable_zone, start_pfn, nr_pages);
0969
0970
0971
0972
0973
0974 if (in_kernel ^ in_movable)
0975 return (in_kernel) ? kernel_zone : movable_zone;
0976
0977
0978
0979
0980
0981
0982 return movable_node_enabled ? movable_zone : kernel_zone;
0983 }
0984
0985 struct zone *zone_for_pfn_range(int online_type, int nid,
0986 struct memory_group *group, unsigned long start_pfn,
0987 unsigned long nr_pages)
0988 {
0989 if (online_type == MMOP_ONLINE_KERNEL)
0990 return default_kernel_zone_for_pfn(nid, start_pfn, nr_pages);
0991
0992 if (online_type == MMOP_ONLINE_MOVABLE)
0993 return &NODE_DATA(nid)->node_zones[ZONE_MOVABLE];
0994
0995 if (online_policy == ONLINE_POLICY_AUTO_MOVABLE)
0996 return auto_movable_zone_for_pfn(nid, group, start_pfn, nr_pages);
0997
0998 return default_zone_for_pfn(nid, start_pfn, nr_pages);
0999 }
1000
1001
1002
1003
1004
1005 void adjust_present_page_count(struct page *page, struct memory_group *group,
1006 long nr_pages)
1007 {
1008 struct zone *zone = page_zone(page);
1009 const bool movable = zone_idx(zone) == ZONE_MOVABLE;
1010
1011
1012
1013
1014
1015 if (early_section(__pfn_to_section(page_to_pfn(page))))
1016 zone->present_early_pages += nr_pages;
1017 zone->present_pages += nr_pages;
1018 zone->zone_pgdat->node_present_pages += nr_pages;
1019
1020 if (group && movable)
1021 group->present_movable_pages += nr_pages;
1022 else if (group && !movable)
1023 group->present_kernel_pages += nr_pages;
1024 }
1025
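/*
 * Initialize the vmemmap pages that are self-hosted on the hotplugged range
 * ("memmap_on_memory") and online the corresponding memory sections.
 */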
1026 int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
1027 struct zone *zone)
1028 {
1029 unsigned long end_pfn = pfn + nr_pages;
1030 int ret, i;
1031
1032 ret = kasan_add_zero_shadow(__va(PFN_PHYS(pfn)), PFN_PHYS(nr_pages));
1033 if (ret)
1034 return ret;
1035
1036 move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE);
1037
1038 for (i = 0; i < nr_pages; i++)
1039 SetPageVmemmapSelfHosted(pfn_to_page(pfn + i));
1040
1041
1042
1043
1044
1045
1046 if (nr_pages >= PAGES_PER_SECTION)
1047 online_mem_sections(pfn, ALIGN_DOWN(end_pfn, PAGES_PER_SECTION));
1048
1049 return ret;
1050 }
1051
1052 void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages)
1053 {
1054 unsigned long end_pfn = pfn + nr_pages;
1055
1056
1057
1058
1059
1060
1061 if (nr_pages >= PAGES_PER_SECTION)
1062 offline_mem_sections(pfn, ALIGN_DOWN(end_pfn, PAGES_PER_SECTION));
1063
1064
1065
1066
1067
1068 remove_pfn_range_from_zone(page_zone(pfn_to_page(pfn)), pfn, nr_pages);
1069 kasan_remove_zero_shadow(__va(PFN_PHYS(pfn)), PFN_PHYS(nr_pages));
1070 }
1071
1072 int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
1073 struct zone *zone, struct memory_group *group)
1074 {
1075 unsigned long flags;
1076 int need_zonelists_rebuild = 0;
1077 const int nid = zone_to_nid(zone);
1078 int ret;
1079 struct memory_notify arg;
1080
	/*
	 * {on,off}lining is constrained to full memory sections (or more
	 * precisely to memory blocks from the user space POV).
	 * memmap_on_memory is an exception because it reserves the initial
	 * part of the memory block for the vmemmap; that part is only
	 * pageblock aligned, hence the relaxed start alignment check below.
	 */
1088 if (WARN_ON_ONCE(!nr_pages ||
1089 !IS_ALIGNED(pfn, pageblock_nr_pages) ||
1090 !IS_ALIGNED(pfn + nr_pages, PAGES_PER_SECTION)))
1091 return -EINVAL;
1092
1093 mem_hotplug_begin();
1094
1095
1096 move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_ISOLATE);
1097
1098 arg.start_pfn = pfn;
1099 arg.nr_pages = nr_pages;
1100 node_states_check_changes_online(nr_pages, zone, &arg);
1101
1102 ret = memory_notify(MEM_GOING_ONLINE, &arg);
1103 ret = notifier_to_errno(ret);
1104 if (ret)
1105 goto failed_addition;
1106
1107
1108
1109
1110
1111 spin_lock_irqsave(&zone->lock, flags);
1112 zone->nr_isolate_pageblock += nr_pages / pageblock_nr_pages;
1113 spin_unlock_irqrestore(&zone->lock, flags);
1114
1115
1116
1117
1118
1119
1120 if (!populated_zone(zone)) {
1121 need_zonelists_rebuild = 1;
1122 setup_zone_pageset(zone);
1123 }
1124
1125 online_pages_range(pfn, nr_pages);
1126 adjust_present_page_count(pfn_to_page(pfn), group, nr_pages);
1127
1128 node_states_set_node(nid, &arg);
1129 if (need_zonelists_rebuild)
1130 build_all_zonelists(NULL);
1131
1132
1133 undo_isolate_page_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE);
1134
1135
1136
1137
1138
1139
1140
1141 shuffle_zone(zone);
1142
1143
1144 init_per_zone_wmark_min();
1145
1146 kswapd_run(nid);
1147 kcompactd_run(nid);
1148
1149 writeback_set_ratelimit();
1150
1151 memory_notify(MEM_ONLINE, &arg);
1152 mem_hotplug_done();
1153 return 0;
1154
1155 failed_addition:
1156 pr_debug("online_pages [mem %#010llx-%#010llx] failed\n",
1157 (unsigned long long) pfn << PAGE_SHIFT,
1158 (((unsigned long long) pfn + nr_pages) << PAGE_SHIFT) - 1);
1159 memory_notify(MEM_CANCEL_ONLINE, &arg);
1160 remove_pfn_range_from_zone(zone, pfn, nr_pages);
1161 mem_hotplug_done();
1162 return ret;
1163 }
1164
1165 static void reset_node_present_pages(pg_data_t *pgdat)
1166 {
1167 struct zone *z;
1168
1169 for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
1170 z->present_pages = 0;
1171
1172 pgdat->node_present_pages = 0;
1173 }
1174
1175
1176 static pg_data_t __ref *hotadd_init_pgdat(int nid)
1177 {
1178 struct pglist_data *pgdat;
1179
1180
1181
1182
1183
1184
1185
1186 pgdat = NODE_DATA(nid);
1187
1188
1189 free_area_init_core_hotplug(pgdat);
1190
1191
1192
1193
1194
1195 build_all_zonelists(pgdat);
1196
1197
1198
1199
1200
1201
1202
1203 reset_node_managed_pages(pgdat);
1204 reset_node_present_pages(pgdat);
1205
1206 return pgdat;
1207 }
1208
/*
 * __try_online_node - online a node if offlined
 * @nid: the node ID
 * @set_node_online: Whether we want to online the node
 *
 * Called by cpu_up() to online a node without onlined memory.
 *
 * Returns:
 * 1 -> a new node has been allocated
 * 0 -> the node is already online
 * -ENOMEM -> the node could not be allocated
 */
1220 static int __try_online_node(int nid, bool set_node_online)
1221 {
1222 pg_data_t *pgdat;
1223 int ret = 1;
1224
1225 if (node_online(nid))
1226 return 0;
1227
1228 pgdat = hotadd_init_pgdat(nid);
1229 if (!pgdat) {
1230 pr_err("Cannot online node %d due to NULL pgdat\n", nid);
1231 ret = -ENOMEM;
1232 goto out;
1233 }
1234
1235 if (set_node_online) {
1236 node_set_online(nid);
1237 ret = register_one_node(nid);
1238 BUG_ON(ret);
1239 }
1240 out:
1241 return ret;
1242 }
1243
1244
1245
1246
1247 int try_online_node(int nid)
1248 {
1249 int ret;
1250
1251 mem_hotplug_begin();
1252 ret = __try_online_node(nid, true);
1253 mem_hotplug_done();
1254 return ret;
1255 }
1256
1257 static int check_hotplug_memory_range(u64 start, u64 size)
1258 {
1259
1260 if (!size || !IS_ALIGNED(start, memory_block_size_bytes()) ||
1261 !IS_ALIGNED(size, memory_block_size_bytes())) {
1262 pr_err("Block size [%#lx] unaligned hotplug range: start %#llx, size %#llx",
1263 memory_block_size_bytes(), start, size);
1264 return -EINVAL;
1265 }
1266
1267 return 0;
1268 }
1269
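/* walk_memory_blocks() callback: online a block using the default online type. */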
1270 static int online_memory_block(struct memory_block *mem, void *arg)
1271 {
1272 mem->online_type = mhp_default_online_type;
1273 return device_online(&mem->dev);
1274 }
1275
1276 bool mhp_supports_memmap_on_memory(unsigned long size)
1277 {
1278 unsigned long nr_vmemmap_pages = size / PAGE_SIZE;
1279 unsigned long vmemmap_size = nr_vmemmap_pages * sizeof(struct page);
1280 unsigned long remaining_size = size - vmemmap_size;
1281
	/*
	 * Besides having arch support and the feature enabled at runtime, we
	 * need a few more assumptions to hold true:
	 *
	 * a) The range spans a single memory block: memory onlining/offlining
	 *    happens in memory block granularity. We don't want the vmemmap of
	 *    online memory blocks to reside on offline memory blocks.
	 *
	 * b) The vmemmap pages span complete PMDs: We don't want vmemmap code
	 *    to populate memory from the altmap for unrelated parts (i.e.,
	 *    other memory blocks).
	 *
	 * c) The vmemmap pages (and thereby the pages that will be exposed to
	 *    the buddy) have to cover full pageblocks: memory onlining/offlining
	 *    code requires applicable ranges to be pageblock-aligned, for
	 *    example, to set the migratetypes properly.
	 */
1308 return mhp_memmap_on_memory() &&
1309 size == memory_block_size_bytes() &&
1310 IS_ALIGNED(vmemmap_size, PMD_SIZE) &&
1311 IS_ALIGNED(remaining_size, (pageblock_nr_pages << PAGE_SHIFT));
1312 }
1313
/*
 * NOTE: The caller must call lock_device_hotplug() to serialize hotplug
 * and online/offline operations (triggered e.g. by sysfs).
 *
 * We are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG.
 */
1320 int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
1321 {
1322 struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) };
1323 enum memblock_flags memblock_flags = MEMBLOCK_NONE;
1324 struct vmem_altmap mhp_altmap = {};
1325 struct memory_group *group = NULL;
1326 u64 start, size;
1327 bool new_node = false;
1328 int ret;
1329
1330 start = res->start;
1331 size = resource_size(res);
1332
1333 ret = check_hotplug_memory_range(start, size);
1334 if (ret)
1335 return ret;
1336
1337 if (mhp_flags & MHP_NID_IS_MGID) {
1338 group = memory_group_find_by_id(nid);
1339 if (!group)
1340 return -EINVAL;
1341 nid = group->nid;
1342 }
1343
1344 if (!node_possible(nid)) {
1345 WARN(1, "node %d was absent from the node_possible_map\n", nid);
1346 return -EINVAL;
1347 }
1348
1349 mem_hotplug_begin();
1350
1351 if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) {
1352 if (res->flags & IORESOURCE_SYSRAM_DRIVER_MANAGED)
1353 memblock_flags = MEMBLOCK_DRIVER_MANAGED;
1354 ret = memblock_add_node(start, size, nid, memblock_flags);
1355 if (ret)
1356 goto error_mem_hotplug_end;
1357 }
1358
1359 ret = __try_online_node(nid, false);
1360 if (ret < 0)
1361 goto error;
1362 new_node = ret;
1363
1364
1365
1366
1367 if (mhp_flags & MHP_MEMMAP_ON_MEMORY) {
1368 if (!mhp_supports_memmap_on_memory(size)) {
1369 ret = -EINVAL;
1370 goto error;
1371 }
1372 mhp_altmap.free = PHYS_PFN(size);
1373 mhp_altmap.base_pfn = PHYS_PFN(start);
1374 params.altmap = &mhp_altmap;
1375 }
1376
1377
	ret = arch_add_memory(nid, start, size, &params);
1379 if (ret < 0)
1380 goto error;
1381
1382
1383 ret = create_memory_block_devices(start, size, mhp_altmap.alloc,
1384 group);
1385 if (ret) {
1386 arch_remove_memory(start, size, NULL);
1387 goto error;
1388 }
1389
1390 if (new_node) {
1391
1392
1393
1394
1395
1396 node_set_online(nid);
1397 ret = __register_one_node(nid);
1398 BUG_ON(ret);
1399 }
1400
1401 register_memory_blocks_under_node(nid, PFN_DOWN(start),
1402 PFN_UP(start + size - 1),
1403 MEMINIT_HOTPLUG);
1404
1405
1406 if (!strcmp(res->name, "System RAM"))
1407 firmware_map_add_hotplug(start, start + size, "System RAM");
1408
1409
1410 mem_hotplug_done();
1411
1412
1413
1414
1415
1416 if (mhp_flags & MHP_MERGE_RESOURCE)
1417 merge_system_ram_resource(res);
1418
1419
1420 if (mhp_default_online_type != MMOP_OFFLINE)
1421 walk_memory_blocks(start, size, NULL, online_memory_block);
1422
1423 return ret;
1424 error:
1425 if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK))
1426 memblock_remove(start, size);
1427 error_mem_hotplug_end:
1428 mem_hotplug_done();
1429 return ret;
1430 }
1431
1432
1433 int __ref __add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags)
1434 {
1435 struct resource *res;
1436 int ret;
1437
1438 res = register_memory_resource(start, size, "System RAM");
1439 if (IS_ERR(res))
1440 return PTR_ERR(res);
1441
1442 ret = add_memory_resource(nid, res, mhp_flags);
1443 if (ret < 0)
1444 release_memory_resource(res);
1445 return ret;
1446 }
1447
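/* Like __add_memory(), but takes the device hotplug lock itself. */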
1448 int add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags)
1449 {
1450 int rc;
1451
1452 lock_device_hotplug();
1453 rc = __add_memory(nid, start, size, mhp_flags);
1454 unlock_device_hotplug();
1455
1456 return rc;
1457 }
1458 EXPORT_SYMBOL_GPL(add_memory);
1459
/*
 * Add special, driver-managed memory to the system as system RAM. Such
 * memory is not exposed via the raw firmware-provided memmap as system
 * RAM, instead, it is detected and added by a driver - during cold boot,
 * after a reboot, and after kexec.
 *
 * Reasons why this memory should not be used for the initial memmap of a
 * kexec kernel or for placing kexec images:
 * - The booting kernel is in charge of determining how this memory will be
 *   used (e.g., use persistent memory as system RAM).
 * - Coordination with a hibernation image in such memory can't be done.
 *
 * For this memory, no entries in /sys/firmware/memmap ("raw firmware-provided
 * memory map") are created. Also, the created memory resource is flagged
 * with IORESOURCE_SYSRAM_DRIVER_MANAGED, so in-kernel users can special-case
 * this memory as well (e.g., not place kexec images onto it).
 *
 * The resource_name (visible via /proc/iomem) has to have the format
 * "System RAM ($DRIVER)".
 */
1481 int add_memory_driver_managed(int nid, u64 start, u64 size,
1482 const char *resource_name, mhp_t mhp_flags)
1483 {
1484 struct resource *res;
1485 int rc;
1486
1487 if (!resource_name ||
1488 strstr(resource_name, "System RAM (") != resource_name ||
1489 resource_name[strlen(resource_name) - 1] != ')')
1490 return -EINVAL;
1491
1492 lock_device_hotplug();
1493
1494 res = register_memory_resource(start, size, resource_name);
1495 if (IS_ERR(res)) {
1496 rc = PTR_ERR(res);
1497 goto out_unlock;
1498 }
1499
1500 rc = add_memory_resource(nid, res, mhp_flags);
1501 if (rc < 0)
1502 release_memory_resource(res);
1503
1504 out_unlock:
1505 unlock_device_hotplug();
1506 return rc;
1507 }
1508 EXPORT_SYMBOL_GPL(add_memory_driver_managed);
1509
/*
 * Platforms should define arch_get_mappable_range() to provide the maximum
 * possible addressable physical memory range for which a linear mapping can
 * be created. The platform-returned address range must obey the following
 * semantics:
 *
 * - range.start <= range.end
 * - The range includes both end points [range.start..range.end]
 *
 * This is the fallback definition, allowing the entire possible physical
 * address range in case a platform does not define its own
 * arch_get_mappable_range().
 */
1523 struct range __weak arch_get_mappable_range(void)
1524 {
1525 struct range mhp_range = {
1526 .start = 0UL,
1527 .end = -1ULL,
1528 };
1529 return mhp_range;
1530 }
1531
1532 struct range mhp_get_pluggable_range(bool need_mapping)
1533 {
1534 const u64 max_phys = (1ULL << MAX_PHYSMEM_BITS) - 1;
1535 struct range mhp_range;
1536
1537 if (need_mapping) {
1538 mhp_range = arch_get_mappable_range();
1539 if (mhp_range.start > max_phys) {
1540 mhp_range.start = 0;
1541 mhp_range.end = 0;
1542 }
1543 mhp_range.end = min_t(u64, mhp_range.end, max_phys);
1544 } else {
1545 mhp_range.start = 0;
1546 mhp_range.end = max_phys;
1547 }
1548 return mhp_range;
1549 }
1550 EXPORT_SYMBOL_GPL(mhp_get_pluggable_range);
1551
1552 bool mhp_range_allowed(u64 start, u64 size, bool need_mapping)
1553 {
1554 struct range mhp_range = mhp_get_pluggable_range(need_mapping);
1555 u64 end = start + size;
1556
1557 if (start < end && start >= mhp_range.start && (end - 1) <= mhp_range.end)
1558 return true;
1559
1560 pr_warn("Hotplug memory [%#llx-%#llx] exceeds maximum addressable range [%#llx-%#llx]\n",
1561 start, end, mhp_range.start, mhp_range.end);
1562 return false;
1563 }
1564
1565 #ifdef CONFIG_MEMORY_HOTREMOVE
/*
 * Scan pfn range [start,end) to find movable/migratable pages (LRU pages,
 * non-lru movable pages and hugepages). Will skip over most unmovable
 * pages (esp., pages that can be skipped when offlining), but bail out on
 * definitely unmovable pages.
 *
 * Returns:
 *	0 in case a movable page is found and movable_pfn was updated.
 *	-ENOENT in case no movable page was found.
 *	-EBUSY in case a definitely unmovable page was found.
 */
1577 static int scan_movable_pages(unsigned long start, unsigned long end,
1578 unsigned long *movable_pfn)
1579 {
1580 unsigned long pfn;
1581
1582 for (pfn = start; pfn < end; pfn++) {
1583 struct page *page, *head;
1584 unsigned long skip;
1585
1586 if (!pfn_valid(pfn))
1587 continue;
1588 page = pfn_to_page(pfn);
1589 if (PageLRU(page))
1590 goto found;
1591 if (__PageMovable(page))
1592 goto found;
1593
1594
1595
1596
1597
1598
1599
1600 if (PageOffline(page) && page_count(page))
1601 return -EBUSY;
1602
1603 if (!PageHuge(page))
1604 continue;
1605 head = compound_head(page);
1606
1607
1608
1609
1610
1611
1612
1613 if (HPageMigratable(head))
1614 goto found;
1615 skip = compound_nr(head) - (page - head);
1616 pfn += skip - 1;
1617 }
1618 return -ENOENT;
1619 found:
1620 *movable_pfn = pfn;
1621 return 0;
1622 }
1623
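/*
 * Isolate and migrate all movable pages (LRU, non-LRU movable, hugetlb) in
 * [start_pfn, end_pfn) away from the range being offlined.
 */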
1624 static int
1625 do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
1626 {
1627 unsigned long pfn;
1628 struct page *page, *head;
1629 int ret = 0;
1630 LIST_HEAD(source);
1631 static DEFINE_RATELIMIT_STATE(migrate_rs, DEFAULT_RATELIMIT_INTERVAL,
1632 DEFAULT_RATELIMIT_BURST);
1633
1634 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
1635 struct folio *folio;
1636
1637 if (!pfn_valid(pfn))
1638 continue;
1639 page = pfn_to_page(pfn);
1640 folio = page_folio(page);
1641 head = &folio->page;
1642
1643 if (PageHuge(page)) {
1644 pfn = page_to_pfn(head) + compound_nr(head) - 1;
1645 isolate_hugetlb(head, &source);
1646 continue;
1647 } else if (PageTransHuge(page))
1648 pfn = page_to_pfn(head) + thp_nr_pages(page) - 1;
1649
1650
1651
1652
1653
1654
1655
1656
1657 if (PageHWPoison(page)) {
1658 if (WARN_ON(folio_test_lru(folio)))
1659 folio_isolate_lru(folio);
1660 if (folio_mapped(folio))
1661 try_to_unmap(folio, TTU_IGNORE_MLOCK);
1662 continue;
1663 }
1664
1665 if (!get_page_unless_zero(page))
1666 continue;
1667
1668
1669
1670
1671 if (PageLRU(page))
1672 ret = isolate_lru_page(page);
1673 else
1674 ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE);
1675 if (!ret) {
1676 list_add_tail(&page->lru, &source);
1677 if (!__PageMovable(page))
1678 inc_node_page_state(page, NR_ISOLATED_ANON +
1679 page_is_file_lru(page));
1680
1681 } else {
1682 if (__ratelimit(&migrate_rs)) {
1683 pr_warn("failed to isolate pfn %lx\n", pfn);
1684 dump_page(page, "isolation failed");
1685 }
1686 }
1687 put_page(page);
1688 }
1689 if (!list_empty(&source)) {
1690 nodemask_t nmask = node_states[N_MEMORY];
1691 struct migration_target_control mtc = {
1692 .nmask = &nmask,
1693 .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
1694 };
1695
1696
1697
1698
1699
1700 mtc.nid = page_to_nid(list_first_entry(&source, struct page, lru));
1701
1702
1703
1704
1705
1706
1707 node_clear(mtc.nid, nmask);
1708 if (nodes_empty(nmask))
1709 node_set(mtc.nid, nmask);
1710 ret = migrate_pages(&source, alloc_migration_target, NULL,
1711 (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_HOTPLUG, NULL);
1712 if (ret) {
1713 list_for_each_entry(page, &source, lru) {
1714 if (__ratelimit(&migrate_rs)) {
1715 pr_warn("migrating pfn %lx failed ret:%d\n",
1716 page_to_pfn(page), ret);
1717 dump_page(page, "migration failure");
1718 }
1719 }
1720 putback_movable_pages(&source);
1721 }
1722 }
1723
1724 return ret;
1725 }
1726
1727 static int __init cmdline_parse_movable_node(char *p)
1728 {
1729 movable_node_enabled = true;
1730 return 0;
1731 }
1732 early_param("movable_node", cmdline_parse_movable_node);
1733
1734
1735 static void node_states_check_changes_offline(unsigned long nr_pages,
1736 struct zone *zone, struct memory_notify *arg)
1737 {
1738 struct pglist_data *pgdat = zone->zone_pgdat;
1739 unsigned long present_pages = 0;
1740 enum zone_type zt;
1741
1742 arg->status_change_nid = NUMA_NO_NODE;
1743 arg->status_change_nid_normal = NUMA_NO_NODE;
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753 for (zt = 0; zt <= ZONE_NORMAL; zt++)
1754 present_pages += pgdat->node_zones[zt].present_pages;
1755 if (zone_idx(zone) <= ZONE_NORMAL && nr_pages >= present_pages)
1756 arg->status_change_nid_normal = zone_to_nid(zone);
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767 present_pages += pgdat->node_zones[ZONE_MOVABLE].present_pages;
1768
1769 if (nr_pages >= present_pages)
1770 arg->status_change_nid = zone_to_nid(zone);
1771 }
1772
1773 static void node_states_clear_node(int node, struct memory_notify *arg)
1774 {
1775 if (arg->status_change_nid_normal >= 0)
1776 node_clear_state(node, N_NORMAL_MEMORY);
1777
1778 if (arg->status_change_nid >= 0)
1779 node_clear_state(node, N_MEMORY);
1780 }
1781
1782 static int count_system_ram_pages_cb(unsigned long start_pfn,
1783 unsigned long nr_pages, void *data)
1784 {
1785 unsigned long *nr_system_ram_pages = data;
1786
1787 *nr_system_ram_pages += nr_pages;
1788 return 0;
1789 }
1790
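/*
 * Take nr_pages starting at start_pfn out of service. The range must lie
 * within a single zone, contain no memory holes, and span complete memory
 * blocks (pageblock-aligned start, section-aligned end).
 */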
1791 int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
1792 struct zone *zone, struct memory_group *group)
1793 {
1794 const unsigned long end_pfn = start_pfn + nr_pages;
1795 unsigned long pfn, system_ram_pages = 0;
1796 const int node = zone_to_nid(zone);
1797 unsigned long flags;
1798 struct memory_notify arg;
1799 char *reason;
1800 int ret;
1801
1802
1803
1804
1805
1806
1807
1808
1809 if (WARN_ON_ONCE(!nr_pages ||
1810 !IS_ALIGNED(start_pfn, pageblock_nr_pages) ||
1811 !IS_ALIGNED(start_pfn + nr_pages, PAGES_PER_SECTION)))
1812 return -EINVAL;
1813
1814 mem_hotplug_begin();
1815
	/*
	 * Don't allow offlining memory blocks that contain holes.
	 * Consequently, memory blocks with holes can never get onlined
	 * via the hotplug path - online_pages() - as hotplugged memory has
	 * no holes. This way we, e.g., don't have to worry about marking
	 * memory holes PG_reserved, don't need pfn_valid() checks, and can
	 * avoid using walk_system_ram_range() later.
	 */
1824 walk_system_ram_range(start_pfn, nr_pages, &system_ram_pages,
1825 count_system_ram_pages_cb);
1826 if (system_ram_pages != nr_pages) {
1827 ret = -EINVAL;
1828 reason = "memory holes";
1829 goto failed_removal;
1830 }
1831
1832
1833
1834
1835
1836
1837 if (WARN_ON_ONCE(page_zone(pfn_to_page(start_pfn)) != zone ||
1838 page_zone(pfn_to_page(end_pfn - 1)) != zone)) {
1839 ret = -EINVAL;
1840 reason = "multizone range";
1841 goto failed_removal;
1842 }
1843
1844
1845
1846
1847
1848 zone_pcp_disable(zone);
1849 lru_cache_disable();
1850
1851
1852 ret = start_isolate_page_range(start_pfn, end_pfn,
1853 MIGRATE_MOVABLE,
1854 MEMORY_OFFLINE | REPORT_FAILURE,
1855 GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL);
1856 if (ret) {
1857 reason = "failure to isolate range";
1858 goto failed_removal_pcplists_disabled;
1859 }
1860
1861 arg.start_pfn = start_pfn;
1862 arg.nr_pages = nr_pages;
1863 node_states_check_changes_offline(nr_pages, zone, &arg);
1864
1865 ret = memory_notify(MEM_GOING_OFFLINE, &arg);
1866 ret = notifier_to_errno(ret);
1867 if (ret) {
1868 reason = "notifier failure";
1869 goto failed_removal_isolated;
1870 }
1871
1872 do {
1873 pfn = start_pfn;
1874 do {
1875 if (signal_pending(current)) {
1876 ret = -EINTR;
1877 reason = "signal backoff";
1878 goto failed_removal_isolated;
1879 }
1880
1881 cond_resched();
1882
1883 ret = scan_movable_pages(pfn, end_pfn, &pfn);
1884 if (!ret) {
1885
1886
1887
1888
1889 do_migrate_range(pfn, end_pfn);
1890 }
1891 } while (!ret);
1892
1893 if (ret != -ENOENT) {
1894 reason = "unmovable page";
1895 goto failed_removal_isolated;
1896 }
1897
1898
1899
1900
1901
1902
1903 ret = dissolve_free_huge_pages(start_pfn, end_pfn);
1904 if (ret) {
1905 reason = "failure to dissolve huge pages";
1906 goto failed_removal_isolated;
1907 }
1908
1909 ret = test_pages_isolated(start_pfn, end_pfn, MEMORY_OFFLINE);
1910
1911 } while (ret);
1912
1913
1914 __offline_isolated_pages(start_pfn, end_pfn);
1915 pr_debug("Offlined Pages %ld\n", nr_pages);
1916
1917
1918
1919
1920
1921
1922 spin_lock_irqsave(&zone->lock, flags);
1923 zone->nr_isolate_pageblock -= nr_pages / pageblock_nr_pages;
1924 spin_unlock_irqrestore(&zone->lock, flags);
1925
1926 lru_cache_enable();
1927 zone_pcp_enable(zone);
1928
1929
1930 adjust_managed_page_count(pfn_to_page(start_pfn), -nr_pages);
1931 adjust_present_page_count(pfn_to_page(start_pfn), group, -nr_pages);
1932
1933
1934 init_per_zone_wmark_min();
1935
1936 if (!populated_zone(zone)) {
1937 zone_pcp_reset(zone);
1938 build_all_zonelists(NULL);
1939 }
1940
1941 node_states_clear_node(node, &arg);
1942 if (arg.status_change_nid >= 0) {
1943 kswapd_stop(node);
1944 kcompactd_stop(node);
1945 }
1946
1947 writeback_set_ratelimit();
1948
1949 memory_notify(MEM_OFFLINE, &arg);
1950 remove_pfn_range_from_zone(zone, start_pfn, nr_pages);
1951 mem_hotplug_done();
1952 return 0;
1953
1954 failed_removal_isolated:
1955
1956 undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
1957 memory_notify(MEM_CANCEL_OFFLINE, &arg);
1958 failed_removal_pcplists_disabled:
1959 lru_cache_enable();
1960 zone_pcp_enable(zone);
1961 failed_removal:
1962 pr_debug("memory offlining [mem %#010llx-%#010llx] failed due to %s\n",
1963 (unsigned long long) start_pfn << PAGE_SHIFT,
1964 ((unsigned long long) end_pfn << PAGE_SHIFT) - 1,
1965 reason);
1966 mem_hotplug_done();
1967 return ret;
1968 }
1969
1970 static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
1971 {
1972 int ret = !is_memblock_offlined(mem);
1973 int *nid = arg;
1974
1975 *nid = mem->nid;
1976 if (unlikely(ret)) {
1977 phys_addr_t beginpa, endpa;
1978
1979 beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr));
1980 endpa = beginpa + memory_block_size_bytes() - 1;
1981 pr_warn("removing memory fails, because memory [%pa-%pa] is onlined\n",
1982 &beginpa, &endpa);
1983
1984 return -EBUSY;
1985 }
1986 return 0;
1987 }
1988
1989 static int get_nr_vmemmap_pages_cb(struct memory_block *mem, void *arg)
1990 {
1991
1992
1993
1994 return mem->nr_vmemmap_pages;
1995 }
1996
1997 static int check_cpu_on_node(int nid)
1998 {
1999 int cpu;
2000
2001 for_each_present_cpu(cpu) {
2002 if (cpu_to_node(cpu) == nid)
2003
2004
2005
2006
2007 return -EBUSY;
2008 }
2009
2010 return 0;
2011 }
2012
2013 static int check_no_memblock_for_node_cb(struct memory_block *mem, void *arg)
2014 {
2015 int nid = *(int *)arg;
2016
2017
2018
2019
2020
2021
2022 return mem->nid == nid ? -EEXIST : 0;
2023 }
2024
/**
 * try_offline_node
 * @nid: the nid the node is on
 *
 * Offline a node if all memory sections and cpus of the node are removed.
 *
 * NOTE: The caller must call lock_device_hotplug() to serialize hotplug
 * and online/offline operations before this call.
 */
2034 void try_offline_node(int nid)
2035 {
2036 int rc;
2037
2038
2039
2040
2041
2042
2043 if (node_spanned_pages(nid))
2044 return;
2045
2046
2047
2048
2049
2050
2051 rc = for_each_memory_block(&nid, check_no_memblock_for_node_cb);
2052 if (rc)
2053 return;
2054
2055 if (check_cpu_on_node(nid))
2056 return;
2057
2058
2059
2060
2061
2062 node_set_offline(nid);
2063 unregister_one_node(nid);
2064 }
2065 EXPORT_SYMBOL(try_offline_node);
2066
2067 static int __ref try_remove_memory(u64 start, u64 size)
2068 {
2069 struct vmem_altmap mhp_altmap = {};
2070 struct vmem_altmap *altmap = NULL;
2071 unsigned long nr_vmemmap_pages;
2072 int rc = 0, nid = NUMA_NO_NODE;
2073
2074 BUG_ON(check_hotplug_memory_range(start, size));
2075
	/*
	 * All memory blocks must be offlined before removing memory. Check
	 * whether all memory blocks in question are offline and return an
	 * error if this is not the case.
	 *
	 * While at it, determine the nid. Note that if we'd have mixed nodes,
	 * we'd only try to offline the last determined one -- which is good
	 * enough for the cases we care about.
	 */
2085 rc = walk_memory_blocks(start, size, &nid, check_memblock_offlined_cb);
2086 if (rc)
2087 return rc;
2088
2089
2090
2091
2092
2093 if (mhp_memmap_on_memory()) {
2094 nr_vmemmap_pages = walk_memory_blocks(start, size, NULL,
2095 get_nr_vmemmap_pages_cb);
2096 if (nr_vmemmap_pages) {
2097 if (size != memory_block_size_bytes()) {
2098 pr_warn("Refuse to remove %#llx - %#llx,"
2099 "wrong granularity\n",
2100 start, start + size);
2101 return -EINVAL;
2102 }
2103
2104
2105
2106
2107
2108
2109 mhp_altmap.alloc = nr_vmemmap_pages;
2110 altmap = &mhp_altmap;
2111 }
2112 }
2113
2114
2115 firmware_map_remove(start, start + size, "System RAM");
2116
2117
2118
2119
2120
2121 remove_memory_block_devices(start, size);
2122
2123 mem_hotplug_begin();
2124
2125 arch_remove_memory(start, size, altmap);
2126
2127 if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) {
2128 memblock_phys_free(start, size);
2129 memblock_remove(start, size);
2130 }
2131
2132 release_mem_region_adjustable(start, size);
2133
2134 if (nid != NUMA_NO_NODE)
2135 try_offline_node(nid);
2136
2137 mem_hotplug_done();
2138 return 0;
2139 }
2140
/**
 * __remove_memory - Remove memory if every memory block is offline
 * @start: physical address of the region to remove
 * @size: size of the region to remove
 *
 * NOTE: The caller must call lock_device_hotplug() to serialize hotplug
 * and online/offline operations before this call, as required by
 * try_offline_node().
 */
2150 void __remove_memory(u64 start, u64 size)
2151 {
2152
2153
2154
2155
2156
2157 if (try_remove_memory(start, size))
2158 BUG();
2159 }
2160
/*
 * Remove memory if every memory block is offline, otherwise return -EBUSY.
 * Unlike __remove_memory(), this takes the device hotplug lock itself.
 */
2165 int remove_memory(u64 start, u64 size)
2166 {
2167 int rc;
2168
2169 lock_device_hotplug();
2170 rc = try_remove_memory(start, size);
2171 unlock_device_hotplug();
2172
2173 return rc;
2174 }
2175 EXPORT_SYMBOL_GPL(remove_memory);
2176
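/*
 * Offline a memory block, remembering the zone-derived online type so that a
 * later rollback via try_reonline_memory_block() can restore it.
 */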
2177 static int try_offline_memory_block(struct memory_block *mem, void *arg)
2178 {
2179 uint8_t online_type = MMOP_ONLINE_KERNEL;
2180 uint8_t **online_types = arg;
2181 struct page *page;
2182 int rc;
2183
2184
2185
2186
2187
2188
2189 page = pfn_to_online_page(section_nr_to_pfn(mem->start_section_nr));
2190 if (page && zone_idx(page_zone(page)) == ZONE_MOVABLE)
2191 online_type = MMOP_ONLINE_MOVABLE;
2192
2193 rc = device_offline(&mem->dev);
2194
2195
2196
2197
2198 if (!rc)
2199 **online_types = online_type;
2200
2201 (*online_types)++;
2202
2203 return rc < 0 ? rc : 0;
2204 }
2205
2206 static int try_reonline_memory_block(struct memory_block *mem, void *arg)
2207 {
2208 uint8_t **online_types = arg;
2209 int rc;
2210
2211 if (**online_types != MMOP_OFFLINE) {
2212 mem->online_type = **online_types;
2213 rc = device_online(&mem->dev);
2214 if (rc < 0)
2215 pr_warn("%s: Failed to re-online memory: %d",
2216 __func__, rc);
2217 }
2218
2219
2220 (*online_types)++;
2221 return 0;
2222 }
2223
/*
 * Try to offline and remove memory. Might take a long time to finish in case
 * memory is still in use. Primarily useful for memory devices that logically
 * unplugged all memory (so it's no longer in use) and want to offline + remove
 * that memory.
 */
2230 int offline_and_remove_memory(u64 start, u64 size)
2231 {
2232 const unsigned long mb_count = size / memory_block_size_bytes();
2233 uint8_t *online_types, *tmp;
2234 int rc;
2235
2236 if (!IS_ALIGNED(start, memory_block_size_bytes()) ||
2237 !IS_ALIGNED(size, memory_block_size_bytes()) || !size)
2238 return -EINVAL;
2239
2240
2241
2242
2243
2244
2245 online_types = kmalloc_array(mb_count, sizeof(*online_types),
2246 GFP_KERNEL);
2247 if (!online_types)
2248 return -ENOMEM;
2249
2250
2251
2252
2253
2254 memset(online_types, MMOP_OFFLINE, mb_count);
2255
2256 lock_device_hotplug();
2257
2258 tmp = online_types;
2259 rc = walk_memory_blocks(start, size, &tmp, try_offline_memory_block);
2260
2261
2262
2263
2264
2265 if (!rc) {
2266 rc = try_remove_memory(start, size);
2267 if (rc)
2268 pr_err("%s: Failed to remove memory: %d", __func__, rc);
2269 }
2270
2271
2272
2273
2274
2275 if (rc) {
2276 tmp = online_types;
2277 walk_memory_blocks(start, size, &tmp,
2278 try_reonline_memory_block);
2279 }
2280 unlock_device_hotplug();
2281
2282 kfree(online_types);
2283 return rc;
2284 }
2285 EXPORT_SYMBOL_GPL(offline_and_remove_memory);
2286 #endif