// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/mm/vmscan.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  Swap reorganised 29.12.95, Stephen Tweedie.
 *  kswapd added: 7.1.96  sct
 *  Removed kswapd_ctl limits, and swap out as many pages as needed
 *  to bring the system back to freepages.high: 2.4.97, Rik van Riel.
 *  Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
 *  Multiqueue VM started 5.8.00, Linus Torvalds.
 */
0013 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
0014
0015 #include <linux/mm.h>
0016 #include <linux/sched/mm.h>
0017 #include <linux/module.h>
0018 #include <linux/gfp.h>
0019 #include <linux/kernel_stat.h>
0020 #include <linux/swap.h>
0021 #include <linux/pagemap.h>
0022 #include <linux/init.h>
0023 #include <linux/highmem.h>
0024 #include <linux/vmpressure.h>
0025 #include <linux/vmstat.h>
0026 #include <linux/file.h>
0027 #include <linux/writeback.h>
0028 #include <linux/blkdev.h>
0029 #include <linux/buffer_head.h> /* for buffer_heads_over_limit */
0030 #include <linux/mm_inline.h>
0031 #include <linux/backing-dev.h>
0032 #include <linux/rmap.h>
0033 #include <linux/topology.h>
0034 #include <linux/cpu.h>
0035 #include <linux/cpuset.h>
0036 #include <linux/compaction.h>
0037 #include <linux/notifier.h>
0038 #include <linux/rwsem.h>
0039 #include <linux/delay.h>
0040 #include <linux/kthread.h>
0041 #include <linux/freezer.h>
0042 #include <linux/memcontrol.h>
0043 #include <linux/migrate.h>
0044 #include <linux/delayacct.h>
0045 #include <linux/sysctl.h>
0046 #include <linux/oom.h>
0047 #include <linux/pagevec.h>
0048 #include <linux/prefetch.h>
0049 #include <linux/printk.h>
0050 #include <linux/dax.h>
0051 #include <linux/psi.h>
0052
0053 #include <asm/tlbflush.h>
0054 #include <asm/div64.h>
0055
0056 #include <linux/swapops.h>
0057 #include <linux/balloon_compaction.h>
0058 #include <linux/sched/sysctl.h>
0059
0060 #include "internal.h"
0061 #include "swap.h"
0062
0063 #define CREATE_TRACE_POINTS
0064 #include <trace/events/vmscan.h>
0065
0066 struct scan_control {
/* How many pages shrink_list() should reclaim */
0068 unsigned long nr_to_reclaim;

/*
 * Nodemask of nodes allowed by the caller. If NULL, all nodes
 * are scanned.
 */
0074 nodemask_t *nodemask;

/*
 * The memory cgroup that hit its limit and as a result is the
 * primary target of this reclaim invocation.
 */
0080 struct mem_cgroup *target_mem_cgroup;

/*
 * Scan pressure balancing between anon and file LRUs
 */
0085 unsigned long anon_cost;
0086 unsigned long file_cost;

/* Can active folios be deactivated as part of reclaim? */
0089 #define DEACTIVATE_ANON 1
0090 #define DEACTIVATE_FILE 2
0091 unsigned int may_deactivate:2;
0092 unsigned int force_deactivate:1;
0093 unsigned int skipped_deactivate:1;

/* Writepage batching in laptop mode; RECLAIM_WRITE */
0096 unsigned int may_writepage:1;

/* Can mapped folios be reclaimed? */
0099 unsigned int may_unmap:1;

/* Can folios be swapped as part of reclaim? */
0102 unsigned int may_swap:1;

/* Proactive reclaim invoked by userspace through memory.reclaim */
0105 unsigned int proactive:1;

/*
 * Cgroup memory below memory.low is protected as long as we
 * don't threaten to OOM. If any cgroup is reclaimed at
 * reduced force or passed over entirely due to its memory.low
 * setting (memcg_low_skipped), and nothing is reclaimed as a
 * result, then go back for one more cycle that reclaims the
 * protected memory (memcg_low_reclaim) to avert OOM.
 */
0115 unsigned int memcg_low_reclaim:1;
0116 unsigned int memcg_low_skipped:1;
0117
0118 unsigned int hibernation_mode:1;

/* One of the zones is ready for compaction */
0121 unsigned int compaction_ready:1;

/* There is easily reclaimable cold cache in the current node */
0124 unsigned int cache_trim_mode:1;

/* The file folios on the current node are dangerously low */
0127 unsigned int file_is_tiny:1;

/* Always discard instead of demoting to lower tier memory */
0130 unsigned int no_demotion:1;

/* Allocation order */
0133 s8 order;

/* Scan (total_size >> priority) pages at once */
0136 s8 priority;

/* The highest zone to isolate folios for reclaim from */
0139 s8 reclaim_idx;

/* This context's GFP mask */
0142 gfp_t gfp_mask;

/* Incremented by the number of inactive pages that were scanned */
0145 unsigned long nr_scanned;

/* Number of pages freed so far during a call to shrink_zones() */
0148 unsigned long nr_reclaimed;
0149
0150 struct {
0151 unsigned int dirty;
0152 unsigned int unqueued_dirty;
0153 unsigned int congested;
0154 unsigned int writeback;
0155 unsigned int immediate;
0156 unsigned int file_taken;
0157 unsigned int taken;
0158 } nr;

/* for recording the reclaimed slab by now */
0161 struct reclaim_state reclaim_state;
0162 };
0163
0164 #ifdef ARCH_HAS_PREFETCHW
0165 #define prefetchw_prev_lru_folio(_folio, _base, _field) \
0166 do { \
0167 if ((_folio)->lru.prev != _base) { \
0168 struct folio *prev; \
0169 \
0170 prev = lru_to_folio(&(_folio->lru)); \
0171 prefetchw(&prev->_field); \
0172 } \
0173 } while (0)
0174 #else
0175 #define prefetchw_prev_lru_folio(_folio, _base, _field) do { } while (0)
0176 #endif

/*
 * From 0 .. 200.  Higher means more swappy.
 */
0181 int vm_swappiness = 60;
0182
0183 static void set_task_reclaim_state(struct task_struct *task,
0184 struct reclaim_state *rs)
0185 {
0186
0187 WARN_ON_ONCE(rs && task->reclaim_state);
0188
0189
0190 WARN_ON_ONCE(!rs && !task->reclaim_state);
0191
0192 task->reclaim_state = rs;
0193 }
0194
0195 LIST_HEAD(shrinker_list);
0196 DECLARE_RWSEM(shrinker_rwsem);
0197
0198 #ifdef CONFIG_MEMCG
0199 static int shrinker_nr_max;

/* The shrinker_info is expanded in a batch of BITS_PER_LONG */
0202 static inline int shrinker_map_size(int nr_items)
0203 {
0204 return (DIV_ROUND_UP(nr_items, BITS_PER_LONG) * sizeof(unsigned long));
0205 }
0206
0207 static inline int shrinker_defer_size(int nr_items)
0208 {
0209 return (round_up(nr_items, BITS_PER_LONG) * sizeof(atomic_long_t));
0210 }
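
/*
 * Layout note (illustrative, derived from the allocation code below): each
 * per-memcg, per-node shrinker_info is one allocation holding the deferred
 * counters followed by the bitmap, as set up in alloc_shrinker_info() and
 * expand_one_shrinker_info():
 *
 *   [ struct shrinker_info | nr_deferred[0..N-1] | map bits ]
 *
 * For example, with BITS_PER_LONG == 64 and shrinker_nr_max == 100:
 *   shrinker_defer_size(100) = round_up(100, 64) * sizeof(atomic_long_t)
 *                            = 128 * 8 = 1024 bytes
 *   shrinker_map_size(100)   = DIV_ROUND_UP(100, 64) * sizeof(unsigned long)
 *                            = 2 * 8   = 16 bytes
 */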
0211
0212 static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg,
0213 int nid)
0214 {
0215 return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info,
0216 lockdep_is_held(&shrinker_rwsem));
0217 }
0218
0219 static int expand_one_shrinker_info(struct mem_cgroup *memcg,
0220 int map_size, int defer_size,
0221 int old_map_size, int old_defer_size)
0222 {
0223 struct shrinker_info *new, *old;
0224 struct mem_cgroup_per_node *pn;
0225 int nid;
0226 int size = map_size + defer_size;
0227
0228 for_each_node(nid) {
0229 pn = memcg->nodeinfo[nid];
0230 old = shrinker_info_protected(memcg, nid);
0231
0232 if (!old)
0233 return 0;
0234
0235 new = kvmalloc_node(sizeof(*new) + size, GFP_KERNEL, nid);
0236 if (!new)
0237 return -ENOMEM;
0238
0239 new->nr_deferred = (atomic_long_t *)(new + 1);
0240 new->map = (void *)new->nr_deferred + defer_size;
0241
0242
0243 memset(new->map, (int)0xff, old_map_size);
0244 memset((void *)new->map + old_map_size, 0, map_size - old_map_size);
0245
0246 memcpy(new->nr_deferred, old->nr_deferred, old_defer_size);
0247 memset((void *)new->nr_deferred + old_defer_size, 0,
0248 defer_size - old_defer_size);
0249
0250 rcu_assign_pointer(pn->shrinker_info, new);
0251 kvfree_rcu(old, rcu);
0252 }
0253
0254 return 0;
0255 }
0256
0257 void free_shrinker_info(struct mem_cgroup *memcg)
0258 {
0259 struct mem_cgroup_per_node *pn;
0260 struct shrinker_info *info;
0261 int nid;
0262
0263 for_each_node(nid) {
0264 pn = memcg->nodeinfo[nid];
0265 info = rcu_dereference_protected(pn->shrinker_info, true);
0266 kvfree(info);
0267 rcu_assign_pointer(pn->shrinker_info, NULL);
0268 }
0269 }
0270
0271 int alloc_shrinker_info(struct mem_cgroup *memcg)
0272 {
0273 struct shrinker_info *info;
0274 int nid, size, ret = 0;
0275 int map_size, defer_size = 0;
0276
0277 down_write(&shrinker_rwsem);
0278 map_size = shrinker_map_size(shrinker_nr_max);
0279 defer_size = shrinker_defer_size(shrinker_nr_max);
0280 size = map_size + defer_size;
0281 for_each_node(nid) {
0282 info = kvzalloc_node(sizeof(*info) + size, GFP_KERNEL, nid);
0283 if (!info) {
0284 free_shrinker_info(memcg);
0285 ret = -ENOMEM;
0286 break;
0287 }
0288 info->nr_deferred = (atomic_long_t *)(info + 1);
0289 info->map = (void *)info->nr_deferred + defer_size;
0290 rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info);
0291 }
0292 up_write(&shrinker_rwsem);
0293
0294 return ret;
0295 }
0296
0297 static inline bool need_expand(int nr_max)
0298 {
0299 return round_up(nr_max, BITS_PER_LONG) >
0300 round_up(shrinker_nr_max, BITS_PER_LONG);
0301 }
0302
0303 static int expand_shrinker_info(int new_id)
0304 {
0305 int ret = 0;
0306 int new_nr_max = new_id + 1;
0307 int map_size, defer_size = 0;
0308 int old_map_size, old_defer_size = 0;
0309 struct mem_cgroup *memcg;
0310
0311 if (!need_expand(new_nr_max))
0312 goto out;
0313
0314 if (!root_mem_cgroup)
0315 goto out;
0316
0317 lockdep_assert_held(&shrinker_rwsem);
0318
0319 map_size = shrinker_map_size(new_nr_max);
0320 defer_size = shrinker_defer_size(new_nr_max);
0321 old_map_size = shrinker_map_size(shrinker_nr_max);
0322 old_defer_size = shrinker_defer_size(shrinker_nr_max);
0323
0324 memcg = mem_cgroup_iter(NULL, NULL, NULL);
0325 do {
0326 ret = expand_one_shrinker_info(memcg, map_size, defer_size,
0327 old_map_size, old_defer_size);
0328 if (ret) {
0329 mem_cgroup_iter_break(NULL, memcg);
0330 goto out;
0331 }
0332 } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
0333 out:
0334 if (!ret)
0335 shrinker_nr_max = new_nr_max;
0336
0337 return ret;
0338 }
0339
0340 void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
0341 {
0342 if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
0343 struct shrinker_info *info;
0344
0345 rcu_read_lock();
0346 info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
/* Pairs with smp mb in shrink_slab() */
0348 smp_mb__before_atomic();
0349 set_bit(shrinker_id, info->map);
0350 rcu_read_unlock();
0351 }
0352 }
0353
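/*
 * Note: shrinker_idr maps a small integer id to each memcg-aware shrinker.
 * The id doubles as the bit index into each memcg's per-node shrinker bitmap
 * (shrinker_info->map) and as the index into its nr_deferred[] array, which
 * is why prealloc_memcg_shrinker() may need to grow the per-memcg info via
 * expand_shrinker_info() when a newly allocated id exceeds shrinker_nr_max.
 */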
0354 static DEFINE_IDR(shrinker_idr);
0355
0356 static int prealloc_memcg_shrinker(struct shrinker *shrinker)
0357 {
0358 int id, ret = -ENOMEM;
0359
0360 if (mem_cgroup_disabled())
0361 return -ENOSYS;
0362
0363 down_write(&shrinker_rwsem);
0364
0365 id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL);
0366 if (id < 0)
0367 goto unlock;
0368
0369 if (id >= shrinker_nr_max) {
0370 if (expand_shrinker_info(id)) {
0371 idr_remove(&shrinker_idr, id);
0372 goto unlock;
0373 }
0374 }
0375 shrinker->id = id;
0376 ret = 0;
0377 unlock:
0378 up_write(&shrinker_rwsem);
0379 return ret;
0380 }
0381
0382 static void unregister_memcg_shrinker(struct shrinker *shrinker)
0383 {
0384 int id = shrinker->id;
0385
0386 BUG_ON(id < 0);
0387
0388 lockdep_assert_held(&shrinker_rwsem);
0389
0390 idr_remove(&shrinker_idr, id);
0391 }
0392
0393 static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
0394 struct mem_cgroup *memcg)
0395 {
0396 struct shrinker_info *info;
0397
0398 info = shrinker_info_protected(memcg, nid);
0399 return atomic_long_xchg(&info->nr_deferred[shrinker->id], 0);
0400 }
0401
0402 static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
0403 struct mem_cgroup *memcg)
0404 {
0405 struct shrinker_info *info;
0406
0407 info = shrinker_info_protected(memcg, nid);
0408 return atomic_long_add_return(nr, &info->nr_deferred[shrinker->id]);
0409 }
0410
0411 void reparent_shrinker_deferred(struct mem_cgroup *memcg)
0412 {
0413 int i, nid;
0414 long nr;
0415 struct mem_cgroup *parent;
0416 struct shrinker_info *child_info, *parent_info;
0417
0418 parent = parent_mem_cgroup(memcg);
0419 if (!parent)
0420 parent = root_mem_cgroup;
0421
0422
0423 down_read(&shrinker_rwsem);
0424 for_each_node(nid) {
0425 child_info = shrinker_info_protected(memcg, nid);
0426 parent_info = shrinker_info_protected(parent, nid);
0427 for (i = 0; i < shrinker_nr_max; i++) {
0428 nr = atomic_long_read(&child_info->nr_deferred[i]);
0429 atomic_long_add(nr, &parent_info->nr_deferred[i]);
0430 }
0431 }
0432 up_read(&shrinker_rwsem);
0433 }
0434
0435 static bool cgroup_reclaim(struct scan_control *sc)
0436 {
0437 return sc->target_mem_cgroup;
0438 }

/*
 * Returns true if the usual dirty throttling mechanism
 * (balance_dirty_pages()) can be relied upon for this reclaim context.
 *
 * The legacy (cgroup v1) memcg does not participate in dirty throttling,
 * so reclaim of such cgroups falls back to direct stalling in
 * shrink_page_list(), which lacks fairness, adaptive pausing and
 * bandwidth-proportional allocation. Global reclaim and cgroup reclaim
 * on the default hierarchy (with cgroup writeback support) are sane.
 */
0453 static bool writeback_throttling_sane(struct scan_control *sc)
0454 {
0455 if (!cgroup_reclaim(sc))
0456 return true;
0457 #ifdef CONFIG_CGROUP_WRITEBACK
0458 if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
0459 return true;
0460 #endif
0461 return false;
0462 }
0463 #else
0464 static int prealloc_memcg_shrinker(struct shrinker *shrinker)
0465 {
0466 return -ENOSYS;
0467 }
0468
0469 static void unregister_memcg_shrinker(struct shrinker *shrinker)
0470 {
0471 }
0472
0473 static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
0474 struct mem_cgroup *memcg)
0475 {
0476 return 0;
0477 }
0478
0479 static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
0480 struct mem_cgroup *memcg)
0481 {
0482 return 0;
0483 }
0484
0485 static bool cgroup_reclaim(struct scan_control *sc)
0486 {
0487 return false;
0488 }
0489
0490 static bool writeback_throttling_sane(struct scan_control *sc)
0491 {
0492 return true;
0493 }
0494 #endif
0495
0496 static long xchg_nr_deferred(struct shrinker *shrinker,
0497 struct shrink_control *sc)
0498 {
0499 int nid = sc->nid;
0500
0501 if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
0502 nid = 0;
0503
0504 if (sc->memcg &&
0505 (shrinker->flags & SHRINKER_MEMCG_AWARE))
0506 return xchg_nr_deferred_memcg(nid, shrinker,
0507 sc->memcg);
0508
0509 return atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
0510 }
0511
0512
0513 static long add_nr_deferred(long nr, struct shrinker *shrinker,
0514 struct shrink_control *sc)
0515 {
0516 int nid = sc->nid;
0517
0518 if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
0519 nid = 0;
0520
0521 if (sc->memcg &&
0522 (shrinker->flags & SHRINKER_MEMCG_AWARE))
0523 return add_nr_deferred_memcg(nr, nid, shrinker,
0524 sc->memcg);
0525
0526 return atomic_long_add_return(nr, &shrinker->nr_deferred[nid]);
0527 }
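
/*
 * A short note on the nr_deferred machinery above: when a shrinker cannot
 * make progress in the current context (e.g. it returns SHRINK_STOP because
 * the caller lacks __GFP_FS), the work it should have done is parked in
 * nr_deferred via add_nr_deferred() and pulled back out by the next caller
 * through xchg_nr_deferred(), so reclaim pressure is carried over rather
 * than lost. For SHRINKER_MEMCG_AWARE shrinkers the counter lives in the
 * per-memcg shrinker_info; otherwise in shrinker->nr_deferred[nid].
 */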
0528
0529 static bool can_demote(int nid, struct scan_control *sc)
0530 {
0531 if (!numa_demotion_enabled)
0532 return false;
0533 if (sc && sc->no_demotion)
0534 return false;
0535 if (next_demotion_node(nid) == NUMA_NO_NODE)
0536 return false;
0537
0538 return true;
0539 }
0540
0541 static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg,
0542 int nid,
0543 struct scan_control *sc)
0544 {
0545 if (memcg == NULL) {
0546
0547
0548
0549
0550 if (get_nr_swap_pages() > 0)
0551 return true;
0552 } else {
0553
0554 if (mem_cgroup_get_nr_swap_pages(memcg) > 0)
0555 return true;
0556 }
0557
/*
 * There is no swap space left for this folio's memcg (or swap is
 * disabled entirely); the last resort for reclaiming anon pages is
 * demoting them to a lower memory tier, if one exists.
 */
0563 return can_demote(nid, sc);
0564 }

/*
 * This misses isolated folios which are not accounted for to save counters.
 * As the data only determines if reclaim or compaction continues, it is
 * not expected that isolated folios will be a dominating factor.
 */
0571 unsigned long zone_reclaimable_pages(struct zone *zone)
0572 {
0573 unsigned long nr;
0574
0575 nr = zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_FILE) +
0576 zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_FILE);
0577 if (can_reclaim_anon_pages(NULL, zone_to_nid(zone), NULL))
0578 nr += zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) +
0579 zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON);
0580
0581 return nr;
0582 }

/**
 * lruvec_lru_size -  Returns the number of pages on the given LRU list.
 * @lruvec: lru vector
 * @lru: lru to use
 * @zone_idx: zones to consider (use MAX_NR_ZONES - 1 for the whole LRU list)
 */
0590 static unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru,
0591 int zone_idx)
0592 {
0593 unsigned long size = 0;
0594 int zid;
0595
0596 for (zid = 0; zid <= zone_idx; zid++) {
0597 struct zone *zone = &lruvec_pgdat(lruvec)->node_zones[zid];
0598
0599 if (!managed_zone(zone))
0600 continue;
0601
0602 if (!mem_cgroup_disabled())
0603 size += mem_cgroup_get_zone_lru_size(lruvec, lru, zid);
0604 else
0605 size += zone_page_state(zone, NR_ZONE_LRU_BASE + lru);
0606 }
0607 return size;
0608 }

/*
 * Add a shrinker callback to be called from the vm.
 */
0613 static int __prealloc_shrinker(struct shrinker *shrinker)
0614 {
0615 unsigned int size;
0616 int err;
0617
0618 if (shrinker->flags & SHRINKER_MEMCG_AWARE) {
0619 err = prealloc_memcg_shrinker(shrinker);
0620 if (err != -ENOSYS)
0621 return err;
0622
0623 shrinker->flags &= ~SHRINKER_MEMCG_AWARE;
0624 }
0625
0626 size = sizeof(*shrinker->nr_deferred);
0627 if (shrinker->flags & SHRINKER_NUMA_AWARE)
0628 size *= nr_node_ids;
0629
0630 shrinker->nr_deferred = kzalloc(size, GFP_KERNEL);
0631 if (!shrinker->nr_deferred)
0632 return -ENOMEM;
0633
0634 return 0;
0635 }
0636
0637 #ifdef CONFIG_SHRINKER_DEBUG
0638 int prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...)
0639 {
0640 va_list ap;
0641 int err;
0642
0643 va_start(ap, fmt);
0644 shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap);
0645 va_end(ap);
0646 if (!shrinker->name)
0647 return -ENOMEM;
0648
0649 err = __prealloc_shrinker(shrinker);
0650 if (err) {
0651 kfree_const(shrinker->name);
0652 shrinker->name = NULL;
0653 }
0654
0655 return err;
0656 }
0657 #else
0658 int prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...)
0659 {
0660 return __prealloc_shrinker(shrinker);
0661 }
0662 #endif
0663
0664 void free_prealloced_shrinker(struct shrinker *shrinker)
0665 {
0666 #ifdef CONFIG_SHRINKER_DEBUG
0667 kfree_const(shrinker->name);
0668 shrinker->name = NULL;
0669 #endif
0670 if (shrinker->flags & SHRINKER_MEMCG_AWARE) {
0671 down_write(&shrinker_rwsem);
0672 unregister_memcg_shrinker(shrinker);
0673 up_write(&shrinker_rwsem);
0674 return;
0675 }
0676
0677 kfree(shrinker->nr_deferred);
0678 shrinker->nr_deferred = NULL;
0679 }
0680
0681 void register_shrinker_prepared(struct shrinker *shrinker)
0682 {
0683 down_write(&shrinker_rwsem);
0684 list_add_tail(&shrinker->list, &shrinker_list);
0685 shrinker->flags |= SHRINKER_REGISTERED;
0686 shrinker_debugfs_add(shrinker);
0687 up_write(&shrinker_rwsem);
0688 }
0689
0690 static int __register_shrinker(struct shrinker *shrinker)
0691 {
0692 int err = __prealloc_shrinker(shrinker);
0693
0694 if (err)
0695 return err;
0696 register_shrinker_prepared(shrinker);
0697 return 0;
0698 }
0699
0700 #ifdef CONFIG_SHRINKER_DEBUG
0701 int register_shrinker(struct shrinker *shrinker, const char *fmt, ...)
0702 {
0703 va_list ap;
0704 int err;
0705
0706 va_start(ap, fmt);
0707 shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap);
0708 va_end(ap);
0709 if (!shrinker->name)
0710 return -ENOMEM;
0711
0712 err = __register_shrinker(shrinker);
0713 if (err) {
0714 kfree_const(shrinker->name);
0715 shrinker->name = NULL;
0716 }
0717 return err;
0718 }
0719 #else
0720 int register_shrinker(struct shrinker *shrinker, const char *fmt, ...)
0721 {
0722 return __register_shrinker(shrinker);
0723 }
0724 #endif
0725 EXPORT_SYMBOL(register_shrinker);

/*
 * Deregister a shrinker and free the deferred-work counters it allocated.
 */
0730 void unregister_shrinker(struct shrinker *shrinker)
0731 {
0732 if (!(shrinker->flags & SHRINKER_REGISTERED))
0733 return;
0734
0735 down_write(&shrinker_rwsem);
0736 list_del(&shrinker->list);
0737 shrinker->flags &= ~SHRINKER_REGISTERED;
0738 if (shrinker->flags & SHRINKER_MEMCG_AWARE)
0739 unregister_memcg_shrinker(shrinker);
0740 shrinker_debugfs_remove(shrinker);
0741 up_write(&shrinker_rwsem);
0742
0743 kfree(shrinker->nr_deferred);
0744 shrinker->nr_deferred = NULL;
0745 }
0746 EXPORT_SYMBOL(unregister_shrinker);

/**
 * synchronize_shrinkers - Wait for all running shrinkers to complete.
 *
 * This is useful to guarantee that all shrinker invocations have seen an
 * update before memory is freed. It works by taking and releasing
 * shrinker_rwsem for write, which every running shrinker holds for read.
 */
0756 void synchronize_shrinkers(void)
0757 {
0758 down_write(&shrinker_rwsem);
0759 up_write(&shrinker_rwsem);
0760 }
0761 EXPORT_SYMBOL(synchronize_shrinkers);
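
/*
 * Illustrative only (not part of this file): a minimal client of the
 * shrinker API above. A cache owner fills in count_objects()/scan_objects()
 * and registers the shrinker; the demo_* names are made up for the example.
 *
 *	static unsigned long demo_count(struct shrinker *s,
 *					struct shrink_control *sc)
 *	{
 *		// Report how many objects could be freed right now.
 *		return demo_cache_count();	// hypothetical helper
 *	}
 *
 *	static unsigned long demo_scan(struct shrinker *s,
 *				       struct shrink_control *sc)
 *	{
 *		// Free up to sc->nr_to_scan objects and return how many
 *		// were freed, or SHRINK_STOP if no progress is possible
 *		// in this context.
 *		return demo_cache_trim(sc->nr_to_scan);	// hypothetical helper
 *	}
 *
 *	static struct shrinker demo_shrinker = {
 *		.count_objects	= demo_count,
 *		.scan_objects	= demo_scan,
 *		.seeks		= DEFAULT_SEEKS,
 *	};
 *
 *	// register_shrinker(&demo_shrinker, "demo-cache");
 *	// ...
 *	// unregister_shrinker(&demo_shrinker);
 */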
0762
0763 #define SHRINK_BATCH 128
0764
0765 static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
0766 struct shrinker *shrinker, int priority)
0767 {
0768 unsigned long freed = 0;
0769 unsigned long long delta;
0770 long total_scan;
0771 long freeable;
0772 long nr;
0773 long new_nr;
0774 long batch_size = shrinker->batch ? shrinker->batch
0775 : SHRINK_BATCH;
0776 long scanned = 0, next_deferred;
0777
0778 freeable = shrinker->count_objects(shrinker, shrinkctl);
0779 if (freeable == 0 || freeable == SHRINK_EMPTY)
0780 return freeable;

/*
 * Copy the current shrinker scan count into a local variable
 * and zero it so that other concurrent shrinker invocations
 * don't also do this scanning work.
 */
0787 nr = xchg_nr_deferred(shrinker, shrinkctl);
0788
0789 if (shrinker->seeks) {
0790 delta = freeable >> priority;
0791 delta *= 4;
0792 do_div(delta, shrinker->seeks);
0793 } else {
/*
 * These objects don't require any IO to create. Trim
 * them aggressively under memory pressure to keep
 * them from causing refetches in the IO caches.
 */
0799 delta = freeable / 2;
0800 }
0801
0802 total_scan = nr >> priority;
0803 total_scan += delta;
0804 total_scan = min(total_scan, (2 * freeable));
0805
0806 trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
0807 freeable, delta, total_scan, priority);

/*
 * Normally we should not scan less than batch_size objects in one
 * pass to avoid too frequent shrinker calls, but if the slab has less
 * than batch_size objects in total and we are really tight on memory,
 * we will try to reclaim all available objects, otherwise we can end
 * up failing allocations although there are plenty of reclaimable
 * objects spread over several slabs with usage less than the
 * batch_size.
 *
 * We detect the "tight on memory" situations by looking at the total
 * number of objects we want to scan (total_scan). If it is greater
 * than the total number of objects on slab (freeable), we must be
 * scanning at high prio and therefore should try to reclaim as much as
 * possible.
 */
0824 while (total_scan >= batch_size ||
0825 total_scan >= freeable) {
0826 unsigned long ret;
0827 unsigned long nr_to_scan = min(batch_size, total_scan);
0828
0829 shrinkctl->nr_to_scan = nr_to_scan;
0830 shrinkctl->nr_scanned = nr_to_scan;
0831 ret = shrinker->scan_objects(shrinker, shrinkctl);
0832 if (ret == SHRINK_STOP)
0833 break;
0834 freed += ret;
0835
0836 count_vm_events(SLABS_SCANNED, shrinkctl->nr_scanned);
0837 total_scan -= shrinkctl->nr_scanned;
0838 scanned += shrinkctl->nr_scanned;
0839
0840 cond_resched();
0841 }

/*
 * The deferred work is increased by any new work (delta) that wasn't
 * done, decreased by old deferred work that was done now.
 *
 * And it is capped to two times of the freeable items.
 */
0849 next_deferred = max_t(long, (nr + delta - scanned), 0);
0850 next_deferred = min(next_deferred, (2 * freeable));

/*
 * Move the unused scan count back into the shrinker in a
 * manner that handles concurrent updates.
 */
0856 new_nr = add_nr_deferred(next_deferred, shrinker, shrinkctl);
0857
0858 trace_mm_shrink_slab_end(shrinker, shrinkctl->nid, freed, nr, new_nr, total_scan);
0859 return freed;
0860 }
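
/*
 * Worked example for the do_shrink_slab() scan target above (illustrative
 * numbers): with freeable = 10000 objects, priority = DEF_PRIORITY (12) and
 * seeks = DEFAULT_SEEKS (2):
 *
 *	delta = (10000 >> 12) * 4 / 2 = 2 * 4 / 2 = 4
 *
 * so only a handful of objects are asked for per pass at low pressure, while
 * at priority 0 the same cache would be asked for 10000 * 4 / 2 = 20000,
 * clamped by total_scan = min(total_scan, 2 * freeable).
 */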
0861
0862 #ifdef CONFIG_MEMCG
0863 static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
0864 struct mem_cgroup *memcg, int priority)
0865 {
0866 struct shrinker_info *info;
0867 unsigned long ret, freed = 0;
0868 int i;
0869
0870 if (!mem_cgroup_online(memcg))
0871 return 0;
0872
0873 if (!down_read_trylock(&shrinker_rwsem))
0874 return 0;
0875
0876 info = shrinker_info_protected(memcg, nid);
0877 if (unlikely(!info))
0878 goto unlock;
0879
0880 for_each_set_bit(i, info->map, shrinker_nr_max) {
0881 struct shrink_control sc = {
0882 .gfp_mask = gfp_mask,
0883 .nid = nid,
0884 .memcg = memcg,
0885 };
0886 struct shrinker *shrinker;
0887
0888 shrinker = idr_find(&shrinker_idr, i);
0889 if (unlikely(!shrinker || !(shrinker->flags & SHRINKER_REGISTERED))) {
0890 if (!shrinker)
0891 clear_bit(i, info->map);
0892 continue;
0893 }
0894
0895
0896 if (!memcg_kmem_enabled() &&
0897 !(shrinker->flags & SHRINKER_NONSLAB))
0898 continue;
0899
0900 ret = do_shrink_slab(&sc, shrinker, priority);
0901 if (ret == SHRINK_EMPTY) {
0902 clear_bit(i, info->map);
/*
 * After the shrinker reported that it had no objects to
 * free, but before we cleared the corresponding bit in
 * the memcg shrinker map, a new object might have been
 * added. To make sure the bit is set in this case, we
 * invoke the shrinker one more time and reset the bit
 * if it reports that it is not empty anymore.
 * The memory barrier here pairs with the barrier in
 * set_shrinker_bit():
 *
 * list_lru_add()     shrink_slab_memcg()
 *   list_add_tail()    clear_bit()
 *   <MB>               <MB>
 *   set_bit()          do_shrink_slab()
 */
0918 smp_mb__after_atomic();
0919 ret = do_shrink_slab(&sc, shrinker, priority);
0920 if (ret == SHRINK_EMPTY)
0921 ret = 0;
0922 else
0923 set_shrinker_bit(memcg, nid, i);
0924 }
0925 freed += ret;
0926
0927 if (rwsem_is_contended(&shrinker_rwsem)) {
0928 freed = freed ? : 1;
0929 break;
0930 }
0931 }
0932 unlock:
0933 up_read(&shrinker_rwsem);
0934 return freed;
0935 }
0936 #else
0937 static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
0938 struct mem_cgroup *memcg, int priority)
0939 {
0940 return 0;
0941 }
0942 #endif

/**
 * shrink_slab - shrink slab caches
 * @gfp_mask: allocation context
 * @nid: node whose slab caches to target
 * @memcg: memory cgroup whose slab caches to target
 * @priority: the reclaim priority
 *
 * Call the shrink functions to age shrinkable caches.
 *
 * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set,
 * unaware shrinkers will receive a node id of 0 instead.
 *
 * @memcg specifies the memory cgroup to target. Unaware shrinkers
 * are called only if it is the root cgroup.
 *
 * @priority is sc->priority, we take the number of objects and >> by
 * priority in order to get the scan target.
 *
 * Returns the number of reclaimed slab objects.
 */
0964 static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
0965 struct mem_cgroup *memcg,
0966 int priority)
0967 {
0968 unsigned long ret, freed = 0;
0969 struct shrinker *shrinker;
0970
0971
0972
0973
0974
0975
0976
0977
0978 if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg))
0979 return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
0980
0981 if (!down_read_trylock(&shrinker_rwsem))
0982 goto out;
0983
0984 list_for_each_entry(shrinker, &shrinker_list, list) {
0985 struct shrink_control sc = {
0986 .gfp_mask = gfp_mask,
0987 .nid = nid,
0988 .memcg = memcg,
0989 };
0990
0991 ret = do_shrink_slab(&sc, shrinker, priority);
0992 if (ret == SHRINK_EMPTY)
0993 ret = 0;
0994 freed += ret;
0995
0996
0997
0998
0999
1000 if (rwsem_is_contended(&shrinker_rwsem)) {
1001 freed = freed ? : 1;
1002 break;
1003 }
1004 }
1005
1006 up_read(&shrinker_rwsem);
1007 out:
1008 cond_resched();
1009 return freed;
1010 }
1011
1012 static void drop_slab_node(int nid)
1013 {
1014 unsigned long freed;
1015 int shift = 0;
1016
1017 do {
1018 struct mem_cgroup *memcg = NULL;
1019
1020 if (fatal_signal_pending(current))
1021 return;
1022
1023 freed = 0;
1024 memcg = mem_cgroup_iter(NULL, NULL, NULL);
1025 do {
1026 freed += shrink_slab(GFP_KERNEL, nid, memcg, 0);
1027 } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
1028 } while ((freed >> shift++) > 1);
1029 }
1030
1031 void drop_slab(void)
1032 {
1033 int nid;
1034
1035 for_each_online_node(nid)
1036 drop_slab_node(nid);
1037 }
1038
1039 static inline int is_page_cache_freeable(struct folio *folio)
1040 {
1041
1042
1043
1044
1045
1046 return folio_ref_count(folio) - folio_test_private(folio) ==
1047 1 + folio_nr_pages(folio);
1048 }

/*
 * We detected a synchronous write error writing a folio out.  Probably
 * -ENOSPC.  We need to propagate that into the address_space for a subsequent
 * fsync(), msync() or close().
 *
 * The tricky part is that after writepage we cannot touch the mapping: nothing
 * prevents it from being freed up.  But we have a ref on the folio and once
 * that folio is locked, the mapping is pinned.
 *
 * We're allowed to run sleeping folio_lock() here because we know the caller
 * has __GFP_FS.
 */
1062 static void handle_write_error(struct address_space *mapping,
1063 struct folio *folio, int error)
1064 {
1065 folio_lock(folio);
1066 if (folio_mapping(folio) == mapping)
1067 mapping_set_error(mapping, error);
1068 folio_unlock(folio);
1069 }
1070
1071 static bool skip_throttle_noprogress(pg_data_t *pgdat)
1072 {
1073 int reclaimable = 0, write_pending = 0;
1074 int i;
1075
1076
1077
1078
1079
1080 if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
1081 return true;
1082
1083
1084
1085
1086
1087
1088 for (i = 0; i < MAX_NR_ZONES; i++) {
1089 struct zone *zone = pgdat->node_zones + i;
1090
1091 if (!managed_zone(zone))
1092 continue;
1093
1094 reclaimable += zone_reclaimable_pages(zone);
1095 write_pending += zone_page_state_snapshot(zone,
1096 NR_ZONE_WRITE_PENDING);
1097 }
1098 if (2 * write_pending <= reclaimable)
1099 return true;
1100
1101 return false;
1102 }
1103
1104 void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason)
1105 {
1106 wait_queue_head_t *wqh = &pgdat->reclaim_wait[reason];
1107 long timeout, ret;
1108 DEFINE_WAIT(wait);
1109
1110
1111
1112
1113
1114
1115 if (!current_is_kswapd() &&
1116 current->flags & (PF_IO_WORKER|PF_KTHREAD)) {
1117 cond_resched();
1118 return;
1119 }
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131 switch(reason) {
1132 case VMSCAN_THROTTLE_WRITEBACK:
1133 timeout = HZ/10;
1134
1135 if (atomic_inc_return(&pgdat->nr_writeback_throttled) == 1) {
1136 WRITE_ONCE(pgdat->nr_reclaim_start,
1137 node_page_state(pgdat, NR_THROTTLED_WRITTEN));
1138 }
1139
1140 break;
1141 case VMSCAN_THROTTLE_CONGESTED:
1142 fallthrough;
1143 case VMSCAN_THROTTLE_NOPROGRESS:
1144 if (skip_throttle_noprogress(pgdat)) {
1145 cond_resched();
1146 return;
1147 }
1148
1149 timeout = 1;
1150
1151 break;
1152 case VMSCAN_THROTTLE_ISOLATED:
1153 timeout = HZ/50;
1154 break;
1155 default:
1156 WARN_ON_ONCE(1);
1157 timeout = HZ;
1158 break;
1159 }
1160
1161 prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
1162 ret = schedule_timeout(timeout);
1163 finish_wait(wqh, &wait);
1164
1165 if (reason == VMSCAN_THROTTLE_WRITEBACK)
1166 atomic_dec(&pgdat->nr_writeback_throttled);
1167
1168 trace_mm_vmscan_throttled(pgdat->node_id, jiffies_to_usecs(timeout),
1169 jiffies_to_usecs(timeout - ret),
1170 reason);
1171 }

/*
 * Account for folios written if tasks are throttled waiting on dirty
 * folios to clean. If enough folios have been cleaned since throttling
 * started then wakeup the throttled tasks.
 */
1178 void __acct_reclaim_writeback(pg_data_t *pgdat, struct folio *folio,
1179 int nr_throttled)
1180 {
1181 unsigned long nr_written;
1182
1183 node_stat_add_folio(folio, NR_THROTTLED_WRITTEN);
1184
1185
1186
1187
1188
1189
1190
1191
1192 nr_written = node_page_state(pgdat, NR_THROTTLED_WRITTEN) -
1193 READ_ONCE(pgdat->nr_reclaim_start);
1194
1195 if (nr_written > SWAP_CLUSTER_MAX * nr_throttled)
1196 wake_up(&pgdat->reclaim_wait[VMSCAN_THROTTLE_WRITEBACK]);
1197 }

/* possible outcome of pageout() */
1200 typedef enum {
1201
1202 PAGE_KEEP,
1203
1204 PAGE_ACTIVATE,
1205
1206 PAGE_SUCCESS,
1207
1208 PAGE_CLEAN,
1209 } pageout_t;

/*
 * pageout is called by shrink_page_list() for each dirty folio.
 * Calls ->writepage().
 */
1215 static pageout_t pageout(struct folio *folio, struct address_space *mapping,
1216 struct swap_iocb **plug)
1217 {
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234 if (!is_page_cache_freeable(folio))
1235 return PAGE_KEEP;
1236 if (!mapping) {
1237
1238
1239
1240
1241 if (folio_test_private(folio)) {
1242 if (try_to_free_buffers(folio)) {
1243 folio_clear_dirty(folio);
1244 pr_info("%s: orphaned folio\n", __func__);
1245 return PAGE_CLEAN;
1246 }
1247 }
1248 return PAGE_KEEP;
1249 }
1250 if (mapping->a_ops->writepage == NULL)
1251 return PAGE_ACTIVATE;
1252
1253 if (folio_clear_dirty_for_io(folio)) {
1254 int res;
1255 struct writeback_control wbc = {
1256 .sync_mode = WB_SYNC_NONE,
1257 .nr_to_write = SWAP_CLUSTER_MAX,
1258 .range_start = 0,
1259 .range_end = LLONG_MAX,
1260 .for_reclaim = 1,
1261 .swap_plug = plug,
1262 };
1263
1264 folio_set_reclaim(folio);
1265 res = mapping->a_ops->writepage(&folio->page, &wbc);
1266 if (res < 0)
1267 handle_write_error(mapping, folio, res);
1268 if (res == AOP_WRITEPAGE_ACTIVATE) {
1269 folio_clear_reclaim(folio);
1270 return PAGE_ACTIVATE;
1271 }
1272
1273 if (!folio_test_writeback(folio)) {
1274
1275 folio_clear_reclaim(folio);
1276 }
1277 trace_mm_vmscan_write_folio(folio);
1278 node_stat_add_folio(folio, NR_VMSCAN_WRITE);
1279 return PAGE_SUCCESS;
1280 }
1281
1282 return PAGE_CLEAN;
1283 }

/*
 * Same as remove_mapping, but if the folio is removed from the mapping, it
 * gets returned with a refcount of 0.
 */
1289 static int __remove_mapping(struct address_space *mapping, struct folio *folio,
1290 bool reclaimed, struct mem_cgroup *target_memcg)
1291 {
1292 int refcount;
1293 void *shadow = NULL;
1294
1295 BUG_ON(!folio_test_locked(folio));
1296 BUG_ON(mapping != folio_mapping(folio));
1297
1298 if (!folio_test_swapcache(folio))
1299 spin_lock(&mapping->host->i_lock);
1300 xa_lock_irq(&mapping->i_pages);

/*
 * The non racy check for a busy folio.
 *
 * Must be careful with the order of the tests. When someone has
 * a ref to the folio, it may be possible that they dirty it then
 * drop the reference. So if the dirty flag is tested before the
 * refcount here, then the following race may occur:
 *
 * get_user_pages(&page);
 * [user mapping goes away]
 * write_to(page);
 *				!folio_test_dirty(folio)    [good]
 * folio_set_dirty(folio);
 * folio_put(folio);
 *				!refcount(folio)   [good, discard it]
 *
 * [oops, our write_to data is lost]
 *
 * Reversing the order of the tests ensures such a situation cannot
 * arise and allows 0 to be read as no refs. The refcount check via
 * folio_ref_freeze() then only has to cope with transient speculative
 * references from the pagecache.
 */
1326 refcount = 1 + folio_nr_pages(folio);
1327 if (!folio_ref_freeze(folio, refcount))
1328 goto cannot_free;
1329
1330 if (unlikely(folio_test_dirty(folio))) {
1331 folio_ref_unfreeze(folio, refcount);
1332 goto cannot_free;
1333 }
1334
1335 if (folio_test_swapcache(folio)) {
1336 swp_entry_t swap = folio_swap_entry(folio);
1337 mem_cgroup_swapout(folio, swap);
1338 if (reclaimed && !mapping_exiting(mapping))
1339 shadow = workingset_eviction(folio, target_memcg);
1340 __delete_from_swap_cache(folio, swap, shadow);
1341 xa_unlock_irq(&mapping->i_pages);
1342 put_swap_page(&folio->page, swap);
1343 } else {
1344 void (*free_folio)(struct folio *);
1345
1346 free_folio = mapping->a_ops->free_folio;
/*
 * Remember a shadow entry for reclaimed file cache in
 * order to detect refaults, thus thrashing, later on.
 *
 * But don't store shadows in an address space that is
 * already exiting.  This is not just an optimization,
 * inode reclaim needs to empty out the radix tree or
 * the nodes are lost.  Don't plant shadows behind its
 * back.
 *
 * We also don't store shadows for DAX mappings because the
 * only page cache folios found in these are zero pages
 * covering holes, and because we don't want to mix DAX
 * exceptional entries and shadow exceptional entries in the
 * same address_space.
 */
1363 if (reclaimed && folio_is_file_lru(folio) &&
1364 !mapping_exiting(mapping) && !dax_mapping(mapping))
1365 shadow = workingset_eviction(folio, target_memcg);
1366 __filemap_remove_folio(folio, shadow);
1367 xa_unlock_irq(&mapping->i_pages);
1368 if (mapping_shrinkable(mapping))
1369 inode_add_lru(mapping->host);
1370 spin_unlock(&mapping->host->i_lock);
1371
1372 if (free_folio)
1373 free_folio(folio);
1374 }
1375
1376 return 1;
1377
1378 cannot_free:
1379 xa_unlock_irq(&mapping->i_pages);
1380 if (!folio_test_swapcache(folio))
1381 spin_unlock(&mapping->host->i_lock);
1382 return 0;
1383 }

/**
 * remove_mapping() - Attempt to remove a folio from its mapping.
 * @mapping: The address space.
 * @folio: The folio to remove.
 *
 * If the folio is dirty, under writeback or if someone else has a ref
 * on it, removal will fail.
 * Return: The number of pages removed from the mapping.  0 if the folio
 * could not be removed.
 * Context: The caller should have a single refcount on the folio and
 * hold its lock.
 */
1397 long remove_mapping(struct address_space *mapping, struct folio *folio)
1398 {
1399 if (__remove_mapping(mapping, folio, false, NULL)) {
1400
1401
1402
1403
1404
1405 folio_ref_unfreeze(folio, 1);
1406 return folio_nr_pages(folio);
1407 }
1408 return 0;
1409 }

/**
 * folio_putback_lru - Put previously isolated folio onto appropriate LRU list.
 * @folio: Folio to be returned to an LRU list.
 *
 * Add previously isolated @folio to appropriate LRU list.
 * The folio may still be unevictable for other reasons.
 *
 * Context: lru_lock must not be held, interrupts must be enabled.
 */
1420 void folio_putback_lru(struct folio *folio)
1421 {
1422 folio_add_lru(folio);
1423 folio_put(folio);
1424 }
1425
1426 enum page_references {
1427 PAGEREF_RECLAIM,
1428 PAGEREF_RECLAIM_CLEAN,
1429 PAGEREF_KEEP,
1430 PAGEREF_ACTIVATE,
1431 };
1432
1433 static enum page_references folio_check_references(struct folio *folio,
1434 struct scan_control *sc)
1435 {
1436 int referenced_ptes, referenced_folio;
1437 unsigned long vm_flags;
1438
1439 referenced_ptes = folio_referenced(folio, 1, sc->target_mem_cgroup,
1440 &vm_flags);
1441 referenced_folio = folio_test_clear_referenced(folio);
1442
1443
1444
1445
1446
1447 if (vm_flags & VM_LOCKED)
1448 return PAGEREF_ACTIVATE;
1449
1450
1451 if (referenced_ptes == -1)
1452 return PAGEREF_KEEP;
1453
1454 if (referenced_ptes) {
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469 folio_set_referenced(folio);
1470
1471 if (referenced_folio || referenced_ptes > 1)
1472 return PAGEREF_ACTIVATE;
1473
1474
1475
1476
1477 if ((vm_flags & VM_EXEC) && folio_is_file_lru(folio))
1478 return PAGEREF_ACTIVATE;
1479
1480 return PAGEREF_KEEP;
1481 }
1482
1483
1484 if (referenced_folio && folio_is_file_lru(folio))
1485 return PAGEREF_RECLAIM_CLEAN;
1486
1487 return PAGEREF_RECLAIM;
1488 }
1489
1490
1491 static void folio_check_dirty_writeback(struct folio *folio,
1492 bool *dirty, bool *writeback)
1493 {
1494 struct address_space *mapping;
1495
1496
1497
1498
1499
1500
1501
1502
1503 if (!folio_is_file_lru(folio) ||
1504 (folio_test_anon(folio) && !folio_test_swapbacked(folio))) {
1505 *dirty = false;
1506 *writeback = false;
1507 return;
1508 }
1509
1510
1511 *dirty = folio_test_dirty(folio);
1512 *writeback = folio_test_writeback(folio);
1513
1514
1515 if (!folio_test_private(folio))
1516 return;
1517
1518 mapping = folio_mapping(folio);
1519 if (mapping && mapping->a_ops->is_dirty_writeback)
1520 mapping->a_ops->is_dirty_writeback(folio, dirty, writeback);
1521 }
1522
1523 static struct page *alloc_demote_page(struct page *page, unsigned long node)
1524 {
1525 struct migration_target_control mtc = {
1526
1527
1528
1529
1530
1531 .gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) |
1532 __GFP_THISNODE | __GFP_NOWARN |
1533 __GFP_NOMEMALLOC | GFP_NOWAIT,
1534 .nid = node
1535 };
1536
1537 return alloc_migration_target(page, (unsigned long)&mtc);
1538 }

/*
 * Take folios on @demote_pages and attempt to demote them to
 * another node.  Folios which are not demoted are left on
 * @demote_pages.
 */
1545 static unsigned int demote_page_list(struct list_head *demote_pages,
1546 struct pglist_data *pgdat)
1547 {
1548 int target_nid = next_demotion_node(pgdat->node_id);
1549 unsigned int nr_succeeded;
1550
1551 if (list_empty(demote_pages))
1552 return 0;
1553
1554 if (target_nid == NUMA_NO_NODE)
1555 return 0;
1556
1557
1558 migrate_pages(demote_pages, alloc_demote_page, NULL,
1559 target_nid, MIGRATE_ASYNC, MR_DEMOTION,
1560 &nr_succeeded);
1561
1562 if (current_is_kswapd())
1563 __count_vm_events(PGDEMOTE_KSWAPD, nr_succeeded);
1564 else
1565 __count_vm_events(PGDEMOTE_DIRECT, nr_succeeded);
1566
1567 return nr_succeeded;
1568 }
1569
1570 static bool may_enter_fs(struct folio *folio, gfp_t gfp_mask)
1571 {
1572 if (gfp_mask & __GFP_FS)
1573 return true;
1574 if (!folio_test_swapcache(folio) || !(gfp_mask & __GFP_IO))
1575 return false;
1576
1577
1578
1579
1580
1581
1582
1583 return !data_race(folio_swap_flags(folio) & SWP_FS_OPS);
1584 }

/*
 * shrink_page_list() returns the number of reclaimed pages
 */
1589 static unsigned int shrink_page_list(struct list_head *page_list,
1590 struct pglist_data *pgdat,
1591 struct scan_control *sc,
1592 struct reclaim_stat *stat,
1593 bool ignore_references)
1594 {
1595 LIST_HEAD(ret_pages);
1596 LIST_HEAD(free_pages);
1597 LIST_HEAD(demote_pages);
1598 unsigned int nr_reclaimed = 0;
1599 unsigned int pgactivate = 0;
1600 bool do_demote_pass;
1601 struct swap_iocb *plug = NULL;
1602
1603 memset(stat, 0, sizeof(*stat));
1604 cond_resched();
1605 do_demote_pass = can_demote(pgdat->node_id, sc);
1606
1607 retry:
1608 while (!list_empty(page_list)) {
1609 struct address_space *mapping;
1610 struct folio *folio;
1611 enum page_references references = PAGEREF_RECLAIM;
1612 bool dirty, writeback;
1613 unsigned int nr_pages;
1614
1615 cond_resched();
1616
1617 folio = lru_to_folio(page_list);
1618 list_del(&folio->lru);
1619
1620 if (!folio_trylock(folio))
1621 goto keep;
1622
1623 VM_BUG_ON_FOLIO(folio_test_active(folio), folio);
1624
1625 nr_pages = folio_nr_pages(folio);
1626
1627
1628 sc->nr_scanned += nr_pages;
1629
1630 if (unlikely(!folio_evictable(folio)))
1631 goto activate_locked;
1632
1633 if (!sc->may_unmap && folio_mapped(folio))
1634 goto keep_locked;
1635
1636
1637
1638
1639
1640
1641 folio_check_dirty_writeback(folio, &dirty, &writeback);
1642 if (dirty || writeback)
1643 stat->nr_dirty += nr_pages;
1644
1645 if (dirty && !writeback)
1646 stat->nr_unqueued_dirty += nr_pages;
1647
1648
1649
1650
1651
1652
1653
1654 if (writeback && folio_test_reclaim(folio))
1655 stat->nr_congested += nr_pages;

/*
 * A folio under writeback leaves reclaim with three choices:
 *
 * 1) kswapd sees a folio with both the writeback and reclaim flags
 *    set while the node is already flagged PGDAT_WRITEBACK: folios
 *    are being recycled through the LRU faster than I/O completes.
 *    Waiting on the folio itself risks an indefinite stall, so just
 *    count it (nr_immediate) and let the caller stall once the whole
 *    list has been processed.
 *
 * 2) Global or cgroup-v2 reclaim (writeback_throttling_sane()), or a
 *    folio not yet marked for immediate reclaim, or a context that
 *    may not enter the filesystem: mark the folio for immediate
 *    reclaim (nr_writeback) and continue scanning; it will be looked
 *    at again once writeback completes.
 *
 * 3) Legacy memcg reclaim encountering a folio already marked for
 *    reclaim: cgroup v1 has no dirty-folio throttling, so wait for
 *    the writeback to complete to avoid a premature OOM.
 */
1701 if (folio_test_writeback(folio)) {
1702
1703 if (current_is_kswapd() &&
1704 folio_test_reclaim(folio) &&
1705 test_bit(PGDAT_WRITEBACK, &pgdat->flags)) {
1706 stat->nr_immediate += nr_pages;
1707 goto activate_locked;
1708
1709
1710 } else if (writeback_throttling_sane(sc) ||
1711 !folio_test_reclaim(folio) ||
1712 !may_enter_fs(folio, sc->gfp_mask)) {
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727 folio_set_reclaim(folio);
1728 stat->nr_writeback += nr_pages;
1729 goto activate_locked;
1730
1731
1732 } else {
1733 folio_unlock(folio);
1734 folio_wait_writeback(folio);
1735
1736 list_add_tail(&folio->lru, page_list);
1737 continue;
1738 }
1739 }
1740
1741 if (!ignore_references)
1742 references = folio_check_references(folio, sc);
1743
1744 switch (references) {
1745 case PAGEREF_ACTIVATE:
1746 goto activate_locked;
1747 case PAGEREF_KEEP:
1748 stat->nr_ref_keep += nr_pages;
1749 goto keep_locked;
1750 case PAGEREF_RECLAIM:
1751 case PAGEREF_RECLAIM_CLEAN:
1752 ;
1753 }
1754
1755
1756
1757
1758
1759 if (do_demote_pass &&
1760 (thp_migration_supported() || !folio_test_large(folio))) {
1761 list_add(&folio->lru, &demote_pages);
1762 folio_unlock(folio);
1763 continue;
1764 }
1765
1766
1767
1768
1769
1770
1771 if (folio_test_anon(folio) && folio_test_swapbacked(folio)) {
1772 if (!folio_test_swapcache(folio)) {
1773 if (!(sc->gfp_mask & __GFP_IO))
1774 goto keep_locked;
1775 if (folio_maybe_dma_pinned(folio))
1776 goto keep_locked;
1777 if (folio_test_large(folio)) {
1778
1779 if (!can_split_folio(folio, NULL))
1780 goto activate_locked;
1781
1782
1783
1784
1785
1786 if (!folio_entire_mapcount(folio) &&
1787 split_folio_to_list(folio,
1788 page_list))
1789 goto activate_locked;
1790 }
1791 if (!add_to_swap(folio)) {
1792 if (!folio_test_large(folio))
1793 goto activate_locked_split;
1794
1795 if (split_folio_to_list(folio,
1796 page_list))
1797 goto activate_locked;
1798 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
1799 count_vm_event(THP_SWPOUT_FALLBACK);
1800 #endif
1801 if (!add_to_swap(folio))
1802 goto activate_locked_split;
1803 }
1804 }
1805 } else if (folio_test_swapbacked(folio) &&
1806 folio_test_large(folio)) {
1807
1808 if (split_folio_to_list(folio, page_list))
1809 goto keep_locked;
1810 }
1811
1812
1813
1814
1815
1816
1817 if ((nr_pages > 1) && !folio_test_large(folio)) {
1818 sc->nr_scanned -= (nr_pages - 1);
1819 nr_pages = 1;
1820 }
1821
1822
1823
1824
1825
1826 if (folio_mapped(folio)) {
1827 enum ttu_flags flags = TTU_BATCH_FLUSH;
1828 bool was_swapbacked = folio_test_swapbacked(folio);
1829
1830 if (folio_test_pmd_mappable(folio))
1831 flags |= TTU_SPLIT_HUGE_PMD;
1832
1833 try_to_unmap(folio, flags);
1834 if (folio_mapped(folio)) {
1835 stat->nr_unmap_fail += nr_pages;
1836 if (!was_swapbacked &&
1837 folio_test_swapbacked(folio))
1838 stat->nr_lazyfree_fail += nr_pages;
1839 goto activate_locked;
1840 }
1841 }
1842
1843 mapping = folio_mapping(folio);
1844 if (folio_test_dirty(folio)) {
/*
 * Only kswapd can write back filesystem folios
 * to avoid risk of stack overflow. But avoid
 * injecting inefficient single-folio I/O into
 * flusher writeback as much as possible: only
 * write folios when we've encountered many
 * dirty folios, and when we've already scanned
 * the rest of the LRU for clean folios and see
 * the same dirty folios again (with the reclaim
 * flag set).
 */
1856 if (folio_is_file_lru(folio) &&
1857 (!current_is_kswapd() ||
1858 !folio_test_reclaim(folio) ||
1859 !test_bit(PGDAT_DIRTY, &pgdat->flags))) {
1860
1861
1862
1863
1864
1865
1866 node_stat_mod_folio(folio, NR_VMSCAN_IMMEDIATE,
1867 nr_pages);
1868 folio_set_reclaim(folio);
1869
1870 goto activate_locked;
1871 }
1872
1873 if (references == PAGEREF_RECLAIM_CLEAN)
1874 goto keep_locked;
1875 if (!may_enter_fs(folio, sc->gfp_mask))
1876 goto keep_locked;
1877 if (!sc->may_writepage)
1878 goto keep_locked;
1879
1880
1881
1882
1883
1884
1885 try_to_unmap_flush_dirty();
1886 switch (pageout(folio, mapping, &plug)) {
1887 case PAGE_KEEP:
1888 goto keep_locked;
1889 case PAGE_ACTIVATE:
1890 goto activate_locked;
1891 case PAGE_SUCCESS:
1892 stat->nr_pageout += nr_pages;
1893
1894 if (folio_test_writeback(folio))
1895 goto keep;
1896 if (folio_test_dirty(folio))
1897 goto keep;
1898
1899
1900
1901
1902
1903 if (!folio_trylock(folio))
1904 goto keep;
1905 if (folio_test_dirty(folio) ||
1906 folio_test_writeback(folio))
1907 goto keep_locked;
1908 mapping = folio_mapping(folio);
1909 fallthrough;
1910 case PAGE_CLEAN:
1911 ;
1912 }
1913 }

/*
 * If the folio has buffers, try to free the buffer mappings
 * associated with it; if that succeeds, try to free the folio
 * as well.
 *
 * We do this even if the folio is dirty: filemap_release_folio()
 * does no I/O, and a folio may carry the dirty flag while all of
 * its buffers are in fact clean (e.g. when they were written out
 * directly with submit_bh(), as ext3 and the blockdev mapping do).
 * filemap_release_folio() detects that and drops the buffers, so
 * the folio can be freed.
 *
 * Rarely, folios can have buffers and no ->mapping (they were not
 * successfully invalidated in truncate_cleanup_folio()). Drop the
 * buffers here and, if the folio is no longer mapped anywhere
 * (refcount == 1), free it; otherwise leave it on the LRU so it
 * remains swappable.
 */
1938 if (folio_has_private(folio)) {
1939 if (!filemap_release_folio(folio, sc->gfp_mask))
1940 goto activate_locked;
1941 if (!mapping && folio_ref_count(folio) == 1) {
1942 folio_unlock(folio);
1943 if (folio_put_testzero(folio))
1944 goto free_it;
1945 else {
1946
1947
1948
1949
1950
1951
1952
1953 nr_reclaimed += nr_pages;
1954 continue;
1955 }
1956 }
1957 }
1958
1959 if (folio_test_anon(folio) && !folio_test_swapbacked(folio)) {
1960
1961 if (!folio_ref_freeze(folio, 1))
1962 goto keep_locked;
1963
1964
1965
1966
1967
1968
1969
1970
1971 count_vm_events(PGLAZYFREED, nr_pages);
1972 count_memcg_folio_events(folio, PGLAZYFREED, nr_pages);
1973 } else if (!mapping || !__remove_mapping(mapping, folio, true,
1974 sc->target_mem_cgroup))
1975 goto keep_locked;
1976
1977 folio_unlock(folio);
1978 free_it:
1979
1980
1981
1982
1983 nr_reclaimed += nr_pages;
1984
1985
1986
1987
1988
1989 if (unlikely(folio_test_large(folio)))
1990 destroy_large_folio(folio);
1991 else
1992 list_add(&folio->lru, &free_pages);
1993 continue;
1994
1995 activate_locked_split:
1996
1997
1998
1999
2000 if (nr_pages > 1) {
2001 sc->nr_scanned -= (nr_pages - 1);
2002 nr_pages = 1;
2003 }
2004 activate_locked:
2005
2006 if (folio_test_swapcache(folio) &&
2007 (mem_cgroup_swap_full(&folio->page) ||
2008 folio_test_mlocked(folio)))
2009 try_to_free_swap(&folio->page);
2010 VM_BUG_ON_FOLIO(folio_test_active(folio), folio);
2011 if (!folio_test_mlocked(folio)) {
2012 int type = folio_is_file_lru(folio);
2013 folio_set_active(folio);
2014 stat->nr_activate[type] += nr_pages;
2015 count_memcg_folio_events(folio, PGACTIVATE, nr_pages);
2016 }
2017 keep_locked:
2018 folio_unlock(folio);
2019 keep:
2020 list_add(&folio->lru, &ret_pages);
2021 VM_BUG_ON_FOLIO(folio_test_lru(folio) ||
2022 folio_test_unevictable(folio), folio);
2023 }
2024
2025
2026
2027 nr_reclaimed += demote_page_list(&demote_pages, pgdat);
2028
2029 if (!list_empty(&demote_pages)) {
2030
2031 list_splice_init(&demote_pages, page_list);
2032 do_demote_pass = false;
2033 goto retry;
2034 }
2035
2036 pgactivate = stat->nr_activate[0] + stat->nr_activate[1];
2037
2038 mem_cgroup_uncharge_list(&free_pages);
2039 try_to_unmap_flush();
2040 free_unref_page_list(&free_pages);
2041
2042 list_splice(&ret_pages, page_list);
2043 count_vm_events(PGACTIVATE, pgactivate);
2044
2045 if (plug)
2046 swap_write_unplug(plug);
2047 return nr_reclaimed;
2048 }
2049
2050 unsigned int reclaim_clean_pages_from_list(struct zone *zone,
2051 struct list_head *folio_list)
2052 {
2053 struct scan_control sc = {
2054 .gfp_mask = GFP_KERNEL,
2055 .may_unmap = 1,
2056 };
2057 struct reclaim_stat stat;
2058 unsigned int nr_reclaimed;
2059 struct folio *folio, *next;
2060 LIST_HEAD(clean_folios);
2061 unsigned int noreclaim_flag;
2062
2063 list_for_each_entry_safe(folio, next, folio_list, lru) {
2064 if (!folio_test_hugetlb(folio) && folio_is_file_lru(folio) &&
2065 !folio_test_dirty(folio) && !__folio_test_movable(folio) &&
2066 !folio_test_unevictable(folio)) {
2067 folio_clear_active(folio);
2068 list_move(&folio->lru, &clean_folios);
2069 }
2070 }
2071
2072
2073
2074
2075
2076
2077
2078 noreclaim_flag = memalloc_noreclaim_save();
2079 nr_reclaimed = shrink_page_list(&clean_folios, zone->zone_pgdat, &sc,
2080 &stat, true);
2081 memalloc_noreclaim_restore(noreclaim_flag);
2082
2083 list_splice(&clean_folios, folio_list);
2084 mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE,
2085 -(long)nr_reclaimed);
2086
2087
2088
2089
2090
2091
2092 mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON,
2093 stat.nr_lazyfree_fail);
2094 mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE,
2095 -(long)stat.nr_lazyfree_fail);
2096 return nr_reclaimed;
2097 }

/*
 * Update LRU sizes after isolating pages. The LRU size updates must
 * be complete before mem_cgroup_update_lru_size due to a sanity check.
 */
2103 static __always_inline void update_lru_sizes(struct lruvec *lruvec,
2104 enum lru_list lru, unsigned long *nr_zone_taken)
2105 {
2106 int zid;
2107
2108 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
2109 if (!nr_zone_taken[zid])
2110 continue;
2111
2112 update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]);
2113 }
2114
2115 }

/*
 * Isolate folios from the lruvec onto @dst, scanning up to @nr_to_scan
 * eligible folios.
 *
 * lruvec->lru_lock is heavily contended. Some of the functions that
 * shrink the lists perform better by taking out a batch of folios
 * and working on them outside the LRU lock.
 *
 * lru_lock must be held before calling this function.
 *
 * @nr_to_scan:	The number of eligible folios to look through on the list.
 * @lruvec:	The LRU vector to pull folios from.
 * @dst:	The temp list to put folios on to.
 * @nr_scanned:	The number of folios that were scanned.
 * @sc:		The scan_control struct for this reclaim session.
 * @lru:	LRU list id for isolating.
 *
 * Returns how many folios were moved onto *@dst.
 */
2138 static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
2139 struct lruvec *lruvec, struct list_head *dst,
2140 unsigned long *nr_scanned, struct scan_control *sc,
2141 enum lru_list lru)
2142 {
2143 struct list_head *src = &lruvec->lists[lru];
2144 unsigned long nr_taken = 0;
2145 unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 };
2146 unsigned long nr_skipped[MAX_NR_ZONES] = { 0, };
2147 unsigned long skipped = 0;
2148 unsigned long scan, total_scan, nr_pages;
2149 LIST_HEAD(folios_skipped);
2150
2151 total_scan = 0;
2152 scan = 0;
2153 while (scan < nr_to_scan && !list_empty(src)) {
2154 struct list_head *move_to = src;
2155 struct folio *folio;
2156
2157 folio = lru_to_folio(src);
2158 prefetchw_prev_lru_folio(folio, src, flags);
2159
2160 nr_pages = folio_nr_pages(folio);
2161 total_scan += nr_pages;
2162
2163 if (folio_zonenum(folio) > sc->reclaim_idx) {
2164 nr_skipped[folio_zonenum(folio)] += nr_pages;
2165 move_to = &folios_skipped;
2166 goto move;
2167 }
2168
2169
2170
2171
2172
2173
2174
2175
2176 scan += nr_pages;
2177
2178 if (!folio_test_lru(folio))
2179 goto move;
2180 if (!sc->may_unmap && folio_mapped(folio))
2181 goto move;
2182
2183
2184
2185
2186
2187
2188 if (unlikely(!folio_try_get(folio)))
2189 goto move;
2190
2191 if (!folio_test_clear_lru(folio)) {
2192
2193 folio_put(folio);
2194 goto move;
2195 }
2196
2197 nr_taken += nr_pages;
2198 nr_zone_taken[folio_zonenum(folio)] += nr_pages;
2199 move_to = dst;
2200 move:
2201 list_move(&folio->lru, move_to);
2202 }
2203
2204
2205
2206
2207
2208
2209
2210
2211 if (!list_empty(&folios_skipped)) {
2212 int zid;
2213
2214 list_splice(&folios_skipped, src);
2215 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
2216 if (!nr_skipped[zid])
2217 continue;
2218
2219 __count_zid_vm_events(PGSCAN_SKIP, zid, nr_skipped[zid]);
2220 skipped += nr_skipped[zid];
2221 }
2222 }
2223 *nr_scanned = total_scan;
2224 trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan,
2225 total_scan, skipped, nr_taken,
2226 sc->may_unmap ? 0 : ISOLATE_UNMAPPED, lru);
2227 update_lru_sizes(lruvec, lru, nr_zone_taken);
2228 return nr_taken;
2229 }

/**
 * folio_isolate_lru() - Try to isolate a folio from its LRU list.
 * @folio: Folio to isolate from its LRU list.
 *
 * Isolate a @folio from an LRU list and adjust the vmstat statistic
 * corresponding to whatever LRU list the folio was on.
 *
 * The folio will have its LRU flag cleared. If it was found on the
 * active list, it will have the Active flag set. If it was found on the
 * unevictable list, it will have the Unevictable flag set. These flags
 * may need to be cleared by the caller before letting the folio go.
 *
 * Context:
 *
 * (1) Must be called with an elevated refcount on the folio. This is a
 *     fundamental difference from isolate_lru_pages() (which is called
 *     without a stable reference).
 * (2) The lru_lock must not be held.
 * (3) Interrupts must be enabled.
 *
 * Return: 0 if the folio was removed from an LRU list.
 * -EBUSY if the folio was not on an LRU list.
 */
2254 int folio_isolate_lru(struct folio *folio)
2255 {
2256 int ret = -EBUSY;
2257
2258 VM_BUG_ON_FOLIO(!folio_ref_count(folio), folio);
2259
2260 if (folio_test_clear_lru(folio)) {
2261 struct lruvec *lruvec;
2262
2263 folio_get(folio);
2264 lruvec = folio_lruvec_lock_irq(folio);
2265 lruvec_del_folio(lruvec, folio);
2266 unlock_page_lruvec_irq(lruvec);
2267 ret = 0;
2268 }
2269
2270 return ret;
2271 }

/*
 * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and
 * then get rescheduled. When there are massive number of tasks doing page
 * allocation, such sleeping direct reclaimers may keep piling up on each CPU,
 * the LRU list will go small and be scanned faster than necessary, leading to
 * unnecessary swapping, thrashing and OOM.
 */
2280 static int too_many_isolated(struct pglist_data *pgdat, int file,
2281 struct scan_control *sc)
2282 {
2283 unsigned long inactive, isolated;
2284 bool too_many;
2285
2286 if (current_is_kswapd())
2287 return 0;
2288
2289 if (!writeback_throttling_sane(sc))
2290 return 0;
2291
2292 if (file) {
2293 inactive = node_page_state(pgdat, NR_INACTIVE_FILE);
2294 isolated = node_page_state(pgdat, NR_ISOLATED_FILE);
2295 } else {
2296 inactive = node_page_state(pgdat, NR_INACTIVE_ANON);
2297 isolated = node_page_state(pgdat, NR_ISOLATED_ANON);
2298 }
2299
2300
2301
2302
2303
2304
2305 if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
2306 inactive >>= 3;
2307
2308 too_many = isolated > inactive;
2309
2310
2311 if (!too_many)
2312 wake_throttle_isolated(pgdat);
2313
2314 return too_many;
2315 }

/*
 * move_pages_to_lru() moves folios from private @list to appropriate LRU list.
 * On return, @list is reused as a list of folios to be freed by the caller.
 *
 * Returns the number of pages moved to the given lruvec.
 */
2323 static unsigned int move_pages_to_lru(struct lruvec *lruvec,
2324 struct list_head *list)
2325 {
2326 int nr_pages, nr_moved = 0;
2327 LIST_HEAD(folios_to_free);
2328
2329 while (!list_empty(list)) {
2330 struct folio *folio = lru_to_folio(list);
2331
2332 VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
2333 list_del(&folio->lru);
2334 if (unlikely(!folio_evictable(folio))) {
2335 spin_unlock_irq(&lruvec->lru_lock);
2336 folio_putback_lru(folio);
2337 spin_lock_irq(&lruvec->lru_lock);
2338 continue;
2339 }
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352 folio_set_lru(folio);
2353
2354 if (unlikely(folio_put_testzero(folio))) {
2355 __folio_clear_lru_flags(folio);
2356
2357 if (unlikely(folio_test_large(folio))) {
2358 spin_unlock_irq(&lruvec->lru_lock);
2359 destroy_large_folio(folio);
2360 spin_lock_irq(&lruvec->lru_lock);
2361 } else
2362 list_add(&folio->lru, &folios_to_free);
2363
2364 continue;
2365 }
2366
2367
2368
2369
2370
2371 VM_BUG_ON_FOLIO(!folio_matches_lruvec(folio, lruvec), folio);
2372 lruvec_add_folio(lruvec, folio);
2373 nr_pages = folio_nr_pages(folio);
2374 nr_moved += nr_pages;
2375 if (folio_test_active(folio))
2376 workingset_age_nonresident(lruvec, nr_pages);
2377 }
2378
2379
2380
2381
2382 list_splice(&folios_to_free, list);
2383
2384 return nr_moved;
2385 }

/*
 * If a kernel thread (such as nfsd for loop-back mounts) services a backing
 * device by writing to the page cache it sets PF_LOCAL_THROTTLE. In this case
 * we should not throttle.  Otherwise it is safe to do so.
 */
2392 static int current_may_throttle(void)
2393 {
2394 return !(current->flags & PF_LOCAL_THROTTLE);
2395 }

/*
 * shrink_inactive_list() is a helper for shrink_node().  It returns the number
 * of reclaimed pages.
 */
2401 static unsigned long
2402 shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
2403 struct scan_control *sc, enum lru_list lru)
2404 {
2405 LIST_HEAD(page_list);
2406 unsigned long nr_scanned;
2407 unsigned int nr_reclaimed = 0;
2408 unsigned long nr_taken;
2409 struct reclaim_stat stat;
2410 bool file = is_file_lru(lru);
2411 enum vm_event_item item;
2412 struct pglist_data *pgdat = lruvec_pgdat(lruvec);
2413 bool stalled = false;
2414
2415 while (unlikely(too_many_isolated(pgdat, file, sc))) {
2416 if (stalled)
2417 return 0;
2418
2419
2420 stalled = true;
2421 reclaim_throttle(pgdat, VMSCAN_THROTTLE_ISOLATED);
2422
2423
2424 if (fatal_signal_pending(current))
2425 return SWAP_CLUSTER_MAX;
2426 }
2427
2428 lru_add_drain();
2429
2430 spin_lock_irq(&lruvec->lru_lock);
2431
2432 nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
2433 &nr_scanned, sc, lru);
2434
2435 __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
2436 item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT;
2437 if (!cgroup_reclaim(sc))
2438 __count_vm_events(item, nr_scanned);
2439 __count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned);
2440 __count_vm_events(PGSCAN_ANON + file, nr_scanned);
2441
2442 spin_unlock_irq(&lruvec->lru_lock);
2443
2444 if (nr_taken == 0)
2445 return 0;
2446
2447 nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, &stat, false);
2448
2449 spin_lock_irq(&lruvec->lru_lock);
2450 move_pages_to_lru(lruvec, &page_list);
2451
2452 __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
2453 item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT;
2454 if (!cgroup_reclaim(sc))
2455 __count_vm_events(item, nr_reclaimed);
2456 __count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed);
2457 __count_vm_events(PGSTEAL_ANON + file, nr_reclaimed);
2458 spin_unlock_irq(&lruvec->lru_lock);
2459
2460 lru_note_cost(lruvec, file, stat.nr_pageout);
2461 mem_cgroup_uncharge_list(&page_list);
2462 free_unref_page_list(&page_list);

/*
 * If dirty folios are scanned that are not queued for IO, it
 * implies that flushers are not doing their job. This can
 * happen when memory pressure pushes dirty folios to the end of
 * the LRU before the dirty limits are breached and the dirty
 * data has expired, or when the proportion of dirty folios grows
 * through reclaim of clean cache rather than through writes.
 * Nudge the flusher threads in case they are asleep.
 */
2475 if (stat.nr_unqueued_dirty == nr_taken)
2476 wakeup_flusher_threads(WB_REASON_VMSCAN);
2477
2478 sc->nr.dirty += stat.nr_dirty;
2479 sc->nr.congested += stat.nr_congested;
2480 sc->nr.unqueued_dirty += stat.nr_unqueued_dirty;
2481 sc->nr.writeback += stat.nr_writeback;
2482 sc->nr.immediate += stat.nr_immediate;
2483 sc->nr.taken += nr_taken;
2484 if (file)
2485 sc->nr.file_taken += nr_taken;
2486
2487 trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id,
2488 nr_scanned, nr_reclaimed, &stat, sc->priority, file);
2489 return nr_reclaimed;
2490 }

/*
 * shrink_active_list() moves folios from the active LRU to the inactive LRU.
 *
 * We move them the other way if the folio is referenced by one or more
 * processes.
 *
 * If the folios are mostly unmapped, the processing is fast and it is
 * appropriate to hold lru_lock across the whole operation.  But if
 * the folios are mapped, the processing is slow (folio_referenced()), so
 * we should drop lru_lock around each folio.  It's impossible to balance
 * this, so instead we remove the folios from the LRU while processing them.
 * It is safe to rely on the active flag against the non-LRU folios in here
 * because nobody will play with that bit on a non-LRU folio.
 *
 * The downside is that we have to touch folio->_refcount against each folio.
 * But we had to alter folio->flags anyway.
 */
2509 static void shrink_active_list(unsigned long nr_to_scan,
2510 struct lruvec *lruvec,
2511 struct scan_control *sc,
2512 enum lru_list lru)
2513 {
2514 unsigned long nr_taken;
2515 unsigned long nr_scanned;
2516 unsigned long vm_flags;
2517 LIST_HEAD(l_hold);
2518 LIST_HEAD(l_active);
2519 LIST_HEAD(l_inactive);
2520 unsigned nr_deactivate, nr_activate;
2521 unsigned nr_rotated = 0;
2522 int file = is_file_lru(lru);
2523 struct pglist_data *pgdat = lruvec_pgdat(lruvec);
2524
2525 lru_add_drain();
2526
2527 spin_lock_irq(&lruvec->lru_lock);
2528
2529 nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
2530 &nr_scanned, sc, lru);
2531
2532 __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
2533
2534 if (!cgroup_reclaim(sc))
2535 __count_vm_events(PGREFILL, nr_scanned);
2536 __count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned);
2537
2538 spin_unlock_irq(&lruvec->lru_lock);
2539
2540 while (!list_empty(&l_hold)) {
2541 struct folio *folio;
2542
2543 cond_resched();
2544 folio = lru_to_folio(&l_hold);
2545 list_del(&folio->lru);
2546
2547 if (unlikely(!folio_evictable(folio))) {
2548 folio_putback_lru(folio);
2549 continue;
2550 }
2551
2552 if (unlikely(buffer_heads_over_limit)) {
2553 if (folio_test_private(folio) && folio_trylock(folio)) {
2554 if (folio_test_private(folio))
2555 filemap_release_folio(folio, 0);
2556 folio_unlock(folio);
2557 }
2558 }
2559
2560
2561 if (folio_referenced(folio, 0, sc->target_mem_cgroup,
2562 &vm_flags) != 0) {
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572 if ((vm_flags & VM_EXEC) && folio_is_file_lru(folio)) {
2573 nr_rotated += folio_nr_pages(folio);
2574 list_add(&folio->lru, &l_active);
2575 continue;
2576 }
2577 }
2578
2579 folio_clear_active(folio);
2580 folio_set_workingset(folio);
2581 list_add(&folio->lru, &l_inactive);
2582 }
2583
2584
2585
2586
2587 spin_lock_irq(&lruvec->lru_lock);
2588
2589 nr_activate = move_pages_to_lru(lruvec, &l_active);
2590 nr_deactivate = move_pages_to_lru(lruvec, &l_inactive);
2591
2592 list_splice(&l_inactive, &l_active);
2593
2594 __count_vm_events(PGDEACTIVATE, nr_deactivate);
2595 __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate);
2596
2597 __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
2598 spin_unlock_irq(&lruvec->lru_lock);
2599
2600 mem_cgroup_uncharge_list(&l_active);
2601 free_unref_page_list(&l_active);
2602 trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate,
2603 nr_deactivate, nr_rotated, sc->priority, file);
2604 }
2605
2606 static unsigned int reclaim_page_list(struct list_head *page_list,
2607 struct pglist_data *pgdat)
2608 {
2609 struct reclaim_stat dummy_stat;
2610 unsigned int nr_reclaimed;
2611 struct folio *folio;
2612 struct scan_control sc = {
2613 .gfp_mask = GFP_KERNEL,
2614 .may_writepage = 1,
2615 .may_unmap = 1,
2616 .may_swap = 1,
2617 .no_demotion = 1,
2618 };
2619
2620 nr_reclaimed = shrink_page_list(page_list, pgdat, &sc, &dummy_stat, false);
2621 while (!list_empty(page_list)) {
2622 folio = lru_to_folio(page_list);
2623 list_del(&folio->lru);
2624 folio_putback_lru(folio);
2625 }
2626
2627 return nr_reclaimed;
2628 }
2629
2630 unsigned long reclaim_pages(struct list_head *folio_list)
2631 {
2632 int nid;
2633 unsigned int nr_reclaimed = 0;
2634 LIST_HEAD(node_folio_list);
2635 unsigned int noreclaim_flag;
2636
2637 if (list_empty(folio_list))
2638 return nr_reclaimed;
2639
2640 noreclaim_flag = memalloc_noreclaim_save();
2641
2642 nid = folio_nid(lru_to_folio(folio_list));
2643 do {
2644 struct folio *folio = lru_to_folio(folio_list);
2645
2646 if (nid == folio_nid(folio)) {
2647 folio_clear_active(folio);
2648 list_move(&folio->lru, &node_folio_list);
2649 continue;
2650 }
2651
2652 nr_reclaimed += reclaim_page_list(&node_folio_list, NODE_DATA(nid));
2653 nid = folio_nid(lru_to_folio(folio_list));
2654 } while (!list_empty(folio_list));
2655
2656 nr_reclaimed += reclaim_page_list(&node_folio_list, NODE_DATA(nid));
2657
2658 memalloc_noreclaim_restore(noreclaim_flag);
2659
2660 return nr_reclaimed;
2661 }
2662
2663 static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
2664 struct lruvec *lruvec, struct scan_control *sc)
2665 {
2666 if (is_active_lru(lru)) {
2667 if (sc->may_deactivate & (1 << is_file_lru(lru)))
2668 shrink_active_list(nr_to_scan, lruvec, sc, lru);
2669 else
2670 sc->skipped_deactivate = 1;
2671 return 0;
2672 }
2673
2674 return shrink_inactive_list(nr_to_scan, lruvec, sc, lru);
2675 }

/*
 * The inactive anon list should be small enough that the VM never has
 * to do too much work.
 *
 * The inactive file list should be small enough to leave most memory
 * to the established workingset on the scan-resistant active list,
 * but large enough to avoid thrashing the aggregate readahead window.
 *
 * Both inactive lists should also be large enough that each inactive
 * folio has a chance to be referenced again before it is reclaimed.
 *
 * If that fails and refaulting is observed, the inactive list grows.
 *
 * The inactive_ratio is the target ratio of ACTIVE to INACTIVE folios
 * on this LRU, maintained by the pageout code. An inactive_ratio
 * of 3 means 3:1 or 25% of the folios are kept on the inactive list.
 *
 * total     target    max
 * memory    ratio     inactive
 * -------------------------------------
 *   10MB       1         5MB
 *  100MB       1        50MB
 *    1GB       3       250MB
 *   10GB      10       0.9GB
 *  100GB      31         3GB
 *    1TB     101        10GB
 *   10TB     320        32GB
 */
2705 static bool inactive_is_low(struct lruvec *lruvec, enum lru_list inactive_lru)
2706 {
2707 enum lru_list active_lru = inactive_lru + LRU_ACTIVE;
2708 unsigned long inactive, active;
2709 unsigned long inactive_ratio;
2710 unsigned long gb;
2711
2712 inactive = lruvec_page_state(lruvec, NR_LRU_BASE + inactive_lru);
2713 active = lruvec_page_state(lruvec, NR_LRU_BASE + active_lru);
2714
2715 gb = (inactive + active) >> (30 - PAGE_SHIFT);
2716 if (gb)
2717 inactive_ratio = int_sqrt(10 * gb);
2718 else
2719 inactive_ratio = 1;
2720
2721 return inactive * inactive_ratio < active;
2722 }
2723
2724 enum scan_balance {
2725 SCAN_EQUAL,
2726 SCAN_FRACT,
2727 SCAN_ANON,
2728 SCAN_FILE,
2729 };

/*
 * Determine how aggressively the anon and file LRU lists should be
 * scanned.
 *
 * nr[0] = anon inactive folios to scan; nr[1] = anon active folios to scan
 * nr[2] = file inactive folios to scan; nr[3] = file active folios to scan
 */
2738 static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
2739 unsigned long *nr)
2740 {
2741 struct pglist_data *pgdat = lruvec_pgdat(lruvec);
2742 struct mem_cgroup *memcg = lruvec_memcg(lruvec);
2743 unsigned long anon_cost, file_cost, total_cost;
2744 int swappiness = mem_cgroup_swappiness(memcg);
2745 u64 fraction[ANON_AND_FILE];
2746 u64 denominator = 0;
2747 enum scan_balance scan_balance;
2748 unsigned long ap, fp;
2749 enum lru_list lru;
2750
2751 /* If we have no swap space, do not bother scanning anon folios. */
2752 if (!sc->may_swap || !can_reclaim_anon_pages(memcg, pgdat->node_id, sc)) {
2753 scan_balance = SCAN_FILE;
2754 goto out;
2755 }
2756
2757 /*
2758  * Global reclaim will swap to prevent OOM even with no
2759  * swappiness, but memcg users want to use this knob to
2760  * disable swapping for individual groups completely when
2761  * using the memory controller's swap limit feature would be
2762  * too expensive.
2763  */
2764 if (cgroup_reclaim(sc) && !swappiness) {
2765 scan_balance = SCAN_FILE;
2766 goto out;
2767 }
2768
2769 /*
2770  * Do not apply any pressure balancing cleverness when the
2771  * system is close to OOM, scan both anon and file equally
2772  * (unless the swappiness setting disagrees with swapping).
2773  */
2774 if (!sc->priority && swappiness) {
2775 scan_balance = SCAN_EQUAL;
2776 goto out;
2777 }
2778
2779 /*
2780  * If the system is almost out of file pages, force-scan anon.
2781  */
2782 if (sc->file_is_tiny) {
2783 scan_balance = SCAN_ANON;
2784 goto out;
2785 }
2786
2787 /*
2788  * If there is enough inactive page cache, we do not reclaim
2789  * anything from the anonymous working set right now.
2790  */
2791 if (sc->cache_trim_mode) {
2792 scan_balance = SCAN_FILE;
2793 goto out;
2794 }
2795
2796 scan_balance = SCAN_FRACT;
2797
2798 /*
2799  * Calculate the pressure balance between anon and file pages.
2800  *
2801  * The amount of pressure we put on each LRU is inversely
2802  * proportional to the cost of reclaiming each list, as
2803  * determined by the share of pages that are refaulting, times
2804  * the relative IO cost of bringing back a swapped out
2805  * anonymous page vs reloading a filesystem page (swappiness).
2806  *
2807  * Although we limit that influence to ensure no list gets
2808  * left behind completely: at least a third of the pressure is
2809  * applied, even if it should be much less, because pages are
2810  * expected to be evicted even when the effective cost is up.
2811  */
2812 total_cost = sc->anon_cost + sc->file_cost;
2813 anon_cost = total_cost + sc->anon_cost;
2814 file_cost = total_cost + sc->file_cost;
2815 total_cost = anon_cost + file_cost;
2816
2817 ap = swappiness * (total_cost + 1);
2818 ap /= anon_cost + 1;
2819
2820 fp = (200 - swappiness) * (total_cost + 1);
2821 fp /= file_cost + 1;
2822
2823 fraction[0] = ap;
2824 fraction[1] = fp;
2825 denominator = ap + fp;
2826 out:
2827 for_each_evictable_lru(lru) {
2828 int file = is_file_lru(lru);
2829 unsigned long lruvec_size;
2830 unsigned long low, min;
2831 unsigned long scan;
2832
2833 lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
2834 mem_cgroup_protection(sc->target_mem_cgroup, memcg,
2835 &min, &low);
2836
2837 if (min || low) {
2838 /*
2839  * Scale a cgroup's reclaim pressure by proportioning
2840  * its current usage to its memory.low or memory.min
2841  * setting.
2842  *
2843  * This is important, as otherwise scanning aggression
2844  * becomes extremely binary -- from nothing as we
2845  * approach the memory protection threshold, to totally
2846  * nominal as we exceed it.  This results in requiring
2847  * setting extremely liberal protection thresholds. It
2848  * also means we simply get no protection at all if we
2849  * set it too low, which is not ideal.
2850  *
2851  * If there is any protection in place, we reduce scan
2852  * pressure by how much of the total memory used is
2853  * within protection thresholds.
2854  *
2855  * There is one special case: in the first reclaim pass,
2856  * we skip over all groups that are within their low
2857  * protection. If that fails to reclaim enough pages to
2858  * satisfy the reclaim goal, we come back and override
2859  * the best-effort low protection. However, we still
2860  * ideally want to honor how well-behaved groups are in
2861  * that case instead of simply punishing them all
2862  * equally. As such, we reclaim them based on how much
2863  * of their best-effort protection is being used. Usage
2864  * below memory.min is excluded from consideration when
2865  * calculating utilisation, as it isn't ever reclaimable.
2866  */
2867 unsigned long cgroup_size = mem_cgroup_size(memcg);
2868 unsigned long protection;
2869
2870
2871 if (!sc->memcg_low_reclaim && low > min) {
2872 protection = low;
2873 sc->memcg_low_skipped = 1;
2874 } else {
2875 protection = min;
2876 }
2877
2878
2879 cgroup_size = max(cgroup_size, protection);
2880
2881 scan = lruvec_size - lruvec_size * protection /
2882 (cgroup_size + 1);
2883
2884 /*
2885  * Minimally target SWAP_CLUSTER_MAX pages to keep
2886  * reclaim moving forwards, avoiding decrementing
2887  * sc->nr_to_reclaim by minuscule amounts.
2888  */
2889 scan = max(scan, SWAP_CLUSTER_MAX);
2890 } else {
2891 scan = lruvec_size;
2892 }
2893
2894 scan >>= sc->priority;
2895
2896
2897
2898
2899
2900 if (!scan && !mem_cgroup_online(memcg))
2901 scan = min(lruvec_size, SWAP_CLUSTER_MAX);
2902
2903 switch (scan_balance) {
2904 case SCAN_EQUAL:
2905
2906 break;
2907 case SCAN_FRACT:
2908 /*
2909  * Scan types proportional to swappiness and
2910  * their relative recent reclaim efficiency.
2911  * Make sure we don't miss the last page on
2912  * the offlined memcgs because of a round-off
2913  * error.
2914  */
2915 scan = mem_cgroup_online(memcg) ?
2916 div64_u64(scan * fraction[file], denominator) :
2917 DIV64_U64_ROUND_UP(scan * fraction[file],
2918 denominator);
2919 break;
2920 case SCAN_FILE:
2921 case SCAN_ANON:
2922
2923 if ((scan_balance == SCAN_FILE) != file)
2924 scan = 0;
2925 break;
2926 default:
2927
2928 BUG();
2929 }
2930
2931 nr[lru] = scan;
2932 }
2933 }
2934
2935 /*
2936  * Anonymous LRU management is a waste if there is
2937  * ultimately no way to reclaim the memory.
2938  */
2939 static bool can_age_anon_pages(struct pglist_data *pgdat,
2940 struct scan_control *sc)
2941 {
2942
2943 if (total_swap_pages > 0)
2944 return true;
2945
2946
2947 return can_demote(pgdat->node_id, sc);
2948 }
2949
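/*
 * Reclaim from all LRU lists of one lruvec, in SWAP_CLUSTER_MAX batches,
 * according to the per-list scan targets computed by get_scan_count().
 */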
2950 static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
2951 {
2952 unsigned long nr[NR_LRU_LISTS];
2953 unsigned long targets[NR_LRU_LISTS];
2954 unsigned long nr_to_scan;
2955 enum lru_list lru;
2956 unsigned long nr_reclaimed = 0;
2957 unsigned long nr_to_reclaim = sc->nr_to_reclaim;
2958 struct blk_plug plug;
2959 bool scan_adjusted;
2960
2961 get_scan_count(lruvec, sc, nr);
2962
2963
2964 memcpy(targets, nr, sizeof(nr));
2965
2966 /*
2967  * Global reclaiming within direct reclaim at DEF_PRIORITY is a normal
2968  * event that can occur when there is little memory pressure e.g.
2969  * multiple streaming readers/writers. Hence, we do not abort scanning
2970  * when the requested number of pages are reclaimed when scanning at
2971  * DEF_PRIORITY on the assumption that the fact we are direct
2972  * reclaiming implies that kswapd is not keeping up and it is best to
2973  * do a batch of work at once. For memcg reclaim one check is made to
2974  * abort proportional reclaim if either the file or anon lru has already
2975  * dropped to zero at the first pass.
2976  */
2977 scan_adjusted = (!cgroup_reclaim(sc) && !current_is_kswapd() &&
2978 sc->priority == DEF_PRIORITY);
2979
2980 blk_start_plug(&plug);
2981 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
2982 nr[LRU_INACTIVE_FILE]) {
2983 unsigned long nr_anon, nr_file, percentage;
2984 unsigned long nr_scanned;
2985
2986 for_each_evictable_lru(lru) {
2987 if (nr[lru]) {
2988 nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
2989 nr[lru] -= nr_to_scan;
2990
2991 nr_reclaimed += shrink_list(lru, nr_to_scan,
2992 lruvec, sc);
2993 }
2994 }
2995
2996 cond_resched();
2997
2998 if (nr_reclaimed < nr_to_reclaim || scan_adjusted)
2999 continue;
3000
3001
3002
3003
3004
3005
3006
3007
3008 nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE];
3009 nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON];
3010
3011
3012
3013
3014
3015
3016
3017 if (!nr_file || !nr_anon)
3018 break;
3019
3020 if (nr_file > nr_anon) {
3021 unsigned long scan_target = targets[LRU_INACTIVE_ANON] +
3022 targets[LRU_ACTIVE_ANON] + 1;
3023 lru = LRU_BASE;
3024 percentage = nr_anon * 100 / scan_target;
3025 } else {
3026 unsigned long scan_target = targets[LRU_INACTIVE_FILE] +
3027 targets[LRU_ACTIVE_FILE] + 1;
3028 lru = LRU_FILE;
3029 percentage = nr_file * 100 / scan_target;
3030 }
3031
3032
3033 nr[lru] = 0;
3034 nr[lru + LRU_ACTIVE] = 0;
3035
3036
3037
3038
3039
3040 lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE;
3041 nr_scanned = targets[lru] - nr[lru];
3042 nr[lru] = targets[lru] * (100 - percentage) / 100;
3043 nr[lru] -= min(nr[lru], nr_scanned);
3044
3045 lru += LRU_ACTIVE;
3046 nr_scanned = targets[lru] - nr[lru];
3047 nr[lru] = targets[lru] * (100 - percentage) / 100;
3048 nr[lru] -= min(nr[lru], nr_scanned);
3049
3050 scan_adjusted = true;
3051 }
3052 blk_finish_plug(&plug);
3053 sc->nr_reclaimed += nr_reclaimed;
3054
3055
3056
3057
3058
3059 if (can_age_anon_pages(lruvec_pgdat(lruvec), sc) &&
3060 inactive_is_low(lruvec, LRU_INACTIVE_ANON))
3061 shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
3062 sc, LRU_ACTIVE_ANON);
3063 }
3064
3065
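/* Use reclaim/compaction for costly allocs or under memory pressure */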
3066 static bool in_reclaim_compaction(struct scan_control *sc)
3067 {
3068 if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
3069 (sc->order > PAGE_ALLOC_COSTLY_ORDER ||
3070 sc->priority < DEF_PRIORITY - 2))
3071 return true;
3072
3073 return false;
3074 }
3075
3076 /*
3077  * Reclaim/compaction is used for high-order allocation requests. It reclaims
3078  * order-0 pages before compacting the zone. should_continue_reclaim() returns
3079  * true if more pages should be reclaimed such that when the page allocator
3080  * calls try_to_compact_pages() that it will have enough free pages to succeed.
3081  * It will give up earlier than that if there is difficulty reclaiming pages.
3082  */
3083 static inline bool should_continue_reclaim(struct pglist_data *pgdat,
3084 unsigned long nr_reclaimed,
3085 struct scan_control *sc)
3086 {
3087 unsigned long pages_for_compaction;
3088 unsigned long inactive_lru_pages;
3089 int z;
3090
3091
3092 if (!in_reclaim_compaction(sc))
3093 return false;
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105 if (!nr_reclaimed)
3106 return false;
3107
3108
3109 for (z = 0; z <= sc->reclaim_idx; z++) {
3110 struct zone *zone = &pgdat->node_zones[z];
3111 if (!managed_zone(zone))
3112 continue;
3113
3114 switch (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx)) {
3115 case COMPACT_SUCCESS:
3116 case COMPACT_CONTINUE:
3117 return false;
3118 default:
3119
3120 ;
3121 }
3122 }
3123
3124 /*
3125  * If we have not reclaimed enough pages for compaction and the
3126  * inactive lists are large enough, continue reclaiming
3127  */
3128 pages_for_compaction = compact_gap(sc->order);
3129 inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE);
3130 if (can_reclaim_anon_pages(NULL, pgdat->node_id, sc))
3131 inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON);
3132
3133 return inactive_lru_pages > pages_for_compaction;
3134 }
3135
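/*
 * Walk every memcg in the hierarchy below sc->target_mem_cgroup and shrink
 * its lruvec and slab caches on this node, honouring memory.min and
 * memory.low protection along the way.
 */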
3136 static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
3137 {
3138 struct mem_cgroup *target_memcg = sc->target_mem_cgroup;
3139 struct mem_cgroup *memcg;
3140
3141 memcg = mem_cgroup_iter(target_memcg, NULL, NULL);
3142 do {
3143 struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
3144 unsigned long reclaimed;
3145 unsigned long scanned;
3146
3147
3148
3149
3150
3151
3152
3153 cond_resched();
3154
3155 mem_cgroup_calculate_protection(target_memcg, memcg);
3156
3157 if (mem_cgroup_below_min(memcg)) {
3158
3159
3160
3161
3162 continue;
3163 } else if (mem_cgroup_below_low(memcg)) {
3164
3165
3166
3167
3168
3169
3170 if (!sc->memcg_low_reclaim) {
3171 sc->memcg_low_skipped = 1;
3172 continue;
3173 }
3174 memcg_memory_event(memcg, MEMCG_LOW);
3175 }
3176
3177 reclaimed = sc->nr_reclaimed;
3178 scanned = sc->nr_scanned;
3179
3180 shrink_lruvec(lruvec, sc);
3181
3182 shrink_slab(sc->gfp_mask, pgdat->node_id, memcg,
3183 sc->priority);
3184
3185
3186 if (!sc->proactive)
3187 vmpressure(sc->gfp_mask, memcg, false,
3188 sc->nr_scanned - scanned,
3189 sc->nr_reclaimed - reclaimed);
3190
3191 } while ((memcg = mem_cgroup_iter(target_memcg, memcg, NULL)));
3192 }
3193
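/*
 * Reclaim from one node: pick the anon/file scan balance, shrink the
 * memcgs on the node, and keep going while should_continue_reclaim()
 * says reclaim/compaction would still benefit from it.
 */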
3194 static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
3195 {
3196 struct reclaim_state *reclaim_state = current->reclaim_state;
3197 unsigned long nr_reclaimed, nr_scanned;
3198 struct lruvec *target_lruvec;
3199 bool reclaimable = false;
3200 unsigned long file;
3201
3202 target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
3203
3204 again:
3205 /*
3206  * Flush the memory cgroup stats, so that we read accurate per-memcg
3207  * lruvec stats for heuristics.
3208  */
3209 mem_cgroup_flush_stats();
3210
3211 memset(&sc->nr, 0, sizeof(sc->nr));
3212
3213 nr_reclaimed = sc->nr_reclaimed;
3214 nr_scanned = sc->nr_scanned;
3215
3216 /*
3217  * Determine the scan balance between anon and file LRUs.
3218  */
3219 spin_lock_irq(&target_lruvec->lru_lock);
3220 sc->anon_cost = target_lruvec->anon_cost;
3221 sc->file_cost = target_lruvec->file_cost;
3222 spin_unlock_irq(&target_lruvec->lru_lock);
3223
3224 /*
3225  * Target desirable inactive:active list ratios for the anon
3226  * and file LRU lists.
3227  */
3228 if (!sc->force_deactivate) {
3229 unsigned long refaults;
3230
3231 refaults = lruvec_page_state(target_lruvec,
3232 WORKINGSET_ACTIVATE_ANON);
3233 if (refaults != target_lruvec->refaults[0] ||
3234 inactive_is_low(target_lruvec, LRU_INACTIVE_ANON))
3235 sc->may_deactivate |= DEACTIVATE_ANON;
3236 else
3237 sc->may_deactivate &= ~DEACTIVATE_ANON;
3238
3239
3240
3241
3242
3243
3244 refaults = lruvec_page_state(target_lruvec,
3245 WORKINGSET_ACTIVATE_FILE);
3246 if (refaults != target_lruvec->refaults[1] ||
3247 inactive_is_low(target_lruvec, LRU_INACTIVE_FILE))
3248 sc->may_deactivate |= DEACTIVATE_FILE;
3249 else
3250 sc->may_deactivate &= ~DEACTIVATE_FILE;
3251 } else
3252 sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE;
3253
3254 /*
3255  * If we have plenty of inactive file pages that aren't
3256  * thrashing, try to reclaim those first before touching
3257  * anonymous pages.
3258  */
3259 file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE);
3260 if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE))
3261 sc->cache_trim_mode = 1;
3262 else
3263 sc->cache_trim_mode = 0;
3264
3265 /*
3266  * Prevent the reclaimer from falling into the cache trap: as
3267  * cache pages start out inactive, every cache fault will tip
3268  * the scan balance towards the file LRU.  And as the file LRU
3269  * shrinks, so does the window for rotation from references.
3270  * This means we have a runaway feedback loop where a tiny
3271  * thrashing file LRU becomes infinitely more attractive than
3272  * anon pages.  Try to detect this based on file LRU size.
3273  */
3274 if (!cgroup_reclaim(sc)) {
3275 unsigned long total_high_wmark = 0;
3276 unsigned long free, anon;
3277 int z;
3278
3279 free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
3280 file = node_page_state(pgdat, NR_ACTIVE_FILE) +
3281 node_page_state(pgdat, NR_INACTIVE_FILE);
3282
3283 for (z = 0; z < MAX_NR_ZONES; z++) {
3284 struct zone *zone = &pgdat->node_zones[z];
3285 if (!managed_zone(zone))
3286 continue;
3287
3288 total_high_wmark += high_wmark_pages(zone);
3289 }
3290
3291
3292
3293
3294
3295
3296 anon = node_page_state(pgdat, NR_INACTIVE_ANON);
3297
3298 sc->file_is_tiny =
3299 file + free <= total_high_wmark &&
3300 !(sc->may_deactivate & DEACTIVATE_ANON) &&
3301 anon >> sc->priority;
3302 }
3303
3304 shrink_node_memcgs(pgdat, sc);
3305
3306 if (reclaim_state) {
3307 sc->nr_reclaimed += reclaim_state->reclaimed_slab;
3308 reclaim_state->reclaimed_slab = 0;
3309 }
3310
3311
3312 if (!sc->proactive)
3313 vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
3314 sc->nr_scanned - nr_scanned,
3315 sc->nr_reclaimed - nr_reclaimed);
3316
3317 if (sc->nr_reclaimed - nr_reclaimed)
3318 reclaimable = true;
3319
3320 if (current_is_kswapd()) {
3321 /*
3322  * If reclaim is isolating dirty pages under writeback,
3323  * it implies that the long-lived page allocation rate
3324  * is exceeding the page laundering rate. Either the
3325  * global limits are not being effective at throttling
3326  * processes due to the page distribution throughout
3327  * zones or there is heavy usage of a slow backing
3328  * device. The only option is to throttle from reclaim
3329  * context which is not ideal as there is no guarantee
3330  * the dirtying process is throttled in the same way
3331  * balance_dirty_pages() manages.
3332  *
3333  * Once a node is flagged PGDAT_WRITEBACK, kswapd will
3334  * count the number of pages under writeback and flagged
3335  * for immediate reclaim, and stall if any are encountered
3336  * in the nr.immediate check below.
3337  */
3338 if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken)
3339 set_bit(PGDAT_WRITEBACK, &pgdat->flags);
3340
3341
3342 if (sc->nr.unqueued_dirty == sc->nr.file_taken)
3343 set_bit(PGDAT_DIRTY, &pgdat->flags);
3344
3345
3346
3347
3348
3349
3350
3351
3352 if (sc->nr.immediate)
3353 reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK);
3354 }
3355
3356
3357
3358
3359
3360
3361
3362
3363 if ((current_is_kswapd() ||
3364 (cgroup_reclaim(sc) && writeback_throttling_sane(sc))) &&
3365 sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
3366 set_bit(LRUVEC_CONGESTED, &target_lruvec->flags);
3367
3368
3369
3370
3371
3372
3373
3374 if (!current_is_kswapd() && current_may_throttle() &&
3375 !sc->hibernation_mode &&
3376 test_bit(LRUVEC_CONGESTED, &target_lruvec->flags))
3377 reclaim_throttle(pgdat, VMSCAN_THROTTLE_CONGESTED);
3378
3379 if (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
3380 sc))
3381 goto again;
3382
3383
3384
3385
3386
3387
3388
3389 if (reclaimable)
3390 pgdat->kswapd_failures = 0;
3391 }
3392
3393 /*
3394  * Returns true if compaction should go ahead for a costly-order request, or
3395  * the allocation would already succeed without compaction. Return false if we
3396  * should reclaim first.
3397  */
3398 static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
3399 {
3400 unsigned long watermark;
3401 enum compact_result suitable;
3402
3403 suitable = compaction_suitable(zone, sc->order, 0, sc->reclaim_idx);
3404 if (suitable == COMPACT_SUCCESS)
3405
3406 return true;
3407 if (suitable == COMPACT_SKIPPED)
3408
3409 return false;
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420 watermark = high_wmark_pages(zone) + compact_gap(sc->order);
3421
3422 return zone_watermark_ok_safe(zone, 0, watermark, sc->reclaim_idx);
3423 }
3424
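/*
 * Decide whether a direct reclaimer that is making little progress should
 * be throttled; when progress is healthy, wake any tasks already waiting
 * on VMSCAN_THROTTLE_NOPROGRESS instead.
 */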
3425 static void consider_reclaim_throttle(pg_data_t *pgdat, struct scan_control *sc)
3426 {
3427
3428
3429
3430
3431 if (sc->nr_reclaimed > (sc->nr_scanned >> 3)) {
3432 wait_queue_head_t *wqh;
3433
3434 wqh = &pgdat->reclaim_wait[VMSCAN_THROTTLE_NOPROGRESS];
3435 if (waitqueue_active(wqh))
3436 wake_up(wqh);
3437
3438 return;
3439 }
3440
3441
3442
3443
3444
3445
3446
3447 if (current_is_kswapd() || cgroup_reclaim(sc))
3448 return;
3449
3450
3451 if (sc->priority == 1 && !sc->nr_reclaimed)
3452 reclaim_throttle(pgdat, VMSCAN_THROTTLE_NOPROGRESS);
3453 }
3454
3455 /*
3456  * This is the direct reclaim path, for page-allocating processes.  We only
3457  * try to reclaim pages from zones which will satisfy the caller's allocation
3458  * request.
3459  *
3460  * If a zone is deemed to be full of pinned pages then just give up on it, and
3461  * move on to the next zone.
3462  */
3463 static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
3464 {
3465 struct zoneref *z;
3466 struct zone *zone;
3467 unsigned long nr_soft_reclaimed;
3468 unsigned long nr_soft_scanned;
3469 gfp_t orig_mask;
3470 pg_data_t *last_pgdat = NULL;
3471 pg_data_t *first_pgdat = NULL;
3472
3473
3474
3475
3476
3477
3478 orig_mask = sc->gfp_mask;
3479 if (buffer_heads_over_limit) {
3480 sc->gfp_mask |= __GFP_HIGHMEM;
3481 sc->reclaim_idx = gfp_zone(sc->gfp_mask);
3482 }
3483
3484 for_each_zone_zonelist_nodemask(zone, z, zonelist,
3485 sc->reclaim_idx, sc->nodemask) {
3486
3487
3488
3489
3490 if (!cgroup_reclaim(sc)) {
3491 if (!cpuset_zone_allowed(zone,
3492 GFP_KERNEL | __GFP_HARDWALL))
3493 continue;
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504 if (IS_ENABLED(CONFIG_COMPACTION) &&
3505 sc->order > PAGE_ALLOC_COSTLY_ORDER &&
3506 compaction_ready(zone, sc)) {
3507 sc->compaction_ready = true;
3508 continue;
3509 }
3510
3511
3512
3513
3514
3515
3516
3517 if (zone->zone_pgdat == last_pgdat)
3518 continue;
3519
3520
3521
3522
3523
3524
3525
3526 nr_soft_scanned = 0;
3527 nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone->zone_pgdat,
3528 sc->order, sc->gfp_mask,
3529 &nr_soft_scanned);
3530 sc->nr_reclaimed += nr_soft_reclaimed;
3531 sc->nr_scanned += nr_soft_scanned;
3532
3533 }
3534
3535 if (!first_pgdat)
3536 first_pgdat = zone->zone_pgdat;
3537
3538
3539 if (zone->zone_pgdat == last_pgdat)
3540 continue;
3541 last_pgdat = zone->zone_pgdat;
3542 shrink_node(zone->zone_pgdat, sc);
3543 }
3544
3545 if (first_pgdat)
3546 consider_reclaim_throttle(first_pgdat, sc);
3547
3548
3549
3550
3551
3552 sc->gfp_mask = orig_mask;
3553 }
3554
3555 static void snapshot_refaults(struct mem_cgroup *target_memcg, pg_data_t *pgdat)
3556 {
3557 struct lruvec *target_lruvec;
3558 unsigned long refaults;
3559
3560 target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
3561 refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_ANON);
3562 target_lruvec->refaults[0] = refaults;
3563 refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_FILE);
3564 target_lruvec->refaults[1] = refaults;
3565 }
3566
3567 /*
3568  * This is the main entry point to direct page reclaim.
3569  *
3570  * If a full scan of the inactive list fails to free enough memory then we
3571  * are "out of memory" and something needs to be killed.
3572  *
3573  * If the caller is !__GFP_FS then the probability of a failure is reasonably
3574  * high - the zone may be full of dirty or under-writeback pages, which this
3575  * caller can't do much about.  We kick the writeback threads and take explicit
3576  * naps in the hope that some of these pages can be written.  But if the
3577  * allocating task holds filesystem locks which prevent writeout this might not
3578  * work, and the allocation attempt will fail.
3579  *
3580  * returns:	0, if no pages reclaimed
3581  * 		else, the number of pages reclaimed
3582  */
3583 static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
3584 struct scan_control *sc)
3585 {
3586 int initial_priority = sc->priority;
3587 pg_data_t *last_pgdat;
3588 struct zoneref *z;
3589 struct zone *zone;
3590 retry:
3591 delayacct_freepages_start();
3592
3593 if (!cgroup_reclaim(sc))
3594 __count_zid_vm_events(ALLOCSTALL, sc->reclaim_idx, 1);
3595
3596 do {
3597 if (!sc->proactive)
3598 vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
3599 sc->priority);
3600 sc->nr_scanned = 0;
3601 shrink_zones(zonelist, sc);
3602
3603 if (sc->nr_reclaimed >= sc->nr_to_reclaim)
3604 break;
3605
3606 if (sc->compaction_ready)
3607 break;
3608
3609
3610
3611
3612
3613 if (sc->priority < DEF_PRIORITY - 2)
3614 sc->may_writepage = 1;
3615 } while (--sc->priority >= 0);
3616
3617 last_pgdat = NULL;
3618 for_each_zone_zonelist_nodemask(zone, z, zonelist, sc->reclaim_idx,
3619 sc->nodemask) {
3620 if (zone->zone_pgdat == last_pgdat)
3621 continue;
3622 last_pgdat = zone->zone_pgdat;
3623
3624 snapshot_refaults(sc->target_mem_cgroup, zone->zone_pgdat);
3625
3626 if (cgroup_reclaim(sc)) {
3627 struct lruvec *lruvec;
3628
3629 lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup,
3630 zone->zone_pgdat);
3631 clear_bit(LRUVEC_CONGESTED, &lruvec->flags);
3632 }
3633 }
3634
3635 delayacct_freepages_end();
3636
3637 if (sc->nr_reclaimed)
3638 return sc->nr_reclaimed;
3639
3640
3641 if (sc->compaction_ready)
3642 return 1;
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653 if (sc->skipped_deactivate) {
3654 sc->priority = initial_priority;
3655 sc->force_deactivate = 1;
3656 sc->skipped_deactivate = 0;
3657 goto retry;
3658 }
3659
3660
3661 if (sc->memcg_low_skipped) {
3662 sc->priority = initial_priority;
3663 sc->force_deactivate = 0;
3664 sc->memcg_low_reclaim = 1;
3665 sc->memcg_low_skipped = 0;
3666 goto retry;
3667 }
3668
3669 return 0;
3670 }
3671
3672 static bool allow_direct_reclaim(pg_data_t *pgdat)
3673 {
3674 struct zone *zone;
3675 unsigned long pfmemalloc_reserve = 0;
3676 unsigned long free_pages = 0;
3677 int i;
3678 bool wmark_ok;
3679
3680 if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
3681 return true;
3682
3683 for (i = 0; i <= ZONE_NORMAL; i++) {
3684 zone = &pgdat->node_zones[i];
3685 if (!managed_zone(zone))
3686 continue;
3687
3688 if (!zone_reclaimable_pages(zone))
3689 continue;
3690
3691 pfmemalloc_reserve += min_wmark_pages(zone);
3692 free_pages += zone_page_state(zone, NR_FREE_PAGES);
3693 }
3694
3695
3696 if (!pfmemalloc_reserve)
3697 return true;
3698
3699 wmark_ok = free_pages > pfmemalloc_reserve / 2;
3700
3701
3702 if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) {
3703 if (READ_ONCE(pgdat->kswapd_highest_zoneidx) > ZONE_NORMAL)
3704 WRITE_ONCE(pgdat->kswapd_highest_zoneidx, ZONE_NORMAL);
3705
3706 wake_up_interruptible(&pgdat->kswapd_wait);
3707 }
3708
3709 return wmark_ok;
3710 }
3711
3712 /*
3713  * Throttle direct reclaimers if backing storage is backed by the network
3714  * and the PFMEMALLOC reserve for the preferred node is getting dangerously
3715  * depleted. kswapd will continue to make progress and wake the processes
3716  * when the low watermark is reached.
3717  *
3718  * Returns true if a fatal signal was delivered during throttling. If this
3719  * happens, the page allocator should not consider triggering the OOM killer.
3720  */
3721 static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
3722 nodemask_t *nodemask)
3723 {
3724 struct zoneref *z;
3725 struct zone *zone;
3726 pg_data_t *pgdat = NULL;
3727
3728
3729
3730
3731
3732
3733
3734
3735 if (current->flags & PF_KTHREAD)
3736 goto out;
3737
3738
3739
3740
3741
3742 if (fatal_signal_pending(current))
3743 goto out;
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759 for_each_zone_zonelist_nodemask(zone, z, zonelist,
3760 gfp_zone(gfp_mask), nodemask) {
3761 if (zone_idx(zone) > ZONE_NORMAL)
3762 continue;
3763
3764
3765 pgdat = zone->zone_pgdat;
3766 if (allow_direct_reclaim(pgdat))
3767 goto out;
3768 break;
3769 }
3770
3771
3772 if (!pgdat)
3773 goto out;
3774
3775
3776 count_vm_event(PGSCAN_DIRECT_THROTTLE);
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786 if (!(gfp_mask & __GFP_FS))
3787 wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
3788 allow_direct_reclaim(pgdat), HZ);
3789 else
3790
3791 wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
3792 allow_direct_reclaim(pgdat));
3793
3794 if (fatal_signal_pending(current))
3795 return true;
3796
3797 out:
3798 return false;
3799 }
3800
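/*
 * The main entry point for direct reclaim from the page allocator.  May
 * throttle on low pfmemalloc reserves before doing any work; returns 1
 * in that case if a fatal signal is pending, so the caller backs off
 * rather than invoking the OOM killer.
 */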
3801 unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
3802 gfp_t gfp_mask, nodemask_t *nodemask)
3803 {
3804 unsigned long nr_reclaimed;
3805 struct scan_control sc = {
3806 .nr_to_reclaim = SWAP_CLUSTER_MAX,
3807 .gfp_mask = current_gfp_context(gfp_mask),
3808 .reclaim_idx = gfp_zone(gfp_mask),
3809 .order = order,
3810 .nodemask = nodemask,
3811 .priority = DEF_PRIORITY,
3812 .may_writepage = !laptop_mode,
3813 .may_unmap = 1,
3814 .may_swap = 1,
3815 };
3816
3817
3818
3819
3820
3821 BUILD_BUG_ON(MAX_ORDER > S8_MAX);
3822 BUILD_BUG_ON(DEF_PRIORITY > S8_MAX);
3823 BUILD_BUG_ON(MAX_NR_ZONES > S8_MAX);
3824
3825
3826
3827
3828
3829
3830 if (throttle_direct_reclaim(sc.gfp_mask, zonelist, nodemask))
3831 return 1;
3832
3833 set_task_reclaim_state(current, &sc.reclaim_state);
3834 trace_mm_vmscan_direct_reclaim_begin(order, sc.gfp_mask);
3835
3836 nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
3837
3838 trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
3839 set_task_reclaim_state(current, NULL);
3840
3841 return nr_reclaimed;
3842 }
3843
3844 #ifdef CONFIG_MEMCG
3845
3846
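/*
 * Shrink only this memcg's lruvec on @pgdat; used by the soft limit
 * reclaim path (see the tracepoints below).  The number of pages scanned
 * is reported back through @nr_scanned.
 */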
3847 unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
3848 gfp_t gfp_mask, bool noswap,
3849 pg_data_t *pgdat,
3850 unsigned long *nr_scanned)
3851 {
3852 struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
3853 struct scan_control sc = {
3854 .nr_to_reclaim = SWAP_CLUSTER_MAX,
3855 .target_mem_cgroup = memcg,
3856 .may_writepage = !laptop_mode,
3857 .may_unmap = 1,
3858 .reclaim_idx = MAX_NR_ZONES - 1,
3859 .may_swap = !noswap,
3860 };
3861
3862 WARN_ON_ONCE(!current->reclaim_state);
3863
3864 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
3865 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
3866
3867 trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order,
3868 sc.gfp_mask);
3869
3870
3871
3872
3873
3874
3875
3876
3877 shrink_lruvec(lruvec, &sc);
3878
3879 trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
3880
3881 *nr_scanned = sc.nr_scanned;
3882
3883 return sc.nr_reclaimed;
3884 }
3885
3886 unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
3887 unsigned long nr_pages,
3888 gfp_t gfp_mask,
3889 unsigned int reclaim_options)
3890 {
3891 unsigned long nr_reclaimed;
3892 unsigned int noreclaim_flag;
3893 struct scan_control sc = {
3894 .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
3895 .gfp_mask = (current_gfp_context(gfp_mask) & GFP_RECLAIM_MASK) |
3896 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
3897 .reclaim_idx = MAX_NR_ZONES - 1,
3898 .target_mem_cgroup = memcg,
3899 .priority = DEF_PRIORITY,
3900 .may_writepage = !laptop_mode,
3901 .may_unmap = 1,
3902 .may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP),
3903 .proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE),
3904 };
3905
3906
3907
3908
3909
3910 struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
3911
3912 set_task_reclaim_state(current, &sc.reclaim_state);
3913 trace_mm_vmscan_memcg_reclaim_begin(0, sc.gfp_mask);
3914 noreclaim_flag = memalloc_noreclaim_save();
3915
3916 nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
3917
3918 memalloc_noreclaim_restore(noreclaim_flag);
3919 trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
3920 set_task_reclaim_state(current, NULL);
3921
3922 return nr_reclaimed;
3923 }
3924 #endif
3925
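/*
 * Age the active anon lists of every memcg on this node so that workingset
 * information keeps flowing to the inactive lists even when no other
 * reclaim is taking place; called from kswapd's balance_pgdat().
 */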
3926 static void age_active_anon(struct pglist_data *pgdat,
3927 struct scan_control *sc)
3928 {
3929 struct mem_cgroup *memcg;
3930 struct lruvec *lruvec;
3931
3932 if (!can_age_anon_pages(pgdat, sc))
3933 return;
3934
3935 lruvec = mem_cgroup_lruvec(NULL, pgdat);
3936 if (!inactive_is_low(lruvec, LRU_INACTIVE_ANON))
3937 return;
3938
3939 memcg = mem_cgroup_iter(NULL, NULL, NULL);
3940 do {
3941 lruvec = mem_cgroup_lruvec(memcg, pgdat);
3942 shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
3943 sc, LRU_ACTIVE_ANON);
3944 memcg = mem_cgroup_iter(NULL, memcg, NULL);
3945 } while (memcg);
3946 }
3947
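/*
 * Returns true if any eligible zone on the node still has a watermark
 * boost outstanding.
 */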
3948 static bool pgdat_watermark_boosted(pg_data_t *pgdat, int highest_zoneidx)
3949 {
3950 int i;
3951 struct zone *zone;
3952
3953
3954
3955
3956
3957
3958
3959
3960 for (i = highest_zoneidx; i >= 0; i--) {
3961 zone = pgdat->node_zones + i;
3962 if (!managed_zone(zone))
3963 continue;
3964
3965 if (zone->watermark_boost)
3966 return true;
3967 }
3968
3969 return false;
3970 }
3971
3972 /*
3973  * Returns true if there is an eligible zone balanced for the request order
3974  * and highest_zoneidx
3975  */
3976 static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx)
3977 {
3978 int i;
3979 unsigned long mark = -1;
3980 struct zone *zone;
3981
3982
3983
3984
3985
3986 for (i = 0; i <= highest_zoneidx; i++) {
3987 zone = pgdat->node_zones + i;
3988
3989 if (!managed_zone(zone))
3990 continue;
3991
3992 if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING)
3993 mark = wmark_pages(zone, WMARK_PROMO);
3994 else
3995 mark = high_wmark_pages(zone);
3996 if (zone_watermark_ok_safe(zone, order, mark, highest_zoneidx))
3997 return true;
3998 }
3999
4000
4001
4002
4003
4004
4005 if (mark == -1)
4006 return true;
4007
4008 return false;
4009 }
4010
4011
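/* Clear pgdat state for congested, dirty or under writeback. */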
4012 static void clear_pgdat_congested(pg_data_t *pgdat)
4013 {
4014 struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat);
4015
4016 clear_bit(LRUVEC_CONGESTED, &lruvec->flags);
4017 clear_bit(PGDAT_DIRTY, &pgdat->flags);
4018 clear_bit(PGDAT_WRITEBACK, &pgdat->flags);
4019 }
4020
4021 /*
4022  * Prepare kswapd for sleeping. This verifies that there are no processes
4023  * waiting in throttle_direct_reclaim() and that watermarks have been met.
4024  *
4025  * Returns true if kswapd is ready to sleep
4026  */
4027 static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order,
4028 int highest_zoneidx)
4029 {
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043 if (waitqueue_active(&pgdat->pfmemalloc_wait))
4044 wake_up_all(&pgdat->pfmemalloc_wait);
4045
4046
4047 if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
4048 return true;
4049
4050 if (pgdat_balanced(pgdat, order, highest_zoneidx)) {
4051 clear_pgdat_congested(pgdat);
4052 return true;
4053 }
4054
4055 return false;
4056 }
4057
4058 /*
4059  * kswapd shrinks a node of pages that are at or below the highest usable
4060  * zone that is currently unbalanced.
4061  *
4062  * Returns true if kswapd scanned at least the requested number of pages to
4063  * reclaim or if the lack of progress was due to pages under writeback.
4064  * This is used to determine if the scanning priority needs to be raised.
4065  */
4066 static bool kswapd_shrink_node(pg_data_t *pgdat,
4067 struct scan_control *sc)
4068 {
4069 struct zone *zone;
4070 int z;
4071
4072
4073 sc->nr_to_reclaim = 0;
4074 for (z = 0; z <= sc->reclaim_idx; z++) {
4075 zone = pgdat->node_zones + z;
4076 if (!managed_zone(zone))
4077 continue;
4078
4079 sc->nr_to_reclaim += max(high_wmark_pages(zone), SWAP_CLUSTER_MAX);
4080 }
4081
4082
4083
4084
4085
4086 shrink_node(pgdat, sc);
4087
4088
4089
4090
4091
4092
4093
4094
4095 if (sc->order && sc->nr_reclaimed >= compact_gap(sc->order))
4096 sc->order = 0;
4097
4098 return sc->nr_scanned >= sc->nr_to_reclaim;
4099 }
4100
4101
4102 static inline void
4103 update_reclaim_active(pg_data_t *pgdat, int highest_zoneidx, bool active)
4104 {
4105 int i;
4106 struct zone *zone;
4107
4108 for (i = 0; i <= highest_zoneidx; i++) {
4109 zone = pgdat->node_zones + i;
4110
4111 if (!managed_zone(zone))
4112 continue;
4113
4114 if (active)
4115 set_bit(ZONE_RECLAIM_ACTIVE, &zone->flags);
4116 else
4117 clear_bit(ZONE_RECLAIM_ACTIVE, &zone->flags);
4118 }
4119 }
4120
4121 static inline void
4122 set_reclaim_active(pg_data_t *pgdat, int highest_zoneidx)
4123 {
4124 update_reclaim_active(pgdat, highest_zoneidx, true);
4125 }
4126
4127 static inline void
4128 clear_reclaim_active(pg_data_t *pgdat, int highest_zoneidx)
4129 {
4130 update_reclaim_active(pgdat, highest_zoneidx, false);
4131 }
4132
4133 /*
4134  * For kswapd, balance_pgdat() will reclaim pages across a node from zones
4135  * that are eligible for use by the caller until at least one zone is
4136  * balanced.
4137  *
4138  * Returns the order kswapd finished reclaiming at.
4139  *
4140  * kswapd scans the zones in the highmem->normal->dma direction.  It skips
4141  * zones which have free_pages > high_wmark_pages(zone), but once a zone is
4142  * found to have free_pages <= high_wmark_pages(zone), any page in that zone
4143  * could be pinned before reclaiming all pages on it, so check the whole
4144  * node if reclaiming that zone alone does not leave it balanced.
4145  */
4146 static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
4147 {
4148 int i;
4149 unsigned long nr_soft_reclaimed;
4150 unsigned long nr_soft_scanned;
4151 unsigned long pflags;
4152 unsigned long nr_boost_reclaim;
4153 unsigned long zone_boosts[MAX_NR_ZONES] = { 0, };
4154 bool boosted;
4155 struct zone *zone;
4156 struct scan_control sc = {
4157 .gfp_mask = GFP_KERNEL,
4158 .order = order,
4159 .may_unmap = 1,
4160 };
4161
4162 set_task_reclaim_state(current, &sc.reclaim_state);
4163 psi_memstall_enter(&pflags);
4164 __fs_reclaim_acquire(_THIS_IP_);
4165
4166 count_vm_event(PAGEOUTRUN);
4167
4168
4169
4170
4171
4172
4173 nr_boost_reclaim = 0;
4174 for (i = 0; i <= highest_zoneidx; i++) {
4175 zone = pgdat->node_zones + i;
4176 if (!managed_zone(zone))
4177 continue;
4178
4179 nr_boost_reclaim += zone->watermark_boost;
4180 zone_boosts[i] = zone->watermark_boost;
4181 }
4182 boosted = nr_boost_reclaim;
4183
4184 restart:
4185 set_reclaim_active(pgdat, highest_zoneidx);
4186 sc.priority = DEF_PRIORITY;
4187 do {
4188 unsigned long nr_reclaimed = sc.nr_reclaimed;
4189 bool raise_priority = true;
4190 bool balanced;
4191 bool ret;
4192
4193 sc.reclaim_idx = highest_zoneidx;
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205 if (buffer_heads_over_limit) {
4206 for (i = MAX_NR_ZONES - 1; i >= 0; i--) {
4207 zone = pgdat->node_zones + i;
4208 if (!managed_zone(zone))
4209 continue;
4210
4211 sc.reclaim_idx = i;
4212 break;
4213 }
4214 }
4215
4216
4217
4218
4219
4220
4221
4222
4223 balanced = pgdat_balanced(pgdat, sc.order, highest_zoneidx);
4224 if (!balanced && nr_boost_reclaim) {
4225 nr_boost_reclaim = 0;
4226 goto restart;
4227 }
4228
4229
4230
4231
4232
4233
4234 if (!nr_boost_reclaim && balanced)
4235 goto out;
4236
4237
4238 if (nr_boost_reclaim && sc.priority == DEF_PRIORITY - 2)
4239 raise_priority = false;
4240
4241
4242
4243
4244
4245
4246
4247 sc.may_writepage = !laptop_mode && !nr_boost_reclaim;
4248 sc.may_swap = !nr_boost_reclaim;
4249
4250
4251
4252
4253
4254
4255
4256 age_active_anon(pgdat, &sc);
4257
4258
4259
4260
4261
4262 if (sc.priority < DEF_PRIORITY - 2)
4263 sc.may_writepage = 1;
4264
4265
4266 sc.nr_scanned = 0;
4267 nr_soft_scanned = 0;
4268 nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(pgdat, sc.order,
4269 sc.gfp_mask, &nr_soft_scanned);
4270 sc.nr_reclaimed += nr_soft_reclaimed;
4271
4272
4273
4274
4275
4276
4277 if (kswapd_shrink_node(pgdat, &sc))
4278 raise_priority = false;
4279
4280
4281
4282
4283
4284
4285 if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
4286 allow_direct_reclaim(pgdat))
4287 wake_up_all(&pgdat->pfmemalloc_wait);
4288
4289
4290 __fs_reclaim_release(_THIS_IP_);
4291 ret = try_to_freeze();
4292 __fs_reclaim_acquire(_THIS_IP_);
4293 if (ret || kthread_should_stop())
4294 break;
4295
4296
4297
4298
4299
4300 nr_reclaimed = sc.nr_reclaimed - nr_reclaimed;
4301 nr_boost_reclaim -= min(nr_boost_reclaim, nr_reclaimed);
4302
4303
4304
4305
4306
4307
4308 if (nr_boost_reclaim && !nr_reclaimed)
4309 break;
4310
4311 if (raise_priority || !nr_reclaimed)
4312 sc.priority--;
4313 } while (sc.priority >= 1);
4314
4315 if (!sc.nr_reclaimed)
4316 pgdat->kswapd_failures++;
4317
4318 out:
4319 clear_reclaim_active(pgdat, highest_zoneidx);
4320
4321
4322 if (boosted) {
4323 unsigned long flags;
4324
4325 for (i = 0; i <= highest_zoneidx; i++) {
4326 if (!zone_boosts[i])
4327 continue;
4328
4329
4330 zone = pgdat->node_zones + i;
4331 spin_lock_irqsave(&zone->lock, flags);
4332 zone->watermark_boost -= min(zone->watermark_boost, zone_boosts[i]);
4333 spin_unlock_irqrestore(&zone->lock, flags);
4334 }
4335
4336
4337
4338
4339
4340 wakeup_kcompactd(pgdat, pageblock_order, highest_zoneidx);
4341 }
4342
4343 snapshot_refaults(NULL, pgdat);
4344 __fs_reclaim_release(_THIS_IP_);
4345 psi_memstall_leave(&pflags);
4346 set_task_reclaim_state(current, NULL);
4347
4348
4349
4350
4351
4352
4353
4354 return sc.order;
4355 }
4356
4357 /*
4358  * The pgdat->kswapd_highest_zoneidx is used to pass the highest zone index to
4359  * be reclaimed by kswapd from the waker. If the value is MAX_NR_ZONES which is
4360  * not a valid index then either kswapd runs for first time or kswapd couldn't
4361  * sleep after previous reclaim attempt (node is still unbalanced). In that
4362  * case return the zone index of the previous kswapd reclaim cycle.
4363  */
4364 static enum zone_type kswapd_highest_zoneidx(pg_data_t *pgdat,
4365 enum zone_type prev_highest_zoneidx)
4366 {
4367 enum zone_type curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx);
4368
4369 return curr_idx == MAX_NR_ZONES ? prev_highest_zoneidx : curr_idx;
4370 }
4371
4372 static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order,
4373 unsigned int highest_zoneidx)
4374 {
4375 long remaining = 0;
4376 DEFINE_WAIT(wait);
4377
4378 if (freezing(current) || kthread_should_stop())
4379 return;
4380
4381 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
4382
4383
4384
4385
4386
4387
4388
4389
4390 if (prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) {
4391
4392
4393
4394
4395
4396
4397 reset_isolation_suitable(pgdat);
4398
4399
4400
4401
4402
4403 wakeup_kcompactd(pgdat, alloc_order, highest_zoneidx);
4404
4405 remaining = schedule_timeout(HZ/10);
4406
4407
4408
4409
4410
4411
4412 if (remaining) {
4413 WRITE_ONCE(pgdat->kswapd_highest_zoneidx,
4414 kswapd_highest_zoneidx(pgdat,
4415 highest_zoneidx));
4416
4417 if (READ_ONCE(pgdat->kswapd_order) < reclaim_order)
4418 WRITE_ONCE(pgdat->kswapd_order, reclaim_order);
4419 }
4420
4421 finish_wait(&pgdat->kswapd_wait, &wait);
4422 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
4423 }
4424
4425
4426
4427
4428
4429 if (!remaining &&
4430 prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) {
4431 trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441 set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
4442
4443 if (!kthread_should_stop())
4444 schedule();
4445
4446 set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
4447 } else {
4448 if (remaining)
4449 count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
4450 else
4451 count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
4452 }
4453 finish_wait(&pgdat->kswapd_wait, &wait);
4454 }
4455
4456 /*
4457  * The background pageout daemon, started as a kernel thread
4458  * from the init process.
4459  *
4460  * This basically trickles out pages so that we have _some_
4461  * free memory available even if there is no other activity
4462  * that frees anything up. This is needed for things like routing
4463  * etc, where we otherwise might have all activity going on in
4464  * asynchronous contexts that cannot page things out.
4465  *
4466  * If there are applications that are active memory-allocators
4467  * (most normal use), this basically shouldn't matter.
4468  */
4469 static int kswapd(void *p)
4470 {
4471 unsigned int alloc_order, reclaim_order;
4472 unsigned int highest_zoneidx = MAX_NR_ZONES - 1;
4473 pg_data_t *pgdat = (pg_data_t *)p;
4474 struct task_struct *tsk = current;
4475 const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
4476
4477 if (!cpumask_empty(cpumask))
4478 set_cpus_allowed_ptr(tsk, cpumask);
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492 tsk->flags |= PF_MEMALLOC | PF_KSWAPD;
4493 set_freezable();
4494
4495 WRITE_ONCE(pgdat->kswapd_order, 0);
4496 WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);
4497 atomic_set(&pgdat->nr_writeback_throttled, 0);
4498 for ( ; ; ) {
4499 bool ret;
4500
4501 alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order);
4502 highest_zoneidx = kswapd_highest_zoneidx(pgdat,
4503 highest_zoneidx);
4504
4505 kswapd_try_sleep:
4506 kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order,
4507 highest_zoneidx);
4508
4509
4510 alloc_order = READ_ONCE(pgdat->kswapd_order);
4511 highest_zoneidx = kswapd_highest_zoneidx(pgdat,
4512 highest_zoneidx);
4513 WRITE_ONCE(pgdat->kswapd_order, 0);
4514 WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);
4515
4516 ret = try_to_freeze();
4517 if (kthread_should_stop())
4518 break;
4519
4520
4521
4522
4523
4524 if (ret)
4525 continue;
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535 trace_mm_vmscan_kswapd_wake(pgdat->node_id, highest_zoneidx,
4536 alloc_order);
4537 reclaim_order = balance_pgdat(pgdat, alloc_order,
4538 highest_zoneidx);
4539 if (reclaim_order < alloc_order)
4540 goto kswapd_try_sleep;
4541 }
4542
4543 tsk->flags &= ~(PF_MEMALLOC | PF_KSWAPD);
4544
4545 return 0;
4546 }
4547
4548 /*
4549  * A zone is low on free memory or too fragmented for high-order memory.  If
4550  * kswapd should reclaim (direct reclaim is deferred), wake it up for the zone's
4551  * pgdat.  It will wake up kcompactd after reclaiming memory.  If kswapd reclaim
4552  * has failed or is not needed, still wake up kcompactd if only compaction is
4553  * needed.
4554  */
4555 void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
4556 enum zone_type highest_zoneidx)
4557 {
4558 pg_data_t *pgdat;
4559 enum zone_type curr_idx;
4560
4561 if (!managed_zone(zone))
4562 return;
4563
4564 if (!cpuset_zone_allowed(zone, gfp_flags))
4565 return;
4566
4567 pgdat = zone->zone_pgdat;
4568 curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx);
4569
4570 if (curr_idx == MAX_NR_ZONES || curr_idx < highest_zoneidx)
4571 WRITE_ONCE(pgdat->kswapd_highest_zoneidx, highest_zoneidx);
4572
4573 if (READ_ONCE(pgdat->kswapd_order) < order)
4574 WRITE_ONCE(pgdat->kswapd_order, order);
4575
4576 if (!waitqueue_active(&pgdat->kswapd_wait))
4577 return;
4578
4579
4580 if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ||
4581 (pgdat_balanced(pgdat, order, highest_zoneidx) &&
4582 !pgdat_watermark_boosted(pgdat, highest_zoneidx))) {
4583
4584
4585
4586
4587
4588
4589
4590 if (!(gfp_flags & __GFP_DIRECT_RECLAIM))
4591 wakeup_kcompactd(pgdat, order, highest_zoneidx);
4592 return;
4593 }
4594
4595 trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, highest_zoneidx, order,
4596 gfp_flags);
4597 wake_up_interruptible(&pgdat->kswapd_wait);
4598 }
4599
4600 #ifdef CONFIG_HIBERNATION
4601 /*
4602  * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of
4603  * freed pages.
4604  *
4605  * Rather than trying to age LRUs the aim is to preserve the overall
4606  * LRU order by reclaiming preferentially
4607  * inactive > active > active referenced > active mapped
4608  */
4609 unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
4610 {
4611 struct scan_control sc = {
4612 .nr_to_reclaim = nr_to_reclaim,
4613 .gfp_mask = GFP_HIGHUSER_MOVABLE,
4614 .reclaim_idx = MAX_NR_ZONES - 1,
4615 .priority = DEF_PRIORITY,
4616 .may_writepage = 1,
4617 .may_unmap = 1,
4618 .may_swap = 1,
4619 .hibernation_mode = 1,
4620 };
4621 struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
4622 unsigned long nr_reclaimed;
4623 unsigned int noreclaim_flag;
4624
4625 fs_reclaim_acquire(sc.gfp_mask);
4626 noreclaim_flag = memalloc_noreclaim_save();
4627 set_task_reclaim_state(current, &sc.reclaim_state);
4628
4629 nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
4630
4631 set_task_reclaim_state(current, NULL);
4632 memalloc_noreclaim_restore(noreclaim_flag);
4633 fs_reclaim_release(sc.gfp_mask);
4634
4635 return nr_reclaimed;
4636 }
4637 #endif
4638
4639 /*
4640  * This kswapd start function will be called by init and node-hot-add.
4641  */
4642 void kswapd_run(int nid)
4643 {
4644 pg_data_t *pgdat = NODE_DATA(nid);
4645
4646 if (pgdat->kswapd)
4647 return;
4648
4649 pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
4650 if (IS_ERR(pgdat->kswapd)) {
4651
4652 BUG_ON(system_state < SYSTEM_RUNNING);
4653 pr_err("Failed to start kswapd on node %d\n", nid);
4654 pgdat->kswapd = NULL;
4655 }
4656 }
4657
4658 /*
4659  * Called by memory hotplug when all memory in a node is offlined.  Caller must
4660  * be holding mem_hotplug_begin/done().
4661  */
4662 void kswapd_stop(int nid)
4663 {
4664 struct task_struct *kswapd = NODE_DATA(nid)->kswapd;
4665
4666 if (kswapd) {
4667 kthread_stop(kswapd);
4668 NODE_DATA(nid)->kswapd = NULL;
4669 }
4670 }
4671
4672 static int __init kswapd_init(void)
4673 {
4674 int nid;
4675
4676 swap_setup();
4677 for_each_node_state(nid, N_MEMORY)
4678 kswapd_run(nid);
4679 return 0;
4680 }
4681
4682 module_init(kswapd_init)
4683
4684 #ifdef CONFIG_NUMA
4685 /*
4686  * Node reclaim mode
4687  *
4688  * If non-zero call node_reclaim when the number of free pages falls below
4689  * the watermarks.
4690  */
4691 int node_reclaim_mode __read_mostly;
4692
4693 /*
4694  * Priority for NODE_RECLAIM. This determines the fraction of pages
4695  * of a node that are to be scanned when looking for free pages: at
4696  * priority 4, roughly 1/16th of each LRU is scanned per pass.
4697  */
4698 #define NODE_RECLAIM_PRIORITY 4
4699
4700 /*
4701  * Percentage of pages in a zone that must be unmapped for node_reclaim to
4702  * occur.
4703  */
4704 int sysctl_min_unmapped_ratio = 1;
4705
4706 /*
4707  * If the number of slab pages in a zone grows beyond this percentage then
4708  * slab reclaim needs to occur.
4709  */
4710 int sysctl_min_slab_ratio = 5;
4711
4712 static inline unsigned long node_unmapped_file_pages(struct pglist_data *pgdat)
4713 {
4714 unsigned long file_mapped = node_page_state(pgdat, NR_FILE_MAPPED);
4715 unsigned long file_lru = node_page_state(pgdat, NR_INACTIVE_FILE) +
4716 node_page_state(pgdat, NR_ACTIVE_FILE);
4717
4718
4719
4720
4721
4722
4723 return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0;
4724 }
4725
4726
4727 static unsigned long node_pagecache_reclaimable(struct pglist_data *pgdat)
4728 {
4729 unsigned long nr_pagecache_reclaimable;
4730 unsigned long delta = 0;
4731
4732
4733
4734
4735
4736
4737
4738 if (node_reclaim_mode & RECLAIM_UNMAP)
4739 nr_pagecache_reclaimable = node_page_state(pgdat, NR_FILE_PAGES);
4740 else
4741 nr_pagecache_reclaimable = node_unmapped_file_pages(pgdat);
4742
4743
4744 if (!(node_reclaim_mode & RECLAIM_WRITE))
4745 delta += node_page_state(pgdat, NR_FILE_DIRTY);
4746
4747
4748 if (unlikely(delta > nr_pagecache_reclaimable))
4749 delta = nr_pagecache_reclaimable;
4750
4751 return nr_pagecache_reclaimable - delta;
4752 }
4753
4754 /*
4755  * Try to free up some pages from this node through reclaim.
4756  */
4757 static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
4758 {
4759
4760 const unsigned long nr_pages = 1 << order;
4761 struct task_struct *p = current;
4762 unsigned int noreclaim_flag;
4763 struct scan_control sc = {
4764 .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
4765 .gfp_mask = current_gfp_context(gfp_mask),
4766 .order = order,
4767 .priority = NODE_RECLAIM_PRIORITY,
4768 .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE),
4769 .may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP),
4770 .may_swap = 1,
4771 .reclaim_idx = gfp_zone(gfp_mask),
4772 };
4773 unsigned long pflags;
4774
4775 trace_mm_vmscan_node_reclaim_begin(pgdat->node_id, order,
4776 sc.gfp_mask);
4777
4778 cond_resched();
4779 psi_memstall_enter(&pflags);
4780 fs_reclaim_acquire(sc.gfp_mask);
4781
4782
4783
4784 noreclaim_flag = memalloc_noreclaim_save();
4785 set_task_reclaim_state(p, &sc.reclaim_state);
4786
4787 if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages ||
4788 node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B) > pgdat->min_slab_pages) {
4789
4790
4791
4792
4793 do {
4794 shrink_node(pgdat, &sc);
4795 } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
4796 }
4797
4798 set_task_reclaim_state(p, NULL);
4799 memalloc_noreclaim_restore(noreclaim_flag);
4800 fs_reclaim_release(sc.gfp_mask);
4801 psi_memstall_leave(&pflags);
4802
4803 trace_mm_vmscan_node_reclaim_end(sc.nr_reclaimed);
4804
4805 return sc.nr_reclaimed >= nr_pages;
4806 }
4807
4808 int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
4809 {
4810 int ret;
4811
4812 /*
4813  * Node reclaim reclaims unmapped file backed pages and
4814  * slab pages if we are over the defined limits.
4815  *
4816  * A small portion of unmapped file backed pages is needed for
4817  * file I/O otherwise pages read by file I/O will be immediately
4818  * thrown out if the node is overallocated. So we do not reclaim
4819  * if less than a specified percentage of the node is used by
4820  * unmapped file backed pages.
4821  */
4822 if (node_pagecache_reclaimable(pgdat) <= pgdat->min_unmapped_pages &&
4823 node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B) <=
4824 pgdat->min_slab_pages)
4825 return NODE_RECLAIM_FULL;
4826
4827
4828
4829
4830 if (!gfpflags_allow_blocking(gfp_mask) || (current->flags & PF_MEMALLOC))
4831 return NODE_RECLAIM_NOSCAN;
4832
4833
4834
4835
4836
4837
4838
4839 if (node_state(pgdat->node_id, N_CPU) && pgdat->node_id != numa_node_id())
4840 return NODE_RECLAIM_NOSCAN;
4841
4842 if (test_and_set_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags))
4843 return NODE_RECLAIM_NOSCAN;
4844
4845 ret = __node_reclaim(pgdat, gfp_mask, order);
4846 clear_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags);
4847
4848 if (!ret)
4849 count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);
4850
4851 return ret;
4852 }
4853 #endif
4854
4855 void check_move_unevictable_pages(struct pagevec *pvec)
4856 {
4857 struct folio_batch fbatch;
4858 unsigned i;
4859
4860 folio_batch_init(&fbatch);
4861 for (i = 0; i < pvec->nr; i++) {
4862 struct page *page = pvec->pages[i];
4863
4864 if (PageTransTail(page))
4865 continue;
4866 folio_batch_add(&fbatch, page_folio(page));
4867 }
4868 check_move_unevictable_folios(&fbatch);
4869 }
4870 EXPORT_SYMBOL_GPL(check_move_unevictable_pages);
4871
4872 /**
4873  * check_move_unevictable_folios - Move evictable folios to appropriate zone
4874  * lru list
4875  * @fbatch: Batch of lru folios to check.
4876  *
4877  * Checks folios for evictability, if an evictable folio is in the unevictable
4878  * lru list, moves it to the appropriate evictable lru list. This function
4879  * should be only used for lru folios.
4880  */
4881 void check_move_unevictable_folios(struct folio_batch *fbatch)
4882 {
4883 struct lruvec *lruvec = NULL;
4884 int pgscanned = 0;
4885 int pgrescued = 0;
4886 int i;
4887
4888 for (i = 0; i < fbatch->nr; i++) {
4889 struct folio *folio = fbatch->folios[i];
4890 int nr_pages = folio_nr_pages(folio);
4891
4892 pgscanned += nr_pages;
4893
4894
4895 if (!folio_test_clear_lru(folio))
4896 continue;
4897
4898 lruvec = folio_lruvec_relock_irq(folio, lruvec);
4899 if (folio_evictable(folio) && folio_test_unevictable(folio)) {
4900 lruvec_del_folio(lruvec, folio);
4901 folio_clear_unevictable(folio);
4902 lruvec_add_folio(lruvec, folio);
4903 pgrescued += nr_pages;
4904 }
4905 folio_set_lru(folio);
4906 }
4907
4908 if (lruvec) {
4909 __count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
4910 __count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
4911 unlock_page_lruvec_irq(lruvec);
4912 } else if (pgscanned) {
4913 count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
4914 }
4915 }
4916 EXPORT_SYMBOL_GPL(check_move_unevictable_folios);