0028 #include <linux/page_counter.h>
0029 #include <linux/memcontrol.h>
0030 #include <linux/cgroup.h>
0031 #include <linux/pagewalk.h>
0032 #include <linux/sched/mm.h>
0033 #include <linux/shmem_fs.h>
0034 #include <linux/hugetlb.h>
0035 #include <linux/pagemap.h>
0036 #include <linux/vm_event_item.h>
0037 #include <linux/smp.h>
0038 #include <linux/page-flags.h>
0039 #include <linux/backing-dev.h>
0040 #include <linux/bit_spinlock.h>
0041 #include <linux/rcupdate.h>
0042 #include <linux/limits.h>
0043 #include <linux/export.h>
0044 #include <linux/mutex.h>
0045 #include <linux/rbtree.h>
0046 #include <linux/slab.h>
0047 #include <linux/swap.h>
0048 #include <linux/swapops.h>
0049 #include <linux/spinlock.h>
0050 #include <linux/eventfd.h>
0051 #include <linux/poll.h>
0052 #include <linux/sort.h>
0053 #include <linux/fs.h>
0054 #include <linux/seq_file.h>
0055 #include <linux/vmpressure.h>
0056 #include <linux/memremap.h>
0057 #include <linux/mm_inline.h>
0058 #include <linux/swap_cgroup.h>
0059 #include <linux/cpu.h>
0060 #include <linux/oom.h>
0061 #include <linux/lockdep.h>
0062 #include <linux/file.h>
0063 #include <linux/resume_user_mode.h>
0064 #include <linux/psi.h>
0065 #include <linux/seq_buf.h>
0066 #include "internal.h"
0067 #include <net/sock.h>
0068 #include <net/ip.h>
0069 #include "slab.h"
0070 #include "swap.h"
0071
0072 #include <linux/uaccess.h>
0073
0074 #include <trace/events/vmscan.h>
0075
0076 struct cgroup_subsys memory_cgrp_subsys __read_mostly;
0077 EXPORT_SYMBOL(memory_cgrp_subsys);
0078
0079 struct mem_cgroup *root_mem_cgroup __read_mostly;
/* Active memory cgroup to use from an interrupt context */
DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg);
EXPORT_PER_CPU_SYMBOL_GPL(int_active_memcg);

/* Socket memory accounting disabled? */
static bool cgroup_memory_nosocket __ro_after_init;

/* Kernel memory accounting disabled? */
static bool cgroup_memory_nokmem __ro_after_init;

/* Whether swap accounting is disabled */
#ifdef CONFIG_MEMCG_SWAP
static bool cgroup_memory_noswap __ro_after_init;
#else
#define cgroup_memory_noswap		1
#endif
0097
0098 #ifdef CONFIG_CGROUP_WRITEBACK
0099 static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
0100 #endif
0101
0102
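/*
 * Legacy "memsw" (memory+swap) accounting is only used on the cgroup v1
 * hierarchy, and only when swap accounting has not been disabled.
 */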
0103 static bool do_memsw_account(void)
0104 {
0105 return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_noswap;
0106 }
0107
0108 #define THRESHOLDS_EVENTS_TARGET 128
0109 #define SOFTLIMIT_EVENTS_TARGET 1024
/*
 * Cgroups above their limits are maintained in a RB-Tree, independent of
 * their hierarchy representation
 */

struct mem_cgroup_tree_per_node {
	struct rb_root rb_root;
	struct rb_node *rb_rightmost;
	spinlock_t lock;
};

struct mem_cgroup_tree {
	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
};

static struct mem_cgroup_tree soft_limit_tree __read_mostly;
0127
/* for OOM */
struct mem_cgroup_eventfd_list {
	struct list_head list;
	struct eventfd_ctx *eventfd;
};

/*
 * cgroup_event represents events which userspace want to receive.
 */
struct mem_cgroup_event {
	/* memcg which the event belongs to */
	struct mem_cgroup *memcg;
	/* eventfd to signal userspace about the event */
	struct eventfd_ctx *eventfd;
	/* Each of these stored in a list by the cgroup */
	struct list_head list;
	/*
	 * register_event() callback will be used to add new userspace
	 * waiter for changes related to this event.  Use eventfd_signal()
	 * on eventfd to send notification to userspace.
	 */
	int (*register_event)(struct mem_cgroup *memcg,
			      struct eventfd_ctx *eventfd, const char *args);
	/*
	 * unregister_event() callback will be called when userspace closes
	 * the eventfd or on cgroup removal.
	 */
	void (*unregister_event)(struct mem_cgroup *memcg,
				 struct eventfd_ctx *eventfd);
	/*
	 * All fields below needed to unregister event when
	 * userspace closes eventfd.
	 */
	poll_table pt;
	wait_queue_head_t *wqh;
	wait_queue_entry_t wait;
	struct work_struct remove;
};
0173
0174 static void mem_cgroup_threshold(struct mem_cgroup *memcg);
0175 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
/*
 * Types of charges to be moved.
 */
#define MOVE_ANON	0x1U
#define MOVE_FILE	0x2U
#define MOVE_MASK	(MOVE_ANON | MOVE_FILE)

/* "mc" and its members are protected by cgroup_mutex */
static struct move_charge_struct {
	spinlock_t	  lock; /* for from, to */
	struct mm_struct  *mm;
	struct mem_cgroup *from;
	struct mem_cgroup *to;
	unsigned long flags;
	unsigned long precharge;
	unsigned long moved_charge;
	unsigned long moved_swap;
	struct task_struct *moving_task;	/* a task moving charges */
	wait_queue_head_t waitq;		/* a waitq for other context */
} mc = {
	.lock = __SPIN_LOCK_UNLOCKED(mc.lock),
	.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
};
0201
/*
 * Maximum loops in mem_cgroup_soft_reclaim(), used for soft
 * limit reclaim to prevent infinite loops, if they ever occur.
 */
#define	MEM_CGROUP_MAX_RECLAIM_LOOPS		100
#define	MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	2

/* for encoding cft->private value on file */
enum res_type {
	_MEM,
	_MEMSWAP,
	_KMEM,
	_TCP,
};

#define MEMFILE_PRIVATE(x, val)	((x) << 16 | (val))
#define MEMFILE_TYPE(val)	((val) >> 16 & 0xffff)
#define MEMFILE_ATTR(val)	((val) & 0xffff)
0220
/*
 * Iteration constructs for visiting all cgroups (under a tree).  If
 * loops are exited prematurely (break), mem_cgroup_iter_break() must
 * be used for reference counting.
 */
#define for_each_mem_cgroup_tree(iter, root)		\
	for (iter = mem_cgroup_iter(root, NULL, NULL);	\
	     iter != NULL;				\
	     iter = mem_cgroup_iter(root, iter, NULL))

#define for_each_mem_cgroup(iter)			\
	for (iter = mem_cgroup_iter(NULL, NULL, NULL);	\
	     iter != NULL;				\
	     iter = mem_cgroup_iter(NULL, iter, NULL))
0235
0236 static inline bool task_is_dying(void)
0237 {
0238 return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
0239 (current->flags & PF_EXITING);
0240 }
0241
/* Some nice accessors for the vmpressure. */
0243 struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
0244 {
0245 if (!memcg)
0246 memcg = root_mem_cgroup;
0247 return &memcg->vmpressure;
0248 }
0249
0250 struct mem_cgroup *vmpressure_to_memcg(struct vmpressure *vmpr)
0251 {
0252 return container_of(vmpr, struct mem_cgroup, vmpressure);
0253 }
0254
0255 #ifdef CONFIG_MEMCG_KMEM
0256 static DEFINE_SPINLOCK(objcg_lock);
0257
0258 bool mem_cgroup_kmem_disabled(void)
0259 {
0260 return cgroup_memory_nokmem;
0261 }
0262
0263 static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
0264 unsigned int nr_pages);
0265
0266 static void obj_cgroup_release(struct percpu_ref *ref)
0267 {
0268 struct obj_cgroup *objcg = container_of(ref, struct obj_cgroup, refcnt);
0269 unsigned int nr_bytes;
0270 unsigned int nr_pages;
0271 unsigned long flags;
0272
	/*
	 * At this point all allocated objects have been freed, so any bytes
	 * still recorded in objcg->nr_charged_bytes are expected to be a
	 * whole number of pages (hence the WARN below).  Convert them to
	 * pages and return them to the memcg's page counters before tearing
	 * the objcg down.
	 */
0293 nr_bytes = atomic_read(&objcg->nr_charged_bytes);
0294 WARN_ON_ONCE(nr_bytes & (PAGE_SIZE - 1));
0295 nr_pages = nr_bytes >> PAGE_SHIFT;
0296
0297 if (nr_pages)
0298 obj_cgroup_uncharge_pages(objcg, nr_pages);
0299
0300 spin_lock_irqsave(&objcg_lock, flags);
0301 list_del(&objcg->list);
0302 spin_unlock_irqrestore(&objcg_lock, flags);
0303
0304 percpu_ref_exit(ref);
0305 kfree_rcu(objcg, rcu);
0306 }
0307
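/* Allocate an obj_cgroup and initialise its refcount and list linkage. */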
0308 static struct obj_cgroup *obj_cgroup_alloc(void)
0309 {
0310 struct obj_cgroup *objcg;
0311 int ret;
0312
0313 objcg = kzalloc(sizeof(struct obj_cgroup), GFP_KERNEL);
0314 if (!objcg)
0315 return NULL;
0316
0317 ret = percpu_ref_init(&objcg->refcnt, obj_cgroup_release, 0,
0318 GFP_KERNEL);
0319 if (ret) {
0320 kfree(objcg);
0321 return NULL;
0322 }
0323 INIT_LIST_HEAD(&objcg->list);
0324 return objcg;
0325 }
0326
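/*
 * When a memcg goes offline, hand all of its obj_cgroups over to @parent so
 * that outstanding object charges keep being accounted to a live ancestor.
 */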
0327 static void memcg_reparent_objcgs(struct mem_cgroup *memcg,
0328 struct mem_cgroup *parent)
0329 {
0330 struct obj_cgroup *objcg, *iter;
0331
0332 objcg = rcu_replace_pointer(memcg->objcg, NULL, true);
0333
0334 spin_lock_irq(&objcg_lock);
0335
0336
0337 list_add(&objcg->list, &memcg->objcg_list);
0338
0339 list_for_each_entry(iter, &memcg->objcg_list, list)
0340 WRITE_ONCE(iter->memcg, parent);
0341
0342 list_splice(&memcg->objcg_list, &parent->objcg_list);
0343
0344 spin_unlock_irq(&objcg_lock);
0345
0346 percpu_ref_kill(&objcg->refcnt);
0347 }
0348
/*
 * The kmem accounting static key is checked from inlined slab allocation
 * paths, including those built into modules, so the symbol must be
 * exported.
 */
0355 DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
0356 EXPORT_SYMBOL(memcg_kmem_enabled_key);
0357 #endif
0358
/**
 * mem_cgroup_css_from_page - css of the memcg associated with a page
 * @page: page of interest
 *
 * If memcg is bound to the default hierarchy, css of the memcg associated
 * with @page is returned.  The returned css remains associated with @page
 * until it is released.
 *
 * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup
 * is returned.
 */
0370 struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
0371 {
0372 struct mem_cgroup *memcg;
0373
0374 memcg = page_memcg(page);
0375
0376 if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
0377 memcg = root_mem_cgroup;
0378
0379 return &memcg->css;
0380 }
0381
/**
 * page_cgroup_ino - return inode number of the memcg a page is charged to
 * @page: the page
 *
 * Look up the closest online ancestor of the memory cgroup @page is charged
 * to and return its inode number, or 0 if @page is not charged to any
 * cgroup.  It is safe to call this function without holding a reference to
 * @page.
 *
 * Note, this function is inherently racy: there is nothing to prevent the
 * cgroup inode from being torn down and potentially reallocated a moment
 * after page_cgroup_ino() returns, so it should only be used by callers
 * that do not care (such as procfs interfaces).
 */
0395 ino_t page_cgroup_ino(struct page *page)
0396 {
0397 struct mem_cgroup *memcg;
0398 unsigned long ino = 0;
0399
0400 rcu_read_lock();
0401 memcg = page_memcg_check(page);
0402
0403 while (memcg && !(memcg->css.flags & CSS_ONLINE))
0404 memcg = parent_mem_cgroup(memcg);
0405 if (memcg)
0406 ino = cgroup_ino(memcg->css.cgroup);
0407 rcu_read_unlock();
0408 return ino;
0409 }
0410
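/*
 * Link @mz into the per-node soft-limit rb-tree, keyed by how far the memcg
 * is above its soft limit (new_usage_in_excess).
 */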
0411 static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
0412 struct mem_cgroup_tree_per_node *mctz,
0413 unsigned long new_usage_in_excess)
0414 {
0415 struct rb_node **p = &mctz->rb_root.rb_node;
0416 struct rb_node *parent = NULL;
0417 struct mem_cgroup_per_node *mz_node;
0418 bool rightmost = true;
0419
0420 if (mz->on_tree)
0421 return;
0422
0423 mz->usage_in_excess = new_usage_in_excess;
0424 if (!mz->usage_in_excess)
0425 return;
0426 while (*p) {
0427 parent = *p;
0428 mz_node = rb_entry(parent, struct mem_cgroup_per_node,
0429 tree_node);
0430 if (mz->usage_in_excess < mz_node->usage_in_excess) {
0431 p = &(*p)->rb_left;
0432 rightmost = false;
0433 } else {
0434 p = &(*p)->rb_right;
0435 }
0436 }
0437
0438 if (rightmost)
0439 mctz->rb_rightmost = &mz->tree_node;
0440
0441 rb_link_node(&mz->tree_node, parent, p);
0442 rb_insert_color(&mz->tree_node, &mctz->rb_root);
0443 mz->on_tree = true;
0444 }
0445
0446 static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
0447 struct mem_cgroup_tree_per_node *mctz)
0448 {
0449 if (!mz->on_tree)
0450 return;
0451
0452 if (&mz->tree_node == mctz->rb_rightmost)
0453 mctz->rb_rightmost = rb_prev(&mz->tree_node);
0454
0455 rb_erase(&mz->tree_node, &mctz->rb_root);
0456 mz->on_tree = false;
0457 }
0458
0459 static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
0460 struct mem_cgroup_tree_per_node *mctz)
0461 {
0462 unsigned long flags;
0463
0464 spin_lock_irqsave(&mctz->lock, flags);
0465 __mem_cgroup_remove_exceeded(mz, mctz);
0466 spin_unlock_irqrestore(&mctz->lock, flags);
0467 }
0468
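/* How many pages the memcg is above its soft limit; 0 if not above it. */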
0469 static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
0470 {
0471 unsigned long nr_pages = page_counter_read(&memcg->memory);
0472 unsigned long soft_limit = READ_ONCE(memcg->soft_limit);
0473 unsigned long excess = 0;
0474
0475 if (nr_pages > soft_limit)
0476 excess = nr_pages - soft_limit;
0477
0478 return excess;
0479 }
0480
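/*
 * Re-position @memcg and each of its ancestors in the node's soft-limit
 * tree after a charge, so that their current excess is reflected.
 */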
0481 static void mem_cgroup_update_tree(struct mem_cgroup *memcg, int nid)
0482 {
0483 unsigned long excess;
0484 struct mem_cgroup_per_node *mz;
0485 struct mem_cgroup_tree_per_node *mctz;
0486
0487 mctz = soft_limit_tree.rb_tree_per_node[nid];
0488 if (!mctz)
0489 return;
0490
0491
0492
0493
0494 for (; memcg; memcg = parent_mem_cgroup(memcg)) {
0495 mz = memcg->nodeinfo[nid];
0496 excess = soft_limit_excess(memcg);
0497
0498
0499
0500
0501 if (excess || mz->on_tree) {
0502 unsigned long flags;
0503
0504 spin_lock_irqsave(&mctz->lock, flags);
0505
0506 if (mz->on_tree)
0507 __mem_cgroup_remove_exceeded(mz, mctz);
0508
0509
0510
0511
0512 __mem_cgroup_insert_exceeded(mz, mctz, excess);
0513 spin_unlock_irqrestore(&mctz->lock, flags);
0514 }
0515 }
0516 }
0517
0518 static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
0519 {
0520 struct mem_cgroup_tree_per_node *mctz;
0521 struct mem_cgroup_per_node *mz;
0522 int nid;
0523
0524 for_each_node(nid) {
0525 mz = memcg->nodeinfo[nid];
0526 mctz = soft_limit_tree.rb_tree_per_node[nid];
0527 if (mctz)
0528 mem_cgroup_remove_exceeded(mz, mctz);
0529 }
0530 }
0531
0532 static struct mem_cgroup_per_node *
0533 __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
0534 {
0535 struct mem_cgroup_per_node *mz;
0536
0537 retry:
0538 mz = NULL;
0539 if (!mctz->rb_rightmost)
0540 goto done;
0541
0542 mz = rb_entry(mctz->rb_rightmost,
0543 struct mem_cgroup_per_node, tree_node);
0544
0545
0546
0547
0548
0549 __mem_cgroup_remove_exceeded(mz, mctz);
0550 if (!soft_limit_excess(mz->memcg) ||
0551 !css_tryget(&mz->memcg->css))
0552 goto retry;
0553 done:
0554 return mz;
0555 }
0556
0557 static struct mem_cgroup_per_node *
0558 mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
0559 {
0560 struct mem_cgroup_per_node *mz;
0561
0562 spin_lock_irq(&mctz->lock);
0563 mz = __mem_cgroup_largest_soft_limit_node(mctz);
0564 spin_unlock_irq(&mctz->lock);
0565 return mz;
0566 }
0567
/*
 * memcg and lruvec stats flushing
 *
 * Stat updates are batched per CPU; the accumulated error is tracked in
 * stats_updates and folded into stats_flush_threshold.  A deferrable
 * worker flushes the rstat tree every FLUSH_TIME, and readers can force
 * an earlier flush once the accumulated error grows large enough
 * (see mem_cgroup_flush_stats()).
 */
0583 static void flush_memcg_stats_dwork(struct work_struct *w);
0584 static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork);
0585 static DEFINE_SPINLOCK(stats_flush_lock);
0586 static DEFINE_PER_CPU(unsigned int, stats_updates);
0587 static atomic_t stats_flush_threshold = ATOMIC_INIT(0);
0588 static u64 flush_next_time;
0589
0590 #define FLUSH_TIME (2UL*HZ)
0591
/*
 * Helpers for per-CPU stat updates: on PREEMPT_RT they disable preemption,
 * while on !PREEMPT_RT the callers are expected to run with interrupts
 * disabled (asserted in memcg_stats_lock()).
 */
0598 static void memcg_stats_lock(void)
0599 {
0600 #ifdef CONFIG_PREEMPT_RT
0601 preempt_disable();
0602 #else
0603 VM_BUG_ON(!irqs_disabled());
0604 #endif
0605 }
0606
0607 static void __memcg_stats_lock(void)
0608 {
0609 #ifdef CONFIG_PREEMPT_RT
0610 preempt_disable();
0611 #endif
0612 }
0613
0614 static void memcg_stats_unlock(void)
0615 {
0616 #ifdef CONFIG_PREEMPT_RT
0617 preempt_enable();
0618 #endif
0619 }
0620
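/*
 * Record a stats update of magnitude @val and, once enough per-CPU error
 * has accumulated, raise stats_flush_threshold so that readers know a
 * flush is worthwhile (see mem_cgroup_flush_stats()).
 */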
0621 static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
0622 {
0623 unsigned int x;
0624
0625 cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id());
0626
0627 x = __this_cpu_add_return(stats_updates, abs(val));
0628 if (x > MEMCG_CHARGE_BATCH) {
0629
0630
0631
0632
0633
0634
0635 if (atomic_read(&stats_flush_threshold) <= num_online_cpus())
0636 atomic_add(x / MEMCG_CHARGE_BATCH, &stats_flush_threshold);
0637 __this_cpu_write(stats_updates, 0);
0638 }
0639 }
0640
0641 static void __mem_cgroup_flush_stats(void)
0642 {
0643 unsigned long flag;
0644
0645 if (!spin_trylock_irqsave(&stats_flush_lock, flag))
0646 return;
0647
0648 flush_next_time = jiffies_64 + 2*FLUSH_TIME;
0649 cgroup_rstat_flush_irqsafe(root_mem_cgroup->css.cgroup);
0650 atomic_set(&stats_flush_threshold, 0);
0651 spin_unlock_irqrestore(&stats_flush_lock, flag);
0652 }
0653
0654 void mem_cgroup_flush_stats(void)
0655 {
0656 if (atomic_read(&stats_flush_threshold) > num_online_cpus())
0657 __mem_cgroup_flush_stats();
0658 }
0659
0660 void mem_cgroup_flush_stats_delayed(void)
0661 {
0662 if (time_after64(jiffies_64, flush_next_time))
0663 mem_cgroup_flush_stats();
0664 }
0665
0666 static void flush_memcg_stats_dwork(struct work_struct *w)
0667 {
0668 __mem_cgroup_flush_stats();
0669 queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME);
0670 }
0671
/**
 * __mod_memcg_state - update cgroup memory statistics
 * @memcg: the memory cgroup
 * @idx: the stat item - can be enum memcg_stat_item or enum node_stat_item
 * @val: delta to add to the counter, can be negative
 */
0678 void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
0679 {
0680 if (mem_cgroup_disabled())
0681 return;
0682
0683 __this_cpu_add(memcg->vmstats_percpu->state[idx], val);
0684 memcg_rstat_updated(memcg, val);
0685 }
0686
/* idx can be of type enum memcg_stat_item or node_stat_item. */
0688 static unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
0689 {
0690 long x = 0;
0691 int cpu;
0692
0693 for_each_possible_cpu(cpu)
0694 x += per_cpu(memcg->vmstats_percpu->state[idx], cpu);
0695 #ifdef CONFIG_SMP
0696 if (x < 0)
0697 x = 0;
0698 #endif
0699 return x;
0700 }
0701
0702 void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
0703 int val)
0704 {
0705 struct mem_cgroup_per_node *pn;
0706 struct mem_cgroup *memcg;
0707
0708 pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
0709 memcg = pn->memcg;
0710
0711
0712
0713
0714
0715
0716
0717 __memcg_stats_lock();
0718 if (IS_ENABLED(CONFIG_DEBUG_VM) && !IS_ENABLED(CONFIG_PREEMPT_RT)) {
0719 switch (idx) {
0720 case NR_ANON_MAPPED:
0721 case NR_FILE_MAPPED:
0722 case NR_ANON_THPS:
0723 case NR_SHMEM_PMDMAPPED:
0724 case NR_FILE_PMDMAPPED:
0725 WARN_ON_ONCE(!in_task());
0726 break;
0727 default:
0728 WARN_ON_ONCE(!irqs_disabled());
0729 }
0730 }
0731
0732
0733 __this_cpu_add(memcg->vmstats_percpu->state[idx], val);
0734
0735
0736 __this_cpu_add(pn->lruvec_stats_percpu->state[idx], val);
0737
0738 memcg_rstat_updated(memcg, val);
0739 memcg_stats_unlock();
0740 }
0741
/**
 * __mod_lruvec_state - update lruvec memory statistics
 * @lruvec: the lruvec
 * @idx: the stat item
 * @val: delta to add to the counter, can be negative
 *
 * The lruvec is the intersection of the NUMA node and a cgroup. This
 * function updates all three counters that are affected by a change of
 * state at this level: per-node, per-cgroup and per-lruvec.
 */
0752 void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
0753 int val)
0754 {
0755
0756 __mod_node_page_state(lruvec_pgdat(lruvec), idx, val);
0757
0758
0759 if (!mem_cgroup_disabled())
0760 __mod_memcg_lruvec_state(lruvec, idx, val);
0761 }
0762
0763 void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx,
0764 int val)
0765 {
0766 struct page *head = compound_head(page);
0767 struct mem_cgroup *memcg;
0768 pg_data_t *pgdat = page_pgdat(page);
0769 struct lruvec *lruvec;
0770
0771 rcu_read_lock();
0772 memcg = page_memcg(head);
0773
0774 if (!memcg) {
0775 rcu_read_unlock();
0776 __mod_node_page_state(pgdat, idx, val);
0777 return;
0778 }
0779
0780 lruvec = mem_cgroup_lruvec(memcg, pgdat);
0781 __mod_lruvec_state(lruvec, idx, val);
0782 rcu_read_unlock();
0783 }
0784 EXPORT_SYMBOL(__mod_lruvec_page_state);
0785
0786 void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val)
0787 {
0788 pg_data_t *pgdat = page_pgdat(virt_to_page(p));
0789 struct mem_cgroup *memcg;
0790 struct lruvec *lruvec;
0791
0792 rcu_read_lock();
0793 memcg = mem_cgroup_from_slab_obj(p);
0794
0795
0796
0797
0798
0799
0800
0801 if (!memcg) {
0802 __mod_node_page_state(pgdat, idx, val);
0803 } else {
0804 lruvec = mem_cgroup_lruvec(memcg, pgdat);
0805 __mod_lruvec_state(lruvec, idx, val);
0806 }
0807 rcu_read_unlock();
0808 }
0809
/**
 * __count_memcg_events - account VM events in a cgroup
 * @memcg: the memory cgroup
 * @idx: the event item
 * @count: the number of events that occurred
 */
0816 void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
0817 unsigned long count)
0818 {
0819 if (mem_cgroup_disabled())
0820 return;
0821
0822 memcg_stats_lock();
0823 __this_cpu_add(memcg->vmstats_percpu->events[idx], count);
0824 memcg_rstat_updated(memcg, count);
0825 memcg_stats_unlock();
0826 }
0827
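/* Hierarchical event count, as aggregated by the most recent rstat flush. */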
0828 static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
0829 {
0830 return READ_ONCE(memcg->vmstats.events[event]);
0831 }
0832
0833 static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
0834 {
0835 long x = 0;
0836 int cpu;
0837
0838 for_each_possible_cpu(cpu)
0839 x += per_cpu(memcg->vmstats_percpu->events[event], cpu);
0840 return x;
0841 }
0842
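/*
 * Account a charge (positive @nr_pages) or uncharge (negative) as PGPGIN or
 * PGPGOUT and accumulate nr_page_events for the threshold/soft-limit checks.
 */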
0843 static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
0844 int nr_pages)
0845 {
0846
0847 if (nr_pages > 0)
0848 __count_memcg_events(memcg, PGPGIN, 1);
0849 else {
0850 __count_memcg_events(memcg, PGPGOUT, 1);
0851 nr_pages = -nr_pages;
0852 }
0853
0854 __this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages);
0855 }
0856
0857 static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
0858 enum mem_cgroup_events_target target)
0859 {
0860 unsigned long val, next;
0861
0862 val = __this_cpu_read(memcg->vmstats_percpu->nr_page_events);
0863 next = __this_cpu_read(memcg->vmstats_percpu->targets[target]);
0864
0865 if ((long)(next - val) < 0) {
0866 switch (target) {
0867 case MEM_CGROUP_TARGET_THRESH:
0868 next = val + THRESHOLDS_EVENTS_TARGET;
0869 break;
0870 case MEM_CGROUP_TARGET_SOFTLIMIT:
0871 next = val + SOFTLIMIT_EVENTS_TARGET;
0872 break;
0873 default:
0874 break;
0875 }
0876 __this_cpu_write(memcg->vmstats_percpu->targets[target], next);
0877 return true;
0878 }
0879 return false;
0880 }
0881
0882
0883
0884
0885
0886 static void memcg_check_events(struct mem_cgroup *memcg, int nid)
0887 {
0888 if (IS_ENABLED(CONFIG_PREEMPT_RT))
0889 return;
0890
0891
0892 if (unlikely(mem_cgroup_event_ratelimit(memcg,
0893 MEM_CGROUP_TARGET_THRESH))) {
0894 bool do_softlimit;
0895
0896 do_softlimit = mem_cgroup_event_ratelimit(memcg,
0897 MEM_CGROUP_TARGET_SOFTLIMIT);
0898 mem_cgroup_threshold(memcg);
0899 if (unlikely(do_softlimit))
0900 mem_cgroup_update_tree(memcg, nid);
0901 }
0902 }
0903
0904 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
0905 {
0906
0907
0908
0909
0910
0911 if (unlikely(!p))
0912 return NULL;
0913
0914 return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
0915 }
0916 EXPORT_SYMBOL(mem_cgroup_from_task);
0917
0918 static __always_inline struct mem_cgroup *active_memcg(void)
0919 {
0920 if (!in_task())
0921 return this_cpu_read(int_active_memcg);
0922 else
0923 return current->active_memcg;
0924 }
0925
/**
 * get_mem_cgroup_from_mm: Obtain a reference on a given mm_struct's memcg.
 * @mm: mm from which memcg should be extracted. It can be NULL.
 *
 * Obtain a reference on mm->owner's memcg and return it if successful.
 * If @mm is NULL, the memcg is chosen as follows:
 * 1) The active memcg, if set.
 * 2) current->mm's memcg, if available.
 * 3) The root memcg.
 * If mem_cgroup is disabled, NULL is returned.
 */
0937 struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
0938 {
0939 struct mem_cgroup *memcg;
0940
0941 if (mem_cgroup_disabled())
0942 return NULL;
0943
0944
0945
0946
0947
0948
0949
0950
0951
0952
0953 if (unlikely(!mm)) {
0954 memcg = active_memcg();
0955 if (unlikely(memcg)) {
0956
0957 css_get(&memcg->css);
0958 return memcg;
0959 }
0960 mm = current->mm;
0961 if (unlikely(!mm))
0962 return root_mem_cgroup;
0963 }
0964
0965 rcu_read_lock();
0966 do {
0967 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
0968 if (unlikely(!memcg))
0969 memcg = root_mem_cgroup;
0970 } while (!css_tryget(&memcg->css));
0971 rcu_read_unlock();
0972 return memcg;
0973 }
0974 EXPORT_SYMBOL(get_mem_cgroup_from_mm);
0975
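/*
 * Should kmem charging be bypassed for the current context?  Kernel threads
 * and contexts without an mm are not charged unless a remote memcg has been
 * set as active.
 */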
0976 static __always_inline bool memcg_kmem_bypass(void)
0977 {
0978
0979 if (unlikely(active_memcg()))
0980 return false;
0981
0982
0983 if (!in_task() || !current->mm || (current->flags & PF_KTHREAD))
0984 return true;
0985
0986 return false;
0987 }
0988
/**
 * mem_cgroup_iter - iterate over memory cgroup hierarchy
 * @root: hierarchy root
 * @prev: previously returned memcg, NULL on first invocation
 * @reclaim: cookie for shared reclaim walks, NULL for full walks
 *
 * Returns references to children of the hierarchy below @root, or
 * @root itself, or %NULL after a full round-trip.
 *
 * Caller must pass the return value in @prev on subsequent
 * invocations for reference counting, or use mem_cgroup_iter_break()
 * to cancel a hierarchy walk before the round-trip is complete.
 *
 * Reclaimers can specify a node in @reclaim to divide up the memcgs
 * in the hierarchy among all concurrent reclaimers operating on the
 * same node.
 */
1006 struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
1007 struct mem_cgroup *prev,
1008 struct mem_cgroup_reclaim_cookie *reclaim)
1009 {
1010 struct mem_cgroup_reclaim_iter *iter;
1011 struct cgroup_subsys_state *css = NULL;
1012 struct mem_cgroup *memcg = NULL;
1013 struct mem_cgroup *pos = NULL;
1014
1015 if (mem_cgroup_disabled())
1016 return NULL;
1017
1018 if (!root)
1019 root = root_mem_cgroup;
1020
1021 rcu_read_lock();
1022
1023 if (reclaim) {
1024 struct mem_cgroup_per_node *mz;
1025
1026 mz = root->nodeinfo[reclaim->pgdat->node_id];
1027 iter = &mz->iter;
1028
1029
1030
1031
1032
1033 if (!prev)
1034 reclaim->generation = iter->generation;
1035 else if (reclaim->generation != iter->generation)
1036 goto out_unlock;
1037
1038 while (1) {
1039 pos = READ_ONCE(iter->position);
1040 if (!pos || css_tryget(&pos->css))
1041 break;
1042
1043
1044
1045
1046
1047
1048
1049
1050 (void)cmpxchg(&iter->position, pos, NULL);
1051 }
1052 } else if (prev) {
1053 pos = prev;
1054 }
1055
1056 if (pos)
1057 css = &pos->css;
1058
1059 for (;;) {
1060 css = css_next_descendant_pre(css, &root->css);
1061 if (!css) {
1062
1063
1064
1065
1066
1067
1068 if (!prev)
1069 continue;
1070 break;
1071 }
1072
1073
1074
1075
1076
1077
1078 if (css == &root->css || css_tryget(css)) {
1079 memcg = mem_cgroup_from_css(css);
1080 break;
1081 }
1082 }
1083
1084 if (reclaim) {
1085
1086
1087
1088
1089
1090 (void)cmpxchg(&iter->position, pos, memcg);
1091
1092 if (pos)
1093 css_put(&pos->css);
1094
1095 if (!memcg)
1096 iter->generation++;
1097 }
1098
1099 out_unlock:
1100 rcu_read_unlock();
1101 if (prev && prev != root)
1102 css_put(&prev->css);
1103
1104 return memcg;
1105 }
1106
/**
 * mem_cgroup_iter_break - abort a hierarchy walk prematurely
 * @root: hierarchy root
 * @prev: last visited entry as returned by mem_cgroup_iter()
 */
1112 void mem_cgroup_iter_break(struct mem_cgroup *root,
1113 struct mem_cgroup *prev)
1114 {
1115 if (!root)
1116 root = root_mem_cgroup;
1117 if (prev && prev != root)
1118 css_put(&prev->css);
1119 }
1120
1121 static void __invalidate_reclaim_iterators(struct mem_cgroup *from,
1122 struct mem_cgroup *dead_memcg)
1123 {
1124 struct mem_cgroup_reclaim_iter *iter;
1125 struct mem_cgroup_per_node *mz;
1126 int nid;
1127
1128 for_each_node(nid) {
1129 mz = from->nodeinfo[nid];
1130 iter = &mz->iter;
1131 cmpxchg(&iter->position, dead_memcg, NULL);
1132 }
1133 }
1134
1135 static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
1136 {
1137 struct mem_cgroup *memcg = dead_memcg;
1138 struct mem_cgroup *last;
1139
1140 do {
1141 __invalidate_reclaim_iterators(memcg, dead_memcg);
1142 last = memcg;
1143 } while ((memcg = parent_mem_cgroup(memcg)));
1144
1145
1146
1147
1148
1149
1150
1151 if (last != root_mem_cgroup)
1152 __invalidate_reclaim_iterators(root_mem_cgroup,
1153 dead_memcg);
1154 }
1155
/**
 * mem_cgroup_scan_tasks - iterate over tasks of a memory cgroup hierarchy
 * @memcg: hierarchy root
 * @fn: function to call for each task
 * @arg: argument passed to @fn
 *
 * This function iterates over tasks attached to @memcg or to any of its
 * descendants and calls @fn for each task. If @fn returns a non-zero
 * value, the function breaks the iteration loop and returns the value.
 * Otherwise, it will iterate over all tasks and return 0.
 *
 * This function must not be called for the root memory cgroup.
 */
1169 int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
1170 int (*fn)(struct task_struct *, void *), void *arg)
1171 {
1172 struct mem_cgroup *iter;
1173 int ret = 0;
1174
1175 BUG_ON(memcg == root_mem_cgroup);
1176
1177 for_each_mem_cgroup_tree(iter, memcg) {
1178 struct css_task_iter it;
1179 struct task_struct *task;
1180
1181 css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it);
1182 while (!ret && (task = css_task_iter_next(&it)))
1183 ret = fn(task, arg);
1184 css_task_iter_end(&it);
1185 if (ret) {
1186 mem_cgroup_iter_break(memcg, iter);
1187 break;
1188 }
1189 }
1190 return ret;
1191 }
1192
1193 #ifdef CONFIG_DEBUG_VM
1194 void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio)
1195 {
1196 struct mem_cgroup *memcg;
1197
1198 if (mem_cgroup_disabled())
1199 return;
1200
1201 memcg = folio_memcg(folio);
1202
1203 if (!memcg)
1204 VM_BUG_ON_FOLIO(lruvec_memcg(lruvec) != root_mem_cgroup, folio);
1205 else
1206 VM_BUG_ON_FOLIO(lruvec_memcg(lruvec) != memcg, folio);
1207 }
1208 #endif
1209
/**
 * folio_lruvec_lock - Lock the lruvec for a folio.
 * @folio: Pointer to the folio.
 *
 * These functions are safe to use under any of the following conditions:
 * - folio locked
 * - folio_test_lru false
 * - folio_memcg_lock()
 * - folio frozen (refcount of 0)
 *
 * Return: The lruvec this folio is on with its lock held.
 */
1222 struct lruvec *folio_lruvec_lock(struct folio *folio)
1223 {
1224 struct lruvec *lruvec = folio_lruvec(folio);
1225
1226 spin_lock(&lruvec->lru_lock);
1227 lruvec_memcg_debug(lruvec, folio);
1228
1229 return lruvec;
1230 }
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245 struct lruvec *folio_lruvec_lock_irq(struct folio *folio)
1246 {
1247 struct lruvec *lruvec = folio_lruvec(folio);
1248
1249 spin_lock_irq(&lruvec->lru_lock);
1250 lruvec_memcg_debug(lruvec, folio);
1251
1252 return lruvec;
1253 }
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269 struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio,
1270 unsigned long *flags)
1271 {
1272 struct lruvec *lruvec = folio_lruvec(folio);
1273
1274 spin_lock_irqsave(&lruvec->lru_lock, *flags);
1275 lruvec_memcg_debug(lruvec, folio);
1276
1277 return lruvec;
1278 }
1279
/**
 * mem_cgroup_update_lru_size - account for adding or removing an lru page
 * @lruvec: mem_cgroup per zone lru vector
 * @lru: index of lru list the page is sitting on
 * @zid: zone id of the accounted pages
 * @nr_pages: positive when adding or negative when removing
 *
 * This function must be called under lru_lock, just before a page is added
 * to or just after a page is removed from an lru list.
 */
1290 void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
1291 int zid, int nr_pages)
1292 {
1293 struct mem_cgroup_per_node *mz;
1294 unsigned long *lru_size;
1295 long size;
1296
1297 if (mem_cgroup_disabled())
1298 return;
1299
1300 mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
1301 lru_size = &mz->lru_zone_size[zid][lru];
1302
1303 if (nr_pages < 0)
1304 *lru_size += nr_pages;
1305
1306 size = *lru_size;
1307 if (WARN_ONCE(size < 0,
1308 "%s(%p, %d, %d): lru_size %ld\n",
1309 __func__, lruvec, lru, nr_pages, size)) {
1310 VM_BUG_ON(1);
1311 *lru_size = 0;
1312 }
1313
1314 if (nr_pages > 0)
1315 *lru_size += nr_pages;
1316 }
1317
/**
 * mem_cgroup_margin - calculate chargeable space of a memory cgroup
 * @memcg: the memory cgroup
 *
 * Returns the maximum amount of memory @memcg can be charged with, in
 * pages.
 */
1325 static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
1326 {
1327 unsigned long margin = 0;
1328 unsigned long count;
1329 unsigned long limit;
1330
1331 count = page_counter_read(&memcg->memory);
1332 limit = READ_ONCE(memcg->memory.max);
1333 if (count < limit)
1334 margin = limit - count;
1335
1336 if (do_memsw_account()) {
1337 count = page_counter_read(&memcg->memsw);
1338 limit = READ_ONCE(memcg->memsw.max);
1339 if (count < limit)
1340 margin = min(margin, limit - count);
1341 else
1342 margin = 0;
1343 }
1344
1345 return margin;
1346 }
1347
/*
 * A routine for checking "mem" is under move_account() or not.
 *
 * Checking a cgroup is mc.from or mc.to or a descendant of one of the
 * moving cgroups. This is for waiting at high memory pressure caused by
 * "move".
 */
1355 static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
1356 {
1357 struct mem_cgroup *from;
1358 struct mem_cgroup *to;
1359 bool ret = false;
1360
1361
1362
1363
1364 spin_lock(&mc.lock);
1365 from = mc.from;
1366 to = mc.to;
1367 if (!from)
1368 goto unlock;
1369
1370 ret = mem_cgroup_is_descendant(from, memcg) ||
1371 mem_cgroup_is_descendant(to, memcg);
1372 unlock:
1373 spin_unlock(&mc.lock);
1374 return ret;
1375 }
1376
1377 static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
1378 {
1379 if (mc.moving_task && current != mc.moving_task) {
1380 if (mem_cgroup_under_move(memcg)) {
1381 DEFINE_WAIT(wait);
1382 prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
1383
1384 if (mc.moving_task)
1385 schedule();
1386 finish_wait(&mc.waitq, &wait);
1387 return true;
1388 }
1389 }
1390 return false;
1391 }
1392
1393 struct memory_stat {
1394 const char *name;
1395 unsigned int idx;
1396 };
1397
1398 static const struct memory_stat memory_stats[] = {
1399 { "anon", NR_ANON_MAPPED },
1400 { "file", NR_FILE_PAGES },
1401 { "kernel", MEMCG_KMEM },
1402 { "kernel_stack", NR_KERNEL_STACK_KB },
1403 { "pagetables", NR_PAGETABLE },
1404 { "percpu", MEMCG_PERCPU_B },
1405 { "sock", MEMCG_SOCK },
1406 { "vmalloc", MEMCG_VMALLOC },
1407 { "shmem", NR_SHMEM },
1408 #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
1409 { "zswap", MEMCG_ZSWAP_B },
1410 { "zswapped", MEMCG_ZSWAPPED },
1411 #endif
1412 { "file_mapped", NR_FILE_MAPPED },
1413 { "file_dirty", NR_FILE_DIRTY },
1414 { "file_writeback", NR_WRITEBACK },
1415 #ifdef CONFIG_SWAP
1416 { "swapcached", NR_SWAPCACHE },
1417 #endif
1418 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
1419 { "anon_thp", NR_ANON_THPS },
1420 { "file_thp", NR_FILE_THPS },
1421 { "shmem_thp", NR_SHMEM_THPS },
1422 #endif
1423 { "inactive_anon", NR_INACTIVE_ANON },
1424 { "active_anon", NR_ACTIVE_ANON },
1425 { "inactive_file", NR_INACTIVE_FILE },
1426 { "active_file", NR_ACTIVE_FILE },
1427 { "unevictable", NR_UNEVICTABLE },
1428 { "slab_reclaimable", NR_SLAB_RECLAIMABLE_B },
1429 { "slab_unreclaimable", NR_SLAB_UNRECLAIMABLE_B },
1430
1431
1432 { "workingset_refault_anon", WORKINGSET_REFAULT_ANON },
1433 { "workingset_refault_file", WORKINGSET_REFAULT_FILE },
1434 { "workingset_activate_anon", WORKINGSET_ACTIVATE_ANON },
1435 { "workingset_activate_file", WORKINGSET_ACTIVATE_FILE },
1436 { "workingset_restore_anon", WORKINGSET_RESTORE_ANON },
1437 { "workingset_restore_file", WORKINGSET_RESTORE_FILE },
1438 { "workingset_nodereclaim", WORKINGSET_NODERECLAIM },
1439 };
1440
/* Translate stat items to the correct unit for memory.stat output */
1442 static int memcg_page_state_unit(int item)
1443 {
1444 switch (item) {
1445 case MEMCG_PERCPU_B:
1446 case MEMCG_ZSWAP_B:
1447 case NR_SLAB_RECLAIMABLE_B:
1448 case NR_SLAB_UNRECLAIMABLE_B:
1449 case WORKINGSET_REFAULT_ANON:
1450 case WORKINGSET_REFAULT_FILE:
1451 case WORKINGSET_ACTIVATE_ANON:
1452 case WORKINGSET_ACTIVATE_FILE:
1453 case WORKINGSET_RESTORE_ANON:
1454 case WORKINGSET_RESTORE_FILE:
1455 case WORKINGSET_NODERECLAIM:
1456 return 1;
1457 case NR_KERNEL_STACK_KB:
1458 return SZ_1K;
1459 default:
1460 return PAGE_SIZE;
1461 }
1462 }
1463
1464 static inline unsigned long memcg_page_state_output(struct mem_cgroup *memcg,
1465 int item)
1466 {
1467 return memcg_page_state(memcg, item) * memcg_page_state_unit(item);
1468 }
1469
/* Subset of vm_event_item to report for memcg event stats */
1471 static const unsigned int memcg_vm_event_stat[] = {
1472 PGSCAN_KSWAPD,
1473 PGSCAN_DIRECT,
1474 PGSTEAL_KSWAPD,
1475 PGSTEAL_DIRECT,
1476 PGFAULT,
1477 PGMAJFAULT,
1478 PGREFILL,
1479 PGACTIVATE,
1480 PGDEACTIVATE,
1481 PGLAZYFREE,
1482 PGLAZYFREED,
1483 #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
1484 ZSWPIN,
1485 ZSWPOUT,
1486 #endif
1487 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
1488 THP_FAULT_ALLOC,
1489 THP_COLLAPSE_ALLOC,
1490 #endif
1491 };
1492
1493 static void memory_stat_format(struct mem_cgroup *memcg, char *buf, int bufsize)
1494 {
1495 struct seq_buf s;
1496 int i;
1497
1498 seq_buf_init(&s, buf, bufsize);
1499
	/*
	 * Provide statistics on the state of the memory subsystem as
	 * well as cumulative event counters that show past behavior.
	 *
	 * This list is ordered following a combination of these gradients:
	 * 1) generic big picture -> specifics and details
	 * 2) reflecting userspace activity -> reflecting kernel heuristics
	 *
	 * Current memory state:
	 */
1510 mem_cgroup_flush_stats();
1511
1512 for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
1513 u64 size;
1514
1515 size = memcg_page_state_output(memcg, memory_stats[i].idx);
1516 seq_buf_printf(&s, "%s %llu\n", memory_stats[i].name, size);
1517
1518 if (unlikely(memory_stats[i].idx == NR_SLAB_UNRECLAIMABLE_B)) {
1519 size += memcg_page_state_output(memcg,
1520 NR_SLAB_RECLAIMABLE_B);
1521 seq_buf_printf(&s, "slab %llu\n", size);
1522 }
1523 }
1524
1525
1526 seq_buf_printf(&s, "pgscan %lu\n",
1527 memcg_events(memcg, PGSCAN_KSWAPD) +
1528 memcg_events(memcg, PGSCAN_DIRECT));
1529 seq_buf_printf(&s, "pgsteal %lu\n",
1530 memcg_events(memcg, PGSTEAL_KSWAPD) +
1531 memcg_events(memcg, PGSTEAL_DIRECT));
1532
1533 for (i = 0; i < ARRAY_SIZE(memcg_vm_event_stat); i++)
1534 seq_buf_printf(&s, "%s %lu\n",
1535 vm_event_name(memcg_vm_event_stat[i]),
1536 memcg_events(memcg, memcg_vm_event_stat[i]));
1537
1538
1539 WARN_ON_ONCE(seq_buf_has_overflowed(&s));
1540 }
1541
1542 #define K(x) ((x) << (PAGE_SHIFT-10))
1543
/**
 * mem_cgroup_print_oom_context: Print OOM information relevant to
 * memory controller.
 * @memcg: The memory cgroup that went over limit
 * @p: Task that is going to be killed
 *
 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
 * enabled
 */
1552 void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p)
1553 {
1554 rcu_read_lock();
1555
1556 if (memcg) {
1557 pr_cont(",oom_memcg=");
1558 pr_cont_cgroup_path(memcg->css.cgroup);
1559 } else
1560 pr_cont(",global_oom");
1561 if (p) {
1562 pr_cont(",task_memcg=");
1563 pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
1564 }
1565 rcu_read_unlock();
1566 }
1567
/**
 * mem_cgroup_print_oom_meminfo: Print OOM memory information relevant to
 * memory controller.
 * @memcg: The memory cgroup that went over limit
 */
1573 void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
1574 {
	/* Use a static buffer; the caller is holding oom_lock. */
1576 static char buf[PAGE_SIZE];
1577
1578 lockdep_assert_held(&oom_lock);
1579
1580 pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
1581 K((u64)page_counter_read(&memcg->memory)),
1582 K((u64)READ_ONCE(memcg->memory.max)), memcg->memory.failcnt);
1583 if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
1584 pr_info("swap: usage %llukB, limit %llukB, failcnt %lu\n",
1585 K((u64)page_counter_read(&memcg->swap)),
1586 K((u64)READ_ONCE(memcg->swap.max)), memcg->swap.failcnt);
1587 else {
1588 pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
1589 K((u64)page_counter_read(&memcg->memsw)),
1590 K((u64)memcg->memsw.max), memcg->memsw.failcnt);
1591 pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
1592 K((u64)page_counter_read(&memcg->kmem)),
1593 K((u64)memcg->kmem.max), memcg->kmem.failcnt);
1594 }
1595
1596 pr_info("Memory cgroup stats for ");
1597 pr_cont_cgroup_path(memcg->css.cgroup);
1598 pr_cont(":");
1599 memory_stat_format(memcg, buf, sizeof(buf));
1600 pr_info("%s", buf);
1601 }
1602
/*
 * Return the combined memory and swap limit of the given memcg, in pages.
 */
1606 unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
1607 {
1608 unsigned long max = READ_ONCE(memcg->memory.max);
1609
1610 if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
1611 if (mem_cgroup_swappiness(memcg))
1612 max += min(READ_ONCE(memcg->swap.max),
1613 (unsigned long)total_swap_pages);
1614 } else {
1615 if (mem_cgroup_swappiness(memcg)) {
1616
1617 unsigned long swap = READ_ONCE(memcg->memsw.max) - max;
1618
1619 max += min(swap, (unsigned long)total_swap_pages);
1620 }
1621 }
1622 return max;
1623 }
1624
1625 unsigned long mem_cgroup_size(struct mem_cgroup *memcg)
1626 {
1627 return page_counter_read(&memcg->memory);
1628 }
1629
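/*
 * Invoke the OOM killer on behalf of @memcg.  Returns true if the charge
 * path should retry: either a victim was killed, enough margin appeared, or
 * the current task is itself dying.
 */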
1630 static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1631 int order)
1632 {
1633 struct oom_control oc = {
1634 .zonelist = NULL,
1635 .nodemask = NULL,
1636 .memcg = memcg,
1637 .gfp_mask = gfp_mask,
1638 .order = order,
1639 };
1640 bool ret = true;
1641
1642 if (mutex_lock_killable(&oom_lock))
1643 return true;
1644
1645 if (mem_cgroup_margin(memcg) >= (1 << order))
1646 goto unlock;
1647
1648
1649
1650
1651
1652 ret = task_is_dying() || out_of_memory(&oc);
1653
1654 unlock:
1655 mutex_unlock(&oom_lock);
1656 return ret;
1657 }
1658
1659 static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
1660 pg_data_t *pgdat,
1661 gfp_t gfp_mask,
1662 unsigned long *total_scanned)
1663 {
1664 struct mem_cgroup *victim = NULL;
1665 int total = 0;
1666 int loop = 0;
1667 unsigned long excess;
1668 unsigned long nr_scanned;
1669 struct mem_cgroup_reclaim_cookie reclaim = {
1670 .pgdat = pgdat,
1671 };
1672
1673 excess = soft_limit_excess(root_memcg);
1674
1675 while (1) {
1676 victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
1677 if (!victim) {
1678 loop++;
1679 if (loop >= 2) {
1680
1681
1682
1683
1684
1685 if (!total)
1686 break;
1687
1688
1689
1690
1691
1692
1693 if (total >= (excess >> 2) ||
1694 (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
1695 break;
1696 }
1697 continue;
1698 }
1699 total += mem_cgroup_shrink_node(victim, gfp_mask, false,
1700 pgdat, &nr_scanned);
1701 *total_scanned += nr_scanned;
1702 if (!soft_limit_excess(root_memcg))
1703 break;
1704 }
1705 mem_cgroup_iter_break(root_memcg, victim);
1706 return total;
1707 }
1708
1709 #ifdef CONFIG_LOCKDEP
1710 static struct lockdep_map memcg_oom_lock_dep_map = {
1711 .name = "memcg_oom_lock",
1712 };
1713 #endif
1714
1715 static DEFINE_SPINLOCK(memcg_oom_lock);
1716
/*
 * Check whether the OOM killer is already running under our hierarchy.
 * If somebody is running, return false and undo any partial locking.
 */
1721 static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
1722 {
1723 struct mem_cgroup *iter, *failed = NULL;
1724
1725 spin_lock(&memcg_oom_lock);
1726
1727 for_each_mem_cgroup_tree(iter, memcg) {
1728 if (iter->oom_lock) {
1729
1730
1731
1732
1733 failed = iter;
1734 mem_cgroup_iter_break(memcg, iter);
1735 break;
1736 } else
1737 iter->oom_lock = true;
1738 }
1739
1740 if (failed) {
1741
1742
1743
1744
1745 for_each_mem_cgroup_tree(iter, memcg) {
1746 if (iter == failed) {
1747 mem_cgroup_iter_break(memcg, iter);
1748 break;
1749 }
1750 iter->oom_lock = false;
1751 }
1752 } else
1753 mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);
1754
1755 spin_unlock(&memcg_oom_lock);
1756
1757 return !failed;
1758 }
1759
1760 static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
1761 {
1762 struct mem_cgroup *iter;
1763
1764 spin_lock(&memcg_oom_lock);
1765 mutex_release(&memcg_oom_lock_dep_map, _RET_IP_);
1766 for_each_mem_cgroup_tree(iter, memcg)
1767 iter->oom_lock = false;
1768 spin_unlock(&memcg_oom_lock);
1769 }
1770
1771 static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
1772 {
1773 struct mem_cgroup *iter;
1774
1775 spin_lock(&memcg_oom_lock);
1776 for_each_mem_cgroup_tree(iter, memcg)
1777 iter->under_oom++;
1778 spin_unlock(&memcg_oom_lock);
1779 }
1780
1781 static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
1782 {
1783 struct mem_cgroup *iter;
1784
1785
1786
1787
1788
1789 spin_lock(&memcg_oom_lock);
1790 for_each_mem_cgroup_tree(iter, memcg)
1791 if (iter->under_oom > 0)
1792 iter->under_oom--;
1793 spin_unlock(&memcg_oom_lock);
1794 }
1795
1796 static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
1797
1798 struct oom_wait_info {
1799 struct mem_cgroup *memcg;
1800 wait_queue_entry_t wait;
1801 };
1802
1803 static int memcg_oom_wake_function(wait_queue_entry_t *wait,
1804 unsigned mode, int sync, void *arg)
1805 {
1806 struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
1807 struct mem_cgroup *oom_wait_memcg;
1808 struct oom_wait_info *oom_wait_info;
1809
1810 oom_wait_info = container_of(wait, struct oom_wait_info, wait);
1811 oom_wait_memcg = oom_wait_info->memcg;
1812
1813 if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) &&
1814 !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg))
1815 return 0;
1816 return autoremove_wake_function(wait, mode, sync, arg);
1817 }
1818
1819 static void memcg_oom_recover(struct mem_cgroup *memcg)
1820 {
1821
1822
1823
1824
1825
1826
1827
1828
1829 if (memcg && memcg->under_oom)
1830 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
1831 }
1832
1833
1834
1835
1836
1837 static bool mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
1838 {
1839 bool locked, ret;
1840
1841 if (order > PAGE_ALLOC_COSTLY_ORDER)
1842 return false;
1843
1844 memcg_memory_event(memcg, MEMCG_OOM);
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864 if (memcg->oom_kill_disable) {
1865 if (current->in_user_fault) {
1866 css_get(&memcg->css);
1867 current->memcg_in_oom = memcg;
1868 current->memcg_oom_gfp_mask = mask;
1869 current->memcg_oom_order = order;
1870 }
1871 return false;
1872 }
1873
1874 mem_cgroup_mark_under_oom(memcg);
1875
1876 locked = mem_cgroup_oom_trylock(memcg);
1877
1878 if (locked)
1879 mem_cgroup_oom_notify(memcg);
1880
1881 mem_cgroup_unmark_under_oom(memcg);
1882 ret = mem_cgroup_out_of_memory(memcg, mask, order);
1883
1884 if (locked)
1885 mem_cgroup_oom_unlock(memcg);
1886
1887 return ret;
1888 }
1889
/**
 * mem_cgroup_oom_synchronize - complete memcg OOM handling
 * @handle: actually kill/wait or just clean up the OOM state
 *
 * This has to be called at the end of a page fault if the memcg OOM
 * handler was enabled.
 *
 * Sleeping directly in the charge context with all kinds of locks held
 * is problematic, so when oom_kill_disable is set the charge path only
 * marks the faulting task and bails out.  This function then either
 * waits on the OOM waitqueue or invokes the OOM killer, depending on
 * whether the memcg OOM lock could be taken, and finally cleans up the
 * per-task OOM state.
 *
 * Returns %true if an ongoing memcg OOM situation was detected and
 * handled, %false otherwise.
 */
1907 bool mem_cgroup_oom_synchronize(bool handle)
1908 {
1909 struct mem_cgroup *memcg = current->memcg_in_oom;
1910 struct oom_wait_info owait;
1911 bool locked;
1912
1913
1914 if (!memcg)
1915 return false;
1916
1917 if (!handle)
1918 goto cleanup;
1919
1920 owait.memcg = memcg;
1921 owait.wait.flags = 0;
1922 owait.wait.func = memcg_oom_wake_function;
1923 owait.wait.private = current;
1924 INIT_LIST_HEAD(&owait.wait.entry);
1925
1926 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
1927 mem_cgroup_mark_under_oom(memcg);
1928
1929 locked = mem_cgroup_oom_trylock(memcg);
1930
1931 if (locked)
1932 mem_cgroup_oom_notify(memcg);
1933
1934 if (locked && !memcg->oom_kill_disable) {
1935 mem_cgroup_unmark_under_oom(memcg);
1936 finish_wait(&memcg_oom_waitq, &owait.wait);
1937 mem_cgroup_out_of_memory(memcg, current->memcg_oom_gfp_mask,
1938 current->memcg_oom_order);
1939 } else {
1940 schedule();
1941 mem_cgroup_unmark_under_oom(memcg);
1942 finish_wait(&memcg_oom_waitq, &owait.wait);
1943 }
1944
1945 if (locked) {
1946 mem_cgroup_oom_unlock(memcg);
1947
1948
1949
1950
1951
1952 memcg_oom_recover(memcg);
1953 }
1954 cleanup:
1955 current->memcg_in_oom = NULL;
1956 css_put(&memcg->css);
1957 return true;
1958 }
1959
/**
 * mem_cgroup_get_oom_group - get a memory cgroup to clean up after OOM
 * @victim: task to be killed by the OOM killer
 * @oom_domain: memcg in case of memcg OOM, NULL in case of system-wide OOM
 *
 * Returns a pointer to a memory cgroup, which has to be cleaned up
 * by killing all belonging OOM-killable tasks.
 *
 * Caller has to call mem_cgroup_put() on the returned non-NULL memcg.
 */
1970 struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim,
1971 struct mem_cgroup *oom_domain)
1972 {
1973 struct mem_cgroup *oom_group = NULL;
1974 struct mem_cgroup *memcg;
1975
1976 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
1977 return NULL;
1978
1979 if (!oom_domain)
1980 oom_domain = root_mem_cgroup;
1981
1982 rcu_read_lock();
1983
1984 memcg = mem_cgroup_from_task(victim);
1985 if (memcg == root_mem_cgroup)
1986 goto out;
1987
1988
1989
1990
1991
1992
1993 if (unlikely(!mem_cgroup_is_descendant(memcg, oom_domain)))
1994 goto out;
1995
1996
1997
1998
1999
2000
2001 for (; memcg; memcg = parent_mem_cgroup(memcg)) {
2002 if (memcg->oom_group)
2003 oom_group = memcg;
2004
2005 if (memcg == oom_domain)
2006 break;
2007 }
2008
2009 if (oom_group)
2010 css_get(&oom_group->css);
2011 out:
2012 rcu_read_unlock();
2013
2014 return oom_group;
2015 }
2016
2017 void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
2018 {
2019 pr_info("Tasks in ");
2020 pr_cont_cgroup_path(memcg->css.cgroup);
2021 pr_cont(" are going to be killed due to memory.oom.group set\n");
2022 }
2023
/**
 * folio_memcg_lock - Bind a folio to its memcg.
 * @folio: The folio.
 *
 * This function prevents unlocked LRU folios from being moved to
 * another cgroup.
 *
 * It ensures lifetime of the bound memcg.  The caller is responsible
 * for the lifetime of the folio.
 */
2034 void folio_memcg_lock(struct folio *folio)
2035 {
2036 struct mem_cgroup *memcg;
2037 unsigned long flags;
2038
2039
2040
2041
2042
2043
2044 rcu_read_lock();
2045
2046 if (mem_cgroup_disabled())
2047 return;
2048 again:
2049 memcg = folio_memcg(folio);
2050 if (unlikely(!memcg))
2051 return;
2052
2053 #ifdef CONFIG_PROVE_LOCKING
2054 local_irq_save(flags);
2055 might_lock(&memcg->move_lock);
2056 local_irq_restore(flags);
2057 #endif
2058
2059 if (atomic_read(&memcg->moving_account) <= 0)
2060 return;
2061
2062 spin_lock_irqsave(&memcg->move_lock, flags);
2063 if (memcg != folio_memcg(folio)) {
2064 spin_unlock_irqrestore(&memcg->move_lock, flags);
2065 goto again;
2066 }
2067
2068
2069
2070
2071
2072
2073
2074 memcg->move_lock_task = current;
2075 memcg->move_lock_flags = flags;
2076 }
2077
2078 void lock_page_memcg(struct page *page)
2079 {
2080 folio_memcg_lock(page_folio(page));
2081 }
2082
2083 static void __folio_memcg_unlock(struct mem_cgroup *memcg)
2084 {
2085 if (memcg && memcg->move_lock_task == current) {
2086 unsigned long flags = memcg->move_lock_flags;
2087
2088 memcg->move_lock_task = NULL;
2089 memcg->move_lock_flags = 0;
2090
2091 spin_unlock_irqrestore(&memcg->move_lock, flags);
2092 }
2093
2094 rcu_read_unlock();
2095 }
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105 void folio_memcg_unlock(struct folio *folio)
2106 {
2107 __folio_memcg_unlock(folio_memcg(folio));
2108 }
2109
2110 void unlock_page_memcg(struct page *page)
2111 {
2112 folio_memcg_unlock(page_folio(page));
2113 }
2114
2115 struct memcg_stock_pcp {
2116 local_lock_t stock_lock;
2117 struct mem_cgroup *cached;
2118 unsigned int nr_pages;
2119
2120 #ifdef CONFIG_MEMCG_KMEM
2121 struct obj_cgroup *cached_objcg;
2122 struct pglist_data *cached_pgdat;
2123 unsigned int nr_bytes;
2124 int nr_slab_reclaimable_b;
2125 int nr_slab_unreclaimable_b;
2126 #endif
2127
2128 struct work_struct work;
2129 unsigned long flags;
2130 #define FLUSHING_CACHED_CHARGE 0
2131 };
2132 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock) = {
2133 .stock_lock = INIT_LOCAL_LOCK(stock_lock),
2134 };
2135 static DEFINE_MUTEX(percpu_charge_mutex);
2136
2137 #ifdef CONFIG_MEMCG_KMEM
2138 static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock);
2139 static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
2140 struct mem_cgroup *root_memcg);
2141 static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages);
2142
2143 #else
2144 static inline struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock)
2145 {
2146 return NULL;
2147 }
2148 static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
2149 struct mem_cgroup *root_memcg)
2150 {
2151 return false;
2152 }
2153 static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages)
2154 {
2155 }
2156 #endif
2157
/**
 * consume_stock: Try to consume stocked charge on this cpu.
 * @memcg: memcg to consume from.
 * @nr_pages: how many pages to charge.
 *
 * The charges will only happen if @memcg matches the current cpu's memcg
 * stock, and at least @nr_pages are available in that stock.
 *
 * returns true if successful, false otherwise.
 */
2169 static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2170 {
2171 struct memcg_stock_pcp *stock;
2172 unsigned long flags;
2173 bool ret = false;
2174
2175 if (nr_pages > MEMCG_CHARGE_BATCH)
2176 return ret;
2177
2178 local_lock_irqsave(&memcg_stock.stock_lock, flags);
2179
2180 stock = this_cpu_ptr(&memcg_stock);
2181 if (memcg == stock->cached && stock->nr_pages >= nr_pages) {
2182 stock->nr_pages -= nr_pages;
2183 ret = true;
2184 }
2185
2186 local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
2187
2188 return ret;
2189 }
2190
/*
 * Returns stocks cached in percpu and reset cached information.
 */
2194 static void drain_stock(struct memcg_stock_pcp *stock)
2195 {
2196 struct mem_cgroup *old = stock->cached;
2197
2198 if (!old)
2199 return;
2200
2201 if (stock->nr_pages) {
2202 page_counter_uncharge(&old->memory, stock->nr_pages);
2203 if (do_memsw_account())
2204 page_counter_uncharge(&old->memsw, stock->nr_pages);
2205 stock->nr_pages = 0;
2206 }
2207
2208 css_put(&old->css);
2209 stock->cached = NULL;
2210 }
2211
2212 static void drain_local_stock(struct work_struct *dummy)
2213 {
2214 struct memcg_stock_pcp *stock;
2215 struct obj_cgroup *old = NULL;
2216 unsigned long flags;
2217
2218
2219
2220
2221
2222
2223 local_lock_irqsave(&memcg_stock.stock_lock, flags);
2224
2225 stock = this_cpu_ptr(&memcg_stock);
2226 old = drain_obj_stock(stock);
2227 drain_stock(stock);
2228 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
2229
2230 local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
2231 if (old)
2232 obj_cgroup_put(old);
2233 }
2234
/*
 * Cache charges(val) to local per_cpu area.
 * This will be consumed by consume_stock() function, later.
 */
2239 static void __refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2240 {
2241 struct memcg_stock_pcp *stock;
2242
2243 stock = this_cpu_ptr(&memcg_stock);
2244 if (stock->cached != memcg) {
2245 drain_stock(stock);
2246 css_get(&memcg->css);
2247 stock->cached = memcg;
2248 }
2249 stock->nr_pages += nr_pages;
2250
2251 if (stock->nr_pages > MEMCG_CHARGE_BATCH)
2252 drain_stock(stock);
2253 }
2254
2255 static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2256 {
2257 unsigned long flags;
2258
2259 local_lock_irqsave(&memcg_stock.stock_lock, flags);
2260 __refill_stock(memcg, nr_pages);
2261 local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
2262 }
2263
/*
 * Drains all per-CPU charge caches for given root_memcg resp. subtree
 * of the hierarchy under it.
 */
2268 static void drain_all_stock(struct mem_cgroup *root_memcg)
2269 {
2270 int cpu, curcpu;
2271
2272
2273 if (!mutex_trylock(&percpu_charge_mutex))
2274 return;
2275
2276
2277
2278
2279
2280
2281 migrate_disable();
2282 curcpu = smp_processor_id();
2283 for_each_online_cpu(cpu) {
2284 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2285 struct mem_cgroup *memcg;
2286 bool flush = false;
2287
2288 rcu_read_lock();
2289 memcg = stock->cached;
2290 if (memcg && stock->nr_pages &&
2291 mem_cgroup_is_descendant(memcg, root_memcg))
2292 flush = true;
2293 else if (obj_stock_flush_required(stock, root_memcg))
2294 flush = true;
2295 rcu_read_unlock();
2296
2297 if (flush &&
2298 !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
2299 if (cpu == curcpu)
2300 drain_local_stock(&stock->work);
2301 else
2302 schedule_work_on(cpu, &stock->work);
2303 }
2304 }
2305 migrate_enable();
2306 mutex_unlock(&percpu_charge_mutex);
2307 }
2308
2309 static int memcg_hotplug_cpu_dead(unsigned int cpu)
2310 {
2311 struct memcg_stock_pcp *stock;
2312
2313 stock = &per_cpu(memcg_stock, cpu);
2314 drain_stock(stock);
2315
2316 return 0;
2317 }
2318
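/*
 * Reclaim from @memcg and every ancestor that is above its memory.high,
 * returning the total number of pages reclaimed.
 */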
2319 static unsigned long reclaim_high(struct mem_cgroup *memcg,
2320 unsigned int nr_pages,
2321 gfp_t gfp_mask)
2322 {
2323 unsigned long nr_reclaimed = 0;
2324
2325 do {
2326 unsigned long pflags;
2327
2328 if (page_counter_read(&memcg->memory) <=
2329 READ_ONCE(memcg->memory.high))
2330 continue;
2331
2332 memcg_memory_event(memcg, MEMCG_HIGH);
2333
2334 psi_memstall_enter(&pflags);
2335 nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages,
2336 gfp_mask,
2337 MEMCG_RECLAIM_MAY_SWAP);
2338 psi_memstall_leave(&pflags);
2339 } while ((memcg = parent_mem_cgroup(memcg)) &&
2340 !mem_cgroup_is_root(memcg));
2341
2342 return nr_reclaimed;
2343 }
2344
2345 static void high_work_func(struct work_struct *work)
2346 {
2347 struct mem_cgroup *memcg;
2348
2349 memcg = container_of(work, struct mem_cgroup, high_work);
2350 reclaim_high(memcg, MEMCG_CHARGE_BATCH, GFP_KERNEL);
2351 }
2352
/*
 * Cap the amount of time a task is throttled for each charge batch to
 * 2 seconds, so that gross overages still make forward progress and
 * remain debuggable.
 */
2358 #define MEMCG_MAX_HIGH_DELAY_JIFFIES (2UL*HZ)
2359
/*
 * When a memcg is over its high limit, the offending task is throttled on
 * return to userspace.  The delay grows quadratically with the worst
 * overage among the cgroup's ancestors, so that moderate overages produce
 * small delays while gross overages quickly approach
 * MEMCG_MAX_HIGH_DELAY_JIFFIES.  MEMCG_DELAY_PRECISION_SHIFT provides the
 * fixed-point precision for the overage ratio and
 * MEMCG_DELAY_SCALING_SHIFT tunes how steep the penalty curve is.
 */
2403 #define MEMCG_DELAY_PRECISION_SHIFT 20
2404 #define MEMCG_DELAY_SCALING_SHIFT 14
2405
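/* How far usage is above high, as a fixed-point fraction of high. */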
2406 static u64 calculate_overage(unsigned long usage, unsigned long high)
2407 {
2408 u64 overage;
2409
2410 if (usage <= high)
2411 return 0;
2412
2413
2414
2415
2416
2417 high = max(high, 1UL);
2418
2419 overage = usage - high;
2420 overage <<= MEMCG_DELAY_PRECISION_SHIFT;
2421 return div64_u64(overage, high);
2422 }
2423
2424 static u64 mem_find_max_overage(struct mem_cgroup *memcg)
2425 {
2426 u64 overage, max_overage = 0;
2427
2428 do {
2429 overage = calculate_overage(page_counter_read(&memcg->memory),
2430 READ_ONCE(memcg->memory.high));
2431 max_overage = max(overage, max_overage);
2432 } while ((memcg = parent_mem_cgroup(memcg)) &&
2433 !mem_cgroup_is_root(memcg));
2434
2435 return max_overage;
2436 }
2437
2438 static u64 swap_find_max_overage(struct mem_cgroup *memcg)
2439 {
2440 u64 overage, max_overage = 0;
2441
2442 do {
2443 overage = calculate_overage(page_counter_read(&memcg->swap),
2444 READ_ONCE(memcg->swap.high));
2445 if (overage)
2446 memcg_memory_event(memcg, MEMCG_SWAP_HIGH);
2447 max_overage = max(overage, max_overage);
2448 } while ((memcg = parent_mem_cgroup(memcg)) &&
2449 !mem_cgroup_is_root(memcg));
2450
2451 return max_overage;
2452 }
2453
2454
2455
2456
2457
2458 static unsigned long calculate_high_delay(struct mem_cgroup *memcg,
2459 unsigned int nr_pages,
2460 u64 max_overage)
2461 {
2462 unsigned long penalty_jiffies;
2463
2464 if (!max_overage)
2465 return 0;
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475 penalty_jiffies = max_overage * max_overage * HZ;
2476 penalty_jiffies >>= MEMCG_DELAY_PRECISION_SHIFT;
2477 penalty_jiffies >>= MEMCG_DELAY_SCALING_SHIFT;
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487 return penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH;
2488 }
2489
/*
 * Scheduled by try_charge() to be executed from the userland return path
 * and reclaims memory over the high limit.
 */
2494 void mem_cgroup_handle_over_high(void)
2495 {
2496 unsigned long penalty_jiffies;
2497 unsigned long pflags;
2498 unsigned long nr_reclaimed;
2499 unsigned int nr_pages = current->memcg_nr_pages_over_high;
2500 int nr_retries = MAX_RECLAIM_RETRIES;
2501 struct mem_cgroup *memcg;
2502 bool in_retry = false;
2503
2504 if (likely(!nr_pages))
2505 return;
2506
2507 memcg = get_mem_cgroup_from_mm(current->mm);
2508 current->memcg_nr_pages_over_high = 0;
2509
2510 retry_reclaim:
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520 nr_reclaimed = reclaim_high(memcg,
2521 in_retry ? SWAP_CLUSTER_MAX : nr_pages,
2522 GFP_KERNEL);
2523
2524
2525
2526
2527
2528 penalty_jiffies = calculate_high_delay(memcg, nr_pages,
2529 mem_find_max_overage(memcg));
2530
2531 penalty_jiffies += calculate_high_delay(memcg, nr_pages,
2532 swap_find_max_overage(memcg));
2533
2534
2535
2536
2537
2538
2539 penalty_jiffies = min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES);
2540
2541
2542
2543
2544
2545
2546
2547 if (penalty_jiffies <= HZ / 100)
2548 goto out;
2549
2550
2551
2552
2553
2554
2555 if (nr_reclaimed || nr_retries--) {
2556 in_retry = true;
2557 goto retry_reclaim;
2558 }
2559
2560
2561
2562
2563
2564
2565 psi_memstall_enter(&pflags);
2566 schedule_timeout_killable(penalty_jiffies);
2567 psi_memstall_leave(&pflags);
2568
2569 out:
2570 css_put(&memcg->css);
2571 }
2572
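/*
 * Charge @nr_pages to @memcg, reclaiming from the hierarchy and eventually
 * invoking the OOM killer if the limit cannot be met.  Charges are made in
 * batches of MEMCG_CHARGE_BATCH and the surplus is parked in the per-CPU
 * stock for later consume_stock() calls.
 */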
2573 static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
2574 unsigned int nr_pages)
2575 {
2576 unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
2577 int nr_retries = MAX_RECLAIM_RETRIES;
2578 struct mem_cgroup *mem_over_limit;
2579 struct page_counter *counter;
2580 unsigned long nr_reclaimed;
2581 bool passed_oom = false;
2582 unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP;
2583 bool drained = false;
2584 bool raised_max_event = false;
2585 unsigned long pflags;
2586
2587 retry:
2588 if (consume_stock(memcg, nr_pages))
2589 return 0;
2590
2591 if (!do_memsw_account() ||
2592 page_counter_try_charge(&memcg->memsw, batch, &counter)) {
2593 if (page_counter_try_charge(&memcg->memory, batch, &counter))
2594 goto done_restock;
2595 if (do_memsw_account())
2596 page_counter_uncharge(&memcg->memsw, batch);
2597 mem_over_limit = mem_cgroup_from_counter(counter, memory);
2598 } else {
2599 mem_over_limit = mem_cgroup_from_counter(counter, memsw);
2600 reclaim_options &= ~MEMCG_RECLAIM_MAY_SWAP;
2601 }
2602
2603 if (batch > nr_pages) {
2604 batch = nr_pages;
2605 goto retry;
2606 }
2607
2608
2609
2610
2611
2612
2613
2614 if (unlikely(current->flags & PF_MEMALLOC))
2615 goto force;
2616
2617 if (unlikely(task_in_memcg_oom(current)))
2618 goto nomem;
2619
2620 if (!gfpflags_allow_blocking(gfp_mask))
2621 goto nomem;
2622
2623 memcg_memory_event(mem_over_limit, MEMCG_MAX);
2624 raised_max_event = true;
2625
2626 psi_memstall_enter(&pflags);
2627 nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
2628 gfp_mask, reclaim_options);
2629 psi_memstall_leave(&pflags);
2630
2631 if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
2632 goto retry;
2633
2634 if (!drained) {
2635 drain_all_stock(mem_over_limit);
2636 drained = true;
2637 goto retry;
2638 }
2639
2640 if (gfp_mask & __GFP_NORETRY)
2641 goto nomem;
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651 if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER))
2652 goto retry;
2653
2654
2655
2656
2657 if (mem_cgroup_wait_acct_move(mem_over_limit))
2658 goto retry;
2659
2660 if (nr_retries--)
2661 goto retry;
2662
2663 if (gfp_mask & __GFP_RETRY_MAYFAIL)
2664 goto nomem;
2665
2666
2667 if (passed_oom && task_is_dying())
2668 goto nomem;
2669
2670
2671
2672
2673
2674
2675 if (mem_cgroup_oom(mem_over_limit, gfp_mask,
2676 get_order(nr_pages * PAGE_SIZE))) {
2677 passed_oom = true;
2678 nr_retries = MAX_RECLAIM_RETRIES;
2679 goto retry;
2680 }
2681 nomem:
2682
2683
2684
2685
2686
2687
2688 if (!(gfp_mask & (__GFP_NOFAIL | __GFP_HIGH)))
2689 return -ENOMEM;
2690 force:
2691
2692
2693
2694
2695 if (!raised_max_event)
2696 memcg_memory_event(mem_over_limit, MEMCG_MAX);
2697
2698
2699
2700
2701
2702
2703 page_counter_charge(&memcg->memory, nr_pages);
2704 if (do_memsw_account())
2705 page_counter_charge(&memcg->memsw, nr_pages);
2706
2707 return 0;
2708
2709 done_restock:
2710 if (batch > nr_pages)
2711 refill_stock(memcg, batch - nr_pages);
2712
	/*
	 * If the hierarchy is now above its high limit, punt the reclaim
	 * work to the return-to-userspace path (or to high_work for
	 * charges made from interrupt context) rather than reclaiming
	 * here, so that a consistent GFP_KERNEL reclaim context is used.
	 */
2722 do {
2723 bool mem_high, swap_high;
2724
2725 mem_high = page_counter_read(&memcg->memory) >
2726 READ_ONCE(memcg->memory.high);
2727 swap_high = page_counter_read(&memcg->swap) >
2728 READ_ONCE(memcg->swap.high);
2729
2730
2731 if (!in_task()) {
2732 if (mem_high) {
2733 schedule_work(&memcg->high_work);
2734 break;
2735 }
2736 continue;
2737 }
2738
2739 if (mem_high || swap_high) {
2740 /*
2741  * The allocating tasks in this cgroup will need to do
2742  * reclaim or be throttled to prevent further growth
2743  * of the memory or swap footprints.
2744  *
2745  * Target some best-effort fairness between the tasks,
2746  * and distribute reclaim work and delay penalties
2747  * based on how much each task is actually allocating.
2748  */
2749 current->memcg_nr_pages_over_high += batch;
2750 set_notify_resume(current);
2751 break;
2752 }
2753 } while ((memcg = parent_mem_cgroup(memcg)));
2754
2755 if (current->memcg_nr_pages_over_high > MEMCG_CHARGE_BATCH &&
2756 !(current->flags & PF_MEMALLOC) &&
2757 gfpflags_allow_blocking(gfp_mask)) {
2758 mem_cgroup_handle_over_high();
2759 }
2760 return 0;
2761 }
2762
2763 static inline int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2764 unsigned int nr_pages)
2765 {
2766 if (mem_cgroup_is_root(memcg))
2767 return 0;
2768
2769 return try_charge_memcg(memcg, gfp_mask, nr_pages);
2770 }
2771
2772 static inline void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
2773 {
2774 if (mem_cgroup_is_root(memcg))
2775 return;
2776
2777 page_counter_uncharge(&memcg->memory, nr_pages);
2778 if (do_memsw_account())
2779 page_counter_uncharge(&memcg->memsw, nr_pages);
2780 }
2781
2782 static void commit_charge(struct folio *folio, struct mem_cgroup *memcg)
2783 {
2784 VM_BUG_ON_FOLIO(folio_memcg(folio), folio);
2785
2786 /*
2787  * Any of the following ensures the folio's memcg stability:
2788  * - the page lock
2789  * - LRU isolation
2790  * - lock_page_memcg()
2791  * - exclusive reference
2792  */
2793 folio->memcg_data = (unsigned long)memcg;
2794 }
2795
2796 #ifdef CONFIG_MEMCG_KMEM
2797 /*
2798  * The allocated objcg pointers array is not accounted directly.
2799  * Moreover, it should not come from DMA buffer and is not readily
2800  * reclaimable. So those GFP bits should be masked off.
2801  */
2802 #define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | __GFP_ACCOUNT)
2803
2804 /*
2805  * mod_objcg_mlstate() may be called with irq enabled, so the
2806  * irq-safe mod_memcg_lruvec_state() is used below.
2807  */
2808 static inline void mod_objcg_mlstate(struct obj_cgroup *objcg,
2809 struct pglist_data *pgdat,
2810 enum node_stat_item idx, int nr)
2811 {
2812 struct mem_cgroup *memcg;
2813 struct lruvec *lruvec;
2814
2815 rcu_read_lock();
2816 memcg = obj_cgroup_memcg(objcg);
2817 lruvec = mem_cgroup_lruvec(memcg, pgdat);
2818 mod_memcg_lruvec_state(lruvec, idx, nr);
2819 rcu_read_unlock();
2820 }
2821
2822 int memcg_alloc_slab_cgroups(struct slab *slab, struct kmem_cache *s,
2823 gfp_t gfp, bool new_slab)
2824 {
2825 unsigned int objects = objs_per_slab(s, slab);
2826 unsigned long memcg_data;
2827 void *vec;
2828
2829 gfp &= ~OBJCGS_CLEAR_MASK;
2830 vec = kcalloc_node(objects, sizeof(struct obj_cgroup *), gfp,
2831 slab_nid(slab));
2832 if (!vec)
2833 return -ENOMEM;
2834
2835 memcg_data = (unsigned long) vec | MEMCG_DATA_OBJCGS;
2836 if (new_slab) {
2837 /*
2838  * If the slab is brand new and nobody can yet access its
2839  * memcg_data, no synchronization is required and
2840  * memcg_data can be simply assigned.
2841  */
2842 slab->memcg_data = memcg_data;
2843 } else if (cmpxchg(&slab->memcg_data, 0, memcg_data)) {
2844 /*
2845  * If the slab is already in use, somebody can allocate and
2846  * assign obj_cgroups in parallel. In this case the existing
2847  * objcg vector should be reused.
2848  */
2849 kfree(vec);
2850 return 0;
2851 }
2852
2853 kmemleak_not_leak(vec);
2854 return 0;
2855 }
2856
2857 static __always_inline
2858 struct mem_cgroup *mem_cgroup_from_obj_folio(struct folio *folio, void *p)
2859 {
2860 /*
2861  * Slab objects are accounted individually, not per-page.
2862  * Memcg membership data for each individual object is saved in
2863  * slab->memcg_data.
2864  */
2865 if (folio_test_slab(folio)) {
2866 struct obj_cgroup **objcgs;
2867 struct slab *slab;
2868 unsigned int off;
2869
2870 slab = folio_slab(folio);
2871 objcgs = slab_objcgs(slab);
2872 if (!objcgs)
2873 return NULL;
2874
2875 off = obj_to_index(slab->slab_cache, slab, p);
2876 if (objcgs[off])
2877 return obj_cgroup_memcg(objcgs[off]);
2878
2879 return NULL;
2880 }
2881
2882 /*
2883  * Non-slab pages are either charged to a memcg directly (user
2884  * pages, with a memcg pointer in memcg_data) or to an objcg
2885  * (kernel pages, with the MEMCG_DATA_KMEM flag set).
2886  * page_memcg_check() handles both cases and returns the proper
2887  * memory cgroup pointer, or NULL if the page is not accounted.
2888  */
2889 return page_memcg_check(folio_page(folio, 0));
2890 }
2891
2892 /*
2893  * Returns a pointer to the memory cgroup to which the kernel object is
2894  * charged.
2895  *
2896  * A passed kernel object can be a slab object, vmalloc object or a generic
2897  * kernel page, so different mechanisms for getting the memory cgroup pointer
2898  * should be used.
2899  *
2900  * In certain cases (e.g. kernel stacks or large kmallocs with SLUB) the caller
2901  * can not know for sure how the kernel object is implemented.
2902  * mem_cgroup_from_obj() can be safely used in such cases.
2903  *
2904  * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock().
2905  */
2906 struct mem_cgroup *mem_cgroup_from_obj(void *p)
2907 {
2908 struct folio *folio;
2909
2910 if (mem_cgroup_disabled())
2911 return NULL;
2912
2913 if (unlikely(is_vmalloc_addr(p)))
2914 folio = page_folio(vmalloc_to_page(p));
2915 else
2916 folio = virt_to_folio(p);
2917
2918 return mem_cgroup_from_obj_folio(folio, p);
2919 }
2920
2921 /*
2922  * Returns a pointer to the memory cgroup to which the kernel object is
2923  * charged.  Similar to mem_cgroup_from_obj(), but faster and not suitable
2924  * for objects allocated using vmalloc().
2925  *
2926  * A passed kernel object must be a slab object or a generic kernel page.
2927  *
2928  * The caller must ensure the memcg lifetime, e.g. by taking
2929  * rcu_read_lock().
2930  */
2931 struct mem_cgroup *mem_cgroup_from_slab_obj(void *p)
2932 {
2933 if (mem_cgroup_disabled())
2934 return NULL;
2935
2936 return mem_cgroup_from_obj_folio(virt_to_folio(p), p);
2937 }
2938
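/*
 * Walk up the memcg hierarchy from @memcg and return the first objcg a
 * reference could be taken on, or NULL if there is none (e.g. for the
 * root memcg).
 */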
2939 static struct obj_cgroup *__get_obj_cgroup_from_memcg(struct mem_cgroup *memcg)
2940 {
2941 struct obj_cgroup *objcg = NULL;
2942
2943 for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
2944 objcg = rcu_dereference(memcg->objcg);
2945 if (objcg && obj_cgroup_tryget(objcg))
2946 break;
2947 objcg = NULL;
2948 }
2949 return objcg;
2950 }
2951
2952 __always_inline struct obj_cgroup *get_obj_cgroup_from_current(void)
2953 {
2954 struct obj_cgroup *objcg = NULL;
2955 struct mem_cgroup *memcg;
2956
2957 if (memcg_kmem_bypass())
2958 return NULL;
2959
2960 rcu_read_lock();
2961 if (unlikely(active_memcg()))
2962 memcg = active_memcg();
2963 else
2964 memcg = mem_cgroup_from_task(current);
2965 objcg = __get_obj_cgroup_from_memcg(memcg);
2966 rcu_read_unlock();
2967 return objcg;
2968 }
2969
2970 struct obj_cgroup *get_obj_cgroup_from_page(struct page *page)
2971 {
2972 struct obj_cgroup *objcg;
2973
2974 if (!memcg_kmem_enabled() || memcg_kmem_bypass())
2975 return NULL;
2976
2977 if (PageMemcgKmem(page)) {
2978 objcg = __folio_objcg(page_folio(page));
2979 obj_cgroup_get(objcg);
2980 } else {
2981 struct mem_cgroup *memcg;
2982
2983 rcu_read_lock();
2984 memcg = __folio_memcg(page_folio(page));
2985 if (memcg)
2986 objcg = __get_obj_cgroup_from_memcg(memcg);
2987 else
2988 objcg = NULL;
2989 rcu_read_unlock();
2990 }
2991 return objcg;
2992 }
2993
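/*
 * Account @nr_pages (may be negative) of kernel memory in @memcg's
 * MEMCG_KMEM vmstat counter and, on cgroup v1, in the kmem page counter.
 */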
2994 static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages)
2995 {
2996 mod_memcg_state(memcg, MEMCG_KMEM, nr_pages);
2997 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
2998 if (nr_pages > 0)
2999 page_counter_charge(&memcg->kmem, nr_pages);
3000 else
3001 page_counter_uncharge(&memcg->kmem, -nr_pages);
3002 }
3003 }
3004
3005 
3006 /*
3007  * obj_cgroup_uncharge_pages: uncharge a number of kernel pages from a objcg
3008  * @objcg: object cgroup to uncharge
3009  * @nr_pages: number of pages to uncharge
3010  */
3011 static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
3012 unsigned int nr_pages)
3013 {
3014 struct mem_cgroup *memcg;
3015
3016 memcg = get_mem_cgroup_from_objcg(objcg);
3017
3018 memcg_account_kmem(memcg, -nr_pages);
3019 refill_stock(memcg, nr_pages);
3020
3021 css_put(&memcg->css);
3022 }
3023
3024 /*
3025  * obj_cgroup_charge_pages: charge a number of kernel pages to a objcg
3026  * @objcg: object cgroup to charge
3027  * @gfp: reclaim mode
3028  * @nr_pages: number of pages to charge
3029  *
3030  * Returns 0 on success, an error code on failure.
3031  */
3032 static int obj_cgroup_charge_pages(struct obj_cgroup *objcg, gfp_t gfp,
3033 unsigned int nr_pages)
3034 {
3035 struct mem_cgroup *memcg;
3036 int ret;
3037
3038 memcg = get_mem_cgroup_from_objcg(objcg);
3039
3040 ret = try_charge_memcg(memcg, gfp, nr_pages);
3041 if (ret)
3042 goto out;
3043
3044 memcg_account_kmem(memcg, nr_pages);
3045 out:
3046 css_put(&memcg->css);
3047
3048 return ret;
3049 }
3050
3051 /**
3052  * __memcg_kmem_charge_page: charge a kmem page to the current memory cgroup
3053  * @page: page to charge
3054  * @gfp: reclaim mode
3055  * @order: allocation order
3056  *
3057  * Returns 0 on success, an error code on failure.
3058  */
3059 int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
3060 {
3061 struct obj_cgroup *objcg;
3062 int ret = 0;
3063
3064 objcg = get_obj_cgroup_from_current();
3065 if (objcg) {
3066 ret = obj_cgroup_charge_pages(objcg, gfp, 1 << order);
3067 if (!ret) {
3068 page->memcg_data = (unsigned long)objcg |
3069 MEMCG_DATA_KMEM;
3070 return 0;
3071 }
3072 obj_cgroup_put(objcg);
3073 }
3074 return ret;
3075 }
3076
3077 /**
3078  * __memcg_kmem_uncharge_page: uncharge a kmem page
3079  * @page: page to uncharge
3080  * @order: allocation order
3081  */
3082 void __memcg_kmem_uncharge_page(struct page *page, int order)
3083 {
3084 struct folio *folio = page_folio(page);
3085 struct obj_cgroup *objcg;
3086 unsigned int nr_pages = 1 << order;
3087
3088 if (!folio_memcg_kmem(folio))
3089 return;
3090
3091 objcg = __folio_objcg(folio);
3092 obj_cgroup_uncharge_pages(objcg, nr_pages);
3093 folio->memcg_data = 0;
3094 obj_cgroup_put(objcg);
3095 }
3096
3097 void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
3098 enum node_stat_item idx, int nr)
3099 {
3100 struct memcg_stock_pcp *stock;
3101 struct obj_cgroup *old = NULL;
3102 unsigned long flags;
3103 int *bytes;
3104
3105 local_lock_irqsave(&memcg_stock.stock_lock, flags);
3106 stock = this_cpu_ptr(&memcg_stock);
3107
3108 /*
3109  * Save vmstat data in stock and skip vmstat array update unless
3110  * accumulating over a page of vmstat data or when pgdat or idx
3111  * changes.
3112  */
3113 if (stock->cached_objcg != objcg) {
3114 old = drain_obj_stock(stock);
3115 obj_cgroup_get(objcg);
3116 stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
3117 ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0;
3118 stock->cached_objcg = objcg;
3119 stock->cached_pgdat = pgdat;
3120 } else if (stock->cached_pgdat != pgdat) {
3121 /* Flush the existing cached vmstat data */
3122 struct pglist_data *oldpg = stock->cached_pgdat;
3123
3124 if (stock->nr_slab_reclaimable_b) {
3125 mod_objcg_mlstate(objcg, oldpg, NR_SLAB_RECLAIMABLE_B,
3126 stock->nr_slab_reclaimable_b);
3127 stock->nr_slab_reclaimable_b = 0;
3128 }
3129 if (stock->nr_slab_unreclaimable_b) {
3130 mod_objcg_mlstate(objcg, oldpg, NR_SLAB_UNRECLAIMABLE_B,
3131 stock->nr_slab_unreclaimable_b);
3132 stock->nr_slab_unreclaimable_b = 0;
3133 }
3134 stock->cached_pgdat = pgdat;
3135 }
3136
3137 bytes = (idx == NR_SLAB_RECLAIMABLE_B) ? &stock->nr_slab_reclaimable_b
3138 : &stock->nr_slab_unreclaimable_b;
3139
3140 /*
3141  * Cache the delta locally at least once, even for objects >= PAGE_SIZE.
3142  */
3143 if (!*bytes) {
3144 *bytes = nr;
3145 nr = 0;
3146 } else {
3147 *bytes += nr;
3148 if (abs(*bytes) > PAGE_SIZE) {
3149 nr = *bytes;
3150 *bytes = 0;
3151 } else {
3152 nr = 0;
3153 }
3154 }
3155 if (nr)
3156 mod_objcg_mlstate(objcg, pgdat, idx, nr);
3157
3158 local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
3159 if (old)
3160 obj_cgroup_put(old);
3161 }
3162
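/*
 * Try to satisfy @nr_bytes from the current CPU's cached object stock.
 * Returns true on success, false if the caller has to fall back to a
 * page-sized charge.
 */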
3163 static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
3164 {
3165 struct memcg_stock_pcp *stock;
3166 unsigned long flags;
3167 bool ret = false;
3168
3169 local_lock_irqsave(&memcg_stock.stock_lock, flags);
3170
3171 stock = this_cpu_ptr(&memcg_stock);
3172 if (objcg == stock->cached_objcg && stock->nr_bytes >= nr_bytes) {
3173 stock->nr_bytes -= nr_bytes;
3174 ret = true;
3175 }
3176
3177 local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
3178
3179 return ret;
3180 }
3181
3182 static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock)
3183 {
3184 struct obj_cgroup *old = stock->cached_objcg;
3185
3186 if (!old)
3187 return NULL;
3188
3189 if (stock->nr_bytes) {
3190 unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT;
3191 unsigned int nr_bytes = stock->nr_bytes & (PAGE_SIZE - 1);
3192
3193 if (nr_pages) {
3194 struct mem_cgroup *memcg;
3195
3196 memcg = get_mem_cgroup_from_objcg(old);
3197
3198 memcg_account_kmem(memcg, -nr_pages);
3199 __refill_stock(memcg, nr_pages);
3200
3201 css_put(&memcg->css);
3202 }
3203
3204 /*
3205  * The leftover is flushed to the centralized per-memcg value.
3206  * On the next attempt to refill obj stock it will be moved
3207  * to a per-cpu stock (probably, on an other cpu), see
3208  * refill_obj_stock().
3209  *
3210  * How often it's flushed is a trade-off between the memory
3211  * limit enforcement accuracy and potential CPU contention,
3212  * so it might be changed in the future.
3213  */
3214 atomic_add(nr_bytes, &old->nr_charged_bytes);
3215 stock->nr_bytes = 0;
3216 }
3217
3218 /*
3219  * Flush the vmstat data in current stock.
3220  */
3221 if (stock->nr_slab_reclaimable_b || stock->nr_slab_unreclaimable_b) {
3222 if (stock->nr_slab_reclaimable_b) {
3223 mod_objcg_mlstate(old, stock->cached_pgdat,
3224 NR_SLAB_RECLAIMABLE_B,
3225 stock->nr_slab_reclaimable_b);
3226 stock->nr_slab_reclaimable_b = 0;
3227 }
3228 if (stock->nr_slab_unreclaimable_b) {
3229 mod_objcg_mlstate(old, stock->cached_pgdat,
3230 NR_SLAB_UNRECLAIMABLE_B,
3231 stock->nr_slab_unreclaimable_b);
3232 stock->nr_slab_unreclaimable_b = 0;
3233 }
3234 stock->cached_pgdat = NULL;
3235 }
3236
3237 stock->cached_objcg = NULL;
3238 /*
3239  * Return the old objcg so the caller can drop its reference with
3240  * obj_cgroup_put() outside of memcg_stock_pcp::stock_lock.
3241  */
3242 return old;
3243 }
3244
3245 static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
3246 struct mem_cgroup *root_memcg)
3247 {
3248 struct mem_cgroup *memcg;
3249
3250 if (stock->cached_objcg) {
3251 memcg = obj_cgroup_memcg(stock->cached_objcg);
3252 if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
3253 return true;
3254 }
3255
3256 return false;
3257 }
3258
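/*
 * Return @nr_bytes to the current CPU's object stock for @objcg.  When the
 * cached surplus grows beyond a page and @allow_uncharge is set, whole
 * pages are uncharged back to the memcg counters.
 */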
3259 static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
3260 bool allow_uncharge)
3261 {
3262 struct memcg_stock_pcp *stock;
3263 struct obj_cgroup *old = NULL;
3264 unsigned long flags;
3265 unsigned int nr_pages = 0;
3266
3267 local_lock_irqsave(&memcg_stock.stock_lock, flags);
3268
3269 stock = this_cpu_ptr(&memcg_stock);
3270 if (stock->cached_objcg != objcg) {
3271 old = drain_obj_stock(stock);
3272 obj_cgroup_get(objcg);
3273 stock->cached_objcg = objcg;
3274 stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
3275 ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0;
3276 allow_uncharge = true;
3277 }
3278 stock->nr_bytes += nr_bytes;
3279
3280 if (allow_uncharge && (stock->nr_bytes > PAGE_SIZE)) {
3281 nr_pages = stock->nr_bytes >> PAGE_SHIFT;
3282 stock->nr_bytes &= (PAGE_SIZE - 1);
3283 }
3284
3285 local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
3286 if (old)
3287 obj_cgroup_put(old);
3288
3289 if (nr_pages)
3290 obj_cgroup_uncharge_pages(objcg, nr_pages);
3291 }
3292
3293 int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size)
3294 {
3295 unsigned int nr_pages, nr_bytes;
3296 int ret;
3297
3298 if (consume_obj_stock(objcg, size))
3299 return 0;
3300
3301 /*
3302  * In theory, objcg->nr_charged_bytes can have enough
3303  * pre-charged bytes to satisfy the allocation. However,
3304  * flushing objcg->nr_charged_bytes requires two atomic
3305  * operations, and objcg->nr_charged_bytes can't be big.
3306  * The shared objcg->nr_charged_bytes can also become a
3307  * performance bottleneck if all tasks of the same memcg are
3308  * doing kmem allocations. So it's better to ignore it and try
3309  * to grab some new pages. The stock's nr_bytes will be flushed to
3310  * objcg->nr_charged_bytes later on when objcg changes.
3311  *
3312  * The stock's nr_bytes may contain enough pre-charged bytes
3313  * to allow one less page from being charged, but we can't rely
3314  * on the pre-charged bytes not being changed outside of
3315  * consume_obj_stock() or refill_obj_stock(). So ignore those
3316  * pre-charged bytes as well when charging pages. To avoid a
3317  * page being uncharged right after a page was charged, we set the
3318  * allow_uncharge flag to false when calling refill_obj_stock()
3319  * with the same page charged. If a page is uncharged, it will
3320  * be refilled to the stock, but due to a different per-cpu
3321  * stock possibly being used, the current per-cpu stock may not
3322  * see it, which is fine as it only affects batching accuracy.
3323  */
3324 nr_pages = size >> PAGE_SHIFT;
3325 nr_bytes = size & (PAGE_SIZE - 1);
3326
3327 if (nr_bytes)
3328 nr_pages += 1;
3329
3330 ret = obj_cgroup_charge_pages(objcg, gfp, nr_pages);
3331 if (!ret && nr_bytes)
3332 refill_obj_stock(objcg, PAGE_SIZE - nr_bytes, false);
3333
3334 return ret;
3335 }
3336
3337 void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size)
3338 {
3339 refill_obj_stock(objcg, size, true);
3340 }
3341
3342 #endif
3343
3344 /*
3345  * Because page_memcg(head) is not set on tails, set it now.
3346  */
3347 void split_page_memcg(struct page *head, unsigned int nr)
3348 {
3349 struct folio *folio = page_folio(head);
3350 struct mem_cgroup *memcg = folio_memcg(folio);
3351 int i;
3352
3353 if (mem_cgroup_disabled() || !memcg)
3354 return;
3355
3356 for (i = 1; i < nr; i++)
3357 folio_page(folio, i)->memcg_data = folio->memcg_data;
3358
3359 if (folio_memcg_kmem(folio))
3360 obj_cgroup_get_many(__folio_objcg(folio), nr - 1);
3361 else
3362 css_get_many(&memcg->css, nr - 1);
3363 }
3364
3365 #ifdef CONFIG_MEMCG_SWAP
3366 /**
3367  * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
3368  * @entry: swap entry to be moved
3369  * @from:  mem_cgroup which the entry is moved from
3370  * @to:  mem_cgroup which the entry is moved to
3371  *
3372  * It succeeds only when the swap_cgroup's record for this entry is the same
3373  * as the mem_cgroup's id of @from.
3374  *
3375  * Returns 0 on success, -EINVAL on failure.
3376  *
3377  * The caller must have charged to @to, IOW, called page_counter_charge()
3378  * about both res and memsw, and called css_get().
3379  */
3380 static int mem_cgroup_move_swap_account(swp_entry_t entry,
3381 struct mem_cgroup *from, struct mem_cgroup *to)
3382 {
3383 unsigned short old_id, new_id;
3384
3385 old_id = mem_cgroup_id(from);
3386 new_id = mem_cgroup_id(to);
3387
3388 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
3389 mod_memcg_state(from, MEMCG_SWAP, -1);
3390 mod_memcg_state(to, MEMCG_SWAP, 1);
3391 return 0;
3392 }
3393 return -EINVAL;
3394 }
3395 #else
3396 static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
3397 struct mem_cgroup *from, struct mem_cgroup *to)
3398 {
3399 return -EINVAL;
3400 }
3401 #endif
3402
3403 static DEFINE_MUTEX(memcg_max_mutex);
3404
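/*
 * Try to set the memory or memsw limit of @memcg to @max, reclaiming from
 * the group while usage is above the new limit.  Returns 0 on success,
 * -EINVAL if the new value would violate memory.max <= memsw.max, -EBUSY
 * if usage cannot be reclaimed below @max, and -EINTR on a pending signal.
 */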
3405 static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
3406 unsigned long max, bool memsw)
3407 {
3408 bool enlarge = false;
3409 bool drained = false;
3410 int ret;
3411 bool limits_invariant;
3412 struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory;
3413
3414 do {
3415 if (signal_pending(current)) {
3416 ret = -EINTR;
3417 break;
3418 }
3419
3420 mutex_lock(&memcg_max_mutex);
3421 /*
3422  * Make sure that the new limit (memsw or memory limit) doesn't
3423  * break our basic invariant rule memory.max <= memsw.max.
3424  */
3425 limits_invariant = memsw ? max >= READ_ONCE(memcg->memory.max) :
3426 max <= memcg->memsw.max;
3427 if (!limits_invariant) {
3428 mutex_unlock(&memcg_max_mutex);
3429 ret = -EINVAL;
3430 break;
3431 }
3432 if (max > counter->max)
3433 enlarge = true;
3434 ret = page_counter_set_max(counter, max);
3435 mutex_unlock(&memcg_max_mutex);
3436
3437 if (!ret)
3438 break;
3439
3440 if (!drained) {
3441 drain_all_stock(memcg);
3442 drained = true;
3443 continue;
3444 }
3445
3446 if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
3447 memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP)) {
3448 ret = -EBUSY;
3449 break;
3450 }
3451 } while (true);
3452
3453 if (!ret && enlarge)
3454 memcg_oom_recover(memcg);
3455
3456 return ret;
3457 }
3458
3459 unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
3460 gfp_t gfp_mask,
3461 unsigned long *total_scanned)
3462 {
3463 unsigned long nr_reclaimed = 0;
3464 struct mem_cgroup_per_node *mz, *next_mz = NULL;
3465 unsigned long reclaimed;
3466 int loop = 0;
3467 struct mem_cgroup_tree_per_node *mctz;
3468 unsigned long excess;
3469
3470 if (order > 0)
3471 return 0;
3472
3473 mctz = soft_limit_tree.rb_tree_per_node[pgdat->node_id];
3474
3475 /*
3476  * Do not even bother looking up the largest node if the root
3477  * is empty. Do it lockless to prevent lock bouncing. Races
3478  * are acceptable as soft limit is best effort anyway.
3479  */
3480 if (!mctz || RB_EMPTY_ROOT(&mctz->rb_root))
3481 return 0;
3482
3483 /*
3484  * This loop can run a while, specially if mem_cgroups keep
3485  * exceeding their soft limit and putting the system under
3486  * pressure.
3487  */
3488 do {
3489 if (next_mz)
3490 mz = next_mz;
3491 else
3492 mz = mem_cgroup_largest_soft_limit_node(mctz);
3493 if (!mz)
3494 break;
3495
3496 reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat,
3497 gfp_mask, total_scanned);
3498 nr_reclaimed += reclaimed;
3499 spin_lock_irq(&mctz->lock);
3500 
3501 /*
3502  * If we failed to reclaim anything from this memory cgroup
3503  * it is time to move on to the next cgroup.
3504  */
3505 next_mz = NULL;
3506 if (!reclaimed)
3507 next_mz = __mem_cgroup_largest_soft_limit_node(mctz);
3508
3509 excess = soft_limit_excess(mz->memcg);
3510 /*
3511  * One school of thought says that we should not add
3512  * back the node to the tree if reclaim returns 0.
3513  * But our reclaim could return 0, simply because due
3514  * to priority we are exposing a smaller subset of
3515  * memory to reclaim from. Consider this as a longer
3516  * term TODO.
3517  */
3518 /* If excess == 0, no tree ops */
3519 __mem_cgroup_insert_exceeded(mz, mctz, excess);
3520 spin_unlock_irq(&mctz->lock);
3521 css_put(&mz->memcg->css);
3522 loop++;
3523 /*
3524  * Could not reclaim anything and there are no more
3525  * mem cgroups to try or we seem to be looping without
3526  * reclaiming anything.
3527  */
3528 if (!nr_reclaimed &&
3529 (next_mz == NULL ||
3530 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
3531 break;
3532 } while (!nr_reclaimed);
3533 if (next_mz)
3534 css_put(&next_mz->memcg->css);
3535 return nr_reclaimed;
3536 }
3537
3538 /*
3539  * Reclaims as many pages from the given memcg as possible.
3540  *
3541  * Caller is responsible for holding css reference for memcg.
3542  */
3543 static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
3544 {
3545 int nr_retries = MAX_RECLAIM_RETRIES;
3546
3547
3548 lru_add_drain_all();
3549
3550 drain_all_stock(memcg);
3551
3552
3553 while (nr_retries && page_counter_read(&memcg->memory)) {
3554 if (signal_pending(current))
3555 return -EINTR;
3556
3557 if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
3558 MEMCG_RECLAIM_MAY_SWAP))
3559 nr_retries--;
3560 }
3561
3562 return 0;
3563 }
3564
3565 static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of,
3566 char *buf, size_t nbytes,
3567 loff_t off)
3568 {
3569 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
3570
3571 if (mem_cgroup_is_root(memcg))
3572 return -EINVAL;
3573 return mem_cgroup_force_empty(memcg) ?: nbytes;
3574 }
3575
3576 static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css,
3577 struct cftype *cft)
3578 {
3579 return 1;
3580 }
3581
3582 static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
3583 struct cftype *cft, u64 val)
3584 {
3585 if (val == 1)
3586 return 0;
3587
3588 pr_warn_once("Non-hierarchical mode is deprecated. "
3589 "Please report your usecase to linux-mm@kvack.org if you "
3590 "depend on this functionality.\n");
3591
3592 return -EINVAL;
3593 }
3594
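/*
 * Current memory (or memory+swap) usage of @memcg in pages.  The root
 * cgroup does not track its usage in the page counters, so its usage is
 * derived from the aggregated vmstat counters instead.
 */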
3595 static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
3596 {
3597 unsigned long val;
3598
3599 if (mem_cgroup_is_root(memcg)) {
3600 mem_cgroup_flush_stats();
3601 val = memcg_page_state(memcg, NR_FILE_PAGES) +
3602 memcg_page_state(memcg, NR_ANON_MAPPED);
3603 if (swap)
3604 val += memcg_page_state(memcg, MEMCG_SWAP);
3605 } else {
3606 if (!swap)
3607 val = page_counter_read(&memcg->memory);
3608 else
3609 val = page_counter_read(&memcg->memsw);
3610 }
3611 return val;
3612 }
3613
3614 enum {
3615 RES_USAGE,
3616 RES_LIMIT,
3617 RES_MAX_USAGE,
3618 RES_FAILCNT,
3619 RES_SOFT_LIMIT,
3620 };
3621
3622 static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
3623 struct cftype *cft)
3624 {
3625 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3626 struct page_counter *counter;
3627
3628 switch (MEMFILE_TYPE(cft->private)) {
3629 case _MEM:
3630 counter = &memcg->memory;
3631 break;
3632 case _MEMSWAP:
3633 counter = &memcg->memsw;
3634 break;
3635 case _KMEM:
3636 counter = &memcg->kmem;
3637 break;
3638 case _TCP:
3639 counter = &memcg->tcpmem;
3640 break;
3641 default:
3642 BUG();
3643 }
3644
3645 switch (MEMFILE_ATTR(cft->private)) {
3646 case RES_USAGE:
3647 if (counter == &memcg->memory)
3648 return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE;
3649 if (counter == &memcg->memsw)
3650 return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE;
3651 return (u64)page_counter_read(counter) * PAGE_SIZE;
3652 case RES_LIMIT:
3653 return (u64)counter->max * PAGE_SIZE;
3654 case RES_MAX_USAGE:
3655 return (u64)counter->watermark * PAGE_SIZE;
3656 case RES_FAILCNT:
3657 return counter->failcnt;
3658 case RES_SOFT_LIMIT:
3659 return (u64)memcg->soft_limit * PAGE_SIZE;
3660 default:
3661 BUG();
3662 }
3663 }
3664
3665 #ifdef CONFIG_MEMCG_KMEM
3666 static int memcg_online_kmem(struct mem_cgroup *memcg)
3667 {
3668 struct obj_cgroup *objcg;
3669
3670 if (mem_cgroup_kmem_disabled())
3671 return 0;
3672
3673 if (unlikely(mem_cgroup_is_root(memcg)))
3674 return 0;
3675
3676 objcg = obj_cgroup_alloc();
3677 if (!objcg)
3678 return -ENOMEM;
3679
3680 objcg->memcg = memcg;
3681 rcu_assign_pointer(memcg->objcg, objcg);
3682
3683 static_branch_enable(&memcg_kmem_enabled_key);
3684
3685 memcg->kmemcg_id = memcg->id.id;
3686
3687 return 0;
3688 }
3689
3690 static void memcg_offline_kmem(struct mem_cgroup *memcg)
3691 {
3692 struct mem_cgroup *parent;
3693
3694 if (mem_cgroup_kmem_disabled())
3695 return;
3696
3697 if (unlikely(mem_cgroup_is_root(memcg)))
3698 return;
3699
3700 parent = parent_mem_cgroup(memcg);
3701 if (!parent)
3702 parent = root_mem_cgroup;
3703
3704 memcg_reparent_objcgs(memcg, parent);
3705
3706
3707
3708
3709
3710
3711
3712 memcg_reparent_list_lrus(memcg, parent);
3713 }
3714 #else
3715 static int memcg_online_kmem(struct mem_cgroup *memcg)
3716 {
3717 return 0;
3718 }
3719 static void memcg_offline_kmem(struct mem_cgroup *memcg)
3720 {
3721 }
3722 #endif
3723
3724 static int memcg_update_tcp_max(struct mem_cgroup *memcg, unsigned long max)
3725 {
3726 int ret;
3727
3728 mutex_lock(&memcg_max_mutex);
3729
3730 ret = page_counter_set_max(&memcg->tcpmem, max);
3731 if (ret)
3732 goto out;
3733
3734 if (!memcg->tcpmem_active) {
3735 /*
3736  * The active flag needs to be written after the static_key
3737  * update. This is what guarantees that the socket activation
3738  * function is the last one to run. See mem_cgroup_sk_alloc()
3739  * for details, and note that we don't mark any socket as
3740  * belonging to this memcg until that flag is up.
3741  *
3742  * We need to do this, because static_keys will span multiple
3743  * sites, but we can't control their order. If we mark a socket
3744  * as accounted, but the accounting functions are not patched in
3745  * yet, we'll lose accounting.
3746  *
3747  * We never race with the readers in mem_cgroup_sk_alloc(),
3748  * because when this value change, the code to process it is not
3749  * patched in yet.
3750  */
3751 static_branch_inc(&memcg_sockets_enabled_key);
3752 memcg->tcpmem_active = true;
3753 }
3754 out:
3755 mutex_unlock(&memcg_max_mutex);
3756 return ret;
3757 }
3758
3759
3760
3761
3762
3763 static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
3764 char *buf, size_t nbytes, loff_t off)
3765 {
3766 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
3767 unsigned long nr_pages;
3768 int ret;
3769
3770 buf = strstrip(buf);
3771 ret = page_counter_memparse(buf, "-1", &nr_pages);
3772 if (ret)
3773 return ret;
3774
3775 switch (MEMFILE_ATTR(of_cft(of)->private)) {
3776 case RES_LIMIT:
3777 if (mem_cgroup_is_root(memcg)) {
3778 ret = -EINVAL;
3779 break;
3780 }
3781 switch (MEMFILE_TYPE(of_cft(of)->private)) {
3782 case _MEM:
3783 ret = mem_cgroup_resize_max(memcg, nr_pages, false);
3784 break;
3785 case _MEMSWAP:
3786 ret = mem_cgroup_resize_max(memcg, nr_pages, true);
3787 break;
3788 case _KMEM:
3789 /* kmem.limit_in_bytes is deprecated. */
3790 ret = -EOPNOTSUPP;
3791 break;
3792 case _TCP:
3793 ret = memcg_update_tcp_max(memcg, nr_pages);
3794 break;
3795 }
3796 break;
3797 case RES_SOFT_LIMIT:
3798 if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
3799 ret = -EOPNOTSUPP;
3800 } else {
3801 memcg->soft_limit = nr_pages;
3802 ret = 0;
3803 }
3804 break;
3805 }
3806 return ret ?: nbytes;
3807 }
3808
3809 static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf,
3810 size_t nbytes, loff_t off)
3811 {
3812 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
3813 struct page_counter *counter;
3814
3815 switch (MEMFILE_TYPE(of_cft(of)->private)) {
3816 case _MEM:
3817 counter = &memcg->memory;
3818 break;
3819 case _MEMSWAP:
3820 counter = &memcg->memsw;
3821 break;
3822 case _KMEM:
3823 counter = &memcg->kmem;
3824 break;
3825 case _TCP:
3826 counter = &memcg->tcpmem;
3827 break;
3828 default:
3829 BUG();
3830 }
3831
3832 switch (MEMFILE_ATTR(of_cft(of)->private)) {
3833 case RES_MAX_USAGE:
3834 page_counter_reset_watermark(counter);
3835 break;
3836 case RES_FAILCNT:
3837 counter->failcnt = 0;
3838 break;
3839 default:
3840 BUG();
3841 }
3842
3843 return nbytes;
3844 }
3845
3846 static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css,
3847 struct cftype *cft)
3848 {
3849 return mem_cgroup_from_css(css)->move_charge_at_immigrate;
3850 }
3851
3852 #ifdef CONFIG_MMU
3853 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
3854 struct cftype *cft, u64 val)
3855 {
3856 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3857
3858 if (val & ~MOVE_MASK)
3859 return -EINVAL;
3860
3861 /*
3862  * No kind of locking is needed in here, because ->can_attach() will
3863  * check this value once in the beginning of the process, and then
3864  * carry on with stale data. This means that changes to this value
3865  * will only affect task migrations starting after the change.
3866  */
3867 memcg->move_charge_at_immigrate = val;
3868 return 0;
3869 }
3870 #else
3871 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
3872 struct cftype *cft, u64 val)
3873 {
3874 return -ENOSYS;
3875 }
3876 #endif
3877
3878 #ifdef CONFIG_NUMA
3879
3880 #define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE))
3881 #define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON))
3882 #define LRU_ALL ((1 << NR_LRU_LISTS) - 1)
3883
3884 static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
3885 int nid, unsigned int lru_mask, bool tree)
3886 {
3887 struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
3888 unsigned long nr = 0;
3889 enum lru_list lru;
3890
3891 VM_BUG_ON((unsigned)nid >= nr_node_ids);
3892
3893 for_each_lru(lru) {
3894 if (!(BIT(lru) & lru_mask))
3895 continue;
3896 if (tree)
3897 nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru);
3898 else
3899 nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru);
3900 }
3901 return nr;
3902 }
3903
3904 static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
3905 unsigned int lru_mask,
3906 bool tree)
3907 {
3908 unsigned long nr = 0;
3909 enum lru_list lru;
3910
3911 for_each_lru(lru) {
3912 if (!(BIT(lru) & lru_mask))
3913 continue;
3914 if (tree)
3915 nr += memcg_page_state(memcg, NR_LRU_BASE + lru);
3916 else
3917 nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru);
3918 }
3919 return nr;
3920 }
3921
3922 static int memcg_numa_stat_show(struct seq_file *m, void *v)
3923 {
3924 struct numa_stat {
3925 const char *name;
3926 unsigned int lru_mask;
3927 };
3928
3929 static const struct numa_stat stats[] = {
3930 { "total", LRU_ALL },
3931 { "file", LRU_ALL_FILE },
3932 { "anon", LRU_ALL_ANON },
3933 { "unevictable", BIT(LRU_UNEVICTABLE) },
3934 };
3935 const struct numa_stat *stat;
3936 int nid;
3937 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
3938
3939 mem_cgroup_flush_stats();
3940
3941 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
3942 seq_printf(m, "%s=%lu", stat->name,
3943 mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
3944 false));
3945 for_each_node_state(nid, N_MEMORY)
3946 seq_printf(m, " N%d=%lu", nid,
3947 mem_cgroup_node_nr_lru_pages(memcg, nid,
3948 stat->lru_mask, false));
3949 seq_putc(m, '\n');
3950 }
3951
3952 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
3953
3954 seq_printf(m, "hierarchical_%s=%lu", stat->name,
3955 mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
3956 true));
3957 for_each_node_state(nid, N_MEMORY)
3958 seq_printf(m, " N%d=%lu", nid,
3959 mem_cgroup_node_nr_lru_pages(memcg, nid,
3960 stat->lru_mask, true));
3961 seq_putc(m, '\n');
3962 }
3963
3964 return 0;
3965 }
3966 #endif
3967
3968 static const unsigned int memcg1_stats[] = {
3969 NR_FILE_PAGES,
3970 NR_ANON_MAPPED,
3971 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
3972 NR_ANON_THPS,
3973 #endif
3974 NR_SHMEM,
3975 NR_FILE_MAPPED,
3976 NR_FILE_DIRTY,
3977 NR_WRITEBACK,
3978 MEMCG_SWAP,
3979 };
3980
3981 static const char *const memcg1_stat_names[] = {
3982 "cache",
3983 "rss",
3984 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
3985 "rss_huge",
3986 #endif
3987 "shmem",
3988 "mapped_file",
3989 "dirty",
3990 "writeback",
3991 "swap",
3992 };
3993
3994
3995 static const unsigned int memcg1_events[] = {
3996 PGPGIN,
3997 PGPGOUT,
3998 PGFAULT,
3999 PGMAJFAULT,
4000 };
4001
4002 static int memcg_stat_show(struct seq_file *m, void *v)
4003 {
4004 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
4005 unsigned long memory, memsw;
4006 struct mem_cgroup *mi;
4007 unsigned int i;
4008
4009 BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
4010
4011 mem_cgroup_flush_stats();
4012
4013 for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
4014 unsigned long nr;
4015
4016 if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
4017 continue;
4018 nr = memcg_page_state_local(memcg, memcg1_stats[i]);
4019 seq_printf(m, "%s %lu\n", memcg1_stat_names[i], nr * PAGE_SIZE);
4020 }
4021
4022 for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
4023 seq_printf(m, "%s %lu\n", vm_event_name(memcg1_events[i]),
4024 memcg_events_local(memcg, memcg1_events[i]));
4025
4026 for (i = 0; i < NR_LRU_LISTS; i++)
4027 seq_printf(m, "%s %lu\n", lru_list_name(i),
4028 memcg_page_state_local(memcg, NR_LRU_BASE + i) *
4029 PAGE_SIZE);
4030
4031
4032 memory = memsw = PAGE_COUNTER_MAX;
4033 for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) {
4034 memory = min(memory, READ_ONCE(mi->memory.max));
4035 memsw = min(memsw, READ_ONCE(mi->memsw.max));
4036 }
4037 seq_printf(m, "hierarchical_memory_limit %llu\n",
4038 (u64)memory * PAGE_SIZE);
4039 if (do_memsw_account())
4040 seq_printf(m, "hierarchical_memsw_limit %llu\n",
4041 (u64)memsw * PAGE_SIZE);
4042
4043 for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
4044 unsigned long nr;
4045
4046 if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
4047 continue;
4048 nr = memcg_page_state(memcg, memcg1_stats[i]);
4049 seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i],
4050 (u64)nr * PAGE_SIZE);
4051 }
4052
4053 for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
4054 seq_printf(m, "total_%s %llu\n",
4055 vm_event_name(memcg1_events[i]),
4056 (u64)memcg_events(memcg, memcg1_events[i]));
4057
4058 for (i = 0; i < NR_LRU_LISTS; i++)
4059 seq_printf(m, "total_%s %llu\n", lru_list_name(i),
4060 (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
4061 PAGE_SIZE);
4062
4063 #ifdef CONFIG_DEBUG_VM
4064 {
4065 pg_data_t *pgdat;
4066 struct mem_cgroup_per_node *mz;
4067 unsigned long anon_cost = 0;
4068 unsigned long file_cost = 0;
4069
4070 for_each_online_pgdat(pgdat) {
4071 mz = memcg->nodeinfo[pgdat->node_id];
4072
4073 anon_cost += mz->lruvec.anon_cost;
4074 file_cost += mz->lruvec.file_cost;
4075 }
4076 seq_printf(m, "anon_cost %lu\n", anon_cost);
4077 seq_printf(m, "file_cost %lu\n", file_cost);
4078 }
4079 #endif
4080
4081 return 0;
4082 }
4083
4084 static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css,
4085 struct cftype *cft)
4086 {
4087 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4088
4089 return mem_cgroup_swappiness(memcg);
4090 }
4091
4092 static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
4093 struct cftype *cft, u64 val)
4094 {
4095 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4096
4097 if (val > 200)
4098 return -EINVAL;
4099
4100 if (!mem_cgroup_is_root(memcg))
4101 memcg->swappiness = val;
4102 else
4103 vm_swappiness = val;
4104
4105 return 0;
4106 }
4107
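/*
 * Compare current usage against the sorted threshold array, signal the
 * eventfd of every threshold crossed since the last check, and update
 * current_threshold.
 */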
4108 static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
4109 {
4110 struct mem_cgroup_threshold_ary *t;
4111 unsigned long usage;
4112 int i;
4113
4114 rcu_read_lock();
4115 if (!swap)
4116 t = rcu_dereference(memcg->thresholds.primary);
4117 else
4118 t = rcu_dereference(memcg->memsw_thresholds.primary);
4119
4120 if (!t)
4121 goto unlock;
4122
4123 usage = mem_cgroup_usage(memcg, swap);
4124
4125 /*
4126  * current_threshold points to threshold just below or equal to usage.
4127  * If it's not true, a threshold was crossed after last
4128  * call of __mem_cgroup_threshold().
4129  */
4130 i = t->current_threshold;
4131
4132 /*
4133  * Iterate backward over array of thresholds starting from
4134  * current_threshold and check if a threshold is crossed.
4135  * If none of thresholds below usage is crossed, we read
4136  * only one element of the array here.
4137  */
4138 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
4139 eventfd_signal(t->entries[i].eventfd, 1);
4140
4141
4142 i++;
4143
4144 /*
4145  * Iterate forward over array of thresholds starting from
4146  * current_threshold+1 and check if a threshold is crossed.
4147  * If none of thresholds above usage is crossed, we read
4148  * only one element of the array here.
4149  */
4150 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
4151 eventfd_signal(t->entries[i].eventfd, 1);
4152
4153
4154 t->current_threshold = i - 1;
4155 unlock:
4156 rcu_read_unlock();
4157 }
4158
4159 static void mem_cgroup_threshold(struct mem_cgroup *memcg)
4160 {
4161 while (memcg) {
4162 __mem_cgroup_threshold(memcg, false);
4163 if (do_memsw_account())
4164 __mem_cgroup_threshold(memcg, true);
4165
4166 memcg = parent_mem_cgroup(memcg);
4167 }
4168 }
4169
4170 static int compare_thresholds(const void *a, const void *b)
4171 {
4172 const struct mem_cgroup_threshold *_a = a;
4173 const struct mem_cgroup_threshold *_b = b;
4174
4175 if (_a->threshold > _b->threshold)
4176 return 1;
4177
4178 if (_a->threshold < _b->threshold)
4179 return -1;
4180
4181 return 0;
4182 }
4183
4184 static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
4185 {
4186 struct mem_cgroup_eventfd_list *ev;
4187
4188 spin_lock(&memcg_oom_lock);
4189
4190 list_for_each_entry(ev, &memcg->oom_notify, list)
4191 eventfd_signal(ev->eventfd, 1);
4192
4193 spin_unlock(&memcg_oom_lock);
4194 return 0;
4195 }
4196
4197 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
4198 {
4199 struct mem_cgroup *iter;
4200
4201 for_each_mem_cgroup_tree(iter, memcg)
4202 mem_cgroup_oom_notify_cb(iter);
4203 }
4204
4205 static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
4206 struct eventfd_ctx *eventfd, const char *args, enum res_type type)
4207 {
4208 struct mem_cgroup_thresholds *thresholds;
4209 struct mem_cgroup_threshold_ary *new;
4210 unsigned long threshold;
4211 unsigned long usage;
4212 int i, size, ret;
4213
4214 ret = page_counter_memparse(args, "-1", &threshold);
4215 if (ret)
4216 return ret;
4217
4218 mutex_lock(&memcg->thresholds_lock);
4219
4220 if (type == _MEM) {
4221 thresholds = &memcg->thresholds;
4222 usage = mem_cgroup_usage(memcg, false);
4223 } else if (type == _MEMSWAP) {
4224 thresholds = &memcg->memsw_thresholds;
4225 usage = mem_cgroup_usage(memcg, true);
4226 } else
4227 BUG();
4228
4229
4230 if (thresholds->primary)
4231 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
4232
4233 size = thresholds->primary ? thresholds->primary->size + 1 : 1;
4234
4235
4236 new = kmalloc(struct_size(new, entries, size), GFP_KERNEL);
4237 if (!new) {
4238 ret = -ENOMEM;
4239 goto unlock;
4240 }
4241 new->size = size;
4242
4243
4244 if (thresholds->primary)
4245 memcpy(new->entries, thresholds->primary->entries,
4246 flex_array_size(new, entries, size - 1));
4247
4248
4249 new->entries[size - 1].eventfd = eventfd;
4250 new->entries[size - 1].threshold = threshold;
4251
4252
4253 sort(new->entries, size, sizeof(*new->entries),
4254 compare_thresholds, NULL);
4255
4256
4257 new->current_threshold = -1;
4258 for (i = 0; i < size; i++) {
4259 if (new->entries[i].threshold <= usage) {
4260
4261
4262
4263
4264
4265 ++new->current_threshold;
4266 } else
4267 break;
4268 }
4269
4270
4271 kfree(thresholds->spare);
4272 thresholds->spare = thresholds->primary;
4273
4274 rcu_assign_pointer(thresholds->primary, new);
4275
4276
4277 synchronize_rcu();
4278
4279 unlock:
4280 mutex_unlock(&memcg->thresholds_lock);
4281
4282 return ret;
4283 }
4284
4285 static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
4286 struct eventfd_ctx *eventfd, const char *args)
4287 {
4288 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
4289 }
4290
4291 static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
4292 struct eventfd_ctx *eventfd, const char *args)
4293 {
4294 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
4295 }
4296
4297 static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
4298 struct eventfd_ctx *eventfd, enum res_type type)
4299 {
4300 struct mem_cgroup_thresholds *thresholds;
4301 struct mem_cgroup_threshold_ary *new;
4302 unsigned long usage;
4303 int i, j, size, entries;
4304
4305 mutex_lock(&memcg->thresholds_lock);
4306
4307 if (type == _MEM) {
4308 thresholds = &memcg->thresholds;
4309 usage = mem_cgroup_usage(memcg, false);
4310 } else if (type == _MEMSWAP) {
4311 thresholds = &memcg->memsw_thresholds;
4312 usage = mem_cgroup_usage(memcg, true);
4313 } else
4314 BUG();
4315
4316 if (!thresholds->primary)
4317 goto unlock;
4318
4319
4320 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
4321
4322
4323 size = entries = 0;
4324 for (i = 0; i < thresholds->primary->size; i++) {
4325 if (thresholds->primary->entries[i].eventfd != eventfd)
4326 size++;
4327 else
4328 entries++;
4329 }
4330
4331 new = thresholds->spare;
4332
4333
4334 if (!entries)
4335 goto unlock;
4336
4337
4338 if (!size) {
4339 kfree(new);
4340 new = NULL;
4341 goto swap_buffers;
4342 }
4343
4344 new->size = size;
4345
4346
4347 new->current_threshold = -1;
4348 for (i = 0, j = 0; i < thresholds->primary->size; i++) {
4349 if (thresholds->primary->entries[i].eventfd == eventfd)
4350 continue;
4351
4352 new->entries[j] = thresholds->primary->entries[i];
4353 if (new->entries[j].threshold <= usage) {
4354
4355
4356
4357
4358
4359 ++new->current_threshold;
4360 }
4361 j++;
4362 }
4363
4364 swap_buffers:
4365
4366 thresholds->spare = thresholds->primary;
4367
4368 rcu_assign_pointer(thresholds->primary, new);
4369
4370
4371 synchronize_rcu();
4372
4373
4374 if (!new) {
4375 kfree(thresholds->spare);
4376 thresholds->spare = NULL;
4377 }
4378 unlock:
4379 mutex_unlock(&memcg->thresholds_lock);
4380 }
4381
4382 static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
4383 struct eventfd_ctx *eventfd)
4384 {
4385 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
4386 }
4387
4388 static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
4389 struct eventfd_ctx *eventfd)
4390 {
4391 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
4392 }
4393
4394 static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
4395 struct eventfd_ctx *eventfd, const char *args)
4396 {
4397 struct mem_cgroup_eventfd_list *event;
4398
4399 event = kmalloc(sizeof(*event), GFP_KERNEL);
4400 if (!event)
4401 return -ENOMEM;
4402
4403 spin_lock(&memcg_oom_lock);
4404
4405 event->eventfd = eventfd;
4406 list_add(&event->list, &memcg->oom_notify);
4407
4408
4409 if (memcg->under_oom)
4410 eventfd_signal(eventfd, 1);
4411 spin_unlock(&memcg_oom_lock);
4412
4413 return 0;
4414 }
4415
4416 static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
4417 struct eventfd_ctx *eventfd)
4418 {
4419 struct mem_cgroup_eventfd_list *ev, *tmp;
4420
4421 spin_lock(&memcg_oom_lock);
4422
4423 list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
4424 if (ev->eventfd == eventfd) {
4425 list_del(&ev->list);
4426 kfree(ev);
4427 }
4428 }
4429
4430 spin_unlock(&memcg_oom_lock);
4431 }
4432
4433 static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
4434 {
4435 struct mem_cgroup *memcg = mem_cgroup_from_seq(sf);
4436
4437 seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
4438 seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom);
4439 seq_printf(sf, "oom_kill %lu\n",
4440 atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL]));
4441 return 0;
4442 }
4443
4444 static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
4445 struct cftype *cft, u64 val)
4446 {
4447 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4448
4449 /* cannot set to root cgroup and only 0 and 1 are allowed */
4450 if (mem_cgroup_is_root(memcg) || !((val == 0) || (val == 1)))
4451 return -EINVAL;
4452
4453 memcg->oom_kill_disable = val;
4454 if (!val)
4455 memcg_oom_recover(memcg);
4456
4457 return 0;
4458 }
4459
4460 #ifdef CONFIG_CGROUP_WRITEBACK
4461
4462 #include <trace/events/writeback.h>
4463
4464 static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
4465 {
4466 return wb_domain_init(&memcg->cgwb_domain, gfp);
4467 }
4468
4469 static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
4470 {
4471 wb_domain_exit(&memcg->cgwb_domain);
4472 }
4473
4474 static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
4475 {
4476 wb_domain_size_changed(&memcg->cgwb_domain);
4477 }
4478
4479 struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
4480 {
4481 struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
4482
4483 if (!memcg->css.parent)
4484 return NULL;
4485
4486 return &memcg->cgwb_domain;
4487 }
4488
4489 /**
4490  * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg
4491  * @wb: bdi_writeback in question
4492  * @pfilepages: out parameter for number of file pages
4493  * @pheadroom: out parameter for number of allocatable pages according to memcg
4494  * @pdirty: out parameter for number of dirty pages
4495  * @pwriteback: out parameter for number of pages under writeback
4496  *
4497  * Determine the numbers of file, headroom, dirty, and writeback pages in
4498  * @wb's memcg.  File, dirty and writeback are self-explanatory.  Headroom
4499  * is a bit more involved.
4500  *
4501  * A memcg's headroom is "min(max, high) - used".  In the hierarchy, the
4502  * headroom is calculated as the lowest headroom of itself and the
4503  * ancestors.  Note that this doesn't consider the actual amount of
4504  * available memory in the system.  The caller should further cap
4505  * *@pheadroom accordingly.
4506  */
4507 void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
4508 unsigned long *pheadroom, unsigned long *pdirty,
4509 unsigned long *pwriteback)
4510 {
4511 struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
4512 struct mem_cgroup *parent;
4513
4514 mem_cgroup_flush_stats();
4515
4516 *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY);
4517 *pwriteback = memcg_page_state(memcg, NR_WRITEBACK);
4518 *pfilepages = memcg_page_state(memcg, NR_INACTIVE_FILE) +
4519 memcg_page_state(memcg, NR_ACTIVE_FILE);
4520
4521 *pheadroom = PAGE_COUNTER_MAX;
4522 while ((parent = parent_mem_cgroup(memcg))) {
4523 unsigned long ceiling = min(READ_ONCE(memcg->memory.max),
4524 READ_ONCE(memcg->memory.high));
4525 unsigned long used = page_counter_read(&memcg->memory);
4526
4527 *pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
4528 memcg = parent;
4529 }
4530 }
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576 void mem_cgroup_track_foreign_dirty_slowpath(struct folio *folio,
4577 struct bdi_writeback *wb)
4578 {
4579 struct mem_cgroup *memcg = folio_memcg(folio);
4580 struct memcg_cgwb_frn *frn;
4581 u64 now = get_jiffies_64();
4582 u64 oldest_at = now;
4583 int oldest = -1;
4584 int i;
4585
4586 trace_track_foreign_dirty(folio, wb);
4587
4588 /*
4589  * Pick the slot to use.  If there is already a slot for @wb, keep
4590  * using it.  If not, replace the oldest one which isn't being
4591  * written out.
4592  */
4593 for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
4594 frn = &memcg->cgwb_frn[i];
4595 if (frn->bdi_id == wb->bdi->id &&
4596 frn->memcg_id == wb->memcg_css->id)
4597 break;
4598 if (time_before64(frn->at, oldest_at) &&
4599 atomic_read(&frn->done.cnt) == 1) {
4600 oldest = i;
4601 oldest_at = frn->at;
4602 }
4603 }
4604
4605 if (i < MEMCG_CGWB_FRN_CNT) {
4606
4607
4608
4609
4610
4611
4612
4613 unsigned long update_intv =
4614 min_t(unsigned long, HZ,
4615 msecs_to_jiffies(dirty_expire_interval * 10) / 8);
4616
4617 if (time_before64(frn->at, now - update_intv))
4618 frn->at = now;
4619 } else if (oldest >= 0) {
4620
4621 frn = &memcg->cgwb_frn[oldest];
4622 frn->bdi_id = wb->bdi->id;
4623 frn->memcg_id = wb->memcg_css->id;
4624 frn->at = now;
4625 }
4626 }
4627
4628
4629 void mem_cgroup_flush_foreign(struct bdi_writeback *wb)
4630 {
4631 struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
4632 unsigned long intv = msecs_to_jiffies(dirty_expire_interval * 10);
4633 u64 now = jiffies_64;
4634 int i;
4635
4636 for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
4637 struct memcg_cgwb_frn *frn = &memcg->cgwb_frn[i];
4638
4639 /*
4640  * Only flush records that are still within the dirty expire
4641  * interval: older records have had writeback kicked off
4642  * already.  Also, don't start a new flush if there's one
4643  * already in flight (done.cnt != 1).
4644  */
4645 if (time_after64(frn->at, now - intv) &&
4646 atomic_read(&frn->done.cnt) == 1) {
4647 frn->at = 0;
4648 trace_flush_foreign(wb, frn->bdi_id, frn->memcg_id);
4649 cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id,
4650 WB_REASON_FOREIGN_FLUSH,
4651 &frn->done);
4652 }
4653 }
4654 }
4655
4656 #else
4657
4658 static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
4659 {
4660 return 0;
4661 }
4662
4663 static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
4664 {
4665 }
4666
4667 static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
4668 {
4669 }
4670
4671 #endif
4672
4673 /*
4674  * cgroup v1 "cgroup.event_control" implementation.
4675  *
4676  * Allows userspace to bind an eventfd to a control file (usage thresholds,
4677  * oom_control, pressure_level, memsw usage thresholds) and get notified
4678  * through it.  This interface is specific to the legacy hierarchy and is
4679  * not available on cgroup v2.
4680  */
4681
4682
4683
4684
4685
4686 /*
4687  * Unregister event and free resources.
4688  *
4689  * Gets called from workqueue.
4690  */
4691 static void memcg_event_remove(struct work_struct *work)
4692 {
4693 struct mem_cgroup_event *event =
4694 container_of(work, struct mem_cgroup_event, remove);
4695 struct mem_cgroup *memcg = event->memcg;
4696
4697 remove_wait_queue(event->wqh, &event->wait);
4698
4699 event->unregister_event(memcg, event->eventfd);
4700
4701
4702 eventfd_signal(event->eventfd, 1);
4703
4704 eventfd_ctx_put(event->eventfd);
4705 kfree(event);
4706 css_put(&memcg->css);
4707 }
4708
4709 /*
4710  * Gets called on EPOLLHUP on eventfd when user closes it.
4711  *
4712  * Called with wqh->lock held and interrupts disabled.
4713  */
4714 static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode,
4715 int sync, void *key)
4716 {
4717 struct mem_cgroup_event *event =
4718 container_of(wait, struct mem_cgroup_event, wait);
4719 struct mem_cgroup *memcg = event->memcg;
4720 __poll_t flags = key_to_poll(key);
4721
4722 if (flags & EPOLLHUP) {
4723 /*
4724  * If the event has been detached at cgroup removal, we
4725  * can simply return knowing the other side will cleanup
4726  * for us.
4727  *
4728  * We can't race against event freeing since the other
4729  * side will require wqh->lock via remove_wait_queue(),
4730  * which we hold.
4731  */
4732 spin_lock(&memcg->event_list_lock);
4733 if (!list_empty(&event->list)) {
4734 list_del_init(&event->list);
4735
4736
4737
4738
4739 schedule_work(&event->remove);
4740 }
4741 spin_unlock(&memcg->event_list_lock);
4742 }
4743
4744 return 0;
4745 }
4746
4747 static void memcg_event_ptable_queue_proc(struct file *file,
4748 wait_queue_head_t *wqh, poll_table *pt)
4749 {
4750 struct mem_cgroup_event *event =
4751 container_of(pt, struct mem_cgroup_event, pt);
4752
4753 event->wqh = wqh;
4754 add_wait_queue(wqh, &event->wait);
4755 }
4756
4757 /*
4758  * Parse input and register a new cgroup event handler.
4759  *
4760  * Input must be in format '<event_fd> <control_fd> <args>'.
4761  * Interpretation of args is defined by control file implementation.
4762  *
4763  * Legacy (cgroup v1) interface only; do not use in new files.
4764  */
4765 static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
4766 char *buf, size_t nbytes, loff_t off)
4767 {
4768 struct cgroup_subsys_state *css = of_css(of);
4769 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4770 struct mem_cgroup_event *event;
4771 struct cgroup_subsys_state *cfile_css;
4772 unsigned int efd, cfd;
4773 struct fd efile;
4774 struct fd cfile;
4775 const char *name;
4776 char *endp;
4777 int ret;
4778
4779 if (IS_ENABLED(CONFIG_PREEMPT_RT))
4780 return -EOPNOTSUPP;
4781
4782 buf = strstrip(buf);
4783
4784 efd = simple_strtoul(buf, &endp, 10);
4785 if (*endp != ' ')
4786 return -EINVAL;
4787 buf = endp + 1;
4788
4789 cfd = simple_strtoul(buf, &endp, 10);
4790 if ((*endp != ' ') && (*endp != '\0'))
4791 return -EINVAL;
4792 buf = endp + 1;
4793
4794 event = kzalloc(sizeof(*event), GFP_KERNEL);
4795 if (!event)
4796 return -ENOMEM;
4797
4798 event->memcg = memcg;
4799 INIT_LIST_HEAD(&event->list);
4800 init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
4801 init_waitqueue_func_entry(&event->wait, memcg_event_wake);
4802 INIT_WORK(&event->remove, memcg_event_remove);
4803
4804 efile = fdget(efd);
4805 if (!efile.file) {
4806 ret = -EBADF;
4807 goto out_kfree;
4808 }
4809
4810 event->eventfd = eventfd_ctx_fileget(efile.file);
4811 if (IS_ERR(event->eventfd)) {
4812 ret = PTR_ERR(event->eventfd);
4813 goto out_put_efile;
4814 }
4815
4816 cfile = fdget(cfd);
4817 if (!cfile.file) {
4818 ret = -EBADF;
4819 goto out_put_eventfd;
4820 }
4821
4822
4823
4824 ret = file_permission(cfile.file, MAY_READ);
4825 if (ret < 0)
4826 goto out_put_cfile;
4827
4828 /*
4829  * Determine the event callbacks and set them in @event.  This used
4830  * to be done via struct cftype but cgroup core no longer knows
4831  * about these events.  The following is crude but the whole thing
4832  * is for compatibility anyway.
4833  *
4834  * DO NOT ADD NEW FILES.
4835  */
4836 name = cfile.file->f_path.dentry->d_name.name;
4837
4838 if (!strcmp(name, "memory.usage_in_bytes")) {
4839 event->register_event = mem_cgroup_usage_register_event;
4840 event->unregister_event = mem_cgroup_usage_unregister_event;
4841 } else if (!strcmp(name, "memory.oom_control")) {
4842 event->register_event = mem_cgroup_oom_register_event;
4843 event->unregister_event = mem_cgroup_oom_unregister_event;
4844 } else if (!strcmp(name, "memory.pressure_level")) {
4845 event->register_event = vmpressure_register_event;
4846 event->unregister_event = vmpressure_unregister_event;
4847 } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
4848 event->register_event = memsw_cgroup_usage_register_event;
4849 event->unregister_event = memsw_cgroup_usage_unregister_event;
4850 } else {
4851 ret = -EINVAL;
4852 goto out_put_cfile;
4853 }
4854
4855 /*
4856  * Verify @cfile should belong to @css.  Also, remaining events are
4857  * automatically removed on cgroup destruction but the removal is
4858  * asynchronous, so take an extra ref on @css.
4859  */
4860 cfile_css = css_tryget_online_from_dir(cfile.file->f_path.dentry->d_parent,
4861 &memory_cgrp_subsys);
4862 ret = -EINVAL;
4863 if (IS_ERR(cfile_css))
4864 goto out_put_cfile;
4865 if (cfile_css != css) {
4866 css_put(cfile_css);
4867 goto out_put_cfile;
4868 }
4869
4870 ret = event->register_event(memcg, event->eventfd, buf);
4871 if (ret)
4872 goto out_put_css;
4873
4874 vfs_poll(efile.file, &event->pt);
4875
4876 spin_lock_irq(&memcg->event_list_lock);
4877 list_add(&event->list, &memcg->event_list);
4878 spin_unlock_irq(&memcg->event_list_lock);
4879
4880 fdput(cfile);
4881 fdput(efile);
4882
4883 return nbytes;
4884
4885 out_put_css:
4886 css_put(css);
4887 out_put_cfile:
4888 fdput(cfile);
4889 out_put_eventfd:
4890 eventfd_ctx_put(event->eventfd);
4891 out_put_efile:
4892 fdput(efile);
4893 out_kfree:
4894 kfree(event);
4895
4896 return ret;
4897 }
4898
4899 #if defined(CONFIG_MEMCG_KMEM) && (defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG))
4900 static int mem_cgroup_slab_show(struct seq_file *m, void *p)
4901 {
4902 /*
4903  * Deprecated.
4904  * Please, take a look at tools/cgroup/memcg_slabinfo.py .
4905  */
4906 return 0;
4907 }
4908 #endif
4909
4910 static struct cftype mem_cgroup_legacy_files[] = {
4911 {
4912 .name = "usage_in_bytes",
4913 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
4914 .read_u64 = mem_cgroup_read_u64,
4915 },
4916 {
4917 .name = "max_usage_in_bytes",
4918 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
4919 .write = mem_cgroup_reset,
4920 .read_u64 = mem_cgroup_read_u64,
4921 },
4922 {
4923 .name = "limit_in_bytes",
4924 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
4925 .write = mem_cgroup_write,
4926 .read_u64 = mem_cgroup_read_u64,
4927 },
4928 {
4929 .name = "soft_limit_in_bytes",
4930 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
4931 .write = mem_cgroup_write,
4932 .read_u64 = mem_cgroup_read_u64,
4933 },
4934 {
4935 .name = "failcnt",
4936 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
4937 .write = mem_cgroup_reset,
4938 .read_u64 = mem_cgroup_read_u64,
4939 },
4940 {
4941 .name = "stat",
4942 .seq_show = memcg_stat_show,
4943 },
4944 {
4945 .name = "force_empty",
4946 .write = mem_cgroup_force_empty_write,
4947 },
4948 {
4949 .name = "use_hierarchy",
4950 .write_u64 = mem_cgroup_hierarchy_write,
4951 .read_u64 = mem_cgroup_hierarchy_read,
4952 },
4953 {
4954 .name = "cgroup.event_control",
4955 .write = memcg_write_event_control,
4956 .flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE,
4957 },
4958 {
4959 .name = "swappiness",
4960 .read_u64 = mem_cgroup_swappiness_read,
4961 .write_u64 = mem_cgroup_swappiness_write,
4962 },
4963 {
4964 .name = "move_charge_at_immigrate",
4965 .read_u64 = mem_cgroup_move_charge_read,
4966 .write_u64 = mem_cgroup_move_charge_write,
4967 },
4968 {
4969 .name = "oom_control",
4970 .seq_show = mem_cgroup_oom_control_read,
4971 .write_u64 = mem_cgroup_oom_control_write,
4972 },
4973 {
4974 .name = "pressure_level",
4975 },
4976 #ifdef CONFIG_NUMA
4977 {
4978 .name = "numa_stat",
4979 .seq_show = memcg_numa_stat_show,
4980 },
4981 #endif
4982 {
4983 .name = "kmem.limit_in_bytes",
4984 .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
4985 .write = mem_cgroup_write,
4986 .read_u64 = mem_cgroup_read_u64,
4987 },
4988 {
4989 .name = "kmem.usage_in_bytes",
4990 .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
4991 .read_u64 = mem_cgroup_read_u64,
4992 },
4993 {
4994 .name = "kmem.failcnt",
4995 .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
4996 .write = mem_cgroup_reset,
4997 .read_u64 = mem_cgroup_read_u64,
4998 },
4999 {
5000 .name = "kmem.max_usage_in_bytes",
5001 .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
5002 .write = mem_cgroup_reset,
5003 .read_u64 = mem_cgroup_read_u64,
5004 },
5005 #if defined(CONFIG_MEMCG_KMEM) && \
5006 (defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG))
5007 {
5008 .name = "kmem.slabinfo",
5009 .seq_show = mem_cgroup_slab_show,
5010 },
5011 #endif
5012 {
5013 .name = "kmem.tcp.limit_in_bytes",
5014 .private = MEMFILE_PRIVATE(_TCP, RES_LIMIT),
5015 .write = mem_cgroup_write,
5016 .read_u64 = mem_cgroup_read_u64,
5017 },
5018 {
5019 .name = "kmem.tcp.usage_in_bytes",
5020 .private = MEMFILE_PRIVATE(_TCP, RES_USAGE),
5021 .read_u64 = mem_cgroup_read_u64,
5022 },
5023 {
5024 .name = "kmem.tcp.failcnt",
5025 .private = MEMFILE_PRIVATE(_TCP, RES_FAILCNT),
5026 .write = mem_cgroup_reset,
5027 .read_u64 = mem_cgroup_read_u64,
5028 },
5029 {
5030 .name = "kmem.tcp.max_usage_in_bytes",
5031 .private = MEMFILE_PRIVATE(_TCP, RES_MAX_USAGE),
5032 .write = mem_cgroup_reset,
5033 .read_u64 = mem_cgroup_read_u64,
5034 },
5035 { },
5036 };
5037
5038
5039 /*
5040  * Private memory cgroup IDR
5041  *
5042  * Swap-out records and page cache shadow entries need to store memcg
5043  * references in constrained space, so we maintain an ID space that is
5044  * limited to 16 bit (MEM_CGROUP_ID_MAX), limiting the total number of
5045  * memory-controlled cgroups to 64k.
5046  *
5047  * However, there usually are many references to the offline CSS after
5048  * the cgroup has been destroyed, such as page cache or reclaimable
5049  * slab objects, that don't need to hang on to the ID. We want to keep
5050  * those dead CSS from occupying IDs, or we might quickly exhaust the
5051  * limited ID space and prevent the creation of new cgroups even when
5052  * there are much fewer than 64k cgroups - possibly none.
5053  *
5054  * Maintain a private 16-bit ID space for memcg, and allow the ID to
5055  * be freed and recycled when it's no longer needed, which is usually
5056  * when the CSS is offlined.
5057  *
5058  * The only exception to that are records of swapped out tmpfs/shmem
5059  * pages that need to be attributed to live ancestors on swapin. But
5060  * those references are manageable from userspace.
5061  */
5062 static DEFINE_IDR(mem_cgroup_idr);
5063
5064 static void mem_cgroup_id_remove(struct mem_cgroup *memcg)
5065 {
5066 if (memcg->id.id > 0) {
5067 idr_remove(&mem_cgroup_idr, memcg->id.id);
5068 memcg->id.id = 0;
5069 }
5070 }
5071
5072 static void __maybe_unused mem_cgroup_id_get_many(struct mem_cgroup *memcg,
5073 unsigned int n)
5074 {
5075 refcount_add(n, &memcg->id.ref);
5076 }
5077
5078 static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n)
5079 {
5080 if (refcount_sub_and_test(n, &memcg->id.ref)) {
5081 mem_cgroup_id_remove(memcg);
5082
5083 /* Memcg ID pins CSS */
5084 css_put(&memcg->css);
5085 }
5086 }
5087
5088 static inline void mem_cgroup_id_put(struct mem_cgroup *memcg)
5089 {
5090 mem_cgroup_id_put_many(memcg, 1);
5091 }
5092
5093 /**
5094  * mem_cgroup_from_id - look up a memcg from a memcg id
5095  * @id: the memcg id to look up
5096  *
5097  * Caller must hold rcu_read_lock().
5098  */
5099 struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
5100 {
5101 WARN_ON_ONCE(!rcu_read_lock_held());
5102 return idr_find(&mem_cgroup_idr, id);
5103 }
5104
5105 #ifdef CONFIG_SHRINKER_DEBUG
5106 struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino)
5107 {
5108 struct cgroup *cgrp;
5109 struct cgroup_subsys_state *css;
5110 struct mem_cgroup *memcg;
5111
5112 cgrp = cgroup_get_from_id(ino);
5113 if (!cgrp)
5114 return ERR_PTR(-ENOENT);
5115
5116 css = cgroup_get_e_css(cgrp, &memory_cgrp_subsys);
5117 if (css)
5118 memcg = container_of(css, struct mem_cgroup, css);
5119 else
5120 memcg = ERR_PTR(-ENOENT);
5121
5122 cgroup_put(cgrp);
5123
5124 return memcg;
5125 }
5126 #endif
5127
5128 static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
5129 {
5130 struct mem_cgroup_per_node *pn;
5131
5132 pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, node);
5133 if (!pn)
5134 return 1;
5135
5136 pn->lruvec_stats_percpu = alloc_percpu_gfp(struct lruvec_stats_percpu,
5137 GFP_KERNEL_ACCOUNT);
5138 if (!pn->lruvec_stats_percpu) {
5139 kfree(pn);
5140 return 1;
5141 }
5142
5143 lruvec_init(&pn->lruvec);
5144 pn->memcg = memcg;
5145
5146 memcg->nodeinfo[node] = pn;
5147 return 0;
5148 }
5149
5150 static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
5151 {
5152 struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
5153
5154 if (!pn)
5155 return;
5156
5157 free_percpu(pn->lruvec_stats_percpu);
5158 kfree(pn);
5159 }
5160
5161 static void __mem_cgroup_free(struct mem_cgroup *memcg)
5162 {
5163 int node;
5164
5165 for_each_node(node)
5166 free_mem_cgroup_per_node_info(memcg, node);
5167 free_percpu(memcg->vmstats_percpu);
5168 kfree(memcg);
5169 }
5170
5171 static void mem_cgroup_free(struct mem_cgroup *memcg)
5172 {
5173 memcg_wb_domain_exit(memcg);
5174 __mem_cgroup_free(memcg);
5175 }
5176
5177 static struct mem_cgroup *mem_cgroup_alloc(void)
5178 {
5179 struct mem_cgroup *memcg;
5180 int node;
5181 int __maybe_unused i;
5182 long error = -ENOMEM;
5183
5184 memcg = kzalloc(struct_size(memcg, nodeinfo, nr_node_ids), GFP_KERNEL);
5185 if (!memcg)
5186 return ERR_PTR(error);
5187
5188 memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL,
5189 1, MEM_CGROUP_ID_MAX + 1, GFP_KERNEL);
5190 if (memcg->id.id < 0) {
5191 error = memcg->id.id;
5192 goto fail;
5193 }
5194
5195 memcg->vmstats_percpu = alloc_percpu_gfp(struct memcg_vmstats_percpu,
5196 GFP_KERNEL_ACCOUNT);
5197 if (!memcg->vmstats_percpu)
5198 goto fail;
5199
5200 for_each_node(node)
5201 if (alloc_mem_cgroup_per_node_info(memcg, node))
5202 goto fail;
5203
5204 if (memcg_wb_domain_init(memcg, GFP_KERNEL))
5205 goto fail;
5206
5207 INIT_WORK(&memcg->high_work, high_work_func);
5208 INIT_LIST_HEAD(&memcg->oom_notify);
5209 mutex_init(&memcg->thresholds_lock);
5210 spin_lock_init(&memcg->move_lock);
5211 vmpressure_init(&memcg->vmpressure);
5212 INIT_LIST_HEAD(&memcg->event_list);
5213 spin_lock_init(&memcg->event_list_lock);
5214 memcg->socket_pressure = jiffies;
5215 #ifdef CONFIG_MEMCG_KMEM
5216 memcg->kmemcg_id = -1;
5217 INIT_LIST_HEAD(&memcg->objcg_list);
5218 #endif
5219 #ifdef CONFIG_CGROUP_WRITEBACK
5220 INIT_LIST_HEAD(&memcg->cgwb_list);
5221 for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
5222 memcg->cgwb_frn[i].done =
5223 __WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq);
5224 #endif
5225 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
5226 spin_lock_init(&memcg->deferred_split_queue.split_queue_lock);
5227 INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue);
5228 memcg->deferred_split_queue.split_queue_len = 0;
5229 #endif
5230 idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
5231 return memcg;
5232 fail:
5233 mem_cgroup_id_remove(memcg);
5234 __mem_cgroup_free(memcg);
5235 return ERR_PTR(error);
5236 }
5237
5238 static struct cgroup_subsys_state * __ref
5239 mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
5240 {
5241 struct mem_cgroup *parent = mem_cgroup_from_css(parent_css);
5242 struct mem_cgroup *memcg, *old_memcg;
5243
5244 old_memcg = set_active_memcg(parent);
5245 memcg = mem_cgroup_alloc();
5246 set_active_memcg(old_memcg);
5247 if (IS_ERR(memcg))
5248 return ERR_CAST(memcg);
5249
5250 page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
5251 memcg->soft_limit = PAGE_COUNTER_MAX;
5252 #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
5253 memcg->zswap_max = PAGE_COUNTER_MAX;
5254 #endif
5255 page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
5256 if (parent) {
5257 memcg->swappiness = mem_cgroup_swappiness(parent);
5258 memcg->oom_kill_disable = parent->oom_kill_disable;
5259
5260 page_counter_init(&memcg->memory, &parent->memory);
5261 page_counter_init(&memcg->swap, &parent->swap);
5262 page_counter_init(&memcg->kmem, &parent->kmem);
5263 page_counter_init(&memcg->tcpmem, &parent->tcpmem);
5264 } else {
5265 page_counter_init(&memcg->memory, NULL);
5266 page_counter_init(&memcg->swap, NULL);
5267 page_counter_init(&memcg->kmem, NULL);
5268 page_counter_init(&memcg->tcpmem, NULL);
5269
5270 root_mem_cgroup = memcg;
5271 return &memcg->css;
5272 }
5273
5274 if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
5275 static_branch_inc(&memcg_sockets_enabled_key);
5276
5277 return &memcg->css;
5278 }
5279
5280 static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
5281 {
5282 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5283
5284 if (memcg_online_kmem(memcg))
5285 goto remove_id;
5286
/*
 * A memcg must be visible for expand_shrinker_info() by the time the
 * shrinker maps are allocated, so allocate them here rather than at
 * css_alloc time.
 */
5292 if (alloc_shrinker_info(memcg))
5293 goto offline_kmem;
5294
/* Online state pins memcg ID, memcg ID pins CSS */
5296 refcount_set(&memcg->id.ref, 1);
5297 css_get(css);
5298
5299 if (unlikely(mem_cgroup_is_root(memcg)))
5300 queue_delayed_work(system_unbound_wq, &stats_flush_dwork,
5301 2UL*HZ);
5302 return 0;
5303 offline_kmem:
5304 memcg_offline_kmem(memcg);
5305 remove_id:
5306 mem_cgroup_id_remove(memcg);
5307 return -ENOMEM;
5308 }
5309
5310 static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
5311 {
5312 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5313 struct mem_cgroup_event *event, *tmp;
5314
/*
 * Unregister events and notify userspace.  Notifying userspace about
 * cgroup removal only after rmdir of the cgroup directory avoids races
 * between userspace and kernelspace.
 */
5320 spin_lock_irq(&memcg->event_list_lock);
5321 list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
5322 list_del_init(&event->list);
5323 schedule_work(&event->remove);
5324 }
5325 spin_unlock_irq(&memcg->event_list_lock);
5326
5327 page_counter_set_min(&memcg->memory, 0);
5328 page_counter_set_low(&memcg->memory, 0);
5329
5330 memcg_offline_kmem(memcg);
5331 reparent_shrinker_deferred(memcg);
5332 wb_memcg_offline(memcg);
5333
5334 drain_all_stock(memcg);
5335
5336 mem_cgroup_id_put(memcg);
5337 }
5338
5339 static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
5340 {
5341 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5342
5343 invalidate_reclaim_iterators(memcg);
5344 }
5345
5346 static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
5347 {
5348 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5349 int __maybe_unused i;
5350
5351 #ifdef CONFIG_CGROUP_WRITEBACK
5352 for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
5353 wb_wait_for_completion(&memcg->cgwb_frn[i].done);
5354 #endif
5355 if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
5356 static_branch_dec(&memcg_sockets_enabled_key);
5357
5358 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_active)
5359 static_branch_dec(&memcg_sockets_enabled_key);
5360
5361 vmpressure_cleanup(&memcg->vmpressure);
5362 cancel_work_sync(&memcg->high_work);
5363 mem_cgroup_remove_from_trees(memcg);
5364 free_shrinker_info(memcg);
5365 mem_cgroup_free(memcg);
5366 }
5367
/**
 * mem_cgroup_css_reset - reset the states of a mem_cgroup
 * @css: the target css
 *
 * Reset the states of the mem_cgroup associated with @css.  This is
 * invoked when the userland requests disabling on the default hierarchy
 * but the memcg is pinned through dependency.  The memcg should stop
 * applying policies and should revert to the original state as it may
 * be made visible again.
 *
 * The current implementation only resets the essential configurations.
 * This needs to be expanded to cover all the visible parts.
 */
5381 static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
5382 {
5383 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5384
5385 page_counter_set_max(&memcg->memory, PAGE_COUNTER_MAX);
5386 page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX);
5387 page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX);
5388 page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX);
5389 page_counter_set_min(&memcg->memory, 0);
5390 page_counter_set_low(&memcg->memory, 0);
5391 page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
5392 memcg->soft_limit = PAGE_COUNTER_MAX;
5393 page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
5394 memcg_wb_domain_size_changed(memcg);
5395 }
5396
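/*
 * Fold this CPU's statistics deltas into memcg->vmstats and the per-node
 * lruvec_stats, and stage them in the parent's *_pending arrays so they
 * propagate up the hierarchy when the parent is flushed.
 */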
5397 static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
5398 {
5399 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5400 struct mem_cgroup *parent = parent_mem_cgroup(memcg);
5401 struct memcg_vmstats_percpu *statc;
5402 long delta, v;
5403 int i, nid;
5404
5405 statc = per_cpu_ptr(memcg->vmstats_percpu, cpu);
5406
5407 for (i = 0; i < MEMCG_NR_STAT; i++) {
/*
 * Collect the aggregated propagation counts of groups below us.
 * We're in a per-cpu loop here and this is a global counter, so the
 * first cycle will get them.
 */
5413 delta = memcg->vmstats.state_pending[i];
5414 if (delta)
5415 memcg->vmstats.state_pending[i] = 0;
5416
/* Add the CPU's changes on this level since the last flush */
5418 v = READ_ONCE(statc->state[i]);
5419 if (v != statc->state_prev[i]) {
5420 delta += v - statc->state_prev[i];
5421 statc->state_prev[i] = v;
5422 }
5423
5424 if (!delta)
5425 continue;
5426
/* Aggregate counts on this level and propagate upwards */
5428 memcg->vmstats.state[i] += delta;
5429 if (parent)
5430 parent->vmstats.state_pending[i] += delta;
5431 }
5432
5433 for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
5434 delta = memcg->vmstats.events_pending[i];
5435 if (delta)
5436 memcg->vmstats.events_pending[i] = 0;
5437
5438 v = READ_ONCE(statc->events[i]);
5439 if (v != statc->events_prev[i]) {
5440 delta += v - statc->events_prev[i];
5441 statc->events_prev[i] = v;
5442 }
5443
5444 if (!delta)
5445 continue;
5446
5447 memcg->vmstats.events[i] += delta;
5448 if (parent)
5449 parent->vmstats.events_pending[i] += delta;
5450 }
5451
5452 for_each_node_state(nid, N_MEMORY) {
5453 struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid];
5454 struct mem_cgroup_per_node *ppn = NULL;
5455 struct lruvec_stats_percpu *lstatc;
5456
5457 if (parent)
5458 ppn = parent->nodeinfo[nid];
5459
5460 lstatc = per_cpu_ptr(pn->lruvec_stats_percpu, cpu);
5461
5462 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
5463 delta = pn->lruvec_stats.state_pending[i];
5464 if (delta)
5465 pn->lruvec_stats.state_pending[i] = 0;
5466
5467 v = READ_ONCE(lstatc->state[i]);
5468 if (v != lstatc->state_prev[i]) {
5469 delta += v - lstatc->state_prev[i];
5470 lstatc->state_prev[i] = v;
5471 }
5472
5473 if (!delta)
5474 continue;
5475
5476 pn->lruvec_stats.state[i] += delta;
5477 if (ppn)
5478 ppn->lruvec_stats.state_pending[i] += delta;
5479 }
5480 }
5481 }
5482
5483 #ifdef CONFIG_MMU
/* Handlers for move charge at task migration. */
5485 static int mem_cgroup_do_precharge(unsigned long count)
5486 {
5487 int ret;
5488
/* Try a single bulk charge without reclaim first, kswapd may wake */
5490 ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, count);
5491 if (!ret) {
5492 mc.precharge += count;
5493 return ret;
5494 }
5495
/* Try charges one by one with reclaim, but do not retry */
5497 while (count--) {
5498 ret = try_charge(mc.to, GFP_KERNEL | __GFP_NORETRY, 1);
5499 if (ret)
5500 return ret;
5501 mc.precharge++;
5502 cond_resched();
5503 }
5504 return 0;
5505 }
5506
5507 union mc_target {
5508 struct page *page;
5509 swp_entry_t ent;
5510 };
5511
5512 enum mc_target_type {
5513 MC_TARGET_NONE = 0,
5514 MC_TARGET_PAGE,
5515 MC_TARGET_SWAP,
5516 MC_TARGET_DEVICE,
5517 };
5518
5519 static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
5520 unsigned long addr, pte_t ptent)
5521 {
5522 struct page *page = vm_normal_page(vma, addr, ptent);
5523
5524 if (!page || !page_mapped(page))
5525 return NULL;
5526 if (PageAnon(page)) {
5527 if (!(mc.flags & MOVE_ANON))
5528 return NULL;
5529 } else {
5530 if (!(mc.flags & MOVE_FILE))
5531 return NULL;
5532 }
5533 if (!get_page_unless_zero(page))
5534 return NULL;
5535
5536 return page;
5537 }
5538
5539 #if defined(CONFIG_SWAP) || defined(CONFIG_DEVICE_PRIVATE)
5540 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
5541 pte_t ptent, swp_entry_t *entry)
5542 {
5543 struct page *page = NULL;
5544 swp_entry_t ent = pte_to_swp_entry(ptent);
5545
5546 if (!(mc.flags & MOVE_ANON))
5547 return NULL;
5548
/*
 * Handle device private pages that are not accessible by the CPU, but
 * stored as special swap entries in the page table.
 */
5553 if (is_device_private_entry(ent)) {
5554 page = pfn_swap_entry_to_page(ent);
5555 if (!get_page_unless_zero(page))
5556 return NULL;
5557 return page;
5558 }
5559
5560 if (non_swap_entry(ent))
5561 return NULL;
5562
/*
 * Because lookup_swap_cache() updates some statistics counters,
 * we call find_get_page() with swapper_space directly.
 */
5567 page = find_get_page(swap_address_space(ent), swp_offset(ent));
5568 entry->val = ent.val;
5569
5570 return page;
5571 }
5572 #else
5573 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
5574 pte_t ptent, swp_entry_t *entry)
5575 {
5576 return NULL;
5577 }
5578 #endif
5579
5580 static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
5581 unsigned long addr, pte_t ptent)
5582 {
5583 if (!vma->vm_file)
5584 return NULL;
5585 if (!(mc.flags & MOVE_FILE))
5586 return NULL;
5587
/* The page is moved even if it's not RSS of this task (page-faulted). */
/* shmem/tmpfs may report page out on swap: account for that too. */
5590 return find_get_incore_page(vma->vm_file->f_mapping,
5591 linear_page_index(vma, addr));
5592 }
5593
/**
 * mem_cgroup_move_account - move accounting of a page to another memcg
 * @page: the page
 * @compound: charge the page as compound or small page
 * @from: mem_cgroup which the page is moved from.
 * @to: mem_cgroup which the page is moved to. @from != @to.
 *
 * The caller must make sure the page is not on an LRU list
 * (isolate_lru_page() is useful).
 *
 * This function doesn't do "charge" to the new cgroup and doesn't do
 * "uncharge" from the old cgroup.
 */
5606 static int mem_cgroup_move_account(struct page *page,
5607 bool compound,
5608 struct mem_cgroup *from,
5609 struct mem_cgroup *to)
5610 {
5611 struct folio *folio = page_folio(page);
5612 struct lruvec *from_vec, *to_vec;
5613 struct pglist_data *pgdat;
5614 unsigned int nr_pages = compound ? folio_nr_pages(folio) : 1;
5615 int nid, ret;
5616
5617 VM_BUG_ON(from == to);
5618 VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
5619 VM_BUG_ON(compound && !folio_test_large(folio));
5620
/*
 * Prevent mem_cgroup_migrate() from looking at the memcg of the
 * source page while we change it.
 */
5625 ret = -EBUSY;
5626 if (!folio_trylock(folio))
5627 goto out;
5628
5629 ret = -EINVAL;
5630 if (folio_memcg(folio) != from)
5631 goto out_unlock;
5632
5633 pgdat = folio_pgdat(folio);
5634 from_vec = mem_cgroup_lruvec(from, pgdat);
5635 to_vec = mem_cgroup_lruvec(to, pgdat);
5636
5637 folio_memcg_lock(folio);
5638
5639 if (folio_test_anon(folio)) {
5640 if (folio_mapped(folio)) {
5641 __mod_lruvec_state(from_vec, NR_ANON_MAPPED, -nr_pages);
5642 __mod_lruvec_state(to_vec, NR_ANON_MAPPED, nr_pages);
5643 if (folio_test_transhuge(folio)) {
5644 __mod_lruvec_state(from_vec, NR_ANON_THPS,
5645 -nr_pages);
5646 __mod_lruvec_state(to_vec, NR_ANON_THPS,
5647 nr_pages);
5648 }
5649 }
5650 } else {
5651 __mod_lruvec_state(from_vec, NR_FILE_PAGES, -nr_pages);
5652 __mod_lruvec_state(to_vec, NR_FILE_PAGES, nr_pages);
5653
5654 if (folio_test_swapbacked(folio)) {
5655 __mod_lruvec_state(from_vec, NR_SHMEM, -nr_pages);
5656 __mod_lruvec_state(to_vec, NR_SHMEM, nr_pages);
5657 }
5658
5659 if (folio_mapped(folio)) {
5660 __mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages);
5661 __mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages);
5662 }
5663
5664 if (folio_test_dirty(folio)) {
5665 struct address_space *mapping = folio_mapping(folio);
5666
5667 if (mapping_can_writeback(mapping)) {
5668 __mod_lruvec_state(from_vec, NR_FILE_DIRTY,
5669 -nr_pages);
5670 __mod_lruvec_state(to_vec, NR_FILE_DIRTY,
5671 nr_pages);
5672 }
5673 }
5674 }
5675
5676 if (folio_test_writeback(folio)) {
5677 __mod_lruvec_state(from_vec, NR_WRITEBACK, -nr_pages);
5678 __mod_lruvec_state(to_vec, NR_WRITEBACK, nr_pages);
5679 }
5680
/*
 * All state has been migrated, let's switch to the new memcg.
 *
 * It is safe to change the folio's memcg here because the folio
 * is referenced, charged, isolated, and locked: we can't race
 * with (un)charging, migration, LRU putback, or anything else
 * that would rely on a stable memcg association.
 *
 * Note that folio_memcg_lock() is a memcg lock, not a page lock.
 * As soon as we switch the folio to a new memcg that isn't
 * locked, the above state can change concurrently again.
 * Make sure we're truly done with it.
 */
5694 smp_mb();
5695
5696 css_get(&to->css);
5697 css_put(&from->css);
5698
5699 folio->memcg_data = (unsigned long)to;
5700
5701 __folio_memcg_unlock(from);
5702
5703 ret = 0;
5704 nid = folio_nid(folio);
5705
5706 local_irq_disable();
5707 mem_cgroup_charge_statistics(to, nr_pages);
5708 memcg_check_events(to, nid);
5709 mem_cgroup_charge_statistics(from, -nr_pages);
5710 memcg_check_events(from, nid);
5711 local_irq_enable();
5712 out_unlock:
5713 folio_unlock(folio);
5714 out:
5715 return ret;
5716 }
5717
/**
 * get_mctgt_type - get target type of moving charge
 * @vma: the vma the pte to be checked belongs to
 * @addr: the address corresponding to the pte to be checked
 * @ptent: the pte to be checked
 * @target: where the target page or swap entry is stored (can be NULL)
 *
 * Returns:
 *   MC_TARGET_NONE - the pte is not a target for move charge.
 *   MC_TARGET_PAGE - the page corresponding to this pte is a target for
 *     move charge.  If @target is not NULL, the page is stored in
 *     target->page with an extra refcount taken (callers should drop it).
 *   MC_TARGET_SWAP - the swap entry corresponding to this pte is a
 *     target for charge migration.  If @target is not NULL, the entry is
 *     stored in target->ent.
 *   MC_TARGET_DEVICE - like MC_TARGET_PAGE, but the page is device
 *     memory and thus not on the LRU.
 *
 * Called with pte lock held.
 */
5744 static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
5745 unsigned long addr, pte_t ptent, union mc_target *target)
5746 {
5747 struct page *page = NULL;
5748 enum mc_target_type ret = MC_TARGET_NONE;
5749 swp_entry_t ent = { .val = 0 };
5750
5751 if (pte_present(ptent))
5752 page = mc_handle_present_pte(vma, addr, ptent);
5753 else if (pte_none_mostly(ptent))
/*
 * PTE markers should be treated as a none pte here, separated
 * from other swap handling below.
 */
5758 page = mc_handle_file_pte(vma, addr, ptent);
5759 else if (is_swap_pte(ptent))
5760 page = mc_handle_swap_pte(vma, ptent, &ent);
5761
5762 if (!page && !ent.val)
5763 return ret;
5764 if (page) {
/*
 * Do only a loose check without serialization.
 * mem_cgroup_move_account() checks whether the page is still
 * valid under LRU exclusion.
 */
5770 if (page_memcg(page) == mc.from) {
5771 ret = MC_TARGET_PAGE;
5772 if (is_device_private_page(page) ||
5773 is_device_coherent_page(page))
5774 ret = MC_TARGET_DEVICE;
5775 if (target)
5776 target->page = page;
5777 }
5778 if (!ret || !target)
5779 put_page(page);
5780 }
/*
 * There is a swap entry and a page doesn't exist or isn't charged.
 * But we cannot move a tail page of a THP.
 */
5785 if (ent.val && !ret && (!page || !PageTransCompound(page)) &&
5786 mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) {
5787 ret = MC_TARGET_SWAP;
5788 if (target)
5789 target->ent = ent;
5790 }
5791 return ret;
5792 }
5793
5794 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * We don't consider PMD mapped swapping or file mapped pages because
 * THP does not support them for now.
 * The caller should make sure that pmd_trans_huge(pmd) is true.
 */
5800 static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
5801 unsigned long addr, pmd_t pmd, union mc_target *target)
5802 {
5803 struct page *page = NULL;
5804 enum mc_target_type ret = MC_TARGET_NONE;
5805
5806 if (unlikely(is_swap_pmd(pmd))) {
5807 VM_BUG_ON(thp_migration_supported() &&
5808 !is_pmd_migration_entry(pmd));
5809 return ret;
5810 }
5811 page = pmd_page(pmd);
5812 VM_BUG_ON_PAGE(!page || !PageHead(page), page);
5813 if (!(mc.flags & MOVE_ANON))
5814 return ret;
5815 if (page_memcg(page) == mc.from) {
5816 ret = MC_TARGET_PAGE;
5817 if (target) {
5818 get_page(page);
5819 target->page = page;
5820 }
5821 }
5822 return ret;
5823 }
5824 #else
5825 static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
5826 unsigned long addr, pmd_t pmd, union mc_target *target)
5827 {
5828 return MC_TARGET_NONE;
5829 }
5830 #endif
5831
5832 static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
5833 unsigned long addr, unsigned long end,
5834 struct mm_walk *walk)
5835 {
5836 struct vm_area_struct *vma = walk->vma;
5837 pte_t *pte;
5838 spinlock_t *ptl;
5839
5840 ptl = pmd_trans_huge_lock(pmd, vma);
5841 if (ptl) {
/*
 * Note there can not be MC_TARGET_DEVICE for now as we do not
 * support transparent huge pages with MEMORY_DEVICE_PRIVATE, but
 * this might change.
 */
5847 if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
5848 mc.precharge += HPAGE_PMD_NR;
5849 spin_unlock(ptl);
5850 return 0;
5851 }
5852
5853 if (pmd_trans_unstable(pmd))
5854 return 0;
5855 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
5856 for (; addr != end; pte++, addr += PAGE_SIZE)
5857 if (get_mctgt_type(vma, addr, *pte, NULL))
5858 mc.precharge++;
5859 pte_unmap_unlock(pte - 1, ptl);
5860 cond_resched();
5861
5862 return 0;
5863 }
5864
5865 static const struct mm_walk_ops precharge_walk_ops = {
5866 .pmd_entry = mem_cgroup_count_precharge_pte_range,
5867 };
5868
5869 static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
5870 {
5871 unsigned long precharge;
5872
5873 mmap_read_lock(mm);
5874 walk_page_range(mm, 0, mm->highest_vm_end, &precharge_walk_ops, NULL);
5875 mmap_read_unlock(mm);
5876
5877 precharge = mc.precharge;
5878 mc.precharge = 0;
5879
5880 return precharge;
5881 }
5882
5883 static int mem_cgroup_precharge_mc(struct mm_struct *mm)
5884 {
5885 unsigned long precharge = mem_cgroup_count_precharge(mm);
5886
5887 VM_BUG_ON(mc.moving_task);
5888 mc.moving_task = current;
5889 return mem_cgroup_do_precharge(precharge);
5890 }
5891
/* Cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */
5893 static void __mem_cgroup_clear_mc(void)
5894 {
5895 struct mem_cgroup *from = mc.from;
5896 struct mem_cgroup *to = mc.to;
5897
/* we must uncharge all the leftover precharges from mc.to */
5899 if (mc.precharge) {
5900 cancel_charge(mc.to, mc.precharge);
5901 mc.precharge = 0;
5902 }
/*
 * mem_cgroup_move_account() moved the page accounting to mc.to but did
 * not drop the corresponding charge from mc.from; do that here.
 */
5907 if (mc.moved_charge) {
5908 cancel_charge(mc.from, mc.moved_charge);
5909 mc.moved_charge = 0;
5910 }
/* we must fix up refcnts and charges */
5912 if (mc.moved_swap) {
/* uncharge the swap account from the old cgroup */
5914 if (!mem_cgroup_is_root(mc.from))
5915 page_counter_uncharge(&mc.from->memsw, mc.moved_swap);
5916
5917 mem_cgroup_id_put_many(mc.from, mc.moved_swap);
5918
/*
 * We charged both to->memory and to->memsw, so we
 * should uncharge to->memory here.
 */
5923 if (!mem_cgroup_is_root(mc.to))
5924 page_counter_uncharge(&mc.to->memory, mc.moved_swap);
5925
5926 mc.moved_swap = 0;
5927 }
5928 memcg_oom_recover(from);
5929 memcg_oom_recover(to);
5930 wake_up_all(&mc.waitq);
5931 }
5932
5933 static void mem_cgroup_clear_mc(void)
5934 {
5935 struct mm_struct *mm = mc.mm;
5936
/*
 * We must clear moving_task before waking up waiters at the end of
 * task migration.
 */
5941 mc.moving_task = NULL;
5942 __mem_cgroup_clear_mc();
5943 spin_lock(&mc.lock);
5944 mc.from = NULL;
5945 mc.to = NULL;
5946 mc.mm = NULL;
5947 spin_unlock(&mc.lock);
5948
5949 mmput(mm);
5950 }
5951
5952 static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
5953 {
5954 struct cgroup_subsys_state *css;
5955 struct mem_cgroup *memcg = NULL;
5956 struct mem_cgroup *from;
5957 struct task_struct *leader, *p;
5958 struct mm_struct *mm;
5959 unsigned long move_flags;
5960 int ret = 0;
5961
/* charge immigration isn't supported on the default hierarchy */
5963 if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
5964 return 0;
5965
/*
 * Multi-process migrations only happen on the default hierarchy
 * where charge immigration is not used.  Perform charge
 * immigration if @tset contains a leader and whine if there are
 * multiple.
 */
5972 p = NULL;
5973 cgroup_taskset_for_each_leader(leader, css, tset) {
5974 WARN_ON_ONCE(p);
5975 p = leader;
5976 memcg = mem_cgroup_from_css(css);
5977 }
5978 if (!p)
5979 return 0;
5980
/*
 * We are now committed to this value, whatever it is.  Changes in this
 * tunable will only affect upcoming migrations, not the current one,
 * so we need to save it and keep using it.
 */
5986 move_flags = READ_ONCE(memcg->move_charge_at_immigrate);
5987 if (!move_flags)
5988 return 0;
5989
5990 from = mem_cgroup_from_task(p);
5991
5992 VM_BUG_ON(from == memcg);
5993
5994 mm = get_task_mm(p);
5995 if (!mm)
5996 return 0;
5997
5998 if (mm->owner == p) {
5999 VM_BUG_ON(mc.from);
6000 VM_BUG_ON(mc.to);
6001 VM_BUG_ON(mc.precharge);
6002 VM_BUG_ON(mc.moved_charge);
6003 VM_BUG_ON(mc.moved_swap);
6004
6005 spin_lock(&mc.lock);
6006 mc.mm = mm;
6007 mc.from = from;
6008 mc.to = memcg;
6009 mc.flags = move_flags;
6010 spin_unlock(&mc.lock);
/* We set mc.moving_task later */
6012
6013 ret = mem_cgroup_precharge_mc(mm);
6014 if (ret)
6015 mem_cgroup_clear_mc();
6016 } else {
6017 mmput(mm);
6018 }
6019 return ret;
6020 }
6021
6022 static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset)
6023 {
6024 if (mc.to)
6025 mem_cgroup_clear_mc();
6026 }
6027
6028 static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
6029 unsigned long addr, unsigned long end,
6030 struct mm_walk *walk)
6031 {
6032 int ret = 0;
6033 struct vm_area_struct *vma = walk->vma;
6034 pte_t *pte;
6035 spinlock_t *ptl;
6036 enum mc_target_type target_type;
6037 union mc_target target;
6038 struct page *page;
6039
6040 ptl = pmd_trans_huge_lock(pmd, vma);
6041 if (ptl) {
6042 if (mc.precharge < HPAGE_PMD_NR) {
6043 spin_unlock(ptl);
6044 return 0;
6045 }
6046 target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
6047 if (target_type == MC_TARGET_PAGE) {
6048 page = target.page;
6049 if (!isolate_lru_page(page)) {
6050 if (!mem_cgroup_move_account(page, true,
6051 mc.from, mc.to)) {
6052 mc.precharge -= HPAGE_PMD_NR;
6053 mc.moved_charge += HPAGE_PMD_NR;
6054 }
6055 putback_lru_page(page);
6056 }
6057 put_page(page);
6058 } else if (target_type == MC_TARGET_DEVICE) {
6059 page = target.page;
6060 if (!mem_cgroup_move_account(page, true,
6061 mc.from, mc.to)) {
6062 mc.precharge -= HPAGE_PMD_NR;
6063 mc.moved_charge += HPAGE_PMD_NR;
6064 }
6065 put_page(page);
6066 }
6067 spin_unlock(ptl);
6068 return 0;
6069 }
6070
6071 if (pmd_trans_unstable(pmd))
6072 return 0;
6073 retry:
6074 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
6075 for (; addr != end; addr += PAGE_SIZE) {
6076 pte_t ptent = *(pte++);
6077 bool device = false;
6078 swp_entry_t ent;
6079
6080 if (!mc.precharge)
6081 break;
6082
6083 switch (get_mctgt_type(vma, addr, ptent, &target)) {
6084 case MC_TARGET_DEVICE:
6085 device = true;
6086 fallthrough;
6087 case MC_TARGET_PAGE:
6088 page = target.page;
/*
 * We can have a part of a split pmd here.  Moving it
 * could be done but it would be too convoluted, so simply
 * ignore such a partial THP and keep it in the original
 * memcg.  There should be somebody mapping the head.
 */
6095 if (PageTransCompound(page))
6096 goto put;
6097 if (!device && isolate_lru_page(page))
6098 goto put;
6099 if (!mem_cgroup_move_account(page, false,
6100 mc.from, mc.to)) {
6101 mc.precharge--;
/* we uncharge from mc.from later. */
6103 mc.moved_charge++;
6104 }
6105 if (!device)
6106 putback_lru_page(page);
6107 put:
6108 put_page(page);
6109 break;
6110 case MC_TARGET_SWAP:
6111 ent = target.ent;
6112 if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) {
6113 mc.precharge--;
6114 mem_cgroup_id_get_many(mc.to, 1);
/* we fix up other refcnts and charges later. */
6116 mc.moved_swap++;
6117 }
6118 break;
6119 default:
6120 break;
6121 }
6122 }
6123 pte_unmap_unlock(pte - 1, ptl);
6124 cond_resched();
6125
6126 if (addr != end) {
/*
 * We have consumed all precharges we got in can_attach().
 * We try charging one by one, but don't do any additional
 * charges to mc.to if we have failed a charge once already
 * during the attach phase.
 */
6133 ret = mem_cgroup_do_precharge(1);
6134 if (!ret)
6135 goto retry;
6136 }
6137
6138 return ret;
6139 }
6140
6141 static const struct mm_walk_ops charge_walk_ops = {
6142 .pmd_entry = mem_cgroup_move_charge_pte_range,
6143 };
6144
6145 static void mem_cgroup_move_charge(void)
6146 {
6147 lru_add_drain_all();
/*
 * Signal folio_memcg_lock() to take the memcg's move_lock
 * while we're moving its pages to another memcg.  Then wait
 * for already started RCU-only updates to finish.
 */
6153 atomic_inc(&mc.from->moving_account);
6154 synchronize_rcu();
6155 retry:
6156 if (unlikely(!mmap_read_trylock(mc.mm))) {
/*
 * Someone holding the mmap_lock may be waiting in the waitq.
 * So we cancel all extra charges, wake up all waiters, and
 * retry.  Because we cancel precharges, we might not be able
 * to move enough charges, but moving charge is a best-effort
 * feature anyway, so it wouldn't be a big problem.
 */
6164 __mem_cgroup_clear_mc();
6165 cond_resched();
6166 goto retry;
6167 }
/*
 * When we have consumed all precharges and failed to do an
 * additional charge, the page walk just aborts.
 */
6172 walk_page_range(mc.mm, 0, mc.mm->highest_vm_end, &charge_walk_ops,
6173 NULL);
6174
6175 mmap_read_unlock(mc.mm);
6176 atomic_dec(&mc.from->moving_account);
6177 }
6178
6179 static void mem_cgroup_move_task(void)
6180 {
6181 if (mc.to) {
6182 mem_cgroup_move_charge();
6183 mem_cgroup_clear_mc();
6184 }
6185 }
6186 #else
6187 static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
6188 {
6189 return 0;
6190 }
6191 static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset)
6192 {
6193 }
6194 static void mem_cgroup_move_task(void)
6195 {
6196 }
6197 #endif
6198
6199 static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value)
6200 {
6201 if (value == PAGE_COUNTER_MAX)
6202 seq_puts(m, "max\n");
6203 else
6204 seq_printf(m, "%llu\n", (u64)value * PAGE_SIZE);
6205
6206 return 0;
6207 }
6208
6209 static u64 memory_current_read(struct cgroup_subsys_state *css,
6210 struct cftype *cft)
6211 {
6212 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
6213
6214 return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE;
6215 }
6216
6217 static u64 memory_peak_read(struct cgroup_subsys_state *css,
6218 struct cftype *cft)
6219 {
6220 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
6221
6222 return (u64)memcg->memory.watermark * PAGE_SIZE;
6223 }
6224
6225 static int memory_min_show(struct seq_file *m, void *v)
6226 {
6227 return seq_puts_memcg_tunable(m,
6228 READ_ONCE(mem_cgroup_from_seq(m)->memory.min));
6229 }
6230
6231 static ssize_t memory_min_write(struct kernfs_open_file *of,
6232 char *buf, size_t nbytes, loff_t off)
6233 {
6234 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
6235 unsigned long min;
6236 int err;
6237
6238 buf = strstrip(buf);
6239 err = page_counter_memparse(buf, "max", &min);
6240 if (err)
6241 return err;
6242
6243 page_counter_set_min(&memcg->memory, min);
6244
6245 return nbytes;
6246 }
6247
6248 static int memory_low_show(struct seq_file *m, void *v)
6249 {
6250 return seq_puts_memcg_tunable(m,
6251 READ_ONCE(mem_cgroup_from_seq(m)->memory.low));
6252 }
6253
6254 static ssize_t memory_low_write(struct kernfs_open_file *of,
6255 char *buf, size_t nbytes, loff_t off)
6256 {
6257 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
6258 unsigned long low;
6259 int err;
6260
6261 buf = strstrip(buf);
6262 err = page_counter_memparse(buf, "max", &low);
6263 if (err)
6264 return err;
6265
6266 page_counter_set_low(&memcg->memory, low);
6267
6268 return nbytes;
6269 }
6270
6271 static int memory_high_show(struct seq_file *m, void *v)
6272 {
6273 return seq_puts_memcg_tunable(m,
6274 READ_ONCE(mem_cgroup_from_seq(m)->memory.high));
6275 }
6276
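/*
 * Writing to memory.high updates the throttling limit and then tries to
 * reclaim the cgroup down below the new value, e.g. (illustrative):
 *
 *   echo 512M > /sys/fs/cgroup/<group>/memory.high
 *
 * Reclaim gives up after MAX_RECLAIM_RETRIES unsuccessful passes or on a
 * pending signal; writing "max" removes the limit.
 */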
6277 static ssize_t memory_high_write(struct kernfs_open_file *of,
6278 char *buf, size_t nbytes, loff_t off)
6279 {
6280 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
6281 unsigned int nr_retries = MAX_RECLAIM_RETRIES;
6282 bool drained = false;
6283 unsigned long high;
6284 int err;
6285
6286 buf = strstrip(buf);
6287 err = page_counter_memparse(buf, "max", &high);
6288 if (err)
6289 return err;
6290
6291 page_counter_set_high(&memcg->memory, high);
6292
6293 for (;;) {
6294 unsigned long nr_pages = page_counter_read(&memcg->memory);
6295 unsigned long reclaimed;
6296
6297 if (nr_pages <= high)
6298 break;
6299
6300 if (signal_pending(current))
6301 break;
6302
6303 if (!drained) {
6304 drain_all_stock(memcg);
6305 drained = true;
6306 continue;
6307 }
6308
6309 reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
6310 GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP);
6311
6312 if (!reclaimed && !nr_retries--)
6313 break;
6314 }
6315
6316 memcg_wb_domain_size_changed(memcg);
6317 return nbytes;
6318 }
6319
6320 static int memory_max_show(struct seq_file *m, void *v)
6321 {
6322 return seq_puts_memcg_tunable(m,
6323 READ_ONCE(mem_cgroup_from_seq(m)->memory.max));
6324 }
6325
6326 static ssize_t memory_max_write(struct kernfs_open_file *of,
6327 char *buf, size_t nbytes, loff_t off)
6328 {
6329 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
6330 unsigned int nr_reclaims = MAX_RECLAIM_RETRIES;
6331 bool drained = false;
6332 unsigned long max;
6333 int err;
6334
6335 buf = strstrip(buf);
6336 err = page_counter_memparse(buf, "max", &max);
6337 if (err)
6338 return err;
6339
6340 xchg(&memcg->memory.max, max);
6341
6342 for (;;) {
6343 unsigned long nr_pages = page_counter_read(&memcg->memory);
6344
6345 if (nr_pages <= max)
6346 break;
6347
6348 if (signal_pending(current))
6349 break;
6350
6351 if (!drained) {
6352 drain_all_stock(memcg);
6353 drained = true;
6354 continue;
6355 }
6356
6357 if (nr_reclaims) {
6358 if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max,
6359 GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP))
6360 nr_reclaims--;
6361 continue;
6362 }
6363
6364 memcg_memory_event(memcg, MEMCG_OOM);
6365 if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0))
6366 break;
6367 }
6368
6369 memcg_wb_domain_size_changed(memcg);
6370 return nbytes;
6371 }
6372
6373 static void __memory_events_show(struct seq_file *m, atomic_long_t *events)
6374 {
6375 seq_printf(m, "low %lu\n", atomic_long_read(&events[MEMCG_LOW]));
6376 seq_printf(m, "high %lu\n", atomic_long_read(&events[MEMCG_HIGH]));
6377 seq_printf(m, "max %lu\n", atomic_long_read(&events[MEMCG_MAX]));
6378 seq_printf(m, "oom %lu\n", atomic_long_read(&events[MEMCG_OOM]));
6379 seq_printf(m, "oom_kill %lu\n",
6380 atomic_long_read(&events[MEMCG_OOM_KILL]));
6381 seq_printf(m, "oom_group_kill %lu\n",
6382 atomic_long_read(&events[MEMCG_OOM_GROUP_KILL]));
6383 }
6384
6385 static int memory_events_show(struct seq_file *m, void *v)
6386 {
6387 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6388
6389 __memory_events_show(m, memcg->memory_events);
6390 return 0;
6391 }
6392
6393 static int memory_events_local_show(struct seq_file *m, void *v)
6394 {
6395 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6396
6397 __memory_events_show(m, memcg->memory_events_local);
6398 return 0;
6399 }
6400
6401 static int memory_stat_show(struct seq_file *m, void *v)
6402 {
6403 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6404 char *buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
6405
6406 if (!buf)
6407 return -ENOMEM;
6408 memory_stat_format(memcg, buf, PAGE_SIZE);
6409 seq_puts(m, buf);
6410 kfree(buf);
6411 return 0;
6412 }
6413
6414 #ifdef CONFIG_NUMA
6415 static inline unsigned long lruvec_page_state_output(struct lruvec *lruvec,
6416 int item)
6417 {
6418 return lruvec_page_state(lruvec, item) * memcg_page_state_unit(item);
6419 }
6420
6421 static int memory_numa_stat_show(struct seq_file *m, void *v)
6422 {
6423 int i;
6424 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6425
6426 mem_cgroup_flush_stats();
6427
6428 for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
6429 int nid;
6430
6431 if (memory_stats[i].idx >= NR_VM_NODE_STAT_ITEMS)
6432 continue;
6433
6434 seq_printf(m, "%s", memory_stats[i].name);
6435 for_each_node_state(nid, N_MEMORY) {
6436 u64 size;
6437 struct lruvec *lruvec;
6438
6439 lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
6440 size = lruvec_page_state_output(lruvec,
6441 memory_stats[i].idx);
6442 seq_printf(m, " N%d=%llu", nid, size);
6443 }
6444 seq_putc(m, '\n');
6445 }
6446
6447 return 0;
6448 }
6449 #endif
6450
6451 static int memory_oom_group_show(struct seq_file *m, void *v)
6452 {
6453 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6454
6455 seq_printf(m, "%d\n", memcg->oom_group);
6456
6457 return 0;
6458 }
6459
6460 static ssize_t memory_oom_group_write(struct kernfs_open_file *of,
6461 char *buf, size_t nbytes, loff_t off)
6462 {
6463 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
6464 int ret, oom_group;
6465
6466 buf = strstrip(buf);
6467 if (!buf)
6468 return -EINVAL;
6469
6470 ret = kstrtoint(buf, 0, &oom_group);
6471 if (ret)
6472 return ret;
6473
6474 if (oom_group != 0 && oom_group != 1)
6475 return -EINVAL;
6476
6477 memcg->oom_group = oom_group;
6478
6479 return nbytes;
6480 }
6481
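/*
 * memory.reclaim: proactive reclaim interface.  Writing a size asks the
 * kernel to reclaim at least that many bytes from the cgroup, e.g.
 * (illustrative):
 *
 *   echo 256M > /sys/fs/cgroup/<group>/memory.reclaim
 *
 * Returns -EAGAIN if the target could not be met within
 * MAX_RECLAIM_RETRIES attempts, or -EINTR on a pending signal.
 */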
6482 static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
6483 size_t nbytes, loff_t off)
6484 {
6485 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
6486 unsigned int nr_retries = MAX_RECLAIM_RETRIES;
6487 unsigned long nr_to_reclaim, nr_reclaimed = 0;
6488 unsigned int reclaim_options;
6489 int err;
6490
6491 buf = strstrip(buf);
6492 err = page_counter_memparse(buf, "", &nr_to_reclaim);
6493 if (err)
6494 return err;
6495
6496 reclaim_options = MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_PROACTIVE;
6497 while (nr_reclaimed < nr_to_reclaim) {
6498 unsigned long reclaimed;
6499
6500 if (signal_pending(current))
6501 return -EINTR;
6502
/*
 * This is the final attempt, drain percpu lru caches in the
 * hope of introducing more evictable pages for
 * try_to_free_mem_cgroup_pages().
 */
6508 if (!nr_retries)
6509 lru_add_drain_all();
6510
6511 reclaimed = try_to_free_mem_cgroup_pages(memcg,
6512 nr_to_reclaim - nr_reclaimed,
6513 GFP_KERNEL, reclaim_options);
6514
6515 if (!reclaimed && !nr_retries--)
6516 return -EAGAIN;
6517
6518 nr_reclaimed += reclaimed;
6519 }
6520
6521 return nbytes;
6522 }
6523
6524 static struct cftype memory_files[] = {
6525 {
6526 .name = "current",
6527 .flags = CFTYPE_NOT_ON_ROOT,
6528 .read_u64 = memory_current_read,
6529 },
6530 {
6531 .name = "peak",
6532 .flags = CFTYPE_NOT_ON_ROOT,
6533 .read_u64 = memory_peak_read,
6534 },
6535 {
6536 .name = "min",
6537 .flags = CFTYPE_NOT_ON_ROOT,
6538 .seq_show = memory_min_show,
6539 .write = memory_min_write,
6540 },
6541 {
6542 .name = "low",
6543 .flags = CFTYPE_NOT_ON_ROOT,
6544 .seq_show = memory_low_show,
6545 .write = memory_low_write,
6546 },
6547 {
6548 .name = "high",
6549 .flags = CFTYPE_NOT_ON_ROOT,
6550 .seq_show = memory_high_show,
6551 .write = memory_high_write,
6552 },
6553 {
6554 .name = "max",
6555 .flags = CFTYPE_NOT_ON_ROOT,
6556 .seq_show = memory_max_show,
6557 .write = memory_max_write,
6558 },
6559 {
6560 .name = "events",
6561 .flags = CFTYPE_NOT_ON_ROOT,
6562 .file_offset = offsetof(struct mem_cgroup, events_file),
6563 .seq_show = memory_events_show,
6564 },
6565 {
6566 .name = "events.local",
6567 .flags = CFTYPE_NOT_ON_ROOT,
6568 .file_offset = offsetof(struct mem_cgroup, events_local_file),
6569 .seq_show = memory_events_local_show,
6570 },
6571 {
6572 .name = "stat",
6573 .seq_show = memory_stat_show,
6574 },
6575 #ifdef CONFIG_NUMA
6576 {
6577 .name = "numa_stat",
6578 .seq_show = memory_numa_stat_show,
6579 },
6580 #endif
6581 {
6582 .name = "oom.group",
6583 .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE,
6584 .seq_show = memory_oom_group_show,
6585 .write = memory_oom_group_write,
6586 },
6587 {
6588 .name = "reclaim",
6589 .flags = CFTYPE_NS_DELEGATABLE,
6590 .write = memory_reclaim,
6591 },
6592 { }
6593 };
6594
6595 struct cgroup_subsys memory_cgrp_subsys = {
6596 .css_alloc = mem_cgroup_css_alloc,
6597 .css_online = mem_cgroup_css_online,
6598 .css_offline = mem_cgroup_css_offline,
6599 .css_released = mem_cgroup_css_released,
6600 .css_free = mem_cgroup_css_free,
6601 .css_reset = mem_cgroup_css_reset,
6602 .css_rstat_flush = mem_cgroup_css_rstat_flush,
6603 .can_attach = mem_cgroup_can_attach,
6604 .cancel_attach = mem_cgroup_cancel_attach,
6605 .post_attach = mem_cgroup_move_task,
6606 .dfl_cftypes = memory_files,
6607 .legacy_cftypes = mem_cgroup_legacy_files,
6608 .early_init = 0,
6609 };
6610
/*
 * This function calculates an individual cgroup's effective memory
 * protection, which is derived from its own memory.min/low, its
 * parent's and siblings' settings, as well as the actual memory
 * distribution in the tree.
 *
 * The following rules apply to the effective protection values:
 *
 * 1. At the first level of reclaim, effective protection is equal to
 *    the declared protection in memory.min and memory.low.
 *
 * 2. To enable safe delegation of the protection configuration, at
 *    subsequent levels the effective protection is capped to the
 *    parent's effective protection.
 *
 * 3. The user is allowed to overcommit the declared protection at a
 *    given level.  If that is the case, the parent's effective
 *    protection is distributed to the children in proportion to how
 *    much protection they have declared and how much of it they are
 *    utilizing.  This makes the distribution proportional, but also
 *    work-conserving: if a cgroup is not using its protection or only
 *    some of it, the neighbors can make use of the rest.
 *
 * 4. With the recursive protection flag enabled, unclaimed protection
 *    from up the tree is additionally distributed in proportion to each
 *    cgroup's unprotected usage, so sibling subtrees compete freely
 *    over the allowance while staying protected from neighboring trees.
 */
6654 static unsigned long effective_protection(unsigned long usage,
6655 unsigned long parent_usage,
6656 unsigned long setting,
6657 unsigned long parent_effective,
6658 unsigned long siblings_protected)
6659 {
6660 unsigned long protected;
6661 unsigned long ep;
6662
6663 protected = min(usage, setting);
6664
/*
 * If all cgroups at this level combined claim and use more
 * protection than the parent affords them, distribute
 * shares in proportion to utilization.
 *
 * We use actual utilization rather than the statically claimed
 * protection in order to be work-conserving: claimed but unused
 * protection is available to siblings that would otherwise get a
 * smaller chunk than what they claimed.
 */
6674 if (siblings_protected > parent_effective)
6675 return protected * parent_effective / siblings_protected;
6676
/*
 * OK, the utilized protection of all children is within what the
 * parent affords them, so we know that whatever this child claims
 * and utilizes is effectively protected.
 *
 * If there is unprotected usage beyond this value, reclaim
 * will apply pressure in proportion to that amount.
 *
 * If there is unutilized protection, the cgroup will be fully
 * shielded from reclaim, but we do return a smaller value for
 * protection than what the group could enjoy in theory.  This
 * is okay: with the overcommit distribution above, effective
 * protection is always dependent on how memory is actually
 * consumed among the siblings anyway.
 */
6692 ep = protected;
6693
/*
 * If the children aren't claiming (all of) the protection
 * afforded to them by the parent, distribute the remainder in
 * proportion to the (unprotected) memory of each cgroup.  That
 * way, cgroups that aren't explicitly prioritized wrt each
 * other compete freely over the allowance, but they are
 * collectively protected from neighboring trees.
 *
 * We're using unprotected memory for the weight so that if
 * some cgroups DO claim explicit protection, we don't protect
 * the same bytes twice.
 *
 * Check both usage and parent_usage against the respective
 * protected values.  One should imply the other, but they
 * aren't equal due to rounding errors.
 */
6710 if (!(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT))
6711 return ep;
6712 if (parent_effective > siblings_protected &&
6713 parent_usage > siblings_protected &&
6714 usage > protected) {
6715 unsigned long unclaimed;
6716
6717 unclaimed = parent_effective - siblings_protected;
6718 unclaimed *= usage - protected;
6719 unclaimed /= parent_usage - siblings_protected;
6720
6721 ep += unclaimed;
6722 }
6723
6724 return ep;
6725 }
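/*
 * Illustrative example of the overcommitted case above: if the parent's
 * effective protection is 2G but its children together claim and use 4G
 * (siblings_protected), a child claiming and using 1G ends up with
 * 1G * 2G / 4G = 512M of effective protection.
 */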
6726
/**
 * mem_cgroup_calculate_protection - check if memory consumption is in the normal range
 * @root: the top ancestor of the sub-tree being checked
 * @memcg: the memory cgroup to check
 *
 * WARNING: This function is not stateless!  It can only be used as part
 *          of a top-down tree iteration, not for isolated queries.
 */
6735 void mem_cgroup_calculate_protection(struct mem_cgroup *root,
6736 struct mem_cgroup *memcg)
6737 {
6738 unsigned long usage, parent_usage;
6739 struct mem_cgroup *parent;
6740
6741 if (mem_cgroup_disabled())
6742 return;
6743
6744 if (!root)
6745 root = root_mem_cgroup;
6746
/*
 * Effective values of the reclaim targets are ignored so they
 * can be stale.  Have a look at mem_cgroup_protection() for more
 * details.
 * TODO: the calculation should be more robust so that we do not
 * need such special casing and can avoid potential bugs.
 */
6754 if (memcg == root)
6755 return;
6756
6757 usage = page_counter_read(&memcg->memory);
6758 if (!usage)
6759 return;
6760
6761 parent = parent_mem_cgroup(memcg);
6762
6763 if (parent == root) {
6764 memcg->memory.emin = READ_ONCE(memcg->memory.min);
6765 memcg->memory.elow = READ_ONCE(memcg->memory.low);
6766 return;
6767 }
6768
6769 parent_usage = page_counter_read(&parent->memory);
6770
6771 WRITE_ONCE(memcg->memory.emin, effective_protection(usage, parent_usage,
6772 READ_ONCE(memcg->memory.min),
6773 READ_ONCE(parent->memory.emin),
6774 atomic_long_read(&parent->memory.children_min_usage)));
6775
6776 WRITE_ONCE(memcg->memory.elow, effective_protection(usage, parent_usage,
6777 READ_ONCE(memcg->memory.low),
6778 READ_ONCE(parent->memory.elow),
6779 atomic_long_read(&parent->memory.children_low_usage)));
6780 }
6781
6782 static int charge_memcg(struct folio *folio, struct mem_cgroup *memcg,
6783 gfp_t gfp)
6784 {
6785 long nr_pages = folio_nr_pages(folio);
6786 int ret;
6787
6788 ret = try_charge(memcg, gfp, nr_pages);
6789 if (ret)
6790 goto out;
6791
6792 css_get(&memcg->css);
6793 commit_charge(folio, memcg);
6794
6795 local_irq_disable();
6796 mem_cgroup_charge_statistics(memcg, nr_pages);
6797 memcg_check_events(memcg, folio_nid(folio));
6798 local_irq_enable();
6799 out:
6800 return ret;
6801 }
6802
6803 int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp)
6804 {
6805 struct mem_cgroup *memcg;
6806 int ret;
6807
6808 memcg = get_mem_cgroup_from_mm(mm);
6809 ret = charge_memcg(folio, memcg, gfp);
6810 css_put(&memcg->css);
6811
6812 return ret;
6813 }
6814
/**
 * mem_cgroup_swapin_charge_page - charge a newly allocated page for swapin
 * @page: page to charge
 * @mm: mm context of the victim
 * @gfp: reclaim mode
 * @entry: swap entry for which the page is allocated
 *
 * This function charges a page allocated for swapin.  Please call this
 * before adding the page to the swapcache.
 *
 * Returns 0 on success.  Otherwise, an error code is returned.
 */
6827 int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm,
6828 gfp_t gfp, swp_entry_t entry)
6829 {
6830 struct folio *folio = page_folio(page);
6831 struct mem_cgroup *memcg;
6832 unsigned short id;
6833 int ret;
6834
6835 if (mem_cgroup_disabled())
6836 return 0;
6837
6838 id = lookup_swap_cgroup_id(entry);
6839 rcu_read_lock();
6840 memcg = mem_cgroup_from_id(id);
6841 if (!memcg || !css_tryget_online(&memcg->css))
6842 memcg = get_mem_cgroup_from_mm(mm);
6843 rcu_read_unlock();
6844
6845 ret = charge_memcg(folio, memcg, gfp);
6846
6847 css_put(&memcg->css);
6848 return ret;
6849 }
6850
/*
 * mem_cgroup_swapin_uncharge_swap - uncharge swap slot
 * @entry: swap entry for which the page is charged
 *
 * Call this function after successfully adding the charged page to the
 * swapcache.
 *
 * Note: this function assumes the page for which the swap slot is being
 * uncharged is an order-0 page.
 */
6860 void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry)
6861 {
/*
 * Cgroup1's unified memory+swap counter has been charged with the
 * new swapcache page, finish the transfer by uncharging the swap
 * slot.  The swap slot would also get uncharged when it dies, but
 * it can stick around indefinitely and we'd count the page twice
 * the entire time.
 *
 * Cgroup2 has separate resource counters for memory and swap,
 * so this is a non-issue: the swap slot keeps its charge until
 * it is freed.
 */
6874 if (!mem_cgroup_disabled() && do_memsw_account()) {
/*
 * The swap entry might not get freed for a long time,
 * let's not wait for it.  The page already received a
 * memory+swap charge, drop the swap entry duplicate.
 */
6880 mem_cgroup_uncharge_swap(entry, 1);
6881 }
6882 }
6883
6884 struct uncharge_gather {
6885 struct mem_cgroup *memcg;
6886 unsigned long nr_memory;
6887 unsigned long pgpgout;
6888 unsigned long nr_kmem;
6889 int nid;
6890 };
6891
6892 static inline void uncharge_gather_clear(struct uncharge_gather *ug)
6893 {
6894 memset(ug, 0, sizeof(*ug));
6895 }
6896
6897 static void uncharge_batch(const struct uncharge_gather *ug)
6898 {
6899 unsigned long flags;
6900
6901 if (ug->nr_memory) {
6902 page_counter_uncharge(&ug->memcg->memory, ug->nr_memory);
6903 if (do_memsw_account())
6904 page_counter_uncharge(&ug->memcg->memsw, ug->nr_memory);
6905 if (ug->nr_kmem)
6906 memcg_account_kmem(ug->memcg, -ug->nr_kmem);
6907 memcg_oom_recover(ug->memcg);
6908 }
6909
6910 local_irq_save(flags);
6911 __count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout);
6912 __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_memory);
6913 memcg_check_events(ug->memcg, ug->nid);
6914 local_irq_restore(flags);
6915
/* drop the reference taken in uncharge_folio() */
6917 css_put(&ug->memcg->css);
6918 }
6919
6920 static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug)
6921 {
6922 long nr_pages;
6923 struct mem_cgroup *memcg;
6924 struct obj_cgroup *objcg;
6925
6926 VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
6927
/*
 * Nobody should be changing or seriously looking at the
 * folio's memcg or objcg at this point: we have fully
 * exclusive access to the folio.
 */
6933 if (folio_memcg_kmem(folio)) {
6934 objcg = __folio_objcg(folio);
/*
 * This get matches the put at the end of the function, and
 * kmem pages do not hold memcg references anymore.
 */
6939 memcg = get_mem_cgroup_from_objcg(objcg);
6940 } else {
6941 memcg = __folio_memcg(folio);
6942 }
6943
6944 if (!memcg)
6945 return;
6946
6947 if (ug->memcg != memcg) {
6948 if (ug->memcg) {
6949 uncharge_batch(ug);
6950 uncharge_gather_clear(ug);
6951 }
6952 ug->memcg = memcg;
6953 ug->nid = folio_nid(folio);
6954
/* pairs with the css_put in uncharge_batch() */
6956 css_get(&memcg->css);
6957 }
6958
6959 nr_pages = folio_nr_pages(folio);
6960
6961 if (folio_memcg_kmem(folio)) {
6962 ug->nr_memory += nr_pages;
6963 ug->nr_kmem += nr_pages;
6964
6965 folio->memcg_data = 0;
6966 obj_cgroup_put(objcg);
6967 } else {
/* LRU pages aren't accounted at the root level */
6969 if (!mem_cgroup_is_root(memcg))
6970 ug->nr_memory += nr_pages;
6971 ug->pgpgout++;
6972
6973 folio->memcg_data = 0;
6974 }
6975
6976 css_put(&memcg->css);
6977 }
6978
6979 void __mem_cgroup_uncharge(struct folio *folio)
6980 {
6981 struct uncharge_gather ug;
6982
/* Don't touch folio->lru of any random page, pre-check: */
6984 if (!folio_memcg(folio))
6985 return;
6986
6987 uncharge_gather_clear(&ug);
6988 uncharge_folio(folio, &ug);
6989 uncharge_batch(&ug);
6990 }
6991
/**
 * __mem_cgroup_uncharge_list - uncharge a list of pages
 * @page_list: list of pages to uncharge
 *
 * Uncharge a list of pages previously charged with
 * __mem_cgroup_charge().
 */
6999 void __mem_cgroup_uncharge_list(struct list_head *page_list)
7000 {
7001 struct uncharge_gather ug;
7002 struct folio *folio;
7003
7004 uncharge_gather_clear(&ug);
7005 list_for_each_entry(folio, page_list, lru)
7006 uncharge_folio(folio, &ug);
7007 if (ug.memcg)
7008 uncharge_batch(&ug);
7009 }
7010
/**
 * mem_cgroup_migrate - Charge a folio's replacement.
 * @old: Currently circulating folio.
 * @new: Replacement folio.
 *
 * Charge @new as a replacement folio for @old.  @old will
 * be uncharged upon free.
 *
 * Both folios must be locked, @new->mapping must be set up.
 */
7021 void mem_cgroup_migrate(struct folio *old, struct folio *new)
7022 {
7023 struct mem_cgroup *memcg;
7024 long nr_pages = folio_nr_pages(new);
7025 unsigned long flags;
7026
7027 VM_BUG_ON_FOLIO(!folio_test_locked(old), old);
7028 VM_BUG_ON_FOLIO(!folio_test_locked(new), new);
7029 VM_BUG_ON_FOLIO(folio_test_anon(old) != folio_test_anon(new), new);
7030 VM_BUG_ON_FOLIO(folio_nr_pages(old) != nr_pages, new);
7031
7032 if (mem_cgroup_disabled())
7033 return;
7034
/* Page cache replacement: is the new folio already charged? */
7036 if (folio_memcg(new))
7037 return;
7038
7039 memcg = folio_memcg(old);
7040 VM_WARN_ON_ONCE_FOLIO(!memcg, old);
7041 if (!memcg)
7042 return;
7043
/* Force-charge the new page.  The old one will be freed soon. */
7045 if (!mem_cgroup_is_root(memcg)) {
7046 page_counter_charge(&memcg->memory, nr_pages);
7047 if (do_memsw_account())
7048 page_counter_charge(&memcg->memsw, nr_pages);
7049 }
7050
7051 css_get(&memcg->css);
7052 commit_charge(new, memcg);
7053
7054 local_irq_save(flags);
7055 mem_cgroup_charge_statistics(memcg, nr_pages);
7056 memcg_check_events(memcg, folio_nid(new));
7057 local_irq_restore(flags);
7058 }
7059
7060 DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
7061 EXPORT_SYMBOL(memcg_sockets_enabled_key);
7062
7063 void mem_cgroup_sk_alloc(struct sock *sk)
7064 {
7065 struct mem_cgroup *memcg;
7066
7067 if (!mem_cgroup_sockets_enabled)
7068 return;
7069
/* Do not associate the sock with an unrelated interrupted task's memcg. */
7071 if (!in_task())
7072 return;
7073
7074 rcu_read_lock();
7075 memcg = mem_cgroup_from_task(current);
7076 if (memcg == root_mem_cgroup)
7077 goto out;
7078 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcpmem_active)
7079 goto out;
7080 if (css_tryget(&memcg->css))
7081 sk->sk_memcg = memcg;
7082 out:
7083 rcu_read_unlock();
7084 }
7085
7086 void mem_cgroup_sk_free(struct sock *sk)
7087 {
7088 if (sk->sk_memcg)
7089 css_put(&sk->sk_memcg->css);
7090 }
7091
/**
 * mem_cgroup_charge_skmem - charge socket memory
 * @memcg: memcg to charge
 * @nr_pages: number of pages to charge
 * @gfp_mask: reclaim mode
 *
 * Charges @nr_pages to @memcg.  Returns %true if the charge fit within
 * @memcg's configured limit, %false if it doesn't.
 */
7101 bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages,
7102 gfp_t gfp_mask)
7103 {
7104 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
7105 struct page_counter *fail;
7106
7107 if (page_counter_try_charge(&memcg->tcpmem, nr_pages, &fail)) {
7108 memcg->tcpmem_pressure = 0;
7109 return true;
7110 }
7111 memcg->tcpmem_pressure = 1;
7112 if (gfp_mask & __GFP_NOFAIL) {
7113 page_counter_charge(&memcg->tcpmem, nr_pages);
7114 return true;
7115 }
7116 return false;
7117 }
7118
7119 if (try_charge(memcg, gfp_mask, nr_pages) == 0) {
7120 mod_memcg_state(memcg, MEMCG_SOCK, nr_pages);
7121 return true;
7122 }
7123
7124 return false;
7125 }
7126
/**
 * mem_cgroup_uncharge_skmem - uncharge socket memory
 * @memcg: memcg to uncharge
 * @nr_pages: number of pages to uncharge
 */
7132 void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
7133 {
7134 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
7135 page_counter_uncharge(&memcg->tcpmem, nr_pages);
7136 return;
7137 }
7138
7139 mod_memcg_state(memcg, MEMCG_SOCK, -nr_pages);
7140
7141 refill_stock(memcg, nr_pages);
7142 }
7143
7144 static int __init cgroup_memory(char *s)
7145 {
7146 char *token;
7147
7148 while ((token = strsep(&s, ",")) != NULL) {
7149 if (!*token)
7150 continue;
7151 if (!strcmp(token, "nosocket"))
7152 cgroup_memory_nosocket = true;
7153 if (!strcmp(token, "nokmem"))
7154 cgroup_memory_nokmem = true;
7155 }
7156 return 1;
7157 }
7158 __setup("cgroup.memory=", cgroup_memory);
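/*
 * Example (kernel command line): "cgroup.memory=nosocket,nokmem" disables
 * socket memory accounting and kernel memory accounting respectively.
 */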
7159
/*
 * subsys_initcall() for the memory controller.
 *
 * Some parts like memcg_hotplug_cpu_dead() have to be initialized from
 * this context because of lock dependencies (cgroup_lock -> cpu hotplug);
 * the rest (per-cpu stock work, soft limit tree) just needs to be ready
 * before the first memory cgroup is created.
 */
7168 static int __init mem_cgroup_init(void)
7169 {
7170 int cpu, node;
7171
/*
 * Currently an s32 type (see struct batched_lruvec_stat) is used for
 * per-memcg-per-cpu caching of per-node statistics.  For this to work,
 * we must make sure that the byte count can never overflow it.
 */
7178 BUILD_BUG_ON(MEMCG_CHARGE_BATCH > S32_MAX / PAGE_SIZE);
7179
7180 cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL,
7181 memcg_hotplug_cpu_dead);
7182
7183 for_each_possible_cpu(cpu)
7184 INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work,
7185 drain_local_stock);
7186
7187 for_each_node(node) {
7188 struct mem_cgroup_tree_per_node *rtpn;
7189
7190 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL,
7191 node_online(node) ? node : NUMA_NO_NODE);
7192
7193 rtpn->rb_root = RB_ROOT;
7194 rtpn->rb_rightmost = NULL;
7195 spin_lock_init(&rtpn->lock);
7196 soft_limit_tree.rb_tree_per_node[node] = rtpn;
7197 }
7198
7199 return 0;
7200 }
7201 subsys_initcall(mem_cgroup_init);
7202
7203 #ifdef CONFIG_MEMCG_SWAP
7204 static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
7205 {
7206 while (!refcount_inc_not_zero(&memcg->id.ref)) {
/*
 * The root cgroup cannot be destroyed, so its refcount must
 * always be >= 1.
 */
7211 if (WARN_ON_ONCE(memcg == root_mem_cgroup)) {
7212 VM_BUG_ON(1);
7213 break;
7214 }
7215 memcg = parent_mem_cgroup(memcg);
7216 if (!memcg)
7217 memcg = root_mem_cgroup;
7218 }
7219 return memcg;
7220 }
7221
/**
 * mem_cgroup_swapout - transfer a memsw charge to swap
 * @folio: folio whose memsw charge to transfer
 * @entry: swap entry to move the charge to
 *
 * Transfer the memsw charge of @folio to @entry.
 */
7229 void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry)
7230 {
7231 struct mem_cgroup *memcg, *swap_memcg;
7232 unsigned int nr_entries;
7233 unsigned short oldid;
7234
7235 VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
7236 VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);
7237
7238 if (mem_cgroup_disabled())
7239 return;
7240
7241 if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
7242 return;
7243
7244 memcg = folio_memcg(folio);
7245
7246 VM_WARN_ON_ONCE_FOLIO(!memcg, folio);
7247 if (!memcg)
7248 return;
7249
/*
 * In case the memcg owning these pages has been offlined and doesn't
 * have an ID allocated to it anymore, charge the closest online
 * ancestor for the swap instead and transfer the memory+swap charge.
 */
7255 swap_memcg = mem_cgroup_id_get_online(memcg);
7256 nr_entries = folio_nr_pages(folio);
/* Get references for the tail pages, too */
7258 if (nr_entries > 1)
7259 mem_cgroup_id_get_many(swap_memcg, nr_entries - 1);
7260 oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg),
7261 nr_entries);
7262 VM_BUG_ON_FOLIO(oldid, folio);
7263 mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries);
7264
7265 folio->memcg_data = 0;
7266
7267 if (!mem_cgroup_is_root(memcg))
7268 page_counter_uncharge(&memcg->memory, nr_entries);
7269
7270 if (!cgroup_memory_noswap && memcg != swap_memcg) {
7271 if (!mem_cgroup_is_root(swap_memcg))
7272 page_counter_charge(&swap_memcg->memsw, nr_entries);
7273 page_counter_uncharge(&memcg->memsw, nr_entries);
7274 }
7275
/*
 * Interrupts should be disabled here because the caller holds the
 * i_pages lock, which is taken with interrupts off.  It is
 * important to have interrupts disabled because it is the
 * only synchronisation we have for updating the per-CPU variables.
 */
7282 memcg_stats_lock();
7283 mem_cgroup_charge_statistics(memcg, -nr_entries);
7284 memcg_stats_unlock();
7285 memcg_check_events(memcg, folio_nid(folio));
7286
7287 css_put(&memcg->css);
7288 }
7289
/**
 * __mem_cgroup_try_charge_swap - try charging swap space for a folio
 * @folio: folio being added to swap
 * @entry: swap entry to charge
 *
 * Try to charge @folio's memcg for the swap space at @entry.
 *
 * Returns 0 on success, -ENOMEM on failure.
 */
7299 int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry)
7300 {
7301 unsigned int nr_pages = folio_nr_pages(folio);
7302 struct page_counter *counter;
7303 struct mem_cgroup *memcg;
7304 unsigned short oldid;
7305
7306 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
7307 return 0;
7308
7309 memcg = folio_memcg(folio);
7310
7311 VM_WARN_ON_ONCE_FOLIO(!memcg, folio);
7312 if (!memcg)
7313 return 0;
7314
7315 if (!entry.val) {
7316 memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
7317 return 0;
7318 }
7319
7320 memcg = mem_cgroup_id_get_online(memcg);
7321
7322 if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg) &&
7323 !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) {
7324 memcg_memory_event(memcg, MEMCG_SWAP_MAX);
7325 memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
7326 mem_cgroup_id_put(memcg);
7327 return -ENOMEM;
7328 }
7329
/* Get references for the tail pages, too */
7331 if (nr_pages > 1)
7332 mem_cgroup_id_get_many(memcg, nr_pages - 1);
7333 oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg), nr_pages);
7334 VM_BUG_ON_FOLIO(oldid, folio);
7335 mod_memcg_state(memcg, MEMCG_SWAP, nr_pages);
7336
7337 return 0;
7338 }
7339
/**
 * __mem_cgroup_uncharge_swap - uncharge swap space
 * @entry: swap entry to uncharge
 * @nr_pages: the amount of swap space to uncharge
 */
7345 void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
7346 {
7347 struct mem_cgroup *memcg;
7348 unsigned short id;
7349
7350 id = swap_cgroup_record(entry, 0, nr_pages);
7351 rcu_read_lock();
7352 memcg = mem_cgroup_from_id(id);
7353 if (memcg) {
7354 if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg)) {
7355 if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
7356 page_counter_uncharge(&memcg->swap, nr_pages);
7357 else
7358 page_counter_uncharge(&memcg->memsw, nr_pages);
7359 }
7360 mod_memcg_state(memcg, MEMCG_SWAP, -nr_pages);
7361 mem_cgroup_id_put_many(memcg, nr_pages);
7362 }
7363 rcu_read_unlock();
7364 }
7365
7366 long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
7367 {
7368 long nr_swap_pages = get_nr_swap_pages();
7369
7370 if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
7371 return nr_swap_pages;
7372 for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
7373 nr_swap_pages = min_t(long, nr_swap_pages,
7374 READ_ONCE(memcg->swap.max) -
7375 page_counter_read(&memcg->swap));
7376 return nr_swap_pages;
7377 }
7378
7379 bool mem_cgroup_swap_full(struct page *page)
7380 {
7381 struct mem_cgroup *memcg;
7382
7383 VM_BUG_ON_PAGE(!PageLocked(page), page);
7384
7385 if (vm_swap_full())
7386 return true;
7387 if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
7388 return false;
7389
7390 memcg = page_memcg(page);
7391 if (!memcg)
7392 return false;
7393
7394 for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
7395 unsigned long usage = page_counter_read(&memcg->swap);
7396
7397 if (usage * 2 >= READ_ONCE(memcg->swap.high) ||
7398 usage * 2 >= READ_ONCE(memcg->swap.max))
7399 return true;
7400 }
7401
7402 return false;
7403 }
7404
7405 static int __init setup_swap_account(char *s)
7406 {
7407 if (!strcmp(s, "1"))
7408 cgroup_memory_noswap = false;
7409 else if (!strcmp(s, "0"))
7410 cgroup_memory_noswap = true;
7411 return 1;
7412 }
7413 __setup("swapaccount=", setup_swap_account);
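/* Example (kernel command line): "swapaccount=0" disables swap accounting. */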
7414
7415 static u64 swap_current_read(struct cgroup_subsys_state *css,
7416 struct cftype *cft)
7417 {
7418 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
7419
7420 return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE;
7421 }
7422
7423 static int swap_high_show(struct seq_file *m, void *v)
7424 {
7425 return seq_puts_memcg_tunable(m,
7426 READ_ONCE(mem_cgroup_from_seq(m)->swap.high));
7427 }
7428
7429 static ssize_t swap_high_write(struct kernfs_open_file *of,
7430 char *buf, size_t nbytes, loff_t off)
7431 {
7432 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
7433 unsigned long high;
7434 int err;
7435
7436 buf = strstrip(buf);
7437 err = page_counter_memparse(buf, "max", &high);
7438 if (err)
7439 return err;
7440
7441 page_counter_set_high(&memcg->swap, high);
7442
7443 return nbytes;
7444 }
7445
7446 static int swap_max_show(struct seq_file *m, void *v)
7447 {
7448 return seq_puts_memcg_tunable(m,
7449 READ_ONCE(mem_cgroup_from_seq(m)->swap.max));
7450 }
7451
7452 static ssize_t swap_max_write(struct kernfs_open_file *of,
7453 char *buf, size_t nbytes, loff_t off)
7454 {
7455 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
7456 unsigned long max;
7457 int err;
7458
7459 buf = strstrip(buf);
7460 err = page_counter_memparse(buf, "max", &max);
7461 if (err)
7462 return err;
7463
7464 xchg(&memcg->swap.max, max);
7465
7466 return nbytes;
7467 }
7468
7469 static int swap_events_show(struct seq_file *m, void *v)
7470 {
7471 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
7472
7473 seq_printf(m, "high %lu\n",
7474 atomic_long_read(&memcg->memory_events[MEMCG_SWAP_HIGH]));
7475 seq_printf(m, "max %lu\n",
7476 atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX]));
7477 seq_printf(m, "fail %lu\n",
7478 atomic_long_read(&memcg->memory_events[MEMCG_SWAP_FAIL]));
7479
7480 return 0;
7481 }
7482
7483 static struct cftype swap_files[] = {
7484 {
7485 .name = "swap.current",
7486 .flags = CFTYPE_NOT_ON_ROOT,
7487 .read_u64 = swap_current_read,
7488 },
7489 {
7490 .name = "swap.high",
7491 .flags = CFTYPE_NOT_ON_ROOT,
7492 .seq_show = swap_high_show,
7493 .write = swap_high_write,
7494 },
7495 {
7496 .name = "swap.max",
7497 .flags = CFTYPE_NOT_ON_ROOT,
7498 .seq_show = swap_max_show,
7499 .write = swap_max_write,
7500 },
7501 {
7502 .name = "swap.events",
7503 .flags = CFTYPE_NOT_ON_ROOT,
7504 .file_offset = offsetof(struct mem_cgroup, swap_events_file),
7505 .seq_show = swap_events_show,
7506 },
7507 { }
7508 };
7509
7510 static struct cftype memsw_files[] = {
7511 {
7512 .name = "memsw.usage_in_bytes",
7513 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
7514 .read_u64 = mem_cgroup_read_u64,
7515 },
7516 {
7517 .name = "memsw.max_usage_in_bytes",
7518 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
7519 .write = mem_cgroup_reset,
7520 .read_u64 = mem_cgroup_read_u64,
7521 },
7522 {
7523 .name = "memsw.limit_in_bytes",
7524 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
7525 .write = mem_cgroup_write,
7526 .read_u64 = mem_cgroup_read_u64,
7527 },
7528 {
7529 .name = "memsw.failcnt",
7530 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
7531 .write = mem_cgroup_reset,
7532 .read_u64 = mem_cgroup_read_u64,
7533 },
7534 { },
7535 };
7536
7537 #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
/**
 * obj_cgroup_may_zswap - check if this cgroup can zswap
 * @objcg: the object cgroup
 *
 * Check if the hierarchical zswap limit has been reached.
 *
 * This doesn't check for specific headroom, and it is not atomic
 * either.  But with zswap, the size of the allocation is only known
 * once compression has occurred, and this optimistic pre-check avoids
 * spending cycles on compression when there is already no room left
 * or zswap is disabled altogether somewhere in the hierarchy.
 */
7550 bool obj_cgroup_may_zswap(struct obj_cgroup *objcg)
7551 {
7552 struct mem_cgroup *memcg, *original_memcg;
7553 bool ret = true;
7554
7555 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
7556 return true;
7557
7558 original_memcg = get_mem_cgroup_from_objcg(objcg);
7559 for (memcg = original_memcg; memcg != root_mem_cgroup;
7560 memcg = parent_mem_cgroup(memcg)) {
7561 unsigned long max = READ_ONCE(memcg->zswap_max);
7562 unsigned long pages;
7563
7564 if (max == PAGE_COUNTER_MAX)
7565 continue;
7566 if (max == 0) {
7567 ret = false;
7568 break;
7569 }
7570
7571 cgroup_rstat_flush(memcg->css.cgroup);
7572 pages = memcg_page_state(memcg, MEMCG_ZSWAP_B) / PAGE_SIZE;
7573 if (pages < max)
7574 continue;
7575 ret = false;
7576 break;
7577 }
7578 mem_cgroup_put(original_memcg);
7579 return ret;
7580 }
7581
/**
 * obj_cgroup_charge_zswap - charge compression backend memory
 * @objcg: the object cgroup
 * @size: size of the compressed object
 *
 * This forces the charge after obj_cgroup_may_zswap() allowed
 * compression and storage in zswap for this cgroup to go ahead.
 */
7590 void obj_cgroup_charge_zswap(struct obj_cgroup *objcg, size_t size)
7591 {
7592 struct mem_cgroup *memcg;
7593
7594 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
7595 return;
7596
7597 VM_WARN_ON_ONCE(!(current->flags & PF_MEMALLOC));
7598
/* PF_MEMALLOC context, charging must succeed */
7600 if (obj_cgroup_charge(objcg, GFP_KERNEL, size))
7601 VM_WARN_ON_ONCE(1);
7602
7603 rcu_read_lock();
7604 memcg = obj_cgroup_memcg(objcg);
7605 mod_memcg_state(memcg, MEMCG_ZSWAP_B, size);
7606 mod_memcg_state(memcg, MEMCG_ZSWAPPED, 1);
7607 rcu_read_unlock();
7608 }
7609
/**
 * obj_cgroup_uncharge_zswap - uncharge compression backend memory
 * @objcg: the object cgroup
 * @size: size of the compressed object
 *
 * Uncharges zswap memory on page in.
 */
7617 void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size)
7618 {
7619 struct mem_cgroup *memcg;
7620
7621 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
7622 return;
7623
7624 obj_cgroup_uncharge(objcg, size);
7625
7626 rcu_read_lock();
7627 memcg = obj_cgroup_memcg(objcg);
7628 mod_memcg_state(memcg, MEMCG_ZSWAP_B, -size);
7629 mod_memcg_state(memcg, MEMCG_ZSWAPPED, -1);
7630 rcu_read_unlock();
7631 }
7632
7633 static u64 zswap_current_read(struct cgroup_subsys_state *css,
7634 struct cftype *cft)
7635 {
7636 cgroup_rstat_flush(css->cgroup);
7637 return memcg_page_state(mem_cgroup_from_css(css), MEMCG_ZSWAP_B);
7638 }
7639
7640 static int zswap_max_show(struct seq_file *m, void *v)
7641 {
7642 return seq_puts_memcg_tunable(m,
7643 READ_ONCE(mem_cgroup_from_seq(m)->zswap_max));
7644 }
7645
7646 static ssize_t zswap_max_write(struct kernfs_open_file *of,
7647 char *buf, size_t nbytes, loff_t off)
7648 {
7649 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
7650 unsigned long max;
7651 int err;
7652
7653 buf = strstrip(buf);
7654 err = page_counter_memparse(buf, "max", &max);
7655 if (err)
7656 return err;
7657
7658 xchg(&memcg->zswap_max, max);
7659
7660 return nbytes;
7661 }
7662
7663 static struct cftype zswap_files[] = {
7664 {
7665 .name = "zswap.current",
7666 .flags = CFTYPE_NOT_ON_ROOT,
7667 .read_u64 = zswap_current_read,
7668 },
7669 {
7670 .name = "zswap.max",
7671 .flags = CFTYPE_NOT_ON_ROOT,
7672 .seq_show = zswap_max_show,
7673 .write = zswap_max_write,
7674 },
7675 { }
7676 };
7677 #endif
7678
/*
 * Register the swap control files (cgroup2 "memory.swap.*" and cgroup1
 * "memory.memsw.*") unless swap accounting has been disabled.  With the
 * memory controller itself disabled there is nothing to account, so
 * swap accounting is switched off as well.
 */
7686 static int __init mem_cgroup_swap_init(void)
7687 {
/* No memory control -> no swap control */
7689 if (mem_cgroup_disabled())
7690 cgroup_memory_noswap = true;
7691
7692 if (cgroup_memory_noswap)
7693 return 0;
7694
7695 WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, swap_files));
7696 WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, memsw_files));
7697 #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
7698 WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, zswap_files));
7699 #endif
7700 return 0;
7701 }
7702 core_initcall(mem_cgroup_swap_init);
7703
7704 #endif