// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/mm/oom_kill.c
 *
 *  The routines in this file are used to kill a process when we are
 *  seriously out of memory.  They are called from the page allocator
 *  once reclaim and compaction have failed, and from the memcg charge
 *  path when a memory cgroup hits its limit.
 */

#include <linux/oom.h>
#include <linux/mm.h>
#include <linux/err.h>
#include <linux/gfp.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/sched/task.h>
#include <linux/sched/debug.h>
#include <linux/swap.h>
#include <linux/syscalls.h>
#include <linux/timex.h>
#include <linux/jiffies.h>
#include <linux/cpuset.h>
#include <linux/export.h>
#include <linux/notifier.h>
#include <linux/memcontrol.h>
#include <linux/mempolicy.h>
#include <linux/security.h>
#include <linux/ptrace.h>
#include <linux/freezer.h>
#include <linux/ftrace.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <linux/init.h>
#include <linux/mmu_notifier.h>

#include <asm/tlb.h>
#include "internal.h"
#include "slab.h"

#define CREATE_TRACE_POINTS
#include <trace/events/oom.h>

static int sysctl_panic_on_oom;
static int sysctl_oom_kill_allocating_task;
static int sysctl_oom_dump_tasks = 1;

/*
 * Serializes oom killer invocations (out_of_memory()) from all contexts so
 * that concurrent allocation failures do not each pick and kill a victim,
 * and so that oom_killer_disable()/oom_killer_enable() can synchronize
 * against an ongoing invocation.
 */
DEFINE_MUTEX(oom_lock);
/* Serializes oom_score_adj and oom_score_adj_min updates */
DEFINE_MUTEX(oom_adj_mutex);

static inline bool is_memcg_oom(struct oom_control *oc)
{
	return oc->memcg != NULL;
}

#ifdef CONFIG_NUMA
/**
 * oom_cpuset_eligible() - check task eligibility for kill
 * @start: task struct of which task to consider
 * @oc: pointer to struct oom_control
 *
 * Task eligibility is determined by whether or not a candidate task, @start,
 * shares the same mempolicy nodes as current if it is bound by such a policy
 * and whether or not it has the same set of allowed cpuset nodes.
 *
 * This function assumes oom-killer context: 'current' has triggered the
 * oom killer.
 */
static bool oom_cpuset_eligible(struct task_struct *start,
				struct oom_control *oc)
{
	struct task_struct *tsk;
	bool ret = false;
	const nodemask_t *mask = oc->nodemask;

	rcu_read_lock();
	for_each_thread(start, tsk) {
		if (mask) {
			/*
			 * If this is a mempolicy constrained oom, tsk's
			 * cpuset is irrelevant.  Only return true if its
			 * mempolicy intersects current, otherwise it may be
			 * needlessly killed.
			 */
			ret = mempolicy_in_oom_domain(tsk, mask);
		} else {
			/*
			 * This is not a mempolicy constrained oom, so only
			 * check the mems of tsk's cpuset.
			 */
			ret = cpuset_mems_allowed_intersects(current, tsk);
		}
		if (ret)
			break;
	}
	rcu_read_unlock();

	return ret;
}
#else
static bool oom_cpuset_eligible(struct task_struct *tsk, struct oom_control *oc)
{
	return true;
}
#endif /* CONFIG_NUMA */

/*
 * A task may have detached its own ->mm while exiting.  Return p, or any
 * of its threads that still has a valid ->mm, with task_lock() held, or
 * NULL if no such thread exists.
 */
struct task_struct *find_lock_task_mm(struct task_struct *p)
{
	struct task_struct *t;

	rcu_read_lock();

	for_each_thread(p, t) {
		task_lock(t);
		if (likely(t->mm))
			goto found;
		task_unlock(t);
	}
	t = NULL;
found:
	rcu_read_unlock();

	return t;
}

/* The sysrq-triggered OOM kill passes order == -1. */
static inline bool is_sysrq_oom(struct oom_control *oc)
{
	return oc->order == -1;
}

/* return true if the task is not adequate as a candidate victim task. */
static bool oom_unkillable_task(struct task_struct *p)
{
	if (is_global_init(p))
		return true;
	if (p->flags & PF_KTHREAD)
		return true;
	return false;
}

/*
 * Check whether unreclaimable slab usage exceeds all user memory (LRU
 * pages).  If so, dump_unreclaimable_slab() is worth calling because the
 * OOM may well be caused by kernel slab consumption.
 */
static bool should_dump_unreclaim_slab(void)
{
	unsigned long nr_lru;

	nr_lru = global_node_page_state(NR_ACTIVE_ANON) +
		 global_node_page_state(NR_INACTIVE_ANON) +
		 global_node_page_state(NR_ACTIVE_FILE) +
		 global_node_page_state(NR_INACTIVE_FILE) +
		 global_node_page_state(NR_ISOLATED_ANON) +
		 global_node_page_state(NR_ISOLATED_FILE) +
		 global_node_page_state(NR_UNEVICTABLE);

	return (global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B) > nr_lru);
}

/**
 * oom_badness - heuristic function to determine which candidate task to kill
 * @p: task struct of which task we should calculate
 * @totalpages: total present RAM allowed for page allocation
 *
 * The heuristic for determining which task to kill is made to be as simple
 * and predictable as possible.  The goal is to return the highest value for
 * the task consuming the most memory to avoid subsequent oom failures.
 */
long oom_badness(struct task_struct *p, unsigned long totalpages)
{
	long points;
	long adj;

	if (oom_unkillable_task(p))
		return LONG_MIN;

	p = find_lock_task_mm(p);
	if (!p)
		return LONG_MIN;

	/*
	 * Do not even consider tasks which are explicitly marked oom
	 * unkillable, have already been oom reaped, or are in the middle
	 * of a vfork.
	 */
	adj = (long)p->signal->oom_score_adj;
	if (adj == OOM_SCORE_ADJ_MIN ||
			test_bit(MMF_OOM_SKIP, &p->mm->flags) ||
			in_vfork(p)) {
		task_unlock(p);
		return LONG_MIN;
	}

	/*
	 * The baseline for the badness score is the amount of RAM that the
	 * task's rss, page tables and swap space use.
	 */
	points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) +
		mm_pgtables_bytes(p->mm) / PAGE_SIZE;
	task_unlock(p);

	/* Normalize to oom_score_adj units */
	adj *= totalpages / 1000;
	points += adj;

	return points;
}
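
/*
 * Illustrative example (not from the original source): with
 * totalpages == 1,000,000 and a task whose rss + swap + page tables add
 * up to 200,000 pages, the base score is 200,000.  An oom_score_adj of
 * +300 then adds 300 * (1,000,000 / 1000) = 300,000 points, while -300
 * subtracts the same amount, i.e. each adjustment unit biases the score
 * by roughly 0.1% of usable memory.
 */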

static const char * const oom_constraint_text[] = {
	[CONSTRAINT_NONE] = "CONSTRAINT_NONE",
	[CONSTRAINT_CPUSET] = "CONSTRAINT_CPUSET",
	[CONSTRAINT_MEMORY_POLICY] = "CONSTRAINT_MEMORY_POLICY",
	[CONSTRAINT_MEMCG] = "CONSTRAINT_MEMCG",
};

/*
 * Determine the type of allocation constraint.
 */
static enum oom_constraint constrained_alloc(struct oom_control *oc)
{
	struct zone *zone;
	struct zoneref *z;
	enum zone_type highest_zoneidx = gfp_zone(oc->gfp_mask);
	bool cpuset_limited = false;
	int nid;

	if (is_memcg_oom(oc)) {
		oc->totalpages = mem_cgroup_get_max(oc->memcg) ?: 1;
		return CONSTRAINT_MEMCG;
	}

	/* Default to all available memory */
	oc->totalpages = totalram_pages() + total_swap_pages;

	if (!IS_ENABLED(CONFIG_NUMA))
		return CONSTRAINT_NONE;

	if (!oc->zonelist)
		return CONSTRAINT_NONE;

	/* Treat __GFP_THISNODE allocations as unconstrained. */
	if (oc->gfp_mask & __GFP_THISNODE)
		return CONSTRAINT_NONE;

	/*
	 * If the caller's nodemask does not cover all nodes with memory,
	 * a mempolicy is in effect; account only the allowed nodes.
	 */
	if (oc->nodemask &&
	    !nodes_subset(node_states[N_MEMORY], *oc->nodemask)) {
		oc->totalpages = total_swap_pages;
		for_each_node_mask(nid, *oc->nodemask)
			oc->totalpages += node_present_pages(nid);
		return CONSTRAINT_MEMORY_POLICY;
	}

	/* Check whether this allocation failure is caused by a cpuset limit */
	for_each_zone_zonelist_nodemask(zone, z, oc->zonelist,
			highest_zoneidx, oc->nodemask)
		if (!cpuset_zone_allowed(zone, oc->gfp_mask))
			cpuset_limited = true;

	if (cpuset_limited) {
		oc->totalpages = total_swap_pages;
		for_each_node_mask(nid, cpuset_current_mems_allowed)
			oc->totalpages += node_present_pages(nid);
		return CONSTRAINT_CPUSET;
	}
	return CONSTRAINT_NONE;
}
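
/*
 * Worked example (illustrative, not from the original source): on a
 * 4-node machine where the current task's cpuset is limited to nodes
 * 0-1, a failing GFP_HIGHUSER allocation can end up as CONSTRAINT_CPUSET
 * with oc->totalpages set to the present pages of nodes 0-1 plus swap,
 * so badness scores are normalized against the memory the task can
 * actually use rather than against the whole machine.
 */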

static int oom_evaluate_task(struct task_struct *task, void *arg)
{
	struct oom_control *oc = arg;
	long points;

	if (oom_unkillable_task(task))
		goto next;

	/* p may not have freeable memory in nodemask */
	if (!is_memcg_oom(oc) && !oom_cpuset_eligible(task, oc))
		goto next;

	/*
	 * This task already has access to memory reserves and is being
	 * killed.  Don't allow any other task to have access to the reserves
	 * unless the task has MMF_OOM_SKIP, because the chances that it
	 * would release any memory are quite low.
	 */
	if (!is_sysrq_oom(oc) && tsk_is_oom_victim(task)) {
		if (test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags))
			goto next;
		goto abort;
	}

	/*
	 * If the task is allocating a lot of memory and has been marked to
	 * be killed first if it triggers an oom, then select it.
	 */
	if (oom_task_origin(task)) {
		points = LONG_MAX;
		goto select;
	}

	points = oom_badness(task, oc->totalpages);
	if (points == LONG_MIN || points < oc->chosen_points)
		goto next;

select:
	if (oc->chosen)
		put_task_struct(oc->chosen);
	get_task_struct(task);
	oc->chosen = task;
	oc->chosen_points = points;
next:
	return 0;
abort:
	if (oc->chosen)
		put_task_struct(oc->chosen);
	oc->chosen = (void *)-1UL;
	return 1;
}
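
/*
 * Note (added for clarity): a non-zero return value stops the task scan
 * in select_bad_process(); oc->chosen is then set to -1UL as a sentinel
 * meaning "an eligible victim is already dying, do not pick another
 * one", which out_of_memory() treats as success without killing again.
 */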

/*
 * Simple selection loop.  We choose the process with the highest number
 * of 'points'.  In case the scan was aborted, oc->chosen is set to -1.
 */
static void select_bad_process(struct oom_control *oc)
{
	oc->chosen_points = LONG_MIN;

	if (is_memcg_oom(oc))
		mem_cgroup_scan_tasks(oc->memcg, oom_evaluate_task, oc);
	else {
		struct task_struct *p;

		rcu_read_lock();
		for_each_process(p)
			if (oom_evaluate_task(p, oc))
				break;
		rcu_read_unlock();
	}
}

static int dump_task(struct task_struct *p, void *arg)
{
	struct oom_control *oc = arg;
	struct task_struct *task;

	if (oom_unkillable_task(p))
		return 0;

	/* p may not have freeable memory in nodemask */
	if (!is_memcg_oom(oc) && !oom_cpuset_eligible(p, oc))
		return 0;

	task = find_lock_task_mm(p);
	if (!task) {
		/*
		 * All of p's threads have already detached their mm's.
		 * There's no need to report them; they can't be oom killed
		 * anyway.
		 */
		return 0;
	}

	pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu %5hd %s\n",
		task->pid, from_kuid(&init_user_ns, task_uid(task)),
		task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
		mm_pgtables_bytes(task->mm),
		get_mm_counter(task->mm, MM_SWAPENTS),
		task->signal->oom_score_adj, task->comm);
	task_unlock(task);

	return 0;
}

/**
 * dump_tasks - dump current memory state of all system tasks
 * @oc: pointer to struct oom_control
 *
 * Dumps the current memory state of all eligible tasks.  Tasks not in the
 * same memcg, not in the same cpuset, or bound to a disjoint nodemask are
 * not shown.
 *
 * State information includes the task's pid, uid, tgid, vm size, rss,
 * pgtables_bytes, swapents, oom_score_adj value, and name.
 */
static void dump_tasks(struct oom_control *oc)
{
	pr_info("Tasks state (memory values in pages):\n");
	pr_info("[  pid  ]   uid  tgid total_vm      rss pgtables_bytes swapents oom_score_adj name\n");

	if (is_memcg_oom(oc))
		mem_cgroup_scan_tasks(oc->memcg, dump_task, oc);
	else {
		struct task_struct *p;

		rcu_read_lock();
		for_each_process(p)
			dump_task(p, oc);
		rcu_read_unlock();
	}
}

static void dump_oom_summary(struct oom_control *oc, struct task_struct *victim)
{
	/* one line summary of the oom killer context. */
	pr_info("oom-kill:constraint=%s,nodemask=%*pbl",
			oom_constraint_text[oc->constraint],
			nodemask_pr_args(oc->nodemask));
	cpuset_print_current_mems_allowed();
	mem_cgroup_print_oom_context(oc->memcg, victim);
	pr_cont(",task=%s,pid=%d,uid=%d\n", victim->comm, victim->pid,
		from_kuid(&init_user_ns, task_uid(victim)));
}

static void dump_header(struct oom_control *oc, struct task_struct *p)
{
	pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n",
		current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order,
		current->signal->oom_score_adj);
	if (!IS_ENABLED(CONFIG_COMPACTION) && oc->order)
		pr_warn("COMPACTION is disabled!!!\n");

	dump_stack();
	if (is_memcg_oom(oc))
		mem_cgroup_print_oom_meminfo(oc->memcg);
	else {
		show_mem(SHOW_MEM_FILTER_NODES, oc->nodemask);
		if (should_dump_unreclaim_slab())
			dump_unreclaimable_slab();
	}
	if (sysctl_oom_dump_tasks)
		dump_tasks(oc);
	if (p)
		dump_oom_summary(oc, p);
}

/*
 * Number of OOM victims currently in flight (marked but not yet exited).
 */
static atomic_t oom_victims = ATOMIC_INIT(0);
static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait);

static bool oom_killer_disabled __read_mostly;

#define K(x) ((x) << (PAGE_SHIFT-10))
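
/*
 * K() converts a page count to kilobytes for printing; e.g. with 4 KiB
 * pages (PAGE_SHIFT == 12), K(25) == 25 << 2 == 100 (kB).
 */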

/*
 * task->mm can be NULL if the task is the exited group leader.  So to
 * determine whether the task is using a particular mm, we examine all the
 * task's threads: if one of those is using this mm then this task was also
 * using it.
 */
bool process_shares_mm(struct task_struct *p, struct mm_struct *mm)
{
	struct task_struct *t;

	for_each_thread(p, t) {
		struct mm_struct *t_mm = READ_ONCE(t->mm);
		if (t_mm)
			return t_mm == mm;
	}
	return false;
}
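
/*
 * Note (added for clarity): it is enough to look at the first thread that
 * still has a ->mm, because every live thread of a thread group shares the
 * same mm; the loop only exists to skip threads that have already passed
 * exit_mm().
 */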

#ifdef CONFIG_MMU
/*
 * OOM Reaper kernel thread which tries to reap the memory used by the OOM
 * victim (if that is possible) to help the OOM killer to move on.
 */
static struct task_struct *oom_reaper_th;
static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
static struct task_struct *oom_reaper_list;
static DEFINE_SPINLOCK(oom_reaper_lock);

bool __oom_reap_task_mm(struct mm_struct *mm)
{
	struct vm_area_struct *vma;
	bool ret = true;

	/*
	 * Tell all users of get_user/copy_from_user etc... that the content
	 * is no longer stable.  No barriers are really needed because
	 * unmapping should imply barriers already, and a reader would hit a
	 * page fault if it stumbled over reaped memory.
	 */
	set_bit(MMF_UNSTABLE, &mm->flags);

	for (vma = mm->mmap ; vma; vma = vma->vm_next) {
		if (vma->vm_flags & (VM_HUGETLB|VM_PFNMAP))
			continue;

		/*
		 * Only anonymous, private memory has a good chance of being
		 * freed here without additional work that we cannot afford
		 * while already OOM; shared file-backed mappings are left
		 * for exit_mmap() to tear down.
		 */
		if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) {
			struct mmu_notifier_range range;
			struct mmu_gather tlb;

			mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0,
						vma, mm, vma->vm_start,
						vma->vm_end);
			tlb_gather_mmu(&tlb, mm);
			if (mmu_notifier_invalidate_range_start_nonblock(&range)) {
				tlb_finish_mmu(&tlb);
				ret = false;
				continue;
			}
			unmap_page_range(&tlb, vma, range.start, range.end, NULL);
			mmu_notifier_invalidate_range_end(&range);
			tlb_finish_mmu(&tlb);
		}
	}

	return ret;
}
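
/*
 * Note (added for clarity): once MMF_UNSTABLE is set and the anonymous
 * ranges above have been unmapped, a fault on a reaped address is not
 * allowed to silently refault fresh zeroed memory; the fault path checks
 * this flag and delivers SIGBUS instead.
 */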

/*
 * Reaps the address space of the given task.
 *
 * Returns true on success and false if none or part of the address space
 * could not be reclaimed and the caller should retry later.
 */
static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
{
	bool ret = true;

	if (!mmap_read_trylock(mm)) {
		trace_skip_task_reaping(tsk->pid);
		return false;
	}

	/*
	 * MMF_OOM_SKIP is set by exit_mmap when the OOM reaper can't work on
	 * the mm anymore.  The check must run under mmap_lock for reading
	 * because it serializes against the mmap_write_lock();
	 * mmap_write_unlock() cycle in exit_mmap().
	 */
	if (test_bit(MMF_OOM_SKIP, &mm->flags)) {
		trace_skip_task_reaping(tsk->pid);
		goto out_unlock;
	}

	trace_start_task_reaping(tsk->pid);

	/* failed to reap part of the address space. Try again later */
	ret = __oom_reap_task_mm(mm);
	if (!ret)
		goto out_finish;

	pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
			task_pid_nr(tsk), tsk->comm,
			K(get_mm_counter(mm, MM_ANONPAGES)),
			K(get_mm_counter(mm, MM_FILEPAGES)),
			K(get_mm_counter(mm, MM_SHMEMPAGES)));
out_finish:
	trace_finish_task_reaping(tsk->pid);
out_unlock:
	mmap_read_unlock(mm);

	return ret;
}

#define MAX_OOM_REAP_RETRIES 10
static void oom_reap_task(struct task_struct *tsk)
{
	int attempts = 0;
	struct mm_struct *mm = tsk->signal->oom_mm;

	/* Retry the mmap_read_trylock(mm) a few times */
	while (attempts++ < MAX_OOM_REAP_RETRIES && !oom_reap_task_mm(tsk, mm))
		schedule_timeout_idle(HZ/10);

	if (attempts <= MAX_OOM_REAP_RETRIES ||
	    test_bit(MMF_OOM_SKIP, &mm->flags))
		goto done;

	pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
		task_pid_nr(tsk), tsk->comm);
	sched_show_task(tsk);
	debug_show_all_locks();

done:
	tsk->oom_reaper_list = NULL;

	/*
	 * Hide this mm from the OOM killer because it has been either reaped
	 * or somebody can't call mmap_write_unlock(mm).
	 */
	set_bit(MMF_OOM_SKIP, &mm->flags);

	/* Drop the reference taken by queue_oom_reaper */
	put_task_struct(tsk);
}

static int oom_reaper(void *unused)
{
	set_freezable();

	while (true) {
		struct task_struct *tsk = NULL;

		wait_event_freezable(oom_reaper_wait, oom_reaper_list != NULL);
		spin_lock_irq(&oom_reaper_lock);
		if (oom_reaper_list != NULL) {
			tsk = oom_reaper_list;
			oom_reaper_list = tsk->oom_reaper_list;
		}
		spin_unlock_irq(&oom_reaper_lock);

		if (tsk)
			oom_reap_task(tsk);
	}

	return 0;
}

static void wake_oom_reaper(struct timer_list *timer)
{
	struct task_struct *tsk = container_of(timer, struct task_struct,
			oom_reaper_timer);
	struct mm_struct *mm = tsk->signal->oom_mm;
	unsigned long flags;

	/* The victim managed to terminate on its own - nothing left to reap */
	if (test_bit(MMF_OOM_SKIP, &mm->flags)) {
		put_task_struct(tsk);
		return;
	}

	spin_lock_irqsave(&oom_reaper_lock, flags);
	tsk->oom_reaper_list = oom_reaper_list;
	oom_reaper_list = tsk;
	spin_unlock_irqrestore(&oom_reaper_lock, flags);
	trace_wake_reaper(tsk->pid);
	wake_up(&oom_reaper_wait);
}

/*
 * Give the OOM victim time to exit naturally before invoking the oom_reaper.
 * The delay is a trade-off: the longer it is, the longer the worst case for
 * the OOM can take; if it is too short, the reaper can get in the way of the
 * exit path and free memory (such as the futex robust list) that the dying
 * task still needs.
 */
#define OOM_REAPER_DELAY (2*HZ)
static void queue_oom_reaper(struct task_struct *tsk)
{
	/* mm is already queued? */
	if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags))
		return;

	get_task_struct(tsk);
	timer_setup(&tsk->oom_reaper_timer, wake_oom_reaper, 0);
	tsk->oom_reaper_timer.expires = jiffies + OOM_REAPER_DELAY;
	add_timer(&tsk->oom_reaper_timer);
}
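
/*
 * Illustrative timeline (not from the original source): the victim is sent
 * SIGKILL and marked with TIF_MEMDIE at t=0, the timer armed here fires at
 * t=2s, and only if the victim still has not set MMF_OOM_SKIP via
 * exit_mmap() by then does the reaper thread start tearing down its
 * anonymous memory.
 */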

#ifdef CONFIG_SYSCTL
static struct ctl_table vm_oom_kill_table[] = {
	{
		.procname	= "panic_on_oom",
		.data		= &sysctl_panic_on_oom,
		.maxlen		= sizeof(sysctl_panic_on_oom),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_TWO,
	},
	{
		.procname	= "oom_kill_allocating_task",
		.data		= &sysctl_oom_kill_allocating_task,
		.maxlen		= sizeof(sysctl_oom_kill_allocating_task),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "oom_dump_tasks",
		.data		= &sysctl_oom_dump_tasks,
		.maxlen		= sizeof(sysctl_oom_dump_tasks),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{}
};
#endif

static int __init oom_init(void)
{
	oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper");
#ifdef CONFIG_SYSCTL
	register_sysctl_init("vm", vm_oom_kill_table);
#endif
	return 0;
}
subsys_initcall(oom_init)
#else
static inline void queue_oom_reaper(struct task_struct *tsk)
{
}
#endif /* CONFIG_MMU */

/**
 * mark_oom_victim - mark the given task as an OOM victim
 * @tsk: task to mark
 *
 * Sets TIF_MEMDIE, pins tsk->mm in tsk->signal->oom_mm so the OOM reaper
 * can still operate on it after the task has exited, and thaws the task if
 * it is frozen.
 *
 * The caller has to make sure tsk->mm is stable (task_lock held, or tsk is
 * current).
 */
static void mark_oom_victim(struct task_struct *tsk)
{
	struct mm_struct *mm = tsk->mm;

	WARN_ON(oom_killer_disabled);
	/* OOM killer might race with memcg OOM */
	if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE))
		return;

	/* oom_mm is bound to the signal struct life time. */
	if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm)) {
		mmgrab(tsk->signal->oom_mm);
		set_bit(MMF_OOM_VICTIM, &mm->flags);
	}

	/*
	 * Wake the victim up even if it is frozen in uninterruptible sleep;
	 * a frozen task cannot exit, so the OOM killer would otherwise be
	 * unable to free any memory and could livelock.
	 */
	__thaw_task(tsk);
	atomic_inc(&oom_victims);
	trace_mark_victim(tsk->pid);
}

/**
 * exit_oom_victim - note the exit of an OOM victim
 */
void exit_oom_victim(void)
{
	clear_thread_flag(TIF_MEMDIE);

	if (!atomic_dec_return(&oom_victims))
		wake_up_all(&oom_victims_wait);
}

/**
 * oom_killer_enable - enable OOM killer
 */
void oom_killer_enable(void)
{
	oom_killer_disabled = false;
	pr_info("OOM killer enabled.\n");
}

/**
 * oom_killer_disable - disable OOM killer
 * @timeout: maximum timeout to wait for oom victims in jiffies
 *
 * Forces all page allocations to fail rather than trigger the OOM killer.
 * Will block and wait until all OOM victims have exited or the given
 * timeout expires.
 *
 * The function cannot be called when there are runnable user tasks because
 * userspace would see unexpected allocation failures as a result.  Any new
 * usage of this function should be consulted with MM people.
 *
 * Returns true if successful and false if the OOM killer cannot be
 * disabled.
 */
bool oom_killer_disable(signed long timeout)
{
	signed long ret;

	/*
	 * Make sure to not race with an ongoing OOM killer invocation and
	 * that the disable is visible to it.
	 */
	if (mutex_lock_killable(&oom_lock))
		return false;
	oom_killer_disabled = true;
	mutex_unlock(&oom_lock);

	ret = wait_event_interruptible_timeout(oom_victims_wait,
			!atomic_read(&oom_victims), timeout);
	if (ret <= 0) {
		oom_killer_enable();
		return false;
	}
	pr_info("OOM killer disabled.\n");

	return true;
}
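
/*
 * Usage sketch (added for context, not from the original source): the
 * suspend/hibernation freezer is the main caller, roughly
 *
 *	if (!oom_killer_disable(msecs_to_jiffies(freeze_timeout_msecs)))
 *		return -EBUSY;
 *
 * so that no new victims can appear while tasks are being frozen, with
 * oom_killer_enable() called again on thaw.
 */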

static inline bool __task_will_free_mem(struct task_struct *task)
{
	struct signal_struct *sig = task->signal;

	/*
	 * A coredumping process may sleep for an extended period in
	 * exit_mm(), so the oom killer cannot assume that the process will
	 * promptly exit and release memory.
	 */
	if (sig->core_state)
		return false;

	if (sig->flags & SIGNAL_GROUP_EXIT)
		return true;

	if (thread_group_empty(task) && (task->flags & PF_EXITING))
		return true;

	return false;
}

/*
 * Checks whether the given task is dying or exiting and likely to release
 * its address space.  This means that all threads and processes sharing the
 * same mm have to be killed or exiting.
 * The caller has to make sure that task->mm is stable (hold task_lock or
 * operate on current).
 */
static bool task_will_free_mem(struct task_struct *task)
{
	struct mm_struct *mm = task->mm;
	struct task_struct *p;
	bool ret = true;

	/*
	 * Skip tasks without an mm because they might have already passed
	 * exit_mm() and exit_oom_victim(); there is nothing left to free.
	 */
	if (!mm)
		return false;

	if (!__task_will_free_mem(task))
		return false;

	/*
	 * This task has already been drained by the oom reaper so there are
	 * only small chances it will free some more.
	 */
	if (test_bit(MMF_OOM_SKIP, &mm->flags))
		return false;

	if (atomic_read(&mm->mm_users) <= 1)
		return true;

	/*
	 * Make sure that all tasks which share the mm with the given task
	 * are dying as well to make sure that a) nobody pins its mm and
	 * b) the task is also reapable by the oom reaper.
	 */
	rcu_read_lock();
	for_each_process(p) {
		if (!process_shares_mm(p, mm))
			continue;
		if (same_thread_group(task, p))
			continue;
		ret = __task_will_free_mem(p);
		if (!ret)
			break;
	}
	rcu_read_unlock();

	return ret;
}

static void __oom_kill_process(struct task_struct *victim, const char *message)
{
	struct task_struct *p;
	struct mm_struct *mm;
	bool can_oom_reap = true;

	p = find_lock_task_mm(victim);
	if (!p) {
		pr_info("%s: OOM victim %d (%s) is already exiting. Skip killing the task\n",
			message, task_pid_nr(victim), victim->comm);
		put_task_struct(victim);
		return;
	} else if (victim != p) {
		get_task_struct(p);
		put_task_struct(victim);
		victim = p;
	}

	/* Get a reference to safely compare mm after task_unlock(victim) */
	mm = victim->mm;
	mmgrab(mm);

	/* Account the kill before delivering the signal. */
	count_vm_event(OOM_KILL);
	memcg_memory_event_mm(mm, MEMCG_OOM_KILL);

	/*
	 * We should send SIGKILL before granting access to memory reserves
	 * in order to prevent the OOM victim from depleting the memory
	 * reserves from the user space under its control.
	 */
	do_send_sig_info(SIGKILL, SEND_SIG_PRIV, victim, PIDTYPE_TGID);
	mark_oom_victim(victim);
	pr_err("%s: Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB, UID:%u pgtables:%lukB oom_score_adj:%hd\n",
		message, task_pid_nr(victim), victim->comm, K(mm->total_vm),
		K(get_mm_counter(mm, MM_ANONPAGES)),
		K(get_mm_counter(mm, MM_FILEPAGES)),
		K(get_mm_counter(mm, MM_SHMEMPAGES)),
		from_kuid(&init_user_ns, task_uid(victim)),
		mm_pgtables_bytes(mm) >> 10, victim->signal->oom_score_adj);
	task_unlock(victim);

	/*
	 * Kill all user processes sharing victim->mm in other thread groups,
	 * if any.  They don't get access to memory reserves, though, to
	 * avoid depletion of all memory.  This prevents mmap_lock livelock
	 * when an oom-killed thread cannot exit because it requires the lock
	 * and the lock is contended by another thread trying to allocate
	 * memory itself; that thread will now get access to memory reserves
	 * since it has a pending fatal signal.
	 */
	rcu_read_lock();
	for_each_process(p) {
		if (!process_shares_mm(p, mm))
			continue;
		if (same_thread_group(p, victim))
			continue;
		if (is_global_init(p)) {
			can_oom_reap = false;
			set_bit(MMF_OOM_SKIP, &mm->flags);
			pr_info("oom killer %d (%s) has mm pinned by %d (%s)\n",
				task_pid_nr(victim), victim->comm,
				task_pid_nr(p), p->comm);
			continue;
		}
		/*
		 * Kernel threads never fault on userspace memory, so it is
		 * safe to reap the mm even while they still reference it.
		 */
		if (unlikely(p->flags & PF_KTHREAD))
			continue;
		do_send_sig_info(SIGKILL, SEND_SIG_PRIV, p, PIDTYPE_TGID);
	}
	rcu_read_unlock();

	if (can_oom_reap)
		queue_oom_reaper(victim);

	mmdrop(mm);
	put_task_struct(victim);
}
#undef K

/*
 * Kill the provided task unless it is secured by setting oom_score_adj to
 * OOM_SCORE_ADJ_MIN.
 */
static int oom_kill_memcg_member(struct task_struct *task, void *message)
{
	if (task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN &&
	    !is_global_init(task)) {
		get_task_struct(task);
		__oom_kill_process(task, message);
	}
	return 0;
}

static void oom_kill_process(struct oom_control *oc, const char *message)
{
	struct task_struct *victim = oc->chosen;
	struct mem_cgroup *oom_group;
	static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
					      DEFAULT_RATELIMIT_BURST);

	/*
	 * If the task is already exiting, don't alarm the sysadmin or kill
	 * its children or threads, just give it access to memory reserves
	 * so it can die quickly.
	 */
	task_lock(victim);
	if (task_will_free_mem(victim)) {
		mark_oom_victim(victim);
		queue_oom_reaper(victim);
		task_unlock(victim);
		put_task_struct(victim);
		return;
	}
	task_unlock(victim);

	if (__ratelimit(&oom_rs))
		dump_header(oc, victim);

	/*
	 * Do we need to kill the entire memory cgroup?  Or even one of the
	 * ancestor memory cgroups?  Check this out before killing the
	 * victim task.
	 */
	oom_group = mem_cgroup_get_oom_group(victim, oc->memcg);

	__oom_kill_process(victim, message);

	/*
	 * If necessary, kill all tasks in the selected memory cgroup.
	 */
	if (oom_group) {
		memcg_memory_event(oom_group, MEMCG_OOM_GROUP_KILL);
		mem_cgroup_print_oom_group(oom_group);
		mem_cgroup_scan_tasks(oom_group, oom_kill_memcg_member,
				      (void *)message);
		mem_cgroup_put(oom_group);
	}
}

/*
 * Determines whether the kernel must panic because of the panic_on_oom sysctl.
 */
static void check_panic_on_oom(struct oom_control *oc)
{
	if (likely(!sysctl_panic_on_oom))
		return;
	if (sysctl_panic_on_oom != 2) {
		/*
		 * panic_on_oom == 1 only affects CONSTRAINT_NONE; the kernel
		 * does not panic for cpuset, mempolicy, or memcg allocation
		 * failures.
		 */
		if (oc->constraint != CONSTRAINT_NONE)
			return;
	}
	/* Do not panic for oom kills triggered by sysrq */
	if (is_sysrq_oom(oc))
		return;
	dump_header(oc, NULL);
	panic("Out of memory: %s panic_on_oom is enabled\n",
		sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
}
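
/*
 * Usage sketch (added for context): administrators set this through
 * /proc/sys/vm/panic_on_oom, e.g. "sysctl vm.panic_on_oom=1" panics only
 * on unconstrained (whole-system) OOMs, while "=2" panics even when the
 * OOM is confined to a cpuset, mempolicy, or memcg.
 */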

static BLOCKING_NOTIFIER_HEAD(oom_notify_list);

int register_oom_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_register(&oom_notify_list, nb);
}
EXPORT_SYMBOL_GPL(register_oom_notifier);

int unregister_oom_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_unregister(&oom_notify_list, nb);
}
EXPORT_SYMBOL_GPL(unregister_oom_notifier);

/**
 * out_of_memory - kill the "best" process when we run out of memory
 * @oc: pointer to struct oom_control
 *
 * If we run out of memory, we have the choice between either killing a
 * random task (bad), letting the system crash (worse), or trying to be
 * smart about which process to kill.  Note that we don't have to be
 * perfect here, we just have to be good.
 */
bool out_of_memory(struct oom_control *oc)
{
	unsigned long freed = 0;

	if (oom_killer_disabled)
		return false;

	if (!is_memcg_oom(oc)) {
		blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
		if (freed > 0 && !is_sysrq_oom(oc))
			/* Got some memory back in the last second. */
			return true;
	}

	/*
	 * If current has a pending SIGKILL or is exiting, then automatically
	 * select it.  The goal is to allow it to allocate so that it may
	 * quickly exit and free its memory.
	 */
	if (task_will_free_mem(current)) {
		mark_oom_victim(current);
		queue_oom_reaper(current);
		return true;
	}

	/*
	 * The OOM killer does not compensate for IO-less reclaim.
	 * pagefault_out_of_memory lost its gfp context so we have to make
	 * sure to exclude a 0 mask - all other users should have at least
	 * ___GFP_DIRECT_RECLAIM to get here.  But mem_cgroup_oom() has to
	 * invoke the OOM killer even if it is a GFP_NOFS allocation.
	 */
	if (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS) && !is_memcg_oom(oc))
		return true;

	/*
	 * Check if there were limitations on the allocation (only relevant
	 * for NUMA and memcg) that may require different handling.
	 */
	oc->constraint = constrained_alloc(oc);
	if (oc->constraint != CONSTRAINT_MEMORY_POLICY)
		oc->nodemask = NULL;
	check_panic_on_oom(oc);

	if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task &&
	    current->mm && !oom_unkillable_task(current) &&
	    oom_cpuset_eligible(current, oc) &&
	    current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
		get_task_struct(current);
		oc->chosen = current;
		oom_kill_process(oc, "Out of memory (oom_kill_allocating_task)");
		return true;
	}

	select_bad_process(oc);
	/* Found nothing?!?! */
	if (!oc->chosen) {
		dump_header(oc, NULL);
		pr_warn("Out of memory and no killable processes...\n");
		/*
		 * If we got here due to an actual allocation at the system
		 * level, we cannot survive this and will enter an endless
		 * loop in the allocator.  Bail out now.
		 */
		if (!is_sysrq_oom(oc) && !is_memcg_oom(oc))
			panic("System is deadlocked on memory\n");
	}
	if (oc->chosen && oc->chosen != (void *)-1UL)
		oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" :
				 "Memory cgroup out of memory");
	return !!oc->chosen;
}

/*
 * The pagefault handler calls here because some allocation has failed.  We
 * have to take care of the memcg OOM here because this is the only safe
 * context without any locks held, but let the oom killer triggered from
 * the allocation context care about the global OOM.
 */
void pagefault_out_of_memory(void)
{
	static DEFINE_RATELIMIT_STATE(pfoom_rs, DEFAULT_RATELIMIT_INTERVAL,
				      DEFAULT_RATELIMIT_BURST);

	if (mem_cgroup_oom_synchronize(true))
		return;

	if (fatal_signal_pending(current))
		return;

	if (__ratelimit(&pfoom_rs))
		pr_warn("Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF\n");
}

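/*
 * Illustrative userspace usage (not part of the original source): a typical
 * caller kills the target first and then reaps its address space ahead of
 * the normal exit path, e.g.
 *
 *	int pidfd = syscall(SYS_pidfd_open, pid, 0);
 *	syscall(SYS_pidfd_send_signal, pidfd, SIGKILL, NULL, 0);
 *	syscall(SYS_process_mrelease, pidfd, 0);
 *
 * process_mrelease() only succeeds when the target is already dying or has
 * been oom-reaped, as checked below.
 */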
SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags)
{
#ifdef CONFIG_MMU
	struct mm_struct *mm = NULL;
	struct task_struct *task;
	struct task_struct *p;
	unsigned int f_flags;
	bool reap = false;
	long ret = 0;

	if (flags)
		return -EINVAL;

	task = pidfd_get_task(pidfd, &f_flags);
	if (IS_ERR(task))
		return PTR_ERR(task);

	/*
	 * Make sure to choose a thread which still has a reference to mm
	 * during the group exit.
	 */
	p = find_lock_task_mm(task);
	if (!p) {
		ret = -ESRCH;
		goto put_task;
	}

	mm = p->mm;
	mmgrab(mm);

	if (task_will_free_mem(p))
		reap = true;
	else {
		/* Error only if the work has not been done already */
		if (!test_bit(MMF_OOM_SKIP, &mm->flags))
			ret = -EINVAL;
	}
	task_unlock(p);

	if (!reap)
		goto drop_mm;

	if (mmap_read_lock_killable(mm)) {
		ret = -EINTR;
		goto drop_mm;
	}
	/*
	 * Check MMF_OOM_SKIP again under mmap_read_lock protection to ensure
	 * a possible change in exit_mmap is seen.
	 */
	if (!test_bit(MMF_OOM_SKIP, &mm->flags) && !__oom_reap_task_mm(mm))
		ret = -EAGAIN;
	mmap_read_unlock(mm);

drop_mm:
	mmdrop(mm);
put_task:
	put_task_struct(task);
	return ret;
#else
	return -ENOSYS;
#endif /* CONFIG_MMU */
}