0001 /*
0002  *  linux/mm/oom_kill.c
0003  * 
0004  *  Copyright (C)  1998,2000  Rik van Riel
0005  *  Thanks go out to Claus Fischer for some serious inspiration and
0006  *  for goading me into coding this file...
0007  *  Copyright (C)  2010  Google, Inc.
0008  *  Rewritten by David Rientjes
0009  *
0010  *  The routines in this file are used to kill a process when
0011  *  we're seriously out of memory. This gets called from __alloc_pages()
0012  *  in mm/page_alloc.c when we really run out of memory.
0013  *
0014  *  Since we won't call these routines often (on a well-configured
0015  *  machine) this file will double as a 'coding guide' and a signpost
0016  *  for newbie kernel hackers. It features several pointers to major
0017  *  kernel subsystems and hints as to where to find out what things do.
0018  */
0019 
0020 #include <linux/oom.h>
0021 #include <linux/mm.h>
0022 #include <linux/err.h>
0023 #include <linux/gfp.h>
0024 #include <linux/sched.h>
0025 #include <linux/swap.h>
0026 #include <linux/timex.h>
0027 #include <linux/jiffies.h>
0028 #include <linux/cpuset.h>
0029 #include <linux/export.h>
0030 #include <linux/notifier.h>
0031 #include <linux/memcontrol.h>
0032 #include <linux/mempolicy.h>
0033 #include <linux/security.h>
0034 #include <linux/ptrace.h>
0035 #include <linux/freezer.h>
0036 #include <linux/ftrace.h>
0037 #include <linux/ratelimit.h>
0038 #include <linux/kthread.h>
0039 #include <linux/init.h>
0040 
0041 #include <asm/tlb.h>
0042 #include "internal.h"
0043 
0044 #define CREATE_TRACE_POINTS
0045 #include <trace/events/oom.h>
0046 
0047 int sysctl_panic_on_oom;
0048 int sysctl_oom_kill_allocating_task;
0049 int sysctl_oom_dump_tasks = 1;
0050 
0051 DEFINE_MUTEX(oom_lock);
0052 
0053 #ifdef CONFIG_NUMA
0054 /**
0055  * has_intersects_mems_allowed() - check task eligibility for kill
0056  * @start: task struct of the task whose thread group is considered
0057  * @mask: nodemask passed to page allocator for mempolicy ooms
0058  *
0059  * Task eligibility is determined by whether or not a candidate thread of
0060  * @start shares the same mempolicy nodes as current if it is bound by such
0061  * a policy, and whether or not it has the same set of allowed cpuset nodes.
0062  */
0063 static bool has_intersects_mems_allowed(struct task_struct *start,
0064                     const nodemask_t *mask)
0065 {
0066     struct task_struct *tsk;
0067     bool ret = false;
0068 
0069     rcu_read_lock();
0070     for_each_thread(start, tsk) {
0071         if (mask) {
0072             /*
0073              * If this is a mempolicy constrained oom, tsk's
0074              * cpuset is irrelevant.  Only return true if its
0075              * mempolicy intersects current, otherwise it may be
0076              * needlessly killed.
0077              */
0078             ret = mempolicy_nodemask_intersects(tsk, mask);
0079         } else {
0080             /*
0081              * This is not a mempolicy constrained oom, so only
0082              * check the mems of tsk's cpuset.
0083              */
0084             ret = cpuset_mems_allowed_intersects(current, tsk);
0085         }
0086         if (ret)
0087             break;
0088     }
0089     rcu_read_unlock();
0090 
0091     return ret;
0092 }
0093 #else
0094 static bool has_intersects_mems_allowed(struct task_struct *tsk,
0095                     const nodemask_t *mask)
0096 {
0097     return true;
0098 }
0099 #endif /* CONFIG_NUMA */
0100 
0101 /*
0102  * The process p may have detached its own ->mm while exiting or through
0103  * use_mm(), but one or more of its subthreads may still have a valid
0104  * pointer.  Return p, or any of its subthreads with a valid ->mm, with
0105  * task_lock() held.
0106  */
0107 struct task_struct *find_lock_task_mm(struct task_struct *p)
0108 {
0109     struct task_struct *t;
0110 
0111     rcu_read_lock();
0112 
0113     for_each_thread(p, t) {
0114         task_lock(t);
0115         if (likely(t->mm))
0116             goto found;
0117         task_unlock(t);
0118     }
0119     t = NULL;
0120 found:
0121     rcu_read_unlock();
0122 
0123     return t;
0124 }
0125 
0126 /*
0127  * order == -1 means the oom kill was requested via sysrq; any other order
0128  * is only used for display purposes.
0129  */
0130 static inline bool is_sysrq_oom(struct oom_control *oc)
0131 {
0132     return oc->order == -1;
0133 }
0134 
0135 static inline bool is_memcg_oom(struct oom_control *oc)
0136 {
0137     return oc->memcg != NULL;
0138 }
0139 
0140 /* return true if the task is not adequate as candidate victim task. */
0141 static bool oom_unkillable_task(struct task_struct *p,
0142         struct mem_cgroup *memcg, const nodemask_t *nodemask)
0143 {
0144     if (is_global_init(p))
0145         return true;
0146     if (p->flags & PF_KTHREAD)
0147         return true;
0148 
0149     /* When in mem_cgroup_out_of_memory() and p is not a member of the group */
0150     if (memcg && !task_in_mem_cgroup(p, memcg))
0151         return true;
0152 
0153     /* p may not have freeable memory in nodemask */
0154     if (!has_intersects_mems_allowed(p, nodemask))
0155         return true;
0156 
0157     return false;
0158 }
0159 
0160 /**
0161  * oom_badness - heuristic function to determine which candidate task to kill
0162  * @p: task struct of the task whose badness we should calculate
0163  * @totalpages: total present RAM allowed for page allocation
0164  *
0165  * The heuristic for determining which task to kill is made to be as simple and
0166  * predictable as possible.  The goal is to return the highest value for the
0167  * task consuming the most memory to avoid subsequent oom failures.
0168  */
0169 unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
0170               const nodemask_t *nodemask, unsigned long totalpages)
0171 {
0172     long points;
0173     long adj;
0174 
0175     if (oom_unkillable_task(p, memcg, nodemask))
0176         return 0;
0177 
0178     p = find_lock_task_mm(p);
0179     if (!p)
0180         return 0;
0181 
0182     /*
0183      * Do not even consider tasks which are explicitly marked oom
0184      * unkillable, have already been oom reaped, or are in the
0185      * middle of a vfork.
0186      */
0187     adj = (long)p->signal->oom_score_adj;
0188     if (adj == OOM_SCORE_ADJ_MIN ||
0189             test_bit(MMF_OOM_SKIP, &p->mm->flags) ||
0190             in_vfork(p)) {
0191         task_unlock(p);
0192         return 0;
0193     }
0194 
0195     /*
0196      * The baseline for the badness score is the proportion of RAM that each
0197      * task's rss, pagetable and swap space use.
0198      */
0199     points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) +
0200         atomic_long_read(&p->mm->nr_ptes) + mm_nr_pmds(p->mm);
0201     task_unlock(p);
0202 
0203     /*
0204      * Root processes get a 3% bonus, just like the __vm_enough_memory()
0205      * implementation used by LSMs.
0206      */
0207     if (has_capability_noaudit(p, CAP_SYS_ADMIN))
0208         points -= (points * 3) / 100;
0209 
0210     /* Normalize to oom_score_adj units */
0211     adj *= totalpages / 1000;
0212     points += adj;
0213 
0214     /*
0215      * Never return 0 for an eligible task regardless of the root bonus and
0216      * oom_score_adj (oom_score_adj can't be OOM_SCORE_ADJ_MIN here).
0217      */
0218     return points > 0 ? points : 1;
0219 }
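/*
 * Worked example with hypothetical numbers: with totalpages = 1,000,000
 * pages, a task whose rss, swap entries and page tables add up to 100,000
 * pages scores 100,000.  Each oom_score_adj unit is worth totalpages / 1000
 * = 1,000 pages, so oom_score_adj = 500 would add 500,000 points, while the
 * 3% root discount would subtract 3,000.
 */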
0220 
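/*
 * Types of limitations on the page allocation, as determined by
 * constrained_alloc() below:
 *   CONSTRAINT_NONE          - unconstrained; all of RAM plus swap is usable
 *   CONSTRAINT_CPUSET        - limited by the current cpuset's allowed mems
 *   CONSTRAINT_MEMORY_POLICY - limited by a mempolicy nodemask
 *   CONSTRAINT_MEMCG         - a memory cgroup hit its limit
 */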
0221 enum oom_constraint {
0222     CONSTRAINT_NONE,
0223     CONSTRAINT_CPUSET,
0224     CONSTRAINT_MEMORY_POLICY,
0225     CONSTRAINT_MEMCG,
0226 };
0227 
0228 /*
0229  * Determine the type of allocation constraint.
0230  */
0231 static enum oom_constraint constrained_alloc(struct oom_control *oc)
0232 {
0233     struct zone *zone;
0234     struct zoneref *z;
0235     enum zone_type high_zoneidx = gfp_zone(oc->gfp_mask);
0236     bool cpuset_limited = false;
0237     int nid;
0238 
0239     if (is_memcg_oom(oc)) {
0240         oc->totalpages = mem_cgroup_get_limit(oc->memcg) ?: 1;
0241         return CONSTRAINT_MEMCG;
0242     }
0243 
0244     /* Default to all available memory */
0245     oc->totalpages = totalram_pages + total_swap_pages;
0246 
0247     if (!IS_ENABLED(CONFIG_NUMA))
0248         return CONSTRAINT_NONE;
0249 
0250     if (!oc->zonelist)
0251         return CONSTRAINT_NONE;
0252     /*
0253      * We only reach here when __GFP_NOFAIL is used, so we should avoid
0254      * killing current and instead pick a victim system-wide.  Ideally this
0255      * would be CONSTRAINT_THISNODE, but there is no way to handle that yet.
0256      */
0257     if (oc->gfp_mask & __GFP_THISNODE)
0258         return CONSTRAINT_NONE;
0259 
0260     /*
0261      * This is not a __GFP_THISNODE allocation, so a truncated nodemask in
0262      * the page allocator means a mempolicy is in effect.  Cpuset policy
0263      * is enforced in get_page_from_freelist().
0264      */
0265     if (oc->nodemask &&
0266         !nodes_subset(node_states[N_MEMORY], *oc->nodemask)) {
0267         oc->totalpages = total_swap_pages;
0268         for_each_node_mask(nid, *oc->nodemask)
0269             oc->totalpages += node_spanned_pages(nid);
0270         return CONSTRAINT_MEMORY_POLICY;
0271     }
0272 
0273     /* Check whether this allocation failure is caused by the cpuset's mems limit */
0274     for_each_zone_zonelist_nodemask(zone, z, oc->zonelist,
0275             high_zoneidx, oc->nodemask)
0276         if (!cpuset_zone_allowed(zone, oc->gfp_mask))
0277             cpuset_limited = true;
0278 
0279     if (cpuset_limited) {
0280         oc->totalpages = total_swap_pages;
0281         for_each_node_mask(nid, cpuset_current_mems_allowed)
0282             oc->totalpages += node_spanned_pages(nid);
0283         return CONSTRAINT_CPUSET;
0284     }
0285     return CONSTRAINT_NONE;
0286 }
0287 
0288 static int oom_evaluate_task(struct task_struct *task, void *arg)
0289 {
0290     struct oom_control *oc = arg;
0291     unsigned long points;
0292 
0293     if (oom_unkillable_task(task, NULL, oc->nodemask))
0294         goto next;
0295 
0296     /*
0297      * This task already has access to memory reserves and is being killed.
0298      * Don't allow any other task to have access to the reserves unless
0299      * the task has MMF_OOM_SKIP, because the chances that it would release
0300      * any memory are quite low.
0301      */
0302     if (!is_sysrq_oom(oc) && tsk_is_oom_victim(task)) {
0303         if (test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags))
0304             goto next;
0305         goto abort;
0306     }
0307 
0308     /*
0309      * If task is allocating a lot of memory and has been marked to be
0310      * killed first if it triggers an oom, then select it.
0311      */
0312     if (oom_task_origin(task)) {
0313         points = ULONG_MAX;
0314         goto select;
0315     }
0316 
0317     points = oom_badness(task, NULL, oc->nodemask, oc->totalpages);
0318     if (!points || points < oc->chosen_points)
0319         goto next;
0320 
0321     /* Prefer thread group leaders for display purposes */
0322     if (points == oc->chosen_points && thread_group_leader(oc->chosen))
0323         goto next;
0324 select:
0325     if (oc->chosen)
0326         put_task_struct(oc->chosen);
0327     get_task_struct(task);
0328     oc->chosen = task;
0329     oc->chosen_points = points;
0330 next:
0331     return 0;
0332 abort:
0333     if (oc->chosen)
0334         put_task_struct(oc->chosen);
0335     oc->chosen = (void *)-1UL;
0336     return 1;
0337 }
0338 
0339 /*
0340  * Simple selection loop. We choose the process with the highest number of
0341  * 'points'. If the scan was aborted, oc->chosen is set to -1.
0342  */
0343 static void select_bad_process(struct oom_control *oc)
0344 {
0345     if (is_memcg_oom(oc))
0346         mem_cgroup_scan_tasks(oc->memcg, oom_evaluate_task, oc);
0347     else {
0348         struct task_struct *p;
0349 
0350         rcu_read_lock();
0351         for_each_process(p)
0352             if (oom_evaluate_task(p, oc))
0353                 break;
0354         rcu_read_unlock();
0355     }
0356 
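    /*
     * Scale the raw badness to a 0..1000 range relative to totalpages;
     * e.g. a chosen task using about half of the allowed memory (with
     * oom_score_adj 0) ends up with roughly 500 points.
     */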
0357     oc->chosen_points = oc->chosen_points * 1000 / oc->totalpages;
0358 }
0359 
0360 /**
0361  * dump_tasks - dump current memory state of all system tasks
0362  * @memcg: current's memory controller, if constrained
0363  * @nodemask: nodemask passed to page allocator for mempolicy ooms
0364  *
0365  * Dumps the current memory state of all eligible tasks.  Tasks not in the same
0366  * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes
0367  * are not shown.
0368  * State information includes task's pid, uid, tgid, vm size, rss, nr_ptes,
0369  * nr_pmds, swapents, oom_score_adj value, and name.
0370  */
0371 static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
0372 {
0373     struct task_struct *p;
0374     struct task_struct *task;
0375 
0376     pr_info("[ pid ]   uid  tgid total_vm      rss nr_ptes nr_pmds swapents oom_score_adj name\n");
0377     rcu_read_lock();
0378     for_each_process(p) {
0379         if (oom_unkillable_task(p, memcg, nodemask))
0380             continue;
0381 
0382         task = find_lock_task_mm(p);
0383         if (!task) {
0384             /*
0385              * This is a kthread or all of p's threads have already
0386              * detached their mm's.  There's no need to report
0387              * them; they can't be oom killed anyway.
0388              */
0389             continue;
0390         }
0391 
0392         pr_info("[%5d] %5d %5d %8lu %8lu %7ld %7ld %8lu         %5hd %s\n",
0393             task->pid, from_kuid(&init_user_ns, task_uid(task)),
0394             task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
0395             atomic_long_read(&task->mm->nr_ptes),
0396             mm_nr_pmds(task->mm),
0397             get_mm_counter(task->mm, MM_SWAPENTS),
0398             task->signal->oom_score_adj, task->comm);
0399         task_unlock(task);
0400     }
0401     rcu_read_unlock();
0402 }
0403 
0404 static void dump_header(struct oom_control *oc, struct task_struct *p)
0405 {
0406     nodemask_t *nm = (oc->nodemask) ? oc->nodemask : &cpuset_current_mems_allowed;
0407 
0408     pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), nodemask=%*pbl, order=%d, oom_score_adj=%hd\n",
0409         current->comm, oc->gfp_mask, &oc->gfp_mask,
0410         nodemask_pr_args(nm), oc->order,
0411         current->signal->oom_score_adj);
0412     if (!IS_ENABLED(CONFIG_COMPACTION) && oc->order)
0413         pr_warn("COMPACTION is disabled!!!\n");
0414 
0415     cpuset_print_current_mems_allowed();
0416     dump_stack();
0417     if (oc->memcg)
0418         mem_cgroup_print_oom_info(oc->memcg, p);
0419     else
0420         show_mem(SHOW_MEM_FILTER_NODES);
0421     if (sysctl_oom_dump_tasks)
0422         dump_tasks(oc->memcg, oc->nodemask);
0423 }
0424 
0425 /*
0426  * Number of OOM victims in flight
0427  */
0428 static atomic_t oom_victims = ATOMIC_INIT(0);
0429 static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait);
0430 
0431 static bool oom_killer_disabled __read_mostly;
0432 
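/*
 * Convert a number of pages to kilobytes for printing; e.g. with 4kB pages
 * (PAGE_SHIFT == 12) this is x << 2, i.e. x * 4.
 */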
0433 #define K(x) ((x) << (PAGE_SHIFT-10))
0434 
0435 /*
0436  * task->mm can be NULL if the task is the exited group leader.  So to
0437  * determine whether the task is using a particular mm, we examine all the
0438  * task's threads: if one of those is using this mm then this task was also
0439  * using it.
0440  */
0441 bool process_shares_mm(struct task_struct *p, struct mm_struct *mm)
0442 {
0443     struct task_struct *t;
0444 
0445     for_each_thread(p, t) {
0446         struct mm_struct *t_mm = READ_ONCE(t->mm);
0447         if (t_mm)
0448             return t_mm == mm;
0449     }
0450     return false;
0451 }
0452 
0453 
0454 #ifdef CONFIG_MMU
0455 /*
0456  * OOM Reaper kernel thread which tries to reap the memory used by the OOM
0457  * victim (if that is possible) to help the OOM killer to move on.
0458  */
0459 static struct task_struct *oom_reaper_th;
0460 static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
0461 static struct task_struct *oom_reaper_list;
0462 static DEFINE_SPINLOCK(oom_reaper_lock);
0463 
0464 static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
0465 {
0466     struct mmu_gather tlb;
0467     struct vm_area_struct *vma;
0468     struct zap_details details = {.check_swap_entries = true,
0469                       .ignore_dirty = true};
0470     bool ret = true;
0471 
0472     /*
0473      * We have to make sure to not race with the victim exit path
0474      * and cause premature new oom victim selection:
0475      * __oom_reap_task_mm       exit_mm
0476      *   mmget_not_zero
0477      *                mmput
0478      *                  atomic_dec_and_test
0479      *                exit_oom_victim
0480      *              [...]
0481      *              out_of_memory
0482      *                select_bad_process
0483      *                  # no TIF_MEMDIE task selects new victim
0484      *  unmap_page_range # frees some memory
0485      */
0486     mutex_lock(&oom_lock);
0487 
0488     if (!down_read_trylock(&mm->mmap_sem)) {
0489         ret = false;
0490         goto unlock_oom;
0491     }
0492 
0493     /*
0494      * Increase mm_users only after we know we will reap something, so
0495      * that mmput_async is called only when we have reaped something and
0496      * the delayed __mmput doesn't matter that much.
0497      */
0498     if (!mmget_not_zero(mm)) {
0499         up_read(&mm->mmap_sem);
0500         goto unlock_oom;
0501     }
0502 
0503     /*
0504      * Tell all users of get_user/copy_from_user etc... that the content
0505      * is no longer stable. No barriers are really needed because unmapping
0506      * should imply barriers already and the reader would hit a page fault
0507      * if it stumbled over reaped memory.
0508      */
0509     set_bit(MMF_UNSTABLE, &mm->flags);
0510 
0511     tlb_gather_mmu(&tlb, mm, 0, -1);
0512     for (vma = mm->mmap ; vma; vma = vma->vm_next) {
0513         if (is_vm_hugetlb_page(vma))
0514             continue;
0515 
0516         /*
0517          * mlocked VMAs require explicit munlocking before unmap.
0518          * Let's keep it simple here and skip such VMAs.
0519          */
0520         if (vma->vm_flags & VM_LOCKED)
0521             continue;
0522 
0523         /*
0524          * Only anonymous pages have a good chance to be dropped
0525          * without additional steps which we cannot afford as we
0526          * are OOM already.
0527          *
0528          * We do not even care about fs backed pages because all
0529          * which are reclaimable have already been reclaimed and
0530          * we do not want to block exit_mmap by keeping mm ref
0531          * count elevated without a good reason.
0532          */
0533         if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED))
0534             unmap_page_range(&tlb, vma, vma->vm_start, vma->vm_end,
0535                      &details);
0536     }
0537     tlb_finish_mmu(&tlb, 0, -1);
0538     pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
0539             task_pid_nr(tsk), tsk->comm,
0540             K(get_mm_counter(mm, MM_ANONPAGES)),
0541             K(get_mm_counter(mm, MM_FILEPAGES)),
0542             K(get_mm_counter(mm, MM_SHMEMPAGES)));
0543     up_read(&mm->mmap_sem);
0544 
0545     /*
0546      * Drop our reference but make sure the mmput slow path is called from a
0547      * different context, because we shouldn't risk getting stuck there and
0548      * putting the oom_reaper out of the way.
0549      */
0550     mmput_async(mm);
0551 unlock_oom:
0552     mutex_unlock(&oom_lock);
0553     return ret;
0554 }
0555 
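/*
 * Together with the HZ/10 sleep between attempts below, ten retries give the
 * victim roughly one second to release mmap_sem before the reaper gives up.
 */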
0556 #define MAX_OOM_REAP_RETRIES 10
0557 static void oom_reap_task(struct task_struct *tsk)
0558 {
0559     int attempts = 0;
0560     struct mm_struct *mm = tsk->signal->oom_mm;
0561 
0562     /* Retry the down_read_trylock(mmap_sem) a few times */
0563     while (attempts++ < MAX_OOM_REAP_RETRIES && !__oom_reap_task_mm(tsk, mm))
0564         schedule_timeout_idle(HZ/10);
0565 
0566     if (attempts <= MAX_OOM_REAP_RETRIES)
0567         goto done;
0568 
0569 
0570     pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
0571         task_pid_nr(tsk), tsk->comm);
0572     debug_show_all_locks();
0573 
0574 done:
0575     tsk->oom_reaper_list = NULL;
0576 
0577     /*
0578      * Hide this mm from the OOM killer because it has either been reaped
0579      * or somebody got stuck and never called up_write(mmap_sem).
0580      */
0581     set_bit(MMF_OOM_SKIP, &mm->flags);
0582 
0583     /* Drop a reference taken by wake_oom_reaper */
0584     put_task_struct(tsk);
0585 }
0586 
0587 static int oom_reaper(void *unused)
0588 {
0589     while (true) {
0590         struct task_struct *tsk = NULL;
0591 
0592         wait_event_freezable(oom_reaper_wait, oom_reaper_list != NULL);
0593         spin_lock(&oom_reaper_lock);
0594         if (oom_reaper_list != NULL) {
0595             tsk = oom_reaper_list;
0596             oom_reaper_list = tsk->oom_reaper_list;
0597         }
0598         spin_unlock(&oom_reaper_lock);
0599 
0600         if (tsk)
0601             oom_reap_task(tsk);
0602     }
0603 
0604     return 0;
0605 }
0606 
0607 static void wake_oom_reaper(struct task_struct *tsk)
0608 {
0609     if (!oom_reaper_th)
0610         return;
0611 
0612     /* tsk is already queued? */
0613     if (tsk == oom_reaper_list || tsk->oom_reaper_list)
0614         return;
0615 
0616     get_task_struct(tsk);
0617 
0618     spin_lock(&oom_reaper_lock);
0619     tsk->oom_reaper_list = oom_reaper_list;
0620     oom_reaper_list = tsk;
0621     spin_unlock(&oom_reaper_lock);
0622     wake_up(&oom_reaper_wait);
0623 }
0624 
0625 static int __init oom_init(void)
0626 {
0627     oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper");
0628     if (IS_ERR(oom_reaper_th)) {
0629         pr_err("Unable to start OOM reaper %ld. Continuing regardless\n",
0630                 PTR_ERR(oom_reaper_th));
0631         oom_reaper_th = NULL;
0632     }
0633     return 0;
0634 }
0635 subsys_initcall(oom_init)
0636 #else
0637 static inline void wake_oom_reaper(struct task_struct *tsk)
0638 {
0639 }
0640 #endif /* CONFIG_MMU */
0641 
0642 /**
0643  * mark_oom_victim - mark the given task as OOM victim
0644  * @tsk: task to mark
0645  *
0646  * Has to be called with oom_lock held and never after
0647  * oom has been disabled already.
0648  *
0649  * tsk->mm has to be non-NULL and the caller has to guarantee it is stable
0650  * (either by holding task_lock or by operating on current).
0651  */
0652 static void mark_oom_victim(struct task_struct *tsk)
0653 {
0654     struct mm_struct *mm = tsk->mm;
0655 
0656     WARN_ON(oom_killer_disabled);
0657     /* OOM killer might race with memcg OOM */
0658     if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE))
0659         return;
0660 
0661     /* oom_mm is bound to the signal struct lifetime. */
0662     if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm))
0663         atomic_inc(&tsk->signal->oom_mm->mm_count);
0664 
0665     /*
0666      * Make sure that the task is woken up from uninterruptible sleep
0667      * if it is frozen, because otherwise the OOM killer wouldn't be able
0668      * to free any memory and would livelock. freezing_slow_path will tell
0669      * the freezer that TIF_MEMDIE tasks should be ignored.
0670      */
0671     __thaw_task(tsk);
0672     atomic_inc(&oom_victims);
0673 }
0674 
0675 /**
0676  * exit_oom_victim - note the exit of an OOM victim
0677  */
0678 void exit_oom_victim(void)
0679 {
0680     clear_thread_flag(TIF_MEMDIE);
0681 
0682     if (!atomic_dec_return(&oom_victims))
0683         wake_up_all(&oom_victims_wait);
0684 }
0685 
0686 /**
0687  * oom_killer_enable - enable OOM killer
0688  */
0689 void oom_killer_enable(void)
0690 {
0691     oom_killer_disabled = false;
0692 }
0693 
0694 /**
0695  * oom_killer_disable - disable OOM killer
0696  * @timeout: maximum timeout to wait for oom victims in jiffies
0697  *
0698  * Forces all page allocations to fail rather than trigger OOM killer.
0699  * Will block and wait until all OOM victims are killed or the given
0700  * timeout expires.
0701  *
0702  * The function cannot be called when there are runnable user tasks because
0703  * the userspace would see unexpected allocation failures as a result. Any
0704  * new use of this function should be discussed with the MM people.
0705  *
0706  * Returns true if successful and false if the OOM killer cannot be
0707  * disabled.
0708  */
0709 bool oom_killer_disable(signed long timeout)
0710 {
0711     signed long ret;
0712 
0713     /*
0714      * Make sure to not race with an ongoing OOM killer. Check that the
0715      * current is not killed (possibly due to sharing the victim's memory).
0716      */
0717     if (mutex_lock_killable(&oom_lock))
0718         return false;
0719     oom_killer_disabled = true;
0720     mutex_unlock(&oom_lock);
0721 
0722     ret = wait_event_interruptible_timeout(oom_victims_wait,
0723             !atomic_read(&oom_victims), timeout);
0724     if (ret <= 0) {
0725         oom_killer_enable();
0726         return false;
0727     }
0728 
0729     return true;
0730 }
0731 
0732 static inline bool __task_will_free_mem(struct task_struct *task)
0733 {
0734     struct signal_struct *sig = task->signal;
0735 
0736     /*
0737      * A coredumping process may sleep for an extended period in exit_mm(),
0738      * so the oom killer cannot assume that the process will promptly exit
0739      * and release memory.
0740      */
0741     if (sig->flags & SIGNAL_GROUP_COREDUMP)
0742         return false;
0743 
0744     if (sig->flags & SIGNAL_GROUP_EXIT)
0745         return true;
0746 
0747     if (thread_group_empty(task) && (task->flags & PF_EXITING))
0748         return true;
0749 
0750     return false;
0751 }
0752 
0753 /*
0754  * Checks whether the given task is dying or exiting and likely to
0755  * release its address space. This means that all threads and processes
0756  * sharing the same mm have to be killed or exiting.
0757  * The caller has to make sure that task->mm is stable (hold task_lock or
0758  * operate on current).
0759  */
0760 static bool task_will_free_mem(struct task_struct *task)
0761 {
0762     struct mm_struct *mm = task->mm;
0763     struct task_struct *p;
0764     bool ret = true;
0765 
0766     /*
0767      * Skip tasks without an mm because the task might have already passed
0768      * exit_mm and exit_oom_victim. The oom_reaper could have rescued that,
0769      * but do not rely on it for now. We can consider find_lock_task_mm in future.
0770      */
0771     if (!mm)
0772         return false;
0773 
0774     if (!__task_will_free_mem(task))
0775         return false;
0776 
0777     /*
0778      * This task has already been drained by the oom reaper, so the
0779      * chances that it will free any more memory are small.
0780      */
0781     if (test_bit(MMF_OOM_SKIP, &mm->flags))
0782         return false;
0783 
0784     if (atomic_read(&mm->mm_users) <= 1)
0785         return true;
0786 
0787     /*
0788      * Make sure that all tasks which share the mm with the given task
0789      * are dying as well to make sure that a) nobody pins its mm and
0790      * b) the task is also reapable by the oom reaper.
0791      */
0792     rcu_read_lock();
0793     for_each_process(p) {
0794         if (!process_shares_mm(p, mm))
0795             continue;
0796         if (same_thread_group(task, p))
0797             continue;
0798         ret = __task_will_free_mem(p);
0799         if (!ret)
0800             break;
0801     }
0802     rcu_read_unlock();
0803 
0804     return ret;
0805 }
0806 
0807 static void oom_kill_process(struct oom_control *oc, const char *message)
0808 {
0809     struct task_struct *p = oc->chosen;
0810     unsigned int points = oc->chosen_points;
0811     struct task_struct *victim = p;
0812     struct task_struct *child;
0813     struct task_struct *t;
0814     struct mm_struct *mm;
0815     unsigned int victim_points = 0;
0816     static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
0817                           DEFAULT_RATELIMIT_BURST);
0818     bool can_oom_reap = true;
0819 
0820     /*
0821      * If the task is already exiting, don't alarm the sysadmin or kill
0822      * its children or threads, just set TIF_MEMDIE so it can die quickly
0823      */
0824     task_lock(p);
0825     if (task_will_free_mem(p)) {
0826         mark_oom_victim(p);
0827         wake_oom_reaper(p);
0828         task_unlock(p);
0829         put_task_struct(p);
0830         return;
0831     }
0832     task_unlock(p);
0833 
0834     if (__ratelimit(&oom_rs))
0835         dump_header(oc, p);
0836 
0837     pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n",
0838         message, task_pid_nr(p), p->comm, points);
0839 
0840     /*
0841      * If any of p's children has a different mm and is eligible for kill,
0842      * the one with the highest oom_badness() score is sacrificed for its
0843      * parent.  This attempts to lose the minimal amount of work done while
0844      * still freeing memory.
0845      */
0846     read_lock(&tasklist_lock);
0847     for_each_thread(p, t) {
0848         list_for_each_entry(child, &t->children, sibling) {
0849             unsigned int child_points;
0850 
0851             if (process_shares_mm(child, p->mm))
0852                 continue;
0853             /*
0854              * oom_badness() returns 0 if the thread is unkillable
0855              */
0856             child_points = oom_badness(child,
0857                 oc->memcg, oc->nodemask, oc->totalpages);
0858             if (child_points > victim_points) {
0859                 put_task_struct(victim);
0860                 victim = child;
0861                 victim_points = child_points;
0862                 get_task_struct(victim);
0863             }
0864         }
0865     }
0866     read_unlock(&tasklist_lock);
0867 
0868     p = find_lock_task_mm(victim);
0869     if (!p) {
0870         put_task_struct(victim);
0871         return;
0872     } else if (victim != p) {
0873         get_task_struct(p);
0874         put_task_struct(victim);
0875         victim = p;
0876     }
0877 
0878     /* Get a reference to safely compare mm after task_unlock(victim) */
0879     mm = victim->mm;
0880     atomic_inc(&mm->mm_count);
0881     /*
0882      * We should send SIGKILL before setting TIF_MEMDIE in order to prevent
0883      * the OOM victim from depleting the memory reserves from the user
0884      * space under its control.
0885      */
0886     do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true);
0887     mark_oom_victim(victim);
0888     pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
0889         task_pid_nr(victim), victim->comm, K(victim->mm->total_vm),
0890         K(get_mm_counter(victim->mm, MM_ANONPAGES)),
0891         K(get_mm_counter(victim->mm, MM_FILEPAGES)),
0892         K(get_mm_counter(victim->mm, MM_SHMEMPAGES)));
0893     task_unlock(victim);
0894 
0895     /*
0896      * Kill all user processes sharing victim->mm in other thread groups, if
0897      * any.  They don't get access to memory reserves, though, to avoid
0898      * depletion of all memory.  This prevents mm->mmap_sem livelock when an
0899      * oom killed thread cannot exit because it requires the semaphore and
0900      * it's contended by another thread trying to allocate memory itself.
0901      * That thread will now get access to memory reserves since it has a
0902      * pending fatal signal.
0903      */
0904     rcu_read_lock();
0905     for_each_process(p) {
0906         if (!process_shares_mm(p, mm))
0907             continue;
0908         if (same_thread_group(p, victim))
0909             continue;
0910         if (is_global_init(p)) {
0911             can_oom_reap = false;
0912             set_bit(MMF_OOM_SKIP, &mm->flags);
0913             pr_info("oom killer %d (%s) has mm pinned by %d (%s)\n",
0914                     task_pid_nr(victim), victim->comm,
0915                     task_pid_nr(p), p->comm);
0916             continue;
0917         }
0918         /*
0919          * No use_mm() user needs to read from the userspace so we are
0920          * ok to reap it.
0921          */
0922         if (unlikely(p->flags & PF_KTHREAD))
0923             continue;
0924         do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true);
0925     }
0926     rcu_read_unlock();
0927 
0928     if (can_oom_reap)
0929         wake_oom_reaper(victim);
0930 
0931     mmdrop(mm);
0932     put_task_struct(victim);
0933 }
0934 #undef K
0935 
0936 /*
0937  * Determines whether the kernel must panic because of the panic_on_oom sysctl.
0938  */
0939 static void check_panic_on_oom(struct oom_control *oc,
0940                    enum oom_constraint constraint)
0941 {
0942     if (likely(!sysctl_panic_on_oom))
0943         return;
0944     if (sysctl_panic_on_oom != 2) {
0945         /*
0946          * panic_on_oom == 1 only affects CONSTRAINT_NONE, the kernel
0947          * does not panic for cpuset, mempolicy, or memcg allocation
0948          * failures.
0949          */
0950         if (constraint != CONSTRAINT_NONE)
0951             return;
0952     }
0953     /* Do not panic for oom kills triggered by sysrq */
0954     if (is_sysrq_oom(oc))
0955         return;
0956     dump_header(oc, NULL);
0957     panic("Out of memory: %s panic_on_oom is enabled\n",
0958         sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
0959 }
0960 
0961 static BLOCKING_NOTIFIER_HEAD(oom_notify_list);
0962 
0963 int register_oom_notifier(struct notifier_block *nb)
0964 {
0965     return blocking_notifier_chain_register(&oom_notify_list, nb);
0966 }
0967 EXPORT_SYMBOL_GPL(register_oom_notifier);
0968 
0969 int unregister_oom_notifier(struct notifier_block *nb)
0970 {
0971     return blocking_notifier_chain_unregister(&oom_notify_list, nb);
0972 }
0973 EXPORT_SYMBOL_GPL(unregister_oom_notifier);
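/*
 * Sketch of a hypothetical notifier: out_of_memory() passes a pointer to an
 * unsigned long through the chain, and a callback that managed to free memory
 * adds the number of freed pages to it; if the total is non-zero, the oom
 * kill is skipped for that invocation.
 *
 *     static int my_oom_notify(struct notifier_block *nb,
 *                              unsigned long action, void *data)
 *     {
 *             unsigned long *freed = data;
 *
 *             *freed += my_drop_caches();  // my_drop_caches() is a hypothetical helper
 *             return NOTIFY_OK;
 *     }
 *
 *     static struct notifier_block my_oom_nb = {
 *             .notifier_call = my_oom_notify,
 *     };
 *
 *     register_oom_notifier(&my_oom_nb);
 */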
0974 
0975 /**
0976  * out_of_memory - kill the "best" process when we run out of memory
0977  * @oc: pointer to struct oom_control
0978  *
0979  * If we run out of memory, we have the choice between either
0980  * killing a random task (bad), letting the system crash (worse)
0981  * OR trying to be smart about which process to kill. Note that we
0982  * don't have to be perfect here, we just have to be good.
0983  */
0984 bool out_of_memory(struct oom_control *oc)
0985 {
0986     unsigned long freed = 0;
0987     enum oom_constraint constraint = CONSTRAINT_NONE;
0988 
0989     if (oom_killer_disabled)
0990         return false;
0991 
0992     if (!is_memcg_oom(oc)) {
0993         blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
0994         if (freed > 0)
0995             /* Got some memory back in the last second. */
0996             return true;
0997     }
0998 
0999     /*
1000      * If current has a pending SIGKILL or is exiting, then automatically
1001      * select it.  The goal is to allow it to allocate so that it may
1002      * quickly exit and free its memory.
1003      */
1004     if (task_will_free_mem(current)) {
1005         mark_oom_victim(current);
1006         wake_oom_reaper(current);
1007         return true;
1008     }
1009 
1010     /*
1011      * The OOM killer does not compensate for IO-less reclaim.
1012      * pagefault_out_of_memory lost its gfp context so we have to
1013      * make sure to exclude the 0 mask; all other users should have at least
1014      * ___GFP_DIRECT_RECLAIM to get here.
1015      */
1016     if (oc->gfp_mask && !(oc->gfp_mask & (__GFP_FS|__GFP_NOFAIL)))
1017         return true;
1018 
1019     /*
1020      * Check if there were limitations on the allocation (only relevant for
1021      * NUMA and memcg) that may require different handling.
1022      */
1023     constraint = constrained_alloc(oc);
1024     if (constraint != CONSTRAINT_MEMORY_POLICY)
1025         oc->nodemask = NULL;
1026     check_panic_on_oom(oc, constraint);
1027 
1028     if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task &&
1029         current->mm && !oom_unkillable_task(current, NULL, oc->nodemask) &&
1030         current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
1031         get_task_struct(current);
1032         oc->chosen = current;
1033         oom_kill_process(oc, "Out of memory (oom_kill_allocating_task)");
1034         return true;
1035     }
1036 
1037     select_bad_process(oc);
1038     /* Found nothing?!?! Either we hang forever, or we panic. */
1039     if (!oc->chosen && !is_sysrq_oom(oc) && !is_memcg_oom(oc)) {
1040         dump_header(oc, NULL);
1041         panic("Out of memory and no killable processes...\n");
1042     }
1043     if (oc->chosen && oc->chosen != (void *)-1UL) {
1044         oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" :
1045                  "Memory cgroup out of memory");
1046         /*
1047          * Give the killed process a good chance to exit before trying
1048          * to allocate memory again.
1049          */
1050         schedule_timeout_killable(1);
1051     }
1052     return !!oc->chosen;
1053 }
1054 
1055 /*
1056  * The pagefault handler calls here because it is out of memory, so kill a
1057  * memory-hogging task. If oom_lock is held by somebody else, a parallel oom
1058  * killing is already in progress so do nothing.
1059  */
1060 void pagefault_out_of_memory(void)
1061 {
1062     struct oom_control oc = {
1063         .zonelist = NULL,
1064         .nodemask = NULL,
1065         .memcg = NULL,
1066         .gfp_mask = 0,
1067         .order = 0,
1068     };
1069 
1070     if (mem_cgroup_oom_synchronize(true))
1071         return;
1072 
1073     if (!mutex_trylock(&oom_lock))
1074         return;
1075     out_of_memory(&oc);
1076     mutex_unlock(&oom_lock);
1077 }