0001 /*
0002  *  kernel/cpuset.c
0003  *
0004  *  Processor and Memory placement constraints for sets of tasks.
0005  *
0006  *  Copyright (C) 2003 BULL SA.
0007  *  Copyright (C) 2004-2007 Silicon Graphics, Inc.
0008  *  Copyright (C) 2006 Google, Inc
0009  *
0010  *  Portions derived from Patrick Mochel's sysfs code.
0011  *  sysfs is Copyright (c) 2001-3 Patrick Mochel
0012  *
0013  *  2003-10-10 Written by Simon Derr.
0014  *  2003-10-22 Updates by Stephen Hemminger.
0015  *  2004 May-July Rework by Paul Jackson.
0016  *  2006 Rework by Paul Menage to use generic cgroups
0017  *  2008 Rework of the scheduler domains and CPU hotplug handling
0018  *       by Max Krasnyansky
0019  *
0020  *  This file is subject to the terms and conditions of the GNU General Public
0021  *  License.  See the file COPYING in the main directory of the Linux
0022  *  distribution for more details.
0023  */
0024 
0025 #include <linux/cpu.h>
0026 #include <linux/cpumask.h>
0027 #include <linux/cpuset.h>
0028 #include <linux/err.h>
0029 #include <linux/errno.h>
0030 #include <linux/file.h>
0031 #include <linux/fs.h>
0032 #include <linux/init.h>
0033 #include <linux/interrupt.h>
0034 #include <linux/kernel.h>
0035 #include <linux/kmod.h>
0036 #include <linux/list.h>
0037 #include <linux/mempolicy.h>
0038 #include <linux/mm.h>
0039 #include <linux/memory.h>
0040 #include <linux/export.h>
0041 #include <linux/mount.h>
0042 #include <linux/namei.h>
0043 #include <linux/pagemap.h>
0044 #include <linux/proc_fs.h>
0045 #include <linux/rcupdate.h>
0046 #include <linux/sched.h>
0047 #include <linux/seq_file.h>
0048 #include <linux/security.h>
0049 #include <linux/slab.h>
0050 #include <linux/spinlock.h>
0051 #include <linux/stat.h>
0052 #include <linux/string.h>
0053 #include <linux/time.h>
0054 #include <linux/time64.h>
0055 #include <linux/backing-dev.h>
0056 #include <linux/sort.h>
0057 
0058 #include <linux/uaccess.h>
0059 #include <linux/atomic.h>
0060 #include <linux/mutex.h>
0061 #include <linux/cgroup.h>
0062 #include <linux/wait.h>
0063 
0064 DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);
0065 
0066 /* See "Frequency meter" comments, below. */
0067 
0068 struct fmeter {
0069     int cnt;        /* unprocessed events count */
0070     int val;        /* most recent output value */
0071     time64_t time;      /* clock (secs) when val computed */
0072     spinlock_t lock;    /* guards read or write of above */
0073 };
0074 
0075 struct cpuset {
0076     struct cgroup_subsys_state css;
0077 
0078     unsigned long flags;        /* "unsigned long" so bitops work */
0079 
0080     /*
0081      * On default hierarchy:
0082      *
0083      * The user-configured masks can only be changed by writing to
0084      * cpuset.cpus and cpuset.mems, and won't be limited by the
0085      * parent masks.
0086      *
0087      * The effective masks are the real masks that apply to the tasks
0088      * in the cpuset. They may be changed if the configured masks are
0089      * changed or hotplug happens.
0090      *
0091      * effective_mask == configured_mask & parent's effective_mask,
0092      * and if it ends up empty, it will inherit the parent's mask.
0093      *
0094      *
0095      * On legacy hierarchy:
0096      *
0097      * The user-configured masks are always the same as the effective masks.
0098      */
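     /*
      * Worked example of the default-hierarchy rule above (illustrative
      * values, not taken from the original source): if this cpuset's
      * cpus_allowed is 2-3 and the parent's effective_cpus is 0,2 then
      *
      *     effective_cpus = {2,3} & {0,2} = {2}
      *
      * If the intersection were empty (say cpus_allowed = 4-5), the
      * cpuset would fall back to the parent's effective_cpus instead.
      */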
0099 
0100     /* user-configured CPUs and Memory Nodes allowed to tasks */
0101     cpumask_var_t cpus_allowed;
0102     nodemask_t mems_allowed;
0103 
0104     /* effective CPUs and Memory Nodes allowed to tasks */
0105     cpumask_var_t effective_cpus;
0106     nodemask_t effective_mems;
0107 
0108     /*
0109      * These are the old Memory Nodes that tasks in this cpuset took on.
0110      *
0111      * - top_cpuset.old_mems_allowed is initialized to mems_allowed.
0112      * - A new cpuset's old_mems_allowed is initialized when some
0113      *   task is moved into it.
0114      * - old_mems_allowed is used in cpuset_migrate_mm() when we change
0115      *   cpuset.mems_allowed and have tasks' nodemask updated, and
0116      *   then old_mems_allowed is updated to mems_allowed.
0117      */
0118     nodemask_t old_mems_allowed;
0119 
0120     struct fmeter fmeter;       /* memory_pressure filter */
0121 
0122     /*
0123      * Tasks are being attached to this cpuset.  Used to prevent
0124      * zeroing cpus/mems_allowed between ->can_attach() and ->attach().
0125      */
0126     int attach_in_progress;
0127 
0128     /* partition number for rebuild_sched_domains() */
0129     int pn;
0130 
0131     /* for custom sched domain */
0132     int relax_domain_level;
0133 };
0134 
0135 static inline struct cpuset *css_cs(struct cgroup_subsys_state *css)
0136 {
0137     return css ? container_of(css, struct cpuset, css) : NULL;
0138 }
0139 
0140 /* Retrieve the cpuset for a task */
0141 static inline struct cpuset *task_cs(struct task_struct *task)
0142 {
0143     return css_cs(task_css(task, cpuset_cgrp_id));
0144 }
0145 
0146 static inline struct cpuset *parent_cs(struct cpuset *cs)
0147 {
0148     return css_cs(cs->css.parent);
0149 }
0150 
0151 #ifdef CONFIG_NUMA
0152 static inline bool task_has_mempolicy(struct task_struct *task)
0153 {
0154     return task->mempolicy;
0155 }
0156 #else
0157 static inline bool task_has_mempolicy(struct task_struct *task)
0158 {
0159     return false;
0160 }
0161 #endif
0162 
0163 
0164 /* bits in struct cpuset flags field */
0165 typedef enum {
0166     CS_ONLINE,
0167     CS_CPU_EXCLUSIVE,
0168     CS_MEM_EXCLUSIVE,
0169     CS_MEM_HARDWALL,
0170     CS_MEMORY_MIGRATE,
0171     CS_SCHED_LOAD_BALANCE,
0172     CS_SPREAD_PAGE,
0173     CS_SPREAD_SLAB,
0174 } cpuset_flagbits_t;
0175 
0176 /* convenient tests for these bits */
0177 static inline bool is_cpuset_online(const struct cpuset *cs)
0178 {
0179     return test_bit(CS_ONLINE, &cs->flags);
0180 }
0181 
0182 static inline int is_cpu_exclusive(const struct cpuset *cs)
0183 {
0184     return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
0185 }
0186 
0187 static inline int is_mem_exclusive(const struct cpuset *cs)
0188 {
0189     return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
0190 }
0191 
0192 static inline int is_mem_hardwall(const struct cpuset *cs)
0193 {
0194     return test_bit(CS_MEM_HARDWALL, &cs->flags);
0195 }
0196 
0197 static inline int is_sched_load_balance(const struct cpuset *cs)
0198 {
0199     return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
0200 }
0201 
0202 static inline int is_memory_migrate(const struct cpuset *cs)
0203 {
0204     return test_bit(CS_MEMORY_MIGRATE, &cs->flags);
0205 }
0206 
0207 static inline int is_spread_page(const struct cpuset *cs)
0208 {
0209     return test_bit(CS_SPREAD_PAGE, &cs->flags);
0210 }
0211 
0212 static inline int is_spread_slab(const struct cpuset *cs)
0213 {
0214     return test_bit(CS_SPREAD_SLAB, &cs->flags);
0215 }
0216 
0217 static struct cpuset top_cpuset = {
0218     .flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) |
0219           (1 << CS_MEM_EXCLUSIVE)),
0220 };
0221 
0222 /**
0223  * cpuset_for_each_child - traverse online children of a cpuset
0224  * @child_cs: loop cursor pointing to the current child
0225  * @pos_css: used for iteration
0226  * @parent_cs: target cpuset to walk children of
0227  *
0228  * Walk @child_cs through the online children of @parent_cs.  Must be used
0229  * with RCU read locked.
0230  */
0231 #define cpuset_for_each_child(child_cs, pos_css, parent_cs)     \
0232     css_for_each_child((pos_css), &(parent_cs)->css)        \
0233         if (is_cpuset_online(((child_cs) = css_cs((pos_css)))))
0234 
0235 /**
0236  * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants
0237  * @des_cs: loop cursor pointing to the current descendant
0238  * @pos_css: used for iteration
0239  * @root_cs: target cpuset to walk descendants of
0240  *
0241  * Walk @des_cs through the online descendants of @root_cs.  Must be used
0242  * with RCU read locked.  The caller may modify @pos_css by calling
0243  * css_rightmost_descendant() to skip subtree.  @root_cs is included in the
0244  * iteration and is the first node to be visited.
0245  */
0246 #define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs)    \
0247     css_for_each_descendant_pre((pos_css), &(root_cs)->css)     \
0248         if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))
0249 
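/*
 * Usage sketch for the two iterators above (an illustration, not code
 * from this file; examine_cpuset() is a hypothetical helper):
 *
 *	struct cpuset *child;
 *	struct cgroup_subsys_state *pos;
 *
 *	rcu_read_lock();
 *	cpuset_for_each_child(child, pos, parent)
 *		examine_cpuset(child);
 *	rcu_read_unlock();
 */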
0250 /*
0251  * There are two global locks guarding cpuset structures - cpuset_mutex and
0252  * callback_lock. We also require taking task_lock() when dereferencing a
0253  * task's cpuset pointer. See "The task_lock() exception", at the end of this
0254  * comment.
0255  *
0256  * A task must hold both locks to modify cpusets.  If a task holds
0257  * cpuset_mutex, then it blocks others wanting that mutex, ensuring that it
0258  * is the only task able to also acquire callback_lock and be able to
0259  * modify cpusets.  It can perform various checks on the cpuset structure
0260  * first, knowing nothing will change.  It can also allocate memory while
0261  * just holding cpuset_mutex.  While it is performing these checks, various
0262  * callback routines can briefly acquire callback_lock to query cpusets.
0263  * Once it is ready to make the changes, it takes callback_lock, blocking
0264  * everyone else.
0265  *
0266  * Calls to the kernel memory allocator can not be made while holding
0267  * callback_lock, as that would risk double tripping on callback_lock
0268  * from one of the callbacks into the cpuset code from within
0269  * __alloc_pages().
0270  *
0271  * If a task is only holding callback_lock, then it has read-only
0272  * access to cpusets.
0273  *
0274  * Now, the task_struct fields mems_allowed and mempolicy may be changed
0275  * by another task, so we use alloc_lock in the task_struct to protect
0276  * them.
0277  *
0278  * The cpuset_common_file_read() handlers only hold callback_lock across
0279  * small pieces of code, such as when reading out possibly multi-word
0280  * cpumasks and nodemasks.
0281  *
0282  * Accessing a task's cpuset should be done in accordance with the
0283  * guidelines for accessing subsystem state in kernel/cgroup.c
0284  */
0285 
0286 static DEFINE_MUTEX(cpuset_mutex);
0287 static DEFINE_SPINLOCK(callback_lock);
0288 
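/*
 * Typical write-side pattern implied by the locking rules above (a
 * sketch, not a verbatim excerpt from this file):
 *
 *	mutex_lock(&cpuset_mutex);
 *	... validate the change and allocate any memory needed ...
 *	spin_lock_irq(&callback_lock);
 *	... publish the new masks or flags ...
 *	spin_unlock_irq(&callback_lock);
 *	mutex_unlock(&cpuset_mutex);
 *
 * Hot-path readers take only callback_lock, which is why no memory
 * allocation may be done while callback_lock is held.
 */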
0289 static struct workqueue_struct *cpuset_migrate_mm_wq;
0290 
0291 /*
0292  * CPU / memory hotplug is handled asynchronously.
0293  */
0294 static void cpuset_hotplug_workfn(struct work_struct *work);
0295 static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);
0296 
0297 static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);
0298 
0299 /*
0300  * This is ugly, but preserves the userspace API for existing cpuset
0301  * users. If someone tries to mount the "cpuset" filesystem, we
0302  * silently switch it to mount "cgroup" instead
0303  */
0304 static struct dentry *cpuset_mount(struct file_system_type *fs_type,
0305              int flags, const char *unused_dev_name, void *data)
0306 {
0307     struct file_system_type *cgroup_fs = get_fs_type("cgroup");
0308     struct dentry *ret = ERR_PTR(-ENODEV);
0309     if (cgroup_fs) {
0310         char mountopts[] =
0311             "cpuset,noprefix,"
0312             "release_agent=/sbin/cpuset_release_agent";
0313         ret = cgroup_fs->mount(cgroup_fs, flags,
0314                        unused_dev_name, mountopts);
0315         put_filesystem(cgroup_fs);
0316     }
0317     return ret;
0318 }
0319 
0320 static struct file_system_type cpuset_fs_type = {
0321     .name = "cpuset",
0322     .mount = cpuset_mount,
0323 };
0324 
0325 /*
0326  * Return in pmask the portion of a cpuset's cpus_allowed that
0327  * are online.  If none are online, walk up the cpuset hierarchy
0328  * until we find one that does have some online cpus.
0329  *
0330  * One way or another, we guarantee to return some non-empty subset
0331  * of cpu_online_mask.
0332  *
0333  * Call with callback_lock or cpuset_mutex held.
0334  */
0335 static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
0336 {
0337     while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask)) {
0338         cs = parent_cs(cs);
0339         if (unlikely(!cs)) {
0340             /*
0341              * The top cpuset doesn't have any online cpu as a
0342              * consequence of a race between cpuset_hotplug_work
0343              * and cpu hotplug notifier.  But we know the top
0344              * cpuset's effective_cpus is on its way to be
0345              * identical to cpu_online_mask.
0346              */
0347             cpumask_copy(pmask, cpu_online_mask);
0348             return;
0349         }
0350     }
0351     cpumask_and(pmask, cs->effective_cpus, cpu_online_mask);
0352 }
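/*
 * Example of the walk above (illustrative values): if cs->effective_cpus
 * is 4-5 and both CPUs have just gone offline, the loop climbs to the
 * nearest ancestor whose effective_cpus still intersects cpu_online_mask,
 * and *pmask becomes that intersection, so the result is never empty.
 */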
0353 
0354 /*
0355  * Return in *pmask the portion of a cpuset's mems_allowed that
0356  * are online, with memory.  If none are online with memory, walk
0357  * up the cpuset hierarchy until we find one that does have some
0358  * online mems.  The top cpuset always has some mems online.
0359  *
0360  * One way or another, we guarantee to return some non-empty subset
0361  * of node_states[N_MEMORY].
0362  *
0363  * Call with callback_lock or cpuset_mutex held.
0364  */
0365 static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
0366 {
0367     while (!nodes_intersects(cs->effective_mems, node_states[N_MEMORY]))
0368         cs = parent_cs(cs);
0369     nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]);
0370 }
0371 
0372 /*
0373  * update task's spread flag if cpuset's page/slab spread flag is set
0374  *
0375  * Call with callback_lock or cpuset_mutex held.
0376  */
0377 static void cpuset_update_task_spread_flag(struct cpuset *cs,
0378                     struct task_struct *tsk)
0379 {
0380     if (is_spread_page(cs))
0381         task_set_spread_page(tsk);
0382     else
0383         task_clear_spread_page(tsk);
0384 
0385     if (is_spread_slab(cs))
0386         task_set_spread_slab(tsk);
0387     else
0388         task_clear_spread_slab(tsk);
0389 }
0390 
0391 /*
0392  * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
0393  *
0394  * One cpuset is a subset of another if all its allowed CPUs and
0395  * Memory Nodes are a subset of the other, and its exclusive flags
0396  * are only set if the other's are set.  Call holding cpuset_mutex.
0397  */
0398 
0399 static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
0400 {
0401     return  cpumask_subset(p->cpus_allowed, q->cpus_allowed) &&
0402         nodes_subset(p->mems_allowed, q->mems_allowed) &&
0403         is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
0404         is_mem_exclusive(p) <= is_mem_exclusive(q);
0405 }
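/*
 * The "<=" comparisons above treat the exclusive flags as 0/1 values:
 * p may only have an exclusive flag set if q has it set too.  E.g.
 * (illustrative) p {cpus 1-2, cpu_exclusive=1} is a subset of
 * q {cpus 0-3, cpu_exclusive=1}, but not of q {cpus 0-3, cpu_exclusive=0}.
 */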
0406 
0407 /**
0408  * alloc_trial_cpuset - allocate a trial cpuset
0409  * @cs: the cpuset that the trial cpuset duplicates
0410  */
0411 static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
0412 {
0413     struct cpuset *trial;
0414 
0415     trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL);
0416     if (!trial)
0417         return NULL;
0418 
0419     if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL))
0420         goto free_cs;
0421     if (!alloc_cpumask_var(&trial->effective_cpus, GFP_KERNEL))
0422         goto free_cpus;
0423 
0424     cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
0425     cpumask_copy(trial->effective_cpus, cs->effective_cpus);
0426     return trial;
0427 
0428 free_cpus:
0429     free_cpumask_var(trial->cpus_allowed);
0430 free_cs:
0431     kfree(trial);
0432     return NULL;
0433 }
0434 
0435 /**
0436  * free_trial_cpuset - free the trial cpuset
0437  * @trial: the trial cpuset to be freed
0438  */
0439 static void free_trial_cpuset(struct cpuset *trial)
0440 {
0441     free_cpumask_var(trial->effective_cpus);
0442     free_cpumask_var(trial->cpus_allowed);
0443     kfree(trial);
0444 }
0445 
0446 /*
0447  * validate_change() - Used to validate that any proposed cpuset change
0448  *             follows the structural rules for cpusets.
0449  *
0450  * If we replaced the flag and mask values of the current cpuset
0451  * (cur) with those values in the trial cpuset (trial), would
0452  * our various subset and exclusive rules still be valid?  Presumes
0453  * cpuset_mutex held.
0454  *
0455  * 'cur' is the address of an actual, in-use cpuset.  Operations
0456  * such as list traversal that depend on the actual address of the
0457  * cpuset in the list must use cur below, not trial.
0458  *
0459  * 'trial' is the address of bulk structure copy of cur, with
0460  * perhaps one or more of the fields cpus_allowed, mems_allowed,
0461  * or flags changed to new, trial values.
0462  *
0463  * Return 0 if valid, -errno if not.
0464  */
0465 
0466 static int validate_change(struct cpuset *cur, struct cpuset *trial)
0467 {
0468     struct cgroup_subsys_state *css;
0469     struct cpuset *c, *par;
0470     int ret;
0471 
0472     rcu_read_lock();
0473 
0474     /* Each of our child cpusets must be a subset of us */
0475     ret = -EBUSY;
0476     cpuset_for_each_child(c, css, cur)
0477         if (!is_cpuset_subset(c, trial))
0478             goto out;
0479 
0480     /* Remaining checks don't apply to root cpuset */
0481     ret = 0;
0482     if (cur == &top_cpuset)
0483         goto out;
0484 
0485     par = parent_cs(cur);
0486 
0487     /* On legacy hierarchy, we must be a subset of our parent cpuset. */
0488     ret = -EACCES;
0489     if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
0490         !is_cpuset_subset(trial, par))
0491         goto out;
0492 
0493     /*
0494      * If either I or some sibling (!= me) is exclusive, we can't
0495      * overlap
0496      */
0497     ret = -EINVAL;
0498     cpuset_for_each_child(c, css, par) {
0499         if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
0500             c != cur &&
0501             cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
0502             goto out;
0503         if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
0504             c != cur &&
0505             nodes_intersects(trial->mems_allowed, c->mems_allowed))
0506             goto out;
0507     }
0508 
0509     /*
0510      * Cpusets with tasks - existing or newly being attached - can't
0511      * be changed to have empty cpus_allowed or mems_allowed.
0512      */
0513     ret = -ENOSPC;
0514     if ((cgroup_is_populated(cur->css.cgroup) || cur->attach_in_progress)) {
0515         if (!cpumask_empty(cur->cpus_allowed) &&
0516             cpumask_empty(trial->cpus_allowed))
0517             goto out;
0518         if (!nodes_empty(cur->mems_allowed) &&
0519             nodes_empty(trial->mems_allowed))
0520             goto out;
0521     }
0522 
0523     /*
0524      * We can't shrink if we won't have enough room for SCHED_DEADLINE
0525      * tasks.
0526      */
0527     ret = -EBUSY;
0528     if (is_cpu_exclusive(cur) &&
0529         !cpuset_cpumask_can_shrink(cur->cpus_allowed,
0530                        trial->cpus_allowed))
0531         goto out;
0532 
0533     ret = 0;
0534 out:
0535     rcu_read_unlock();
0536     return ret;
0537 }
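/*
 * Example of the sibling-exclusivity rule checked above (illustrative
 * values): if a sibling cpuset with cpu_exclusive set owns cpus 0-1,
 * a trial configuration giving the current cpuset cpus 1-2 overlaps it
 * and is rejected with -EINVAL, while cpus 2-3 would pass this check.
 */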
0538 
0539 #ifdef CONFIG_SMP
0540 /*
0541  * Helper routine for generate_sched_domains().
0542  * Do cpusets a, b have overlapping effective cpus_allowed masks?
0543  */
0544 static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
0545 {
0546     return cpumask_intersects(a->effective_cpus, b->effective_cpus);
0547 }
0548 
0549 static void
0550 update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
0551 {
0552     if (dattr->relax_domain_level < c->relax_domain_level)
0553         dattr->relax_domain_level = c->relax_domain_level;
0554     return;
0555 }
0556 
0557 static void update_domain_attr_tree(struct sched_domain_attr *dattr,
0558                     struct cpuset *root_cs)
0559 {
0560     struct cpuset *cp;
0561     struct cgroup_subsys_state *pos_css;
0562 
0563     rcu_read_lock();
0564     cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
0565         /* skip the whole subtree if @cp doesn't have any CPU */
0566         if (cpumask_empty(cp->cpus_allowed)) {
0567             pos_css = css_rightmost_descendant(pos_css);
0568             continue;
0569         }
0570 
0571         if (is_sched_load_balance(cp))
0572             update_domain_attr(dattr, cp);
0573     }
0574     rcu_read_unlock();
0575 }
0576 
0577 /*
0578  * generate_sched_domains()
0579  *
0580  * This function builds a partial partition of the system's CPUs.
0581  * A 'partial partition' is a set of non-overlapping subsets whose
0582  * union is a subset of that set.
0583  * The output of this function needs to be passed to kernel/sched/core.c
0584  * partition_sched_domains() routine, which will rebuild the scheduler's
0585  * load balancing domains (sched domains) as specified by that partial
0586  * partition.
0587  *
0588  * See "What is sched_load_balance" in Documentation/cgroups/cpusets.txt
0589  * for a background explanation of this.
0590  *
0591  * Does not return errors, on the theory that the callers of this
0592  * routine would rather not worry about failures to rebuild sched
0593  * domains when operating in the severe memory shortage situations
0594  * that could cause allocation failures below.
0595  *
0596  * Must be called with cpuset_mutex held.
0597  *
0598  * The three key local variables below are:
0599  *    q  - a linked-list queue of cpuset pointers, used to implement a
0600  *     top-down scan of all cpusets.  This scan loads a pointer
0601  *     to each cpuset marked is_sched_load_balance into the
0602  *     array 'csa'.  For our purposes, rebuilding the scheduler's
0603  *     sched domains, we can ignore !is_sched_load_balance cpusets.
0604  *  csa  - (for CpuSet Array) Array of pointers to all the cpusets
0605  *     that need to be load balanced, for convenient iterative
0606  *     access by the subsequent code that finds the best partition,
0607  *     i.e the set of domains (subsets) of CPUs such that the
0608  *     cpus_allowed of every cpuset marked is_sched_load_balance
0609  *     is a subset of one of these domains, while there are as
0610  *     many such domains as possible, each as small as possible.
0611  * doms  - Conversion of 'csa' to an array of cpumasks, for passing to
0612  *     the kernel/sched/core.c routine partition_sched_domains() in a
0613  *     convenient format, that can be easily compared to the prior
0614  *     value to determine what partition elements (sched domains)
0615  *     were changed (added or removed.)
0616  *
0617  * Finding the best partition (set of domains):
0618  *  The triple nested loops below over i, j, k scan over the
0619  *  load balanced cpusets (using the array of cpuset pointers in
0620  *  csa[]) looking for pairs of cpusets that have overlapping
0621  *  cpus_allowed, but which don't have the same 'pn' partition
0622  *  number, and merges them into the same partition number.  It keeps
0623  *  looping on the 'restart' label until it can no longer find
0624  *  any such pairs.
0625  *
0626  *  The union of the cpus_allowed masks from the set of
0627  *  all cpusets having the same 'pn' value then form the one
0628  *  element of the partition (one sched domain) to be passed to
0629  *  partition_sched_domains().
0630  */
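/*
 * Worked example of the partition finding described above (illustrative
 * values): with load-balanced leaf cpusets A (cpus 0-1), B (cpus 1-2)
 * and C (cpus 4-5), A and B overlap and end up with the same 'pn', so
 * the result is ndoms == 2 with doms = { {0,1,2}, {4,5} }; C forms its
 * own sched domain.
 */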
0631 static int generate_sched_domains(cpumask_var_t **domains,
0632             struct sched_domain_attr **attributes)
0633 {
0634     struct cpuset *cp;  /* scans q */
0635     struct cpuset **csa;    /* array of all cpuset ptrs */
0636     int csn;        /* how many cpuset ptrs in csa so far */
0637     int i, j, k;        /* indices for partition finding loops */
0638     cpumask_var_t *doms;    /* resulting partition; i.e. sched domains */
0639     cpumask_var_t non_isolated_cpus;  /* load balanced CPUs */
0640     struct sched_domain_attr *dattr;  /* attributes for custom domains */
0641     int ndoms = 0;      /* number of sched domains in result */
0642     int nslot;      /* next empty doms[] struct cpumask slot */
0643     struct cgroup_subsys_state *pos_css;
0644 
0645     doms = NULL;
0646     dattr = NULL;
0647     csa = NULL;
0648 
0649     if (!alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL))
0650         goto done;
0651     cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
0652 
0653     /* Special case for the 99% of systems with one, full, sched domain */
0654     if (is_sched_load_balance(&top_cpuset)) {
0655         ndoms = 1;
0656         doms = alloc_sched_domains(ndoms);
0657         if (!doms)
0658             goto done;
0659 
0660         dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
0661         if (dattr) {
0662             *dattr = SD_ATTR_INIT;
0663             update_domain_attr_tree(dattr, &top_cpuset);
0664         }
0665         cpumask_and(doms[0], top_cpuset.effective_cpus,
0666                      non_isolated_cpus);
0667 
0668         goto done;
0669     }
0670 
0671     csa = kmalloc(nr_cpusets() * sizeof(cp), GFP_KERNEL);
0672     if (!csa)
0673         goto done;
0674     csn = 0;
0675 
0676     rcu_read_lock();
0677     cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
0678         if (cp == &top_cpuset)
0679             continue;
0680         /*
0681          * Continue traversing beyond @cp iff @cp has some CPUs and
0682          * isn't load balancing.  The former is obvious.  The
0683          * latter: All child cpusets contain a subset of the
0684          * parent's cpus, so just skip them, and then we call
0685          * update_domain_attr_tree() to calc relax_domain_level of
0686          * the corresponding sched domain.
0687          */
0688         if (!cpumask_empty(cp->cpus_allowed) &&
0689             !(is_sched_load_balance(cp) &&
0690               cpumask_intersects(cp->cpus_allowed, non_isolated_cpus)))
0691             continue;
0692 
0693         if (is_sched_load_balance(cp))
0694             csa[csn++] = cp;
0695 
0696         /* skip @cp's subtree */
0697         pos_css = css_rightmost_descendant(pos_css);
0698     }
0699     rcu_read_unlock();
0700 
0701     for (i = 0; i < csn; i++)
0702         csa[i]->pn = i;
0703     ndoms = csn;
0704 
0705 restart:
0706     /* Find the best partition (set of sched domains) */
0707     for (i = 0; i < csn; i++) {
0708         struct cpuset *a = csa[i];
0709         int apn = a->pn;
0710 
0711         for (j = 0; j < csn; j++) {
0712             struct cpuset *b = csa[j];
0713             int bpn = b->pn;
0714 
0715             if (apn != bpn && cpusets_overlap(a, b)) {
0716                 for (k = 0; k < csn; k++) {
0717                     struct cpuset *c = csa[k];
0718 
0719                     if (c->pn == bpn)
0720                         c->pn = apn;
0721                 }
0722                 ndoms--;    /* one less element */
0723                 goto restart;
0724             }
0725         }
0726     }
0727 
0728     /*
0729      * Now we know how many domains to create.
0730      * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
0731      */
0732     doms = alloc_sched_domains(ndoms);
0733     if (!doms)
0734         goto done;
0735 
0736     /*
0737      * The rest of the code, including the scheduler, can deal with
0738      * dattr==NULL case. No need to abort if alloc fails.
0739      */
0740     dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL);
0741 
0742     for (nslot = 0, i = 0; i < csn; i++) {
0743         struct cpuset *a = csa[i];
0744         struct cpumask *dp;
0745         int apn = a->pn;
0746 
0747         if (apn < 0) {
0748             /* Skip completed partitions */
0749             continue;
0750         }
0751 
0752         dp = doms[nslot];
0753 
0754         if (nslot == ndoms) {
0755             static int warnings = 10;
0756             if (warnings) {
0757                 pr_warn("rebuild_sched_domains confused: nslot %d, ndoms %d, csn %d, i %d, apn %d\n",
0758                     nslot, ndoms, csn, i, apn);
0759                 warnings--;
0760             }
0761             continue;
0762         }
0763 
0764         cpumask_clear(dp);
0765         if (dattr)
0766             *(dattr + nslot) = SD_ATTR_INIT;
0767         for (j = i; j < csn; j++) {
0768             struct cpuset *b = csa[j];
0769 
0770             if (apn == b->pn) {
0771                 cpumask_or(dp, dp, b->effective_cpus);
0772                 cpumask_and(dp, dp, non_isolated_cpus);
0773                 if (dattr)
0774                     update_domain_attr_tree(dattr + nslot, b);
0775 
0776                 /* Done with this partition */
0777                 b->pn = -1;
0778             }
0779         }
0780         nslot++;
0781     }
0782     BUG_ON(nslot != ndoms);
0783 
0784 done:
0785     free_cpumask_var(non_isolated_cpus);
0786     kfree(csa);
0787 
0788     /*
0789      * Fallback to the default domain if kmalloc() failed.
0790      * See comments in partition_sched_domains().
0791      */
0792     if (doms == NULL)
0793         ndoms = 1;
0794 
0795     *domains    = doms;
0796     *attributes = dattr;
0797     return ndoms;
0798 }
0799 
0800 /*
0801  * Rebuild scheduler domains.
0802  *
0803  * If the flag 'sched_load_balance' of any cpuset with non-empty
0804  * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
0805  * which has that flag enabled, or if any cpuset with a non-empty
0806  * 'cpus' is removed, then call this routine to rebuild the
0807  * scheduler's dynamic sched domains.
0808  *
0809  * Call with cpuset_mutex held.  Takes get_online_cpus().
0810  */
0811 static void rebuild_sched_domains_locked(void)
0812 {
0813     struct sched_domain_attr *attr;
0814     cpumask_var_t *doms;
0815     int ndoms;
0816 
0817     lockdep_assert_held(&cpuset_mutex);
0818     get_online_cpus();
0819 
0820     /*
0821      * We have raced with CPU hotplug. Don't do anything to avoid
0822      * passing doms with offlined cpu to partition_sched_domains().
0823      * Anyways, hotplug work item will rebuild sched domains.
0824      */
0825     if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
0826         goto out;
0827 
0828     /* Generate domain masks and attrs */
0829     ndoms = generate_sched_domains(&doms, &attr);
0830 
0831     /* Have scheduler rebuild the domains */
0832     partition_sched_domains(ndoms, doms, attr);
0833 out:
0834     put_online_cpus();
0835 }
0836 #else /* !CONFIG_SMP */
0837 static void rebuild_sched_domains_locked(void)
0838 {
0839 }
0840 #endif /* CONFIG_SMP */
0841 
0842 void rebuild_sched_domains(void)
0843 {
0844     mutex_lock(&cpuset_mutex);
0845     rebuild_sched_domains_locked();
0846     mutex_unlock(&cpuset_mutex);
0847 }
0848 
0849 /**
0850  * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
0851  * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
0852  *
0853  * Iterate through each task of @cs updating its cpus_allowed to the
0854  * effective cpuset's.  As this function is called with cpuset_mutex held,
0855  * cpuset membership stays stable.
0856  */
0857 static void update_tasks_cpumask(struct cpuset *cs)
0858 {
0859     struct css_task_iter it;
0860     struct task_struct *task;
0861 
0862     css_task_iter_start(&cs->css, &it);
0863     while ((task = css_task_iter_next(&it)))
0864         set_cpus_allowed_ptr(task, cs->effective_cpus);
0865     css_task_iter_end(&it);
0866 }
0867 
0868 /*
0869  * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree
0870  * @cs: the cpuset to consider
0871  * @new_cpus: temp variable for calculating new effective_cpus
0872  *
0873  * When the configured cpumask is changed, the effective cpumasks of this cpuset
0874  * and all its descendants need to be updated.
0875  *
0876  * On legacy hierarchy, effective_cpus will be the same as cpus_allowed.
0877  *
0878  * Called with cpuset_mutex held
0879  */
0880 static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
0881 {
0882     struct cpuset *cp;
0883     struct cgroup_subsys_state *pos_css;
0884     bool need_rebuild_sched_domains = false;
0885 
0886     rcu_read_lock();
0887     cpuset_for_each_descendant_pre(cp, pos_css, cs) {
0888         struct cpuset *parent = parent_cs(cp);
0889 
0890         cpumask_and(new_cpus, cp->cpus_allowed, parent->effective_cpus);
0891 
0892         /*
0893          * If it becomes empty, inherit the effective mask of the
0894          * parent, which is guaranteed to have some CPUs.
0895          */
0896         if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
0897             cpumask_empty(new_cpus))
0898             cpumask_copy(new_cpus, parent->effective_cpus);
0899 
0900         /* Skip the whole subtree if the cpumask remains the same. */
0901         if (cpumask_equal(new_cpus, cp->effective_cpus)) {
0902             pos_css = css_rightmost_descendant(pos_css);
0903             continue;
0904         }
0905 
0906         if (!css_tryget_online(&cp->css))
0907             continue;
0908         rcu_read_unlock();
0909 
0910         spin_lock_irq(&callback_lock);
0911         cpumask_copy(cp->effective_cpus, new_cpus);
0912         spin_unlock_irq(&callback_lock);
0913 
0914         WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
0915             !cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
0916 
0917         update_tasks_cpumask(cp);
0918 
0919         /*
0920          * If the effective cpumask of any non-empty cpuset is changed,
0921          * we need to rebuild sched domains.
0922          */
0923         if (!cpumask_empty(cp->cpus_allowed) &&
0924             is_sched_load_balance(cp))
0925             need_rebuild_sched_domains = true;
0926 
0927         rcu_read_lock();
0928         css_put(&cp->css);
0929     }
0930     rcu_read_unlock();
0931 
0932     if (need_rebuild_sched_domains)
0933         rebuild_sched_domains_locked();
0934 }
0935 
0936 /**
0937  * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
0938  * @cs: the cpuset to consider
0939  * @trialcs: trial cpuset
0940  * @buf: buffer of cpu numbers written to this cpuset
0941  */
0942 static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
0943               const char *buf)
0944 {
0945     int retval;
0946 
0947     /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
0948     if (cs == &top_cpuset)
0949         return -EACCES;
0950 
0951     /*
0952      * An empty cpus_allowed is ok only if the cpuset has no tasks.
0953      * Since cpulist_parse() fails on an empty mask, we special case
0954      * that parsing.  The validate_change() call ensures that cpusets
0955      * with tasks have cpus.
0956      */
0957     if (!*buf) {
0958         cpumask_clear(trialcs->cpus_allowed);
0959     } else {
0960         retval = cpulist_parse(buf, trialcs->cpus_allowed);
0961         if (retval < 0)
0962             return retval;
0963 
0964         if (!cpumask_subset(trialcs->cpus_allowed,
0965                     top_cpuset.cpus_allowed))
0966             return -EINVAL;
0967     }
0968 
0969     /* Nothing to do if the cpus didn't change */
0970     if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
0971         return 0;
0972 
0973     retval = validate_change(cs, trialcs);
0974     if (retval < 0)
0975         return retval;
0976 
0977     spin_lock_irq(&callback_lock);
0978     cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
0979     spin_unlock_irq(&callback_lock);
0980 
0981     /* use trialcs->cpus_allowed as a temp variable */
0982     update_cpumasks_hier(cs, trialcs->cpus_allowed);
0983     return 0;
0984 }
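/*
 * Example of the "cpus" interface handled above (illustrative): writing
 * "0-3,6" is parsed by cpulist_parse() into cpus_allowed = {0,1,2,3,6},
 * while an empty write clears the mask, which validate_change() permits
 * only when the cpuset has no tasks.
 */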
0985 
0986 /*
0987  * Migrate memory region from one set of nodes to another.  This is
0988  * performed asynchronously as it can be called from process migration path
0989  * holding locks involved in process management.  All mm migrations are
0990  * performed in the queued order and can be waited for by flushing
0991  * cpuset_migrate_mm_wq.
0992  */
0993 
0994 struct cpuset_migrate_mm_work {
0995     struct work_struct  work;
0996     struct mm_struct    *mm;
0997     nodemask_t      from;
0998     nodemask_t      to;
0999 };
1000 
1001 static void cpuset_migrate_mm_workfn(struct work_struct *work)
1002 {
1003     struct cpuset_migrate_mm_work *mwork =
1004         container_of(work, struct cpuset_migrate_mm_work, work);
1005 
1006     /* on a wq worker, no need to worry about %current's mems_allowed */
1007     do_migrate_pages(mwork->mm, &mwork->from, &mwork->to, MPOL_MF_MOVE_ALL);
1008     mmput(mwork->mm);
1009     kfree(mwork);
1010 }
1011 
1012 static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
1013                             const nodemask_t *to)
1014 {
1015     struct cpuset_migrate_mm_work *mwork;
1016 
1017     mwork = kzalloc(sizeof(*mwork), GFP_KERNEL);
1018     if (mwork) {
1019         mwork->mm = mm;
1020         mwork->from = *from;
1021         mwork->to = *to;
1022         INIT_WORK(&mwork->work, cpuset_migrate_mm_workfn);
1023         queue_work(cpuset_migrate_mm_wq, &mwork->work);
1024     } else {
1025         mmput(mm);
1026     }
1027 }
1028 
1029 static void cpuset_post_attach(void)
1030 {
1031     flush_workqueue(cpuset_migrate_mm_wq);
1032 }
1033 
1034 /*
1035  * cpuset_change_task_nodemask - change task's mems_allowed and mempolicy
1036  * @tsk: the task to change
1037  * @newmems: new nodes to set for the task
1038  *
1039  * In order to avoid seeing no nodes if the old and new nodes are disjoint,
1040  * we structure updates as setting all new allowed nodes, then clearing newly
1041  * disallowed ones.
1042  */
1043 static void cpuset_change_task_nodemask(struct task_struct *tsk,
1044                     nodemask_t *newmems)
1045 {
1046     bool need_loop;
1047 
1048     task_lock(tsk);
1049     /*
1050      * Determine if a loop is necessary if another thread is doing
1051      * read_mems_allowed_begin().  If at least one node remains unchanged and
1052      * tsk does not have a mempolicy, then an empty nodemask will not be
1053      * possible when mems_allowed is larger than a word.
1054      */
1055     need_loop = task_has_mempolicy(tsk) ||
1056             !nodes_intersects(*newmems, tsk->mems_allowed);
1057 
1058     if (need_loop) {
1059         local_irq_disable();
1060         write_seqcount_begin(&tsk->mems_allowed_seq);
1061     }
1062 
1063     nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
1064     mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
1065 
1066     mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2);
1067     tsk->mems_allowed = *newmems;
1068 
1069     if (need_loop) {
1070         write_seqcount_end(&tsk->mems_allowed_seq);
1071         local_irq_enable();
1072     }
1073 
1074     task_unlock(tsk);
1075 }
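/*
 * Worked example of the ordering above (illustrative): moving a task
 * from mems {0} to {1} first sets mems_allowed = {0} | {1} = {0,1} via
 * nodes_or() and then assigns {1}, so concurrent readers using
 * read_mems_allowed_begin() never observe an empty nodemask.
 */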
1076 
1077 static void *cpuset_being_rebound;
1078 
1079 /**
1080  * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
1081  * @cs: the cpuset in which each task's mems_allowed mask needs to be changed
1082  *
1083  * Iterate through each task of @cs updating its mems_allowed to the
1084  * effective cpuset's.  As this function is called with cpuset_mutex held,
1085  * cpuset membership stays stable.
1086  */
1087 static void update_tasks_nodemask(struct cpuset *cs)
1088 {
1089     static nodemask_t newmems;  /* protected by cpuset_mutex */
1090     struct css_task_iter it;
1091     struct task_struct *task;
1092 
1093     cpuset_being_rebound = cs;      /* causes mpol_dup() rebind */
1094 
1095     guarantee_online_mems(cs, &newmems);
1096 
1097     /*
1098      * The mpol_rebind_mm() call takes mmap_sem, which we couldn't
1099      * take while holding tasklist_lock.  Forks can happen - the
1100      * mpol_dup() cpuset_being_rebound check will catch such forks,
1101      * and rebind their vma mempolicies too.  Because we still hold
1102      * the global cpuset_mutex, we know that no other rebind effort
1103      * will be contending for the global variable cpuset_being_rebound.
1104      * It's ok if we rebind the same mm twice; mpol_rebind_mm()
1105      * is idempotent.  Also migrate pages in each mm to new nodes.
1106      */
1107     css_task_iter_start(&cs->css, &it);
1108     while ((task = css_task_iter_next(&it))) {
1109         struct mm_struct *mm;
1110         bool migrate;
1111 
1112         cpuset_change_task_nodemask(task, &newmems);
1113 
1114         mm = get_task_mm(task);
1115         if (!mm)
1116             continue;
1117 
1118         migrate = is_memory_migrate(cs);
1119 
1120         mpol_rebind_mm(mm, &cs->mems_allowed);
1121         if (migrate)
1122             cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems);
1123         else
1124             mmput(mm);
1125     }
1126     css_task_iter_end(&it);
1127 
1128     /*
1129      * All the tasks' nodemasks have been updated, update
1130      * cs->old_mems_allowed.
1131      */
1132     cs->old_mems_allowed = newmems;
1133 
1134     /* We're done rebinding vmas to this cpuset's new mems_allowed. */
1135     cpuset_being_rebound = NULL;
1136 }
1137 
1138 /*
1139  * update_nodemasks_hier - Update effective nodemasks and tasks in the subtree
1140  * @cs: the cpuset to consider
1141  * @new_mems: a temp variable for calculating new effective_mems
1142  *
1143  * When the configured nodemask is changed, the effective nodemasks of this cpuset
1144  * and all its descendants need to be updated.
1145  *
1146  * On legacy hierarchy, effective_mems will be the same as mems_allowed.
1147  *
1148  * Called with cpuset_mutex held
1149  */
1150 static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
1151 {
1152     struct cpuset *cp;
1153     struct cgroup_subsys_state *pos_css;
1154 
1155     rcu_read_lock();
1156     cpuset_for_each_descendant_pre(cp, pos_css, cs) {
1157         struct cpuset *parent = parent_cs(cp);
1158 
1159         nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems);
1160 
1161         /*
1162          * If it becomes empty, inherit the effective mask of the
1163          * parent, which is guaranteed to have some MEMs.
1164          */
1165         if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
1166             nodes_empty(*new_mems))
1167             *new_mems = parent->effective_mems;
1168 
1169         /* Skip the whole subtree if the nodemask remains the same. */
1170         if (nodes_equal(*new_mems, cp->effective_mems)) {
1171             pos_css = css_rightmost_descendant(pos_css);
1172             continue;
1173         }
1174 
1175         if (!css_tryget_online(&cp->css))
1176             continue;
1177         rcu_read_unlock();
1178 
1179         spin_lock_irq(&callback_lock);
1180         cp->effective_mems = *new_mems;
1181         spin_unlock_irq(&callback_lock);
1182 
1183         WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
1184             !nodes_equal(cp->mems_allowed, cp->effective_mems));
1185 
1186         update_tasks_nodemask(cp);
1187 
1188         rcu_read_lock();
1189         css_put(&cp->css);
1190     }
1191     rcu_read_unlock();
1192 }
1193 
1194 /*
1195  * Handle user request to change the 'mems' memory placement
1196  * of a cpuset.  Needs to validate the request, update the
1197  * cpusets mems_allowed, and for each task in the cpuset,
1198  * update mems_allowed and rebind task's mempolicy and any vma
1199  * mempolicies and if the cpuset is marked 'memory_migrate',
1200  * migrate the tasks pages to the new memory.
1201  *
1202  * Call with cpuset_mutex held. May take callback_lock during call.
1203  * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
1204  * lock each such tasks mm->mmap_sem, scan its vma's and rebind
1205  * their mempolicies to the cpusets new mems_allowed.
1206  */
1207 static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1208                const char *buf)
1209 {
1210     int retval;
1211 
1212     /*
1213      * top_cpuset.mems_allowed tracks node_states[N_MEMORY];
1214      * it's read-only
1215      */
1216     if (cs == &top_cpuset) {
1217         retval = -EACCES;
1218         goto done;
1219     }
1220 
1221     /*
1222      * An empty mems_allowed is ok iff there are no tasks in the cpuset.
1223      * Since nodelist_parse() fails on an empty mask, we special case
1224      * that parsing.  The validate_change() call ensures that cpusets
1225      * with tasks have memory.
1226      */
1227     if (!*buf) {
1228         nodes_clear(trialcs->mems_allowed);
1229     } else {
1230         retval = nodelist_parse(buf, trialcs->mems_allowed);
1231         if (retval < 0)
1232             goto done;
1233 
1234         if (!nodes_subset(trialcs->mems_allowed,
1235                   top_cpuset.mems_allowed)) {
1236             retval = -EINVAL;
1237             goto done;
1238         }
1239     }
1240 
1241     if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) {
1242         retval = 0;     /* Too easy - nothing to do */
1243         goto done;
1244     }
1245     retval = validate_change(cs, trialcs);
1246     if (retval < 0)
1247         goto done;
1248 
1249     spin_lock_irq(&callback_lock);
1250     cs->mems_allowed = trialcs->mems_allowed;
1251     spin_unlock_irq(&callback_lock);
1252 
1253     /* use trialcs->mems_allowed as a temp variable */
1254     update_nodemasks_hier(cs, &trialcs->mems_allowed);
1255 done:
1256     return retval;
1257 }
1258 
1259 int current_cpuset_is_being_rebound(void)
1260 {
1261     int ret;
1262 
1263     rcu_read_lock();
1264     ret = task_cs(current) == cpuset_being_rebound;
1265     rcu_read_unlock();
1266 
1267     return ret;
1268 }
1269 
1270 static int update_relax_domain_level(struct cpuset *cs, s64 val)
1271 {
1272 #ifdef CONFIG_SMP
1273     if (val < -1 || val >= sched_domain_level_max)
1274         return -EINVAL;
1275 #endif
1276 
1277     if (val != cs->relax_domain_level) {
1278         cs->relax_domain_level = val;
1279         if (!cpumask_empty(cs->cpus_allowed) &&
1280             is_sched_load_balance(cs))
1281             rebuild_sched_domains_locked();
1282     }
1283 
1284     return 0;
1285 }
1286 
1287 /**
1288  * update_tasks_flags - update the spread flags of tasks in the cpuset.
1289  * @cs: the cpuset in which each task's spread flags needs to be changed
1290  *
1291  * Iterate through each task of @cs updating its spread flags.  As this
1292  * function is called with cpuset_mutex held, cpuset membership stays
1293  * stable.
1294  */
1295 static void update_tasks_flags(struct cpuset *cs)
1296 {
1297     struct css_task_iter it;
1298     struct task_struct *task;
1299 
1300     css_task_iter_start(&cs->css, &it);
1301     while ((task = css_task_iter_next(&it)))
1302         cpuset_update_task_spread_flag(cs, task);
1303     css_task_iter_end(&it);
1304 }
1305 
1306 /*
1307  * update_flag - read a 0 or a 1 in a file and update associated flag
1308  * bit:     the bit to update (see cpuset_flagbits_t)
1309  * cs:      the cpuset to update
1310  * turning_on:  whether the flag is being set or cleared
1311  *
1312  * Call with cpuset_mutex held.
1313  */
1314 
1315 static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1316                int turning_on)
1317 {
1318     struct cpuset *trialcs;
1319     int balance_flag_changed;
1320     int spread_flag_changed;
1321     int err;
1322 
1323     trialcs = alloc_trial_cpuset(cs);
1324     if (!trialcs)
1325         return -ENOMEM;
1326 
1327     if (turning_on)
1328         set_bit(bit, &trialcs->flags);
1329     else
1330         clear_bit(bit, &trialcs->flags);
1331 
1332     err = validate_change(cs, trialcs);
1333     if (err < 0)
1334         goto out;
1335 
1336     balance_flag_changed = (is_sched_load_balance(cs) !=
1337                 is_sched_load_balance(trialcs));
1338 
1339     spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
1340             || (is_spread_page(cs) != is_spread_page(trialcs)));
1341 
1342     spin_lock_irq(&callback_lock);
1343     cs->flags = trialcs->flags;
1344     spin_unlock_irq(&callback_lock);
1345 
1346     if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
1347         rebuild_sched_domains_locked();
1348 
1349     if (spread_flag_changed)
1350         update_tasks_flags(cs);
1351 out:
1352     free_trial_cpuset(trialcs);
1353     return err;
1354 }
1355 
1356 /*
1357  * Frequency meter - How fast is some event occurring?
1358  *
1359  * These routines manage a digitally filtered, constant time based,
1360  * event frequency meter.  There are four routines:
1361  *   fmeter_init() - initialize a frequency meter.
1362  *   fmeter_markevent() - called each time the event happens.
1363  *   fmeter_getrate() - returns the recent rate of such events.
1364  *   fmeter_update() - internal routine used to update fmeter.
1365  *
1366  * A common data structure is passed to each of these routines,
1367  * which is used to keep track of the state required to manage the
1368  * frequency meter and its digital filter.
1369  *
1370  * The filter works on the number of events marked per unit time.
1371  * The filter is single-pole low-pass recursive (IIR).  The time unit
1372  * is 1 second.  Arithmetic is done using 32-bit integers scaled to
1373  * simulate 3 decimal digits of precision (multiplied by 1000).
1374  *
1375  * With an FM_COEF of 933, and a time base of 1 second, the filter
1376  * has a half-life of 10 seconds, meaning that if the events quit
1377  * happening, then the rate returned from the fmeter_getrate()
1378  * will be cut in half each 10 seconds, until it converges to zero.
1379  *
1380  * It is not worth doing a real infinitely recursive filter.  If more
1381  * than FM_MAXTICKS ticks have elapsed since the last filter event,
1382  * just compute FM_MAXTICKS ticks worth, by which point the level
1383  * will be stable.
1384  *
1385  * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid
1386  * arithmetic overflow in the fmeter_update() routine.
1387  *
1388  * Given the simple 32 bit integer arithmetic used, this meter works
1389  * best for reporting rates between one per millisecond (msec) and
1390  * one per 32 (approx) seconds.  At constant rates faster than one
1391  * per msec it maxes out at values just under 1,000,000.  At constant
1392  * rates between one per msec, and one per second it will stabilize
1393  * to a value N*1000, where N is the rate of events per second.
1394  * At constant rates between one per second and one per 32 seconds,
1395  * it will be choppy, moving up on the seconds that have an event,
1396  * and then decaying until the next event.  At rates slower than
1397  * about one in 32 seconds, it decays all the way back to zero between
1398  * each event.
1399  */
1400 
1401 #define FM_COEF 933     /* coefficient for half-life of 10 secs */
1402 #define FM_MAXTICKS ((u32)99)   /* useless computing more ticks than this */
1403 #define FM_MAXCNT 1000000   /* limit cnt to avoid overflow */
1404 #define FM_SCALE 1000       /* faux fixed point scale */
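/*
 * Worked numbers for the filter above (illustrative): each idle second
 * multiplies val by FM_COEF/FM_SCALE = 0.933, and 0.933^10 ~= 0.5, hence
 * the 10 second half-life.  At a steady rate of N events per second,
 * cnt = N * FM_SCALE per update, so the fixed point satisfies
 * val = 0.933 * val + 0.067 * (N * 1000), i.e. val ~= N * 1000.
 */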
1405 
1406 /* Initialize a frequency meter */
1407 static void fmeter_init(struct fmeter *fmp)
1408 {
1409     fmp->cnt = 0;
1410     fmp->val = 0;
1411     fmp->time = 0;
1412     spin_lock_init(&fmp->lock);
1413 }
1414 
1415 /* Internal meter update - process cnt events and update value */
1416 static void fmeter_update(struct fmeter *fmp)
1417 {
1418     time64_t now;
1419     u32 ticks;
1420 
1421     now = ktime_get_seconds();
1422     ticks = now - fmp->time;
1423 
1424     if (ticks == 0)
1425         return;
1426 
1427     ticks = min(FM_MAXTICKS, ticks);
1428     while (ticks-- > 0)
1429         fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
1430     fmp->time = now;
1431 
1432     fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
1433     fmp->cnt = 0;
1434 }
1435 
1436 /* Process any previous ticks, then bump cnt by one (times scale). */
1437 static void fmeter_markevent(struct fmeter *fmp)
1438 {
1439     spin_lock(&fmp->lock);
1440     fmeter_update(fmp);
1441     fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE);
1442     spin_unlock(&fmp->lock);
1443 }
1444 
1445 /* Process any previous ticks, then return current value. */
1446 static int fmeter_getrate(struct fmeter *fmp)
1447 {
1448     int val;
1449 
1450     spin_lock(&fmp->lock);
1451     fmeter_update(fmp);
1452     val = fmp->val;
1453     spin_unlock(&fmp->lock);
1454     return val;
1455 }
1456 
1457 static struct cpuset *cpuset_attach_old_cs;
1458 
1459 /* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
1460 static int cpuset_can_attach(struct cgroup_taskset *tset)
1461 {
1462     struct cgroup_subsys_state *css;
1463     struct cpuset *cs;
1464     struct task_struct *task;
1465     int ret;
1466 
1467     /* used later by cpuset_attach() */
1468     cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css));
1469     cs = css_cs(css);
1470 
1471     mutex_lock(&cpuset_mutex);
1472 
1473     /* allow moving tasks into an empty cpuset if on default hierarchy */
1474     ret = -ENOSPC;
1475     if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
1476         (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
1477         goto out_unlock;
1478 
1479     cgroup_taskset_for_each(task, css, tset) {
1480         ret = task_can_attach(task, cs->cpus_allowed);
1481         if (ret)
1482             goto out_unlock;
1483         ret = security_task_setscheduler(task);
1484         if (ret)
1485             goto out_unlock;
1486     }
1487 
1488     /*
1489      * Mark attach is in progress.  This makes validate_change() fail
1490      * changes which zero cpus/mems_allowed.
1491      */
1492     cs->attach_in_progress++;
1493     ret = 0;
1494 out_unlock:
1495     mutex_unlock(&cpuset_mutex);
1496     return ret;
1497 }
1498 
1499 static void cpuset_cancel_attach(struct cgroup_taskset *tset)
1500 {
1501     struct cgroup_subsys_state *css;
1502     struct cpuset *cs;
1503 
1504     cgroup_taskset_first(tset, &css);
1505     cs = css_cs(css);
1506 
1507     mutex_lock(&cpuset_mutex);
1508     css_cs(css)->attach_in_progress--;
1509     mutex_unlock(&cpuset_mutex);
1510 }
1511 
1512 /*
1513  * Protected by cpuset_mutex.  cpus_attach is used only by cpuset_attach()
1514  * but we can't allocate it dynamically there.  Define it global and
1515  * allocate from cpuset_init().
1516  */
1517 static cpumask_var_t cpus_attach;
1518 
1519 static void cpuset_attach(struct cgroup_taskset *tset)
1520 {
1521     /* static buf protected by cpuset_mutex */
1522     static nodemask_t cpuset_attach_nodemask_to;
1523     struct task_struct *task;
1524     struct task_struct *leader;
1525     struct cgroup_subsys_state *css;
1526     struct cpuset *cs;
1527     struct cpuset *oldcs = cpuset_attach_old_cs;
1528 
1529     cgroup_taskset_first(tset, &css);
1530     cs = css_cs(css);
1531 
1532     mutex_lock(&cpuset_mutex);
1533 
1534     /* prepare for attach */
1535     if (cs == &top_cpuset)
1536         cpumask_copy(cpus_attach, cpu_possible_mask);
1537     else
1538         guarantee_online_cpus(cs, cpus_attach);
1539 
1540     guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
1541 
1542     cgroup_taskset_for_each(task, css, tset) {
1543         /*
1544          * can_attach beforehand should guarantee that this doesn't
1545          * fail.  TODO: have a better way to handle failure here
1546          */
1547         WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
1548 
1549         cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
1550         cpuset_update_task_spread_flag(cs, task);
1551     }
1552 
1553     /*
1554      * Change mm for all threadgroup leaders. This is expensive and may
1555      * sleep and should be moved outside migration path proper.
1556      */
1557     cpuset_attach_nodemask_to = cs->effective_mems;
1558     cgroup_taskset_for_each_leader(leader, css, tset) {
1559         struct mm_struct *mm = get_task_mm(leader);
1560 
1561         if (mm) {
1562             mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
1563 
1564             /*
1565              * old_mems_allowed is the same as mems_allowed
1566              * here, except if this task is being moved
1567              * automatically due to hotplug.  In that case
1568              * @mems_allowed has been updated and is empty, so
1569              * @old_mems_allowed is the right nodemask that we
1570              * migrate mm from.
1571              */
1572             if (is_memory_migrate(cs))
1573                 cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
1574                           &cpuset_attach_nodemask_to);
1575             else
1576                 mmput(mm);
1577         }
1578     }
1579 
1580     cs->old_mems_allowed = cpuset_attach_nodemask_to;
1581 
1582     cs->attach_in_progress--;
1583     if (!cs->attach_in_progress)
1584         wake_up(&cpuset_attach_wq);
1585 
1586     mutex_unlock(&cpuset_mutex);
1587 }
1588 
1589 /* The various types of files and directories in a cpuset file system */
1590 
1591 typedef enum {
1592     FILE_MEMORY_MIGRATE,
1593     FILE_CPULIST,
1594     FILE_MEMLIST,
1595     FILE_EFFECTIVE_CPULIST,
1596     FILE_EFFECTIVE_MEMLIST,
1597     FILE_CPU_EXCLUSIVE,
1598     FILE_MEM_EXCLUSIVE,
1599     FILE_MEM_HARDWALL,
1600     FILE_SCHED_LOAD_BALANCE,
1601     FILE_SCHED_RELAX_DOMAIN_LEVEL,
1602     FILE_MEMORY_PRESSURE_ENABLED,
1603     FILE_MEMORY_PRESSURE,
1604     FILE_SPREAD_PAGE,
1605     FILE_SPREAD_SLAB,
1606 } cpuset_filetype_t;
1607 
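/*
 * Write handler for the boolean (u64) cpuset control files; cft->private
 * carries one of the cpuset_filetype_t values above and selects which
 * setting to update.
 */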
1608 static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
1609                 u64 val)
1610 {
1611     struct cpuset *cs = css_cs(css);
1612     cpuset_filetype_t type = cft->private;
1613     int retval = 0;
1614 
1615     mutex_lock(&cpuset_mutex);
1616     if (!is_cpuset_online(cs)) {
1617         retval = -ENODEV;
1618         goto out_unlock;
1619     }
1620 
1621     switch (type) {
1622     case FILE_CPU_EXCLUSIVE:
1623         retval = update_flag(CS_CPU_EXCLUSIVE, cs, val);
1624         break;
1625     case FILE_MEM_EXCLUSIVE:
1626         retval = update_flag(CS_MEM_EXCLUSIVE, cs, val);
1627         break;
1628     case FILE_MEM_HARDWALL:
1629         retval = update_flag(CS_MEM_HARDWALL, cs, val);
1630         break;
1631     case FILE_SCHED_LOAD_BALANCE:
1632         retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val);
1633         break;
1634     case FILE_MEMORY_MIGRATE:
1635         retval = update_flag(CS_MEMORY_MIGRATE, cs, val);
1636         break;
1637     case FILE_MEMORY_PRESSURE_ENABLED:
1638         cpuset_memory_pressure_enabled = !!val;
1639         break;
1640     case FILE_SPREAD_PAGE:
1641         retval = update_flag(CS_SPREAD_PAGE, cs, val);
1642         break;
1643     case FILE_SPREAD_SLAB:
1644         retval = update_flag(CS_SPREAD_SLAB, cs, val);
1645         break;
1646     default:
1647         retval = -EINVAL;
1648         break;
1649     }
1650 out_unlock:
1651     mutex_unlock(&cpuset_mutex);
1652     return retval;
1653 }
1654 
1655 static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
1656                 s64 val)
1657 {
1658     struct cpuset *cs = css_cs(css);
1659     cpuset_filetype_t type = cft->private;
1660     int retval = -ENODEV;
1661 
1662     mutex_lock(&cpuset_mutex);
1663     if (!is_cpuset_online(cs))
1664         goto out_unlock;
1665 
1666     switch (type) {
1667     case FILE_SCHED_RELAX_DOMAIN_LEVEL:
1668         retval = update_relax_domain_level(cs, val);
1669         break;
1670     default:
1671         retval = -EINVAL;
1672         break;
1673     }
1674 out_unlock:
1675     mutex_unlock(&cpuset_mutex);
1676     return retval;
1677 }
1678 
1679 /*
1680  * Common handling for a write to a "cpus" or "mems" file.
1681  */
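/*
 * For example (illustrative values only): writing "0-3,8" to the "cpus"
 * file arrives here with buf == "0-3,8" and of_cft(of)->private ==
 * FILE_CPULIST, and is applied through update_cpumask() below.
 */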
1682 static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
1683                     char *buf, size_t nbytes, loff_t off)
1684 {
1685     struct cpuset *cs = css_cs(of_css(of));
1686     struct cpuset *trialcs;
1687     int retval = -ENODEV;
1688 
1689     buf = strstrip(buf);
1690 
1691     /*
1692      * CPU or memory hotunplug may leave @cs w/o any execution
1693      * resources, in which case the hotplug code asynchronously updates
1694      * configuration and transfers all tasks to the nearest ancestor
1695      * which can execute.
1696      *
1697      * As writes to "cpus" or "mems" may restore @cs's execution
1698      * resources, wait for the previously scheduled operations before
1699      * proceeding, so that we don't end up repeatedly removing tasks
1700      * added after execution capability is restored.
1701      *
1702      * cpuset_hotplug_work calls back into cgroup core via
1703      * cgroup_transfer_tasks() and waiting for it from a cgroupfs
1704      * operation like this one can lead to a deadlock through kernfs
1705      * active_ref protection.  Let's break the protection.  Losing the
1706      * protection is okay as we check whether @cs is online after
1707      * grabbing cpuset_mutex anyway.  This only happens on the legacy
1708      * hierarchies.
1709      */
1710     css_get(&cs->css);
1711     kernfs_break_active_protection(of->kn);
1712     flush_work(&cpuset_hotplug_work);
1713 
1714     mutex_lock(&cpuset_mutex);
1715     if (!is_cpuset_online(cs))
1716         goto out_unlock;
1717 
1718     trialcs = alloc_trial_cpuset(cs);
1719     if (!trialcs) {
1720         retval = -ENOMEM;
1721         goto out_unlock;
1722     }
1723 
1724     switch (of_cft(of)->private) {
1725     case FILE_CPULIST:
1726         retval = update_cpumask(cs, trialcs, buf);
1727         break;
1728     case FILE_MEMLIST:
1729         retval = update_nodemask(cs, trialcs, buf);
1730         break;
1731     default:
1732         retval = -EINVAL;
1733         break;
1734     }
1735 
1736     free_trial_cpuset(trialcs);
1737 out_unlock:
1738     mutex_unlock(&cpuset_mutex);
1739     kernfs_unbreak_active_protection(of->kn);
1740     css_put(&cs->css);
1741     flush_workqueue(cpuset_migrate_mm_wq);
1742     return retval ?: nbytes;
1743 }
1744 
1745 /*
1746  * These ascii lists should be read in a single call, using a user
1747  * buffer large enough to hold the entire map.  If read in smaller
1748  * chunks, there is no guarantee of atomicity.  Since the display
1749  * format used - a list of ranges of sequential numbers - is variable
1750  * length, and since these maps can change dynamically, partial reads
1751  * while a list is changing could return gibberish.
1752  */
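/*
 * The "%*pbl" format below prints each mask as a list of ranges; e.g. a
 * cpuset allowed CPUs 0-3 and 6 reads back as "0-3,6" (illustrative values).
 */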
1753 static int cpuset_common_seq_show(struct seq_file *sf, void *v)
1754 {
1755     struct cpuset *cs = css_cs(seq_css(sf));
1756     cpuset_filetype_t type = seq_cft(sf)->private;
1757     int ret = 0;
1758 
1759     spin_lock_irq(&callback_lock);
1760 
1761     switch (type) {
1762     case FILE_CPULIST:
1763         seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->cpus_allowed));
1764         break;
1765     case FILE_MEMLIST:
1766         seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->mems_allowed));
1767         break;
1768     case FILE_EFFECTIVE_CPULIST:
1769         seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_cpus));
1770         break;
1771     case FILE_EFFECTIVE_MEMLIST:
1772         seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems));
1773         break;
1774     default:
1775         ret = -EINVAL;
1776     }
1777 
1778     spin_unlock_irq(&callback_lock);
1779     return ret;
1780 }
1781 
1782 static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
1783 {
1784     struct cpuset *cs = css_cs(css);
1785     cpuset_filetype_t type = cft->private;
1786     switch (type) {
1787     case FILE_CPU_EXCLUSIVE:
1788         return is_cpu_exclusive(cs);
1789     case FILE_MEM_EXCLUSIVE:
1790         return is_mem_exclusive(cs);
1791     case FILE_MEM_HARDWALL:
1792         return is_mem_hardwall(cs);
1793     case FILE_SCHED_LOAD_BALANCE:
1794         return is_sched_load_balance(cs);
1795     case FILE_MEMORY_MIGRATE:
1796         return is_memory_migrate(cs);
1797     case FILE_MEMORY_PRESSURE_ENABLED:
1798         return cpuset_memory_pressure_enabled;
1799     case FILE_MEMORY_PRESSURE:
1800         return fmeter_getrate(&cs->fmeter);
1801     case FILE_SPREAD_PAGE:
1802         return is_spread_page(cs);
1803     case FILE_SPREAD_SLAB:
1804         return is_spread_slab(cs);
1805     default:
1806         BUG();
1807     }
1808 
1809     /* Unreachable but makes gcc happy */
1810     return 0;
1811 }
1812 
1813 static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
1814 {
1815     struct cpuset *cs = css_cs(css);
1816     cpuset_filetype_t type = cft->private;
1817     switch (type) {
1818     case FILE_SCHED_RELAX_DOMAIN_LEVEL:
1819         return cs->relax_domain_level;
1820     default:
1821         BUG();
1822     }
1823 
1824     /* Unreachable but makes gcc happy */
1825     return 0;
1826 }
1827 
1828 
1829 /*
1830  * for the common functions, 'private' gives the type of file
1831  */
1832 
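/*
 * Control files for the legacy hierarchy, registered below through the
 * .legacy_cftypes member of cpuset_cgrp_subsys.
 */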
1833 static struct cftype files[] = {
1834     {
1835         .name = "cpus",
1836         .seq_show = cpuset_common_seq_show,
1837         .write = cpuset_write_resmask,
1838         .max_write_len = (100U + 6 * NR_CPUS),
1839         .private = FILE_CPULIST,
1840     },
1841 
1842     {
1843         .name = "mems",
1844         .seq_show = cpuset_common_seq_show,
1845         .write = cpuset_write_resmask,
1846         .max_write_len = (100U + 6 * MAX_NUMNODES),
1847         .private = FILE_MEMLIST,
1848     },
1849 
1850     {
1851         .name = "effective_cpus",
1852         .seq_show = cpuset_common_seq_show,
1853         .private = FILE_EFFECTIVE_CPULIST,
1854     },
1855 
1856     {
1857         .name = "effective_mems",
1858         .seq_show = cpuset_common_seq_show,
1859         .private = FILE_EFFECTIVE_MEMLIST,
1860     },
1861 
1862     {
1863         .name = "cpu_exclusive",
1864         .read_u64 = cpuset_read_u64,
1865         .write_u64 = cpuset_write_u64,
1866         .private = FILE_CPU_EXCLUSIVE,
1867     },
1868 
1869     {
1870         .name = "mem_exclusive",
1871         .read_u64 = cpuset_read_u64,
1872         .write_u64 = cpuset_write_u64,
1873         .private = FILE_MEM_EXCLUSIVE,
1874     },
1875 
1876     {
1877         .name = "mem_hardwall",
1878         .read_u64 = cpuset_read_u64,
1879         .write_u64 = cpuset_write_u64,
1880         .private = FILE_MEM_HARDWALL,
1881     },
1882 
1883     {
1884         .name = "sched_load_balance",
1885         .read_u64 = cpuset_read_u64,
1886         .write_u64 = cpuset_write_u64,
1887         .private = FILE_SCHED_LOAD_BALANCE,
1888     },
1889 
1890     {
1891         .name = "sched_relax_domain_level",
1892         .read_s64 = cpuset_read_s64,
1893         .write_s64 = cpuset_write_s64,
1894         .private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
1895     },
1896 
1897     {
1898         .name = "memory_migrate",
1899         .read_u64 = cpuset_read_u64,
1900         .write_u64 = cpuset_write_u64,
1901         .private = FILE_MEMORY_MIGRATE,
1902     },
1903 
1904     {
1905         .name = "memory_pressure",
1906         .read_u64 = cpuset_read_u64,
1907     },
1908 
1909     {
1910         .name = "memory_spread_page",
1911         .read_u64 = cpuset_read_u64,
1912         .write_u64 = cpuset_write_u64,
1913         .private = FILE_SPREAD_PAGE,
1914     },
1915 
1916     {
1917         .name = "memory_spread_slab",
1918         .read_u64 = cpuset_read_u64,
1919         .write_u64 = cpuset_write_u64,
1920         .private = FILE_SPREAD_SLAB,
1921     },
1922 
1923     {
1924         .name = "memory_pressure_enabled",
1925         .flags = CFTYPE_ONLY_ON_ROOT,
1926         .read_u64 = cpuset_read_u64,
1927         .write_u64 = cpuset_write_u64,
1928         .private = FILE_MEMORY_PRESSURE_ENABLED,
1929     },
1930 
1931     { } /* terminate */
1932 };
1933 
1934 /*
1935  *  cpuset_css_alloc - allocate a cpuset css
1936  *  @parent_css: css of the parent cpuset (NULL for the top cpuset)
1937  */
1938 
1939 static struct cgroup_subsys_state *
1940 cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
1941 {
1942     struct cpuset *cs;
1943 
1944     if (!parent_css)
1945         return &top_cpuset.css;
1946 
1947     cs = kzalloc(sizeof(*cs), GFP_KERNEL);
1948     if (!cs)
1949         return ERR_PTR(-ENOMEM);
1950     if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL))
1951         goto free_cs;
1952     if (!alloc_cpumask_var(&cs->effective_cpus, GFP_KERNEL))
1953         goto free_cpus;
1954 
1955     set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
1956     cpumask_clear(cs->cpus_allowed);
1957     nodes_clear(cs->mems_allowed);
1958     cpumask_clear(cs->effective_cpus);
1959     nodes_clear(cs->effective_mems);
1960     fmeter_init(&cs->fmeter);
1961     cs->relax_domain_level = -1;
1962 
1963     return &cs->css;
1964 
1965 free_cpus:
1966     free_cpumask_var(cs->cpus_allowed);
1967 free_cs:
1968     kfree(cs);
1969     return ERR_PTR(-ENOMEM);
1970 }
1971 
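/*
 * Bring a newly created cpuset online: inherit the parent's spread flags
 * and (on the default hierarchy) its effective masks, and clone the full
 * configuration when CGRP_CPUSET_CLONE_CHILDREN is set and no sibling is
 * exclusive.
 */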
1972 static int cpuset_css_online(struct cgroup_subsys_state *css)
1973 {
1974     struct cpuset *cs = css_cs(css);
1975     struct cpuset *parent = parent_cs(cs);
1976     struct cpuset *tmp_cs;
1977     struct cgroup_subsys_state *pos_css;
1978 
1979     if (!parent)
1980         return 0;
1981 
1982     mutex_lock(&cpuset_mutex);
1983 
1984     set_bit(CS_ONLINE, &cs->flags);
1985     if (is_spread_page(parent))
1986         set_bit(CS_SPREAD_PAGE, &cs->flags);
1987     if (is_spread_slab(parent))
1988         set_bit(CS_SPREAD_SLAB, &cs->flags);
1989 
1990     cpuset_inc();
1991 
1992     spin_lock_irq(&callback_lock);
1993     if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
1994         cpumask_copy(cs->effective_cpus, parent->effective_cpus);
1995         cs->effective_mems = parent->effective_mems;
1996     }
1997     spin_unlock_irq(&callback_lock);
1998 
1999     if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
2000         goto out_unlock;
2001 
2002     /*
2003      * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
2004      * set.  This flag handling is implemented in cgroup core for
2005      * historical reasons - the flag may be specified during mount.
2006      *
2007      * Currently, if any sibling cpusets have exclusive cpus or mem, we
2008      * refuse to clone the configuration - thereby refusing to let the
2009      * task be entered, and as a result refusing the sys_unshare() or
2010      * clone() which initiated it.  If this becomes a problem for some
2011      * users who wish to allow that scenario, then this could be
2012      * changed to grant parent->cpus_allowed minus the siblings'
2013      * exclusive cpus (and likewise for mems) to the new cgroup.
2014      */
2015     rcu_read_lock();
2016     cpuset_for_each_child(tmp_cs, pos_css, parent) {
2017         if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {
2018             rcu_read_unlock();
2019             goto out_unlock;
2020         }
2021     }
2022     rcu_read_unlock();
2023 
2024     spin_lock_irq(&callback_lock);
2025     cs->mems_allowed = parent->mems_allowed;
2026     cs->effective_mems = parent->mems_allowed;
2027     cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
2028     cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
2029     spin_unlock_irq(&callback_lock);
2030 out_unlock:
2031     mutex_unlock(&cpuset_mutex);
2032     return 0;
2033 }
2034 
2035 /*
2036  * If the cpuset being removed has its flag 'sched_load_balance'
2037  * enabled, then simulate turning sched_load_balance off, which
2038  * will call rebuild_sched_domains_locked().
2039  */
2040 
2041 static void cpuset_css_offline(struct cgroup_subsys_state *css)
2042 {
2043     struct cpuset *cs = css_cs(css);
2044 
2045     mutex_lock(&cpuset_mutex);
2046 
2047     if (is_sched_load_balance(cs))
2048         update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
2049 
2050     cpuset_dec();
2051     clear_bit(CS_ONLINE, &cs->flags);
2052 
2053     mutex_unlock(&cpuset_mutex);
2054 }
2055 
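/* Final release of a cpuset: free its cpumasks and the structure itself. */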
2056 static void cpuset_css_free(struct cgroup_subsys_state *css)
2057 {
2058     struct cpuset *cs = css_cs(css);
2059 
2060     free_cpumask_var(cs->effective_cpus);
2061     free_cpumask_var(cs->cpus_allowed);
2062     kfree(cs);
2063 }
2064 
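/*
 * Called when the cpuset controller is (re)bound to a hierarchy.  On the
 * default hierarchy the root cpuset spans all possible CPUs and nodes;
 * on the legacy hierarchy its configured masks follow the effective ones.
 */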
2065 static void cpuset_bind(struct cgroup_subsys_state *root_css)
2066 {
2067     mutex_lock(&cpuset_mutex);
2068     spin_lock_irq(&callback_lock);
2069 
2070     if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
2071         cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
2072         top_cpuset.mems_allowed = node_possible_map;
2073     } else {
2074         cpumask_copy(top_cpuset.cpus_allowed,
2075                  top_cpuset.effective_cpus);
2076         top_cpuset.mems_allowed = top_cpuset.effective_mems;
2077     }
2078 
2079     spin_unlock_irq(&callback_lock);
2080     mutex_unlock(&cpuset_mutex);
2081 }
2082 
2083 /*
2084  * Make sure the new task conforms to the current state of its parent,
2085  * which could have been changed by cpuset just after the task inherits
2086  * the state from the parent and before it sits on the cgroup's task list.
2087  */
2088 static void cpuset_fork(struct task_struct *task)
2089 {
2090     if (task_css_is_root(task, cpuset_cgrp_id))
2091         return;
2092 
2093     set_cpus_allowed_ptr(task, &current->cpus_allowed);
2094     task->mems_allowed = current->mems_allowed;
2095 }
2096 
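/* Callbacks hooking the cpuset controller into the cgroup core. */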
2097 struct cgroup_subsys cpuset_cgrp_subsys = {
2098     .css_alloc  = cpuset_css_alloc,
2099     .css_online = cpuset_css_online,
2100     .css_offline    = cpuset_css_offline,
2101     .css_free   = cpuset_css_free,
2102     .can_attach = cpuset_can_attach,
2103     .cancel_attach  = cpuset_cancel_attach,
2104     .attach     = cpuset_attach,
2105     .post_attach    = cpuset_post_attach,
2106     .bind       = cpuset_bind,
2107     .fork       = cpuset_fork,
2108     .legacy_cftypes = files,
2109     .early_init = true,
2110 };
2111 
2112 /**
2113  * cpuset_init - initialize cpusets at system boot
2114  *
2115  * Description: Initialize top_cpuset and the cpuset internal file system.
2116  **/
2117 
2118 int __init cpuset_init(void)
2119 {
2120     int err = 0;
2121 
2122     if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL))
2123         BUG();
2124     if (!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL))
2125         BUG();
2126 
2127     cpumask_setall(top_cpuset.cpus_allowed);
2128     nodes_setall(top_cpuset.mems_allowed);
2129     cpumask_setall(top_cpuset.effective_cpus);
2130     nodes_setall(top_cpuset.effective_mems);
2131 
2132     fmeter_init(&top_cpuset.fmeter);
2133     set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
2134     top_cpuset.relax_domain_level = -1;
2135 
2136     err = register_filesystem(&cpuset_fs_type);
2137     if (err < 0)
2138         return err;
2139 
2140     if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL))
2141         BUG();
2142 
2143     return 0;
2144 }
2145 
2146 /*
2147  * If the CPU and/or memory hotplug handlers below unplug any CPUs
2148  * or memory nodes, we need to walk over the cpuset hierarchy,
2149  * removing that CPU or node from all cpusets.  If this removes the
2150  * last CPU or node from a cpuset, then move the tasks in the empty
2151  * cpuset to its next-highest non-empty parent.
2152  */
2153 static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
2154 {
2155     struct cpuset *parent;
2156 
2157     /*
2158      * Find its next-highest non-empty parent, (top cpuset
2159      * has online cpus, so can't be empty).
2160      */
2161     parent = parent_cs(cs);
2162     while (cpumask_empty(parent->cpus_allowed) ||
2163             nodes_empty(parent->mems_allowed))
2164         parent = parent_cs(parent);
2165 
2166     if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) {
2167         pr_err("cpuset: failed to transfer tasks out of empty cpuset ");
2168         pr_cont_cgroup_name(cs->css.cgroup);
2169         pr_cont("\n");
2170     }
2171 }
2172 
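/*
 * Hotplug handling for the legacy hierarchy: the configured masks track the
 * new hardware state directly, and if the cpuset ends up without any CPU or
 * memory its tasks are moved to the nearest non-empty ancestor.
 */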
2173 static void
2174 hotplug_update_tasks_legacy(struct cpuset *cs,
2175                 struct cpumask *new_cpus, nodemask_t *new_mems,
2176                 bool cpus_updated, bool mems_updated)
2177 {
2178     bool is_empty;
2179 
2180     spin_lock_irq(&callback_lock);
2181     cpumask_copy(cs->cpus_allowed, new_cpus);
2182     cpumask_copy(cs->effective_cpus, new_cpus);
2183     cs->mems_allowed = *new_mems;
2184     cs->effective_mems = *new_mems;
2185     spin_unlock_irq(&callback_lock);
2186 
2187     /*
2188      * Don't call update_tasks_cpumask() if the cpuset becomes empty,
2189      * as the tasks will be migrated to an ancestor.
2190      */
2191     if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
2192         update_tasks_cpumask(cs);
2193     if (mems_updated && !nodes_empty(cs->mems_allowed))
2194         update_tasks_nodemask(cs);
2195 
2196     is_empty = cpumask_empty(cs->cpus_allowed) ||
2197            nodes_empty(cs->mems_allowed);
2198 
2199     mutex_unlock(&cpuset_mutex);
2200 
2201     /*
2202      * Move tasks to the nearest ancestor with execution resources.
2203      * This is a full cgroup operation which will also call back into
2204      * cpuset, so it should be done outside any lock.
2205      */
2206     if (is_empty)
2207         remove_tasks_in_empty_cpuset(cs);
2208 
2209     mutex_lock(&cpuset_mutex);
2210 }
2211 
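/*
 * Hotplug handling for the default hierarchy: only the effective masks are
 * updated, and if they would become empty they fall back to the parent's
 * effective masks rather than leaving the cpuset without resources.
 */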
2212 static void
2213 hotplug_update_tasks(struct cpuset *cs,
2214              struct cpumask *new_cpus, nodemask_t *new_mems,
2215              bool cpus_updated, bool mems_updated)
2216 {
2217     if (cpumask_empty(new_cpus))
2218         cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus);
2219     if (nodes_empty(*new_mems))
2220         *new_mems = parent_cs(cs)->effective_mems;
2221 
2222     spin_lock_irq(&callback_lock);
2223     cpumask_copy(cs->effective_cpus, new_cpus);
2224     cs->effective_mems = *new_mems;
2225     spin_unlock_irq(&callback_lock);
2226 
2227     if (cpus_updated)
2228         update_tasks_cpumask(cs);
2229     if (mems_updated)
2230         update_tasks_nodemask(cs);
2231 }
2232 
2233 /**
2234  * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
2235  * @cs: cpuset of interest
2236  *
2237  * Compare @cs's cpu and mem masks against top_cpuset and if some have gone
2238  * offline, update @cs accordingly.  If @cs ends up with no CPU or memory,
2239  * all its tasks are moved to the nearest ancestor with both resources.
2240  */
2241 static void cpuset_hotplug_update_tasks(struct cpuset *cs)
2242 {
2243     static cpumask_t new_cpus;
2244     static nodemask_t new_mems;
2245     bool cpus_updated;
2246     bool mems_updated;
2247 retry:
2248     wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
2249 
2250     mutex_lock(&cpuset_mutex);
2251 
2252     /*
2253      * We have raced with task attaching. We wait until attaching
2254      * is finished, so we won't attach a task to an empty cpuset.
2255      */
2256     if (cs->attach_in_progress) {
2257         mutex_unlock(&cpuset_mutex);
2258         goto retry;
2259     }
2260 
2261     cpumask_and(&new_cpus, cs->cpus_allowed, parent_cs(cs)->effective_cpus);
2262     nodes_and(new_mems, cs->mems_allowed, parent_cs(cs)->effective_mems);
2263 
2264     cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
2265     mems_updated = !nodes_equal(new_mems, cs->effective_mems);
2266 
2267     if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
2268         hotplug_update_tasks(cs, &new_cpus, &new_mems,
2269                      cpus_updated, mems_updated);
2270     else
2271         hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems,
2272                         cpus_updated, mems_updated);
2273 
2274     mutex_unlock(&cpuset_mutex);
2275 }
2276 
2277 /**
2278  * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
2279  *
2280  * This function is called after either CPU or memory configuration has
2281  * changed and updates cpuset accordingly.  The top_cpuset is always
2282  * synchronized to cpu_active_mask and N_MEMORY, which is necessary in
2283  * order to make cpusets transparent (of no effect) on systems that are
2284  * actively using CPU hotplug but making no active use of cpusets.
2285  *
2286  * Non-root cpusets are only affected by offlining.  If any CPUs or memory
2287  * nodes have been taken down, cpuset_hotplug_update_tasks() is invoked on
2288  * all descendants.
2289  *
2290  * Note that CPU offlining during suspend is ignored.  We don't modify
2291  * cpusets across suspend/resume cycles at all.
2292  */
2293 static void cpuset_hotplug_workfn(struct work_struct *work)
2294 {
2295     static cpumask_t new_cpus;
2296     static nodemask_t new_mems;
2297     bool cpus_updated, mems_updated;
2298     bool on_dfl = cgroup_subsys_on_dfl(cpuset_cgrp_subsys);
2299 
2300     mutex_lock(&cpuset_mutex);
2301 
2302     /* fetch the available cpus/mems and find out which changed how */
2303     cpumask_copy(&new_cpus, cpu_active_mask);
2304     new_mems = node_states[N_MEMORY];
2305 
2306     cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus);
2307     mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems);
2308 
2309     /* synchronize cpus_allowed to cpu_active_mask */
2310     if (cpus_updated) {
2311         spin_lock_irq(&callback_lock);
2312         if (!on_dfl)
2313             cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
2314         cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
2315         spin_unlock_irq(&callback_lock);
2316         /* we don't mess with cpumasks of tasks in top_cpuset */
2317     }
2318 
2319     /* synchronize mems_allowed to N_MEMORY */
2320     if (mems_updated) {
2321         spin_lock_irq(&callback_lock);
2322         if (!on_dfl)
2323             top_cpuset.mems_allowed = new_mems;
2324         top_cpuset.effective_mems = new_mems;
2325         spin_unlock_irq(&callback_lock);
2326         update_tasks_nodemask(&top_cpuset);
2327     }
2328 
2329     mutex_unlock(&cpuset_mutex);
2330 
2331     /* if cpus or mems changed, we need to propagate to descendants */
2332     if (cpus_updated || mems_updated) {
2333         struct cpuset *cs;
2334         struct cgroup_subsys_state *pos_css;
2335 
2336         rcu_read_lock();
2337         cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
2338             if (cs == &top_cpuset || !css_tryget_online(&cs->css))
2339                 continue;
2340             rcu_read_unlock();
2341 
2342             cpuset_hotplug_update_tasks(cs);
2343 
2344             rcu_read_lock();
2345             css_put(&cs->css);
2346         }
2347         rcu_read_unlock();
2348     }
2349 
2350     /* rebuild sched domains if cpus_allowed has changed */
2351     if (cpus_updated)
2352         rebuild_sched_domains();
2353 }
2354 
2355 void cpuset_update_active_cpus(bool cpu_online)
2356 {
2357     /*
2358      * We're inside cpu hotplug critical region which usually nests
2359      * inside cgroup synchronization.  Bounce actual hotplug processing
2360      * to a work item to avoid reverse locking order.
2361      *
2362      * We still need to do partition_sched_domains() synchronously;
2363      * otherwise, the scheduler will get confused and put tasks on the
2364      * dead CPU.  Fall back to the default single domain.
2365      * cpuset_hotplug_workfn() will rebuild it as necessary.
2366      */
2367     partition_sched_domains(1, NULL, NULL);
2368     schedule_work(&cpuset_hotplug_work);
2369 }
2370 
2371 /*
2372  * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY].
2373  * Call this routine anytime after node_states[N_MEMORY] changes.
2374  * See cpuset_update_active_cpus() for CPU hotplug handling.
2375  */
2376 static int cpuset_track_online_nodes(struct notifier_block *self,
2377                 unsigned long action, void *arg)
2378 {
2379     schedule_work(&cpuset_hotplug_work);
2380     return NOTIFY_OK;
2381 }
2382 
2383 static struct notifier_block cpuset_track_online_nodes_nb = {
2384     .notifier_call = cpuset_track_online_nodes,
2385     .priority = 10,     /* ??! */
2386 };
2387 
2388 /**
2389  * cpuset_init_smp - initialize cpus_allowed
2390  *
2391  * Description: Finish initializing the top cpuset after the cpu and node maps are set up
2392  */
2393 void __init cpuset_init_smp(void)
2394 {
2395     cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
2396     top_cpuset.mems_allowed = node_states[N_MEMORY];
2397     top_cpuset.old_mems_allowed = top_cpuset.mems_allowed;
2398 
2399     cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask);
2400     top_cpuset.effective_mems = node_states[N_MEMORY];
2401 
2402     register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
2403 
2404     cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0);
2405     BUG_ON(!cpuset_migrate_mm_wq);
2406 }
2407 
2408 /**
2409  * cpuset_cpus_allowed - return cpus_allowed mask from a task's cpuset.
2410  * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
2411  * @pmask: pointer to struct cpumask variable to receive cpus_allowed set.
2412  *
2413  * Description: Returns the cpumask_var_t cpus_allowed of the cpuset
2414  * attached to the specified @tsk.  Guaranteed to return some non-empty
2415  * subset of cpu_online_mask, even if this means going outside the
2416  * task's cpuset.
2417  **/
2418 
2419 void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
2420 {
2421     unsigned long flags;
2422 
2423     spin_lock_irqsave(&callback_lock, flags);
2424     rcu_read_lock();
2425     guarantee_online_cpus(task_cs(tsk), pmask);
2426     rcu_read_unlock();
2427     spin_unlock_irqrestore(&callback_lock, flags);
2428 }
2429 
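/*
 * Reset @tsk's affinity to its cpuset's effective CPUs.  Used as a last
 * resort by the scheduler's fallback path; see the comment below on why no
 * further locking is needed here.
 */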
2430 void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
2431 {
2432     rcu_read_lock();
2433     do_set_cpus_allowed(tsk, task_cs(tsk)->effective_cpus);
2434     rcu_read_unlock();
2435 
2436     /*
2437      * We own tsk->cpus_allowed, nobody can change it under us.
2438      *
2439      * But we used cs && cs->cpus_allowed locklessly and thus can
2440      * race with cgroup_attach_task() or update_cpumask() and get
2441      * the wrong tsk->cpus_allowed. However, both cases imply the
2442      * subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr()
2443      * which takes task_rq_lock().
2444      *
2445      * If we are called after it dropped the lock we must see all
2446      * changes in tsk_cs()->cpus_allowed. Otherwise we can temporarily
2447      * set any mask even if it is not right from the task_cs() point
2448      * of view; the pending set_cpus_allowed_ptr() will fix things.
2449      *
2450      * select_fallback_rq() will fix things up and set cpu_possible_mask
2451      * if required.
2452      */
2453 }
2454 
2455 void __init cpuset_init_current_mems_allowed(void)
2456 {
2457     nodes_setall(current->mems_allowed);
2458 }
2459 
2460 /**
2461  * cpuset_mems_allowed - return mems_allowed mask from a task's cpuset.
2462  * @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed.
2463  *
2464  * Description: Returns the nodemask_t mems_allowed of the cpuset
2465  * attached to the specified @tsk.  Guaranteed to return some non-empty
2466  * subset of node_states[N_MEMORY], even if this means going outside the
2467  * task's cpuset.
2468  **/
2469 
2470 nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
2471 {
2472     nodemask_t mask;
2473     unsigned long flags;
2474 
2475     spin_lock_irqsave(&callback_lock, flags);
2476     rcu_read_lock();
2477     guarantee_online_mems(task_cs(tsk), &mask);
2478     rcu_read_unlock();
2479     spin_unlock_irqrestore(&callback_lock, flags);
2480 
2481     return mask;
2482 }
2483 
2484 /**
2485  * cpuset_nodemask_valid_mems_allowed - check nodemask vs. current mems_allowed
2486  * @nodemask: the nodemask to be checked
2487  *
2488  * Are any of the nodes in the nodemask allowed in current->mems_allowed?
2489  */
2490 int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
2491 {
2492     return nodes_intersects(*nodemask, current->mems_allowed);
2493 }
2494 
2495 /*
2496  * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or
2497  * mem_hardwall ancestor to the specified cpuset.  Call holding
2498  * callback_lock.  If no ancestor is mem_exclusive or mem_hardwall
2499  * (an unusual configuration), then returns the root cpuset.
2500  */
2501 static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
2502 {
2503     while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs))
2504         cs = parent_cs(cs);
2505     return cs;
2506 }
2507 
2508 /**
2509  * cpuset_node_allowed - Can we allocate on a memory node?
2510  * @node: is this an allowed node?
2511  * @gfp_mask: memory allocation flags
2512  *
2513  * If we're in interrupt, yes, we can always allocate.  If @node is set in
2514  * current's mems_allowed, yes.  If it's not a __GFP_HARDWALL request and this
2515  * node is set in the nearest hardwalled cpuset ancestor to current's cpuset,
2516  * yes.  If current has access to memory reserves due to TIF_MEMDIE, yes.
2517  * Otherwise, no.
2518  *
2519  * GFP_USER allocations are marked with the __GFP_HARDWALL bit,
2520  * and do not allow allocations outside the current task's cpuset
2521  * unless the task has been OOM killed and is marked TIF_MEMDIE.
2522  * GFP_KERNEL allocations are not so marked, so can escape to the
2523  * nearest enclosing hardwalled ancestor cpuset.
2524  *
2525  * Scanning up parent cpusets requires callback_lock.  The
2526  * __alloc_pages() routine only calls here with __GFP_HARDWALL bit
2527  * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the
2528  * current task's mems_allowed came up empty on the first pass over
2529  * the zonelist.  So only GFP_KERNEL allocations, if all nodes in the
2530  * cpuset are short of memory, might require taking the callback_lock.
2531  *
2532  * The first call here from mm/page_alloc:get_page_from_freelist()
2533  * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets,
2534  * so no allocation on a node outside the cpuset is allowed (unless
2535  * in interrupt, of course).
2536  *
2537  * The second pass through get_page_from_freelist() doesn't even call
2538  * here for GFP_ATOMIC calls.  For those calls, the __alloc_pages()
2539  * variable 'wait' is not set, and the bit ALLOC_CPUSET is not set
2540  * in alloc_flags.  That logic and the checks below have the combined
2541  * effect that:
2542  *  in_interrupt - any node ok (current task context irrelevant)
2543  *  GFP_ATOMIC   - any node ok
2544  *  TIF_MEMDIE   - any node ok
2545  *  GFP_KERNEL   - any node in enclosing hardwalled cpuset ok
2546  *  GFP_USER     - only nodes in the current task's mems_allowed ok.
2547  */
2548 bool __cpuset_node_allowed(int node, gfp_t gfp_mask)
2549 {
2550     struct cpuset *cs;      /* current cpuset ancestors */
2551     int allowed;            /* is allocation on this node allowed? */
2552     unsigned long flags;
2553 
2554     if (in_interrupt())
2555         return true;
2556     if (node_isset(node, current->mems_allowed))
2557         return true;
2558     /*
2559      * Allow tasks that have access to memory reserves because they have
2560      * been OOM killed to get memory anywhere.
2561      */
2562     if (unlikely(test_thread_flag(TIF_MEMDIE)))
2563         return true;
2564     if (gfp_mask & __GFP_HARDWALL)  /* If hardwall request, stop here */
2565         return false;
2566 
2567     if (current->flags & PF_EXITING) /* Let dying task have memory */
2568         return true;
2569 
2570     /* Not hardwall and node outside mems_allowed: scan up cpusets */
2571     spin_lock_irqsave(&callback_lock, flags);
2572 
2573     rcu_read_lock();
2574     cs = nearest_hardwall_ancestor(task_cs(current));
2575     allowed = node_isset(node, cs->mems_allowed);
2576     rcu_read_unlock();
2577 
2578     spin_unlock_irqrestore(&callback_lock, flags);
2579     return allowed;
2580 }
2581 
2582 /**
2583  * cpuset_mem_spread_node() - On which node to begin search for a file page
2584  * cpuset_slab_spread_node() - On which node to begin search for a slab page
2585  *
2586  * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
2587  * tasks in a cpuset with is_spread_page or is_spread_slab set),
2588  * and if the memory allocation used cpuset_mem_spread_node()
2589  * to determine on which node to start looking, as it will for
2590  * certain page cache or slab cache pages such as those used for file
2591  * system buffers and inode caches, then instead of starting on the
2592  * local node to look for a free page, we spread the starting
2593  * node around the task's mems_allowed nodes.
2594  *
2595  * We don't have to worry about the returned node being offline
2596  * because "it can't happen", and even if it did, it would be ok.
2597  *
2598  * The routines calling guarantee_online_mems() are careful to
2599  * only set nodes in task->mems_allowed that are online.  So it
2600  * should not be possible for the following code to return an
2601  * offline node.  But if it did, that would be ok, as this routine
2602  * is not returning the node where the allocation must be, only
2603  * the node where the search should start.  The zonelist passed to
2604  * __alloc_pages() will include all nodes.  If the slab allocator
2605  * is passed an offline node, it will fall back to the local node.
2606  * See kmem_cache_alloc_node().
2607  */
2608 
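/*
 * Example (illustrative values): with mems_allowed = {0,2,3} and the rotor
 * at node 0, successive calls return 2, 3, 0, 2, ... - a round-robin walk
 * over the allowed nodes via next_node_in().
 */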
2609 static int cpuset_spread_node(int *rotor)
2610 {
2611     return *rotor = next_node_in(*rotor, current->mems_allowed);
2612 }
2613 
2614 int cpuset_mem_spread_node(void)
2615 {
2616     if (current->cpuset_mem_spread_rotor == NUMA_NO_NODE)
2617         current->cpuset_mem_spread_rotor =
2618             node_random(&current->mems_allowed);
2619 
2620     return cpuset_spread_node(&current->cpuset_mem_spread_rotor);
2621 }
2622 
2623 int cpuset_slab_spread_node(void)
2624 {
2625     if (current->cpuset_slab_spread_rotor == NUMA_NO_NODE)
2626         current->cpuset_slab_spread_rotor =
2627             node_random(&current->mems_allowed);
2628 
2629     return cpuset_spread_node(&current->cpuset_slab_spread_rotor);
2630 }
2631 
2632 EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
2633 
2634 /**
2635  * cpuset_mems_allowed_intersects - Does @tsk1's mems_allowed intersect @tsk2's?
2636  * @tsk1: pointer to task_struct of some task.
2637  * @tsk2: pointer to task_struct of some other task.
2638  *
2639  * Description: Return true if @tsk1's mems_allowed intersects the
2640  * mems_allowed of @tsk2.  Used by the OOM killer to determine if
2641  * one task's memory usage might impact the memory available
2642  * to the other.
2643  **/
2644 
2645 int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
2646                    const struct task_struct *tsk2)
2647 {
2648     return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
2649 }
2650 
2651 /**
2652  * cpuset_print_current_mems_allowed - prints current's cpuset and mems_allowed
2653  *
2654  * Description: Prints current's name, cpuset name, and cached copy of its
2655  * mems_allowed to the kernel log.
2656  */
2657 void cpuset_print_current_mems_allowed(void)
2658 {
2659     struct cgroup *cgrp;
2660 
2661     rcu_read_lock();
2662 
2663     cgrp = task_cs(current)->css.cgroup;
2664     pr_info("%s cpuset=", current->comm);
2665     pr_cont_cgroup_name(cgrp);
2666     pr_cont(" mems_allowed=%*pbl\n",
2667         nodemask_pr_args(&current->mems_allowed));
2668 
2669     rcu_read_unlock();
2670 }
2671 
2672 /*
2673  * Collection of memory_pressure is suppressed unless
2674  * this flag is enabled by writing "1" to the special
2675  * cpuset file 'memory_pressure_enabled' in the root cpuset.
2676  */
2677 
2678 int cpuset_memory_pressure_enabled __read_mostly;
2679 
2680 /**
2681  * cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
2682  *
2683  * Keep a running average of the rate of synchronous (direct)
2684  * page reclaim efforts initiated by tasks in each cpuset.
2685  *
2686  * This represents the rate at which some task in the cpuset
2687  * ran low on memory on all nodes it was allowed to use, and
2688  * had to enter the kernel's page reclaim code in an effort to
2689  * create more free memory by tossing clean pages or swapping
2690  * or writing dirty pages.
2691  *
2692  * Display to user space in the per-cpuset read-only file
2693  * "memory_pressure".  Value displayed is an integer
2694  * representing the recent rate of entry into the synchronous
2695  * (direct) page reclaim by any task attached to the cpuset.
2696  **/
2697 
2698 void __cpuset_memory_pressure_bump(void)
2699 {
2700     rcu_read_lock();
2701     fmeter_markevent(&task_cs(current)->fmeter);
2702     rcu_read_unlock();
2703 }
2704 
2705 #ifdef CONFIG_PROC_PID_CPUSET
2706 /*
2707  * proc_cpuset_show()
2708  *  - Print the task's cpuset path into seq_file.
2709  *  - Used for /proc/<pid>/cpuset.
2710  *  - No need to task_lock(tsk) on this tsk->cpuset reference, as it
2711  *    doesn't really matter if tsk->cpuset changes after we read it,
2712  *    and we take cpuset_mutex, keeping cpuset_attach() from changing it
2713  *    anyway.
2714  */
2715 int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
2716              struct pid *pid, struct task_struct *tsk)
2717 {
2718     char *buf;
2719     struct cgroup_subsys_state *css;
2720     int retval;
2721 
2722     retval = -ENOMEM;
2723     buf = kmalloc(PATH_MAX, GFP_KERNEL);
2724     if (!buf)
2725         goto out;
2726 
2727     css = task_get_css(tsk, cpuset_cgrp_id);
2728     retval = cgroup_path_ns(css->cgroup, buf, PATH_MAX,
2729                 current->nsproxy->cgroup_ns);
2730     css_put(css);
2731     if (retval >= PATH_MAX)
2732         retval = -ENAMETOOLONG;
2733     if (retval < 0)
2734         goto out_free;
2735     seq_puts(m, buf);
2736     seq_putc(m, '\n');
2737     retval = 0;
2738 out_free:
2739     kfree(buf);
2740 out:
2741     return retval;
2742 }
2743 #endif /* CONFIG_PROC_PID_CPUSET */
2744 
2745 /* Display task mems_allowed in /proc/<pid>/status file. */
2746 void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
2747 {
2748     seq_printf(m, "Mems_allowed:\t%*pb\n",
2749            nodemask_pr_args(&task->mems_allowed));
2750     seq_printf(m, "Mems_allowed_list:\t%*pbl\n",
2751            nodemask_pr_args(&task->mems_allowed));
2752 }