0001 /*
0002  * Simple NUMA memory policy for the Linux kernel.
0003  *
0004  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
0005  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
0006  * Subject to the GNU Public License, version 2.
0007  *
0008  * NUMA policy allows the user to give hints in which node(s) memory should
0009  * be allocated.
0010  *
0011  * Support four policies per VMA and per process:
0012  *
0013  * The VMA policy has priority over the process policy for a page fault.
0014  *
0015  * interleave     Allocate memory interleaved over a set of nodes,
0016  *                with normal fallback if it fails.
0017  *                For VMA based allocations this interleaves based on the
0018  *                offset into the backing object or offset into the mapping
0019  *                for anonymous memory. For process policy a process counter
0020  *                is used.
0021  *
0022  * bind           Only allocate memory on a specific set of nodes,
0023  *                no fallback.
0024  *                FIXME: memory is allocated starting with the first node
0025  *                to the last. It would be better if bind would truly restrict
0026  *                the allocation to memory nodes instead
0027  *
0028  * preferred      Try a specific node first before normal fallback.
0029  *                As a special case NUMA_NO_NODE here means do the allocation
0030  *                on the local CPU. This is normally identical to default,
0031  *                but useful to set in a VMA when you have a non default
0032  *                process policy.
0033  *
0034  * default        Allocate on the local node first, or when on a VMA
0035  *                use the process policy. This is what Linux always did
0036  *                in a NUMA-aware kernel and still does by, ahem, default.
0037  *
0038  * The process policy is applied for most non-interrupt memory allocations
0039  * in that process' context. Interrupts ignore the policies and always
0040  * try to allocate on the local CPU. The VMA policy is only applied for memory
0041  * allocations for a VMA in the VM.
0042  *
0043  * Currently there are a few corner cases in swapping where the policy
0044  * is not applied, but the majority should be handled. When process policy
0045  * is used it is not remembered over swap outs/swap ins.
0046  *
0047  * Only the highest zone in the zone hierarchy gets policied. Allocations
0048  * requesting a lower zone just use default policy. This implies that
0049  * on systems with highmem, kernel lowmem allocations don't get policied.
0050  * Same with GFP_DMA allocations.
0051  *
0052  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
0053  * all users and remembered even when nobody has memory mapped.
0054  */
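/*
 * A minimal userspace sketch of selecting a process policy through this
 * interface (illustrative only; it assumes libnuma's <numaif.h> wrapper for
 * the set_mempolicy(2) syscall and a machine with at least two nodes):
 *
 *      #include <numaif.h>
 *      #include <stdio.h>
 *
 *      int main(void)
 *      {
 *              unsigned long nodemask = (1UL << 0) | (1UL << 1);
 *
 *              if (set_mempolicy(MPOL_INTERLEAVE, &nodemask,
 *                                8 * sizeof(nodemask)))
 *                      perror("set_mempolicy");
 *              return 0;
 *      }
 *
 * Subsequent anonymous allocations by the process are then interleaved
 * across nodes 0 and 1, with normal fallback as described above.
 */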
0055 
0056 /* Notebook:
0057    fix mmap readahead to honour policy and enable policy for any page cache
0058    object
0059    statistics for bigpages
0060    global policy for page cache? currently it uses process policy. Requires
0061    first item above.
0062    handle mremap for shared memory (currently ignored for the policy)
0063    grows down?
0064    make bind policy root only? It can trigger oom much faster and the
0065    kernel is not always grateful with that.
0066 */
0067 
0068 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
0069 
0070 #include <linux/mempolicy.h>
0071 #include <linux/mm.h>
0072 #include <linux/highmem.h>
0073 #include <linux/hugetlb.h>
0074 #include <linux/kernel.h>
0075 #include <linux/sched.h>
0076 #include <linux/nodemask.h>
0077 #include <linux/cpuset.h>
0078 #include <linux/slab.h>
0079 #include <linux/string.h>
0080 #include <linux/export.h>
0081 #include <linux/nsproxy.h>
0082 #include <linux/interrupt.h>
0083 #include <linux/init.h>
0084 #include <linux/compat.h>
0085 #include <linux/swap.h>
0086 #include <linux/seq_file.h>
0087 #include <linux/proc_fs.h>
0088 #include <linux/migrate.h>
0089 #include <linux/ksm.h>
0090 #include <linux/rmap.h>
0091 #include <linux/security.h>
0092 #include <linux/syscalls.h>
0093 #include <linux/ctype.h>
0094 #include <linux/mm_inline.h>
0095 #include <linux/mmu_notifier.h>
0096 #include <linux/printk.h>
0097 
0098 #include <asm/tlbflush.h>
0099 #include <linux/uaccess.h>
0100 
0101 #include "internal.h"
0102 
0103 /* Internal flags */
0104 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)    /* Skip checks for continuous vmas */
0105 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)      /* Invert check for nodemask */
0106 
0107 static struct kmem_cache *policy_cache;
0108 static struct kmem_cache *sn_cache;
0109 
0110 /* Highest zone. A specific allocation for a zone below that is not
0111    policied. */
0112 enum zone_type policy_zone = 0;
0113 
0114 /*
0115  * run-time system-wide default policy => local allocation
0116  */
0117 static struct mempolicy default_policy = {
0118     .refcnt = ATOMIC_INIT(1), /* never free it */
0119     .mode = MPOL_PREFERRED,
0120     .flags = MPOL_F_LOCAL,
0121 };
0122 
0123 static struct mempolicy preferred_node_policy[MAX_NUMNODES];
0124 
0125 struct mempolicy *get_task_policy(struct task_struct *p)
0126 {
0127     struct mempolicy *pol = p->mempolicy;
0128     int node;
0129 
0130     if (pol)
0131         return pol;
0132 
0133     node = numa_node_id();
0134     if (node != NUMA_NO_NODE) {
0135         pol = &preferred_node_policy[node];
0136         /* preferred_node_policy is not initialised early in boot */
0137         if (pol->mode)
0138             return pol;
0139     }
0140 
0141     return &default_policy;
0142 }
0143 
0144 static const struct mempolicy_operations {
0145     int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
0146     /*
0147      * If the read-side task has no lock to protect task->mempolicy, the
0148      * write-side task will rebind task->mempolicy in two steps. The first
0149      * step sets all the newly allowed nodes, and the second step clears
0150      * all the disallowed nodes. This way we never leave a window in which
0151      * no node is available to allocate a page from.
0152      * If the read side does hold a lock protecting task->mempolicy, we
0153      * rebind directly.
0154      *
0155      * step:
0156      *  MPOL_REBIND_ONCE - do rebind work at once
0157      *  MPOL_REBIND_STEP1 - set all the newly allowed nodes
0158      *  MPOL_REBIND_STEP2 - clear all the disallowed nodes
0159      */
0160     void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes,
0161             enum mpol_rebind_step step);
0162 } mpol_ops[MPOL_MAX];
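/*
 * For illustration (a sketch of the intended two-step sequence, derived from
 * the comment above): an MPOL_INTERLEAVE policy over nodes {0,1} whose cpuset
 * is rebound to {2,3} covers {0,1,2,3} after MPOL_REBIND_STEP1 and is then
 * trimmed to {2,3} by MPOL_REBIND_STEP2, so a lockless reader never observes
 * an empty nodemask in between.
 */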
0163 
0164 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
0165 {
0166     return pol->flags & MPOL_MODE_FLAGS;
0167 }
0168 
0169 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
0170                    const nodemask_t *rel)
0171 {
0172     nodemask_t tmp;
0173     nodes_fold(tmp, *orig, nodes_weight(*rel));
0174     nodes_onto(*ret, tmp, *rel);
0175 }
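/*
 * For illustration: a relative nodemask of {0,2} evaluated against allowed
 * nodes {4,5,6} is folded modulo the three allowed nodes and then mapped
 * onto the first and third of them, yielding {4,6}.
 */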
0176 
0177 static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
0178 {
0179     if (nodes_empty(*nodes))
0180         return -EINVAL;
0181     pol->v.nodes = *nodes;
0182     return 0;
0183 }
0184 
0185 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
0186 {
0187     if (!nodes)
0188         pol->flags |= MPOL_F_LOCAL; /* local allocation */
0189     else if (nodes_empty(*nodes))
0190         return -EINVAL;         /*  no allowed nodes */
0191     else
0192         pol->v.preferred_node = first_node(*nodes);
0193     return 0;
0194 }
0195 
0196 static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
0197 {
0198     if (nodes_empty(*nodes))
0199         return -EINVAL;
0200     pol->v.nodes = *nodes;
0201     return 0;
0202 }
0203 
0204 /*
0205  * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
0206  * any, for the new policy.  mpol_new() has already validated the nodes
0207  * parameter with respect to the policy mode and flags.  But, we need to
0208  * handle an empty nodemask with MPOL_PREFERRED here.
0209  *
0210  * Must be called holding task's alloc_lock to protect task's mems_allowed
0211  * and mempolicy.  May also be called holding the mmap_semaphore for write.
0212  */
0213 static int mpol_set_nodemask(struct mempolicy *pol,
0214              const nodemask_t *nodes, struct nodemask_scratch *nsc)
0215 {
0216     int ret;
0217 
0218     /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
0219     if (pol == NULL)
0220         return 0;
0221     /* Check N_MEMORY */
0222     nodes_and(nsc->mask1,
0223           cpuset_current_mems_allowed, node_states[N_MEMORY]);
0224 
0225     VM_BUG_ON(!nodes);
0226     if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
0227         nodes = NULL;   /* explicit local allocation */
0228     else {
0229         if (pol->flags & MPOL_F_RELATIVE_NODES)
0230             mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
0231         else
0232             nodes_and(nsc->mask2, *nodes, nsc->mask1);
0233 
0234         if (mpol_store_user_nodemask(pol))
0235             pol->w.user_nodemask = *nodes;
0236         else
0237             pol->w.cpuset_mems_allowed =
0238                         cpuset_current_mems_allowed;
0239     }
0240 
0241     if (nodes)
0242         ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
0243     else
0244         ret = mpol_ops[pol->mode].create(pol, NULL);
0245     return ret;
0246 }
0247 
0248 /*
0249  * This function just creates a new policy, does some checks and simple
0250  * initialization. You must invoke mpol_set_nodemask() to set nodes.
0251  */
0252 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
0253                   nodemask_t *nodes)
0254 {
0255     struct mempolicy *policy;
0256 
0257     pr_debug("setting mode %d flags %d nodes[0] %lx\n",
0258          mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
0259 
0260     if (mode == MPOL_DEFAULT) {
0261         if (nodes && !nodes_empty(*nodes))
0262             return ERR_PTR(-EINVAL);
0263         return NULL;
0264     }
0265     VM_BUG_ON(!nodes);
0266 
0267     /*
0268      * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
0269      * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
0270      * All other modes require a valid pointer to a non-empty nodemask.
0271      */
0272     if (mode == MPOL_PREFERRED) {
0273         if (nodes_empty(*nodes)) {
0274             if (((flags & MPOL_F_STATIC_NODES) ||
0275                  (flags & MPOL_F_RELATIVE_NODES)))
0276                 return ERR_PTR(-EINVAL);
0277         }
0278     } else if (mode == MPOL_LOCAL) {
0279         if (!nodes_empty(*nodes) ||
0280             (flags & MPOL_F_STATIC_NODES) ||
0281             (flags & MPOL_F_RELATIVE_NODES))
0282             return ERR_PTR(-EINVAL);
0283         mode = MPOL_PREFERRED;
0284     } else if (nodes_empty(*nodes))
0285         return ERR_PTR(-EINVAL);
0286     policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
0287     if (!policy)
0288         return ERR_PTR(-ENOMEM);
0289     atomic_set(&policy->refcnt, 1);
0290     policy->mode = mode;
0291     policy->flags = flags;
0292 
0293     return policy;
0294 }
0295 
0296 /* Slow path of a mpol destructor. */
0297 void __mpol_put(struct mempolicy *p)
0298 {
0299     if (!atomic_dec_and_test(&p->refcnt))
0300         return;
0301     kmem_cache_free(policy_cache, p);
0302 }
0303 
0304 static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes,
0305                 enum mpol_rebind_step step)
0306 {
0307 }
0308 
0309 /*
0310  * step:
0311  *  MPOL_REBIND_ONCE  - do rebind work at once
0312  *  MPOL_REBIND_STEP1 - set all the newly allowed nodes
0313  *  MPOL_REBIND_STEP2 - clear all the disallowed nodes
0314  */
0315 static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
0316                  enum mpol_rebind_step step)
0317 {
0318     nodemask_t tmp;
0319 
0320     if (pol->flags & MPOL_F_STATIC_NODES)
0321         nodes_and(tmp, pol->w.user_nodemask, *nodes);
0322     else if (pol->flags & MPOL_F_RELATIVE_NODES)
0323         mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
0324     else {
0325         /*
0326          * if step == 1, we use ->w.cpuset_mems_allowed to cache the
0327          * result
0328          */
0329         if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) {
0330             nodes_remap(tmp, pol->v.nodes,
0331                     pol->w.cpuset_mems_allowed, *nodes);
0332             pol->w.cpuset_mems_allowed = step ? tmp : *nodes;
0333         } else if (step == MPOL_REBIND_STEP2) {
0334             tmp = pol->w.cpuset_mems_allowed;
0335             pol->w.cpuset_mems_allowed = *nodes;
0336         } else
0337             BUG();
0338     }
0339 
0340     if (nodes_empty(tmp))
0341         tmp = *nodes;
0342 
0343     if (step == MPOL_REBIND_STEP1)
0344         nodes_or(pol->v.nodes, pol->v.nodes, tmp);
0345     else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2)
0346         pol->v.nodes = tmp;
0347     else
0348         BUG();
0349 
0350     if (!node_isset(current->il_next, tmp)) {
0351         current->il_next = next_node_in(current->il_next, tmp);
0352         if (current->il_next >= MAX_NUMNODES)
0353             current->il_next = numa_node_id();
0354     }
0355 }
0356 
0357 static void mpol_rebind_preferred(struct mempolicy *pol,
0358                   const nodemask_t *nodes,
0359                   enum mpol_rebind_step step)
0360 {
0361     nodemask_t tmp;
0362 
0363     if (pol->flags & MPOL_F_STATIC_NODES) {
0364         int node = first_node(pol->w.user_nodemask);
0365 
0366         if (node_isset(node, *nodes)) {
0367             pol->v.preferred_node = node;
0368             pol->flags &= ~MPOL_F_LOCAL;
0369         } else
0370             pol->flags |= MPOL_F_LOCAL;
0371     } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
0372         mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
0373         pol->v.preferred_node = first_node(tmp);
0374     } else if (!(pol->flags & MPOL_F_LOCAL)) {
0375         pol->v.preferred_node = node_remap(pol->v.preferred_node,
0376                            pol->w.cpuset_mems_allowed,
0377                            *nodes);
0378         pol->w.cpuset_mems_allowed = *nodes;
0379     }
0380 }
0381 
0382 /*
0383  * mpol_rebind_policy - Migrate a policy to a different set of nodes
0384  *
0385  * If the read-side task has no lock to protect task->mempolicy, the
0386  * write-side task will rebind task->mempolicy in two steps. The first
0387  * step sets all the newly allowed nodes, and the second step clears
0388  * all the disallowed nodes. This way we never leave a window in which
0389  * no node is available to allocate a page from.
0390  * If the read side does hold a lock protecting task->mempolicy, we
0391  * rebind directly.
0392  *
0393  * step:
0394  *  MPOL_REBIND_ONCE  - do rebind work at once
0395  *  MPOL_REBIND_STEP1 - set all the newly allowed nodes
0396  *  MPOL_REBIND_STEP2 - clear all the disallowed nodes
0397  */
0398 static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
0399                 enum mpol_rebind_step step)
0400 {
0401     if (!pol)
0402         return;
0403     if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE &&
0404         nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
0405         return;
0406 
0407     if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING))
0408         return;
0409 
0410     if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING))
0411         BUG();
0412 
0413     if (step == MPOL_REBIND_STEP1)
0414         pol->flags |= MPOL_F_REBINDING;
0415     else if (step == MPOL_REBIND_STEP2)
0416         pol->flags &= ~MPOL_F_REBINDING;
0417     else if (step >= MPOL_REBIND_NSTEP)
0418         BUG();
0419 
0420     mpol_ops[pol->mode].rebind(pol, newmask, step);
0421 }
0422 
0423 /*
0424  * Wrapper for mpol_rebind_policy() that just requires task
0425  * pointer, and updates task mempolicy.
0426  *
0427  * Called with task's alloc_lock held.
0428  */
0429 
0430 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
0431             enum mpol_rebind_step step)
0432 {
0433     mpol_rebind_policy(tsk->mempolicy, new, step);
0434 }
0435 
0436 /*
0437  * Rebind each vma in mm to new nodemask.
0438  *
0439  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
0440  */
0441 
0442 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
0443 {
0444     struct vm_area_struct *vma;
0445 
0446     down_write(&mm->mmap_sem);
0447     for (vma = mm->mmap; vma; vma = vma->vm_next)
0448         mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE);
0449     up_write(&mm->mmap_sem);
0450 }
0451 
0452 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
0453     [MPOL_DEFAULT] = {
0454         .rebind = mpol_rebind_default,
0455     },
0456     [MPOL_INTERLEAVE] = {
0457         .create = mpol_new_interleave,
0458         .rebind = mpol_rebind_nodemask,
0459     },
0460     [MPOL_PREFERRED] = {
0461         .create = mpol_new_preferred,
0462         .rebind = mpol_rebind_preferred,
0463     },
0464     [MPOL_BIND] = {
0465         .create = mpol_new_bind,
0466         .rebind = mpol_rebind_nodemask,
0467     },
0468 };
0469 
0470 static void migrate_page_add(struct page *page, struct list_head *pagelist,
0471                 unsigned long flags);
0472 
0473 struct queue_pages {
0474     struct list_head *pagelist;
0475     unsigned long flags;
0476     nodemask_t *nmask;
0477     struct vm_area_struct *prev;
0478 };
0479 
0480 /*
0481  * Scan through the pages, checking whether they meet certain conditions,
0482  * and move them to the pagelist if they do.
0483  */
0484 static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
0485             unsigned long end, struct mm_walk *walk)
0486 {
0487     struct vm_area_struct *vma = walk->vma;
0488     struct page *page;
0489     struct queue_pages *qp = walk->private;
0490     unsigned long flags = qp->flags;
0491     int nid, ret;
0492     pte_t *pte;
0493     spinlock_t *ptl;
0494 
0495     if (pmd_trans_huge(*pmd)) {
0496         ptl = pmd_lock(walk->mm, pmd);
0497         if (pmd_trans_huge(*pmd)) {
0498             page = pmd_page(*pmd);
0499             if (is_huge_zero_page(page)) {
0500                 spin_unlock(ptl);
0501                 __split_huge_pmd(vma, pmd, addr, false, NULL);
0502             } else {
0503                 get_page(page);
0504                 spin_unlock(ptl);
0505                 lock_page(page);
0506                 ret = split_huge_page(page);
0507                 unlock_page(page);
0508                 put_page(page);
0509                 if (ret)
0510                     return 0;
0511             }
0512         } else {
0513             spin_unlock(ptl);
0514         }
0515     }
0516 
0517     if (pmd_trans_unstable(pmd))
0518         return 0;
0519 retry:
0520     pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
0521     for (; addr != end; pte++, addr += PAGE_SIZE) {
0522         if (!pte_present(*pte))
0523             continue;
0524         page = vm_normal_page(vma, addr, *pte);
0525         if (!page)
0526             continue;
0527         /*
0528          * vm_normal_page() filters out zero pages, but there might
0529          * still be PageReserved pages to skip, perhaps in a VDSO.
0530          */
0531         if (PageReserved(page))
0532             continue;
0533         nid = page_to_nid(page);
0534         if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT))
0535             continue;
0536         if (PageTransCompound(page)) {
0537             get_page(page);
0538             pte_unmap_unlock(pte, ptl);
0539             lock_page(page);
0540             ret = split_huge_page(page);
0541             unlock_page(page);
0542             put_page(page);
0543             /* Failed to split -- skip. */
0544             if (ret) {
0545                 pte = pte_offset_map_lock(walk->mm, pmd,
0546                         addr, &ptl);
0547                 continue;
0548             }
0549             goto retry;
0550         }
0551 
0552         migrate_page_add(page, qp->pagelist, flags);
0553     }
0554     pte_unmap_unlock(pte - 1, ptl);
0555     cond_resched();
0556     return 0;
0557 }
0558 
0559 static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
0560                    unsigned long addr, unsigned long end,
0561                    struct mm_walk *walk)
0562 {
0563 #ifdef CONFIG_HUGETLB_PAGE
0564     struct queue_pages *qp = walk->private;
0565     unsigned long flags = qp->flags;
0566     int nid;
0567     struct page *page;
0568     spinlock_t *ptl;
0569     pte_t entry;
0570 
0571     ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
0572     entry = huge_ptep_get(pte);
0573     if (!pte_present(entry))
0574         goto unlock;
0575     page = pte_page(entry);
0576     nid = page_to_nid(page);
0577     if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT))
0578         goto unlock;
0579     /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
0580     if (flags & (MPOL_MF_MOVE_ALL) ||
0581         (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
0582         isolate_huge_page(page, qp->pagelist);
0583 unlock:
0584     spin_unlock(ptl);
0585 #else
0586     BUG();
0587 #endif
0588     return 0;
0589 }
0590 
0591 #ifdef CONFIG_NUMA_BALANCING
0592 /*
0593  * This is used to mark a range of virtual addresses as inaccessible.
0594  * These are later cleared by a NUMA hinting fault. Depending on these
0595  * faults, pages may be migrated for better NUMA placement.
0596  *
0597  * This assumes that NUMA faults are handled using PROT_NONE. If
0598  * an architecture makes a different choice, it will need further
0599  * changes to the core.
0600  */
0601 unsigned long change_prot_numa(struct vm_area_struct *vma,
0602             unsigned long addr, unsigned long end)
0603 {
0604     int nr_updated;
0605 
0606     nr_updated = change_protection(vma, addr, end, PAGE_NONE, 0, 1);
0607     if (nr_updated)
0608         count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
0609 
0610     return nr_updated;
0611 }
0612 #else
0613 static unsigned long change_prot_numa(struct vm_area_struct *vma,
0614             unsigned long addr, unsigned long end)
0615 {
0616     return 0;
0617 }
0618 #endif /* CONFIG_NUMA_BALANCING */
0619 
0620 static int queue_pages_test_walk(unsigned long start, unsigned long end,
0621                 struct mm_walk *walk)
0622 {
0623     struct vm_area_struct *vma = walk->vma;
0624     struct queue_pages *qp = walk->private;
0625     unsigned long endvma = vma->vm_end;
0626     unsigned long flags = qp->flags;
0627 
0628     if (!vma_migratable(vma))
0629         return 1;
0630 
0631     if (endvma > end)
0632         endvma = end;
0633     if (vma->vm_start > start)
0634         start = vma->vm_start;
0635 
0636     if (!(flags & MPOL_MF_DISCONTIG_OK)) {
0637         if (!vma->vm_next && vma->vm_end < end)
0638             return -EFAULT;
0639         if (qp->prev && qp->prev->vm_end < vma->vm_start)
0640             return -EFAULT;
0641     }
0642 
0643     qp->prev = vma;
0644 
0645     if (flags & MPOL_MF_LAZY) {
0646         /* Similar to task_numa_work, skip inaccessible VMAs */
0647         if (!is_vm_hugetlb_page(vma) &&
0648             (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) &&
0649             !(vma->vm_flags & VM_MIXEDMAP))
0650             change_prot_numa(vma, start, endvma);
0651         return 1;
0652     }
0653 
0654     /* queue pages from current vma */
0655     if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
0656         return 0;
0657     return 1;
0658 }
0659 
0660 /*
0661  * Walk through page tables and collect pages to be migrated.
0662  *
0663  * If pages found in a given range are on a set of nodes (determined by
0664  * @nodes and @flags), they are isolated and queued on the pagelist
0665  * passed in via @pagelist.
0666  */
0667 static int
0668 queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
0669         nodemask_t *nodes, unsigned long flags,
0670         struct list_head *pagelist)
0671 {
0672     struct queue_pages qp = {
0673         .pagelist = pagelist,
0674         .flags = flags,
0675         .nmask = nodes,
0676         .prev = NULL,
0677     };
0678     struct mm_walk queue_pages_walk = {
0679         .hugetlb_entry = queue_pages_hugetlb,
0680         .pmd_entry = queue_pages_pte_range,
0681         .test_walk = queue_pages_test_walk,
0682         .mm = mm,
0683         .private = &qp,
0684     };
0685 
0686     return walk_page_range(start, end, &queue_pages_walk);
0687 }
0688 
0689 /*
0690  * Apply policy to a single VMA
0691  * This must be called with the mmap_sem held for writing.
0692  */
0693 static int vma_replace_policy(struct vm_area_struct *vma,
0694                         struct mempolicy *pol)
0695 {
0696     int err;
0697     struct mempolicy *old;
0698     struct mempolicy *new;
0699 
0700     pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
0701          vma->vm_start, vma->vm_end, vma->vm_pgoff,
0702          vma->vm_ops, vma->vm_file,
0703          vma->vm_ops ? vma->vm_ops->set_policy : NULL);
0704 
0705     new = mpol_dup(pol);
0706     if (IS_ERR(new))
0707         return PTR_ERR(new);
0708 
0709     if (vma->vm_ops && vma->vm_ops->set_policy) {
0710         err = vma->vm_ops->set_policy(vma, new);
0711         if (err)
0712             goto err_out;
0713     }
0714 
0715     old = vma->vm_policy;
0716     vma->vm_policy = new; /* protected by mmap_sem */
0717     mpol_put(old);
0718 
0719     return 0;
0720  err_out:
0721     mpol_put(new);
0722     return err;
0723 }
0724 
0725 /* Step 2: apply policy to a range and do splits. */
0726 static int mbind_range(struct mm_struct *mm, unsigned long start,
0727                unsigned long end, struct mempolicy *new_pol)
0728 {
0729     struct vm_area_struct *next;
0730     struct vm_area_struct *prev;
0731     struct vm_area_struct *vma;
0732     int err = 0;
0733     pgoff_t pgoff;
0734     unsigned long vmstart;
0735     unsigned long vmend;
0736 
0737     vma = find_vma(mm, start);
0738     if (!vma || vma->vm_start > start)
0739         return -EFAULT;
0740 
0741     prev = vma->vm_prev;
0742     if (start > vma->vm_start)
0743         prev = vma;
0744 
0745     for (; vma && vma->vm_start < end; prev = vma, vma = next) {
0746         next = vma->vm_next;
0747         vmstart = max(start, vma->vm_start);
0748         vmend   = min(end, vma->vm_end);
0749 
0750         if (mpol_equal(vma_policy(vma), new_pol))
0751             continue;
0752 
0753         pgoff = vma->vm_pgoff +
0754             ((vmstart - vma->vm_start) >> PAGE_SHIFT);
0755         prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
0756                  vma->anon_vma, vma->vm_file, pgoff,
0757                  new_pol, vma->vm_userfaultfd_ctx);
0758         if (prev) {
0759             vma = prev;
0760             next = vma->vm_next;
0761             if (mpol_equal(vma_policy(vma), new_pol))
0762                 continue;
0763             /* vma_merge() joined vma && vma->next, case 8 */
0764             goto replace;
0765         }
0766         if (vma->vm_start != vmstart) {
0767             err = split_vma(vma->vm_mm, vma, vmstart, 1);
0768             if (err)
0769                 goto out;
0770         }
0771         if (vma->vm_end != vmend) {
0772             err = split_vma(vma->vm_mm, vma, vmend, 0);
0773             if (err)
0774                 goto out;
0775         }
0776  replace:
0777         err = vma_replace_policy(vma, new_pol);
0778         if (err)
0779             goto out;
0780     }
0781 
0782  out:
0783     return err;
0784 }
0785 
0786 /* Set the process memory policy */
0787 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
0788                  nodemask_t *nodes)
0789 {
0790     struct mempolicy *new, *old;
0791     NODEMASK_SCRATCH(scratch);
0792     int ret;
0793 
0794     if (!scratch)
0795         return -ENOMEM;
0796 
0797     new = mpol_new(mode, flags, nodes);
0798     if (IS_ERR(new)) {
0799         ret = PTR_ERR(new);
0800         goto out;
0801     }
0802 
0803     task_lock(current);
0804     ret = mpol_set_nodemask(new, nodes, scratch);
0805     if (ret) {
0806         task_unlock(current);
0807         mpol_put(new);
0808         goto out;
0809     }
0810     old = current->mempolicy;
0811     current->mempolicy = new;
0812     if (new && new->mode == MPOL_INTERLEAVE &&
0813         nodes_weight(new->v.nodes))
0814         current->il_next = first_node(new->v.nodes);
0815     task_unlock(current);
0816     mpol_put(old);
0817     ret = 0;
0818 out:
0819     NODEMASK_SCRATCH_FREE(scratch);
0820     return ret;
0821 }
0822 
0823 /*
0824  * Return nodemask for policy for get_mempolicy() query
0825  *
0826  * Called with task's alloc_lock held
0827  */
0828 static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
0829 {
0830     nodes_clear(*nodes);
0831     if (p == &default_policy)
0832         return;
0833 
0834     switch (p->mode) {
0835     case MPOL_BIND:
0836         /* Fall through */
0837     case MPOL_INTERLEAVE:
0838         *nodes = p->v.nodes;
0839         break;
0840     case MPOL_PREFERRED:
0841         if (!(p->flags & MPOL_F_LOCAL))
0842             node_set(p->v.preferred_node, *nodes);
0843         /* else return empty node mask for local allocation */
0844         break;
0845     default:
0846         BUG();
0847     }
0848 }
0849 
0850 static int lookup_node(unsigned long addr)
0851 {
0852     struct page *p;
0853     int err;
0854 
0855     err = get_user_pages(addr & PAGE_MASK, 1, 0, &p, NULL);
0856     if (err >= 0) {
0857         err = page_to_nid(p);
0858         put_page(p);
0859     }
0860     return err;
0861 }
0862 
0863 /* Retrieve NUMA policy */
0864 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
0865                  unsigned long addr, unsigned long flags)
0866 {
0867     int err;
0868     struct mm_struct *mm = current->mm;
0869     struct vm_area_struct *vma = NULL;
0870     struct mempolicy *pol = current->mempolicy;
0871 
0872     if (flags &
0873         ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
0874         return -EINVAL;
0875 
0876     if (flags & MPOL_F_MEMS_ALLOWED) {
0877         if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
0878             return -EINVAL;
0879         *policy = 0;    /* just so it's initialized */
0880         task_lock(current);
0881         *nmask  = cpuset_current_mems_allowed;
0882         task_unlock(current);
0883         return 0;
0884     }
0885 
0886     if (flags & MPOL_F_ADDR) {
0887         /*
0888          * Do NOT fall back to task policy if the
0889          * vma/shared policy at addr is NULL.  We
0890          * want to return MPOL_DEFAULT in this case.
0891          */
0892         down_read(&mm->mmap_sem);
0893         vma = find_vma_intersection(mm, addr, addr+1);
0894         if (!vma) {
0895             up_read(&mm->mmap_sem);
0896             return -EFAULT;
0897         }
0898         if (vma->vm_ops && vma->vm_ops->get_policy)
0899             pol = vma->vm_ops->get_policy(vma, addr);
0900         else
0901             pol = vma->vm_policy;
0902     } else if (addr)
0903         return -EINVAL;
0904 
0905     if (!pol)
0906         pol = &default_policy;  /* indicates default behavior */
0907 
0908     if (flags & MPOL_F_NODE) {
0909         if (flags & MPOL_F_ADDR) {
0910             err = lookup_node(addr);
0911             if (err < 0)
0912                 goto out;
0913             *policy = err;
0914         } else if (pol == current->mempolicy &&
0915                 pol->mode == MPOL_INTERLEAVE) {
0916             *policy = current->il_next;
0917         } else {
0918             err = -EINVAL;
0919             goto out;
0920         }
0921     } else {
0922         *policy = pol == &default_policy ? MPOL_DEFAULT :
0923                         pol->mode;
0924         /*
0925          * Internal mempolicy flags must be masked off before exposing
0926          * the policy to userspace.
0927          */
0928         *policy |= (pol->flags & MPOL_MODE_FLAGS);
0929     }
0930 
0931     if (vma) {
0932         up_read(&current->mm->mmap_sem);
0933         vma = NULL;
0934     }
0935 
0936     err = 0;
0937     if (nmask) {
0938         if (mpol_store_user_nodemask(pol)) {
0939             *nmask = pol->w.user_nodemask;
0940         } else {
0941             task_lock(current);
0942             get_policy_nodemask(pol, nmask);
0943             task_unlock(current);
0944         }
0945     }
0946 
0947  out:
0948     mpol_cond_put(pol);
0949     if (vma)
0950         up_read(&current->mm->mmap_sem);
0951     return err;
0952 }
0953 
0954 #ifdef CONFIG_MIGRATION
0955 /*
0956  * page migration
0957  */
0958 static void migrate_page_add(struct page *page, struct list_head *pagelist,
0959                 unsigned long flags)
0960 {
0961     /*
0962      * Avoid migrating a page that is shared with others.
0963      */
0964     if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
0965         if (!isolate_lru_page(page)) {
0966             list_add_tail(&page->lru, pagelist);
0967             inc_node_page_state(page, NR_ISOLATED_ANON +
0968                         page_is_file_cache(page));
0969         }
0970     }
0971 }
0972 
0973 static struct page *new_node_page(struct page *page, unsigned long node, int **x)
0974 {
0975     if (PageHuge(page))
0976         return alloc_huge_page_node(page_hstate(compound_head(page)),
0977                     node);
0978     else
0979         return __alloc_pages_node(node, GFP_HIGHUSER_MOVABLE |
0980                             __GFP_THISNODE, 0);
0981 }
0982 
0983 /*
0984  * Migrate pages from one node to a target node.
0985  * Returns error or the number of pages not migrated.
0986  */
0987 static int migrate_to_node(struct mm_struct *mm, int source, int dest,
0988                int flags)
0989 {
0990     nodemask_t nmask;
0991     LIST_HEAD(pagelist);
0992     int err = 0;
0993 
0994     nodes_clear(nmask);
0995     node_set(source, nmask);
0996 
0997     /*
0998      * This does not "check" the range but isolates all pages that
0999      * need migration.  Between passing in the full user address
1000      * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail.
1001      */
1002     VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1003     queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
1004             flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1005 
1006     if (!list_empty(&pagelist)) {
1007         err = migrate_pages(&pagelist, new_node_page, NULL, dest,
1008                     MIGRATE_SYNC, MR_SYSCALL);
1009         if (err)
1010             putback_movable_pages(&pagelist);
1011     }
1012 
1013     return err;
1014 }
1015 
1016 /*
1017  * Move pages between the two nodesets so as to preserve the physical
1018  * layout as much as possible.
1019  *
1020  * Returns the number of pages that could not be moved.
1021  */
1022 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1023              const nodemask_t *to, int flags)
1024 {
1025     int busy = 0;
1026     int err;
1027     nodemask_t tmp;
1028 
1029     err = migrate_prep();
1030     if (err)
1031         return err;
1032 
1033     down_read(&mm->mmap_sem);
1034 
1035     /*
1036      * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1037      * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
1038      * bit in 'tmp', and return that <source, dest> pair for migration.
1039      * The pair of nodemasks 'to' and 'from' define the map.
1040      *
1041      * If no pair of bits is found that way, fall back to picking some
1042      * pair of 'source' and 'dest' bits that are not the same.  If the
1043      * 'source' and 'dest' bits are the same, this represents a node
1044      * that will be migrating to itself, so no pages need to move.
1045      *
1046      * If no bits are left in 'tmp', or if all remaining bits left
1047      * in 'tmp' correspond to the same bit in 'to', return false
1048      * (nothing left to migrate).
1049      *
1050      * This lets us pick a pair of nodes to migrate between, such that
1051      * if possible the dest node is not already occupied by some other
1052      * source node, minimizing the risk of overloading the memory on a
1053      * node that would happen if we migrated incoming memory to a node
1054      * before migrating outgoing memory off that same node.
1055      *
1056      * A single scan of tmp is sufficient.  As we go, we remember the
1057      * most recent <s, d> pair that moved (s != d).  If we find a pair
1058      * that not only moved, but what's better, moved to an empty slot
1059      * (d is not set in tmp), then we break out then, with that pair.
1060      * Otherwise when we finish scanning from_tmp, we at least have the
1061      * most recent <s, d> pair that moved.  If we get all the way through
1062      * the scan of tmp without finding any node that moved, much less
1063      * moved to an empty node, then there is nothing left worth migrating.
1064      */
1065 
1066     tmp = *from;
1067     while (!nodes_empty(tmp)) {
1068         int s,d;
1069         int source = NUMA_NO_NODE;
1070         int dest = 0;
1071 
1072         for_each_node_mask(s, tmp) {
1073 
1074             /*
1075              * do_migrate_pages() tries to maintain the relative
1076              * node relationship of the pages established between
1077              * threads and memory areas.
1078              *
1079              * However if the number of source nodes is not equal to
1080              * the number of destination nodes we can not preserve
1081              * this node relative relationship.  In that case, skip
1082              * copying memory from a node that is in the destination
1083              * mask.
1084              *
1085              * Example: [2,3,4] -> [3,4,5] moves everything.
1086              *          [0-7] -> [3,4,5] moves only 0,1,2,6,7.
1087              */
1088 
1089             if ((nodes_weight(*from) != nodes_weight(*to)) &&
1090                         (node_isset(s, *to)))
1091                 continue;
1092 
1093             d = node_remap(s, *from, *to);
1094             if (s == d)
1095                 continue;
1096 
1097             source = s; /* Node moved. Memorize */
1098             dest = d;
1099 
1100             /* dest not in remaining from nodes? */
1101             if (!node_isset(dest, tmp))
1102                 break;
1103         }
1104         if (source == NUMA_NO_NODE)
1105             break;
1106 
1107         node_clear(source, tmp);
1108         err = migrate_to_node(mm, source, dest, flags);
1109         if (err > 0)
1110             busy += err;
1111         if (err < 0)
1112             break;
1113     }
1114     up_read(&mm->mmap_sem);
1115     if (err < 0)
1116         return err;
1117     return busy;
1118 
1119 }
1120 
1121 /*
1122  * Allocate a new page for page migration based on vma policy.
1123  * Start by assuming the page is mapped by the same vma that contains @start.
1124  * Search forward from there, if not.  N.B., this assumes that the
1125  * list of pages handed to migrate_pages()--which is how we get here--
1126  * is in virtual address order.
1127  */
1128 static struct page *new_page(struct page *page, unsigned long start, int **x)
1129 {
1130     struct vm_area_struct *vma;
1131     unsigned long uninitialized_var(address);
1132 
1133     vma = find_vma(current->mm, start);
1134     while (vma) {
1135         address = page_address_in_vma(page, vma);
1136         if (address != -EFAULT)
1137             break;
1138         vma = vma->vm_next;
1139     }
1140 
1141     if (PageHuge(page)) {
1142         BUG_ON(!vma);
1143         return alloc_huge_page_noerr(vma, address, 1);
1144     }
1145     /*
1146      * if !vma, alloc_page_vma() will use task or system default policy
1147      */
1148     return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1149 }
1150 #else
1151 
1152 static void migrate_page_add(struct page *page, struct list_head *pagelist,
1153                 unsigned long flags)
1154 {
1155 }
1156 
1157 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1158              const nodemask_t *to, int flags)
1159 {
1160     return -ENOSYS;
1161 }
1162 
1163 static struct page *new_page(struct page *page, unsigned long start, int **x)
1164 {
1165     return NULL;
1166 }
1167 #endif
1168 
1169 static long do_mbind(unsigned long start, unsigned long len,
1170              unsigned short mode, unsigned short mode_flags,
1171              nodemask_t *nmask, unsigned long flags)
1172 {
1173     struct mm_struct *mm = current->mm;
1174     struct mempolicy *new;
1175     unsigned long end;
1176     int err;
1177     LIST_HEAD(pagelist);
1178 
1179     if (flags & ~(unsigned long)MPOL_MF_VALID)
1180         return -EINVAL;
1181     if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1182         return -EPERM;
1183 
1184     if (start & ~PAGE_MASK)
1185         return -EINVAL;
1186 
1187     if (mode == MPOL_DEFAULT)
1188         flags &= ~MPOL_MF_STRICT;
1189 
1190     len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1191     end = start + len;
1192 
1193     if (end < start)
1194         return -EINVAL;
1195     if (end == start)
1196         return 0;
1197 
1198     new = mpol_new(mode, mode_flags, nmask);
1199     if (IS_ERR(new))
1200         return PTR_ERR(new);
1201 
1202     if (flags & MPOL_MF_LAZY)
1203         new->flags |= MPOL_F_MOF;
1204 
1205     /*
1206      * If we are using the default policy then operation
1207      * on discontinuous address spaces is okay after all
1208      */
1209     if (!new)
1210         flags |= MPOL_MF_DISCONTIG_OK;
1211 
1212     pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1213          start, start + len, mode, mode_flags,
1214          nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
1215 
1216     if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1217 
1218         err = migrate_prep();
1219         if (err)
1220             goto mpol_out;
1221     }
1222     {
1223         NODEMASK_SCRATCH(scratch);
1224         if (scratch) {
1225             down_write(&mm->mmap_sem);
1226             task_lock(current);
1227             err = mpol_set_nodemask(new, nmask, scratch);
1228             task_unlock(current);
1229             if (err)
1230                 up_write(&mm->mmap_sem);
1231         } else
1232             err = -ENOMEM;
1233         NODEMASK_SCRATCH_FREE(scratch);
1234     }
1235     if (err)
1236         goto mpol_out;
1237 
1238     err = queue_pages_range(mm, start, end, nmask,
1239               flags | MPOL_MF_INVERT, &pagelist);
1240     if (!err)
1241         err = mbind_range(mm, start, end, new);
1242 
1243     if (!err) {
1244         int nr_failed = 0;
1245 
1246         if (!list_empty(&pagelist)) {
1247             WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1248             nr_failed = migrate_pages(&pagelist, new_page, NULL,
1249                 start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
1250             if (nr_failed)
1251                 putback_movable_pages(&pagelist);
1252         }
1253 
1254         if (nr_failed && (flags & MPOL_MF_STRICT))
1255             err = -EIO;
1256     } else
1257         putback_movable_pages(&pagelist);
1258 
1259     up_write(&mm->mmap_sem);
1260  mpol_out:
1261     mpol_put(new);
1262     return err;
1263 }
1264 
1265 /*
1266  * User space interface with variable sized bitmaps for nodelists.
1267  */
1268 
1269 /* Copy a node mask from user space. */
1270 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1271              unsigned long maxnode)
1272 {
1273     unsigned long k;
1274     unsigned long nlongs;
1275     unsigned long endmask;
1276 
1277     --maxnode;
1278     nodes_clear(*nodes);
1279     if (maxnode == 0 || !nmask)
1280         return 0;
1281     if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1282         return -EINVAL;
1283 
1284     nlongs = BITS_TO_LONGS(maxnode);
1285     if ((maxnode % BITS_PER_LONG) == 0)
1286         endmask = ~0UL;
1287     else
1288         endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1289 
1290     /* When the user specifies more nodes than supported, just check
1291        that the unsupported part is all zero. */
1292     if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1293         if (nlongs > PAGE_SIZE/sizeof(long))
1294             return -EINVAL;
1295         for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1296             unsigned long t;
1297             if (get_user(t, nmask + k))
1298                 return -EFAULT;
1299             if (k == nlongs - 1) {
1300                 if (t & endmask)
1301                     return -EINVAL;
1302             } else if (t)
1303                 return -EINVAL;
1304         }
1305         nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1306         endmask = ~0UL;
1307     }
1308 
1309     if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1310         return -EFAULT;
1311     nodes_addr(*nodes)[nlongs-1] &= endmask;
1312     return 0;
1313 }
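/*
 * For example (illustrative): on a kernel built with MAX_NUMNODES == 1024,
 * a caller may pass maxnode == 2048, but only if bits 1024..2046 of the
 * user-supplied mask are all clear; otherwise get_nodes() returns -EINVAL.
 */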
1314 
1315 /* Copy a kernel node mask to user space */
1316 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1317                   nodemask_t *nodes)
1318 {
1319     unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1320     const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
1321 
1322     if (copy > nbytes) {
1323         if (copy > PAGE_SIZE)
1324             return -EINVAL;
1325         if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1326             return -EFAULT;
1327         copy = nbytes;
1328     }
1329     return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1330 }
1331 
1332 SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1333         unsigned long, mode, const unsigned long __user *, nmask,
1334         unsigned long, maxnode, unsigned, flags)
1335 {
1336     nodemask_t nodes;
1337     int err;
1338     unsigned short mode_flags;
1339 
1340     mode_flags = mode & MPOL_MODE_FLAGS;
1341     mode &= ~MPOL_MODE_FLAGS;
1342     if (mode >= MPOL_MAX)
1343         return -EINVAL;
1344     if ((mode_flags & MPOL_F_STATIC_NODES) &&
1345         (mode_flags & MPOL_F_RELATIVE_NODES))
1346         return -EINVAL;
1347     err = get_nodes(&nodes, nmask, maxnode);
1348     if (err)
1349         return err;
1350     return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1351 }
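/*
 * A minimal userspace sketch of the mbind(2) interface above (illustrative
 * only; it assumes libnuma's <numaif.h> wrapper): bind a 4 MiB anonymous
 * mapping to node 0 and move any pages already allocated elsewhere:
 *
 *      #include <numaif.h>
 *      #include <stdio.h>
 *      #include <sys/mman.h>
 *
 *      int main(void)
 *      {
 *              size_t len = 4UL << 20;
 *              unsigned long nodemask = 1UL << 0;
 *              void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *                             MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *
 *              if (p == MAP_FAILED)
 *                      return 1;
 *              if (mbind(p, len, MPOL_BIND, &nodemask, 8 * sizeof(nodemask),
 *                        MPOL_MF_MOVE | MPOL_MF_STRICT))
 *                      perror("mbind");
 *              return 0;
 *      }
 */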
1352 
1353 /* Set the process memory policy */
1354 SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
1355         unsigned long, maxnode)
1356 {
1357     int err;
1358     nodemask_t nodes;
1359     unsigned short flags;
1360 
1361     flags = mode & MPOL_MODE_FLAGS;
1362     mode &= ~MPOL_MODE_FLAGS;
1363     if ((unsigned int)mode >= MPOL_MAX)
1364         return -EINVAL;
1365     if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1366         return -EINVAL;
1367     err = get_nodes(&nodes, nmask, maxnode);
1368     if (err)
1369         return err;
1370     return do_set_mempolicy(mode, flags, &nodes);
1371 }
1372 
1373 SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1374         const unsigned long __user *, old_nodes,
1375         const unsigned long __user *, new_nodes)
1376 {
1377     const struct cred *cred = current_cred(), *tcred;
1378     struct mm_struct *mm = NULL;
1379     struct task_struct *task;
1380     nodemask_t task_nodes;
1381     int err;
1382     nodemask_t *old;
1383     nodemask_t *new;
1384     NODEMASK_SCRATCH(scratch);
1385 
1386     if (!scratch)
1387         return -ENOMEM;
1388 
1389     old = &scratch->mask1;
1390     new = &scratch->mask2;
1391 
1392     err = get_nodes(old, old_nodes, maxnode);
1393     if (err)
1394         goto out;
1395 
1396     err = get_nodes(new, new_nodes, maxnode);
1397     if (err)
1398         goto out;
1399 
1400     /* Find the mm_struct */
1401     rcu_read_lock();
1402     task = pid ? find_task_by_vpid(pid) : current;
1403     if (!task) {
1404         rcu_read_unlock();
1405         err = -ESRCH;
1406         goto out;
1407     }
1408     get_task_struct(task);
1409 
1410     err = -EINVAL;
1411 
1412     /*
1413      * Check if this process has the right to modify the specified
1414      * process. The right exists if the process has administrative
1415      * capabilities, superuser privileges or the same
1416      * userid as the target process.
1417      */
1418     tcred = __task_cred(task);
1419     if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
1420         !uid_eq(cred->uid,  tcred->suid) && !uid_eq(cred->uid,  tcred->uid) &&
1421         !capable(CAP_SYS_NICE)) {
1422         rcu_read_unlock();
1423         err = -EPERM;
1424         goto out_put;
1425     }
1426     rcu_read_unlock();
1427 
1428     task_nodes = cpuset_mems_allowed(task);
1429     /* Is the user allowed to access the target nodes? */
1430     if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1431         err = -EPERM;
1432         goto out_put;
1433     }
1434 
1435     if (!nodes_subset(*new, node_states[N_MEMORY])) {
1436         err = -EINVAL;
1437         goto out_put;
1438     }
1439 
1440     err = security_task_movememory(task);
1441     if (err)
1442         goto out_put;
1443 
1444     mm = get_task_mm(task);
1445     put_task_struct(task);
1446 
1447     if (!mm) {
1448         err = -EINVAL;
1449         goto out;
1450     }
1451 
1452     err = do_migrate_pages(mm, old, new,
1453         capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1454 
1455     mmput(mm);
1456 out:
1457     NODEMASK_SCRATCH_FREE(scratch);
1458 
1459     return err;
1460 
1461 out_put:
1462     put_task_struct(task);
1463     goto out;
1464 
1465 }
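/*
 * A minimal userspace sketch of the migrate_pages(2) interface above
 * (illustrative only; it assumes libnuma's <numaif.h> wrapper): move the
 * pages of process 'pid' from node 0 to node 1. The call returns a negative
 * errno on failure, otherwise the number of pages that could not be moved.
 *
 *      #include <numaif.h>
 *      #include <sys/types.h>
 *
 *      long move_to_node1(pid_t pid)
 *      {
 *              unsigned long old_nodes = 1UL << 0;
 *              unsigned long new_nodes = 1UL << 1;
 *
 *              return migrate_pages(pid, 8 * sizeof(unsigned long),
 *                                   &old_nodes, &new_nodes);
 *      }
 */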
1466 
1467 
1468 /* Retrieve NUMA policy */
1469 SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1470         unsigned long __user *, nmask, unsigned long, maxnode,
1471         unsigned long, addr, unsigned long, flags)
1472 {
1473     int err;
1474     int uninitialized_var(pval);
1475     nodemask_t nodes;
1476 
1477     if (nmask != NULL && maxnode < MAX_NUMNODES)
1478         return -EINVAL;
1479 
1480     err = do_get_mempolicy(&pval, &nodes, addr, flags);
1481 
1482     if (err)
1483         return err;
1484 
1485     if (policy && put_user(pval, policy))
1486         return -EFAULT;
1487 
1488     if (nmask)
1489         err = copy_nodes_to_user(nmask, maxnode, &nodes);
1490 
1491     return err;
1492 }
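/*
 * A minimal userspace sketch of the get_mempolicy(2) interface above
 * (illustrative only; it assumes libnuma's <numaif.h> wrapper): ask which
 * node currently backs a page the caller has already touched:
 *
 *      #include <numaif.h>
 *
 *      int node_of(void *addr)
 *      {
 *              int node = -1;
 *
 *              if (get_mempolicy(&node, NULL, 0, addr,
 *                                MPOL_F_NODE | MPOL_F_ADDR))
 *                      return -1;
 *              return node;
 *      }
 */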
1493 
1494 #ifdef CONFIG_COMPAT
1495 
1496 COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1497                compat_ulong_t __user *, nmask,
1498                compat_ulong_t, maxnode,
1499                compat_ulong_t, addr, compat_ulong_t, flags)
1500 {
1501     long err;
1502     unsigned long __user *nm = NULL;
1503     unsigned long nr_bits, alloc_size;
1504     DECLARE_BITMAP(bm, MAX_NUMNODES);
1505 
1506     nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1507     alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1508 
1509     if (nmask)
1510         nm = compat_alloc_user_space(alloc_size);
1511 
1512     err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1513 
1514     if (!err && nmask) {
1515         unsigned long copy_size;
1516         copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
1517         err = copy_from_user(bm, nm, copy_size);
1518         /* ensure entire bitmap is zeroed */
1519         err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1520         err |= compat_put_bitmap(nmask, bm, nr_bits);
1521     }
1522 
1523     return err;
1524 }
1525 
1526 COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
1527                compat_ulong_t, maxnode)
1528 {
1529     long err = 0;
1530     unsigned long __user *nm = NULL;
1531     unsigned long nr_bits, alloc_size;
1532     DECLARE_BITMAP(bm, MAX_NUMNODES);
1533 
1534     nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1535     alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1536 
1537     if (nmask) {
1538         err = compat_get_bitmap(bm, nmask, nr_bits);
1539         nm = compat_alloc_user_space(alloc_size);
1540         err |= copy_to_user(nm, bm, alloc_size);
1541     }
1542 
1543     if (err)
1544         return -EFAULT;
1545 
1546     return sys_set_mempolicy(mode, nm, nr_bits+1);
1547 }
1548 
1549 COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
1550                compat_ulong_t, mode, compat_ulong_t __user *, nmask,
1551                compat_ulong_t, maxnode, compat_ulong_t, flags)
1552 {
1553     long err = 0;
1554     unsigned long __user *nm = NULL;
1555     unsigned long nr_bits, alloc_size;
1556     nodemask_t bm;
1557 
1558     nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1559     alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1560 
1561     if (nmask) {
1562         err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1563         nm = compat_alloc_user_space(alloc_size);
1564         err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1565     }
1566 
1567     if (err)
1568         return -EFAULT;
1569 
1570     return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1571 }
1572 
1573 #endif
1574 
1575 struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
1576                         unsigned long addr)
1577 {
1578     struct mempolicy *pol = NULL;
1579 
1580     if (vma) {
1581         if (vma->vm_ops && vma->vm_ops->get_policy) {
1582             pol = vma->vm_ops->get_policy(vma, addr);
1583         } else if (vma->vm_policy) {
1584             pol = vma->vm_policy;
1585 
1586             /*
1587              * shmem_alloc_page() passes MPOL_F_SHARED policy with
1588              * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1589              * count on these policies which will be dropped by
1590              * mpol_cond_put() later
1591              */
1592             if (mpol_needs_cond_ref(pol))
1593                 mpol_get(pol);
1594         }
1595     }
1596 
1597     return pol;
1598 }
1599 
1600 /*
1601  * get_vma_policy(@vma, @addr)
1602  * @vma: virtual memory area whose policy is sought
1603  * @addr: address in @vma for shared policy lookup
1604  *
1605  * Returns effective policy for a VMA at specified address.
1606  * Falls back to current->mempolicy or system default policy, as necessary.
1607  * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1608  * count--added by the get_policy() vm_op, as appropriate--to protect against
1609  * freeing by another task.  It is the caller's responsibility to free the
1610  * extra reference for shared policies.
1611  */
1612 static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
1613                         unsigned long addr)
1614 {
1615     struct mempolicy *pol = __get_vma_policy(vma, addr);
1616 
1617     if (!pol)
1618         pol = get_task_policy(current);
1619 
1620     return pol;
1621 }
1622 
1623 bool vma_policy_mof(struct vm_area_struct *vma)
1624 {
1625     struct mempolicy *pol;
1626 
1627     if (vma->vm_ops && vma->vm_ops->get_policy) {
1628         bool ret = false;
1629 
1630         pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1631         if (pol && (pol->flags & MPOL_F_MOF))
1632             ret = true;
1633         mpol_cond_put(pol);
1634 
1635         return ret;
1636     }
1637 
1638     pol = vma->vm_policy;
1639     if (!pol)
1640         pol = get_task_policy(current);
1641 
1642     return pol->flags & MPOL_F_MOF;
1643 }
1644 
1645 static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1646 {
1647     enum zone_type dynamic_policy_zone = policy_zone;
1648 
1649     BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1650 
1651     /*
0652      * if policy->v.nodes has movable memory only,
0653      * we apply policy only when gfp_zone(gfp) == ZONE_MOVABLE.
0654      *
0655      * policy->v.nodes is intersected with node_states[N_MEMORY],
0656      * so if the following test fails, it implies that
0657      * policy->v.nodes has movable memory only.
1658      */
1659     if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
1660         dynamic_policy_zone = ZONE_MOVABLE;
1661 
1662     return zone >= dynamic_policy_zone;
1663 }
1664 
1665 /*
1666  * Return a nodemask representing a mempolicy for filtering nodes for
1667  * page allocation
1668  */
1669 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1670 {
1671     /* Lower zones don't get a nodemask applied for MPOL_BIND */
1672     if (unlikely(policy->mode == MPOL_BIND) &&
1673             apply_policy_zone(policy, gfp_zone(gfp)) &&
1674             cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1675         return &policy->v.nodes;
1676 
1677     return NULL;
1678 }
1679 
1680 /* Return a zonelist indicated by gfp for node representing a mempolicy */
1681 static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
1682     int nd)
1683 {
1684     if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL))
1685         nd = policy->v.preferred_node;
1686     else {
1687         /*
1688          * __GFP_THISNODE shouldn't even be used with the bind policy
1689          * because we might easily break the expectation to stay on the
1690          * requested nodes and thereby violate the policy.
1691          */
1692         WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE));
1693     }
1694 
1695     return node_zonelist(nd, gfp);
1696 }
1697 
1698 /* Do dynamic interleaving for a process */
1699 static unsigned interleave_nodes(struct mempolicy *policy)
1700 {
1701     unsigned nid, next;
1702     struct task_struct *me = current;
1703 
1704     nid = me->il_next;
1705     next = next_node_in(nid, policy->v.nodes);
1706     if (next < MAX_NUMNODES)
1707         me->il_next = next;
1708     return nid;
1709 }
1710 
1711 /*
1712  * Depending on the memory policy provide a node from which to allocate the
1713  * next slab entry.
1714  */
1715 unsigned int mempolicy_slab_node(void)
1716 {
1717     struct mempolicy *policy;
1718     int node = numa_mem_id();
1719 
1720     if (in_interrupt())
1721         return node;
1722 
1723     policy = current->mempolicy;
1724     if (!policy || policy->flags & MPOL_F_LOCAL)
1725         return node;
1726 
1727     switch (policy->mode) {
1728     case MPOL_PREFERRED:
1729         /*
1730          * handled MPOL_F_LOCAL above
1731          */
1732         return policy->v.preferred_node;
1733 
1734     case MPOL_INTERLEAVE:
1735         return interleave_nodes(policy);
1736 
1737     case MPOL_BIND: {
1738         struct zoneref *z;
1739 
1740         /*
1741          * Follow bind policy behavior and start allocation at the
1742          * first node.
1743          */
1744         struct zonelist *zonelist;
1745         enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1746         zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
1747         z = first_zones_zonelist(zonelist, highest_zoneidx,
1748                             &policy->v.nodes);
1749         return z->zone ? z->zone->node : node;
1750     }
1751 
1752     default:
1753         BUG();
1754     }
1755 }
1756 
1757 /*
1758  * Do static interleaving for a VMA with known offset @n.  Returns the n'th
1759  * node in pol->v.nodes (starting from n=0), wrapping around if n exceeds the
1760  * number of present nodes.
1761  */
1762 static unsigned offset_il_node(struct mempolicy *pol,
1763                    struct vm_area_struct *vma, unsigned long n)
1764 {
1765     unsigned nnodes = nodes_weight(pol->v.nodes);
1766     unsigned target;
1767     int i;
1768     int nid;
1769 
1770     if (!nnodes)
1771         return numa_node_id();
1772     target = (unsigned int)n % nnodes;
1773     nid = first_node(pol->v.nodes);
1774     for (i = 0; i < target; i++)
1775         nid = next_node(nid, pol->v.nodes);
1776     return nid;
1777 }
1778 
1779 /* Determine a node number for interleave */
1780 static inline unsigned interleave_nid(struct mempolicy *pol,
1781          struct vm_area_struct *vma, unsigned long addr, int shift)
1782 {
1783     if (vma) {
1784         unsigned long off;
1785 
1786         /*
1787          * for small pages, there is no difference between
1788          * shift and PAGE_SHIFT, so the bit-shift is safe.
1789          * for huge pages, since vm_pgoff is in units of small
1790          * pages, we need to shift off the always 0 bits to get
1791          * a useful offset.
1792          */
1793         BUG_ON(shift < PAGE_SHIFT);
1794         off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1795         off += (addr - vma->vm_start) >> shift;
1796         return offset_il_node(pol, vma, off);
1797     } else
1798         return interleave_nodes(pol);
1799 }
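/*
 * Worked example (illustrative, arbitrary numbers): with an interleave
 * nodemask of {0,2,3} (nnodes = 3), a small-page VMA with vm_pgoff = 5
 * faulting 3 pages past vm_start gives off = 5 + 3 = 8, so
 * target = 8 % 3 = 2.  offset_il_node() then starts at node 0 and
 * advances twice, 0 -> 2 -> 3, so the page is interleaved onto node 3.
 */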
1800 
1801 #ifdef CONFIG_HUGETLBFS
1802 /*
1803  * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1804  * @vma: virtual memory area whose policy is sought
1805  * @addr: address in @vma for shared policy lookup and interleave policy
1806  * @gfp_flags: for requested zone
1807  * @mpol: pointer to mempolicy pointer for reference counted mempolicy
1808  * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
1809  *
1810  * Returns a zonelist suitable for a huge page allocation and a pointer
1811  * to the struct mempolicy for conditional unref after allocation.
1812  * If the effective policy is 'BIND', returns a pointer to the mempolicy's
1813  * @nodemask for filtering the zonelist.
1814  *
1815  * Must be protected by read_mems_allowed_begin()
1816  */
1817 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1818                 gfp_t gfp_flags, struct mempolicy **mpol,
1819                 nodemask_t **nodemask)
1820 {
1821     struct zonelist *zl;
1822 
1823     *mpol = get_vma_policy(vma, addr);
1824     *nodemask = NULL;   /* assume !MPOL_BIND */
1825 
1826     if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1827         zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1828                 huge_page_shift(hstate_vma(vma))), gfp_flags);
1829     } else {
1830         zl = policy_zonelist(gfp_flags, *mpol, numa_node_id());
1831         if ((*mpol)->mode == MPOL_BIND)
1832             *nodemask = &(*mpol)->v.nodes;
1833     }
1834     return zl;
1835 }
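/*
 * Caller sketch (illustrative only, loosely modelled on the hugetlb fault
 * path): the policy reference returned through @mpol must be dropped with
 * mpol_cond_put(), and the lookup repeated if the cpuset's mems_allowed
 * changed underneath us.
 */
static void example_walk_huge_zonelist(struct vm_area_struct *vma,
				       unsigned long addr, gfp_t gfp)
{
	struct mempolicy *mpol;
	nodemask_t *nodemask;
	struct zonelist *zl;
	unsigned int cookie;

	do {
		cookie = read_mems_allowed_begin();
		zl = huge_zonelist(vma, addr, gfp, &mpol, &nodemask);
		/* ... walk @zl, restricted by @nodemask when non-NULL ... */
		pr_debug("zonelist %p nodemask %p\n", zl, nodemask);
		mpol_cond_put(mpol);	/* drop conditional policy ref */
	} while (read_mems_allowed_retry(cookie));
}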
1836 
1837 /*
1838  * init_nodemask_of_mempolicy
1839  *
1840  * If the current task's mempolicy is "default" [NULL], return 'false'
1841  * to indicate default policy.  Otherwise, extract the policy nodemask
1842  * for 'bind' or 'interleave' policy into the argument nodemask, or
1843  * initialize the argument nodemask to contain the single node for
1844  * 'preferred' or 'local' policy and return 'true' to indicate presence
1845  * of non-default mempolicy.
1846  *
1847  * We don't bother with reference counting the mempolicy [mpol_get/put]
1848  * because the current task is examining its own mempolicy and a task's
1849  * mempolicy is only ever changed by the task itself.
1850  *
1851  * N.B., it is the caller's responsibility to free a returned nodemask.
1852  */
1853 bool init_nodemask_of_mempolicy(nodemask_t *mask)
1854 {
1855     struct mempolicy *mempolicy;
1856     int nid;
1857 
1858     if (!(mask && current->mempolicy))
1859         return false;
1860 
1861     task_lock(current);
1862     mempolicy = current->mempolicy;
1863     switch (mempolicy->mode) {
1864     case MPOL_PREFERRED:
1865         if (mempolicy->flags & MPOL_F_LOCAL)
1866             nid = numa_node_id();
1867         else
1868             nid = mempolicy->v.preferred_node;
1869         init_nodemask_of_node(mask, nid);
1870         break;
1871 
1872     case MPOL_BIND:
1873         /* Fall through */
1874     case MPOL_INTERLEAVE:
1875         *mask = mempolicy->v.nodes;
1876         break;
1877 
1878     default:
1879         BUG();
1880     }
1881     task_unlock(current);
1882 
1883     return true;
1884 }
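/*
 * Caller sketch (illustrative only, loosely modelled on the hugetlb sysctl
 * handlers): obtain the current task's policy nodes, falling back to all
 * memory nodes when the task uses the default policy.
 */
static void example_for_each_policy_node(void)
{
	int nid;
	NODEMASK_ALLOC(nodemask_t, nodes, GFP_KERNEL);

	if (!nodes)
		return;
	if (!init_nodemask_of_mempolicy(nodes))
		*nodes = node_states[N_MEMORY];	/* default policy */
	for_each_node_mask(nid, *nodes)
		pr_debug("policy node %d\n", nid);
	NODEMASK_FREE(nodes);
}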
1885 #endif
1886 
1887 /*
1888  * mempolicy_nodemask_intersects
1889  *
1890  * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
1891  * policy.  Otherwise, check for intersection between mask and the policy
1892  * nodemask for 'bind' or 'interleave' policy.  For 'preferred' or 'local'
1893  * policy, always return true since it may allocate elsewhere on fallback.
1894  *
1895  * Takes task_lock(tsk) to prevent freeing of its mempolicy.
1896  */
1897 bool mempolicy_nodemask_intersects(struct task_struct *tsk,
1898                     const nodemask_t *mask)
1899 {
1900     struct mempolicy *mempolicy;
1901     bool ret = true;
1902 
1903     if (!mask)
1904         return ret;
1905     task_lock(tsk);
1906     mempolicy = tsk->mempolicy;
1907     if (!mempolicy)
1908         goto out;
1909 
1910     switch (mempolicy->mode) {
1911     case MPOL_PREFERRED:
1912         /*
1913          * MPOL_PREFERRED and MPOL_F_LOCAL only express preferred nodes
1914          * to allocate from; the task may fall back to other nodes when
1915          * OOM.  Thus, it's possible for tsk to have allocated memory
1916          * from nodes in mask.
1917          */
1918         break;
1919     case MPOL_BIND:
1920     case MPOL_INTERLEAVE:
1921         ret = nodes_intersects(mempolicy->v.nodes, *mask);
1922         break;
1923     default:
1924         BUG();
1925     }
1926 out:
1927     task_unlock(tsk);
1928     return ret;
1929 }
1930 
1931 /* Allocate a page in interleaved policy.
1932    Own path because it needs to do special accounting. */
1933 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1934                     unsigned nid)
1935 {
1936     struct zonelist *zl;
1937     struct page *page;
1938 
1939     zl = node_zonelist(nid, gfp);
1940     page = __alloc_pages(gfp, order, zl);
1941     if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
1942         inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1943     return page;
1944 }
1945 
1946 /**
1947  *  alloc_pages_vma - Allocate a page for a VMA.
1948  *
1949  *  @gfp:
1950  *      %GFP_USER    user allocation.
1951  *      %GFP_KERNEL  kernel allocations.
1952  *      %GFP_HIGHMEM highmem/user allocations.
1953  *      %GFP_FS      allocation should not call back into a file system.
1954  *      %GFP_ATOMIC  don't sleep.
1955  *
1956  *  @order: Order of the GFP allocation.
1957  *  @vma:  Pointer to VMA or NULL if not available.
1958  *  @addr: Virtual Address of the allocation. Must be inside the VMA.
1959  *  @node: Which node to prefer for allocation (modulo policy).
1960  *  @hugepage: for hugepages try only the preferred node if possible
1961  *
1962  *  This function allocates a page from the kernel page pool and applies
1963  *  a NUMA policy associated with the VMA or the current process.
1964  *  When VMA is not NULL caller must hold down_read on the mmap_sem of the
1965  *  mm_struct of the VMA to prevent it from going away. Should be used for
1966  *  all allocations for pages that will be mapped into user space. Returns
1967  *  NULL when no page can be allocated.
1968  */
1969 struct page *
1970 alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
1971         unsigned long addr, int node, bool hugepage)
1972 {
1973     struct mempolicy *pol;
1974     struct page *page;
1975     unsigned int cpuset_mems_cookie;
1976     struct zonelist *zl;
1977     nodemask_t *nmask;
1978 
1979 retry_cpuset:
1980     pol = get_vma_policy(vma, addr);
1981     cpuset_mems_cookie = read_mems_allowed_begin();
1982 
1983     if (pol->mode == MPOL_INTERLEAVE) {
1984         unsigned nid;
1985 
1986         nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
1987         mpol_cond_put(pol);
1988         page = alloc_page_interleave(gfp, order, nid);
1989         goto out;
1990     }
1991 
1992     if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
1993         int hpage_node = node;
1994 
1995         /*
1996          * For hugepage allocation and non-interleave policy which
1997          * allows the current node (or other explicitly preferred
1998          * node) we only try to allocate from the current/preferred
1999          * node and don't fall back to other nodes, as the cost of
2000          * remote accesses would likely offset THP benefits.
2001          *
2002          * If the policy is interleave, or does not allow the current
2003          * node in its nodemask, we allocate the standard way.
2004          */
2005         if (pol->mode == MPOL_PREFERRED &&
2006                         !(pol->flags & MPOL_F_LOCAL))
2007             hpage_node = pol->v.preferred_node;
2008 
2009         nmask = policy_nodemask(gfp, pol);
2010         if (!nmask || node_isset(hpage_node, *nmask)) {
2011             mpol_cond_put(pol);
2012             page = __alloc_pages_node(hpage_node,
2013                         gfp | __GFP_THISNODE, order);
2014             goto out;
2015         }
2016     }
2017 
2018     nmask = policy_nodemask(gfp, pol);
2019     zl = policy_zonelist(gfp, pol, node);
2020     page = __alloc_pages_nodemask(gfp, order, zl, nmask);
2021     mpol_cond_put(pol);
2022 out:
2023     if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2024         goto retry_cpuset;
2025     return page;
2026 }
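/*
 * Usage sketch (illustrative only): a single page allocated for an
 * anonymous fault, honouring the VMA's policy.  This mirrors what the
 * alloc_page_vma() helper in <linux/gfp.h> expands to; mmap_sem must be
 * held for read as described above.
 */
static struct page *example_alloc_anon_page(struct vm_area_struct *vma,
					    unsigned long addr)
{
	return alloc_pages_vma(GFP_HIGHUSER_MOVABLE, 0, vma, addr,
			       numa_node_id(), false);
}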
2027 
2028 /**
2029  *  alloc_pages_current - Allocate pages.
2030  *
2031  *  @gfp:
2032  *      %GFP_USER    user allocation.
2033  *      %GFP_KERNEL  kernel allocation.
2034  *      %GFP_HIGHMEM highmem allocation.
2035  *      %GFP_FS      don't call back into a file system.
2036  *      %GFP_ATOMIC  don't sleep.
2037  *  @order: Power of two of allocation size in pages. 0 is a single page.
2038  *
2039  *  Allocate a page from the kernel page pool.  When not in
2040  *  interrupt context, apply the current process' NUMA policy.
2041  *  Returns NULL when no page can be allocated.
2042  *
2043  *  Don't call cpuset_update_task_memory_state() unless
2044  *  1) it's ok to take cpuset_sem (can WAIT), and
2045  *  2) allocating for current task (not interrupt).
2046  */
2047 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
2048 {
2049     struct mempolicy *pol = &default_policy;
2050     struct page *page;
2051     unsigned int cpuset_mems_cookie;
2052 
2053     if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2054         pol = get_task_policy(current);
2055 
2056 retry_cpuset:
2057     cpuset_mems_cookie = read_mems_allowed_begin();
2058 
2059     /*
2060      * No reference counting needed for current->mempolicy
2061      * nor system default_policy
2062      */
2063     if (pol->mode == MPOL_INTERLEAVE)
2064         page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2065     else
2066         page = __alloc_pages_nodemask(gfp, order,
2067                 policy_zonelist(gfp, pol, numa_node_id()),
2068                 policy_nodemask(gfp, pol));
2069 
2070     if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2071         goto retry_cpuset;
2072 
2073     return page;
2074 }
2075 EXPORT_SYMBOL(alloc_pages_current);
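/*
 * Usage sketch (illustrative only): on CONFIG_NUMA kernels the generic
 * alloc_pages() helper in <linux/gfp.h> resolves to alloc_pages_current(),
 * so an ordinary allocation like the one below already honours the calling
 * task's mempolicy.
 */
static struct page *example_alloc_kernel_pages(void)
{
	return alloc_pages(GFP_KERNEL, 2);	/* order-2: four contiguous pages */
}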
2076 
2077 int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2078 {
2079     struct mempolicy *pol = mpol_dup(vma_policy(src));
2080 
2081     if (IS_ERR(pol))
2082         return PTR_ERR(pol);
2083     dst->vm_policy = pol;
2084     return 0;
2085 }
2086 
2087 /*
2088  * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
2089  * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
2090  * with the mems_allowed returned by cpuset_mems_allowed().  This
2091  * keeps mempolicies cpuset relative after its cpuset moves.  See
2092  * further kernel/cpuset.c update_nodemask().
2093  *
2094  * current's mempolicy may be rebound by another task (the task that changes
2095  * the cpuset's mems), so we need not do the rebind work for the current task.
2096  */
2097 
2098 /* Slow path of a mempolicy duplicate */
2099 struct mempolicy *__mpol_dup(struct mempolicy *old)
2100 {
2101     struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2102 
2103     if (!new)
2104         return ERR_PTR(-ENOMEM);
2105 
2106     /* task's mempolicy is protected by alloc_lock */
2107     if (old == current->mempolicy) {
2108         task_lock(current);
2109         *new = *old;
2110         task_unlock(current);
2111     } else
2112         *new = *old;
2113 
2114     if (current_cpuset_is_being_rebound()) {
2115         nodemask_t mems = cpuset_mems_allowed(current);
2116         if (new->flags & MPOL_F_REBINDING)
2117             mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2);
2118         else
2119             mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
2120     }
2121     atomic_set(&new->refcnt, 1);
2122     return new;
2123 }
2124 
2125 /* Slow path of a mempolicy comparison */
2126 bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2127 {
2128     if (!a || !b)
2129         return false;
2130     if (a->mode != b->mode)
2131         return false;
2132     if (a->flags != b->flags)
2133         return false;
2134     if (mpol_store_user_nodemask(a))
2135         if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2136             return false;
2137 
2138     switch (a->mode) {
2139     case MPOL_BIND:
2140         /* Fall through */
2141     case MPOL_INTERLEAVE:
2142         return !!nodes_equal(a->v.nodes, b->v.nodes);
2143     case MPOL_PREFERRED:
2144         return a->v.preferred_node == b->v.preferred_node;
2145     default:
2146         BUG();
2147         return false;
2148     }
2149 }
2150 
2151 /*
2152  * Shared memory backing store policy support.
2153  *
2154  * Remember policies even when nobody has shared memory mapped.
2155  * The policies are kept in Red-Black tree linked from the inode.
2156  * They are protected by the sp->lock rwlock, which should be held
2157  * for any accesses to the tree.
2158  */
2159 
2160 /*
2161  * lookup first element intersecting start-end.  Caller holds sp->lock for
2162  * reading or for writing
2163  */
2164 static struct sp_node *
2165 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2166 {
2167     struct rb_node *n = sp->root.rb_node;
2168 
2169     while (n) {
2170         struct sp_node *p = rb_entry(n, struct sp_node, nd);
2171 
2172         if (start >= p->end)
2173             n = n->rb_right;
2174         else if (end <= p->start)
2175             n = n->rb_left;
2176         else
2177             break;
2178     }
2179     if (!n)
2180         return NULL;
2181     for (;;) {
2182         struct sp_node *w = NULL;
2183         struct rb_node *prev = rb_prev(n);
2184         if (!prev)
2185             break;
2186         w = rb_entry(prev, struct sp_node, nd);
2187         if (w->end <= start)
2188             break;
2189         n = prev;
2190     }
2191     return rb_entry(n, struct sp_node, nd);
2192 }
2193 
2194 /*
2195  * Insert a new shared policy into the list.  Caller holds sp->lock for
2196  * writing.
2197  */
2198 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2199 {
2200     struct rb_node **p = &sp->root.rb_node;
2201     struct rb_node *parent = NULL;
2202     struct sp_node *nd;
2203 
2204     while (*p) {
2205         parent = *p;
2206         nd = rb_entry(parent, struct sp_node, nd);
2207         if (new->start < nd->start)
2208             p = &(*p)->rb_left;
2209         else if (new->end > nd->end)
2210             p = &(*p)->rb_right;
2211         else
2212             BUG();
2213     }
2214     rb_link_node(&new->nd, parent, p);
2215     rb_insert_color(&new->nd, &sp->root);
2216     pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2217          new->policy ? new->policy->mode : 0);
2218 }
2219 
2220 /* Find shared policy intersecting idx */
2221 struct mempolicy *
2222 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2223 {
2224     struct mempolicy *pol = NULL;
2225     struct sp_node *sn;
2226 
2227     if (!sp->root.rb_node)
2228         return NULL;
2229     read_lock(&sp->lock);
2230     sn = sp_lookup(sp, idx, idx+1);
2231     if (sn) {
2232         mpol_get(sn->policy);
2233         pol = sn->policy;
2234     }
2235     read_unlock(&sp->lock);
2236     return pol;
2237 }
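/*
 * Usage sketch (illustrative only, loosely modelled on shmem's get_policy
 * vm_op): the lookup above returns the policy with an extra reference,
 * which the consumer drops with mpol_cond_put() since shared policies
 * carry MPOL_F_SHARED.
 */
static bool example_index_is_interleaved(struct shared_policy *sp,
					 pgoff_t index)
{
	struct mempolicy *pol = mpol_shared_policy_lookup(sp, index);
	bool ret = pol && pol->mode == MPOL_INTERLEAVE;

	mpol_cond_put(pol);	/* NULL-safe; drops the shared reference */
	return ret;
}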
2238 
2239 static void sp_free(struct sp_node *n)
2240 {
2241     mpol_put(n->policy);
2242     kmem_cache_free(sn_cache, n);
2243 }
2244 
2245 /**
2246  * mpol_misplaced - check whether current page node is valid in policy
2247  *
2248  * @page: page to be checked
2249  * @vma: vm area where page mapped
2250  * @addr: virtual address where page mapped
2251  *
2252  * Look up the current policy node id for vma,addr and compare it to the
2253  * page's node id.
2254  *
2255  * Returns:
2256  *  -1  - not misplaced, page is in the right node
2257  *  node    - node id where the page should be
2258  *
2259  * Policy determination "mimics" alloc_page_vma().
2260  * Called from fault path where we know the vma and faulting address.
2261  */
2262 int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2263 {
2264     struct mempolicy *pol;
2265     struct zoneref *z;
2266     int curnid = page_to_nid(page);
2267     unsigned long pgoff;
2268     int thiscpu = raw_smp_processor_id();
2269     int thisnid = cpu_to_node(thiscpu);
2270     int polnid = -1;
2271     int ret = -1;
2272 
2273     BUG_ON(!vma);
2274 
2275     pol = get_vma_policy(vma, addr);
2276     if (!(pol->flags & MPOL_F_MOF))
2277         goto out;
2278 
2279     switch (pol->mode) {
2280     case MPOL_INTERLEAVE:
2281         BUG_ON(addr >= vma->vm_end);
2282         BUG_ON(addr < vma->vm_start);
2283 
2284         pgoff = vma->vm_pgoff;
2285         pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2286         polnid = offset_il_node(pol, vma, pgoff);
2287         break;
2288 
2289     case MPOL_PREFERRED:
2290         if (pol->flags & MPOL_F_LOCAL)
2291             polnid = numa_node_id();
2292         else
2293             polnid = pol->v.preferred_node;
2294         break;
2295 
2296     case MPOL_BIND:
2297 
2298         /*
2299          * MPOL_BIND allows binding to multiple nodes.
2300          * Use the current page's node if it is in the policy nodemask;
2301          * else select the nearest allowed node, if any.
2302          * If there are no allowed nodes, use the current node [!misplaced].
2303          */
2304         if (node_isset(curnid, pol->v.nodes))
2305             goto out;
2306         z = first_zones_zonelist(
2307                 node_zonelist(numa_node_id(), GFP_HIGHUSER),
2308                 gfp_zone(GFP_HIGHUSER),
2309                 &pol->v.nodes);
2310         polnid = z->zone->node;
2311         break;
2312 
2313     default:
2314         BUG();
2315     }
2316 
2317     /* Migrate the page towards the node whose CPU is referencing it */
2318     if (pol->flags & MPOL_F_MORON) {
2319         polnid = thisnid;
2320 
2321         if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
2322             goto out;
2323     }
2324 
2325     if (curnid != polnid)
2326         ret = polnid;
2327 out:
2328     mpol_cond_put(pol);
2329 
2330     return ret;
2331 }
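/*
 * Caller sketch (illustrative only, loosely modelled on the NUMA hinting
 * fault path): any return value other than -1 names the node the page
 * should be migrated to.
 */
static bool example_page_needs_migration(struct page *page,
					 struct vm_area_struct *vma,
					 unsigned long addr, int *target_nid)
{
	*target_nid = mpol_misplaced(page, vma, addr);
	return *target_nid != -1;
}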
2332 
2333 /*
2334  * Drop the (possibly final) reference to task->mempolicy.  It needs to be
2335  * dropped after task->mempolicy is set to NULL so that any allocation done as
2336  * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
2337  * policy.
2338  */
2339 void mpol_put_task_policy(struct task_struct *task)
2340 {
2341     struct mempolicy *pol;
2342 
2343     task_lock(task);
2344     pol = task->mempolicy;
2345     task->mempolicy = NULL;
2346     task_unlock(task);
2347     mpol_put(pol);
2348 }
2349 
2350 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2351 {
2352     pr_debug("deleting %lx-%lx\n", n->start, n->end);
2353     rb_erase(&n->nd, &sp->root);
2354     sp_free(n);
2355 }
2356 
2357 static void sp_node_init(struct sp_node *node, unsigned long start,
2358             unsigned long end, struct mempolicy *pol)
2359 {
2360     node->start = start;
2361     node->end = end;
2362     node->policy = pol;
2363 }
2364 
2365 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2366                 struct mempolicy *pol)
2367 {
2368     struct sp_node *n;
2369     struct mempolicy *newpol;
2370 
2371     n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2372     if (!n)
2373         return NULL;
2374 
2375     newpol = mpol_dup(pol);
2376     if (IS_ERR(newpol)) {
2377         kmem_cache_free(sn_cache, n);
2378         return NULL;
2379     }
2380     newpol->flags |= MPOL_F_SHARED;
2381     sp_node_init(n, start, end, newpol);
2382 
2383     return n;
2384 }
2385 
2386 /* Replace a policy range. */
2387 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2388                  unsigned long end, struct sp_node *new)
2389 {
2390     struct sp_node *n;
2391     struct sp_node *n_new = NULL;
2392     struct mempolicy *mpol_new = NULL;
2393     int ret = 0;
2394 
2395 restart:
2396     write_lock(&sp->lock);
2397     n = sp_lookup(sp, start, end);
2398     /* Take care of old policies in the same range. */
2399     while (n && n->start < end) {
2400         struct rb_node *next = rb_next(&n->nd);
2401         if (n->start >= start) {
2402             if (n->end <= end)
2403                 sp_delete(sp, n);
2404             else
2405                 n->start = end;
2406         } else {
2407             /* Old policy spanning whole new range. */
2408             if (n->end > end) {
2409                 if (!n_new)
2410                     goto alloc_new;
2411 
2412                 *mpol_new = *n->policy;
2413                 atomic_set(&mpol_new->refcnt, 1);
2414                 sp_node_init(n_new, end, n->end, mpol_new);
2415                 n->end = start;
2416                 sp_insert(sp, n_new);
2417                 n_new = NULL;
2418                 mpol_new = NULL;
2419                 break;
2420             } else
2421                 n->end = start;
2422         }
2423         if (!next)
2424             break;
2425         n = rb_entry(next, struct sp_node, nd);
2426     }
2427     if (new)
2428         sp_insert(sp, new);
2429     write_unlock(&sp->lock);
2430     ret = 0;
2431 
2432 err_out:
2433     if (mpol_new)
2434         mpol_put(mpol_new);
2435     if (n_new)
2436         kmem_cache_free(sn_cache, n_new);
2437 
2438     return ret;
2439 
2440 alloc_new:
2441     write_unlock(&sp->lock);
2442     ret = -ENOMEM;
2443     n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2444     if (!n_new)
2445         goto err_out;
2446     mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2447     if (!mpol_new)
2448         goto err_out;
2449     goto restart;
2450 }
2451 
2452 /**
2453  * mpol_shared_policy_init - initialize shared policy for inode
2454  * @sp: pointer to inode shared policy
2455  * @mpol:  struct mempolicy to install
2456  *
2457  * Install non-NULL @mpol in inode's shared policy rb-tree.
2458  * On entry, the current task has a reference on a non-NULL @mpol.
2459  * This must be released on exit.
2460  * This is called during get_inode() calls, so we can use GFP_KERNEL.
2461  */
2462 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2463 {
2464     int ret;
2465 
2466     sp->root = RB_ROOT;     /* empty tree == default mempolicy */
2467     rwlock_init(&sp->lock);
2468 
2469     if (mpol) {
2470         struct vm_area_struct pvma;
2471         struct mempolicy *new;
2472         NODEMASK_SCRATCH(scratch);
2473 
2474         if (!scratch)
2475             goto put_mpol;
2476         /* contextualize the tmpfs mount point mempolicy */
2477         new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2478         if (IS_ERR(new))
2479             goto free_scratch; /* no valid nodemask intersection */
2480 
2481         task_lock(current);
2482         ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2483         task_unlock(current);
2484         if (ret)
2485             goto put_new;
2486 
2487         /* Create pseudo-vma that contains just the policy */
2488         memset(&pvma, 0, sizeof(struct vm_area_struct));
2489         pvma.vm_end = TASK_SIZE;    /* policy covers entire file */
2490         mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2491 
2492 put_new:
2493         mpol_put(new);          /* drop initial ref */
2494 free_scratch:
2495         NODEMASK_SCRATCH_FREE(scratch);
2496 put_mpol:
2497         mpol_put(mpol); /* drop our incoming ref on sb mpol */
2498     }
2499 }
2500 
2501 int mpol_set_shared_policy(struct shared_policy *info,
2502             struct vm_area_struct *vma, struct mempolicy *npol)
2503 {
2504     int err;
2505     struct sp_node *new = NULL;
2506     unsigned long sz = vma_pages(vma);
2507 
2508     pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2509          vma->vm_pgoff,
2510          sz, npol ? npol->mode : -1,
2511          npol ? npol->flags : -1,
2512          npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
2513 
2514     if (npol) {
2515         new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2516         if (!new)
2517             return -ENOMEM;
2518     }
2519     err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2520     if (err && new)
2521         sp_free(new);
2522     return err;
2523 }
2524 
2525 /* Free a backing policy store on inode delete. */
2526 void mpol_free_shared_policy(struct shared_policy *p)
2527 {
2528     struct sp_node *n;
2529     struct rb_node *next;
2530 
2531     if (!p->root.rb_node)
2532         return;
2533     write_lock(&p->lock);
2534     next = rb_first(&p->root);
2535     while (next) {
2536         n = rb_entry(next, struct sp_node, nd);
2537         next = rb_next(&n->nd);
2538         sp_delete(p, n);
2539     }
2540     write_unlock(&p->lock);
2541 }
2542 
2543 #ifdef CONFIG_NUMA_BALANCING
2544 static int __initdata numabalancing_override;
2545 
2546 static void __init check_numabalancing_enable(void)
2547 {
2548     bool numabalancing_default = false;
2549 
2550     if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2551         numabalancing_default = true;
2552 
2553     /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
2554     if (numabalancing_override)
2555         set_numabalancing_state(numabalancing_override == 1);
2556 
2557     if (num_online_nodes() > 1 && !numabalancing_override) {
2558         pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
2559             numabalancing_default ? "Enabling" : "Disabling");
2560         set_numabalancing_state(numabalancing_default);
2561     }
2562 }
2563 
2564 static int __init setup_numabalancing(char *str)
2565 {
2566     int ret = 0;
2567     if (!str)
2568         goto out;
2569 
2570     if (!strcmp(str, "enable")) {
2571         numabalancing_override = 1;
2572         ret = 1;
2573     } else if (!strcmp(str, "disable")) {
2574         numabalancing_override = -1;
2575         ret = 1;
2576     }
2577 out:
2578     if (!ret)
2579         pr_warn("Unable to parse numa_balancing=\n");
2580 
2581     return ret;
2582 }
2583 __setup("numa_balancing=", setup_numabalancing);
2584 #else
2585 static inline void __init check_numabalancing_enable(void)
2586 {
2587 }
2588 #endif /* CONFIG_NUMA_BALANCING */
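/*
 * Userspace sketch (illustrative only): besides the numa_balancing= boot
 * parameter parsed above, the state can be inspected (and toggled) at
 * runtime through the kernel.numa_balancing sysctl; this reads the current
 * value from procfs.
 */
#include <stdio.h>

static int example_numa_balancing_enabled(void)
{
	int val = -1;
	FILE *f = fopen("/proc/sys/kernel/numa_balancing", "r");

	if (!f)
		return -1;	/* kernel built without CONFIG_NUMA_BALANCING */
	if (fscanf(f, "%d", &val) != 1)
		val = -1;
	fclose(f);
	return val;
}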
2589 
2590 /* assumes fs == KERNEL_DS */
2591 void __init numa_policy_init(void)
2592 {
2593     nodemask_t interleave_nodes;
2594     unsigned long largest = 0;
2595     int nid, prefer = 0;
2596 
2597     policy_cache = kmem_cache_create("numa_policy",
2598                      sizeof(struct mempolicy),
2599                      0, SLAB_PANIC, NULL);
2600 
2601     sn_cache = kmem_cache_create("shared_policy_node",
2602                      sizeof(struct sp_node),
2603                      0, SLAB_PANIC, NULL);
2604 
2605     for_each_node(nid) {
2606         preferred_node_policy[nid] = (struct mempolicy) {
2607             .refcnt = ATOMIC_INIT(1),
2608             .mode = MPOL_PREFERRED,
2609             .flags = MPOL_F_MOF | MPOL_F_MORON,
2610             .v = { .preferred_node = nid, },
2611         };
2612     }
2613 
2614     /*
2615      * Set interleaving policy for system init. Interleaving is only
2616      * enabled across suitably sized nodes (default is >= 16MB), falling
2617      * back to the largest node if they're all smaller.
2618      */
2619     nodes_clear(interleave_nodes);
2620     for_each_node_state(nid, N_MEMORY) {
2621         unsigned long total_pages = node_present_pages(nid);
2622 
2623         /* Preserve the largest node */
2624         if (largest < total_pages) {
2625             largest = total_pages;
2626             prefer = nid;
2627         }
2628 
2629         /* Interleave this node? */
2630         if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2631             node_set(nid, interleave_nodes);
2632     }
2633 
2634     /* All too small, use the largest */
2635     if (unlikely(nodes_empty(interleave_nodes)))
2636         node_set(prefer, interleave_nodes);
2637 
2638     if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2639         pr_err("%s: interleaving failed\n", __func__);
2640 
2641     check_numabalancing_enable();
2642 }
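/*
 * Worked example for the interleave threshold used above (assuming 4 KiB
 * pages, i.e. PAGE_SHIFT == 12): (total_pages << PAGE_SHIFT) >= (16 << 20)
 * reduces to total_pages >= 4096, so a node needs at least 4096 present
 * pages (16 MiB) to join the boot-time interleave set.
 */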
2643 
2644 /* Reset policy of current process to default */
2645 void numa_default_policy(void)
2646 {
2647     do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2648 }
2649 
2650 /*
2651  * Parse and format mempolicy from/to strings
2652  */
2653 
2654 /*
2655  * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
2656  */
2657 static const char * const policy_modes[] =
2658 {
2659     [MPOL_DEFAULT]    = "default",
2660     [MPOL_PREFERRED]  = "prefer",
2661     [MPOL_BIND]       = "bind",
2662     [MPOL_INTERLEAVE] = "interleave",
2663     [MPOL_LOCAL]      = "local",
2664 };
2665 
2666 
2667 #ifdef CONFIG_TMPFS
2668 /**
2669  * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
2670  * @str:  string containing mempolicy to parse
2671  * @mpol:  pointer to struct mempolicy pointer, returned on success.
2672  *
2673  * Format of input:
2674  *  <mode>[=<flags>][:<nodelist>]
2675  *
2676  * On success, returns 0, else 1
2677  */
2678 int mpol_parse_str(char *str, struct mempolicy **mpol)
2679 {
2680     struct mempolicy *new = NULL;
2681     unsigned short mode;
2682     unsigned short mode_flags;
2683     nodemask_t nodes;
2684     char *nodelist = strchr(str, ':');
2685     char *flags = strchr(str, '=');
2686     int err = 1;
2687 
2688     if (nodelist) {
2689         /* NUL-terminate mode or flags string */
2690         *nodelist++ = '\0';
2691         if (nodelist_parse(nodelist, nodes))
2692             goto out;
2693         if (!nodes_subset(nodes, node_states[N_MEMORY]))
2694             goto out;
2695     } else
2696         nodes_clear(nodes);
2697 
2698     if (flags)
2699         *flags++ = '\0';    /* terminate mode string */
2700 
2701     for (mode = 0; mode < MPOL_MAX; mode++) {
2702         if (!strcmp(str, policy_modes[mode])) {
2703             break;
2704         }
2705     }
2706     if (mode >= MPOL_MAX)
2707         goto out;
2708 
2709     switch (mode) {
2710     case MPOL_PREFERRED:
2711         /*
2712          * Insist on a nodelist of one node only
2713          */
2714         if (nodelist) {
2715             char *rest = nodelist;
2716             while (isdigit(*rest))
2717                 rest++;
2718             if (*rest)
2719                 goto out;
2720         }
2721         break;
2722     case MPOL_INTERLEAVE:
2723         /*
2724          * Default to online nodes with memory if no nodelist
2725          */
2726         if (!nodelist)
2727             nodes = node_states[N_MEMORY];
2728         break;
2729     case MPOL_LOCAL:
2730         /*
2731          * Don't allow a nodelist;  mpol_new() checks flags
2732          */
2733         if (nodelist)
2734             goto out;
2735         mode = MPOL_PREFERRED;
2736         break;
2737     case MPOL_DEFAULT:
2738         /*
2739          * Insist on an empty nodelist
2740          */
2741         if (!nodelist)
2742             err = 0;
2743         goto out;
2744     case MPOL_BIND:
2745         /*
2746          * Insist on a nodelist
2747          */
2748         if (!nodelist)
2749             goto out;
2750     }
2751 
2752     mode_flags = 0;
2753     if (flags) {
2754         /*
2755          * Currently, we only support two mutually exclusive
2756          * mode flags.
2757          */
2758         if (!strcmp(flags, "static"))
2759             mode_flags |= MPOL_F_STATIC_NODES;
2760         else if (!strcmp(flags, "relative"))
2761             mode_flags |= MPOL_F_RELATIVE_NODES;
2762         else
2763             goto out;
2764     }
2765 
2766     new = mpol_new(mode, mode_flags, &nodes);
2767     if (IS_ERR(new))
2768         goto out;
2769 
2770     /*
2771      * Save nodes for mpol_to_str() to show the tmpfs mount options
2772      * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
2773      */
2774     if (mode != MPOL_PREFERRED)
2775         new->v.nodes = nodes;
2776     else if (nodelist)
2777         new->v.preferred_node = first_node(nodes);
2778     else
2779         new->flags |= MPOL_F_LOCAL;
2780 
2781     /*
2782      * Save nodes for contextualization: this will be used to "clone"
2783      * the mempolicy in a specific context [cpuset] at a later time.
2784      */
2785     new->w.user_nodemask = nodes;
2786 
2787     err = 0;
2788 
2789 out:
2790     /* Restore string for error message */
2791     if (nodelist)
2792         *--nodelist = ':';
2793     if (flags)
2794         *--flags = '=';
2795     if (!err)
2796         *mpol = new;
2797     return err;
2798 }
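/*
 * Usage sketch (illustrative only): parsing a tmpfs "mpol=" mount option.
 * For "interleave=static:0-3" (assuming nodes 0-3 all have memory),
 * mpol_parse_str() is expected to return 0 with MPOL_INTERLEAVE,
 * MPOL_F_STATIC_NODES set and nodes 0-3 saved in w.user_nodemask.  The
 * buffer must be writable because parsing temporarily NUL-terminates the
 * mode and flags substrings.
 */
static struct mempolicy *example_parse_mpol_option(void)
{
	char buf[] = "interleave=static:0-3";
	struct mempolicy *mpol = NULL;

	if (mpol_parse_str(buf, &mpol))
		return NULL;		/* parse error */
	return mpol;			/* caller must mpol_put() it */
}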
2799 #endif /* CONFIG_TMPFS */
2800 
2801 /**
2802  * mpol_to_str - format a mempolicy structure for printing
2803  * @buffer:  to contain formatted mempolicy string
2804  * @maxlen:  length of @buffer
2805  * @pol:  pointer to mempolicy to be formatted
2806  *
2807  * Convert @pol into a string.  If @buffer is too short, truncate the string.
2808  * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
2809  * longest flag, "relative", and to display at least a few node ids.
2810  */
2811 void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
2812 {
2813     char *p = buffer;
2814     nodemask_t nodes = NODE_MASK_NONE;
2815     unsigned short mode = MPOL_DEFAULT;
2816     unsigned short flags = 0;
2817 
2818     if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
2819         mode = pol->mode;
2820         flags = pol->flags;
2821     }
2822 
2823     switch (mode) {
2824     case MPOL_DEFAULT:
2825         break;
2826     case MPOL_PREFERRED:
2827         if (flags & MPOL_F_LOCAL)
2828             mode = MPOL_LOCAL;
2829         else
2830             node_set(pol->v.preferred_node, nodes);
2831         break;
2832     case MPOL_BIND:
2833     case MPOL_INTERLEAVE:
2834         nodes = pol->v.nodes;
2835         break;
2836     default:
2837         WARN_ON_ONCE(1);
2838         snprintf(p, maxlen, "unknown");
2839         return;
2840     }
2841 
2842     p += snprintf(p, maxlen, "%s", policy_modes[mode]);
2843 
2844     if (flags & MPOL_MODE_FLAGS) {
2845         p += snprintf(p, buffer + maxlen - p, "=");
2846 
2847         /*
2848          * Currently, the only defined flags are mutually exclusive
2849          */
2850         if (flags & MPOL_F_STATIC_NODES)
2851             p += snprintf(p, buffer + maxlen - p, "static");
2852         else if (flags & MPOL_F_RELATIVE_NODES)
2853             p += snprintf(p, buffer + maxlen - p, "relative");
2854     }
2855 
2856     if (!nodes_empty(nodes))
2857         p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
2858                    nodemask_pr_args(&nodes));
2859 }