0001 /*
0002  * mm/percpu.c - percpu memory allocator
0003  *
0004  * Copyright (C) 2009       SUSE Linux Products GmbH
0005  * Copyright (C) 2009       Tejun Heo <tj@kernel.org>
0006  *
0007  * This file is released under the GPLv2.
0008  *
0009  * This is the percpu allocator which can handle both static and
0010  * dynamic areas.  Percpu areas are allocated in chunks.  Each chunk
0011  * consists of a boot-time determined number of units and the first
0012  * chunk is used for static percpu variables in the kernel image
0013  * (special boot time alloc/init handling is necessary as these areas
0014  * need to be brought up before allocation services are running).
0015  * Units grow as necessary and all units grow or shrink in unison.
0016  * When a chunk is filled up, another chunk is allocated.
0017  *
0018  *  c0                           c1                         c2
0019  *  -------------------          -------------------        ------------
0020  * | u0 | u1 | u2 | u3 |        | u0 | u1 | u2 | u3 |      | u0 | u1 | u
0021  *  -------------------  ......  -------------------  ....  ------------
0022  *
0023  * Allocation is done in offset-size areas of a single unit's space.
0024  * I.e., an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of
0025  * c1:u0, c1:u1, c1:u2 and c1:u3.  On UMA, units correspond directly to
0026  * cpus.  On NUMA, the mapping can be non-linear and even sparse.
0027  * Percpu access can be done by configuring percpu base registers
0028  * according to cpu to unit mapping and pcpu_unit_size.
0029  *
0030  * There are usually many small percpu allocations, many of them being
0031  * as small as 4 bytes.  The allocator organizes chunks into lists
0032  * according to free size and tries to allocate from the fullest one.
0033  * Each chunk keeps a maximum contiguous area size hint which is
0034  * guaranteed to be equal to or larger than the maximum contiguous
0035  * area in the chunk.  This helps the allocator avoid iterating over
0036  * the chunk maps unnecessarily.
0037  *
0038  * Allocation state in each chunk is kept using an array of integers
0039  * on chunk->map.  Each entry records the byte offset at which an area
0040  * starts; the lowest bit is set if the area is allocated and clear if
0041  * it is free (a scheme derived from the percpu_modalloc() allocator).
0042  * Allocation inside a chunk is done by scanning this map sequentially
0043  * and serving the first matching entry.  The chunk containing an address
0044  * can be found via the page struct's index field, which points to the chunk.
0045  *
0046  * To use this allocator, arch code should do the following.
0047  *
0048  * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate
0049  *   regular address to percpu pointer and back if they need to be
0050  *   different from the default
0051  *
0052  * - use pcpu_setup_first_chunk() during percpu area initialization to
0053  *   setup the first chunk containing the kernel static percpu area
0054  */
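/*
 * [Editorial illustration] A minimal standalone userspace sketch (not part
 * of this file, all values made up) modelling the chunk->map encoding
 * described above: each entry holds the byte offset at which an area
 * starts, the low bit marks the area as in use, and a sentinel entry sits
 * at the unit size.
 */
#include <stdio.h>

int main(void)
{
	/* 4096 byte unit: [0,1024) free, [1024,1536) in use, [1536,4096) free */
	int map[] = { 0, 1024 | 1, 1536, 4096 | 1 };
	int map_used = 3;			/* entries before the sentinel */
	int i;

	for (i = 0; i < map_used; i++) {
		int start = map[i] & ~1;
		int end = map[i + 1] & ~1;

		printf("area %d: [%4d, %4d) %s\n", i, start, end,
		       (map[i] & 1) ? "in use" : "free");
	}
	return 0;
}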
0055 
0056 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
0057 
0058 #include <linux/bitmap.h>
0059 #include <linux/bootmem.h>
0060 #include <linux/err.h>
0061 #include <linux/list.h>
0062 #include <linux/log2.h>
0063 #include <linux/mm.h>
0064 #include <linux/module.h>
0065 #include <linux/mutex.h>
0066 #include <linux/percpu.h>
0067 #include <linux/pfn.h>
0068 #include <linux/slab.h>
0069 #include <linux/spinlock.h>
0070 #include <linux/vmalloc.h>
0071 #include <linux/workqueue.h>
0072 #include <linux/kmemleak.h>
0073 
0074 #include <asm/cacheflush.h>
0075 #include <asm/sections.h>
0076 #include <asm/tlbflush.h>
0077 #include <asm/io.h>
0078 
0079 #define PCPU_SLOT_BASE_SHIFT        5   /* 1-31 shares the same slot */
0080 #define PCPU_DFL_MAP_ALLOC      16  /* start a map with 16 ents */
0081 #define PCPU_ATOMIC_MAP_MARGIN_LOW  32
0082 #define PCPU_ATOMIC_MAP_MARGIN_HIGH 64
0083 #define PCPU_EMPTY_POP_PAGES_LOW    2
0084 #define PCPU_EMPTY_POP_PAGES_HIGH   4
0085 
0086 #ifdef CONFIG_SMP
0087 /* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */
0088 #ifndef __addr_to_pcpu_ptr
0089 #define __addr_to_pcpu_ptr(addr)                    \
0090     (void __percpu *)((unsigned long)(addr) -           \
0091               (unsigned long)pcpu_base_addr +       \
0092               (unsigned long)__per_cpu_start)
0093 #endif
0094 #ifndef __pcpu_ptr_to_addr
0095 #define __pcpu_ptr_to_addr(ptr)                     \
0096     (void __force *)((unsigned long)(ptr) +             \
0097              (unsigned long)pcpu_base_addr -        \
0098              (unsigned long)__per_cpu_start)
0099 #endif
0100 #else   /* CONFIG_SMP */
0101 /* on UP, it's always identity mapped */
0102 #define __addr_to_pcpu_ptr(addr)    (void __percpu *)(addr)
0103 #define __pcpu_ptr_to_addr(ptr)     (void __force *)(ptr)
0104 #endif  /* CONFIG_SMP */
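/*
 * [Editorial worked example] The two translations above are inverses by
 * construction.  For an address A inside a percpu unit:
 *
 *   __addr_to_pcpu_ptr(A) = A - pcpu_base_addr + __per_cpu_start
 *
 * and applying __pcpu_ptr_to_addr() to that result adds pcpu_base_addr and
 * subtracts __per_cpu_start again, recovering A.  Percpu pointers are thus
 * ordinary addresses rebased so that static percpu symbols (which live at
 * __per_cpu_start in the kernel image) and dynamic allocations share one
 * pointer space.
 */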
0105 
0106 struct pcpu_chunk {
0107     struct list_head    list;       /* linked to pcpu_slot lists */
0108     int         free_size;  /* free bytes in the chunk */
0109     int         contig_hint;    /* max contiguous size hint */
0110     void            *base_addr; /* base address of this chunk */
0111 
0112     int         map_used;   /* # of map entries used before the sentry */
0113     int         map_alloc;  /* # of map entries allocated */
0114     int         *map;       /* allocation map */
0115     struct list_head    map_extend_list;/* on pcpu_map_extend_chunks */
0116 
0117     void            *data;      /* chunk data */
0118     int         first_free; /* no free below this */
0119     bool            immutable;  /* no [de]population allowed */
0120     int         nr_populated;   /* # of populated pages */
0121     unsigned long       populated[];    /* populated bitmap */
0122 };
0123 
0124 static int pcpu_unit_pages __read_mostly;
0125 static int pcpu_unit_size __read_mostly;
0126 static int pcpu_nr_units __read_mostly;
0127 static int pcpu_atom_size __read_mostly;
0128 static int pcpu_nr_slots __read_mostly;
0129 static size_t pcpu_chunk_struct_size __read_mostly;
0130 
0131 /* cpus with the lowest and highest unit addresses */
0132 static unsigned int pcpu_low_unit_cpu __read_mostly;
0133 static unsigned int pcpu_high_unit_cpu __read_mostly;
0134 
0135 /* the address of the first chunk which starts with the kernel static area */
0136 void *pcpu_base_addr __read_mostly;
0137 EXPORT_SYMBOL_GPL(pcpu_base_addr);
0138 
0139 static const int *pcpu_unit_map __read_mostly;      /* cpu -> unit */
0140 const unsigned long *pcpu_unit_offsets __read_mostly;   /* cpu -> unit offset */
0141 
0142 /* group information, used for vm allocation */
0143 static int pcpu_nr_groups __read_mostly;
0144 static const unsigned long *pcpu_group_offsets __read_mostly;
0145 static const size_t *pcpu_group_sizes __read_mostly;
0146 
0147 /*
0148  * The first chunk which always exists.  Note that unlike other
0149  * chunks, this one can be allocated and mapped in several different
0150  * ways and thus often doesn't live in the vmalloc area.
0151  */
0152 static struct pcpu_chunk *pcpu_first_chunk;
0153 
0154 /*
0155  * Optional reserved chunk.  This chunk reserves part of the first
0156  * chunk and serves it for reserved allocations.  The offset limit of
0157  * the reserved region is in pcpu_reserved_chunk_limit.  When the
0158  * reserved area doesn't exist, the following variables contain NULL
0159  * and 0 respectively.
0160  */
0161 static struct pcpu_chunk *pcpu_reserved_chunk;
0162 static int pcpu_reserved_chunk_limit;
0163 
0164 static DEFINE_SPINLOCK(pcpu_lock);  /* all internal data structures */
0165 static DEFINE_MUTEX(pcpu_alloc_mutex);  /* chunk create/destroy, [de]pop, map ext */
0166 
0167 static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
0168 
0169 /* chunks which need their map areas extended, protected by pcpu_lock */
0170 static LIST_HEAD(pcpu_map_extend_chunks);
0171 
0172 /*
0173  * The number of empty populated pages, protected by pcpu_lock.  The
0174  * reserved chunk doesn't contribute to the count.
0175  */
0176 static int pcpu_nr_empty_pop_pages;
0177 
0178 /*
0179  * Balance work is used to populate or destroy chunks asynchronously.  We
0180  * try to keep the number of populated free pages between
0181  * PCPU_EMPTY_POP_PAGES_LOW and HIGH for atomic allocations and at most one
0182  * empty chunk.
0183  */
0184 static void pcpu_balance_workfn(struct work_struct *work);
0185 static DECLARE_WORK(pcpu_balance_work, pcpu_balance_workfn);
0186 static bool pcpu_async_enabled __read_mostly;
0187 static bool pcpu_atomic_alloc_failed;
0188 
0189 static void pcpu_schedule_balance_work(void)
0190 {
0191     if (pcpu_async_enabled)
0192         schedule_work(&pcpu_balance_work);
0193 }
0194 
0195 static bool pcpu_addr_in_first_chunk(void *addr)
0196 {
0197     void *first_start = pcpu_first_chunk->base_addr;
0198 
0199     return addr >= first_start && addr < first_start + pcpu_unit_size;
0200 }
0201 
0202 static bool pcpu_addr_in_reserved_chunk(void *addr)
0203 {
0204     void *first_start = pcpu_first_chunk->base_addr;
0205 
0206     return addr >= first_start &&
0207         addr < first_start + pcpu_reserved_chunk_limit;
0208 }
0209 
0210 static int __pcpu_size_to_slot(int size)
0211 {
0212     int highbit = fls(size);    /* size is in bytes */
0213     return max(highbit - PCPU_SLOT_BASE_SHIFT + 2, 1);
0214 }
0215 
0216 static int pcpu_size_to_slot(int size)
0217 {
0218     if (size == pcpu_unit_size)
0219         return pcpu_nr_slots - 1;
0220     return __pcpu_size_to_slot(size);
0221 }
0222 
0223 static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
0224 {
0225     if (chunk->free_size < sizeof(int) || chunk->contig_hint < sizeof(int))
0226         return 0;
0227 
0228     return pcpu_size_to_slot(chunk->free_size);
0229 }
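/*
 * [Editorial illustration] A standalone userspace sketch (not part of this
 * file) of the slot math above with PCPU_SLOT_BASE_SHIFT == 5: the slot of
 * a size depends only on the position of its highest set bit, so chunks are
 * binned by free size into roughly power-of-two buckets and pcpu_alloc()
 * can start scanning at pcpu_size_to_slot(size).  The sizes printed are
 * arbitrary examples.
 */
#include <stdio.h>

static int model_fls(int x)		/* 1-based index of the highest set bit */
{
	int bit = 0;

	while (x) {
		bit++;
		x >>= 1;
	}
	return bit;
}

static int model_size_to_slot(int size)	/* mirrors __pcpu_size_to_slot() */
{
	int slot = model_fls(size) - 5 + 2;

	return slot > 1 ? slot : 1;
}

int main(void)
{
	int sizes[] = { 4, 8, 64, 1024, 65536 };
	int i;

	for (i = 0; i < 5; i++)
		printf("size %6d -> slot %d\n", sizes[i],
		       model_size_to_slot(sizes[i]));
	return 0;
}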
0230 
0231 /* set the pointer to a chunk in a page struct */
0232 static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu)
0233 {
0234     page->index = (unsigned long)pcpu;
0235 }
0236 
0237 /* obtain pointer to a chunk from a page struct */
0238 static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page)
0239 {
0240     return (struct pcpu_chunk *)page->index;
0241 }
0242 
0243 static int __maybe_unused pcpu_page_idx(unsigned int cpu, int page_idx)
0244 {
0245     return pcpu_unit_map[cpu] * pcpu_unit_pages + page_idx;
0246 }
0247 
0248 static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
0249                      unsigned int cpu, int page_idx)
0250 {
0251     return (unsigned long)chunk->base_addr + pcpu_unit_offsets[cpu] +
0252         (page_idx << PAGE_SHIFT);
0253 }
0254 
0255 static void __maybe_unused pcpu_next_unpop(struct pcpu_chunk *chunk,
0256                        int *rs, int *re, int end)
0257 {
0258     *rs = find_next_zero_bit(chunk->populated, end, *rs);
0259     *re = find_next_bit(chunk->populated, end, *rs + 1);
0260 }
0261 
0262 static void __maybe_unused pcpu_next_pop(struct pcpu_chunk *chunk,
0263                      int *rs, int *re, int end)
0264 {
0265     *rs = find_next_bit(chunk->populated, end, *rs);
0266     *re = find_next_zero_bit(chunk->populated, end, *rs + 1);
0267 }
0268 
0269 /*
0270  * (Un)populated page region iterators.  Iterate over (un)populated
0271  * page regions between @start and @end in @chunk.  @rs and @re should
0272  * be integer variables and will be set to start and end page index of
0273  * the current region.
0274  */
0275 #define pcpu_for_each_unpop_region(chunk, rs, re, start, end)           \
0276     for ((rs) = (start), pcpu_next_unpop((chunk), &(rs), &(re), (end)); \
0277          (rs) < (re);                           \
0278          (rs) = (re) + 1, pcpu_next_unpop((chunk), &(rs), &(re), (end)))
0279 
0280 #define pcpu_for_each_pop_region(chunk, rs, re, start, end)         \
0281     for ((rs) = (start), pcpu_next_pop((chunk), &(rs), &(re), (end));   \
0282          (rs) < (re);                           \
0283          (rs) = (re) + 1, pcpu_next_pop((chunk), &(rs), &(re), (end)))
0284 
0285 /**
0286  * pcpu_mem_zalloc - allocate memory
0287  * @size: bytes to allocate
0288  *
0289  * Allocate @size bytes.  If @size is smaller than PAGE_SIZE,
0290  * kzalloc() is used; otherwise, vzalloc() is used.  The returned
0291  * memory is always zeroed.
0292  *
0293  * CONTEXT:
0294  * Does GFP_KERNEL allocation.
0295  *
0296  * RETURNS:
0297  * Pointer to the allocated area on success, NULL on failure.
0298  */
0299 static void *pcpu_mem_zalloc(size_t size)
0300 {
0301     if (WARN_ON_ONCE(!slab_is_available()))
0302         return NULL;
0303 
0304     if (size <= PAGE_SIZE)
0305         return kzalloc(size, GFP_KERNEL);
0306     else
0307         return vzalloc(size);
0308 }
0309 
0310 /**
0311  * pcpu_mem_free - free memory
0312  * @ptr: memory to free
0313  *
0314  * Free @ptr.  @ptr should have been allocated using pcpu_mem_zalloc().
0315  */
0316 static void pcpu_mem_free(void *ptr)
0317 {
0318     kvfree(ptr);
0319 }
0320 
0321 /**
0322  * pcpu_count_occupied_pages - count the number of pages an area occupies
0323  * @chunk: chunk of interest
0324  * @i: index of the area in question
0325  *
0326  * Count the number of pages chunk's @i'th area occupies.  When the area's
0327  * start and/or end address isn't aligned to page boundary, the straddled
0328  * page is included in the count iff the rest of the page is free.
0329  */
0330 static int pcpu_count_occupied_pages(struct pcpu_chunk *chunk, int i)
0331 {
0332     int off = chunk->map[i] & ~1;
0333     int end = chunk->map[i + 1] & ~1;
0334 
0335     if (!PAGE_ALIGNED(off) && i > 0) {
0336         int prev = chunk->map[i - 1];
0337 
0338         if (!(prev & 1) && prev <= round_down(off, PAGE_SIZE))
0339             off = round_down(off, PAGE_SIZE);
0340     }
0341 
0342     if (!PAGE_ALIGNED(end) && i + 1 < chunk->map_used) {
0343         int next = chunk->map[i + 1];
0344         int nend = chunk->map[i + 2] & ~1;
0345 
0346         if (!(next & 1) && nend >= round_up(end, PAGE_SIZE))
0347             end = round_up(end, PAGE_SIZE);
0348     }
0349 
0350     return max_t(int, PFN_DOWN(end) - PFN_UP(off), 0);
0351 }
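/*
 * [Editorial worked example] With 4096 byte pages, suppose area i is an
 * allocation spanning [1024, 5120), i.e. map[i] = 1024|1 and map[i+1] =
 * 5120.  If the previous area is free and starts at 0, @off is rounded
 * down to 0; if the next area is free and its end (map[i+2]) is >= 8192,
 * @end is rounded up to 8192.  The result is PFN_DOWN(8192) - PFN_UP(0) =
 * 2, so the allocation is charged both straddled pages.  With allocated
 * neighbours, off = 1024 and end = 5120 give PFN_DOWN(5120) - PFN_UP(1024)
 * = 1 - 1 = 0 pages.
 */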
0352 
0353 /**
0354  * pcpu_chunk_relocate - put chunk in the appropriate chunk slot
0355  * @chunk: chunk of interest
0356  * @oslot: the previous slot it was on
0357  *
0358  * This function is called after an allocation or free changed @chunk.
0359  * New slot according to the changed state is determined and @chunk is
0360  * moved to the slot.  Note that the reserved chunk is never put on
0361  * chunk slots.
0362  *
0363  * CONTEXT:
0364  * pcpu_lock.
0365  */
0366 static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
0367 {
0368     int nslot = pcpu_chunk_slot(chunk);
0369 
0370     if (chunk != pcpu_reserved_chunk && oslot != nslot) {
0371         if (oslot < nslot)
0372             list_move(&chunk->list, &pcpu_slot[nslot]);
0373         else
0374             list_move_tail(&chunk->list, &pcpu_slot[nslot]);
0375     }
0376 }
0377 
0378 /**
0379  * pcpu_need_to_extend - determine whether chunk area map needs to be extended
0380  * @chunk: chunk of interest
0381  * @is_atomic: the allocation context
0382  *
0383  * Determine whether area map of @chunk needs to be extended.  If
0384  * @is_atomic, only the amount necessary for a new allocation is
0385  * considered; however, async extension is scheduled if the amount left
0386  * is low.  If !@is_atomic, it aims for more empty space.  Combined, this
0387  * ensures that the map is likely to have enough available space to
0388  * accommodate atomic allocations which can't extend maps directly.
0389  *
0390  * CONTEXT:
0391  * pcpu_lock.
0392  *
0393  * RETURNS:
0394  * New target map allocation length if extension is necessary, 0
0395  * otherwise.
0396  */
0397 static int pcpu_need_to_extend(struct pcpu_chunk *chunk, bool is_atomic)
0398 {
0399     int margin, new_alloc;
0400 
0401     lockdep_assert_held(&pcpu_lock);
0402 
0403     if (is_atomic) {
0404         margin = 3;
0405 
0406         if (chunk->map_alloc <
0407             chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW) {
0408             if (list_empty(&chunk->map_extend_list)) {
0409                 list_add_tail(&chunk->map_extend_list,
0410                           &pcpu_map_extend_chunks);
0411                 pcpu_schedule_balance_work();
0412             }
0413         }
0414     } else {
0415         margin = PCPU_ATOMIC_MAP_MARGIN_HIGH;
0416     }
0417 
0418     if (chunk->map_alloc >= chunk->map_used + margin)
0419         return 0;
0420 
0421     new_alloc = PCPU_DFL_MAP_ALLOC;
0422     while (new_alloc < chunk->map_used + margin)
0423         new_alloc *= 2;
0424 
0425     return new_alloc;
0426 }
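/*
 * [Editorial worked example] For a !is_atomic caller, margin is
 * PCPU_ATOMIC_MAP_MARGIN_HIGH (64).  With map_used = 50 and map_alloc = 64,
 * 64 < 50 + 64, so extension is needed and new_alloc doubles from
 * PCPU_DFL_MAP_ALLOC: 16 -> 32 -> 64 -> 128, the first value >= 114, and
 * 128 is returned for pcpu_extend_area_map() to allocate.
 */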
0427 
0428 /**
0429  * pcpu_extend_area_map - extend area map of a chunk
0430  * @chunk: chunk of interest
0431  * @new_alloc: new target allocation length of the area map
0432  *
0433  * Extend area map of @chunk to have @new_alloc entries.
0434  *
0435  * CONTEXT:
0436  * Does GFP_KERNEL allocation.  Grabs and releases pcpu_lock.
0437  *
0438  * RETURNS:
0439  * 0 on success, -errno on failure.
0440  */
0441 static int pcpu_extend_area_map(struct pcpu_chunk *chunk, int new_alloc)
0442 {
0443     int *old = NULL, *new = NULL;
0444     size_t old_size = 0, new_size = new_alloc * sizeof(new[0]);
0445     unsigned long flags;
0446 
0447     lockdep_assert_held(&pcpu_alloc_mutex);
0448 
0449     new = pcpu_mem_zalloc(new_size);
0450     if (!new)
0451         return -ENOMEM;
0452 
0453     /* acquire pcpu_lock and switch to new area map */
0454     spin_lock_irqsave(&pcpu_lock, flags);
0455 
0456     if (new_alloc <= chunk->map_alloc)
0457         goto out_unlock;
0458 
0459     old_size = chunk->map_alloc * sizeof(chunk->map[0]);
0460     old = chunk->map;
0461 
0462     memcpy(new, old, old_size);
0463 
0464     chunk->map_alloc = new_alloc;
0465     chunk->map = new;
0466     new = NULL;
0467 
0468 out_unlock:
0469     spin_unlock_irqrestore(&pcpu_lock, flags);
0470 
0471     /*
0472      * pcpu_mem_free() might end up calling vfree() which uses
0473      * IRQ-unsafe lock and thus can't be called under pcpu_lock.
0474      */
0475     pcpu_mem_free(old);
0476     pcpu_mem_free(new);
0477 
0478     return 0;
0479 }
0480 
0481 /**
0482  * pcpu_fit_in_area - try to fit the requested allocation in a candidate area
0483  * @chunk: chunk the candidate area belongs to
0484  * @off: the offset to the start of the candidate area
0485  * @this_size: the size of the candidate area
0486  * @size: the size of the target allocation
0487  * @align: the alignment of the target allocation
0488  * @pop_only: only allocate from already populated region
0489  *
0490  * We're trying to allocate @size bytes aligned at @align.  @chunk's area
0491  * at @off sized @this_size is a candidate.  This function determines
0492  * whether the target allocation fits in the candidate area and returns the
0493  * number of bytes to pad after @off.  If the target area doesn't fit, -1
0494  * is returned.
0495  *
0496  * If @pop_only is %true, this function only considers the already
0497  * populated part of the candidate area.
0498  */
0499 static int pcpu_fit_in_area(struct pcpu_chunk *chunk, int off, int this_size,
0500                 int size, int align, bool pop_only)
0501 {
0502     int cand_off = off;
0503 
0504     while (true) {
0505         int head = ALIGN(cand_off, align) - off;
0506         int page_start, page_end, rs, re;
0507 
0508         if (this_size < head + size)
0509             return -1;
0510 
0511         if (!pop_only)
0512             return head;
0513 
0514         /*
0515          * If the first unpopulated page is beyond the end of the
0516          * allocation, the whole allocation is populated;
0517          * otherwise, retry from the end of the unpopulated area.
0518          */
0519         page_start = PFN_DOWN(head + off);
0520         page_end = PFN_UP(head + off + size);
0521 
0522         rs = page_start;
0523         pcpu_next_unpop(chunk, &rs, &re, PFN_UP(off + this_size));
0524         if (rs >= page_end)
0525             return head;
0526         cand_off = re * PAGE_SIZE;
0527     }
0528 }
0529 
0530 /**
0531  * pcpu_alloc_area - allocate area from a pcpu_chunk
0532  * @chunk: chunk of interest
0533  * @size: wanted size in bytes
0534  * @align: wanted align
0535  * @pop_only: allocate only from the populated area
0536  * @occ_pages_p: out param for the number of pages the area occupies
0537  *
0538  * Try to allocate @size bytes area aligned at @align from @chunk.
0539  * Note that this function only allocates the offset.  It doesn't
0540  * populate or map the area.
0541  *
0542  * @chunk->map must have at least two free slots.
0543  *
0544  * CONTEXT:
0545  * pcpu_lock.
0546  *
0547  * RETURNS:
0548  * Allocated offset in @chunk on success, -1 if no matching area is
0549  * found.
0550  */
0551 static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align,
0552                bool pop_only, int *occ_pages_p)
0553 {
0554     int oslot = pcpu_chunk_slot(chunk);
0555     int max_contig = 0;
0556     int i, off;
0557     bool seen_free = false;
0558     int *p;
0559 
0560     for (i = chunk->first_free, p = chunk->map + i; i < chunk->map_used; i++, p++) {
0561         int head, tail;
0562         int this_size;
0563 
0564         off = *p;
0565         if (off & 1)
0566             continue;
0567 
0568         this_size = (p[1] & ~1) - off;
0569 
0570         head = pcpu_fit_in_area(chunk, off, this_size, size, align,
0571                     pop_only);
0572         if (head < 0) {
0573             if (!seen_free) {
0574                 chunk->first_free = i;
0575                 seen_free = true;
0576             }
0577             max_contig = max(this_size, max_contig);
0578             continue;
0579         }
0580 
0581         /*
0582          * If head is small or the previous block is free,
0583          * merge'em.  Note that 'small' is defined as smaller
0584          * than sizeof(int), which is very small but isn't too
0585          * uncommon for percpu allocations.
0586          */
0587         if (head && (head < sizeof(int) || !(p[-1] & 1))) {
0588             *p = off += head;
0589             if (p[-1] & 1)
0590                 chunk->free_size -= head;
0591             else
0592                 max_contig = max(*p - p[-1], max_contig);
0593             this_size -= head;
0594             head = 0;
0595         }
0596 
0597         /* if tail is small, just keep it around */
0598         tail = this_size - head - size;
0599         if (tail < sizeof(int)) {
0600             tail = 0;
0601             size = this_size - head;
0602         }
0603 
0604         /* split if warranted */
0605         if (head || tail) {
0606             int nr_extra = !!head + !!tail;
0607 
0608             /* insert new subblocks */
0609             memmove(p + nr_extra + 1, p + 1,
0610                 sizeof(chunk->map[0]) * (chunk->map_used - i));
0611             chunk->map_used += nr_extra;
0612 
0613             if (head) {
0614                 if (!seen_free) {
0615                     chunk->first_free = i;
0616                     seen_free = true;
0617                 }
0618                 *++p = off += head;
0619                 ++i;
0620                 max_contig = max(head, max_contig);
0621             }
0622             if (tail) {
0623                 p[1] = off + size;
0624                 max_contig = max(tail, max_contig);
0625             }
0626         }
0627 
0628         if (!seen_free)
0629             chunk->first_free = i + 1;
0630 
0631         /* update hint and mark allocated */
0632         if (i + 1 == chunk->map_used)
0633             chunk->contig_hint = max_contig; /* fully scanned */
0634         else
0635             chunk->contig_hint = max(chunk->contig_hint,
0636                          max_contig);
0637 
0638         chunk->free_size -= size;
0639         *p |= 1;
0640 
0641         *occ_pages_p = pcpu_count_occupied_pages(chunk, i);
0642         pcpu_chunk_relocate(chunk, oslot);
0643         return off;
0644     }
0645 
0646     chunk->contig_hint = max_contig;    /* fully scanned */
0647     pcpu_chunk_relocate(chunk, oslot);
0648 
0649     /* tell the upper layer that this chunk has no matching area */
0650     return -1;
0651 }
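/*
 * [Editorial worked example] Suppose map[i] = 96 marks a free area and
 * map[i+1] = 1120|1 the next, in-use one, so this_size = 1024.  A request
 * for size = 512, align = 256 gets head = ALIGN(96, 256) - 96 = 160 from
 * pcpu_fit_in_area().  Since head >= sizeof(int) and the previous block is
 * allocated, both head and the tail (1024 - 160 - 512 = 352 bytes) are kept
 * as new free entries, and the map around i goes from
 *
 *   ..., 96, 1120|1, ...
 * to
 *   ..., 96, 256|1, 768, 1120|1, ...
 *
 * i.e. a 512 byte allocation at offset 256, with map_used grown by 2 and
 * free_size reduced by 512.
 */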
0652 
0653 /**
0654  * pcpu_free_area - free area to a pcpu_chunk
0655  * @chunk: chunk of interest
0656  * @freeme: offset of area to free
0657  * @occ_pages_p: out param for the number of pages the area occupies
0658  *
0659  * Free area starting from @freeme to @chunk.  Note that this function
0660  * only modifies the allocation map.  It doesn't depopulate or unmap
0661  * the area.
0662  *
0663  * CONTEXT:
0664  * pcpu_lock.
0665  */
0666 static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme,
0667                int *occ_pages_p)
0668 {
0669     int oslot = pcpu_chunk_slot(chunk);
0670     int off = 0;
0671     unsigned i, j;
0672     int to_free = 0;
0673     int *p;
0674 
0675     freeme |= 1;    /* we are searching for <given offset, in use> pair */
0676 
0677     i = 0;
0678     j = chunk->map_used;
0679     while (i != j) {
0680         unsigned k = (i + j) / 2;
0681         off = chunk->map[k];
0682         if (off < freeme)
0683             i = k + 1;
0684         else if (off > freeme)
0685             j = k;
0686         else
0687             i = j = k;
0688     }
0689     BUG_ON(off != freeme);
0690 
0691     if (i < chunk->first_free)
0692         chunk->first_free = i;
0693 
0694     p = chunk->map + i;
0695     *p = off &= ~1;
0696     chunk->free_size += (p[1] & ~1) - off;
0697 
0698     *occ_pages_p = pcpu_count_occupied_pages(chunk, i);
0699 
0700     /* merge with next? */
0701     if (!(p[1] & 1))
0702         to_free++;
0703     /* merge with previous? */
0704     if (i > 0 && !(p[-1] & 1)) {
0705         to_free++;
0706         i--;
0707         p--;
0708     }
0709     if (to_free) {
0710         chunk->map_used -= to_free;
0711         memmove(p + 1, p + 1 + to_free,
0712             (chunk->map_used - i) * sizeof(chunk->map[0]));
0713     }
0714 
0715     chunk->contig_hint = max(chunk->map[i + 1] - chunk->map[i] - 1, chunk->contig_hint);
0716     pcpu_chunk_relocate(chunk, oslot);
0717 }
0718 
0719 static struct pcpu_chunk *pcpu_alloc_chunk(void)
0720 {
0721     struct pcpu_chunk *chunk;
0722 
0723     chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size);
0724     if (!chunk)
0725         return NULL;
0726 
0727     chunk->map = pcpu_mem_zalloc(PCPU_DFL_MAP_ALLOC *
0728                         sizeof(chunk->map[0]));
0729     if (!chunk->map) {
0730         pcpu_mem_free(chunk);
0731         return NULL;
0732     }
0733 
0734     chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
0735     chunk->map[0] = 0;
0736     chunk->map[1] = pcpu_unit_size | 1;
0737     chunk->map_used = 1;
0738 
0739     INIT_LIST_HEAD(&chunk->list);
0740     INIT_LIST_HEAD(&chunk->map_extend_list);
0741     chunk->free_size = pcpu_unit_size;
0742     chunk->contig_hint = pcpu_unit_size;
0743 
0744     return chunk;
0745 }
0746 
0747 static void pcpu_free_chunk(struct pcpu_chunk *chunk)
0748 {
0749     if (!chunk)
0750         return;
0751     pcpu_mem_free(chunk->map);
0752     pcpu_mem_free(chunk);
0753 }
0754 
0755 /**
0756  * pcpu_chunk_populated - post-population bookkeeping
0757  * @chunk: pcpu_chunk which got populated
0758  * @page_start: the start page
0759  * @page_end: the end page
0760  *
0761  * Pages in [@page_start,@page_end) have been populated to @chunk.  Update
0762  * the bookkeeping information accordingly.  Must be called after each
0763  * successful population.
0764  */
0765 static void pcpu_chunk_populated(struct pcpu_chunk *chunk,
0766                  int page_start, int page_end)
0767 {
0768     int nr = page_end - page_start;
0769 
0770     lockdep_assert_held(&pcpu_lock);
0771 
0772     bitmap_set(chunk->populated, page_start, nr);
0773     chunk->nr_populated += nr;
0774     pcpu_nr_empty_pop_pages += nr;
0775 }
0776 
0777 /**
0778  * pcpu_chunk_depopulated - post-depopulation bookkeeping
0779  * @chunk: pcpu_chunk which got depopulated
0780  * @page_start: the start page
0781  * @page_end: the end page
0782  *
0783  * Pages in [@page_start,@page_end) have been depopulated from @chunk.
0784  * Update the bookkeeping information accordingly.  Must be called after
0785  * each successful depopulation.
0786  */
0787 static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk,
0788                    int page_start, int page_end)
0789 {
0790     int nr = page_end - page_start;
0791 
0792     lockdep_assert_held(&pcpu_lock);
0793 
0794     bitmap_clear(chunk->populated, page_start, nr);
0795     chunk->nr_populated -= nr;
0796     pcpu_nr_empty_pop_pages -= nr;
0797 }
0798 
0799 /*
0800  * Chunk management implementation.
0801  *
0802  * To allow different implementations, chunk alloc/free and
0803  * [de]population are implemented in a separate file which is pulled
0804  * into this file and compiled together.  The following functions
0805  * should be implemented.
0806  *
0807  * pcpu_populate_chunk      - populate the specified range of a chunk
0808  * pcpu_depopulate_chunk    - depopulate the specified range of a chunk
0809  * pcpu_create_chunk        - create a new chunk
0810  * pcpu_destroy_chunk       - destroy a chunk, always preceded by full depop
0811  * pcpu_addr_to_page        - translate address to the corresponding struct page
0812  * pcpu_verify_alloc_info   - check alloc_info is acceptable during init
0813  */
0814 static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size);
0815 static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size);
0816 static struct pcpu_chunk *pcpu_create_chunk(void);
0817 static void pcpu_destroy_chunk(struct pcpu_chunk *chunk);
0818 static struct page *pcpu_addr_to_page(void *addr);
0819 static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai);
0820 
0821 #ifdef CONFIG_NEED_PER_CPU_KM
0822 #include "percpu-km.c"
0823 #else
0824 #include "percpu-vm.c"
0825 #endif
0826 
0827 /**
0828  * pcpu_chunk_addr_search - determine chunk containing specified address
0829  * @addr: address for which the chunk needs to be determined.
0830  *
0831  * RETURNS:
0832  * The pcpu_chunk which contains @addr.
0833  */
0834 static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
0835 {
0836     /* is it in the first chunk? */
0837     if (pcpu_addr_in_first_chunk(addr)) {
0838         /* is it in the reserved area? */
0839         if (pcpu_addr_in_reserved_chunk(addr))
0840             return pcpu_reserved_chunk;
0841         return pcpu_first_chunk;
0842     }
0843 
0844     /*
0845      * The address is relative to unit0 which might be unused and
0846      * thus unmapped.  Offset the address to the unit space of the
0847      * current processor before looking it up in the vmalloc
0848      * space.  Note that any possible cpu id can be used here, so
0849      * there's no need to worry about preemption or cpu hotplug.
0850      */
0851     addr += pcpu_unit_offsets[raw_smp_processor_id()];
0852     return pcpu_get_page_chunk(pcpu_addr_to_page(addr));
0853 }
0854 
0855 /**
0856  * pcpu_alloc - the percpu allocator
0857  * @size: size of area to allocate in bytes
0858  * @align: alignment of area (max PAGE_SIZE)
0859  * @reserved: allocate from the reserved chunk if available
0860  * @gfp: allocation flags
0861  *
0862  * Allocate percpu area of @size bytes aligned at @align.  If @gfp doesn't
0863  * contain %GFP_KERNEL, the allocation is atomic.
0864  *
0865  * RETURNS:
0866  * Percpu pointer to the allocated area on success, NULL on failure.
0867  */
0868 static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
0869                  gfp_t gfp)
0870 {
0871     static int warn_limit = 10;
0872     struct pcpu_chunk *chunk;
0873     const char *err;
0874     bool is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL;
0875     int occ_pages = 0;
0876     int slot, off, new_alloc, cpu, ret;
0877     unsigned long flags;
0878     void __percpu *ptr;
0879 
0880     /*
0881      * We want the lowest bit of offset available for in-use/free
0882      * indicator, so force >= 2 byte alignment and make size even.
0883      */
0884     if (unlikely(align < 2))
0885         align = 2;
0886 
0887     size = ALIGN(size, 2);
0888 
0889     if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE ||
0890              !is_power_of_2(align))) {
0891         WARN(true, "illegal size (%zu) or align (%zu) for percpu allocation\n",
0892              size, align);
0893         return NULL;
0894     }
0895 
0896     if (!is_atomic)
0897         mutex_lock(&pcpu_alloc_mutex);
0898 
0899     spin_lock_irqsave(&pcpu_lock, flags);
0900 
0901     /* serve reserved allocations from the reserved chunk if available */
0902     if (reserved && pcpu_reserved_chunk) {
0903         chunk = pcpu_reserved_chunk;
0904 
0905         if (size > chunk->contig_hint) {
0906             err = "alloc from reserved chunk failed";
0907             goto fail_unlock;
0908         }
0909 
0910         while ((new_alloc = pcpu_need_to_extend(chunk, is_atomic))) {
0911             spin_unlock_irqrestore(&pcpu_lock, flags);
0912             if (is_atomic ||
0913                 pcpu_extend_area_map(chunk, new_alloc) < 0) {
0914                 err = "failed to extend area map of reserved chunk";
0915                 goto fail;
0916             }
0917             spin_lock_irqsave(&pcpu_lock, flags);
0918         }
0919 
0920         off = pcpu_alloc_area(chunk, size, align, is_atomic,
0921                       &occ_pages);
0922         if (off >= 0)
0923             goto area_found;
0924 
0925         err = "alloc from reserved chunk failed";
0926         goto fail_unlock;
0927     }
0928 
0929 restart:
0930     /* search through normal chunks */
0931     for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
0932         list_for_each_entry(chunk, &pcpu_slot[slot], list) {
0933             if (size > chunk->contig_hint)
0934                 continue;
0935 
0936             new_alloc = pcpu_need_to_extend(chunk, is_atomic);
0937             if (new_alloc) {
0938                 if (is_atomic)
0939                     continue;
0940                 spin_unlock_irqrestore(&pcpu_lock, flags);
0941                 if (pcpu_extend_area_map(chunk,
0942                              new_alloc) < 0) {
0943                     err = "failed to extend area map";
0944                     goto fail;
0945                 }
0946                 spin_lock_irqsave(&pcpu_lock, flags);
0947                 /*
0948                  * pcpu_lock has been dropped, need to
0949                  * restart cpu_slot list walking.
0950                  */
0951                 goto restart;
0952             }
0953 
0954             off = pcpu_alloc_area(chunk, size, align, is_atomic,
0955                           &occ_pages);
0956             if (off >= 0)
0957                 goto area_found;
0958         }
0959     }
0960 
0961     spin_unlock_irqrestore(&pcpu_lock, flags);
0962 
0963     /*
0964      * No space left.  Create a new chunk.  We don't want multiple
0965      * tasks to create chunks simultaneously.  Serialize and create iff
0966      * there's still no empty chunk after grabbing the mutex.
0967      */
0968     if (is_atomic)
0969         goto fail;
0970 
0971     if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) {
0972         chunk = pcpu_create_chunk();
0973         if (!chunk) {
0974             err = "failed to allocate new chunk";
0975             goto fail;
0976         }
0977 
0978         spin_lock_irqsave(&pcpu_lock, flags);
0979         pcpu_chunk_relocate(chunk, -1);
0980     } else {
0981         spin_lock_irqsave(&pcpu_lock, flags);
0982     }
0983 
0984     goto restart;
0985 
0986 area_found:
0987     spin_unlock_irqrestore(&pcpu_lock, flags);
0988 
0989     /* populate if not all pages are already there */
0990     if (!is_atomic) {
0991         int page_start, page_end, rs, re;
0992 
0993         page_start = PFN_DOWN(off);
0994         page_end = PFN_UP(off + size);
0995 
0996         pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
0997             WARN_ON(chunk->immutable);
0998 
0999             ret = pcpu_populate_chunk(chunk, rs, re);
1000 
1001             spin_lock_irqsave(&pcpu_lock, flags);
1002             if (ret) {
1003                 pcpu_free_area(chunk, off, &occ_pages);
1004                 err = "failed to populate";
1005                 goto fail_unlock;
1006             }
1007             pcpu_chunk_populated(chunk, rs, re);
1008             spin_unlock_irqrestore(&pcpu_lock, flags);
1009         }
1010 
1011         mutex_unlock(&pcpu_alloc_mutex);
1012     }
1013 
1014     if (chunk != pcpu_reserved_chunk)
1015         pcpu_nr_empty_pop_pages -= occ_pages;
1016 
1017     if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW)
1018         pcpu_schedule_balance_work();
1019 
1020     /* clear the areas and return address relative to base address */
1021     for_each_possible_cpu(cpu)
1022         memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
1023 
1024     ptr = __addr_to_pcpu_ptr(chunk->base_addr + off);
1025     kmemleak_alloc_percpu(ptr, size, gfp);
1026     return ptr;
1027 
1028 fail_unlock:
1029     spin_unlock_irqrestore(&pcpu_lock, flags);
1030 fail:
1031     if (!is_atomic && warn_limit) {
1032         pr_warn("allocation failed, size=%zu align=%zu atomic=%d, %s\n",
1033             size, align, is_atomic, err);
1034         dump_stack();
1035         if (!--warn_limit)
1036             pr_info("limit reached, disable warning\n");
1037     }
1038     if (is_atomic) {
1039         /* see the flag handling in pcpu_balance_workfn() */
1040         pcpu_atomic_alloc_failed = true;
1041         pcpu_schedule_balance_work();
1042     } else {
1043         mutex_unlock(&pcpu_alloc_mutex);
1044     }
1045     return NULL;
1046 }
1047 
1048 /**
1049  * __alloc_percpu_gfp - allocate dynamic percpu area
1050  * @size: size of area to allocate in bytes
1051  * @align: alignment of area (max PAGE_SIZE)
1052  * @gfp: allocation flags
1053  *
1054  * Allocate zero-filled percpu area of @size bytes aligned at @align.  If
1055  * @gfp doesn't contain %GFP_KERNEL, the allocation doesn't block and can
1056  * be called from any context but is a lot more likely to fail.
1057  *
1058  * RETURNS:
1059  * Percpu pointer to the allocated area on success, NULL on failure.
1060  */
1061 void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp)
1062 {
1063     return pcpu_alloc(size, align, false, gfp);
1064 }
1065 EXPORT_SYMBOL_GPL(__alloc_percpu_gfp);
1066 
1067 /**
1068  * __alloc_percpu - allocate dynamic percpu area
1069  * @size: size of area to allocate in bytes
1070  * @align: alignment of area (max PAGE_SIZE)
1071  *
1072  * Equivalent to __alloc_percpu_gfp(size, align, %GFP_KERNEL).
1073  */
1074 void __percpu *__alloc_percpu(size_t size, size_t align)
1075 {
1076     return pcpu_alloc(size, align, false, GFP_KERNEL);
1077 }
1078 EXPORT_SYMBOL_GPL(__alloc_percpu);
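/*
 * [Editorial sketch] Typical use of the dynamic percpu API implemented
 * above, shown as a self-contained kernel-style snippet.  The counter
 * variable and the demo function are hypothetical; alloc_percpu(),
 * this_cpu_inc(), per_cpu_ptr(), for_each_possible_cpu() and free_percpu()
 * are the real interfaces.
 */
#include <linux/cpumask.h>
#include <linux/errno.h>
#include <linux/percpu.h>
#include <linux/printk.h>

static unsigned long __percpu *demo_counters;	/* hypothetical example state */

static int demo_use_percpu(void)
{
	unsigned long total = 0;
	int cpu;

	demo_counters = alloc_percpu(unsigned long);	/* GFP_KERNEL allocation */
	if (!demo_counters)
		return -ENOMEM;

	this_cpu_inc(*demo_counters);		/* fast path: no locking needed */

	for_each_possible_cpu(cpu)		/* slow path: fold all units */
		total += *per_cpu_ptr(demo_counters, cpu);

	pr_info("demo percpu total=%lu\n", total);

	free_percpu(demo_counters);
	return 0;
}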
1079 
1080 /**
1081  * __alloc_reserved_percpu - allocate reserved percpu area
1082  * @size: size of area to allocate in bytes
1083  * @align: alignment of area (max PAGE_SIZE)
1084  *
1085  * Allocate zero-filled percpu area of @size bytes aligned at @align
1086  * from reserved percpu area if arch has set it up; otherwise,
1087  * allocation is served from the same dynamic area.  Might sleep.
1088  * Might trigger writeouts.
1089  *
1090  * CONTEXT:
1091  * Does GFP_KERNEL allocation.
1092  *
1093  * RETURNS:
1094  * Percpu pointer to the allocated area on success, NULL on failure.
1095  */
1096 void __percpu *__alloc_reserved_percpu(size_t size, size_t align)
1097 {
1098     return pcpu_alloc(size, align, true, GFP_KERNEL);
1099 }
1100 
1101 /**
1102  * pcpu_balance_workfn - manage the amount of free chunks and populated pages
1103  * @work: unused
1104  *
1105  * Reclaim all fully free chunks except for the first one.
1106  */
1107 static void pcpu_balance_workfn(struct work_struct *work)
1108 {
1109     LIST_HEAD(to_free);
1110     struct list_head *free_head = &pcpu_slot[pcpu_nr_slots - 1];
1111     struct pcpu_chunk *chunk, *next;
1112     int slot, nr_to_pop, ret;
1113 
1114     /*
1115      * There's no reason to keep around multiple unused chunks and VM
1116      * areas can be scarce.  Destroy all free chunks except for one.
1117      */
1118     mutex_lock(&pcpu_alloc_mutex);
1119     spin_lock_irq(&pcpu_lock);
1120 
1121     list_for_each_entry_safe(chunk, next, free_head, list) {
1122         WARN_ON(chunk->immutable);
1123 
1124         /* spare the first one */
1125         if (chunk == list_first_entry(free_head, struct pcpu_chunk, list))
1126             continue;
1127 
1128         list_del_init(&chunk->map_extend_list);
1129         list_move(&chunk->list, &to_free);
1130     }
1131 
1132     spin_unlock_irq(&pcpu_lock);
1133 
1134     list_for_each_entry_safe(chunk, next, &to_free, list) {
1135         int rs, re;
1136 
1137         pcpu_for_each_pop_region(chunk, rs, re, 0, pcpu_unit_pages) {
1138             pcpu_depopulate_chunk(chunk, rs, re);
1139             spin_lock_irq(&pcpu_lock);
1140             pcpu_chunk_depopulated(chunk, rs, re);
1141             spin_unlock_irq(&pcpu_lock);
1142         }
1143         pcpu_destroy_chunk(chunk);
1144     }
1145 
1146     /* service chunks which requested async area map extension */
1147     do {
1148         int new_alloc = 0;
1149 
1150         spin_lock_irq(&pcpu_lock);
1151 
1152         chunk = list_first_entry_or_null(&pcpu_map_extend_chunks,
1153                     struct pcpu_chunk, map_extend_list);
1154         if (chunk) {
1155             list_del_init(&chunk->map_extend_list);
1156             new_alloc = pcpu_need_to_extend(chunk, false);
1157         }
1158 
1159         spin_unlock_irq(&pcpu_lock);
1160 
1161         if (new_alloc)
1162             pcpu_extend_area_map(chunk, new_alloc);
1163     } while (chunk);
1164 
1165     /*
1166      * Ensure there are a certain number of free populated pages for
1167      * atomic allocs.  Fill up from the most packed so that atomic
1168      * allocs don't increase fragmentation.  If atomic allocation
1169      * failed previously, always populate the maximum amount.  This
1170      * should prevent atomic allocs larger than PAGE_SIZE from keeping
1171      * failing indefinitely; however, large atomic allocs are not
1172      * something we support properly and can be highly unreliable and
1173      * inefficient.
1174      */
1175 retry_pop:
1176     if (pcpu_atomic_alloc_failed) {
1177         nr_to_pop = PCPU_EMPTY_POP_PAGES_HIGH;
1178         /* best effort anyway, don't worry about synchronization */
1179         pcpu_atomic_alloc_failed = false;
1180     } else {
1181         nr_to_pop = clamp(PCPU_EMPTY_POP_PAGES_HIGH -
1182                   pcpu_nr_empty_pop_pages,
1183                   0, PCPU_EMPTY_POP_PAGES_HIGH);
1184     }
1185 
1186     for (slot = pcpu_size_to_slot(PAGE_SIZE); slot < pcpu_nr_slots; slot++) {
1187         int nr_unpop = 0, rs, re;
1188 
1189         if (!nr_to_pop)
1190             break;
1191 
1192         spin_lock_irq(&pcpu_lock);
1193         list_for_each_entry(chunk, &pcpu_slot[slot], list) {
1194             nr_unpop = pcpu_unit_pages - chunk->nr_populated;
1195             if (nr_unpop)
1196                 break;
1197         }
1198         spin_unlock_irq(&pcpu_lock);
1199 
1200         if (!nr_unpop)
1201             continue;
1202 
1203         /* @chunk can't go away while pcpu_alloc_mutex is held */
1204         pcpu_for_each_unpop_region(chunk, rs, re, 0, pcpu_unit_pages) {
1205             int nr = min(re - rs, nr_to_pop);
1206 
1207             ret = pcpu_populate_chunk(chunk, rs, rs + nr);
1208             if (!ret) {
1209                 nr_to_pop -= nr;
1210                 spin_lock_irq(&pcpu_lock);
1211                 pcpu_chunk_populated(chunk, rs, rs + nr);
1212                 spin_unlock_irq(&pcpu_lock);
1213             } else {
1214                 nr_to_pop = 0;
1215             }
1216 
1217             if (!nr_to_pop)
1218                 break;
1219         }
1220     }
1221 
1222     if (nr_to_pop) {
1223         /* ran out of chunks to populate, create a new one and retry */
1224         chunk = pcpu_create_chunk();
1225         if (chunk) {
1226             spin_lock_irq(&pcpu_lock);
1227             pcpu_chunk_relocate(chunk, -1);
1228             spin_unlock_irq(&pcpu_lock);
1229             goto retry_pop;
1230         }
1231     }
1232 
1233     mutex_unlock(&pcpu_alloc_mutex);
1234 }
1235 
1236 /**
1237  * free_percpu - free percpu area
1238  * @ptr: pointer to area to free
1239  *
1240  * Free percpu area @ptr.
1241  *
1242  * CONTEXT:
1243  * Can be called from atomic context.
1244  */
1245 void free_percpu(void __percpu *ptr)
1246 {
1247     void *addr;
1248     struct pcpu_chunk *chunk;
1249     unsigned long flags;
1250     int off, occ_pages;
1251 
1252     if (!ptr)
1253         return;
1254 
1255     kmemleak_free_percpu(ptr);
1256 
1257     addr = __pcpu_ptr_to_addr(ptr);
1258 
1259     spin_lock_irqsave(&pcpu_lock, flags);
1260 
1261     chunk = pcpu_chunk_addr_search(addr);
1262     off = addr - chunk->base_addr;
1263 
1264     pcpu_free_area(chunk, off, &occ_pages);
1265 
1266     if (chunk != pcpu_reserved_chunk)
1267         pcpu_nr_empty_pop_pages += occ_pages;
1268 
1269     /* if there is more than one fully free chunk, wake up the grim reaper */
1270     if (chunk->free_size == pcpu_unit_size) {
1271         struct pcpu_chunk *pos;
1272 
1273         list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list)
1274             if (pos != chunk) {
1275                 pcpu_schedule_balance_work();
1276                 break;
1277             }
1278     }
1279 
1280     spin_unlock_irqrestore(&pcpu_lock, flags);
1281 }
1282 EXPORT_SYMBOL_GPL(free_percpu);
1283 
1284 /**
1285  * is_kernel_percpu_address - test whether address is from static percpu area
1286  * @addr: address to test
1287  *
1288  * Test whether @addr belongs to in-kernel static percpu area.  Module
1289  * static percpu areas are not considered.  For those, use
1290  * is_module_percpu_address().
1291  *
1292  * RETURNS:
1293  * %true if @addr is from in-kernel static percpu area, %false otherwise.
1294  */
1295 bool is_kernel_percpu_address(unsigned long addr)
1296 {
1297 #ifdef CONFIG_SMP
1298     const size_t static_size = __per_cpu_end - __per_cpu_start;
1299     void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
1300     unsigned int cpu;
1301 
1302     for_each_possible_cpu(cpu) {
1303         void *start = per_cpu_ptr(base, cpu);
1304 
1305         if ((void *)addr >= start && (void *)addr < start + static_size)
1306             return true;
1307     }
1308 #endif
1309     /* on UP, can't distinguish from other static vars, always false */
1310     return false;
1311 }
1312 
1313 /**
1314  * per_cpu_ptr_to_phys - convert translated percpu address to physical address
1315  * @addr: the address to be converted to physical address
1316  *
1317  * Given @addr which is a dereferenceable address obtained via one of
1318  * the percpu access macros, this function translates it into its physical
1319  * address.  The caller is responsible for ensuring @addr stays valid
1320  * until this function finishes.
1321  *
1322  * The percpu allocator has a special setup for the first chunk, which
1323  * currently supports either embedding in the linear address space or a
1324  * vmalloc mapping; from the second chunk on, the backing allocator
1325  * (currently either vm or km) provides the translation.
1326  *
1327  * The addr could be translated simply without checking whether it falls
1328  * into the first chunk, but the current code better reflects how the
1329  * percpu allocator actually works, and the verification can discover bugs
1330  * both in the percpu allocator itself and in per_cpu_ptr_to_phys()
1331  * callers.  So we keep the current code.
1332  *
1333  * RETURNS:
1334  * The physical address for @addr.
1335  */
1336 phys_addr_t per_cpu_ptr_to_phys(void *addr)
1337 {
1338     void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
1339     bool in_first_chunk = false;
1340     unsigned long first_low, first_high;
1341     unsigned int cpu;
1342 
1343     /*
1344      * The following test on unit_low/high isn't strictly
1345      * necessary but will speed up lookups of addresses which
1346      * aren't in the first chunk.
1347      */
1348     first_low = pcpu_chunk_addr(pcpu_first_chunk, pcpu_low_unit_cpu, 0);
1349     first_high = pcpu_chunk_addr(pcpu_first_chunk, pcpu_high_unit_cpu,
1350                      pcpu_unit_pages);
1351     if ((unsigned long)addr >= first_low &&
1352         (unsigned long)addr < first_high) {
1353         for_each_possible_cpu(cpu) {
1354             void *start = per_cpu_ptr(base, cpu);
1355 
1356             if (addr >= start && addr < start + pcpu_unit_size) {
1357                 in_first_chunk = true;
1358                 break;
1359             }
1360         }
1361     }
1362 
1363     if (in_first_chunk) {
1364         if (!is_vmalloc_addr(addr))
1365             return __pa(addr);
1366         else
1367             return page_to_phys(vmalloc_to_page(addr)) +
1368                    offset_in_page(addr);
1369     } else
1370         return page_to_phys(pcpu_addr_to_page(addr)) +
1371                offset_in_page(addr);
1372 }
1373 
1374 /**
1375  * pcpu_alloc_alloc_info - allocate percpu allocation info
1376  * @nr_groups: the number of groups
1377  * @nr_units: the number of units
1378  *
1379  * Allocate ai which is large enough for @nr_groups groups containing
1380  * @nr_units units.  The returned ai's groups[0].cpu_map points to the
1381  * cpu_map array which is long enough for @nr_units and filled with
1382  * NR_CPUS.  It's the caller's responsibility to initialize the cpu_map
1383  * pointers of the other groups.
1384  *
1385  * RETURNS:
1386  * Pointer to the allocated pcpu_alloc_info on success, NULL on
1387  * failure.
1388  */
1389 struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,
1390                               int nr_units)
1391 {
1392     struct pcpu_alloc_info *ai;
1393     size_t base_size, ai_size;
1394     void *ptr;
1395     int unit;
1396 
1397     base_size = ALIGN(sizeof(*ai) + nr_groups * sizeof(ai->groups[0]),
1398               __alignof__(ai->groups[0].cpu_map[0]));
1399     ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]);
1400 
1401     ptr = memblock_virt_alloc_nopanic(PFN_ALIGN(ai_size), 0);
1402     if (!ptr)
1403         return NULL;
1404     ai = ptr;
1405     ptr += base_size;
1406 
1407     ai->groups[0].cpu_map = ptr;
1408 
1409     for (unit = 0; unit < nr_units; unit++)
1410         ai->groups[0].cpu_map[unit] = NR_CPUS;
1411 
1412     ai->nr_groups = nr_groups;
1413     ai->__ai_size = PFN_ALIGN(ai_size);
1414 
1415     return ai;
1416 }
1417 
1418 /**
1419  * pcpu_free_alloc_info - free percpu allocation info
1420  * @ai: pcpu_alloc_info to free
1421  *
1422  * Free @ai which was allocated by pcpu_alloc_alloc_info().
1423  */
1424 void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai)
1425 {
1426     memblock_free_early(__pa(ai), ai->__ai_size);
1427 }
1428 
1429 /**
1430  * pcpu_dump_alloc_info - print out information about pcpu_alloc_info
1431  * @lvl: loglevel
1432  * @ai: allocation info to dump
1433  *
1434  * Print out information about @ai using loglevel @lvl.
1435  */
1436 static void pcpu_dump_alloc_info(const char *lvl,
1437                  const struct pcpu_alloc_info *ai)
1438 {
1439     int group_width = 1, cpu_width = 1, width;
1440     char empty_str[] = "--------";
1441     int alloc = 0, alloc_end = 0;
1442     int group, v;
1443     int upa, apl;   /* units per alloc, allocs per line */
1444 
1445     v = ai->nr_groups;
1446     while (v /= 10)
1447         group_width++;
1448 
1449     v = num_possible_cpus();
1450     while (v /= 10)
1451         cpu_width++;
1452     empty_str[min_t(int, cpu_width, sizeof(empty_str) - 1)] = '\0';
1453 
1454     upa = ai->alloc_size / ai->unit_size;
1455     width = upa * (cpu_width + 1) + group_width + 3;
1456     apl = rounddown_pow_of_two(max(60 / width, 1));
1457 
1458     printk("%spcpu-alloc: s%zu r%zu d%zu u%zu alloc=%zu*%zu",
1459            lvl, ai->static_size, ai->reserved_size, ai->dyn_size,
1460            ai->unit_size, ai->alloc_size / ai->atom_size, ai->atom_size);
1461 
1462     for (group = 0; group < ai->nr_groups; group++) {
1463         const struct pcpu_group_info *gi = &ai->groups[group];
1464         int unit = 0, unit_end = 0;
1465 
1466         BUG_ON(gi->nr_units % upa);
1467         for (alloc_end += gi->nr_units / upa;
1468              alloc < alloc_end; alloc++) {
1469             if (!(alloc % apl)) {
1470                 pr_cont("\n");
1471                 printk("%spcpu-alloc: ", lvl);
1472             }
1473             pr_cont("[%0*d] ", group_width, group);
1474 
1475             for (unit_end += upa; unit < unit_end; unit++)
1476                 if (gi->cpu_map[unit] != NR_CPUS)
1477                     pr_cont("%0*d ",
1478                         cpu_width, gi->cpu_map[unit]);
1479                 else
1480                     pr_cont("%s ", empty_str);
1481         }
1482     }
1483     pr_cont("\n");
1484 }
1485 
1486 /**
1487  * pcpu_setup_first_chunk - initialize the first percpu chunk
1488  * @ai: pcpu_alloc_info describing how the percpu area is shaped
1489  * @base_addr: mapped address
1490  *
1491  * Initialize the first percpu chunk which contains the kernel static
1492  * percpu area.  This function is to be called from arch percpu area
1493  * setup path.
1494  *
1495  * @ai contains all information necessary to initialize the first
1496  * chunk and prime the dynamic percpu allocator.
1497  *
1498  * @ai->static_size is the size of static percpu area.
1499  *
1500  * @ai->reserved_size, if non-zero, specifies the amount of bytes to
1501  * reserve after the static area in the first chunk.  This reserves
1502  * the first chunk such that it's available only through reserved
1503  * percpu allocation.  This is primarily used to serve module percpu
1504  * static areas on architectures where the addressing model has
1505  * limited offset range for symbol relocations to guarantee module
1506  * percpu symbols fall inside the relocatable range.
1507  *
1508  * @ai->dyn_size determines the number of bytes available for dynamic
1509  * allocation in the first chunk.  The area between @ai->static_size +
1510  * @ai->reserved_size + @ai->dyn_size and @ai->unit_size is unused.
1511  *
1512  * @ai->unit_size specifies unit size and must be aligned to PAGE_SIZE
1513  * and equal to or larger than @ai->static_size + @ai->reserved_size +
1514  * @ai->dyn_size.
1515  *
1516  * @ai->atom_size is the allocation atom size and used as alignment
1517  * for vm areas.
1518  *
1519  * @ai->alloc_size is the allocation size and always multiple of
1520  * @ai->atom_size.  This is larger than @ai->atom_size if
1521  * @ai->unit_size is larger than @ai->atom_size.
1522  *
1523  * @ai->nr_groups and @ai->groups describe virtual memory layout of
1524  * percpu areas.  Units which should be colocated are put into the
1525  * same group.  Dynamic VM areas will be allocated according to these
1526  * groupings.  If @ai->nr_groups is zero, a single group containing
1527  * all units is assumed.
1528  *
1529  * The caller should have mapped the first chunk at @base_addr and
1530  * copied static data to each unit.
1531  *
1532  * If the first chunk ends up with both reserved and dynamic areas, it
1533  * is served by two chunks - one to serve the core static and reserved
1534  * areas and the other for the dynamic area.  They share the same vm
1535  * and page map but use different area allocation maps to stay away
1536  * from each other.  The latter chunk is circulated in the chunk slots
1537  * and is available for dynamic allocation like any other chunk.
1538  *
1539  * RETURNS:
1540  * 0 on success, -errno on failure.
1541  */
1542 int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
1543                   void *base_addr)
1544 {
1545     static int smap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata;
1546     static int dmap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata;
1547     size_t dyn_size = ai->dyn_size;
1548     size_t size_sum = ai->static_size + ai->reserved_size + dyn_size;
1549     struct pcpu_chunk *schunk, *dchunk = NULL;
1550     unsigned long *group_offsets;
1551     size_t *group_sizes;
1552     unsigned long *unit_off;
1553     unsigned int cpu;
1554     int *unit_map;
1555     int group, unit, i;
1556 
1557 #define PCPU_SETUP_BUG_ON(cond) do {                    \
1558     if (unlikely(cond)) {                       \
1559         pr_emerg("failed to initialize, %s\n", #cond);      \
1560         pr_emerg("cpu_possible_mask=%*pb\n",            \
1561              cpumask_pr_args(cpu_possible_mask));       \
1562         pcpu_dump_alloc_info(KERN_EMERG, ai);           \
1563         BUG();                          \
1564     }                               \
1565 } while (0)
1566 
1567     /* sanity checks */
1568     PCPU_SETUP_BUG_ON(ai->nr_groups <= 0);
1569 #ifdef CONFIG_SMP
1570     PCPU_SETUP_BUG_ON(!ai->static_size);
1571     PCPU_SETUP_BUG_ON(offset_in_page(__per_cpu_start));
1572 #endif
1573     PCPU_SETUP_BUG_ON(!base_addr);
1574     PCPU_SETUP_BUG_ON(offset_in_page(base_addr));
1575     PCPU_SETUP_BUG_ON(ai->unit_size < size_sum);
1576     PCPU_SETUP_BUG_ON(offset_in_page(ai->unit_size));
1577     PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE);
1578     PCPU_SETUP_BUG_ON(ai->dyn_size < PERCPU_DYNAMIC_EARLY_SIZE);
1579     PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0);
1580 
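    /*
     * Illustrative sketch (sizes not to scale, not derived from any
     * particular arch) of a single unit in the first chunk as implied
     * by the checks above:
     *
     *  | static area | reserved area (optional) | dynamic area | unused |
     *  0        static_size       +reserved_size      +dyn_size  unit_size
     *
     * @ai->unit_size is page aligned and at least the sum of the first
     * three areas; anything between that sum and unit_size stays unused.
     */
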
1581     /* process group information and build config tables accordingly */
1582     group_offsets = memblock_virt_alloc(ai->nr_groups *
1583                          sizeof(group_offsets[0]), 0);
1584     group_sizes = memblock_virt_alloc(ai->nr_groups *
1585                        sizeof(group_sizes[0]), 0);
1586     unit_map = memblock_virt_alloc(nr_cpu_ids * sizeof(unit_map[0]), 0);
1587     unit_off = memblock_virt_alloc(nr_cpu_ids * sizeof(unit_off[0]), 0);
1588 
1589     for (cpu = 0; cpu < nr_cpu_ids; cpu++)
1590         unit_map[cpu] = UINT_MAX;
1591 
1592     pcpu_low_unit_cpu = NR_CPUS;
1593     pcpu_high_unit_cpu = NR_CPUS;
1594 
1595     for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) {
1596         const struct pcpu_group_info *gi = &ai->groups[group];
1597 
1598         group_offsets[group] = gi->base_offset;
1599         group_sizes[group] = gi->nr_units * ai->unit_size;
1600 
1601         for (i = 0; i < gi->nr_units; i++) {
1602             cpu = gi->cpu_map[i];
1603             if (cpu == NR_CPUS)
1604                 continue;
1605 
1606             PCPU_SETUP_BUG_ON(cpu >= nr_cpu_ids);
1607             PCPU_SETUP_BUG_ON(!cpu_possible(cpu));
1608             PCPU_SETUP_BUG_ON(unit_map[cpu] != UINT_MAX);
1609 
1610             unit_map[cpu] = unit + i;
1611             unit_off[cpu] = gi->base_offset + i * ai->unit_size;
1612 
1613             /* determine low/high unit_cpu */
1614             if (pcpu_low_unit_cpu == NR_CPUS ||
1615                 unit_off[cpu] < unit_off[pcpu_low_unit_cpu])
1616                 pcpu_low_unit_cpu = cpu;
1617             if (pcpu_high_unit_cpu == NR_CPUS ||
1618                 unit_off[cpu] > unit_off[pcpu_high_unit_cpu])
1619                 pcpu_high_unit_cpu = cpu;
1620         }
1621     }
1622     pcpu_nr_units = unit;
1623 
1624     for_each_possible_cpu(cpu)
1625         PCPU_SETUP_BUG_ON(unit_map[cpu] == UINT_MAX);
1626 
1627     /* we're done parsing the input, undefine BUG macro and dump config */
1628 #undef PCPU_SETUP_BUG_ON
1629     pcpu_dump_alloc_info(KERN_DEBUG, ai);
1630 
1631     pcpu_nr_groups = ai->nr_groups;
1632     pcpu_group_offsets = group_offsets;
1633     pcpu_group_sizes = group_sizes;
1634     pcpu_unit_map = unit_map;
1635     pcpu_unit_offsets = unit_off;
1636 
1637     /* determine basic parameters */
1638     pcpu_unit_pages = ai->unit_size >> PAGE_SHIFT;
1639     pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
1640     pcpu_atom_size = ai->atom_size;
1641     pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) +
1642         BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long);
1643 
1644     /*
1645      * Allocate chunk slots.  The additional last slot is for
1646      * empty chunks.
1647      */
1648     pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2;
1649     pcpu_slot = memblock_virt_alloc(
1650             pcpu_nr_slots * sizeof(pcpu_slot[0]), 0);
1651     for (i = 0; i < pcpu_nr_slots; i++)
1652         INIT_LIST_HEAD(&pcpu_slot[i]);
1653 
1654     /*
1655      * Initialize static chunk.  If reserved_size is zero, the
1656      * static chunk covers static area + dynamic allocation area
1657      * in the first chunk.  If reserved_size is not zero, it
1658      * covers static area + reserved area (mostly used for module
1659      * static percpu allocation).
1660      */
1661     schunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0);
1662     INIT_LIST_HEAD(&schunk->list);
1663     INIT_LIST_HEAD(&schunk->map_extend_list);
1664     schunk->base_addr = base_addr;
1665     schunk->map = smap;
1666     schunk->map_alloc = ARRAY_SIZE(smap);
1667     schunk->immutable = true;
1668     bitmap_fill(schunk->populated, pcpu_unit_pages);
1669     schunk->nr_populated = pcpu_unit_pages;
1670 
1671     if (ai->reserved_size) {
1672         schunk->free_size = ai->reserved_size;
1673         pcpu_reserved_chunk = schunk;
1674         pcpu_reserved_chunk_limit = ai->static_size + ai->reserved_size;
1675     } else {
1676         schunk->free_size = dyn_size;
1677         dyn_size = 0;           /* dynamic area covered */
1678     }
1679     schunk->contig_hint = schunk->free_size;
1680 
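    /*
     * The map entries below use the offset encoding: each entry is the
     * byte offset at which an area starts, with bit 0 set if that area
     * is in use; map[map_used] is the end-of-chunk marker and always
     * has the in-use bit set.  As a sketch with made-up sizes,
     * static_size = 64K and reserved_size = 8K would give
     *
     *  map[0] = 0x00000 | 1   (static area, in use)
     *  map[1] = 0x10000       (reserved area, free)
     *  map[2] = 0x12000 | 1   (end of chunk)
     *
     * and map_used = 2.
     */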
1681     schunk->map[0] = 1;
1682     schunk->map[1] = ai->static_size;
1683     schunk->map_used = 1;
1684     if (schunk->free_size)
1685         schunk->map[++schunk->map_used] = ai->static_size + schunk->free_size;
1686     schunk->map[schunk->map_used] |= 1;
1687 
1688     /* init dynamic chunk if necessary */
1689     if (dyn_size) {
1690         dchunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0);
1691         INIT_LIST_HEAD(&dchunk->list);
1692         INIT_LIST_HEAD(&dchunk->map_extend_list);
1693         dchunk->base_addr = base_addr;
1694         dchunk->map = dmap;
1695         dchunk->map_alloc = ARRAY_SIZE(dmap);
1696         dchunk->immutable = true;
1697         bitmap_fill(dchunk->populated, pcpu_unit_pages);
1698         dchunk->nr_populated = pcpu_unit_pages;
1699 
1700         dchunk->contig_hint = dchunk->free_size = dyn_size;
1701         dchunk->map[0] = 1;
1702         dchunk->map[1] = pcpu_reserved_chunk_limit;
1703         dchunk->map[2] = (pcpu_reserved_chunk_limit + dchunk->free_size) | 1;
1704         dchunk->map_used = 2;
1705     }
1706 
1707     /* link the first chunk in */
1708     pcpu_first_chunk = dchunk ?: schunk;
1709     pcpu_nr_empty_pop_pages +=
1710         pcpu_count_occupied_pages(pcpu_first_chunk, 1);
1711     pcpu_chunk_relocate(pcpu_first_chunk, -1);
1712 
1713     /* we're done */
1714     pcpu_base_addr = base_addr;
1715     return 0;
1716 }
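
/*
 * Rough sketch (placeholder values, not a verbatim arch example) of the
 * sequence an arch setup path performs around pcpu_setup_first_chunk();
 * pcpu_embed_first_chunk() and pcpu_page_first_chunk() below wrap
 * exactly this:
 *
 *      ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size, NULL);
 *      base = ...allocate and map all units at their group offsets...;
 *      for each unit actually backing a cpu:
 *              memcpy(unit_addr, __per_cpu_load, ai->static_size);
 *      rc = pcpu_setup_first_chunk(ai, base);
 *      pcpu_free_alloc_info(ai);
 */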
1717 
1718 #ifdef CONFIG_SMP
1719 
1720 const char * const pcpu_fc_names[PCPU_FC_NR] __initconst = {
1721     [PCPU_FC_AUTO]  = "auto",
1722     [PCPU_FC_EMBED] = "embed",
1723     [PCPU_FC_PAGE]  = "page",
1724 };
1725 
1726 enum pcpu_fc pcpu_chosen_fc __initdata = PCPU_FC_AUTO;
1727 
1728 static int __init percpu_alloc_setup(char *str)
1729 {
1730     if (!str)
1731         return -EINVAL;
1732 
1733     if (0)
1734         /* nada */;
1735 #ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK
1736     else if (!strcmp(str, "embed"))
1737         pcpu_chosen_fc = PCPU_FC_EMBED;
1738 #endif
1739 #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
1740     else if (!strcmp(str, "page"))
1741         pcpu_chosen_fc = PCPU_FC_PAGE;
1742 #endif
1743     else
1744         pr_warn("unknown allocator %s specified\n", str);
1745 
1746     return 0;
1747 }
1748 early_param("percpu_alloc", percpu_alloc_setup);
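
/*
 * For illustration, the first chunk allocator can be overridden on the
 * kernel command line via the parameter parsed above, e.g.
 *
 *      percpu_alloc=embed
 *      percpu_alloc=page
 *
 * subject to the CONFIG_NEED_PER_CPU_{EMBED,PAGE}_FIRST_CHUNK options.
 */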
1749 
1750 /*
1751  * pcpu_embed_first_chunk() is used by the generic percpu setup.
1752  * Build it if it is needed by the arch config or if the generic setup
1753  * is going to be used.
1754  */
1755 #if defined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK) || \
1756     !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA)
1757 #define BUILD_EMBED_FIRST_CHUNK
1758 #endif
1759 
1760 /* build pcpu_page_first_chunk() iff needed by the arch config */
1761 #if defined(CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK)
1762 #define BUILD_PAGE_FIRST_CHUNK
1763 #endif
1764 
1765 /* pcpu_build_alloc_info() is used by both embed and page first chunk */
1766 #if defined(BUILD_EMBED_FIRST_CHUNK) || defined(BUILD_PAGE_FIRST_CHUNK)
1767 /**
1768  * pcpu_build_alloc_info - build alloc_info considering distances between CPUs
1769  * @reserved_size: the size of reserved percpu area in bytes
1770  * @dyn_size: minimum free size for dynamic allocation in bytes
1771  * @atom_size: allocation atom size
1772  * @cpu_distance_fn: callback to determine distance between cpus, optional
1773  *
1774  * This function determines grouping of units, their mappings to cpus
1775  * and other parameters considering needed percpu size, allocation
1776  * atom size and distances between CPUs.
1777  *
1778  * Group sizes are always multiples of the atom size, and CPUs which
1779  * are within LOCAL_DISTANCE of each other both ways are grouped together
1780  * and share space for units in the same group.  The returned
1781  * configuration is guaranteed to place CPUs on different nodes in
1782  * different groups and to use >=75% of the allocated virtual address space.
1783  *
1784  * RETURNS:
1785  * On success, pointer to the new allocation_info is returned.  On
1786  * failure, ERR_PTR value is returned.
1787  */
1788 static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
1789                 size_t reserved_size, size_t dyn_size,
1790                 size_t atom_size,
1791                 pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
1792 {
1793     static int group_map[NR_CPUS] __initdata;
1794     static int group_cnt[NR_CPUS] __initdata;
1795     const size_t static_size = __per_cpu_end - __per_cpu_start;
1796     int nr_groups = 1, nr_units = 0;
1797     size_t size_sum, min_unit_size, alloc_size;
1798     int upa, max_upa, uninitialized_var(best_upa);  /* units_per_alloc */
1799     int last_allocs, group, unit;
1800     unsigned int cpu, tcpu;
1801     struct pcpu_alloc_info *ai;
1802     unsigned int *cpu_map;
1803 
1804     /* this function may be called multiple times */
1805     memset(group_map, 0, sizeof(group_map));
1806     memset(group_cnt, 0, sizeof(group_cnt));
1807 
1808     /* calculate size_sum and ensure dyn_size is enough for early alloc */
1809     size_sum = PFN_ALIGN(static_size + reserved_size +
1810                 max_t(size_t, dyn_size, PERCPU_DYNAMIC_EARLY_SIZE));
1811     dyn_size = size_sum - static_size - reserved_size;
1812 
1813     /*
1814      * Determine min_unit_size, alloc_size and max_upa such that
1815      * alloc_size is a multiple of atom_size and is the smallest size
1816      * which can accommodate page-aligned segments which are equal to
1817      * or larger than min_unit_size.
1818      */
1819     min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
1820 
1821     alloc_size = roundup(min_unit_size, atom_size);
1822     upa = alloc_size / min_unit_size;
1823     while (alloc_size % upa || (offset_in_page(alloc_size / upa)))
1824         upa--;
1825     max_upa = upa;
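
    /*
     * Worked example with made-up numbers (4K pages assumed): for
     * min_unit_size = 192K and atom_size = 2M, alloc_size becomes 2M
     * and upa starts at 2M / 192K = 10; 10 and 9 are rejected because
     * 2M is not divisible by them, so max_upa ends up as 8, i.e. eight
     * 256K units per 2M allocation.
     */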
1826 
1827     /* group cpus according to their proximity */
1828     for_each_possible_cpu(cpu) {
1829         group = 0;
1830     next_group:
1831         for_each_possible_cpu(tcpu) {
1832             if (cpu == tcpu)
1833                 break;
1834             if (group_map[tcpu] == group && cpu_distance_fn &&
1835                 (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE ||
1836                  cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) {
1837                 group++;
1838                 nr_groups = max(nr_groups, group + 1);
1839                 goto next_group;
1840             }
1841         }
1842         group_map[cpu] = group;
1843         group_cnt[group]++;
1844     }
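
    /*
     * For instance, on a hypothetical two-node machine with CPUs 0-3 on
     * one node, CPUs 4-7 on the other and @cpu_distance_fn reporting
     * > LOCAL_DISTANCE across the nodes, the loop above yields
     * group_map[0..7] = {0,0,0,0,1,1,1,1}, group_cnt = {4,4} and
     * nr_groups = 2.
     */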
1845 
1846     /*
1847      * Expand unit size until address space usage goes over 75%
1848      * and then as much as possible without using more address
1849      * space.
1850      */
1851     last_allocs = INT_MAX;
1852     for (upa = max_upa; upa; upa--) {
1853         int allocs = 0, wasted = 0;
1854 
1855         if (alloc_size % upa || (offset_in_page(alloc_size / upa)))
1856             continue;
1857 
1858         for (group = 0; group < nr_groups; group++) {
1859             int this_allocs = DIV_ROUND_UP(group_cnt[group], upa);
1860             allocs += this_allocs;
1861             wasted += this_allocs * upa - group_cnt[group];
1862         }
1863 
1864         /*
1865          * Don't accept if wastage is over 1/3 of the possible CPUs.  The
1866          * greater-than comparison ensures upa==1 always
1867          * passes the following check.
1868          */
1869         if (wasted > num_possible_cpus() / 3)
1870             continue;
1871 
1872         /* and then don't consume more memory */
1873         if (allocs > last_allocs)
1874             break;
1875         last_allocs = allocs;
1876         best_upa = upa;
1877     }
1878     upa = best_upa;
1879 
1880     /* allocate and fill alloc_info */
1881     for (group = 0; group < nr_groups; group++)
1882         nr_units += roundup(group_cnt[group], upa);
1883 
1884     ai = pcpu_alloc_alloc_info(nr_groups, nr_units);
1885     if (!ai)
1886         return ERR_PTR(-ENOMEM);
1887     cpu_map = ai->groups[0].cpu_map;
1888 
1889     for (group = 0; group < nr_groups; group++) {
1890         ai->groups[group].cpu_map = cpu_map;
1891         cpu_map += roundup(group_cnt[group], upa);
1892     }
1893 
1894     ai->static_size = static_size;
1895     ai->reserved_size = reserved_size;
1896     ai->dyn_size = dyn_size;
1897     ai->unit_size = alloc_size / upa;
1898     ai->atom_size = atom_size;
1899     ai->alloc_size = alloc_size;
1900 
1901     for (group = 0, unit = 0; group_cnt[group]; group++) {
1902         struct pcpu_group_info *gi = &ai->groups[group];
1903 
1904         /*
1905          * Initialize base_offset as if all groups are located
1906          * back-to-back.  The caller should update this to
1907          * reflect actual allocation.
1908          */
1909         gi->base_offset = unit * ai->unit_size;
1910 
1911         for_each_possible_cpu(cpu)
1912             if (group_map[cpu] == group)
1913                 gi->cpu_map[gi->nr_units++] = cpu;
1914         gi->nr_units = roundup(gi->nr_units, upa);
1915         unit += gi->nr_units;
1916     }
1917     BUG_ON(unit != nr_units);
1918 
1919     return ai;
1920 }
1921 #endif /* BUILD_EMBED_FIRST_CHUNK || BUILD_PAGE_FIRST_CHUNK */
1922 
1923 #if defined(BUILD_EMBED_FIRST_CHUNK)
1924 /**
1925  * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem
1926  * @reserved_size: the size of reserved percpu area in bytes
1927  * @dyn_size: minimum free size for dynamic allocation in bytes
1928  * @atom_size: allocation atom size
1929  * @cpu_distance_fn: callback to determine distance between cpus, optional
1930  * @alloc_fn: function to allocate percpu page
1931  * @free_fn: function to free percpu page
1932  *
1933  * This is a helper to ease setting up embedded first percpu chunk and
1934  * can be called where pcpu_setup_first_chunk() is expected.
1935  *
1936  * If this function is used to setup the first chunk, it is allocated
1937  * by calling @alloc_fn and used as-is without being mapped into
1938  * vmalloc area.  Allocations are always whole multiples of @atom_size
1939  * aligned to @atom_size.
1940  *
1941  * This enables the first chunk to piggy back on the linear physical
1942  * mapping which often uses larger page size.  Please note that this
1943  * can result in very sparse cpu->unit mapping on NUMA machines thus
1944  * requiring large vmalloc address space.  Don't use this allocator if
1945  * vmalloc space is not orders of magnitude larger than distances
1946  * between node memory addresses (ie. 32bit NUMA machines).
1947  *
1948  * @dyn_size specifies the minimum dynamic area size.
1949  *
1950  * If the needed size is smaller than the minimum or the specified unit
1951  * size, the leftover at the end of each unit is returned using @free_fn.
1952  *
1953  * RETURNS:
1954  * 0 on success, -errno on failure.
1955  */
1956 int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
1957                   size_t atom_size,
1958                   pcpu_fc_cpu_distance_fn_t cpu_distance_fn,
1959                   pcpu_fc_alloc_fn_t alloc_fn,
1960                   pcpu_fc_free_fn_t free_fn)
1961 {
1962     void *base = (void *)ULONG_MAX;
1963     void **areas = NULL;
1964     struct pcpu_alloc_info *ai;
1965     size_t size_sum, areas_size;
1966     unsigned long max_distance;
1967     int group, i, highest_group, rc;
1968 
1969     ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size,
1970                    cpu_distance_fn);
1971     if (IS_ERR(ai))
1972         return PTR_ERR(ai);
1973 
1974     size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
1975     areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *));
1976 
1977     areas = memblock_virt_alloc_nopanic(areas_size, 0);
1978     if (!areas) {
1979         rc = -ENOMEM;
1980         goto out_free;
1981     }
1982 
1983     /* allocate, copy and determine base address & max_distance */
1984     highest_group = 0;
1985     for (group = 0; group < ai->nr_groups; group++) {
1986         struct pcpu_group_info *gi = &ai->groups[group];
1987         unsigned int cpu = NR_CPUS;
1988         void *ptr;
1989 
1990         for (i = 0; i < gi->nr_units && cpu == NR_CPUS; i++)
1991             cpu = gi->cpu_map[i];
1992         BUG_ON(cpu == NR_CPUS);
1993 
1994         /* allocate space for the whole group */
1995         ptr = alloc_fn(cpu, gi->nr_units * ai->unit_size, atom_size);
1996         if (!ptr) {
1997             rc = -ENOMEM;
1998             goto out_free_areas;
1999         }
2000         /* kmemleak tracks the percpu allocations separately */
2001         kmemleak_free(ptr);
2002         areas[group] = ptr;
2003 
2004         base = min(ptr, base);
2005         if (ptr > areas[highest_group])
2006             highest_group = group;
2007     }
2008     max_distance = areas[highest_group] - base;
2009     max_distance += ai->unit_size * ai->groups[highest_group].nr_units;
2010 
2011     /* warn if the maximum distance is more than 75% of vmalloc space */
2012     if (max_distance > VMALLOC_TOTAL * 3 / 4) {
2013         pr_warn("max_distance=0x%lx too large for vmalloc space 0x%lx\n",
2014                 max_distance, VMALLOC_TOTAL);
2015 #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
2016         /* and fail if we have fallback */
2017         rc = -EINVAL;
2018         goto out_free_areas;
2019 #endif
2020     }
2021 
2022     /*
2023      * Copy data and free unused parts.  This should happen after all
2024      * allocations are complete; otherwise, we may end up with
2025      * overlapping groups.
2026      */
2027     for (group = 0; group < ai->nr_groups; group++) {
2028         struct pcpu_group_info *gi = &ai->groups[group];
2029         void *ptr = areas[group];
2030 
2031         for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) {
2032             if (gi->cpu_map[i] == NR_CPUS) {
2033                 /* unused unit, free whole */
2034                 free_fn(ptr, ai->unit_size);
2035                 continue;
2036             }
2037             /* copy and return the unused part */
2038             memcpy(ptr, __per_cpu_load, ai->static_size);
2039             free_fn(ptr + size_sum, ai->unit_size - size_sum);
2040         }
2041     }
2042 
2043     /* base address is now known, determine group base offsets */
2044     for (group = 0; group < ai->nr_groups; group++) {
2045         ai->groups[group].base_offset = areas[group] - base;
2046     }
2047 
2048     pr_info("Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n",
2049         PFN_DOWN(size_sum), base, ai->static_size, ai->reserved_size,
2050         ai->dyn_size, ai->unit_size);
2051 
2052     rc = pcpu_setup_first_chunk(ai, base);
2053     goto out_free;
2054 
2055 out_free_areas:
2056     for (group = 0; group < ai->nr_groups; group++)
2057         if (areas[group])
2058             free_fn(areas[group],
2059                 ai->groups[group].nr_units * ai->unit_size);
2060 out_free:
2061     pcpu_free_alloc_info(ai);
2062     if (areas)
2063         memblock_free_early(__pa(areas), areas_size);
2064     return rc;
2065 }
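
/*
 * Rough example (my_cpu_distance, my_alloc, my_free and my_populate_pte
 * are hypothetical arch helpers, not defined here) of how a NUMA-aware
 * arch might drive this helper from its own setup_per_cpu_areas(); the
 * generic version near the end of this file is a concrete caller using
 * PAGE_SIZE as the atom size and NULL for @cpu_distance_fn:
 *
 *      rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
 *                                  PERCPU_DYNAMIC_RESERVE, PMD_SIZE,
 *                                  my_cpu_distance, my_alloc, my_free);
 *      if (rc < 0)
 *              rc = pcpu_page_first_chunk(PERCPU_MODULE_RESERVE,
 *                                         my_alloc, my_free,
 *                                         my_populate_pte);
 */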
2066 #endif /* BUILD_EMBED_FIRST_CHUNK */
2067 
2068 #ifdef BUILD_PAGE_FIRST_CHUNK
2069 /**
2070  * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages
2071  * @reserved_size: the size of reserved percpu area in bytes
2072  * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE
2073  * @free_fn: function to free percpu page, always called with PAGE_SIZE
2074  * @populate_pte_fn: function to populate pte
2075  *
2076  * This is a helper to ease setting up page-remapped first percpu
2077  * chunk and can be called where pcpu_setup_first_chunk() is expected.
2078  *
2079  * This is the basic allocator.  Static percpu area is allocated
2080  * page-by-page into vmalloc area.
2081  *
2082  * RETURNS:
2083  * 0 on success, -errno on failure.
2084  */
2085 int __init pcpu_page_first_chunk(size_t reserved_size,
2086                  pcpu_fc_alloc_fn_t alloc_fn,
2087                  pcpu_fc_free_fn_t free_fn,
2088                  pcpu_fc_populate_pte_fn_t populate_pte_fn)
2089 {
2090     static struct vm_struct vm;
2091     struct pcpu_alloc_info *ai;
2092     char psize_str[16];
2093     int unit_pages;
2094     size_t pages_size;
2095     struct page **pages;
2096     int unit, i, j, rc;
2097     int upa;
2098     int nr_g0_units;
2099 
2100     snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10);
2101 
2102     ai = pcpu_build_alloc_info(reserved_size, 0, PAGE_SIZE, NULL);
2103     if (IS_ERR(ai))
2104         return PTR_ERR(ai);
2105     BUG_ON(ai->nr_groups != 1);
2106     upa = ai->alloc_size/ai->unit_size;
2107     nr_g0_units = roundup(num_possible_cpus(), upa);
2108     if (unlikely(WARN_ON(ai->groups[0].nr_units != nr_g0_units))) {
2109         pcpu_free_alloc_info(ai);
2110         return -EINVAL;
2111     }
2112 
2113     unit_pages = ai->unit_size >> PAGE_SHIFT;
2114 
2115     /* unaligned allocations can't be freed, round up to page size */
2116     pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() *
2117                    sizeof(pages[0]));
2118     pages = memblock_virt_alloc(pages_size, 0);
2119 
2120     /* allocate pages */
2121     j = 0;
2122     for (unit = 0; unit < num_possible_cpus(); unit++) {
2123         unsigned int cpu = ai->groups[0].cpu_map[unit];
2124         for (i = 0; i < unit_pages; i++) {
2125             void *ptr;
2126 
2127             ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE);
2128             if (!ptr) {
2129                 pr_warn("failed to allocate %s page for cpu%u\n",
2130                         psize_str, cpu);
2131                 goto enomem;
2132             }
2133             /* kmemleak tracks the percpu allocations separately */
2134             kmemleak_free(ptr);
2135             pages[j++] = virt_to_page(ptr);
2136         }
2137     }
2138 
2139     /* allocate vm area, map the pages and copy static data */
2140     vm.flags = VM_ALLOC;
2141     vm.size = num_possible_cpus() * ai->unit_size;
2142     vm_area_register_early(&vm, PAGE_SIZE);
2143 
2144     for (unit = 0; unit < num_possible_cpus(); unit++) {
2145         unsigned long unit_addr =
2146             (unsigned long)vm.addr + unit * ai->unit_size;
2147 
2148         for (i = 0; i < unit_pages; i++)
2149             populate_pte_fn(unit_addr + (i << PAGE_SHIFT));
2150 
2151         /* pte already populated, the following shouldn't fail */
2152         rc = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages],
2153                       unit_pages);
2154         if (rc < 0)
2155             panic("failed to map percpu area, err=%d\n", rc);
2156 
2157         /*
2158          * FIXME: Archs with virtual cache should flush local
2159          * cache for the linear mapping here - something
2160          * equivalent to flush_cache_vmap() on the local cpu.
2161          * flush_cache_vmap() can't be used as most supporting
2162          * data structures are not set up yet.
2163          */
2164 
2165         /* copy static data */
2166         memcpy((void *)unit_addr, __per_cpu_load, ai->static_size);
2167     }
2168 
2169     /* we're ready, commit */
2170     pr_info("%d %s pages/cpu @%p s%zu r%zu d%zu\n",
2171         unit_pages, psize_str, vm.addr, ai->static_size,
2172         ai->reserved_size, ai->dyn_size);
2173 
2174     rc = pcpu_setup_first_chunk(ai, vm.addr);
2175     goto out_free_ar;
2176 
2177 enomem:
2178     while (--j >= 0)
2179         free_fn(page_address(pages[j]), PAGE_SIZE);
2180     rc = -ENOMEM;
2181 out_free_ar:
2182     memblock_free_early(__pa(pages), pages_size);
2183     pcpu_free_alloc_info(ai);
2184     return rc;
2185 }
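
/*
 * Note on @populate_pte_fn (sketch only, allocation details elided): it
 * merely has to make sure the page table levels down to the pte exist
 * for the passed address so that the __pcpu_map_pages() call above
 * cannot fail; it does not establish the final mapping itself.  Roughly:
 *
 *      static void __init my_populate_pte(unsigned long addr)
 *      {
 *              pgd_t *pgd = pgd_offset_k(addr);
 *
 *              if (pgd_none(*pgd))
 *                      ...allocate and wire up a pud page...
 *              ...and likewise for the pud and pmd levels...
 *      }
 */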
2186 #endif /* BUILD_PAGE_FIRST_CHUNK */
2187 
2188 #ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
2189 /*
2190  * Generic SMP percpu area setup.
2191  *
2192  * The embedding helper is used because its behavior closely resembles
2193  * the original non-dynamic generic percpu area setup.  This is
2194  * important because many archs have addressing restrictions and might
2195  * fail if the percpu area is located far away from the previous
2196  * location.  As an added bonus, in non-NUMA cases, embedding is
2197  * generally a good idea TLB-wise because percpu area can piggy back
2198  * on the physical linear memory mapping which uses large page
2199  * mappings on applicable archs.
2200  */
2201 unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
2202 EXPORT_SYMBOL(__per_cpu_offset);
2203 
2204 static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size,
2205                        size_t align)
2206 {
2207     return  memblock_virt_alloc_from_nopanic(
2208             size, align, __pa(MAX_DMA_ADDRESS));
2209 }
2210 
2211 static void __init pcpu_dfl_fc_free(void *ptr, size_t size)
2212 {
2213     memblock_free_early(__pa(ptr), size);
2214 }
2215 
2216 void __init setup_per_cpu_areas(void)
2217 {
2218     unsigned long delta;
2219     unsigned int cpu;
2220     int rc;
2221 
2222     /*
2223      * Always reserve area for module percpu variables.  That's
2224      * what the legacy allocator did.
2225      */
2226     rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
2227                     PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, NULL,
2228                     pcpu_dfl_fc_alloc, pcpu_dfl_fc_free);
2229     if (rc < 0)
2230         panic("Failed to initialize percpu areas.");
2231 
2232     delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
2233     for_each_possible_cpu(cpu)
2234         __per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu];
2235 }
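
/*
 * Sketch of why the delta above is what it is: with the generic
 * asm-generic/percpu.h definitions, accessing a static percpu variable
 * for @cpu boils down to roughly
 *
 *      &var + __per_cpu_offset[cpu]
 *          == &var + (pcpu_base_addr - __per_cpu_start) + pcpu_unit_offsets[cpu]
 *
 * i.e. the variable's link-time address inside [__per_cpu_start,
 * __per_cpu_end) is relocated into @cpu's unit of the first chunk.
 * Archs providing their own per_cpu machinery may differ in detail.
 */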
2236 #endif  /* CONFIG_HAVE_SETUP_PER_CPU_AREA */
2237 
2238 #else   /* CONFIG_SMP */
2239 
2240 /*
2241  * UP percpu area setup.
2242  *
2243  * UP always uses km-based percpu allocator with identity mapping.
2244  * Static percpu variables are indistinguishable from the usual static
2245  * variables and don't require any special preparation.
2246  */
2247 void __init setup_per_cpu_areas(void)
2248 {
2249     const size_t unit_size =
2250         roundup_pow_of_two(max_t(size_t, PCPU_MIN_UNIT_SIZE,
2251                      PERCPU_DYNAMIC_RESERVE));
2252     struct pcpu_alloc_info *ai;
2253     void *fc;
2254 
2255     ai = pcpu_alloc_alloc_info(1, 1);
2256     fc = memblock_virt_alloc_from_nopanic(unit_size,
2257                           PAGE_SIZE,
2258                           __pa(MAX_DMA_ADDRESS));
2259     if (!ai || !fc)
2260         panic("Failed to allocate memory for percpu areas.");
2261     /* kmemleak tracks the percpu allocations separately */
2262     kmemleak_free(fc);
2263 
2264     ai->dyn_size = unit_size;
2265     ai->unit_size = unit_size;
2266     ai->atom_size = unit_size;
2267     ai->alloc_size = unit_size;
2268     ai->groups[0].nr_units = 1;
2269     ai->groups[0].cpu_map[0] = 0;
2270 
2271     if (pcpu_setup_first_chunk(ai, fc) < 0)
2272         panic("Failed to initialize percpu areas.");
2273 }
2274 
2275 #endif  /* CONFIG_SMP */
2276 
2277 /*
2278  * First and reserved chunks are initialized with temporary allocation
2279  * maps in initdata so that they can be used before slab is online.
2280  * This function is called after slab is brought up and replaces those
2281  * with properly allocated maps.
2282  */
2283 void __init percpu_init_late(void)
2284 {
2285     struct pcpu_chunk *target_chunks[] =
2286         { pcpu_first_chunk, pcpu_reserved_chunk, NULL };
2287     struct pcpu_chunk *chunk;
2288     unsigned long flags;
2289     int i;
2290 
2291     for (i = 0; (chunk = target_chunks[i]); i++) {
2292         int *map;
2293         const size_t size = PERCPU_DYNAMIC_EARLY_SLOTS * sizeof(map[0]);
2294 
2295         BUILD_BUG_ON(size > PAGE_SIZE);
2296 
2297         map = pcpu_mem_zalloc(size);
2298         BUG_ON(!map);
2299 
2300         spin_lock_irqsave(&pcpu_lock, flags);
2301         memcpy(map, chunk->map, size);
2302         chunk->map = map;
2303         spin_unlock_irqrestore(&pcpu_lock, flags);
2304     }
2305 }
2306 
2307 /*
2308  * The percpu allocator is initialized early during boot, when neither
2309  * slab nor workqueue is available.  Plug async management until everything is up
2310  * and running.
2311  */
2312 static int __init percpu_enable_async(void)
2313 {
2314     pcpu_async_enabled = true;
2315     return 0;
2316 }
2317 subsys_initcall(percpu_enable_async);