// SPDX-License-Identifier: GPL-2.0-only
/*
 * mm/percpu-vm.c - vmalloc area based chunk allocation
 *
 * Copyright (C) 2010       SUSE Linux Products GmbH
 * Copyright (C) 2010       Tejun Heo <tj@kernel.org>
 *
 * Chunks are mapped into vmalloc areas and populated page by page.
 * This is the default chunk allocator.
 */
#include "internal.h"

static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk,
                    unsigned int cpu, int page_idx)
{
    /* must not be used on pre-mapped chunk */
    WARN_ON(chunk->immutable);

    return vmalloc_to_page((void *)pcpu_chunk_addr(chunk, cpu, page_idx));
}

/**
 * pcpu_get_pages - get temp pages array
 *
 * Returns pointer to array of pointers to struct page which can be indexed
 * with pcpu_page_idx().  Note that there is only one array and accesses
 * should be serialized by pcpu_alloc_mutex.
 *
 * RETURNS:
 * Pointer to temp pages array on success.
 */
static struct page **pcpu_get_pages(void)
{
    static struct page **pages;
    size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]);

    lockdep_assert_held(&pcpu_alloc_mutex);

    if (!pages)
        pages = pcpu_mem_zalloc(pages_size, GFP_KERNEL);
    return pages;
}
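
/*
 * Sketch of the temp pages array layout returned above.  pcpu_page_idx()
 * itself lives in mm/percpu.c and conceptually maps (cpu, page_idx) to
 * unit * pcpu_unit_pages + page_idx, i.e. one row of pcpu_unit_pages page
 * pointers per unit:
 *
 *   pages[ unit 0: page 0 .. pcpu_unit_pages - 1 |
 *          unit 1: page 0 .. pcpu_unit_pages - 1 | ... ]
 *
 * The array is allocated once on first use and kept for the lifetime of
 * the system, which is why access must be serialized by pcpu_alloc_mutex.
 */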

/**
 * pcpu_free_pages - free pages which were allocated for @chunk
 * @chunk: chunk pages were allocated for
 * @pages: array of pages to be freed, indexed by pcpu_page_idx()
 * @page_start: page index of the first page to be freed
 * @page_end: page index of the last page to be freed + 1
 *
 * Free pages [@page_start,@page_end) in @pages for all units.
 * The pages were allocated for @chunk.
 */
static void pcpu_free_pages(struct pcpu_chunk *chunk,
                struct page **pages, int page_start, int page_end)
{
    unsigned int cpu;
    int i;

    for_each_possible_cpu(cpu) {
        for (i = page_start; i < page_end; i++) {
            struct page *page = pages[pcpu_page_idx(cpu, i)];

            if (page)
                __free_page(page);
        }
    }
}

/**
 * pcpu_alloc_pages - allocates pages for @chunk
 * @chunk: target chunk
 * @pages: array to put the allocated pages into, indexed by pcpu_page_idx()
 * @page_start: page index of the first page to be allocated
 * @page_end: page index of the last page to be allocated + 1
 * @gfp: allocation flags passed to the underlying allocator
 *
 * Allocate pages [@page_start,@page_end) into @pages for all units.
 * The allocation is for @chunk.  Percpu core doesn't care about the
 * content of @pages and will pass it verbatim to pcpu_map_pages().
 */
static int pcpu_alloc_pages(struct pcpu_chunk *chunk,
                struct page **pages, int page_start, int page_end,
                gfp_t gfp)
{
    unsigned int cpu, tcpu;
    int i;

    gfp |= __GFP_HIGHMEM;

    for_each_possible_cpu(cpu) {
        for (i = page_start; i < page_end; i++) {
            struct page **pagep = &pages[pcpu_page_idx(cpu, i)];

            *pagep = alloc_pages_node(cpu_to_node(cpu), gfp, 0);
            if (!*pagep)
                goto err;
        }
    }
    return 0;

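    /*
     * Unwind on failure: free the pages already allocated for the
     * current cpu's partially filled row first (indices page_start..i-1),
     * then the fully allocated rows of every cpu visited before it.
     */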
err:
    while (--i >= page_start)
        __free_page(pages[pcpu_page_idx(cpu, i)]);

    for_each_possible_cpu(tcpu) {
        if (tcpu == cpu)
            break;
        for (i = page_start; i < page_end; i++)
            __free_page(pages[pcpu_page_idx(tcpu, i)]);
    }
    return -ENOMEM;
}

/**
 * pcpu_pre_unmap_flush - flush cache prior to unmapping
 * @chunk: chunk the regions to be flushed belong to
 * @page_start: page index of the first page to be flushed
 * @page_end: page index of the last page to be flushed + 1
 *
 * Pages in [@page_start,@page_end) of @chunk are about to be
 * unmapped.  Flush cache.  As each flush can be very expensive,
 * issue the flush on the whole region at once rather than doing it
 * for each cpu.  This may be overkill but is more scalable.
 */
static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk,
                 int page_start, int page_end)
{
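    /*
     * The span from the lowest to the highest unit address covers the
     * [@page_start,@page_end) range of every unit (any gaps between
     * units are flushed along with it), so one call suffices.
     */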
    flush_cache_vunmap(
        pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start),
        pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end));
}

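/*
 * Tear down the page tables for @nr_pages pages at @addr without any
 * cache or TLB maintenance; callers bracket this with
 * pcpu_pre_unmap_flush() and pcpu_post_unmap_tlb_flush() as needed.
 */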
static void __pcpu_unmap_pages(unsigned long addr, int nr_pages)
{
    vunmap_range_noflush(addr, addr + (nr_pages << PAGE_SHIFT));
}

/**
 * pcpu_unmap_pages - unmap pages out of a pcpu_chunk
 * @chunk: chunk of interest
 * @pages: pages array which can be used to pass information to free
 * @page_start: page index of the first page to unmap
 * @page_end: page index of the last page to unmap + 1
 *
 * For each cpu, unmap pages [@page_start,@page_end) out of @chunk.
 * Corresponding elements in @pages were cleared by the caller and can
 * be used to carry information to pcpu_free_pages() which will be
 * called after all unmaps are finished.  The caller should call
 * proper pre/post flush functions.
 */
static void pcpu_unmap_pages(struct pcpu_chunk *chunk,
                 struct page **pages, int page_start, int page_end)
{
    unsigned int cpu;
    int i;

    for_each_possible_cpu(cpu) {
        for (i = page_start; i < page_end; i++) {
            struct page *page;

            page = pcpu_chunk_page(chunk, cpu, i);
            WARN_ON(!page);
            pages[pcpu_page_idx(cpu, i)] = page;
        }
        __pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start),
                   page_end - page_start);
    }
}

/**
 * pcpu_post_unmap_tlb_flush - flush TLB after unmapping
 * @chunk: pcpu_chunk the regions to be flushed belong to
 * @page_start: page index of the first page to be flushed
 * @page_end: page index of the last page to be flushed + 1
 *
 * Pages [@page_start,@page_end) of @chunk have been unmapped.  Flush
 * TLB for the regions.  This can be skipped if the area is to be
 * returned to vmalloc as vmalloc will handle TLB flushing lazily.
 *
 * As with pcpu_pre_unmap_flush(), the TLB flush is also done at once
 * for the whole region.
 */
static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk,
                      int page_start, int page_end)
{
    flush_tlb_kernel_range(
        pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start),
        pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end));
}

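/*
 * Install @nr_pages pages into the chunk's vmalloc area at @addr using
 * base-page (PAGE_SHIFT) mappings; no cache flush is issued here, that
 * is left to pcpu_post_map_flush().
 */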
static int __pcpu_map_pages(unsigned long addr, struct page **pages,
                int nr_pages)
{
    return vmap_pages_range_noflush(addr, addr + (nr_pages << PAGE_SHIFT),
                    PAGE_KERNEL, pages, PAGE_SHIFT);
}

/**
 * pcpu_map_pages - map pages into a pcpu_chunk
 * @chunk: chunk of interest
 * @pages: pages array containing pages to be mapped
 * @page_start: page index of the first page to map
 * @page_end: page index of the last page to map + 1
 *
 * For each cpu, map pages [@page_start,@page_end) into @chunk.  The
 * caller is responsible for calling pcpu_post_map_flush() after all
 * mappings are complete.
 *
 * This function is responsible for setting up whatever is necessary for
 * reverse lookup (addr -> chunk).
 */
static int pcpu_map_pages(struct pcpu_chunk *chunk,
              struct page **pages, int page_start, int page_end)
{
    unsigned int cpu, tcpu;
    int i, err;

    for_each_possible_cpu(cpu) {
        err = __pcpu_map_pages(pcpu_chunk_addr(chunk, cpu, page_start),
                       &pages[pcpu_page_idx(cpu, page_start)],
                       page_end - page_start);
        if (err < 0)
            goto err;

        for (i = page_start; i < page_end; i++)
            pcpu_set_page_chunk(pages[pcpu_page_idx(cpu, i)],
                        chunk);
    }
    return 0;
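    /*
     * Undo on failure: unmap the units already mapped for cpus visited
     * before the failing one, then flush the TLB so no stale entries
     * are left pointing at pages the caller is about to free.
     */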
err:
    for_each_possible_cpu(tcpu) {
        if (tcpu == cpu)
            break;
        __pcpu_unmap_pages(pcpu_chunk_addr(chunk, tcpu, page_start),
                   page_end - page_start);
    }
    pcpu_post_unmap_tlb_flush(chunk, page_start, page_end);
    return err;
}

/**
 * pcpu_post_map_flush - flush cache after mapping
 * @chunk: pcpu_chunk the regions to be flushed belong to
 * @page_start: page index of the first page to be flushed
 * @page_end: page index of the last page to be flushed + 1
 *
 * Pages [@page_start,@page_end) of @chunk have been mapped.  Flush
 * cache.
 *
 * As with pcpu_pre_unmap_flush(), the cache flush is done at once
 * for the whole region.
 */
static void pcpu_post_map_flush(struct pcpu_chunk *chunk,
                int page_start, int page_end)
{
    flush_cache_vmap(
        pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start),
        pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end));
}

/**
 * pcpu_populate_chunk - populate and map an area of a pcpu_chunk
 * @chunk: chunk of interest
 * @page_start: the start page
 * @page_end: the end page
 * @gfp: allocation flags passed to the underlying memory allocator
 *
 * For each cpu, populate and map pages [@page_start,@page_end) into
 * @chunk.
 *
 * CONTEXT:
 * pcpu_alloc_mutex, does GFP_KERNEL allocation.
 */
static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
                   int page_start, int page_end, gfp_t gfp)
{
    struct page **pages;

    pages = pcpu_get_pages();
    if (!pages)
        return -ENOMEM;

    if (pcpu_alloc_pages(chunk, pages, page_start, page_end, gfp))
        return -ENOMEM;

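    /*
     * On mapping failure pcpu_map_pages() has already unmapped and
     * TLB-flushed whatever it managed to map, so freeing the freshly
     * allocated pages is the only cleanup needed here.
     */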
    if (pcpu_map_pages(chunk, pages, page_start, page_end)) {
        pcpu_free_pages(chunk, pages, page_start, page_end);
        return -ENOMEM;
    }
    pcpu_post_map_flush(chunk, page_start, page_end);

    return 0;
}

/**
 * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk
 * @chunk: chunk to depopulate
 * @page_start: the start page
 * @page_end: the end page
 *
 * For each cpu, depopulate and unmap pages [@page_start,@page_end)
 * from @chunk.
 *
 * The caller is required to call pcpu_post_unmap_tlb_flush() unless the
 * region is returned to vmalloc(), which flushes the TLB lazily.
 *
 * CONTEXT:
 * pcpu_alloc_mutex.
 */
static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
                  int page_start, int page_end)
{
    struct page **pages;

    /*
     * If control reaches here, there must have been at least one
     * successful population attempt so the temp pages array must
     * be available now.
     */
    pages = pcpu_get_pages();
    BUG_ON(!pages);

    /* unmap and free */
    pcpu_pre_unmap_flush(chunk, page_start, page_end);

    pcpu_unmap_pages(chunk, pages, page_start, page_end);

    pcpu_free_pages(chunk, pages, page_start, page_end);
}

static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp)
{
    struct pcpu_chunk *chunk;
    struct vm_struct **vms;

    chunk = pcpu_alloc_chunk(gfp);
    if (!chunk)
        return NULL;

    vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes,
                pcpu_nr_groups, pcpu_atom_size);
    if (!vms) {
        pcpu_free_chunk(chunk);
        return NULL;
    }

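    /*
     * base_addr is the address unit 0 of the chunk maps to: the first
     * group's vm area start minus that group's offset within the chunk
     * (group 0's offset is not necessarily zero, hence the subtraction).
     */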
    chunk->data = vms;
    chunk->base_addr = vms[0]->addr - pcpu_group_offsets[0];

    pcpu_stats_chunk_alloc();
    trace_percpu_create_chunk(chunk->base_addr);

    return chunk;
}

static void pcpu_destroy_chunk(struct pcpu_chunk *chunk)
{
    if (!chunk)
        return;

    pcpu_stats_chunk_dealloc();
    trace_percpu_destroy_chunk(chunk->base_addr);

    if (chunk->data)
        pcpu_free_vm_areas(chunk->data, pcpu_nr_groups);
    pcpu_free_chunk(chunk);
}

static struct page *pcpu_addr_to_page(void *addr)
{
    return vmalloc_to_page(addr);
}

static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai)
{
    /* no extra restriction */
    return 0;
}

/**
 * pcpu_should_reclaim_chunk - determine if a chunk should go into reclaim
 * @chunk: chunk of interest
 *
 * This is the entry point for percpu reclaim.  If a chunk qualifies, it is then
 * isolated and managed in separate lists at the back of pcpu_slot: sidelined
 * and to_depopulate respectively.  The to_depopulate list holds chunks slated
 * for depopulation.  They no longer contribute to pcpu_nr_empty_pop_pages once
 * they are on this list.  Once depopulated, they are moved onto the sidelined
 * list which enables them to be pulled back in for allocation if no other chunk
 * can satisfy the allocation.
 */
static bool pcpu_should_reclaim_chunk(struct pcpu_chunk *chunk)
{
    /* do not reclaim either the first chunk or reserved chunk */
    if (chunk == pcpu_first_chunk || chunk == pcpu_reserved_chunk)
        return false;

    /*
     * If it is isolated, it may be on the sidelined list so move it back to
     * the to_depopulate list.  If at least 1/4 of its pages are empty AND
     * there is no system-wide shortage of empty pages aside from this
     * chunk, move it to the to_depopulate list.
     */
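    /*
     * Worked example for the second condition: a chunk with nr_pages == 8
     * and nr_empty_pop_pages == 3 is reclaimable only if 3 >= 8 / 4 and
     * the rest of the system would still hold more than
     * PCPU_EMPTY_POP_PAGES_HIGH empty populated pages after discounting
     * this chunk's 3 (i.e. pcpu_nr_empty_pop_pages > HIGH + 3).
     */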
    return ((chunk->isolated && chunk->nr_empty_pop_pages) ||
        (pcpu_nr_empty_pop_pages >
         (PCPU_EMPTY_POP_PAGES_HIGH + chunk->nr_empty_pop_pages) &&
         chunk->nr_empty_pop_pages >= chunk->nr_pages / 4));
}