0001 // SPDX-License-Identifier: GPL-2.0
0002 /* Copyright(c) 2015 Intel Corporation. All rights reserved. */
0003 #include <linux/device.h>
0004 #include <linux/io.h>
0005 #include <linux/kasan.h>
0006 #include <linux/memory_hotplug.h>
0007 #include <linux/memremap.h>
0008 #include <linux/pfn_t.h>
0009 #include <linux/swap.h>
0010 #include <linux/mmzone.h>
0011 #include <linux/swapops.h>
0012 #include <linux/types.h>
0013 #include <linux/wait_bit.h>
0014 #include <linux/xarray.h>
0015 #include "internal.h"
0016 
0017 static DEFINE_XARRAY(pgmap_array);
0018 
0019 /*
0020  * The memremap() and memremap_pages() interfaces are alternately used
0021  * to map persistent memory namespaces. These interfaces place different
0022  * constraints on the alignment and size of the mapping (namespace).
0023  * memremap() can map individual PAGE_SIZE pages. memremap_pages() can
0024  * only map subsections (2MB), and on at least one architecture (PowerPC)
0025  * the minimum mapping granularity of memremap_pages() is 16MB.
0026  *
0027  * The role of memremap_compat_align() is to communicate the minimum
0028  * arch supported alignment of a namespace such that it can freely
0029  * switch modes without violating the arch constraint. Namely, do not
0030  * allow a namespace to be PAGE_SIZE aligned since that namespace may be
0031  * reconfigured into a mode that requires SUBSECTION_SIZE alignment.
0032  */
0033 #ifndef CONFIG_ARCH_HAS_MEMREMAP_COMPAT_ALIGN
0034 unsigned long memremap_compat_align(void)
0035 {
0036     return SUBSECTION_SIZE;
0037 }
0038 EXPORT_SYMBOL_GPL(memremap_compat_align);
0039 #endif
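
/*
 * Illustrative sketch (hypothetical caller, not part of this file): a
 * persistent memory namespace driver could use memremap_compat_align()
 * to reject a configuration that cannot later be switched between
 * memremap() and memremap_pages() modes. The function and parameter
 * names below are made up for the example.
 */
static int example_check_namespace_alignment(struct resource *res)
{
    unsigned long align = memremap_compat_align();

    if (!IS_ALIGNED(res->start, align) ||
        !IS_ALIGNED(resource_size(res), align))
        return -EINVAL;    /* may not survive a mode switch */
    return 0;
}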
0040 
0041 #ifdef CONFIG_FS_DAX
0042 DEFINE_STATIC_KEY_FALSE(devmap_managed_key);
0043 EXPORT_SYMBOL(devmap_managed_key);
0044 
0045 static void devmap_managed_enable_put(struct dev_pagemap *pgmap)
0046 {
0047     if (pgmap->type == MEMORY_DEVICE_FS_DAX)
0048         static_branch_dec(&devmap_managed_key);
0049 }
0050 
0051 static void devmap_managed_enable_get(struct dev_pagemap *pgmap)
0052 {
0053     if (pgmap->type == MEMORY_DEVICE_FS_DAX)
0054         static_branch_inc(&devmap_managed_key);
0055 }
0056 #else
0057 static void devmap_managed_enable_get(struct dev_pagemap *pgmap)
0058 {
0059 }
0060 static void devmap_managed_enable_put(struct dev_pagemap *pgmap)
0061 {
0062 }
0063 #endif /* CONFIG_FS_DAX */
0064 
0065 static void pgmap_array_delete(struct range *range)
0066 {
0067     xa_store_range(&pgmap_array, PHYS_PFN(range->start), PHYS_PFN(range->end),
0068             NULL, GFP_KERNEL);
0069     synchronize_rcu();
0070 }
0071 
0072 static unsigned long pfn_first(struct dev_pagemap *pgmap, int range_id)
0073 {
0074     struct range *range = &pgmap->ranges[range_id];
0075     unsigned long pfn = PHYS_PFN(range->start);
0076 
0077     if (range_id)
0078         return pfn;
0079     return pfn + vmem_altmap_offset(pgmap_altmap(pgmap));
0080 }
0081 
0082 bool pgmap_pfn_valid(struct dev_pagemap *pgmap, unsigned long pfn)
0083 {
0084     int i;
0085 
0086     for (i = 0; i < pgmap->nr_range; i++) {
0087         struct range *range = &pgmap->ranges[i];
0088 
0089         if (pfn >= PHYS_PFN(range->start) &&
0090             pfn <= PHYS_PFN(range->end))
0091             return pfn >= pfn_first(pgmap, i);
0092     }
0093 
0094     return false;
0095 }
0096 
0097 static unsigned long pfn_end(struct dev_pagemap *pgmap, int range_id)
0098 {
0099     const struct range *range = &pgmap->ranges[range_id];
0100 
0101     return (range->start + range_len(range)) >> PAGE_SHIFT;
0102 }
0103 
0104 static unsigned long pfn_len(struct dev_pagemap *pgmap, unsigned long range_id)
0105 {
0106     return (pfn_end(pgmap, range_id) -
0107         pfn_first(pgmap, range_id)) >> pgmap->vmemmap_shift;
0108 }
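
/*
 * Worked example (illustrative, assuming a single range and no altmap
 * reservation): a 128MB range spans 32768 base pages. With
 * pgmap->vmemmap_shift == 9 (2MB compound device pages), pfn_len()
 * returns 32768 >> 9 == 64, i.e. one count per compound page. This is
 * the number of percpu references taken per range in pagemap_range()
 * and dropped again in memunmap_pages().
 */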
0109 
0110 static void pageunmap_range(struct dev_pagemap *pgmap, int range_id)
0111 {
0112     struct range *range = &pgmap->ranges[range_id];
0113     struct page *first_page;
0114 
0115     /* make sure to access a memmap that was actually initialized */
0116     first_page = pfn_to_page(pfn_first(pgmap, range_id));
0117 
0118     /* pages are dead and unused, undo the arch mapping */
0119     mem_hotplug_begin();
0120     remove_pfn_range_from_zone(page_zone(first_page), PHYS_PFN(range->start),
0121                    PHYS_PFN(range_len(range)));
0122     if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
0123         __remove_pages(PHYS_PFN(range->start),
0124                    PHYS_PFN(range_len(range)), NULL);
0125     } else {
0126         arch_remove_memory(range->start, range_len(range),
0127                 pgmap_altmap(pgmap));
0128         kasan_remove_zero_shadow(__va(range->start), range_len(range));
0129     }
0130     mem_hotplug_done();
0131 
0132     untrack_pfn(NULL, PHYS_PFN(range->start), range_len(range));
0133     pgmap_array_delete(range);
0134 }
0135 
0136 void memunmap_pages(struct dev_pagemap *pgmap)
0137 {
0138     int i;
0139 
0140     percpu_ref_kill(&pgmap->ref);
0141     for (i = 0; i < pgmap->nr_range; i++)
0142         percpu_ref_put_many(&pgmap->ref, pfn_len(pgmap, i));
0143     wait_for_completion(&pgmap->done);
0144 
0145     for (i = 0; i < pgmap->nr_range; i++)
0146         pageunmap_range(pgmap, i);
0147     percpu_ref_exit(&pgmap->ref);
0148 
0149     WARN_ONCE(pgmap->altmap.alloc, "failed to free all reserved pages\n");
0150     devmap_managed_enable_put(pgmap);
0151 }
0152 EXPORT_SYMBOL_GPL(memunmap_pages);
0153 
0154 static void devm_memremap_pages_release(void *data)
0155 {
0156     memunmap_pages(data);
0157 }
0158 
0159 static void dev_pagemap_percpu_release(struct percpu_ref *ref)
0160 {
0161     struct dev_pagemap *pgmap = container_of(ref, struct dev_pagemap, ref);
0162 
0163     complete(&pgmap->done);
0164 }
0165 
0166 static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params,
0167         int range_id, int nid)
0168 {
0169     const bool is_private = pgmap->type == MEMORY_DEVICE_PRIVATE;
0170     struct range *range = &pgmap->ranges[range_id];
0171     struct dev_pagemap *conflict_pgmap;
0172     int error, is_ram;
0173 
0174     if (WARN_ONCE(pgmap_altmap(pgmap) && range_id > 0,
0175                 "altmap not supported for multiple ranges\n"))
0176         return -EINVAL;
0177 
0178     conflict_pgmap = get_dev_pagemap(PHYS_PFN(range->start), NULL);
0179     if (conflict_pgmap) {
0180         WARN(1, "Conflicting mapping in same section\n");
0181         put_dev_pagemap(conflict_pgmap);
0182         return -ENOMEM;
0183     }
0184 
0185     conflict_pgmap = get_dev_pagemap(PHYS_PFN(range->end), NULL);
0186     if (conflict_pgmap) {
0187         WARN(1, "Conflicting mapping in same section\n");
0188         put_dev_pagemap(conflict_pgmap);
0189         return -ENOMEM;
0190     }
0191 
0192     is_ram = region_intersects(range->start, range_len(range),
0193         IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE);
0194 
0195     if (is_ram != REGION_DISJOINT) {
0196         WARN_ONCE(1, "attempted on %s region %#llx-%#llx\n",
0197                 is_ram == REGION_MIXED ? "mixed" : "ram",
0198                 range->start, range->end);
0199         return -ENXIO;
0200     }
0201 
0202     error = xa_err(xa_store_range(&pgmap_array, PHYS_PFN(range->start),
0203                 PHYS_PFN(range->end), pgmap, GFP_KERNEL));
0204     if (error)
0205         return error;
0206 
0207     if (nid < 0)
0208         nid = numa_mem_id();
0209 
0210     error = track_pfn_remap(NULL, &params->pgprot, PHYS_PFN(range->start), 0,
0211             range_len(range));
0212     if (error)
0213         goto err_pfn_remap;
0214 
0215     if (!mhp_range_allowed(range->start, range_len(range), !is_private)) {
0216         error = -EINVAL;
0217         goto err_kasan;
0218     }
0219 
0220     mem_hotplug_begin();
0221 
0222     /*
0223      * For device private memory we call add_pages() as we only need to
0224      * allocate and initialize struct page for the device memory.
0225      * Moreover, the device memory is inaccessible, so we do not want
0226      * to create a linear mapping for the memory like arch_add_memory()
0227      * would do.
0228      *
0229      * For all other device memory types, which are accessible by
0230      * the CPU, we do want the linear mapping and thus use
0231      * arch_add_memory().
0232      */
0233     if (is_private) {
0234         error = add_pages(nid, PHYS_PFN(range->start),
0235                 PHYS_PFN(range_len(range)), params);
0236     } else {
0237         error = kasan_add_zero_shadow(__va(range->start), range_len(range));
0238         if (error) {
0239             mem_hotplug_done();
0240             goto err_kasan;
0241         }
0242 
0243         error = arch_add_memory(nid, range->start, range_len(range),
0244                     params);
0245     }
0246 
0247     if (!error) {
0248         struct zone *zone;
0249 
0250         zone = &NODE_DATA(nid)->node_zones[ZONE_DEVICE];
0251         move_pfn_range_to_zone(zone, PHYS_PFN(range->start),
0252                 PHYS_PFN(range_len(range)), params->altmap,
0253                 MIGRATE_MOVABLE);
0254     }
0255 
0256     mem_hotplug_done();
0257     if (error)
0258         goto err_add_memory;
0259 
0260     /*
0261      * Initialization of the pages has been deferred until now in order
0262      * to allow us to do the work while not holding the hotplug lock.
0263      */
0264     memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
0265                 PHYS_PFN(range->start),
0266                 PHYS_PFN(range_len(range)), pgmap);
0267     percpu_ref_get_many(&pgmap->ref, pfn_len(pgmap, range_id));
0268     return 0;
0269 
0270 err_add_memory:
0271     if (!is_private)
0272         kasan_remove_zero_shadow(__va(range->start), range_len(range));
0273 err_kasan:
0274     untrack_pfn(NULL, PHYS_PFN(range->start), range_len(range));
0275 err_pfn_remap:
0276     pgmap_array_delete(range);
0277     return error;
0278 }
0279 
0280 
0281 /*
0282  * Not device-managed version of devm_memremap_pages(), undone by
0283  * memunmap_pages().  Please use devm_memremap_pages if you have a struct
0284  * device available.
0285  */
0286 void *memremap_pages(struct dev_pagemap *pgmap, int nid)
0287 {
0288     struct mhp_params params = {
0289         .altmap = pgmap_altmap(pgmap),
0290         .pgmap = pgmap,
0291         .pgprot = PAGE_KERNEL,
0292     };
0293     const int nr_range = pgmap->nr_range;
0294     int error, i;
0295 
0296     if (WARN_ONCE(!nr_range, "nr_range must be specified\n"))
0297         return ERR_PTR(-EINVAL);
0298 
0299     switch (pgmap->type) {
0300     case MEMORY_DEVICE_PRIVATE:
0301         if (!IS_ENABLED(CONFIG_DEVICE_PRIVATE)) {
0302             WARN(1, "Device private memory not supported\n");
0303             return ERR_PTR(-EINVAL);
0304         }
0305         if (!pgmap->ops || !pgmap->ops->migrate_to_ram) {
0306             WARN(1, "Missing migrate_to_ram method\n");
0307             return ERR_PTR(-EINVAL);
0308         }
0309         if (!pgmap->ops->page_free) {
0310             WARN(1, "Missing page_free method\n");
0311             return ERR_PTR(-EINVAL);
0312         }
0313         if (!pgmap->owner) {
0314             WARN(1, "Missing owner\n");
0315             return ERR_PTR(-EINVAL);
0316         }
0317         break;
0318     case MEMORY_DEVICE_COHERENT:
0319         if (!pgmap->ops->page_free) {
0320             WARN(1, "Missing page_free method\n");
0321             return ERR_PTR(-EINVAL);
0322         }
0323         if (!pgmap->owner) {
0324             WARN(1, "Missing owner\n");
0325             return ERR_PTR(-EINVAL);
0326         }
0327         break;
0328     case MEMORY_DEVICE_FS_DAX:
0329         if (IS_ENABLED(CONFIG_FS_DAX_LIMITED)) {
0330             WARN(1, "File system DAX not supported\n");
0331             return ERR_PTR(-EINVAL);
0332         }
0333         break;
0334     case MEMORY_DEVICE_GENERIC:
0335         break;
0336     case MEMORY_DEVICE_PCI_P2PDMA:
0337         params.pgprot = pgprot_noncached(params.pgprot);
0338         break;
0339     default:
0340         WARN(1, "Invalid pgmap type %d\n", pgmap->type);
0341         break;
0342     }
0343 
0344     init_completion(&pgmap->done);
0345     error = percpu_ref_init(&pgmap->ref, dev_pagemap_percpu_release, 0,
0346                 GFP_KERNEL);
0347     if (error)
0348         return ERR_PTR(error);
0349 
0350     devmap_managed_enable_get(pgmap);
0351 
0352     /*
0353      * Clear the pgmap nr_range as it will be incremented for each
0354      * successfully processed range. This communicates how many
0355      * regions to unwind in the abort case.
0356      */
0357     pgmap->nr_range = 0;
0358     error = 0;
0359     for (i = 0; i < nr_range; i++) {
0360         error = pagemap_range(pgmap, &params, i, nid);
0361         if (error)
0362             break;
0363         pgmap->nr_range++;
0364     }
0365 
0366     if (i < nr_range) {
0367         memunmap_pages(pgmap);
0368         pgmap->nr_range = nr_range;
0369         return ERR_PTR(error);
0370     }
0371 
0372     return __va(pgmap->ranges[0].start);
0373 }
0374 EXPORT_SYMBOL_GPL(memremap_pages);
0375 
0376 /**
0377  * devm_memremap_pages - remap and provide memmap backing for the given resource
0378  * @dev: hosting device for @pgmap
0379  * @pgmap: pointer to a struct dev_pagemap
0380  *
0381  * Notes:
0382  * 1/ At a minimum the range and type members of @pgmap must be initialized
0383  *    by the caller before passing it to this function
0384  *
0385  * 2/ The altmap field may optionally be initialized, in which case
0386  *    PGMAP_ALTMAP_VALID must be set in pgmap->flags.
0387  *
0388  * 3/ The ref field may optionally be provided, in which case pgmap->ref must be
0389  *    'live' on entry and will be killed and reaped at
0390  *    devm_memremap_pages_release() time, or if this routine fails.
0391  *
0392  * 4/ range is expected to be a host memory range that could feasibly be
0393  *    treated as a "System RAM" range, i.e. not a device mmio range, but
0394  *    this is not enforced.
0395  */
0396 void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
0397 {
0398     int error;
0399     void *ret;
0400 
0401     ret = memremap_pages(pgmap, dev_to_node(dev));
0402     if (IS_ERR(ret))
0403         return ret;
0404 
0405     error = devm_add_action_or_reset(dev, devm_memremap_pages_release,
0406             pgmap);
0407     if (error)
0408         return ERR_PTR(error);
0409     return ret;
0410 }
0411 EXPORT_SYMBOL_GPL(devm_memremap_pages);
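
/*
 * Illustrative sketch (hypothetical caller, not part of this file): a
 * minimal device-managed mapping following the notes above. Only the
 * dev_pagemap fields and the devm_memremap_pages() call are real; the
 * function name and the resource argument are made up for the example.
 */
static void *example_map_device_pages(struct device *dev, struct resource *res,
                      struct dev_pagemap *pgmap)
{
    pgmap->range.start = res->start;
    pgmap->range.end = res->end;
    pgmap->nr_range = 1;
    pgmap->type = MEMORY_DEVICE_GENERIC;

    /* On success this is __va(range.start); on failure an ERR_PTR(). */
    return devm_memremap_pages(dev, pgmap);
}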
0412 
0413 void devm_memunmap_pages(struct device *dev, struct dev_pagemap *pgmap)
0414 {
0415     devm_release_action(dev, devm_memremap_pages_release, pgmap);
0416 }
0417 EXPORT_SYMBOL_GPL(devm_memunmap_pages);
0418 
0419 unsigned long vmem_altmap_offset(struct vmem_altmap *altmap)
0420 {
0421     /* number of pfns from base where pfn_to_page() is valid */
0422     if (altmap)
0423         return altmap->reserve + altmap->free;
0424     return 0;
0425 }
0426 
0427 void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns)
0428 {
0429     altmap->alloc -= nr_pfns;
0430 }
0431 
0432 /**
0433  * get_dev_pagemap() - take a new live reference on the dev_pagemap for @pfn
0434  * @pfn: page frame number to look up a dev_pagemap for
0435  * @pgmap: optional known pgmap that already has a reference
0436  *
0437  * If @pgmap is non-NULL and covers @pfn it will be returned as-is.  If @pgmap
0438  * is non-NULL but does not cover @pfn the reference to it will be released.
0439  */
0440 struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
0441         struct dev_pagemap *pgmap)
0442 {
0443     resource_size_t phys = PFN_PHYS(pfn);
0444 
0445     /*
0446      * In the cached case we're already holding a live reference.
0447      */
0448     if (pgmap) {
0449         if (phys >= pgmap->range.start && phys <= pgmap->range.end)
0450             return pgmap;
0451         put_dev_pagemap(pgmap);
0452     }
0453 
0454     /* fall back to slow path lookup */
0455     rcu_read_lock();
0456     pgmap = xa_load(&pgmap_array, PHYS_PFN(phys));
0457     if (pgmap && !percpu_ref_tryget_live(&pgmap->ref))
0458         pgmap = NULL;
0459     rcu_read_unlock();
0460 
0461     return pgmap;
0462 }
0463 EXPORT_SYMBOL_GPL(get_dev_pagemap);
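
/*
 * Illustrative sketch (hypothetical helper, not part of this file): the
 * caching pattern the kernel-doc above describes. The previously
 * returned pgmap is passed back in, so the xarray lookup is only
 * repeated when a pfn falls outside the cached range; a single live
 * reference is held at any time and dropped once at the end.
 */
static bool example_pfns_are_device_pages(unsigned long pfn, unsigned long nr)
{
    struct dev_pagemap *pgmap = NULL;

    for (; nr--; pfn++) {
        pgmap = get_dev_pagemap(pfn, pgmap);
        if (!pgmap)
            return false;
    }
    put_dev_pagemap(pgmap);
    return true;
}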
0464 
0465 void free_zone_device_page(struct page *page)
0466 {
0467     if (WARN_ON_ONCE(!page->pgmap->ops || !page->pgmap->ops->page_free))
0468         return;
0469 
0470     mem_cgroup_uncharge(page_folio(page));
0471 
0472     /*
0473      * Note: we don't expect anonymous compound pages yet. Once they are
0474      * supported and can be PTE-mapped similarly to THP, we'd have to
0475      * clear PG_anon_exclusive on all tail pages.
0476      */
0477     VM_BUG_ON_PAGE(PageAnon(page) && PageCompound(page), page);
0478     if (PageAnon(page))
0479         __ClearPageAnonExclusive(page);
0480 
0481     /*
0482      * When a device managed page is freed, the page->mapping field
0483      * may still contain a (stale) mapping value. For example, the
0484      * lower bits of page->mapping may still identify the page as an
0485      * anonymous page. Ultimately, this entire field is just stale
0486      * and wrong, and it will cause errors if not cleared.  One
0487      * example is:
0488      *
0489      *  migrate_vma_pages()
0490      *    migrate_vma_insert_page()
0491      *      page_add_new_anon_rmap()
0492      *        __page_set_anon_rmap()
0493      *          ...checks page->mapping, via PageAnon(page) call,
0494      *            and incorrectly concludes that the page is an
0495      *            anonymous page. Therefore, it incorrectly,
0496      *            silently fails to set up the new anon rmap.
0497      *
0498      * For other types of ZONE_DEVICE pages, migration is either
0499      * handled differently or not done at all, so there is no need
0500      * to clear page->mapping.
0501      */
0502     page->mapping = NULL;
0503     page->pgmap->ops->page_free(page);
0504 
0505     /*
0506      * Reset the page count to 1 to prepare for handing out the page again.
0507      */
0508     set_page_count(page, 1);
0509 }
0510 
0511 #ifdef CONFIG_FS_DAX
0512 bool __put_devmap_managed_page_refs(struct page *page, int refs)
0513 {
0514     if (page->pgmap->type != MEMORY_DEVICE_FS_DAX)
0515         return false;
0516 
0517     /*
0518      * fsdax page refcounts are 1-based, rather than 0-based: if
0519      * refcount is 1, then the page is free and the refcount is
0520      * stable because nobody holds a reference on the page.
0521      */
0522     if (page_ref_sub_return(page, refs) == 1)
0523         wake_up_var(&page->_refcount);
0524     return true;
0525 }
0526 EXPORT_SYMBOL(__put_devmap_managed_page_refs);
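
/*
 * Illustrative sketch (hypothetical helper, not part of this file): the
 * waiter side of the wake_up_var() call above. Code that needs an fsdax
 * page to become idle can sleep on the same variable until the 1-based
 * refcount drops back to 1; the filesystems implement their own
 * variants of this wait.
 */
static void example_wait_for_fsdax_page_idle(struct page *page)
{
    wait_var_event(&page->_refcount, page_ref_count(page) == 1);
}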
0527 #endif /* CONFIG_FS_DAX */