0001 /* SPDX-License-Identifier: GPL-2.0
0002  *
0003  * page_pool.c
0004  *  Author: Jesper Dangaard Brouer <netoptimizer@brouer.com>
0005  *  Copyright (C) 2016 Red Hat, Inc.
0006  */
0007 
0008 #include <linux/types.h>
0009 #include <linux/kernel.h>
0010 #include <linux/slab.h>
0011 #include <linux/device.h>
0012 
0013 #include <net/page_pool.h>
0014 #include <net/xdp.h>
0015 
0016 #include <linux/dma-direction.h>
0017 #include <linux/dma-mapping.h>
0018 #include <linux/page-flags.h>
0019 #include <linux/mm.h> /* for put_page() */
0020 #include <linux/poison.h>
0021 #include <linux/ethtool.h>
0022 
0023 #include <trace/events/page_pool.h>
0024 
0025 #define DEFER_TIME (msecs_to_jiffies(1000))
0026 #define DEFER_WARN_INTERVAL (60 * HZ)
0027 
0028 #define BIAS_MAX    LONG_MAX
0029 
0030 #ifdef CONFIG_PAGE_POOL_STATS
0031 /* alloc_stat_inc is intended to be used in softirq context */
0032 #define alloc_stat_inc(pool, __stat)    (pool->alloc_stats.__stat++)
0033 /* recycle_stat_inc is safe to use when preemption is possible. */
0034 #define recycle_stat_inc(pool, __stat)                          \
0035     do {                                        \
0036         struct page_pool_recycle_stats __percpu *s = pool->recycle_stats;   \
0037         this_cpu_inc(s->__stat);                        \
0038     } while (0)
0039 
0040 #define recycle_stat_add(pool, __stat, val)                     \
0041     do {                                        \
0042         struct page_pool_recycle_stats __percpu *s = pool->recycle_stats;   \
0043         this_cpu_add(s->__stat, val);                       \
0044     } while (0)
0045 
0046 static const char pp_stats[][ETH_GSTRING_LEN] = {
0047     "rx_pp_alloc_fast",
0048     "rx_pp_alloc_slow",
0049     "rx_pp_alloc_slow_ho",
0050     "rx_pp_alloc_empty",
0051     "rx_pp_alloc_refill",
0052     "rx_pp_alloc_waive",
0053     "rx_pp_recycle_cached",
0054     "rx_pp_recycle_cache_full",
0055     "rx_pp_recycle_ring",
0056     "rx_pp_recycle_ring_full",
0057     "rx_pp_recycle_released_ref",
0058 };
0059 
0060 bool page_pool_get_stats(struct page_pool *pool,
0061              struct page_pool_stats *stats)
0062 {
0063     int cpu = 0;
0064 
0065     if (!stats)
0066         return false;
0067 
0068     /* The caller is responsible for initializing stats. */
0069     stats->alloc_stats.fast += pool->alloc_stats.fast;
0070     stats->alloc_stats.slow += pool->alloc_stats.slow;
0071     stats->alloc_stats.slow_high_order += pool->alloc_stats.slow_high_order;
0072     stats->alloc_stats.empty += pool->alloc_stats.empty;
0073     stats->alloc_stats.refill += pool->alloc_stats.refill;
0074     stats->alloc_stats.waive += pool->alloc_stats.waive;
0075 
0076     for_each_possible_cpu(cpu) {
0077         const struct page_pool_recycle_stats *pcpu =
0078             per_cpu_ptr(pool->recycle_stats, cpu);
0079 
0080         stats->recycle_stats.cached += pcpu->cached;
0081         stats->recycle_stats.cache_full += pcpu->cache_full;
0082         stats->recycle_stats.ring += pcpu->ring;
0083         stats->recycle_stats.ring_full += pcpu->ring_full;
0084         stats->recycle_stats.released_refcnt += pcpu->released_refcnt;
0085     }
0086 
0087     return true;
0088 }
0089 EXPORT_SYMBOL(page_pool_get_stats);
0090 
0091 u8 *page_pool_ethtool_stats_get_strings(u8 *data)
0092 {
0093     int i;
0094 
0095     for (i = 0; i < ARRAY_SIZE(pp_stats); i++) {
0096         memcpy(data, pp_stats[i], ETH_GSTRING_LEN);
0097         data += ETH_GSTRING_LEN;
0098     }
0099 
0100     return data;
0101 }
0102 EXPORT_SYMBOL(page_pool_ethtool_stats_get_strings);
0103 
0104 int page_pool_ethtool_stats_get_count(void)
0105 {
0106     return ARRAY_SIZE(pp_stats);
0107 }
0108 EXPORT_SYMBOL(page_pool_ethtool_stats_get_count);
0109 
0110 u64 *page_pool_ethtool_stats_get(u64 *data, void *stats)
0111 {
0112     struct page_pool_stats *pool_stats = stats;
0113 
0114     *data++ = pool_stats->alloc_stats.fast;
0115     *data++ = pool_stats->alloc_stats.slow;
0116     *data++ = pool_stats->alloc_stats.slow_high_order;
0117     *data++ = pool_stats->alloc_stats.empty;
0118     *data++ = pool_stats->alloc_stats.refill;
0119     *data++ = pool_stats->alloc_stats.waive;
0120     *data++ = pool_stats->recycle_stats.cached;
0121     *data++ = pool_stats->recycle_stats.cache_full;
0122     *data++ = pool_stats->recycle_stats.ring;
0123     *data++ = pool_stats->recycle_stats.ring_full;
0124     *data++ = pool_stats->recycle_stats.released_refcnt;
0125 
0126     return data;
0127 }
0128 EXPORT_SYMBOL(page_pool_ethtool_stats_get);
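
/* Example: a minimal sketch of how a driver might expose the counters above
 * through its ethtool callbacks when CONFIG_PAGE_POOL_STATS is enabled.
 * Everything prefixed my_* (the priv struct, the pool pointer, the ops) is a
 * hypothetical placeholder; only the page_pool_get_stats() and
 * page_pool_ethtool_*() calls come from this file.
 */

#include <linux/ethtool.h>
#include <linux/netdevice.h>
#include <net/page_pool.h>

struct my_priv {
	struct page_pool *rx_pool;	/* assumed single RX page_pool */
};

static int my_get_sset_count(struct net_device *dev, int sset)
{
	if (sset != ETH_SS_STATS)
		return -EOPNOTSUPP;
	/* A real driver adds its own counter count on top of this */
	return page_pool_ethtool_stats_get_count();
}

static void my_get_strings(struct net_device *dev, u32 sset, u8 *data)
{
	if (sset == ETH_SS_STATS)
		/* A real driver keeps filling its own strings from here on */
		data = page_pool_ethtool_stats_get_strings(data);
}

static void my_get_ethtool_stats(struct net_device *dev,
				 struct ethtool_stats *stats, u64 *data)
{
	struct my_priv *priv = netdev_priv(dev);
	struct page_pool_stats pp_stats = {};	/* caller must zero-init */

	if (page_pool_get_stats(priv->rx_pool, &pp_stats))
		data = page_pool_ethtool_stats_get(data, &pp_stats);
}

static const struct ethtool_ops my_ethtool_ops = {
	.get_sset_count		= my_get_sset_count,
	.get_strings		= my_get_strings,
	.get_ethtool_stats	= my_get_ethtool_stats,
};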
0129 
0130 #else
0131 #define alloc_stat_inc(pool, __stat)
0132 #define recycle_stat_inc(pool, __stat)
0133 #define recycle_stat_add(pool, __stat, val)
0134 #endif
0135 
0136 static int page_pool_init(struct page_pool *pool,
0137               const struct page_pool_params *params)
0138 {
0139     unsigned int ring_qsize = 1024; /* Default */
0140 
0141     memcpy(&pool->p, params, sizeof(pool->p));
0142 
0143     /* Validate only known flags were used */
0144     if (pool->p.flags & ~(PP_FLAG_ALL))
0145         return -EINVAL;
0146 
0147     if (pool->p.pool_size)
0148         ring_qsize = pool->p.pool_size;
0149 
0150     /* Sanity limit mem that can be pinned down */
0151     if (ring_qsize > 32768)
0152         return -E2BIG;
0153 
0154     /* DMA direction is either DMA_FROM_DEVICE or DMA_BIDIRECTIONAL.
0155      * DMA_BIDIRECTIONAL additionally allows the page to be used for DMA
0156      * transmit, which is the XDP_TX use-case.
0157      */
0158     if (pool->p.flags & PP_FLAG_DMA_MAP) {
0159         if ((pool->p.dma_dir != DMA_FROM_DEVICE) &&
0160             (pool->p.dma_dir != DMA_BIDIRECTIONAL))
0161             return -EINVAL;
0162     }
0163 
0164     if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) {
0165         /* In order to request DMA-sync-for-device the page
0166          * needs to be mapped
0167          */
0168         if (!(pool->p.flags & PP_FLAG_DMA_MAP))
0169             return -EINVAL;
0170 
0171         if (!pool->p.max_len)
0172             return -EINVAL;
0173 
0174         /* pool->p.offset has to be set according to the address
0175          * offset used by the DMA engine to start copying rx data
0176          */
0177     }
0178 
0179     if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT &&
0180         pool->p.flags & PP_FLAG_PAGE_FRAG)
0181         return -EINVAL;
0182 
0183 #ifdef CONFIG_PAGE_POOL_STATS
0184     pool->recycle_stats = alloc_percpu(struct page_pool_recycle_stats);
0185     if (!pool->recycle_stats)
0186         return -ENOMEM;
0187 #endif
0188 
0189     if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0)
0190         return -ENOMEM;
0191 
0192     atomic_set(&pool->pages_state_release_cnt, 0);
0193 
0194     /* A driver that calls page_pool_create() must also call page_pool_destroy() */
0195     refcount_set(&pool->user_cnt, 1);
0196 
0197     if (pool->p.flags & PP_FLAG_DMA_MAP)
0198         get_device(pool->p.dev);
0199 
0200     return 0;
0201 }
0202 
0203 struct page_pool *page_pool_create(const struct page_pool_params *params)
0204 {
0205     struct page_pool *pool;
0206     int err;
0207 
0208     pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, params->nid);
0209     if (!pool)
0210         return ERR_PTR(-ENOMEM);
0211 
0212     err = page_pool_init(pool, params);
0213     if (err < 0) {
0214         pr_warn("%s() gave up with errno %d\n", __func__, err);
0215         kfree(pool);
0216         return ERR_PTR(err);
0217     }
0218 
0219     return pool;
0220 }
0221 EXPORT_SYMBOL(page_pool_create);
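
/* Example: a hedged sketch of the create path as a typical driver might use
 * it for one RX queue.  The function and parameter names are invented for
 * illustration; the page_pool_params fields and error handling match the API
 * above.
 */

#include <linux/err.h>
#include <linux/dma-direction.h>
#include <net/page_pool.h>

static struct page_pool *my_create_rx_pool(struct device *dma_dev,
					   int numa_node, u32 rx_ring_size)
{
	struct page_pool_params pp_params = {
		.flags		= PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV,
		.order		= 0,			/* one page per RX buffer */
		.pool_size	= rx_ring_size,		/* capped at 32768 above */
		.nid		= numa_node,		/* or NUMA_NO_NODE */
		.dev		= dma_dev,		/* used for DMA mapping */
		.dma_dir	= DMA_FROM_DEVICE,	/* DMA_BIDIRECTIONAL for XDP_TX */
		.max_len	= PAGE_SIZE,		/* sync length for DMA_SYNC_DEV */
		.offset		= 0,			/* RX headroom before data */
	};
	struct page_pool *pool = page_pool_create(&pp_params);

	/* page_pool_create() returns ERR_PTR() on failure, never NULL */
	return IS_ERR(pool) ? NULL : pool;
}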
0222 
0223 static void page_pool_return_page(struct page_pool *pool, struct page *page);
0224 
0225 noinline
0226 static struct page *page_pool_refill_alloc_cache(struct page_pool *pool)
0227 {
0228     struct ptr_ring *r = &pool->ring;
0229     struct page *page;
0230     int pref_nid; /* preferred NUMA node */
0231 
0232     /* Quicker fallback, avoid locks when ring is empty */
0233     if (__ptr_ring_empty(r)) {
0234         alloc_stat_inc(pool, empty);
0235         return NULL;
0236     }
0237 
0238     /* Softirq guarantees the CPU and thus the NUMA node are stable. This
0239      * assumes the CPU refilling the driver RX-ring also runs RX-NAPI.
0240      */
0241 #ifdef CONFIG_NUMA
0242     pref_nid = (pool->p.nid == NUMA_NO_NODE) ? numa_mem_id() : pool->p.nid;
0243 #else
0244     /* Ignore pool->p.nid setting if !CONFIG_NUMA, helps compiler */
0245     pref_nid = numa_mem_id(); /* will be zero like page_to_nid() */
0246 #endif
0247 
0248     /* Refill alloc array, but only if NUMA match */
0249     do {
0250         page = __ptr_ring_consume(r);
0251         if (unlikely(!page))
0252             break;
0253 
0254         if (likely(page_to_nid(page) == pref_nid)) {
0255             pool->alloc.cache[pool->alloc.count++] = page;
0256         } else {
0257             /* NUMA mismatch:
0258              * (1) release 1 page to the page-allocator and
0259              * (2) break out and fall through to alloc_pages_node.
0260              * This limits stress on the page buddy allocator.
0261              */
0262             page_pool_return_page(pool, page);
0263             alloc_stat_inc(pool, waive);
0264             page = NULL;
0265             break;
0266         }
0267     } while (pool->alloc.count < PP_ALLOC_CACHE_REFILL);
0268 
0269     /* Return last page */
0270     if (likely(pool->alloc.count > 0)) {
0271         page = pool->alloc.cache[--pool->alloc.count];
0272         alloc_stat_inc(pool, refill);
0273     }
0274 
0275     return page;
0276 }
0277 
0278 /* fast path */
0279 static struct page *__page_pool_get_cached(struct page_pool *pool)
0280 {
0281     struct page *page;
0282 
0283     /* Caller MUST guarantee safe non-concurrent access, e.g. softirq */
0284     if (likely(pool->alloc.count)) {
0285         /* Fast-path */
0286         page = pool->alloc.cache[--pool->alloc.count];
0287         alloc_stat_inc(pool, fast);
0288     } else {
0289         page = page_pool_refill_alloc_cache(pool);
0290     }
0291 
0292     return page;
0293 }
0294 
0295 static void page_pool_dma_sync_for_device(struct page_pool *pool,
0296                       struct page *page,
0297                       unsigned int dma_sync_size)
0298 {
0299     dma_addr_t dma_addr = page_pool_get_dma_addr(page);
0300 
0301     dma_sync_size = min(dma_sync_size, pool->p.max_len);
0302     dma_sync_single_range_for_device(pool->p.dev, dma_addr,
0303                      pool->p.offset, dma_sync_size,
0304                      pool->p.dma_dir);
0305 }
0306 
0307 static bool page_pool_dma_map(struct page_pool *pool, struct page *page)
0308 {
0309     dma_addr_t dma;
0310 
0311     /* Setup DMA mapping: use the 'struct page' area to store the DMA addr,
0312      * since dma_addr_t can be either 32 or 64 bits and does not always fit
0313      * into the page private data (i.e. a 32-bit CPU with 64-bit DMA caps).
0314      * The mapping is kept for the page's lifetime, until it leaves the pool.
0315      */
0316     dma = dma_map_page_attrs(pool->p.dev, page, 0,
0317                  (PAGE_SIZE << pool->p.order),
0318                  pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC);
0319     if (dma_mapping_error(pool->p.dev, dma))
0320         return false;
0321 
0322     page_pool_set_dma_addr(page, dma);
0323 
0324     if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
0325         page_pool_dma_sync_for_device(pool, page, pool->p.max_len);
0326 
0327     return true;
0328 }
0329 
0330 static void page_pool_set_pp_info(struct page_pool *pool,
0331                   struct page *page)
0332 {
0333     page->pp = pool;
0334     page->pp_magic |= PP_SIGNATURE;
0335     if (pool->p.init_callback)
0336         pool->p.init_callback(page, pool->p.init_arg);
0337 }
0338 
0339 static void page_pool_clear_pp_info(struct page *page)
0340 {
0341     page->pp_magic = 0;
0342     page->pp = NULL;
0343 }
0344 
0345 static struct page *__page_pool_alloc_page_order(struct page_pool *pool,
0346                          gfp_t gfp)
0347 {
0348     struct page *page;
0349 
0350     gfp |= __GFP_COMP;
0351     page = alloc_pages_node(pool->p.nid, gfp, pool->p.order);
0352     if (unlikely(!page))
0353         return NULL;
0354 
0355     if ((pool->p.flags & PP_FLAG_DMA_MAP) &&
0356         unlikely(!page_pool_dma_map(pool, page))) {
0357         put_page(page);
0358         return NULL;
0359     }
0360 
0361     alloc_stat_inc(pool, slow_high_order);
0362     page_pool_set_pp_info(pool, page);
0363 
0364     /* Track how many pages are held 'in-flight' */
0365     pool->pages_state_hold_cnt++;
0366     trace_page_pool_state_hold(pool, page, pool->pages_state_hold_cnt);
0367     return page;
0368 }
0369 
0370 /* slow path */
0371 noinline
0372 static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
0373                          gfp_t gfp)
0374 {
0375     const int bulk = PP_ALLOC_CACHE_REFILL;
0376     unsigned int pp_flags = pool->p.flags;
0377     unsigned int pp_order = pool->p.order;
0378     struct page *page;
0379     int i, nr_pages;
0380 
0381     /* Don't support bulk alloc for high-order pages */
0382     if (unlikely(pp_order))
0383         return __page_pool_alloc_page_order(pool, gfp);
0384 
0385     /* Unnecessary as alloc cache is empty, but guarantees zero count */
0386     if (unlikely(pool->alloc.count > 0))
0387         return pool->alloc.cache[--pool->alloc.count];
0388 
0389     /* Mark empty alloc.cache slots "empty" for alloc_pages_bulk_array */
0390     memset(&pool->alloc.cache, 0, sizeof(void *) * bulk);
0391 
0392     nr_pages = alloc_pages_bulk_array_node(gfp, pool->p.nid, bulk,
0393                            pool->alloc.cache);
0394     if (unlikely(!nr_pages))
0395         return NULL;
0396 
0397     /* Pages have been filled into the alloc.cache array, but the count is
0398      * zero and the pages have not yet been (possibly) DMA mapped.
0399      */
0400     for (i = 0; i < nr_pages; i++) {
0401         page = pool->alloc.cache[i];
0402         if ((pp_flags & PP_FLAG_DMA_MAP) &&
0403             unlikely(!page_pool_dma_map(pool, page))) {
0404             put_page(page);
0405             continue;
0406         }
0407 
0408         page_pool_set_pp_info(pool, page);
0409         pool->alloc.cache[pool->alloc.count++] = page;
0410         /* Track how many pages are held 'in-flight' */
0411         pool->pages_state_hold_cnt++;
0412         trace_page_pool_state_hold(pool, page,
0413                        pool->pages_state_hold_cnt);
0414     }
0415 
0416     /* Return last page */
0417     if (likely(pool->alloc.count > 0)) {
0418         page = pool->alloc.cache[--pool->alloc.count];
0419         alloc_stat_inc(pool, slow);
0420     } else {
0421         page = NULL;
0422     }
0423 
0424     /* A page that was just alloc'ed should/must have refcnt 1. */
0425     return page;
0426 }
0427 
0428 /* For page_pool users this replaces alloc_pages() API calls, but provides
0429  * a synchronization guarantee for the allocation side.
0430  */
0431 struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)
0432 {
0433     struct page *page;
0434 
0435     /* Fast-path: Get a page from cache */
0436     page = __page_pool_get_cached(pool);
0437     if (page)
0438         return page;
0439 
0440     /* Slow-path: cache empty, do real allocation */
0441     page = __page_pool_alloc_pages_slow(pool, gfp);
0442     return page;
0443 }
0444 EXPORT_SYMBOL(page_pool_alloc_pages);
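
/* Example: a hedged sketch of the allocation fast path from a driver's RX
 * refill loop, running in NAPI (softirq) context.  my_rxq and
 * my_post_rx_buffer() are hypothetical; page_pool_dev_alloc_pages() is the
 * GFP_ATOMIC wrapper and page_pool_get_dma_addr() the DMA-address accessor,
 * both from <net/page_pool.h>.
 */

#include <net/page_pool.h>

struct my_rxq;					/* hypothetical driver RX queue */
void my_post_rx_buffer(struct my_rxq *rxq,	/* hypothetical HW descriptor fill */
		       dma_addr_t dma, struct page *page);

static int my_rx_refill_one(struct my_rxq *rxq, struct page_pool *pool,
			    unsigned int headroom)
{
	struct page *page;
	dma_addr_t dma;

	page = page_pool_dev_alloc_pages(pool);
	if (unlikely(!page))
		return -ENOMEM;

	/* With PP_FLAG_DMA_MAP the pool already mapped the page at alloc time */
	dma = page_pool_get_dma_addr(page);
	my_post_rx_buffer(rxq, dma + headroom, page);
	return 0;
}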
0445 
0446 /* Calculate distance between two u32 values, valid if distance is below 2^(31)
0447  *  https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution
0448  */
0449 #define _distance(a, b) (s32)((a) - (b))
0450 
0451 static s32 page_pool_inflight(struct page_pool *pool)
0452 {
0453     u32 release_cnt = atomic_read(&pool->pages_state_release_cnt);
0454     u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt);
0455     s32 inflight;
0456 
0457     inflight = _distance(hold_cnt, release_cnt);
0458 
0459     trace_page_pool_release(pool, inflight, hold_cnt, release_cnt);
0460     WARN(inflight < 0, "Negative(%d) inflight packet-pages", inflight);
0461 
0462     return inflight;
0463 }
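
/* Worked example of the wrap-safe arithmetic above: if hold_cnt has wrapped
 * around to 0x00000002 while release_cnt is still 0xfffffffe, then
 *
 *	_distance(0x00000002, 0xfffffffe) = (s32)(0x00000002 - 0xfffffffe)
 *					  = (s32)0x00000004 = 4
 *
 * i.e. 4 pages are still in flight, which is correct despite the u32
 * wraparound as long as the true distance stays below 2^31.
 */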
0464 
0465 /* Disconnects a page from a page_pool.  API users can have a need
0466  * to disconnect a page from its page_pool, to allow it to be used as
0467  * a regular page (that will eventually be returned to the normal
0468  * page-allocator via put_page()).
0469  */
0470 void page_pool_release_page(struct page_pool *pool, struct page *page)
0471 {
0472     dma_addr_t dma;
0473     int count;
0474 
0475     if (!(pool->p.flags & PP_FLAG_DMA_MAP))
0476         /* Always account for inflight pages, even if we didn't
0477          * map them
0478          */
0479         goto skip_dma_unmap;
0480 
0481     dma = page_pool_get_dma_addr(page);
0482 
0483     /* When page is unmapped, it cannot be returned to our pool */
0484     dma_unmap_page_attrs(pool->p.dev, dma,
0485                  PAGE_SIZE << pool->p.order, pool->p.dma_dir,
0486                  DMA_ATTR_SKIP_CPU_SYNC);
0487     page_pool_set_dma_addr(page, 0);
0488 skip_dma_unmap:
0489     page_pool_clear_pp_info(page);
0490 
0491     /* This may be the last page returned, releasing the pool, so
0492      * it is not safe to reference pool afterwards.
0493      */
0494     count = atomic_inc_return_relaxed(&pool->pages_state_release_cnt);
0495     trace_page_pool_state_release(pool, page, count);
0496 }
0497 EXPORT_SYMBOL(page_pool_release_page);
0498 
0499 /* Return a page to the page allocator, cleaning up our state */
0500 static void page_pool_return_page(struct page_pool *pool, struct page *page)
0501 {
0502     page_pool_release_page(pool, page);
0503 
0504     put_page(page);
0505     /* An optimization would be to call __free_pages(page, pool->p.order)
0506      * knowing page is not part of page-cache (thus avoiding a
0507      * __page_cache_release() call).
0508      */
0509 }
0510 
0511 static bool page_pool_recycle_in_ring(struct page_pool *pool, struct page *page)
0512 {
0513     int ret;
0514     /* BH protection not needed if current is serving softirq */
0515     if (in_serving_softirq())
0516         ret = ptr_ring_produce(&pool->ring, page);
0517     else
0518         ret = ptr_ring_produce_bh(&pool->ring, page);
0519 
0520     if (!ret) {
0521         recycle_stat_inc(pool, ring);
0522         return true;
0523     }
0524 
0525     return false;
0526 }
0527 
0528 /* Only allow direct recycling in special circumstances, into the
0529  * alloc side cache.  E.g. during RX-NAPI processing for XDP_DROP use-case.
0530  *
0531  * Caller must provide appropriate safe context.
0532  */
0533 static bool page_pool_recycle_in_cache(struct page *page,
0534                        struct page_pool *pool)
0535 {
0536     if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE)) {
0537         recycle_stat_inc(pool, cache_full);
0538         return false;
0539     }
0540 
0541     /* Caller MUST have verified/know (page_ref_count(page) == 1) */
0542     pool->alloc.cache[pool->alloc.count++] = page;
0543     recycle_stat_inc(pool, cached);
0544     return true;
0545 }
0546 
0547 /* If the page refcnt == 1, this will try to recycle the page.
0548  * If PP_FLAG_DMA_SYNC_DEV is set, we'll try to sync the DMA area for
0549  * the configured size min(dma_sync_size, pool->p.max_len).
0550  * If the page refcnt != 1, then the page will be returned to the memory
0551  * subsystem.
0552  */
0553 static __always_inline struct page *
0554 __page_pool_put_page(struct page_pool *pool, struct page *page,
0555              unsigned int dma_sync_size, bool allow_direct)
0556 {
0557     /* This allocator is optimized for the XDP mode that uses
0558      * one-frame-per-page, but has fallbacks that act like the
0559      * regular page allocator APIs.
0560      *
0561      * refcnt == 1 means page_pool owns the page and can recycle it.
0562      *
0563      * A page is NOT reusable when it was allocated while the system
0564      * was under memory pressure (page_is_pfmemalloc).
0565      */
0566     if (likely(page_ref_count(page) == 1 && !page_is_pfmemalloc(page))) {
0567         /* Read barrier done in page_ref_count / READ_ONCE */
0568 
0569         if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
0570             page_pool_dma_sync_for_device(pool, page,
0571                               dma_sync_size);
0572 
0573         if (allow_direct && in_serving_softirq() &&
0574             page_pool_recycle_in_cache(page, pool))
0575             return NULL;
0576 
0577         /* Page found as candidate for recycling */
0578         return page;
0579     }
0580     /* Fallback/non-XDP mode: the API user has an elevated refcnt.
0581      *
0582      * Many drivers split up the page into fragments, and some
0583      * want to keep doing this to save memory and do refcnt based
0584      * recycling. Support this use case too, to ease drivers
0585      * switching between XDP/non-XDP.
0586      *
0587      * In case page_pool maintains the DMA mapping, the API user must
0588      * call page_pool_put_page() once.  In this elevated refcnt
0589      * case, the DMA mapping is unmapped/released here, as the driver
0590      * is likely doing refcnt based recycle tricks, meaning another
0591      * process will be invoking put_page().
0592      */
0593     recycle_stat_inc(pool, released_refcnt);
0594     /* Do not replace this with page_pool_return_page() */
0595     page_pool_release_page(pool, page);
0596     put_page(page);
0597 
0598     return NULL;
0599 }
0600 
0601 void page_pool_put_defragged_page(struct page_pool *pool, struct page *page,
0602                   unsigned int dma_sync_size, bool allow_direct)
0603 {
0604     page = __page_pool_put_page(pool, page, dma_sync_size, allow_direct);
0605     if (page && !page_pool_recycle_in_ring(pool, page)) {
0606         /* Ring full, fall back to freeing the page */
0607         recycle_stat_inc(pool, ring_full);
0608         page_pool_return_page(pool, page);
0609     }
0610 }
0611 EXPORT_SYMBOL(page_pool_put_defragged_page);
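
/* Example: a hedged sketch of the return path from a driver's XDP/RX-NAPI
 * loop.  The verdict handling is invented for illustration;
 * page_pool_recycle_direct() and page_pool_put_full_page() are inline
 * helpers from <net/page_pool.h> that funnel into
 * page_pool_put_defragged_page() above.
 */

#include <linux/bpf.h>		/* XDP_DROP / XDP_PASS verdicts */
#include <net/page_pool.h>

static void my_handle_rx_page(struct page_pool *pool, struct page *page,
			      u32 xdp_verdict, bool in_napi)
{
	switch (xdp_verdict) {
	case XDP_PASS:
		/* skb path: see page_pool_return_skb_page() below */
		break;
	case XDP_DROP:
		if (in_napi) {
			/* Safe context: recycle straight into the alloc cache */
			page_pool_recycle_direct(pool, page);
			break;
		}
		fallthrough;
	default:
		/* Full put: may recycle via the ptr_ring or free the page */
		page_pool_put_full_page(pool, page, false);
		break;
	}
}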
0612 
0613 /* Caller must not use data area after call, as this function overwrites it */
0614 void page_pool_put_page_bulk(struct page_pool *pool, void **data,
0615                  int count)
0616 {
0617     int i, bulk_len = 0;
0618 
0619     for (i = 0; i < count; i++) {
0620         struct page *page = virt_to_head_page(data[i]);
0621 
0622         /* In the page frag case, skip if we are not the last user of the page */
0623         if (!page_pool_is_last_frag(pool, page))
0624             continue;
0625 
0626         page = __page_pool_put_page(pool, page, -1, false);
0627         /* Approved for bulk recycling in ptr_ring cache */
0628         if (page)
0629             data[bulk_len++] = page;
0630     }
0631 
0632     if (unlikely(!bulk_len))
0633         return;
0634 
0635     /* Bulk producer into ptr_ring page_pool cache */
0636     page_pool_ring_lock(pool);
0637     for (i = 0; i < bulk_len; i++) {
0638         if (__ptr_ring_produce(&pool->ring, data[i])) {
0639             /* ring full */
0640             recycle_stat_inc(pool, ring_full);
0641             break;
0642         }
0643     }
0644     recycle_stat_add(pool, ring, i);
0645     page_pool_ring_unlock(pool);
0646 
0647     /* Hopefully all pages were returned into the ptr_ring */
0648     if (likely(i == bulk_len))
0649         return;
0650 
0651     /* ptr_ring cache full, free remaining pages outside producer lock
0652      * since put_page() with refcnt == 1 can be an expensive operation
0653      */
0654     for (; i < bulk_len; i++)
0655         page_pool_return_page(pool, data[i]);
0656 }
0657 EXPORT_SYMBOL(page_pool_put_page_bulk);
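
/* Example: a hedged sketch of a bulk return, as an XDP_TX/XDP_REDIRECT
 * completion path might do it.  The my_txq descriptor helpers are invented;
 * note the array holds the buffers' kernel virtual addresses, which
 * page_pool_put_page_bulk() maps back to pages via virt_to_head_page().
 * In-tree, xdp_return_frame_bulk() in net/core/xdp.c batches frames this way.
 */

#include <linux/kernel.h>
#include <net/page_pool.h>

struct my_txq;						/* hypothetical TX queue */
bool my_txq_has_completed(struct my_txq *txq);		/* hypothetical helpers */
void *my_txq_pop_completed_buf(struct my_txq *txq);	/* returns buffer virt addr */

static void my_tx_clean(struct my_txq *txq, struct page_pool *pool)
{
	void *bufs[64];
	int n = 0;

	while (n < ARRAY_SIZE(bufs) && my_txq_has_completed(txq))
		bufs[n++] = my_txq_pop_completed_buf(txq);

	if (n)
		page_pool_put_page_bulk(pool, bufs, n);
}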
0658 
0659 static struct page *page_pool_drain_frag(struct page_pool *pool,
0660                      struct page *page)
0661 {
0662     long drain_count = BIAS_MAX - pool->frag_users;
0663 
0664     /* Some user is still using the page frag */
0665     if (likely(page_pool_defrag_page(page, drain_count)))
0666         return NULL;
0667 
0668     if (page_ref_count(page) == 1 && !page_is_pfmemalloc(page)) {
0669         if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
0670             page_pool_dma_sync_for_device(pool, page, -1);
0671 
0672         return page;
0673     }
0674 
0675     page_pool_return_page(pool, page);
0676     return NULL;
0677 }
0678 
0679 static void page_pool_free_frag(struct page_pool *pool)
0680 {
0681     long drain_count = BIAS_MAX - pool->frag_users;
0682     struct page *page = pool->frag_page;
0683 
0684     pool->frag_page = NULL;
0685 
0686     if (!page || page_pool_defrag_page(page, drain_count))
0687         return;
0688 
0689     page_pool_return_page(pool, page);
0690 }
0691 
0692 struct page *page_pool_alloc_frag(struct page_pool *pool,
0693                   unsigned int *offset,
0694                   unsigned int size, gfp_t gfp)
0695 {
0696     unsigned int max_size = PAGE_SIZE << pool->p.order;
0697     struct page *page = pool->frag_page;
0698 
0699     if (WARN_ON(!(pool->p.flags & PP_FLAG_PAGE_FRAG) ||
0700             size > max_size))
0701         return NULL;
0702 
0703     size = ALIGN(size, dma_get_cache_alignment());
0704     *offset = pool->frag_offset;
0705 
0706     if (page && *offset + size > max_size) {
0707         page = page_pool_drain_frag(pool, page);
0708         if (page) {
0709             alloc_stat_inc(pool, fast);
0710             goto frag_reset;
0711         }
0712     }
0713 
0714     if (!page) {
0715         page = page_pool_alloc_pages(pool, gfp);
0716         if (unlikely(!page)) {
0717             pool->frag_page = NULL;
0718             return NULL;
0719         }
0720 
0721         pool->frag_page = page;
0722 
0723 frag_reset:
0724         pool->frag_users = 1;
0725         *offset = 0;
0726         pool->frag_offset = size;
0727         page_pool_fragment_page(page, BIAS_MAX);
0728         return page;
0729     }
0730 
0731     pool->frag_users++;
0732     pool->frag_offset = *offset + size;
0733     alloc_stat_inc(pool, fast);
0734     return page;
0735 }
0736 EXPORT_SYMBOL(page_pool_alloc_frag);
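
/* Example: a hedged sketch of the frag API for sub-page RX buffers.  The
 * pool is assumed to have been created with PP_FLAG_PAGE_FRAG and order 0;
 * the 2K buffer size is an arbitrary illustration.
 * page_pool_dev_alloc_frag() is the GFP_ATOMIC wrapper from <net/page_pool.h>.
 */

#include <net/page_pool.h>

static struct page *my_alloc_half_page(struct page_pool *pool, dma_addr_t *dma)
{
	unsigned int offset;
	struct page *page;

	page = page_pool_dev_alloc_frag(pool, &offset, 2048);
	if (unlikely(!page))
		return NULL;

	/* DMA address of this fragment within the (possibly shared) page */
	*dma = page_pool_get_dma_addr(page) + offset;
	return page;
}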
0737 
0738 static void page_pool_empty_ring(struct page_pool *pool)
0739 {
0740     struct page *page;
0741 
0742     /* Empty recycle ring */
0743     while ((page = ptr_ring_consume_bh(&pool->ring))) {
0744         /* Verify the refcnt invariant of cached pages */
0745         if (!(page_ref_count(page) == 1))
0746             pr_crit("%s() page_pool refcnt %d violation\n",
0747                 __func__, page_ref_count(page));
0748 
0749         page_pool_return_page(pool, page);
0750     }
0751 }
0752 
0753 static void page_pool_free(struct page_pool *pool)
0754 {
0755     if (pool->disconnect)
0756         pool->disconnect(pool);
0757 
0758     ptr_ring_cleanup(&pool->ring, NULL);
0759 
0760     if (pool->p.flags & PP_FLAG_DMA_MAP)
0761         put_device(pool->p.dev);
0762 
0763 #ifdef CONFIG_PAGE_POOL_STATS
0764     free_percpu(pool->recycle_stats);
0765 #endif
0766     kfree(pool);
0767 }
0768 
0769 static void page_pool_empty_alloc_cache_once(struct page_pool *pool)
0770 {
0771     struct page *page;
0772 
0773     if (pool->destroy_cnt)
0774         return;
0775 
0776     /* Empty the alloc cache; assume the caller made sure the pool is
0777      * no longer in use, and that page_pool_alloc_pages() cannot be
0778      * called concurrently.
0779      */
0780     while (pool->alloc.count) {
0781         page = pool->alloc.cache[--pool->alloc.count];
0782         page_pool_return_page(pool, page);
0783     }
0784 }
0785 
0786 static void page_pool_scrub(struct page_pool *pool)
0787 {
0788     page_pool_empty_alloc_cache_once(pool);
0789     pool->destroy_cnt++;
0790 
0791     /* No more consumers should exist, but producers could still
0792      * be in-flight.
0793      */
0794     page_pool_empty_ring(pool);
0795 }
0796 
0797 static int page_pool_release(struct page_pool *pool)
0798 {
0799     int inflight;
0800 
0801     page_pool_scrub(pool);
0802     inflight = page_pool_inflight(pool);
0803     if (!inflight)
0804         page_pool_free(pool);
0805 
0806     return inflight;
0807 }
0808 
0809 static void page_pool_release_retry(struct work_struct *wq)
0810 {
0811     struct delayed_work *dwq = to_delayed_work(wq);
0812     struct page_pool *pool = container_of(dwq, typeof(*pool), release_dw);
0813     int inflight;
0814 
0815     inflight = page_pool_release(pool);
0816     if (!inflight)
0817         return;
0818 
0819     /* Periodic warning */
0820     if (time_after_eq(jiffies, pool->defer_warn)) {
0821         int sec = (s32)((u32)jiffies - (u32)pool->defer_start) / HZ;
0822 
0823         pr_warn("%s() stalled pool shutdown %d inflight %d sec\n",
0824             __func__, inflight, sec);
0825         pool->defer_warn = jiffies + DEFER_WARN_INTERVAL;
0826     }
0827 
0828     /* Still not ready to be disconnected, retry later */
0829     schedule_delayed_work(&pool->release_dw, DEFER_TIME);
0830 }
0831 
0832 void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *),
0833                struct xdp_mem_info *mem)
0834 {
0835     refcount_inc(&pool->user_cnt);
0836     pool->disconnect = disconnect;
0837     pool->xdp_mem_id = mem->id;
0838 }
0839 
0840 void page_pool_destroy(struct page_pool *pool)
0841 {
0842     if (!pool)
0843         return;
0844 
0845     if (!page_pool_put(pool))
0846         return;
0847 
0848     page_pool_free_frag(pool);
0849 
0850     if (!page_pool_release(pool))
0851         return;
0852 
0853     pool->defer_start = jiffies;
0854     pool->defer_warn  = jiffies + DEFER_WARN_INTERVAL;
0855 
0856     INIT_DELAYED_WORK(&pool->release_dw, page_pool_release_retry);
0857     schedule_delayed_work(&pool->release_dw, DEFER_TIME);
0858 }
0859 EXPORT_SYMBOL(page_pool_destroy);
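
/* Example: a hedged sketch of setup/teardown ordering as seen from a driver
 * using the pool as an XDP memory model.  Registering the model is what ends
 * up calling page_pool_use_xdp_mem() above (taking an extra user reference),
 * so both the unregister and the driver's own page_pool_destroy() are needed;
 * the final caller triggers the real release, possibly via the deferred
 * release work while packets are still in flight.  The rxq naming is
 * illustrative.
 */

#include <net/xdp.h>
#include <net/page_pool.h>

static int my_rxq_set_mem_model(struct xdp_rxq_info *xdp_rxq,
				struct page_pool *pool)
{
	return xdp_rxq_info_reg_mem_model(xdp_rxq, MEM_TYPE_PAGE_POOL, pool);
}

static void my_rxq_teardown(struct xdp_rxq_info *xdp_rxq,
			    struct page_pool *pool)
{
	xdp_rxq_info_unreg_mem_model(xdp_rxq);
	page_pool_destroy(pool);	/* may defer while pages are in flight */
}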
0860 
0861 /* Caller must provide appropriate safe context, e.g. NAPI. */
0862 void page_pool_update_nid(struct page_pool *pool, int new_nid)
0863 {
0864     struct page *page;
0865 
0866     trace_page_pool_update_nid(pool, new_nid);
0867     pool->p.nid = new_nid;
0868 
0869     /* Flush pool alloc cache, as refill will check NUMA node */
0870     while (pool->alloc.count) {
0871         page = pool->alloc.cache[--pool->alloc.count];
0872         page_pool_return_page(pool, page);
0873     }
0874 }
0875 EXPORT_SYMBOL(page_pool_update_nid);
0876 
0877 bool page_pool_return_skb_page(struct page *page)
0878 {
0879     struct page_pool *pp;
0880 
0881     page = compound_head(page);
0882 
0883     /* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation
0884      * in order to preserve any existing bits, such as bit 0 for the
0885      * head page of compound page and bit 1 for pfmemalloc page, so
0886      * mask those bits for freeing side when doing below checking,
0887      * and page_is_pfmemalloc() is checked in __page_pool_put_page()
0888      * to avoid recycling the pfmemalloc page.
0889      */
0890     if (unlikely((page->pp_magic & ~0x3UL) != PP_SIGNATURE))
0891         return false;
0892 
0893     pp = page->pp;
0894 
0895     /* The driver sets this to the memory recycling info. Reset it on recycle.
0896      * This will *not* work for a NIC using a split-page memory model.
0897      * The page will be returned to the pool here regardless of whether
0898      * the 'flipped' fragment is in use or not.
0899      */
0900     page_pool_put_full_page(pp, page, false);
0901 
0902     return true;
0903 }
0904 EXPORT_SYMBOL(page_pool_return_skb_page);
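
/* Example: a hedged sketch of how the skb recycling path above is reached.
 * A driver building skbs on top of page_pool pages marks them with
 * skb_mark_for_recycle(); when the skb is freed, the skb head/frag release
 * code calls page_pool_return_skb_page() for pages whose pp_magic carries
 * PP_SIGNATURE.  The buffer layout (headroom/len) is a simplified assumption.
 */

#include <linux/mm.h>
#include <linux/skbuff.h>
#include <net/page_pool.h>

static struct sk_buff *my_build_rx_skb(struct page *page,
				       unsigned int headroom,
				       unsigned int len)
{
	struct sk_buff *skb;

	skb = build_skb(page_address(page), PAGE_SIZE);
	if (unlikely(!skb))
		return NULL;	/* caller recycles the page itself */

	skb_reserve(skb, headroom);
	skb_put(skb, len);
	skb_mark_for_recycle(skb);	/* opt in to page_pool recycling */
	return skb;			/* hand to napi_gro_receive() etc. */
}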