// SPDX-License-Identifier: GPL-2.0
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/page_reporting.h>
#include <linux/gfp.h>
#include <linux/export.h>
#include <linux/module.h>
#include <linux/delay.h>
#include <linux/scatterlist.h>

#include "page_reporting.h"
#include "internal.h"

unsigned int page_reporting_order = MAX_ORDER;
module_param(page_reporting_order, uint, 0644);
MODULE_PARM_DESC(page_reporting_order, "Set page reporting order");

#define PAGE_REPORTING_DELAY    (2 * HZ)
static struct page_reporting_dev_info __rcu *pr_dev_info __read_mostly;

enum {
    PAGE_REPORTING_IDLE = 0,
    PAGE_REPORTING_REQUESTED,
    PAGE_REPORTING_ACTIVE
};
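
/*
 * State machine: __page_reporting_request() moves IDLE -> REQUESTED and
 * schedules the worker; page_reporting_process() runs as ACTIVE for the
 * duration of a pass, then either drops back to IDLE or, if another
 * request arrived mid-pass (REQUESTED), reschedules itself.
 */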

/* request page reporting */
static void
__page_reporting_request(struct page_reporting_dev_info *prdev)
{
    unsigned int state;

    /* Check to see if we are in desired state */
    state = atomic_read(&prdev->state);
    if (state == PAGE_REPORTING_REQUESTED)
        return;

    /*
     * If reporting is already active there is nothing we need to do.
     * Test against 0 as that represents PAGE_REPORTING_IDLE.
     */
    state = atomic_xchg(&prdev->state, PAGE_REPORTING_REQUESTED);
    if (state != PAGE_REPORTING_IDLE)
        return;

    /*
     * Delay the start of work to allow a sizable queue to build. For
     * now we are limiting this to running no more than once every
     * couple of seconds.
     */
    schedule_delayed_work(&prdev->work, PAGE_REPORTING_DELAY);
}

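/*
 * Note: the free path does not call __page_reporting_notify() directly.
 * page_reporting_notify_free() in page_reporting.h first checks the
 * page_reporting_enabled static key and the freed page's order, and only
 * then falls through to this slower, RCU-protected path.
 */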
/* notify prdev of free page reporting request */
void __page_reporting_notify(void)
{
    struct page_reporting_dev_info *prdev;

    /*
     * We use RCU to protect the pr_dev_info pointer. In almost all
     * cases it should be present; however, in the unlikely case of
     * a shutdown it will be NULL and we should exit.
     */
    rcu_read_lock();
    prdev = rcu_dereference(pr_dev_info);
    if (likely(prdev))
        __page_reporting_request(prdev);

    rcu_read_unlock();
}

static void
page_reporting_drain(struct page_reporting_dev_info *prdev,
             struct scatterlist *sgl, unsigned int nents, bool reported)
{
    struct scatterlist *sg = sgl;

    /*
     * Drain the now reported pages back into their respective
     * free lists/areas. We assume at least one page is populated.
     */
    do {
        struct page *page = sg_page(sg);
        int mt = get_pageblock_migratetype(page);
        unsigned int order = get_order(sg->length);

        __putback_isolated_page(page, order, mt);

        /* If the pages were not reported due to an error, skip flagging */
        if (!reported)
            continue;

        /*
         * If the page was not comingled with another page we can
         * consider the result to be "reported" since the page
         * hasn't been modified; otherwise we will need to
         * report on the new larger page when we make our way
         * up to that higher order.
         */
        if (PageBuddy(page) && buddy_order(page) == order)
            __SetPageReported(page);
    } while ((sg = sg_next(sg)));

    /* reinitialize scatterlist now that it is empty */
    sg_init_table(sgl, nents);
}

/*
 * The page reporting cycle consists of 4 stages: fill, report, drain, and
 * idle. We will cycle through the first 3 stages until we cannot obtain a
 * full scatterlist of pages; in that case we will switch to idle.
 */
static int
page_reporting_cycle(struct page_reporting_dev_info *prdev, struct zone *zone,
             unsigned int order, unsigned int mt,
             struct scatterlist *sgl, unsigned int *offset)
{
    struct free_area *area = &zone->free_area[order];
    struct list_head *list = &area->free_list[mt];
    unsigned int page_len = PAGE_SIZE << order;
    struct page *page, *next;
    long budget;
    int err = 0;

    /*
     * Perform an early check: if the free area is empty there is
     * nothing to process, so we can skip this free_list.
     */
    if (list_empty(list))
        return err;

    spin_lock_irq(&zone->lock);

    /*
     * Limit how many calls we will be making to the page reporting
     * device for this list. By doing this we avoid processing any
     * given list for too long.
     *
     * The current value used allows us enough calls to process over a
     * sixteenth of the current list plus one additional call to handle
     * any pages that may have already been present from the previous
     * list processed. This should result in us reporting all pages on
     * an idle system in about 30 seconds.
     *
     * The division here should be cheap since PAGE_REPORTING_CAPACITY
     * should always be a power of 2.
     */
    budget = DIV_ROUND_UP(area->nr_free, PAGE_REPORTING_CAPACITY * 16);
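    /*
     * Worked example (assuming PAGE_REPORTING_CAPACITY is 32, as defined in
     * page_reporting.h): a free list holding 4096 blocks yields
     * DIV_ROUND_UP(4096, 32 * 16) = 8 report calls, i.e. up to 8 * 32 = 256
     * blocks, a sixteenth of the list, per pass through this function.
     */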

    /* loop through free list adding unreported pages to sg list */
    list_for_each_entry_safe(page, next, list, lru) {
        /* We are going to skip over the reported pages. */
        if (PageReported(page))
            continue;

        /*
         * If we fully consumed our budget then update our
         * state to indicate that we are requesting additional
         * processing and exit this list.
         */
        if (budget < 0) {
            atomic_set(&prdev->state, PAGE_REPORTING_REQUESTED);
            next = page;
            break;
        }

        /* Attempt to pull page from list and place in scatterlist */
        if (*offset) {
            if (!__isolate_free_page(page, order)) {
                next = page;
                break;
            }

            /* Add page to scatter list */
            --(*offset);
            sg_set_page(&sgl[*offset], page, page_len, 0);

            continue;
        }

        /*
         * Make the first non-reported page in the free list
         * the new head of the free list before we release the
         * zone lock.
         */
        if (!list_is_first(&page->lru, list))
            list_rotate_to_front(&page->lru, list);

        /* release lock before waiting on report processing */
        spin_unlock_irq(&zone->lock);

        /* begin processing pages in local list */
        err = prdev->report(prdev, sgl, PAGE_REPORTING_CAPACITY);

        /* reset offset since the full list was reported */
        *offset = PAGE_REPORTING_CAPACITY;

        /* update budget to reflect call to report function */
        budget--;

        /* reacquire zone lock and resume processing */
        spin_lock_irq(&zone->lock);

        /* flush reported pages from the sg list */
        page_reporting_drain(prdev, sgl, PAGE_REPORTING_CAPACITY, !err);

        /*
         * Reset next to the first entry; the old next isn't valid
         * since we dropped the lock to report the pages.
         */
        next = list_first_entry(list, struct page, lru);

        /* exit on error */
        if (err)
            break;
    }

    /* Rotate any leftover pages to the head of the freelist */
    if (!list_entry_is_head(next, list, lru) && !list_is_first(&next->lru, list))
        list_rotate_to_front(&next->lru, list);

    spin_unlock_irq(&zone->lock);

    return err;
}

static int
page_reporting_process_zone(struct page_reporting_dev_info *prdev,
                struct scatterlist *sgl, struct zone *zone)
{
    unsigned int order, mt, leftover, offset = PAGE_REPORTING_CAPACITY;
    unsigned long watermark;
    int err = 0;

    /* Generate minimum watermark to be able to guarantee progress */
    watermark = low_wmark_pages(zone) +
            (PAGE_REPORTING_CAPACITY << page_reporting_order);
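    /*
     * Worked example (assuming PAGE_REPORTING_CAPACITY is 32, as defined in
     * page_reporting.h, and a reporting order of 9, i.e. pageblock_order on
     * x86): the cushion above the low watermark is 32 << 9 = 16384 pages,
     * or 64 MiB with 4 KiB base pages.
     */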

    /*
     * Cancel the request if there is insufficient free memory in the
     * zone to guarantee forward progress.
     */
    if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
        return err;

    /* Process each free list starting from lowest order/mt */
    for (order = page_reporting_order; order < MAX_ORDER; order++) {
        for (mt = 0; mt < MIGRATE_TYPES; mt++) {
            /* We do not pull pages from the isolate free list */
            if (is_migrate_isolate(mt))
                continue;

            err = page_reporting_cycle(prdev, zone, order, mt,
                           sgl, &offset);
            if (err)
                return err;
        }
    }

    /* report the leftover pages before going idle */
    leftover = PAGE_REPORTING_CAPACITY - offset;
    if (leftover) {
        sgl = &sgl[offset];
        err = prdev->report(prdev, sgl, leftover);

        /* flush any remaining pages out from the last report */
        spin_lock_irq(&zone->lock);
        page_reporting_drain(prdev, sgl, leftover, !err);
        spin_unlock_irq(&zone->lock);
    }

    return err;
}

static void page_reporting_process(struct work_struct *work)
{
    struct delayed_work *d_work = to_delayed_work(work);
    struct page_reporting_dev_info *prdev =
        container_of(d_work, struct page_reporting_dev_info, work);
    int err = 0, state = PAGE_REPORTING_ACTIVE;
    struct scatterlist *sgl;
    struct zone *zone;

    /*
     * Change the state to "Active" so that we can track whether anyone
     * requests page reporting after we complete our pass. If the state
     * is not altered by the end of the pass we will switch to idle and
     * quit scheduling reporting runs.
     */
    atomic_set(&prdev->state, state);

    /* allocate scatterlist to store pages being reported on */
    sgl = kmalloc_array(PAGE_REPORTING_CAPACITY, sizeof(*sgl), GFP_KERNEL);
    if (!sgl)
        goto err_out;

    sg_init_table(sgl, PAGE_REPORTING_CAPACITY);

    for_each_zone(zone) {
        err = page_reporting_process_zone(prdev, sgl, zone);
        if (err)
            break;
    }

    kfree(sgl);
err_out:
    /*
     * If the state has reverted back to requested then there may be
     * additional pages to be processed. We will defer for 2s to allow
     * more pages to accumulate.
     */
    state = atomic_cmpxchg(&prdev->state, state, PAGE_REPORTING_IDLE);
    if (state == PAGE_REPORTING_REQUESTED)
        schedule_delayed_work(&prdev->work, PAGE_REPORTING_DELAY);
}

static DEFINE_MUTEX(page_reporting_mutex);
DEFINE_STATIC_KEY_FALSE(page_reporting_enabled);

int page_reporting_register(struct page_reporting_dev_info *prdev)
{
    int err = 0;

    mutex_lock(&page_reporting_mutex);

    /* nothing to do if already in use */
    if (rcu_access_pointer(pr_dev_info)) {
        err = -EBUSY;
        goto err_out;
    }

    /*
     * Update the page reporting order if it's specified by the driver.
     * Otherwise, it falls back to @pageblock_order.
     */
    page_reporting_order = prdev->order ? : pageblock_order;

    /* initialize state and work structures */
    atomic_set(&prdev->state, PAGE_REPORTING_IDLE);
    INIT_DELAYED_WORK(&prdev->work, &page_reporting_process);

    /* Begin initial flush of zones */
    __page_reporting_request(prdev);

    /* Assign device to allow notifications */
    rcu_assign_pointer(pr_dev_info, prdev);

    /* enable page reporting notification */
    if (!static_key_enabled(&page_reporting_enabled)) {
        static_branch_enable(&page_reporting_enabled);
        pr_info("Free page reporting enabled\n");
    }
err_out:
    mutex_unlock(&page_reporting_mutex);

    return err;
}
EXPORT_SYMBOL_GPL(page_reporting_register);

void page_reporting_unregister(struct page_reporting_dev_info *prdev)
{
    mutex_lock(&page_reporting_mutex);

    if (rcu_access_pointer(pr_dev_info) == prdev) {
        /* Disable page reporting notification */
        RCU_INIT_POINTER(pr_dev_info, NULL);
        synchronize_rcu();

        /* Flush any existing work, and lock it out */
        cancel_delayed_work_sync(&prdev->work);
    }

    mutex_unlock(&page_reporting_mutex);
}
EXPORT_SYMBOL_GPL(page_reporting_unregister);
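
/*
 * Illustrative sketch (not part of the original file): the rough shape of a
 * driver hooking into this API. The names example_report_pages and
 * example_dev are hypothetical; a real user such as virtio_balloon hands
 * each scatterlist batch to its host before returning. Guarded by #if 0 so
 * it is documentation only.
 */
#if 0
static int example_report_pages(struct page_reporting_dev_info *prdev,
                struct scatterlist *sgl, unsigned int nents)
{
    struct scatterlist *sg;
    unsigned int i;

    /* Tell the backend that each of these page ranges is free. */
    for_each_sg(sgl, sg, nents, i)
        pr_debug("reporting pfn %lx, len %u\n",
             page_to_pfn(sg_page(sg)), sg->length);

    /* Returning 0 lets page_reporting_drain() flag the pages as reported. */
    return 0;
}

static struct page_reporting_dev_info example_dev = {
    .report = example_report_pages,
    /* 0 means page_reporting_register() falls back to pageblock_order */
    .order = 0,
};

static int __init example_init(void)
{
    return page_reporting_register(&example_dev);
}

static void __exit example_exit(void)
{
    page_reporting_unregister(&example_dev);
}

module_init(example_init);
module_exit(example_exit);
#endif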