0001 /*
0002  *  linux/mm/vmstat.c
0003  *
0004  *  Manages VM statistics
0005  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
0006  *
0007  *  zoned VM statistics
0008  *  Copyright (C) 2006 Silicon Graphics, Inc.,
0009  *      Christoph Lameter <christoph@lameter.com>
0010  *  Copyright (C) 2008-2014 Christoph Lameter
0011  */
0012 #include <linux/fs.h>
0013 #include <linux/mm.h>
0014 #include <linux/err.h>
0015 #include <linux/module.h>
0016 #include <linux/slab.h>
0017 #include <linux/cpu.h>
0018 #include <linux/cpumask.h>
0019 #include <linux/vmstat.h>
0020 #include <linux/proc_fs.h>
0021 #include <linux/seq_file.h>
0022 #include <linux/debugfs.h>
0023 #include <linux/sched.h>
0024 #include <linux/math64.h>
0025 #include <linux/writeback.h>
0026 #include <linux/compaction.h>
0027 #include <linux/mm_inline.h>
0028 #include <linux/page_ext.h>
0029 #include <linux/page_owner.h>
0030 
0031 #include "internal.h"
0032 
0033 #ifdef CONFIG_VM_EVENT_COUNTERS
0034 DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
0035 EXPORT_PER_CPU_SYMBOL(vm_event_states);
0036 
0037 static void sum_vm_events(unsigned long *ret)
0038 {
0039     int cpu;
0040     int i;
0041 
0042     memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));
0043 
0044     for_each_online_cpu(cpu) {
0045         struct vm_event_state *this = &per_cpu(vm_event_states, cpu);
0046 
0047         for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
0048             ret[i] += this->event[i];
0049     }
0050 }
0051 
0052 /*
0053  * Accumulate the vm event counters across all CPUs.
0054  * The result is unavoidably approximate - it can change
0055  * during and after execution of this function.
0056  */
0057 void all_vm_events(unsigned long *ret)
0058 {
0059     get_online_cpus();
0060     sum_vm_events(ret);
0061     put_online_cpus();
0062 }
0063 EXPORT_SYMBOL_GPL(all_vm_events);
0064 
0065 /*
0066  * Fold the foreign cpu events into our own.
0067  *
0068  * This is adding to the events on one processor
0069  * but keeps the global counts constant.
0070  */
0071 void vm_events_fold_cpu(int cpu)
0072 {
0073     struct vm_event_state *fold_state = &per_cpu(vm_event_states, cpu);
0074     int i;
0075 
0076     for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
0077         count_vm_events(i, fold_state->event[i]);
0078         fold_state->event[i] = 0;
0079     }
0080 }
0081 
0082 #endif /* CONFIG_VM_EVENT_COUNTERS */
0083 
0084 /*
0085  * Manage combined zone based / global counters
0086  *
0087  * vm_stat contains the global counters
0088  */
0089 atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp;
0090 atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS] __cacheline_aligned_in_smp;
0091 EXPORT_SYMBOL(vm_zone_stat);
0092 EXPORT_SYMBOL(vm_node_stat);
0093 
0094 #ifdef CONFIG_SMP
0095 
0096 int calculate_pressure_threshold(struct zone *zone)
0097 {
0098     int threshold;
0099     int watermark_distance;
0100 
0101     /*
0102      * As vmstats are not up to date, there is drift between the estimated
0103      * and real values. For high thresholds and a high number of CPUs, it
0104      * is possible for the min watermark to be breached while the estimated
0105      * value looks fine. The pressure threshold is a reduced value such
0106      * that even the maximum amount of drift will not accidentally breach
0107      * the min watermark
0108      */
0109     watermark_distance = low_wmark_pages(zone) - min_wmark_pages(zone);
0110     threshold = max(1, (int)(watermark_distance / num_online_cpus()));
0111 
0112     /*
0113      * Maximum threshold is 125
0114      */
0115     threshold = min(125, threshold);
0116 
0117     return threshold;
0118 }
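/*
 * Worked example, assuming a zone whose low and min watermarks are 1024
 * pages apart (say 5120 and 4096) on a machine with 16 online CPUs:
 *
 *     watermark_distance = 5120 - 4096 = 1024
 *     threshold = max(1, 1024 / 16) = 64, well under the 125 cap.
 *
 * With 1024 CPUs the same gap would give max(1, 1024 / 1024) = 1, i.e.
 * each CPU may hold at most one page of drift before folding into the
 * global counters.
 */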
0119 
0120 int calculate_normal_threshold(struct zone *zone)
0121 {
0122     int threshold;
0123     int mem;    /* memory in 128 MB units */
0124 
0125     /*
0126      * The threshold scales with the number of processors and the amount
0127      * of memory per zone. More memory means that we can defer updates for
0128      * longer, while more processors could lead to more contention.
0129      * fls() is used as a cheap way of getting logarithmic scaling.
0130      *
0131      * Some sample thresholds:
0132      *
0133      * Threshold    Processors  (fls)   Zonesize    fls(mem+1)
0134      * ------------------------------------------------------------------
0135      * 8        1       1   0.9-1 GB    4
0136      * 16       2       2   0.9-1 GB    4
0137      * 20       2       2   1-2 GB      5
0138      * 24       2       2   2-4 GB      6
0139      * 28       2       2   4-8 GB      7
0140      * 32       2       2   8-16 GB     8
0141      * 4        2       2   <128M       1
0142      * 30       4       3   2-4 GB      5
0143      * 48       4       3   8-16 GB     8
0144      * 32       8       4   1-2 GB      4
0145      * 32       8       4   0.9-1GB     4
0146      * 10       16      5   <128M       1
0147      * 40       16      5   900M        4
0148      * 70       64      7   2-4 GB      5
0149      * 84       64      7   4-8 GB      6
0150      * 108      512     9   4-8 GB      6
0151      * 125      1024        10  8-16 GB     8
0152      * 125      1024        10  16-32 GB    9
0153      */
0154 
0155     mem = zone->managed_pages >> (27 - PAGE_SHIFT);
0156 
0157     threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem));
0158 
0159     /*
0160      * Maximum threshold is 125
0161      */
0162     threshold = min(125, threshold);
0163 
0164     return threshold;
0165 }
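/*
 * Worked example, assuming a zone with 2 GB of managed memory on a machine
 * with 8 online CPUs:
 *
 *     mem = 2 GB >> 27 = 16                    (128 MB units)
 *     threshold = 2 * fls(8) * (1 + fls(16))
 *               = 2 * 4 * (1 + 5) = 48, below the 125 cap.
 *
 * fls(8) = 4 and fls(16) = 5 because fls() returns the position of the
 * most significant set bit, counting from 1.
 */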
0166 
0167 /*
0168  * Refresh the thresholds for each zone.
0169  */
0170 void refresh_zone_stat_thresholds(void)
0171 {
0172     struct pglist_data *pgdat;
0173     struct zone *zone;
0174     int cpu;
0175     int threshold;
0176 
0177     /* Zero current pgdat thresholds */
0178     for_each_online_pgdat(pgdat) {
0179         for_each_online_cpu(cpu) {
0180             per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold = 0;
0181         }
0182     }
0183 
0184     for_each_populated_zone(zone) {
0185         struct pglist_data *pgdat = zone->zone_pgdat;
0186         unsigned long max_drift, tolerate_drift;
0187 
0188         threshold = calculate_normal_threshold(zone);
0189 
0190         for_each_online_cpu(cpu) {
0191             int pgdat_threshold;
0192 
0193             per_cpu_ptr(zone->pageset, cpu)->stat_threshold
0194                             = threshold;
0195 
0196             /* Base nodestat threshold on the largest populated zone. */
0197             pgdat_threshold = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold;
0198             per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold
0199                 = max(threshold, pgdat_threshold);
0200         }
0201 
0202         /*
0203          * Only set percpu_drift_mark if there is a danger that
0204          * NR_FREE_PAGES reports the low watermark is ok when in fact
0205          * the min watermark could be breached by an allocation
0206          */
0207         tolerate_drift = low_wmark_pages(zone) - min_wmark_pages(zone);
0208         max_drift = num_online_cpus() * threshold;
0209         if (max_drift > tolerate_drift)
0210             zone->percpu_drift_mark = high_wmark_pages(zone) +
0211                     max_drift;
0212     }
0213 }
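/*
 * Worked example, assuming 64 online CPUs and a computed threshold of 125:
 * the per-cpu diffs can together hide up to max_drift = 64 * 125 = 8000
 * pages. If the low-to-min watermark gap is smaller than that,
 * percpu_drift_mark is set to high_wmark + 8000, so that watermark checks
 * falling below this mark can fall back to the exact (but more expensive)
 * zone_page_state_snapshot() in the page allocator instead of trusting the
 * possibly stale NR_FREE_PAGES estimate.
 */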
0214 
0215 void set_pgdat_percpu_threshold(pg_data_t *pgdat,
0216                 int (*calculate_pressure)(struct zone *))
0217 {
0218     struct zone *zone;
0219     int cpu;
0220     int threshold;
0221     int i;
0222 
0223     for (i = 0; i < pgdat->nr_zones; i++) {
0224         zone = &pgdat->node_zones[i];
0225         if (!zone->percpu_drift_mark)
0226             continue;
0227 
0228         threshold = (*calculate_pressure)(zone);
0229         for_each_online_cpu(cpu)
0230             per_cpu_ptr(zone->pageset, cpu)->stat_threshold
0231                             = threshold;
0232     }
0233 }
0234 
0235 /*
0236  * For use when we know that interrupts are disabled,
0237  * or when we know that preemption is disabled and that
0238  * particular counter cannot be updated from interrupt context.
0239  */
0240 void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
0241                long delta)
0242 {
0243     struct per_cpu_pageset __percpu *pcp = zone->pageset;
0244     s8 __percpu *p = pcp->vm_stat_diff + item;
0245     long x;
0246     long t;
0247 
0248     x = delta + __this_cpu_read(*p);
0249 
0250     t = __this_cpu_read(pcp->stat_threshold);
0251 
0252     if (unlikely(x > t || x < -t)) {
0253         zone_page_state_add(x, zone, item);
0254         x = 0;
0255     }
0256     __this_cpu_write(*p, x);
0257 }
0258 EXPORT_SYMBOL(__mod_zone_page_state);
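/*
 * Worked example, assuming a stat_threshold of 32 and a per-cpu diff that
 * currently holds 30: a call with delta = 5 computes x = 35, which exceeds
 * the threshold, so 35 is folded into the global zone counter and the
 * per-cpu diff is reset to 0. A call with delta = 1 would instead just
 * store 31 locally and touch no shared cachelines at all.
 */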
0259 
0260 void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
0261                 long delta)
0262 {
0263     struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
0264     s8 __percpu *p = pcp->vm_node_stat_diff + item;
0265     long x;
0266     long t;
0267 
0268     x = delta + __this_cpu_read(*p);
0269 
0270     t = __this_cpu_read(pcp->stat_threshold);
0271 
0272     if (unlikely(x > t || x < -t)) {
0273         node_page_state_add(x, pgdat, item);
0274         x = 0;
0275     }
0276     __this_cpu_write(*p, x);
0277 }
0278 EXPORT_SYMBOL(__mod_node_page_state);
0279 
0280 /*
0281  * Optimized increment and decrement functions.
0282  *
0283  * These are only for a single page and therefore can take a struct page *
0284  * argument instead of struct zone *. This allows the inclusion of the code
0285  * generated for page_zone(page) into the optimized functions.
0286  *
0287  * No overflow check is necessary and therefore the differential can be
0288  * incremented or decremented in place which may allow the compilers to
0289  * generate better code.
0290  * The increment or decrement is known and therefore one boundary check can
0291  * be omitted.
0292  *
0293  * NOTE: These functions are very performance sensitive. Change only
0294  * with care.
0295  *
0296  * Some processors have inc/dec instructions that are atomic vs an interrupt.
0297  * However, the code must first determine the differential location in a zone
0298  * based on the processor number and then inc/dec the counter. There is no
0299  * guarantee without disabling preemption that the processor will not change
0300  * in between and therefore the atomicity vs. interrupt cannot be exploited
0301  * in a useful way here.
0302  */
0303 void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
0304 {
0305     struct per_cpu_pageset __percpu *pcp = zone->pageset;
0306     s8 __percpu *p = pcp->vm_stat_diff + item;
0307     s8 v, t;
0308 
0309     v = __this_cpu_inc_return(*p);
0310     t = __this_cpu_read(pcp->stat_threshold);
0311     if (unlikely(v > t)) {
0312         s8 overstep = t >> 1;
0313 
0314         zone_page_state_add(v + overstep, zone, item);
0315         __this_cpu_write(*p, -overstep);
0316     }
0317 }
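/*
 * Worked example, assuming a stat_threshold of 32: the increment that
 * pushes the per-cpu diff to 33 folds 33 + 16 = 49 into the zone counter
 * (overstep = t >> 1 = 16) and leaves the diff at -16, so the next 48
 * increments can be absorbed locally before the global counter has to be
 * touched again.
 */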
0318 
0319 void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
0320 {
0321     struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
0322     s8 __percpu *p = pcp->vm_node_stat_diff + item;
0323     s8 v, t;
0324 
0325     v = __this_cpu_inc_return(*p);
0326     t = __this_cpu_read(pcp->stat_threshold);
0327     if (unlikely(v > t)) {
0328         s8 overstep = t >> 1;
0329 
0330         node_page_state_add(v + overstep, pgdat, item);
0331         __this_cpu_write(*p, -overstep);
0332     }
0333 }
0334 
0335 void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
0336 {
0337     __inc_zone_state(page_zone(page), item);
0338 }
0339 EXPORT_SYMBOL(__inc_zone_page_state);
0340 
0341 void __inc_node_page_state(struct page *page, enum node_stat_item item)
0342 {
0343     __inc_node_state(page_pgdat(page), item);
0344 }
0345 EXPORT_SYMBOL(__inc_node_page_state);
0346 
0347 void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
0348 {
0349     struct per_cpu_pageset __percpu *pcp = zone->pageset;
0350     s8 __percpu *p = pcp->vm_stat_diff + item;
0351     s8 v, t;
0352 
0353     v = __this_cpu_dec_return(*p);
0354     t = __this_cpu_read(pcp->stat_threshold);
0355     if (unlikely(v < - t)) {
0356         s8 overstep = t >> 1;
0357 
0358         zone_page_state_add(v - overstep, zone, item);
0359         __this_cpu_write(*p, overstep);
0360     }
0361 }
0362 
0363 void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
0364 {
0365     struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
0366     s8 __percpu *p = pcp->vm_node_stat_diff + item;
0367     s8 v, t;
0368 
0369     v = __this_cpu_dec_return(*p);
0370     t = __this_cpu_read(pcp->stat_threshold);
0371     if (unlikely(v < - t)) {
0372         s8 overstep = t >> 1;
0373 
0374         node_page_state_add(v - overstep, pgdat, item);
0375         __this_cpu_write(*p, overstep);
0376     }
0377 }
0378 
0379 void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
0380 {
0381     __dec_zone_state(page_zone(page), item);
0382 }
0383 EXPORT_SYMBOL(__dec_zone_page_state);
0384 
0385 void __dec_node_page_state(struct page *page, enum node_stat_item item)
0386 {
0387     __dec_node_state(page_pgdat(page), item);
0388 }
0389 EXPORT_SYMBOL(__dec_node_page_state);
0390 
0391 #ifdef CONFIG_HAVE_CMPXCHG_LOCAL
0392 /*
0393  * If we have cmpxchg_local support then we do not need to incur the overhead
0394  * that comes with local_irq_save/restore if we use this_cpu_cmpxchg.
0395  *
0396  * mod_state() modifies the zone counter state through atomic per cpu
0397  * operations.
0398  *
0399  * Overstep mode specifies how overstep should be handled:
0400  *     0       No overstepping
0401  *     1       Overstepping half of threshold
0402  *     -1      Overstepping minus half of threshold
0403  */
0404 static inline void mod_zone_state(struct zone *zone,
0405        enum zone_stat_item item, long delta, int overstep_mode)
0406 {
0407     struct per_cpu_pageset __percpu *pcp = zone->pageset;
0408     s8 __percpu *p = pcp->vm_stat_diff + item;
0409     long o, n, t, z;
0410 
0411     do {
0412         z = 0;  /* overflow to zone counters */
0413 
0414         /*
0415          * The fetching of the stat_threshold is racy. We may apply
0416          * a counter threshold to the wrong cpu if we get
0417          * rescheduled while executing here. However, the next
0418          * counter update will apply the threshold again and
0419          * therefore bring the counter under the threshold again.
0420          *
0421          * Most of the time the thresholds are the same anyway
0422          * for all cpus in a zone.
0423          */
0424         t = this_cpu_read(pcp->stat_threshold);
0425 
0426         o = this_cpu_read(*p);
0427         n = delta + o;
0428 
0429         if (n > t || n < -t) {
0430             int os = overstep_mode * (t >> 1);
0431 
0432             /* Overflow must be added to zone counters */
0433             z = n + os;
0434             n = -os;
0435         }
0436     } while (this_cpu_cmpxchg(*p, o, n) != o);
0437 
0438     if (z)
0439         zone_page_state_add(z, zone, item);
0440 }
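/*
 * A minimal userspace sketch of the same lock-free pattern, using C11
 * atomics instead of this_cpu_cmpxchg(). Overstep handling is omitted
 * (this corresponds to overstep_mode == 0), and the names global_count,
 * local_diff and THRESHOLD_SKETCH are illustrative assumptions only, not
 * kernel symbols.
 */
#include <stdatomic.h>

#define THRESHOLD_SKETCH 32

static _Atomic long global_count;        /* stands in for the zone counter */
static _Atomic signed char local_diff;   /* stands in for the per-cpu diff */

static void mod_counter_sketch(long delta)
{
    signed char old, new;
    long spill;

    do {
        old = atomic_load(&local_diff);
        long sum = delta + old;

        spill = 0;
        new = (signed char)sum;
        if (sum > THRESHOLD_SKETCH || sum < -THRESHOLD_SKETCH) {
            /* The overflow is pushed to the shared counter */
            spill = sum;
            new = 0;
        }
        /* Retry if the diff changed between the load and here */
    } while (!atomic_compare_exchange_weak(&local_diff, &old, new));

    if (spill)
        atomic_fetch_add(&global_count, spill);
}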
0441 
0442 void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
0443              long delta)
0444 {
0445     mod_zone_state(zone, item, delta, 0);
0446 }
0447 EXPORT_SYMBOL(mod_zone_page_state);
0448 
0449 void inc_zone_page_state(struct page *page, enum zone_stat_item item)
0450 {
0451     mod_zone_state(page_zone(page), item, 1, 1);
0452 }
0453 EXPORT_SYMBOL(inc_zone_page_state);
0454 
0455 void dec_zone_page_state(struct page *page, enum zone_stat_item item)
0456 {
0457     mod_zone_state(page_zone(page), item, -1, -1);
0458 }
0459 EXPORT_SYMBOL(dec_zone_page_state);
0460 
0461 static inline void mod_node_state(struct pglist_data *pgdat,
0462        enum node_stat_item item, int delta, int overstep_mode)
0463 {
0464     struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
0465     s8 __percpu *p = pcp->vm_node_stat_diff + item;
0466     long o, n, t, z;
0467 
0468     do {
0469         z = 0;  /* overflow to node counters */
0470 
0471         /*
0472          * The fetching of the stat_threshold is racy. We may apply
0473          * a counter threshold to the wrong cpu if we get
0474          * rescheduled while executing here. However, the next
0475          * counter update will apply the threshold again and
0476          * therefore bring the counter under the threshold again.
0477          *
0478          * Most of the time the thresholds are the same anyway
0479          * for all cpus in a node.
0480          */
0481         t = this_cpu_read(pcp->stat_threshold);
0482 
0483         o = this_cpu_read(*p);
0484         n = delta + o;
0485 
0486         if (n > t || n < -t) {
0487             int os = overstep_mode * (t >> 1);
0488 
0489             /* Overflow must be added to node counters */
0490             z = n + os;
0491             n = -os;
0492         }
0493     } while (this_cpu_cmpxchg(*p, o, n) != o);
0494 
0495     if (z)
0496         node_page_state_add(z, pgdat, item);
0497 }
0498 
0499 void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
0500                     long delta)
0501 {
0502     mod_node_state(pgdat, item, delta, 0);
0503 }
0504 EXPORT_SYMBOL(mod_node_page_state);
0505 
0506 void inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
0507 {
0508     mod_node_state(pgdat, item, 1, 1);
0509 }
0510 
0511 void inc_node_page_state(struct page *page, enum node_stat_item item)
0512 {
0513     mod_node_state(page_pgdat(page), item, 1, 1);
0514 }
0515 EXPORT_SYMBOL(inc_node_page_state);
0516 
0517 void dec_node_page_state(struct page *page, enum node_stat_item item)
0518 {
0519     mod_node_state(page_pgdat(page), item, -1, -1);
0520 }
0521 EXPORT_SYMBOL(dec_node_page_state);
0522 #else
0523 /*
0524  * Use interrupt disable to serialize counter updates
0525  */
0526 void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
0527              long delta)
0528 {
0529     unsigned long flags;
0530 
0531     local_irq_save(flags);
0532     __mod_zone_page_state(zone, item, delta);
0533     local_irq_restore(flags);
0534 }
0535 EXPORT_SYMBOL(mod_zone_page_state);
0536 
0537 void inc_zone_page_state(struct page *page, enum zone_stat_item item)
0538 {
0539     unsigned long flags;
0540     struct zone *zone;
0541 
0542     zone = page_zone(page);
0543     local_irq_save(flags);
0544     __inc_zone_state(zone, item);
0545     local_irq_restore(flags);
0546 }
0547 EXPORT_SYMBOL(inc_zone_page_state);
0548 
0549 void dec_zone_page_state(struct page *page, enum zone_stat_item item)
0550 {
0551     unsigned long flags;
0552 
0553     local_irq_save(flags);
0554     __dec_zone_page_state(page, item);
0555     local_irq_restore(flags);
0556 }
0557 EXPORT_SYMBOL(dec_zone_page_state);
0558 
0559 void inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
0560 {
0561     unsigned long flags;
0562 
0563     local_irq_save(flags);
0564     __inc_node_state(pgdat, item);
0565     local_irq_restore(flags);
0566 }
0567 EXPORT_SYMBOL(inc_node_state);
0568 
0569 void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
0570                     long delta)
0571 {
0572     unsigned long flags;
0573 
0574     local_irq_save(flags);
0575     __mod_node_page_state(pgdat, item, delta);
0576     local_irq_restore(flags);
0577 }
0578 EXPORT_SYMBOL(mod_node_page_state);
0579 
0580 void inc_node_page_state(struct page *page, enum node_stat_item item)
0581 {
0582     unsigned long flags;
0583     struct pglist_data *pgdat;
0584 
0585     pgdat = page_pgdat(page);
0586     local_irq_save(flags);
0587     __inc_node_state(pgdat, item);
0588     local_irq_restore(flags);
0589 }
0590 EXPORT_SYMBOL(inc_node_page_state);
0591 
0592 void dec_node_page_state(struct page *page, enum node_stat_item item)
0593 {
0594     unsigned long flags;
0595 
0596     local_irq_save(flags);
0597     __dec_node_page_state(page, item);
0598     local_irq_restore(flags);
0599 }
0600 EXPORT_SYMBOL(dec_node_page_state);
0601 #endif
0602 
0603 /*
0604  * Fold a differential into the global counters.
0605  * Returns the number of counters updated.
0606  */
0607 static int fold_diff(int *zone_diff, int *node_diff)
0608 {
0609     int i;
0610     int changes = 0;
0611 
0612     for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
0613         if (zone_diff[i]) {
0614             atomic_long_add(zone_diff[i], &vm_zone_stat[i]);
0615             changes++;
0616     }
0617 
0618     for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
0619         if (node_diff[i]) {
0620             atomic_long_add(node_diff[i], &vm_node_stat[i]);
0621             changes++;
0622     }
0623     return changes;
0624 }
0625 
0626 /*
0627  * Update the zone counters for the current cpu.
0628  *
0629  * Note that refresh_cpu_vm_stats strives to only access
0630  * node local memory. The per cpu pagesets on remote zones are placed
0631  * in the memory local to the processor using that pageset. So the
0632  * loop over all zones will access a series of cachelines local to
0633  * the processor.
0634  *
0635  * The call to zone_page_state_add updates the cachelines with the
0636  * statistics in the remote zone struct as well as the global cachelines
0637  * with the global counters. These could cause remote node cache line
0638  * bouncing and should therefore only be done when necessary.
0639  *
0640  * The function returns the number of global counters updated.
0641  */
0642 static int refresh_cpu_vm_stats(bool do_pagesets)
0643 {
0644     struct pglist_data *pgdat;
0645     struct zone *zone;
0646     int i;
0647     int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
0648     int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };
0649     int changes = 0;
0650 
0651     for_each_populated_zone(zone) {
0652         struct per_cpu_pageset __percpu *p = zone->pageset;
0653 
0654         for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
0655             int v;
0656 
0657             v = this_cpu_xchg(p->vm_stat_diff[i], 0);
0658             if (v) {
0659 
0660                 atomic_long_add(v, &zone->vm_stat[i]);
0661                 global_zone_diff[i] += v;
0662 #ifdef CONFIG_NUMA
0663                 /* 3 seconds idle till flush */
0664                 __this_cpu_write(p->expire, 3);
0665 #endif
0666             }
0667         }
0668 #ifdef CONFIG_NUMA
0669         if (do_pagesets) {
0670             cond_resched();
0671             /*
0672              * Deal with draining the remote pageset of this
0673              * processor
0674              *
0675              * Check if there are pages remaining in this pageset;
0676              * if not, then there is nothing to expire.
0677              */
0678             if (!__this_cpu_read(p->expire) ||
0679                    !__this_cpu_read(p->pcp.count))
0680                 continue;
0681 
0682             /*
0683              * We never drain zones local to this processor.
0684              */
0685             if (zone_to_nid(zone) == numa_node_id()) {
0686                 __this_cpu_write(p->expire, 0);
0687                 continue;
0688             }
0689 
0690             if (__this_cpu_dec_return(p->expire))
0691                 continue;
0692 
0693             if (__this_cpu_read(p->pcp.count)) {
0694                 drain_zone_pages(zone, this_cpu_ptr(&p->pcp));
0695                 changes++;
0696             }
0697         }
0698 #endif
0699     }
0700 
0701     for_each_online_pgdat(pgdat) {
0702         struct per_cpu_nodestat __percpu *p = pgdat->per_cpu_nodestats;
0703 
0704         for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
0705             int v;
0706 
0707             v = this_cpu_xchg(p->vm_node_stat_diff[i], 0);
0708             if (v) {
0709                 atomic_long_add(v, &pgdat->vm_stat[i]);
0710                 global_node_diff[i] += v;
0711             }
0712         }
0713     }
0714 
0715     changes += fold_diff(global_zone_diff, global_node_diff);
0716     return changes;
0717 }
0718 
0719 /*
0720  * Fold the data for an offline cpu into the global array.
0721  * There cannot be any access by the offline cpu and therefore
0722  * synchronization is simplified.
0723  */
0724 void cpu_vm_stats_fold(int cpu)
0725 {
0726     struct pglist_data *pgdat;
0727     struct zone *zone;
0728     int i;
0729     int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
0730     int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };
0731 
0732     for_each_populated_zone(zone) {
0733         struct per_cpu_pageset *p;
0734 
0735         p = per_cpu_ptr(zone->pageset, cpu);
0736 
0737         for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
0738             if (p->vm_stat_diff[i]) {
0739                 int v;
0740 
0741                 v = p->vm_stat_diff[i];
0742                 p->vm_stat_diff[i] = 0;
0743                 atomic_long_add(v, &zone->vm_stat[i]);
0744                 global_zone_diff[i] += v;
0745             }
0746     }
0747 
0748     for_each_online_pgdat(pgdat) {
0749         struct per_cpu_nodestat *p;
0750 
0751         p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu);
0752 
0753         for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
0754             if (p->vm_node_stat_diff[i]) {
0755                 int v;
0756 
0757                 v = p->vm_node_stat_diff[i];
0758                 p->vm_node_stat_diff[i] = 0;
0759                 atomic_long_add(v, &pgdat->vm_stat[i]);
0760                 global_node_diff[i] += v;
0761             }
0762     }
0763 
0764     fold_diff(global_zone_diff, global_node_diff);
0765 }
0766 
0767 /*
0768  * This is only called if !populated_zone(zone), which implies no other users of
0769  * pset->vm_stat_diff[] exist.
0770  */
0771 void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset)
0772 {
0773     int i;
0774 
0775     for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
0776         if (pset->vm_stat_diff[i]) {
0777             int v = pset->vm_stat_diff[i];
0778             pset->vm_stat_diff[i] = 0;
0779             atomic_long_add(v, &zone->vm_stat[i]);
0780             atomic_long_add(v, &vm_zone_stat[i]);
0781         }
0782 }
0783 #endif
0784 
0785 #ifdef CONFIG_NUMA
0786 /*
0787  * Determine the per node value of a stat item. This function
0788  * is called frequently in a NUMA machine, so try to be as
0789  * frugal as possible.
0790  */
0791 unsigned long sum_zone_node_page_state(int node,
0792                  enum zone_stat_item item)
0793 {
0794     struct zone *zones = NODE_DATA(node)->node_zones;
0795     int i;
0796     unsigned long count = 0;
0797 
0798     for (i = 0; i < MAX_NR_ZONES; i++)
0799         count += zone_page_state(zones + i, item);
0800 
0801     return count;
0802 }
0803 
0804 /*
0805  * Determine the per node value of a stat item.
0806  */
0807 unsigned long node_page_state(struct pglist_data *pgdat,
0808                 enum node_stat_item item)
0809 {
0810     long x = atomic_long_read(&pgdat->vm_stat[item]);
0811 #ifdef CONFIG_SMP
0812     if (x < 0)
0813         x = 0;
0814 #endif
0815     return x;
0816 }
0817 #endif
0818 
0819 #ifdef CONFIG_COMPACTION
0820 
0821 struct contig_page_info {
0822     unsigned long free_pages;
0823     unsigned long free_blocks_total;
0824     unsigned long free_blocks_suitable;
0825 };
0826 
0827 /*
0828  * Calculate the number of free pages in a zone, how many contiguous
0829  * pages are free and how many are large enough to satisfy an allocation of
0830  * the target size. Note that this function makes no attempt to estimate
0831  * how many suitable free blocks there *might* be if MOVABLE pages were
0832  * migrated. Calculating that is possible, but expensive and can be
0833  * figured out from userspace
0834  */
0835 static void fill_contig_page_info(struct zone *zone,
0836                 unsigned int suitable_order,
0837                 struct contig_page_info *info)
0838 {
0839     unsigned int order;
0840 
0841     info->free_pages = 0;
0842     info->free_blocks_total = 0;
0843     info->free_blocks_suitable = 0;
0844 
0845     for (order = 0; order < MAX_ORDER; order++) {
0846         unsigned long blocks;
0847 
0848         /* Count number of free blocks */
0849         blocks = zone->free_area[order].nr_free;
0850         info->free_blocks_total += blocks;
0851 
0852         /* Count free base pages */
0853         info->free_pages += blocks << order;
0854 
0855         /* Count the suitable free blocks */
0856         if (order >= suitable_order)
0857             info->free_blocks_suitable += blocks <<
0858                         (order - suitable_order);
0859     }
0860 }
0861 
0862 /*
0863  * A fragmentation index only makes sense if an allocation of a requested
0864  * size would fail. If that is true, the fragmentation index indicates
0865  * whether external fragmentation or a lack of memory was the problem.
0866  * The value can be used to determine if page reclaim or compaction
0867  * should be used
0868  */
0869 static int __fragmentation_index(unsigned int order, struct contig_page_info *info)
0870 {
0871     unsigned long requested = 1UL << order;
0872 
0873     if (!info->free_blocks_total)
0874         return 0;
0875 
0876     /* Fragmentation index only makes sense when a request would fail */
0877     if (info->free_blocks_suitable)
0878         return -1000;
0879 
0880     /*
0881      * Index is between 0 and 1 so return within 3 decimal places
0882      *
0883      * 0 => allocation would fail due to lack of memory
0884      * 1 => allocation would fail due to fragmentation
0885      */
0886     return 1000 - div_u64((1000 + div_u64(info->free_pages * 1000ULL, requested)), info->free_blocks_total);
0887 }
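/*
 * Worked example: assume an order-4 request (16 pages) against a zone
 * holding 100 free pages that are all split into order-0 blocks, so
 * free_blocks_total = 100 and free_blocks_suitable = 0 (the request would
 * fail). Then
 *
 *     index = 1000 - (1000 + (100 * 1000) / 16) / 100
 *           = 1000 - 7250 / 100 = 1000 - 72 = 928    (integer division)
 *
 * i.e. 0.928: memory is plentiful but badly fragmented, so compaction
 * rather than reclaim is the appropriate response.
 */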
0888 
0889 /* Same as __fragmentation_index but allocates contig_page_info on stack */
0890 int fragmentation_index(struct zone *zone, unsigned int order)
0891 {
0892     struct contig_page_info info;
0893 
0894     fill_contig_page_info(zone, order, &info);
0895     return __fragmentation_index(order, &info);
0896 }
0897 #endif
0898 
0899 #if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) || defined(CONFIG_NUMA)
0900 #ifdef CONFIG_ZONE_DMA
0901 #define TEXT_FOR_DMA(xx) xx "_dma",
0902 #else
0903 #define TEXT_FOR_DMA(xx)
0904 #endif
0905 
0906 #ifdef CONFIG_ZONE_DMA32
0907 #define TEXT_FOR_DMA32(xx) xx "_dma32",
0908 #else
0909 #define TEXT_FOR_DMA32(xx)
0910 #endif
0911 
0912 #ifdef CONFIG_HIGHMEM
0913 #define TEXT_FOR_HIGHMEM(xx) xx "_high",
0914 #else
0915 #define TEXT_FOR_HIGHMEM(xx)
0916 #endif
0917 
0918 #define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \
0919                     TEXT_FOR_HIGHMEM(xx) xx "_movable",
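/*
 * For illustration: with CONFIG_ZONE_DMA and CONFIG_ZONE_DMA32 enabled and
 * CONFIG_HIGHMEM disabled, TEXTS_FOR_ZONES("pgalloc") expands to
 * "pgalloc_dma", "pgalloc_dma32", "pgalloc_normal", "pgalloc_movable",
 * matching the zone order used by the per-zone event counters below.
 */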
0920 
0921 const char * const vmstat_text[] = {
0922     /* enum zone_stat_item counters */
0923     "nr_free_pages",
0924     "nr_zone_inactive_anon",
0925     "nr_zone_active_anon",
0926     "nr_zone_inactive_file",
0927     "nr_zone_active_file",
0928     "nr_zone_unevictable",
0929     "nr_zone_write_pending",
0930     "nr_mlock",
0931     "nr_slab_reclaimable",
0932     "nr_slab_unreclaimable",
0933     "nr_page_table_pages",
0934     "nr_kernel_stack",
0935     "nr_bounce",
0936 #if IS_ENABLED(CONFIG_ZSMALLOC)
0937     "nr_zspages",
0938 #endif
0939 #ifdef CONFIG_NUMA
0940     "numa_hit",
0941     "numa_miss",
0942     "numa_foreign",
0943     "numa_interleave",
0944     "numa_local",
0945     "numa_other",
0946 #endif
0947     "nr_free_cma",
0948 
0949     /* Node-based counters */
0950     "nr_inactive_anon",
0951     "nr_active_anon",
0952     "nr_inactive_file",
0953     "nr_active_file",
0954     "nr_unevictable",
0955     "nr_isolated_anon",
0956     "nr_isolated_file",
0957     "nr_pages_scanned",
0958     "workingset_refault",
0959     "workingset_activate",
0960     "workingset_nodereclaim",
0961     "nr_anon_pages",
0962     "nr_mapped",
0963     "nr_file_pages",
0964     "nr_dirty",
0965     "nr_writeback",
0966     "nr_writeback_temp",
0967     "nr_shmem",
0968     "nr_shmem_hugepages",
0969     "nr_shmem_pmdmapped",
0970     "nr_anon_transparent_hugepages",
0971     "nr_unstable",
0972     "nr_vmscan_write",
0973     "nr_vmscan_immediate_reclaim",
0974     "nr_dirtied",
0975     "nr_written",
0976 
0977     /* enum writeback_stat_item counters */
0978     "nr_dirty_threshold",
0979     "nr_dirty_background_threshold",
0980 
0981 #ifdef CONFIG_VM_EVENT_COUNTERS
0982     /* enum vm_event_item counters */
0983     "pgpgin",
0984     "pgpgout",
0985     "pswpin",
0986     "pswpout",
0987 
0988     TEXTS_FOR_ZONES("pgalloc")
0989     TEXTS_FOR_ZONES("allocstall")
0990     TEXTS_FOR_ZONES("pgskip")
0991 
0992     "pgfree",
0993     "pgactivate",
0994     "pgdeactivate",
0995 
0996     "pgfault",
0997     "pgmajfault",
0998     "pglazyfreed",
0999 
1000     "pgrefill",
1001     "pgsteal_kswapd",
1002     "pgsteal_direct",
1003     "pgscan_kswapd",
1004     "pgscan_direct",
1005     "pgscan_direct_throttle",
1006 
1007 #ifdef CONFIG_NUMA
1008     "zone_reclaim_failed",
1009 #endif
1010     "pginodesteal",
1011     "slabs_scanned",
1012     "kswapd_inodesteal",
1013     "kswapd_low_wmark_hit_quickly",
1014     "kswapd_high_wmark_hit_quickly",
1015     "pageoutrun",
1016 
1017     "pgrotated",
1018 
1019     "drop_pagecache",
1020     "drop_slab",
1021 
1022 #ifdef CONFIG_NUMA_BALANCING
1023     "numa_pte_updates",
1024     "numa_huge_pte_updates",
1025     "numa_hint_faults",
1026     "numa_hint_faults_local",
1027     "numa_pages_migrated",
1028 #endif
1029 #ifdef CONFIG_MIGRATION
1030     "pgmigrate_success",
1031     "pgmigrate_fail",
1032 #endif
1033 #ifdef CONFIG_COMPACTION
1034     "compact_migrate_scanned",
1035     "compact_free_scanned",
1036     "compact_isolated",
1037     "compact_stall",
1038     "compact_fail",
1039     "compact_success",
1040     "compact_daemon_wake",
1041 #endif
1042 
1043 #ifdef CONFIG_HUGETLB_PAGE
1044     "htlb_buddy_alloc_success",
1045     "htlb_buddy_alloc_fail",
1046 #endif
1047     "unevictable_pgs_culled",
1048     "unevictable_pgs_scanned",
1049     "unevictable_pgs_rescued",
1050     "unevictable_pgs_mlocked",
1051     "unevictable_pgs_munlocked",
1052     "unevictable_pgs_cleared",
1053     "unevictable_pgs_stranded",
1054 
1055 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
1056     "thp_fault_alloc",
1057     "thp_fault_fallback",
1058     "thp_collapse_alloc",
1059     "thp_collapse_alloc_failed",
1060     "thp_file_alloc",
1061     "thp_file_mapped",
1062     "thp_split_page",
1063     "thp_split_page_failed",
1064     "thp_deferred_split_page",
1065     "thp_split_pmd",
1066     "thp_zero_page_alloc",
1067     "thp_zero_page_alloc_failed",
1068 #endif
1069 #ifdef CONFIG_MEMORY_BALLOON
1070     "balloon_inflate",
1071     "balloon_deflate",
1072 #ifdef CONFIG_BALLOON_COMPACTION
1073     "balloon_migrate",
1074 #endif
1075 #endif /* CONFIG_MEMORY_BALLOON */
1076 #ifdef CONFIG_DEBUG_TLBFLUSH
1077 #ifdef CONFIG_SMP
1078     "nr_tlb_remote_flush",
1079     "nr_tlb_remote_flush_received",
1080 #endif /* CONFIG_SMP */
1081     "nr_tlb_local_flush_all",
1082     "nr_tlb_local_flush_one",
1083 #endif /* CONFIG_DEBUG_TLBFLUSH */
1084 
1085 #ifdef CONFIG_DEBUG_VM_VMACACHE
1086     "vmacache_find_calls",
1087     "vmacache_find_hits",
1088     "vmacache_full_flushes",
1089 #endif
1090 #endif /* CONFIG_VM_EVENT_COUNTERS */
1091 };
1092 #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */
1093 
1094 
1095 #if (defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)) || \
1096      defined(CONFIG_PROC_FS)
1097 static void *frag_start(struct seq_file *m, loff_t *pos)
1098 {
1099     pg_data_t *pgdat;
1100     loff_t node = *pos;
1101 
1102     for (pgdat = first_online_pgdat();
1103          pgdat && node;
1104          pgdat = next_online_pgdat(pgdat))
1105         --node;
1106 
1107     return pgdat;
1108 }
1109 
1110 static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
1111 {
1112     pg_data_t *pgdat = (pg_data_t *)arg;
1113 
1114     (*pos)++;
1115     return next_online_pgdat(pgdat);
1116 }
1117 
1118 static void frag_stop(struct seq_file *m, void *arg)
1119 {
1120 }
1121 
1122 /* Walk all the zones in a node and print using a callback */
1123 static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
1124         void (*print)(struct seq_file *m, pg_data_t *, struct zone *))
1125 {
1126     struct zone *zone;
1127     struct zone *node_zones = pgdat->node_zones;
1128     unsigned long flags;
1129 
1130     for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
1131         if (!populated_zone(zone))
1132             continue;
1133 
1134         spin_lock_irqsave(&zone->lock, flags);
1135         print(m, pgdat, zone);
1136         spin_unlock_irqrestore(&zone->lock, flags);
1137     }
1138 }
1139 #endif
1140 
1141 #ifdef CONFIG_PROC_FS
1142 static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
1143                         struct zone *zone)
1144 {
1145     int order;
1146 
1147     seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
1148     for (order = 0; order < MAX_ORDER; ++order)
1149         seq_printf(m, "%6lu ", zone->free_area[order].nr_free);
1150     seq_putc(m, '\n');
1151 }
1152 
1153 /*
1154  * This walks the free areas for each zone.
1155  */
1156 static int frag_show(struct seq_file *m, void *arg)
1157 {
1158     pg_data_t *pgdat = (pg_data_t *)arg;
1159     walk_zones_in_node(m, pgdat, frag_show_print);
1160     return 0;
1161 }
1162 
1163 static void pagetypeinfo_showfree_print(struct seq_file *m,
1164                     pg_data_t *pgdat, struct zone *zone)
1165 {
1166     int order, mtype;
1167 
1168     for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) {
1169         seq_printf(m, "Node %4d, zone %8s, type %12s ",
1170                     pgdat->node_id,
1171                     zone->name,
1172                     migratetype_names[mtype]);
1173         for (order = 0; order < MAX_ORDER; ++order) {
1174             unsigned long freecount = 0;
1175             struct free_area *area;
1176             struct list_head *curr;
1177 
1178             area = &(zone->free_area[order]);
1179 
1180             list_for_each(curr, &area->free_list[mtype])
1181                 freecount++;
1182             seq_printf(m, "%6lu ", freecount);
1183         }
1184         seq_putc(m, '\n');
1185     }
1186 }
1187 
1188 /* Print out the free pages at each order for each migratetype */
1189 static int pagetypeinfo_showfree(struct seq_file *m, void *arg)
1190 {
1191     int order;
1192     pg_data_t *pgdat = (pg_data_t *)arg;
1193 
1194     /* Print header */
1195     seq_printf(m, "%-43s ", "Free pages count per migrate type at order");
1196     for (order = 0; order < MAX_ORDER; ++order)
1197         seq_printf(m, "%6d ", order);
1198     seq_putc(m, '\n');
1199 
1200     walk_zones_in_node(m, pgdat, pagetypeinfo_showfree_print);
1201 
1202     return 0;
1203 }
1204 
1205 static void pagetypeinfo_showblockcount_print(struct seq_file *m,
1206                     pg_data_t *pgdat, struct zone *zone)
1207 {
1208     int mtype;
1209     unsigned long pfn;
1210     unsigned long start_pfn = zone->zone_start_pfn;
1211     unsigned long end_pfn = zone_end_pfn(zone);
1212     unsigned long count[MIGRATE_TYPES] = { 0, };
1213 
1214     for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
1215         struct page *page;
1216 
1217         if (!pfn_valid(pfn))
1218             continue;
1219 
1220         page = pfn_to_page(pfn);
1221 
1222         /* Watch for unexpected holes punched in the memmap */
1223         if (!memmap_valid_within(pfn, page, zone))
1224             continue;
1225 
1226         if (page_zone(page) != zone)
1227             continue;
1228 
1229         mtype = get_pageblock_migratetype(page);
1230 
1231         if (mtype < MIGRATE_TYPES)
1232             count[mtype]++;
1233     }
1234 
1235     /* Print counts */
1236     seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
1237     for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
1238         seq_printf(m, "%12lu ", count[mtype]);
1239     seq_putc(m, '\n');
1240 }
1241 
1242 /* Print out the number of pageblocks for each migratetype */
1243 static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg)
1244 {
1245     int mtype;
1246     pg_data_t *pgdat = (pg_data_t *)arg;
1247 
1248     seq_printf(m, "\n%-23s", "Number of blocks type ");
1249     for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
1250         seq_printf(m, "%12s ", migratetype_names[mtype]);
1251     seq_putc(m, '\n');
1252     walk_zones_in_node(m, pgdat, pagetypeinfo_showblockcount_print);
1253 
1254     return 0;
1255 }
1256 
1257 /*
1258  * Print out the number of pageblocks for each migratetype that contain pages
1259  * of other types. This gives an indication of how well fallbacks are being
1260  * contained by rmqueue_fallback(). It requires information from PAGE_OWNER
1261  * to determine what is going on
1262  */
1263 static void pagetypeinfo_showmixedcount(struct seq_file *m, pg_data_t *pgdat)
1264 {
1265 #ifdef CONFIG_PAGE_OWNER
1266     int mtype;
1267 
1268     if (!static_branch_unlikely(&page_owner_inited))
1269         return;
1270 
1271     drain_all_pages(NULL);
1272 
1273     seq_printf(m, "\n%-23s", "Number of mixed blocks ");
1274     for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
1275         seq_printf(m, "%12s ", migratetype_names[mtype]);
1276     seq_putc(m, '\n');
1277 
1278     walk_zones_in_node(m, pgdat, pagetypeinfo_showmixedcount_print);
1279 #endif /* CONFIG_PAGE_OWNER */
1280 }
1281 
1282 /*
1283  * This prints out statistics in relation to grouping pages by mobility.
1284  * It is expensive to collect so do not constantly read the file.
1285  */
1286 static int pagetypeinfo_show(struct seq_file *m, void *arg)
1287 {
1288     pg_data_t *pgdat = (pg_data_t *)arg;
1289 
1290     /* check memoryless node */
1291     if (!node_state(pgdat->node_id, N_MEMORY))
1292         return 0;
1293 
1294     seq_printf(m, "Page block order: %d\n", pageblock_order);
1295     seq_printf(m, "Pages per block:  %lu\n", pageblock_nr_pages);
1296     seq_putc(m, '\n');
1297     pagetypeinfo_showfree(m, pgdat);
1298     pagetypeinfo_showblockcount(m, pgdat);
1299     pagetypeinfo_showmixedcount(m, pgdat);
1300 
1301     return 0;
1302 }
1303 
1304 static const struct seq_operations fragmentation_op = {
1305     .start  = frag_start,
1306     .next   = frag_next,
1307     .stop   = frag_stop,
1308     .show   = frag_show,
1309 };
1310 
1311 static int fragmentation_open(struct inode *inode, struct file *file)
1312 {
1313     return seq_open(file, &fragmentation_op);
1314 }
1315 
1316 static const struct file_operations fragmentation_file_operations = {
1317     .open       = fragmentation_open,
1318     .read       = seq_read,
1319     .llseek     = seq_lseek,
1320     .release    = seq_release,
1321 };
1322 
1323 static const struct seq_operations pagetypeinfo_op = {
1324     .start  = frag_start,
1325     .next   = frag_next,
1326     .stop   = frag_stop,
1327     .show   = pagetypeinfo_show,
1328 };
1329 
1330 static int pagetypeinfo_open(struct inode *inode, struct file *file)
1331 {
1332     return seq_open(file, &pagetypeinfo_op);
1333 }
1334 
1335 static const struct file_operations pagetypeinfo_file_ops = {
1336     .open       = pagetypeinfo_open,
1337     .read       = seq_read,
1338     .llseek     = seq_lseek,
1339     .release    = seq_release,
1340 };
1341 
1342 static bool is_zone_first_populated(pg_data_t *pgdat, struct zone *zone)
1343 {
1344     int zid;
1345 
1346     for (zid = 0; zid < MAX_NR_ZONES; zid++) {
1347         struct zone *compare = &pgdat->node_zones[zid];
1348 
1349         if (populated_zone(compare))
1350             return zone == compare;
1351     }
1352 
1353     /* The zone must be somewhere! */
1354     WARN_ON_ONCE(1);
1355     return false;
1356 }
1357 
1358 static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
1359                             struct zone *zone)
1360 {
1361     int i;
1362     seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
1363     if (is_zone_first_populated(pgdat, zone)) {
1364         seq_printf(m, "\n  per-node stats");
1365         for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
1366             seq_printf(m, "\n      %-12s %lu",
1367                 vmstat_text[i + NR_VM_ZONE_STAT_ITEMS],
1368                 node_page_state(pgdat, i));
1369         }
1370     }
1371     seq_printf(m,
1372            "\n  pages free     %lu"
1373            "\n        min      %lu"
1374            "\n        low      %lu"
1375            "\n        high     %lu"
1376            "\n   node_scanned  %lu"
1377            "\n        spanned  %lu"
1378            "\n        present  %lu"
1379            "\n        managed  %lu",
1380            zone_page_state(zone, NR_FREE_PAGES),
1381            min_wmark_pages(zone),
1382            low_wmark_pages(zone),
1383            high_wmark_pages(zone),
1384            node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED),
1385            zone->spanned_pages,
1386            zone->present_pages,
1387            zone->managed_pages);
1388 
1389     for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
1390         seq_printf(m, "\n      %-12s %lu", vmstat_text[i],
1391                 zone_page_state(zone, i));
1392 
1393     seq_printf(m,
1394            "\n        protection: (%ld",
1395            zone->lowmem_reserve[0]);
1396     for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
1397         seq_printf(m, ", %ld", zone->lowmem_reserve[i]);
1398     seq_printf(m,
1399            ")"
1400            "\n  pagesets");
1401     for_each_online_cpu(i) {
1402         struct per_cpu_pageset *pageset;
1403 
1404         pageset = per_cpu_ptr(zone->pageset, i);
1405         seq_printf(m,
1406                "\n    cpu: %i"
1407                "\n              count: %i"
1408                "\n              high:  %i"
1409                "\n              batch: %i",
1410                i,
1411                pageset->pcp.count,
1412                pageset->pcp.high,
1413                pageset->pcp.batch);
1414 #ifdef CONFIG_SMP
1415         seq_printf(m, "\n  vm stats threshold: %d",
1416                 pageset->stat_threshold);
1417 #endif
1418     }
1419     seq_printf(m,
1420            "\n  node_unreclaimable:  %u"
1421            "\n  start_pfn:           %lu"
1422            "\n  node_inactive_ratio: %u",
1423            !pgdat_reclaimable(zone->zone_pgdat),
1424            zone->zone_start_pfn,
1425            zone->zone_pgdat->inactive_ratio);
1426     seq_putc(m, '\n');
1427 }
1428 
1429 /*
1430  * Output information about zones in @pgdat.
1431  */
1432 static int zoneinfo_show(struct seq_file *m, void *arg)
1433 {
1434     pg_data_t *pgdat = (pg_data_t *)arg;
1435     walk_zones_in_node(m, pgdat, zoneinfo_show_print);
1436     return 0;
1437 }
1438 
1439 static const struct seq_operations zoneinfo_op = {
1440     .start  = frag_start, /* iterate over all zones. The same as in
1441                    * fragmentation. */
1442     .next   = frag_next,
1443     .stop   = frag_stop,
1444     .show   = zoneinfo_show,
1445 };
1446 
1447 static int zoneinfo_open(struct inode *inode, struct file *file)
1448 {
1449     return seq_open(file, &zoneinfo_op);
1450 }
1451 
1452 static const struct file_operations proc_zoneinfo_file_operations = {
1453     .open       = zoneinfo_open,
1454     .read       = seq_read,
1455     .llseek     = seq_lseek,
1456     .release    = seq_release,
1457 };
1458 
1459 enum writeback_stat_item {
1460     NR_DIRTY_THRESHOLD,
1461     NR_DIRTY_BG_THRESHOLD,
1462     NR_VM_WRITEBACK_STAT_ITEMS,
1463 };
1464 
1465 static void *vmstat_start(struct seq_file *m, loff_t *pos)
1466 {
1467     unsigned long *v;
1468     int i, stat_items_size;
1469 
1470     if (*pos >= ARRAY_SIZE(vmstat_text))
1471         return NULL;
1472     stat_items_size = NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) +
1473               NR_VM_NODE_STAT_ITEMS * sizeof(unsigned long) +
1474               NR_VM_WRITEBACK_STAT_ITEMS * sizeof(unsigned long);
1475 
1476 #ifdef CONFIG_VM_EVENT_COUNTERS
1477     stat_items_size += sizeof(struct vm_event_state);
1478 #endif
1479 
1480     v = kmalloc(stat_items_size, GFP_KERNEL);
1481     m->private = v;
1482     if (!v)
1483         return ERR_PTR(-ENOMEM);
1484     for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
1485         v[i] = global_page_state(i);
1486     v += NR_VM_ZONE_STAT_ITEMS;
1487 
1488     for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
1489         v[i] = global_node_page_state(i);
1490     v += NR_VM_NODE_STAT_ITEMS;
1491 
1492     global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD,
1493                 v + NR_DIRTY_THRESHOLD);
1494     v += NR_VM_WRITEBACK_STAT_ITEMS;
1495 
1496 #ifdef CONFIG_VM_EVENT_COUNTERS
1497     all_vm_events(v);
1498     v[PGPGIN] /= 2;     /* sectors -> kbytes */
1499     v[PGPGOUT] /= 2;
1500 #endif
1501     return (unsigned long *)m->private + *pos;
1502 }
1503 
1504 static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
1505 {
1506     (*pos)++;
1507     if (*pos >= ARRAY_SIZE(vmstat_text))
1508         return NULL;
1509     return (unsigned long *)m->private + *pos;
1510 }
1511 
1512 static int vmstat_show(struct seq_file *m, void *arg)
1513 {
1514     unsigned long *l = arg;
1515     unsigned long off = l - (unsigned long *)m->private;
1516 
1517     seq_puts(m, vmstat_text[off]);
1518     seq_put_decimal_ull(m, " ", *l);
1519     seq_putc(m, '\n');
1520     return 0;
1521 }
1522 
1523 static void vmstat_stop(struct seq_file *m, void *arg)
1524 {
1525     kfree(m->private);
1526     m->private = NULL;
1527 }
1528 
1529 static const struct seq_operations vmstat_op = {
1530     .start  = vmstat_start,
1531     .next   = vmstat_next,
1532     .stop   = vmstat_stop,
1533     .show   = vmstat_show,
1534 };
1535 
1536 static int vmstat_open(struct inode *inode, struct file *file)
1537 {
1538     return seq_open(file, &vmstat_op);
1539 }
1540 
1541 static const struct file_operations proc_vmstat_file_operations = {
1542     .open       = vmstat_open,
1543     .read       = seq_read,
1544     .llseek     = seq_lseek,
1545     .release    = seq_release,
1546 };
1547 #endif /* CONFIG_PROC_FS */
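/*
 * A small self-contained userspace sketch showing how the /proc/vmstat
 * format produced by vmstat_show() above (one "name value" pair per line)
 * can be consumed. This is an illustration, not kernel code.
 */
#include <stdio.h>

int main(void)
{
    char name[128];
    unsigned long long value;
    FILE *f = fopen("/proc/vmstat", "r");

    if (!f) {
        perror("/proc/vmstat");
        return 1;
    }

    while (fscanf(f, "%127s %llu", name, &value) == 2)
        printf("%-32s %llu\n", name, value);

    fclose(f);
    return 0;
}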
1548 
1549 #ifdef CONFIG_SMP
1550 static struct workqueue_struct *vmstat_wq;
1551 static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
1552 int sysctl_stat_interval __read_mostly = HZ;
1553 
1554 #ifdef CONFIG_PROC_FS
1555 static void refresh_vm_stats(struct work_struct *work)
1556 {
1557     refresh_cpu_vm_stats(true);
1558 }
1559 
1560 int vmstat_refresh(struct ctl_table *table, int write,
1561            void __user *buffer, size_t *lenp, loff_t *ppos)
1562 {
1563     long val;
1564     int err;
1565     int i;
1566 
1567     /*
1568      * The regular update, every sysctl_stat_interval, may come later
1569      * than expected, leaving a significant amount in per_cpu buckets.
1570      * This is particularly misleading when checking a quantity of HUGE
1571      * pages, immediately after running a test.  /proc/sys/vm/stat_refresh,
1572      * which can equally be echo'ed to or cat'ted from (by root),
1573      * can be used to update the stats just before reading them.
1574      *
1575      * Oh, and since global_page_state() etc. are so careful to hide
1576      * transiently negative values, report an error here if any of
1577      * the stats is negative, so we know to go looking for imbalance.
1578      */
1579     err = schedule_on_each_cpu(refresh_vm_stats);
1580     if (err)
1581         return err;
1582     for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
1583         val = atomic_long_read(&vm_zone_stat[i]);
1584         if (val < 0) {
1585             switch (i) {
1586             case NR_PAGES_SCANNED:
1587                 /*
1588                  * This is often seen to go negative in
1589                  * recent kernels, but not to go permanently
1590                  * negative.  Whilst it would be nicer not to
1591                  * have exceptions, rooting them out would be
1592                  * another task, of rather low priority.
1593                  */
1594                 break;
1595             default:
1596                 pr_warn("%s: %s %ld\n",
1597                     __func__, vmstat_text[i], val);
1598                 err = -EINVAL;
1599                 break;
1600             }
1601         }
1602     }
1603     if (err)
1604         return err;
1605     if (write)
1606         *ppos += *lenp;
1607     else
1608         *lenp = 0;
1609     return 0;
1610 }
1611 #endif /* CONFIG_PROC_FS */
1612 
1613 static void vmstat_update(struct work_struct *w)
1614 {
1615     if (refresh_cpu_vm_stats(true)) {
1616         /*
1617          * Counters were updated so we expect more updates
1618          * to occur in the future. Keep on running the
1619          * update worker thread.
1620          */
1621         queue_delayed_work_on(smp_processor_id(), vmstat_wq,
1622                 this_cpu_ptr(&vmstat_work),
1623                 round_jiffies_relative(sysctl_stat_interval));
1624     }
1625 }
1626 
1627 /*
1628  * Switch off vmstat processing and then fold all the remaining differentials
1629  * until the diffs stay at zero. The function is used by NOHZ and can only be
1630  * invoked when tick processing is not active.
1631  */
1632 /*
1633  * Check if the diffs for a certain cpu indicate that
1634  * an update is needed.
1635  */
1636 static bool need_update(int cpu)
1637 {
1638     struct zone *zone;
1639 
1640     for_each_populated_zone(zone) {
1641         struct per_cpu_pageset *p = per_cpu_ptr(zone->pageset, cpu);
1642 
1643         BUILD_BUG_ON(sizeof(p->vm_stat_diff[0]) != 1);
1644         /*
1645          * The fast way of checking if there are any vmstat diffs.
1646          * This works because the diffs are byte sized items.
1647          */
1648         if (memchr_inv(p->vm_stat_diff, 0, NR_VM_ZONE_STAT_ITEMS))
1649             return true;
1650 
1651     }
1652     return false;
1653 }
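/*
 * Note: memchr_inv(buf, c, n) returns NULL when all n bytes of buf equal c,
 * and a pointer to the first differing byte otherwise. Since each
 * vm_stat_diff entry is one byte (enforced by the BUILD_BUG_ON above), a
 * single call checks every zone counter for this cpu without an explicit
 * per-item loop.
 */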
1654 
1655 /*
1656  * Switch off vmstat processing and then fold all the remaining differentials
1657  * until the diffs stay at zero. The function is used by NOHZ and can only be
1658  * invoked when tick processing is not active.
1659  */
1660 void quiet_vmstat(void)
1661 {
1662     if (system_state != SYSTEM_RUNNING)
1663         return;
1664 
1665     if (!delayed_work_pending(this_cpu_ptr(&vmstat_work)))
1666         return;
1667 
1668     if (!need_update(smp_processor_id()))
1669         return;
1670 
1671     /*
1672      * Just refresh counters and do not care about the pending delayed
1673      * vmstat_update. It does not fire often enough to matter and canceling
1674      * it would be too expensive from this path.
1675      * vmstat_shepherd will take care of that for us.
1676      */
1677     refresh_cpu_vm_stats(false);
1678 }
1679 
1680 /*
1681  * Shepherd worker thread that checks the
1682  * differentials of processors whose vmstat worker
1683  * threads have been disabled because of
1684  * inactivity.
1685  */
1686 static void vmstat_shepherd(struct work_struct *w);
1687 
1688 static DECLARE_DEFERRABLE_WORK(shepherd, vmstat_shepherd);
1689 
1690 static void vmstat_shepherd(struct work_struct *w)
1691 {
1692     int cpu;
1693 
1694     get_online_cpus();
1695     /* Check processors whose vmstat worker threads have been disabled */
1696     for_each_online_cpu(cpu) {
1697         struct delayed_work *dw = &per_cpu(vmstat_work, cpu);
1698 
1699         if (!delayed_work_pending(dw) && need_update(cpu))
1700             queue_delayed_work_on(cpu, vmstat_wq, dw, 0);
1701     }
1702     put_online_cpus();
1703 
1704     schedule_delayed_work(&shepherd,
1705         round_jiffies_relative(sysctl_stat_interval));
1706 }
1707 
1708 static void __init start_shepherd_timer(void)
1709 {
1710     int cpu;
1711 
1712     for_each_possible_cpu(cpu)
1713         INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu),
1714             vmstat_update);
1715 
1716     vmstat_wq = alloc_workqueue("vmstat", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0);
1717     schedule_delayed_work(&shepherd,
1718         round_jiffies_relative(sysctl_stat_interval));
1719 }
1720 
1721 static void __init init_cpu_node_state(void)
1722 {
1723     int node;
1724 
1725     for_each_online_node(node) {
1726         if (cpumask_weight(cpumask_of_node(node)) > 0)
1727             node_set_state(node, N_CPU);
1728     }
1729 }
1730 
1731 static int vmstat_cpu_online(unsigned int cpu)
1732 {
1733     refresh_zone_stat_thresholds();
1734     node_set_state(cpu_to_node(cpu), N_CPU);
1735     return 0;
1736 }
1737 
1738 static int vmstat_cpu_down_prep(unsigned int cpu)
1739 {
1740     cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu));
1741     return 0;
1742 }
1743 
1744 static int vmstat_cpu_dead(unsigned int cpu)
1745 {
1746     const struct cpumask *node_cpus;
1747     int node;
1748 
1749     node = cpu_to_node(cpu);
1750 
1751     refresh_zone_stat_thresholds();
1752     node_cpus = cpumask_of_node(node);
1753     if (cpumask_weight(node_cpus) > 0)
1754         return 0;
1755 
1756     node_clear_state(node, N_CPU);
1757     return 0;
1758 }
1759 
1760 #endif
1761 
1762 static int __init setup_vmstat(void)
1763 {
1764 #ifdef CONFIG_SMP
1765     int ret;
1766 
1767     ret = cpuhp_setup_state_nocalls(CPUHP_MM_VMSTAT_DEAD, "mm/vmstat:dead",
1768                     NULL, vmstat_cpu_dead);
1769     if (ret < 0)
1770         pr_err("vmstat: failed to register 'dead' hotplug state\n");
1771 
1772     ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "mm/vmstat:online",
1773                     vmstat_cpu_online,
1774                     vmstat_cpu_down_prep);
1775     if (ret < 0)
1776         pr_err("vmstat: failed to register 'online' hotplug state\n");
1777 
1778     get_online_cpus();
1779     init_cpu_node_state();
1780     put_online_cpus();
1781 
1782     start_shepherd_timer();
1783 #endif
1784 #ifdef CONFIG_PROC_FS
1785     proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations);
1786     proc_create("pagetypeinfo", S_IRUGO, NULL, &pagetypeinfo_file_ops);
1787     proc_create("vmstat", S_IRUGO, NULL, &proc_vmstat_file_operations);
1788     proc_create("zoneinfo", S_IRUGO, NULL, &proc_zoneinfo_file_operations);
1789 #endif
1790     return 0;
1791 }
1792 module_init(setup_vmstat)
1793 
1794 #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)
1795 
1796 /*
1797  * Return an index indicating how much of the available free memory is
1798  * unusable for an allocation of the requested size.
1799  */
1800 static int unusable_free_index(unsigned int order,
1801                 struct contig_page_info *info)
1802 {
1803     /* No free memory is interpreted as all free memory is unusable */
1804     if (info->free_pages == 0)
1805         return 1000;
1806 
1807     /*
1808      * Index should be a value between 0 and 1. Return a value to 3
1809      * decimal places.
1810      *
1811      * 0 => no fragmentation
1812      * 1 => high fragmentation
1813      */
1814     return div_u64((info->free_pages - (info->free_blocks_suitable << order)) * 1000ULL, info->free_pages);
1815 
1816 }
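/*
 * Worked example: assume an order-2 request (4 pages) against a zone with
 * 100 free pages, of which 40 sit in blocks of order 2 or larger (i.e. 10
 * suitable order-2 chunks). Then
 *
 *     index = (100 - (10 << 2)) * 1000 / 100 = 600
 *
 * i.e. 0.600: 60% of the free memory cannot serve this request until it is
 * coalesced or compacted into larger blocks.
 */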
1817 
1818 static void unusable_show_print(struct seq_file *m,
1819                     pg_data_t *pgdat, struct zone *zone)
1820 {
1821     unsigned int order;
1822     int index;
1823     struct contig_page_info info;
1824 
1825     seq_printf(m, "Node %d, zone %8s ",
1826                 pgdat->node_id,
1827                 zone->name);
1828     for (order = 0; order < MAX_ORDER; ++order) {
1829         fill_contig_page_info(zone, order, &info);
1830         index = unusable_free_index(order, &info);
1831         seq_printf(m, "%d.%03d ", index / 1000, index % 1000);
1832     }
1833 
1834     seq_putc(m, '\n');
1835 }
1836 
1837 /*
1838  * Display unusable free space index
1839  *
1840  * The unusable free space index measures how much of the available free
1841  * memory cannot be used to satisfy an allocation of a given size and is a
1842  * value between 0 and 1. The higher the value, the more of free memory is
1843  * unusable and by implication, the worse the external fragmentation is. This
1844  * can be expressed as a percentage by multiplying by 100.
1845  */
1846 static int unusable_show(struct seq_file *m, void *arg)
1847 {
1848     pg_data_t *pgdat = (pg_data_t *)arg;
1849 
1850     /* check memoryless node */
1851     if (!node_state(pgdat->node_id, N_MEMORY))
1852         return 0;
1853 
1854     walk_zones_in_node(m, pgdat, unusable_show_print);
1855 
1856     return 0;
1857 }
1858 
1859 static const struct seq_operations unusable_op = {
1860     .start  = frag_start,
1861     .next   = frag_next,
1862     .stop   = frag_stop,
1863     .show   = unusable_show,
1864 };
1865 
1866 static int unusable_open(struct inode *inode, struct file *file)
1867 {
1868     return seq_open(file, &unusable_op);
1869 }
1870 
1871 static const struct file_operations unusable_file_ops = {
1872     .open       = unusable_open,
1873     .read       = seq_read,
1874     .llseek     = seq_lseek,
1875     .release    = seq_release,
1876 };
1877 
1878 static void extfrag_show_print(struct seq_file *m,
1879                     pg_data_t *pgdat, struct zone *zone)
1880 {
1881     unsigned int order;
1882     int index;
1883 
1884     /* Alloc on stack as interrupts are disabled for zone walk */
1885     struct contig_page_info info;
1886 
1887     seq_printf(m, "Node %d, zone %8s ",
1888                 pgdat->node_id,
1889                 zone->name);
1890     for (order = 0; order < MAX_ORDER; ++order) {
1891         fill_contig_page_info(zone, order, &info);
1892         index = __fragmentation_index(order, &info);
1893         seq_printf(m, "%d.%03d ", index / 1000, index % 1000);
1894     }
1895 
1896     seq_putc(m, '\n');
1897 }
1898 
1899 /*
1900  * Display fragmentation index for orders that allocations would fail for
1901  */
1902 static int extfrag_show(struct seq_file *m, void *arg)
1903 {
1904     pg_data_t *pgdat = (pg_data_t *)arg;
1905 
1906     walk_zones_in_node(m, pgdat, extfrag_show_print);
1907 
1908     return 0;
1909 }
1910 
1911 static const struct seq_operations extfrag_op = {
1912     .start  = frag_start,
1913     .next   = frag_next,
1914     .stop   = frag_stop,
1915     .show   = extfrag_show,
1916 };
1917 
1918 static int extfrag_open(struct inode *inode, struct file *file)
1919 {
1920     return seq_open(file, &extfrag_op);
1921 }
1922 
1923 static const struct file_operations extfrag_file_ops = {
1924     .open       = extfrag_open,
1925     .read       = seq_read,
1926     .llseek     = seq_lseek,
1927     .release    = seq_release,
1928 };
1929 
1930 static int __init extfrag_debug_init(void)
1931 {
1932     struct dentry *extfrag_debug_root;
1933 
1934     extfrag_debug_root = debugfs_create_dir("extfrag", NULL);
1935     if (!extfrag_debug_root)
1936         return -ENOMEM;
1937 
1938     if (!debugfs_create_file("unusable_index", 0444,
1939             extfrag_debug_root, NULL, &unusable_file_ops))
1940         goto fail;
1941 
1942     if (!debugfs_create_file("extfrag_index", 0444,
1943             extfrag_debug_root, NULL, &extfrag_file_ops))
1944         goto fail;
1945 
1946     return 0;
1947 fail:
1948     debugfs_remove_recursive(extfrag_debug_root);
1949     return -ENOMEM;
1950 }
1951 
1952 module_init(extfrag_debug_init);
1953 #endif