/*
 * Linux VM pressure
 *
 * Copyright 2012 Linaro Ltd.
 *        Anton Vorontsov <anton.vorontsov@linaro.org>
 *
 * Based on ideas from Andrew Morton, David Rientjes, KOSAKI Motohiro,
 * Leonid Moiseichuk, Mel Gorman, Minchan Kim and Pekka Enberg.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation.
 */

#include <linux/cgroup.h>
#include <linux/fs.h>
#include <linux/log2.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/vmstat.h>
#include <linux/eventfd.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/printk.h>
#include <linux/vmpressure.h>

/*
 * The window size (vmpressure_win) is the number of scanned pages before
 * we try to analyze the scanned/reclaimed ratio. So the window is used as
 * a rate-limit tunable for the "low" level notification, and also for
 * averaging the ratio for the medium/critical levels. A small window
 * size can cause a lot of false positives, but a window that is too big
 * will delay the notifications.
 *
 * As the vmscan reclaimer logic works with chunks which are multiples of
 * SWAP_CLUSTER_MAX, it makes sense to use it for the window size as well.
 *
 * TODO: Make the window size depend on machine size, as we do for vmstat
 * thresholds. Currently we set it to 512 pages (2MB for 4KB pages).
 */
static const unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 16;
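
/*
 * For illustration only (assuming SWAP_CLUSTER_MAX has its usual value
 * of 32): vmpressure_win = 32 * 16 = 512 pages, i.e. 2 MiB with 4 KiB
 * pages. At least that many pages must be scanned before a level is
 * computed and a notification can fire.
 */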

/*
 * These thresholds are used when we account memory pressure through
 * scanned/reclaimed ratio. The current values were chosen empirically.
 * In essence, they are percents: the higher the value, the more
 * unsuccessful reclaims there were.
 */
static const unsigned int vmpressure_level_med = 60;
static const unsigned int vmpressure_level_critical = 95;

/*
 * When there are too few pages left to scan, vmpressure() may miss the
 * critical pressure as the number of pages will be less than the
 * "window size". However, in that case the vmscan priority will rise
 * fast as the reclaimer will try to scan the LRUs more deeply.
 *
 * The vmscan logic considers these special priorities:
 *
 * prio == DEF_PRIORITY (12): reclaimer starts with that value
 * prio <= DEF_PRIORITY - 2 : kswapd becomes somewhat overwhelmed
 * prio == 0                : close to OOM, kernel scans every page in an lru
 *
 * Any value in this range is acceptable for this tunable (i.e. from 12 to
 * 0). The current value for vmpressure_level_critical_prio was chosen
 * empirically, but the number, in essence, means that we consider
 * critical level when scanning depth is ~10% of the lru size (vmscan
 * scans 'lru_size >> prio' pages, so it is actually 12.5%, or one
 * eighth).
 */
static const unsigned int vmpressure_level_critical_prio = ilog2(100 / 10);
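
/*
 * Worked out, for illustration: ilog2(100 / 10) = ilog2(10) = 3, so
 * priorities 3..0 count as critical. At prio == 3, vmscan scans
 * lru_size >> 3 pages per pass, i.e. 1/8 (12.5%) of the LRU, which is
 * the "~10%" scanning depth mentioned above.
 */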

static struct vmpressure *work_to_vmpressure(struct work_struct *work)
{
    return container_of(work, struct vmpressure, work);
}

static struct vmpressure *vmpressure_parent(struct vmpressure *vmpr)
{
    struct cgroup_subsys_state *css = vmpressure_to_css(vmpr);
    struct mem_cgroup *memcg = mem_cgroup_from_css(css);

    memcg = parent_mem_cgroup(memcg);
    if (!memcg)
        return NULL;
    return memcg_to_vmpressure(memcg);
}

enum vmpressure_levels {
    VMPRESSURE_LOW = 0,
    VMPRESSURE_MEDIUM,
    VMPRESSURE_CRITICAL,
    VMPRESSURE_NUM_LEVELS,
};

static const char * const vmpressure_str_levels[] = {
    [VMPRESSURE_LOW] = "low",
    [VMPRESSURE_MEDIUM] = "medium",
    [VMPRESSURE_CRITICAL] = "critical",
};

static enum vmpressure_levels vmpressure_level(unsigned long pressure)
{
    if (pressure >= vmpressure_level_critical)
        return VMPRESSURE_CRITICAL;
    else if (pressure >= vmpressure_level_med)
        return VMPRESSURE_MEDIUM;
    return VMPRESSURE_LOW;
}

static enum vmpressure_levels vmpressure_calc_level(unsigned long scanned,
                            unsigned long reclaimed)
{
    unsigned long scale = scanned + reclaimed;
    unsigned long pressure;

    /*
     * We calculate the ratio (in percents) of how many pages were
     * scanned vs. reclaimed in a given time frame (window). Note that
     * time is in VM reclaimer's "ticks", i.e. number of pages
     * scanned. This makes it possible to set desired reaction time
     * and serves as a ratelimit.
     */
    pressure = scale - (reclaimed * scale / scanned);
    pressure = pressure * 100 / scale;

    pr_debug("%s: %3lu  (s: %lu  r: %lu)\n", __func__, pressure,
         scanned, reclaimed);

    return vmpressure_level(pressure);
}
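
/*
 * A sketch of the arithmetic above with made-up numbers: for
 * scanned = 400 and reclaimed = 100, scale = 500, so
 *
 *     pressure = 500 - (100 * 500 / 400) = 375
 *     pressure = 375 * 100 / 500        = 75
 *
 * which, up to integer rounding, is 100 * (scanned - reclaimed) / scanned;
 * 75 then maps to VMPRESSURE_MEDIUM (60 <= 75 < 95).
 */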

struct vmpressure_event {
    struct eventfd_ctx *efd;
    enum vmpressure_levels level;
    struct list_head node;
};

static bool vmpressure_event(struct vmpressure *vmpr,
                 enum vmpressure_levels level)
{
    struct vmpressure_event *ev;
    bool signalled = false;

    mutex_lock(&vmpr->events_lock);

    list_for_each_entry(ev, &vmpr->events, node) {
        if (level >= ev->level) {
            eventfd_signal(ev->efd, 1);
            signalled = true;
        }
    }

    mutex_unlock(&vmpr->events_lock);

    return signalled;
}

static void vmpressure_work_fn(struct work_struct *work)
{
    struct vmpressure *vmpr = work_to_vmpressure(work);
    unsigned long scanned;
    unsigned long reclaimed;
    enum vmpressure_levels level;

    spin_lock(&vmpr->sr_lock);
    /*
     * Several contexts might be calling vmpressure(), so it is
     * possible that the work was rescheduled again before the old
     * work context cleared the counters. In that case we will run
     * just after the old work returns, and scanned might be zero
     * here. We hold sr_lock so that the counters are read and
     * cleared consistently.
     */
    scanned = vmpr->tree_scanned;
    if (!scanned) {
        spin_unlock(&vmpr->sr_lock);
        return;
    }

    reclaimed = vmpr->tree_reclaimed;
    vmpr->tree_scanned = 0;
    vmpr->tree_reclaimed = 0;
    spin_unlock(&vmpr->sr_lock);

    level = vmpressure_calc_level(scanned, reclaimed);

    do {
        if (vmpressure_event(vmpr, level))
            break;
        /*
         * If not handled, propagate the event upward into the
         * hierarchy.
         */
    } while ((vmpr = vmpressure_parent(vmpr)));
}
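
/*
 * To illustrate the loop above with a hypothetical hierarchy a/b/c:
 * an event computed for c that no listener on c handles is retried on
 * b, then on a, and the walk stops at the first memcg that has an
 * eventfd registered at (or below) the computed level.
 */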

/**
 * vmpressure() - Account memory pressure through scanned/reclaimed ratio
 * @gfp:    reclaimer's gfp mask
 * @memcg:  cgroup memory controller handle
 * @tree:   legacy subtree mode
 * @scanned:    number of pages scanned
 * @reclaimed:  number of pages reclaimed
 *
 * This function should be called from the vmscan reclaim path to account
 * "instantaneous" memory pressure (scanned/reclaimed ratio). The raw
 * pressure index is then further refined and averaged over time.
 *
 * If @tree is set, vmpressure is in traditional userspace reporting
 * mode: @memcg is considered the pressure root and userspace is
 * notified of the entire subtree's reclaim efficiency.
 *
 * If @tree is not set, reclaim efficiency is recorded for @memcg, and
 * only in-kernel users are notified.
 *
 * This function does not return any value.
 */
void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
        unsigned long scanned, unsigned long reclaimed)
{
    struct vmpressure *vmpr = memcg_to_vmpressure(memcg);

    /*
     * Here we only want to account pressure that userland is able to
     * help us with. For example, suppose that DMA zone is under
     * pressure; if we notify userland about that kind of pressure,
     * then it will be mostly a waste as it will trigger unnecessary
     * freeing of memory by userland (since userland is more likely to
     * have HIGHMEM/MOVABLE pages instead of the DMA fallback). That
     * is why we include only movable, highmem and FS/IO pages.
     * Indirect reclaim (kswapd) sets sc->gfp_mask to GFP_KERNEL, so
     * we account it too.
     */
    if (!(gfp & (__GFP_HIGHMEM | __GFP_MOVABLE | __GFP_IO | __GFP_FS)))
        return;
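
    /*
     * Note, for illustration: GFP_KERNEL includes __GFP_IO and
     * __GFP_FS, so it passes the check above and kswapd's reclaim
     * (which uses GFP_KERNEL) is accounted.
     */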

    /*
     * If we got here with no pages scanned, then that is an indicator
     * that the reclaimer was unable to find any shrinkable LRUs at the
     * current scanning depth. But it does not mean that we should
     * report the critical pressure, yet. If the scanning priority
     * (scanning depth) goes too high (deep), we will be notified
     * through vmpressure_prio(). But so far, keep calm.
     */
    if (!scanned)
        return;

    if (tree) {
        spin_lock(&vmpr->sr_lock);
        scanned = vmpr->tree_scanned += scanned;
        vmpr->tree_reclaimed += reclaimed;
        spin_unlock(&vmpr->sr_lock);

        if (scanned < vmpressure_win)
            return;
        schedule_work(&vmpr->work);
    } else {
        enum vmpressure_levels level;

        /* For now, no users for root-level efficiency */
        if (!memcg || memcg == root_mem_cgroup)
            return;

        spin_lock(&vmpr->sr_lock);
        scanned = vmpr->scanned += scanned;
        reclaimed = vmpr->reclaimed += reclaimed;
        if (scanned < vmpressure_win) {
            spin_unlock(&vmpr->sr_lock);
            return;
        }
        vmpr->scanned = vmpr->reclaimed = 0;
        spin_unlock(&vmpr->sr_lock);

        level = vmpressure_calc_level(scanned, reclaimed);

        if (level > VMPRESSURE_LOW) {
            /*
             * Let the socket buffer allocator know that
             * we are having trouble reclaiming LRU pages.
             *
             * For hysteresis keep the pressure state
             * asserted for a second in which subsequent
             * pressure events can occur.
             */
            memcg->socket_pressure = jiffies + HZ;
        }
    }
}
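
/*
 * A made-up trace, for illustration: suppose reclaim repeatedly calls
 * vmpressure(GFP_KERNEL, memcg, true, 32, 8). The tree counters then
 * accumulate until tree_scanned reaches 512 (vmpressure_win) after 16
 * calls, with tree_reclaimed == 128; the scheduled work computes
 * 100 * (512 - 128) / 512 = 75, i.e. "medium", and walks the
 * hierarchy signalling any eventfd armed at "low" or "medium".
 */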

/**
 * vmpressure_prio() - Account memory pressure through reclaimer priority level
 * @gfp:    reclaimer's gfp mask
 * @memcg:  cgroup memory controller handle
 * @prio:   reclaimer's priority
 *
 * This function should be called from the reclaim path every time the
 * vmscan reclaiming priority (scanning depth) changes.
 *
 * This function does not return any value.
 */
void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio)
{
    /*
     * We only use prio for accounting critical level. For more info
     * see comment for vmpressure_level_critical_prio variable above.
     */
    if (prio > vmpressure_level_critical_prio)
        return;

    /*
     * OK, the prio is below the threshold; update the vmpressure
     * information before the reclaimer dives into a long run of
     * deep scanning. Passing scanned = vmpressure_win and
     * reclaimed = 0 to vmpressure() basically means that we signal
     * the 'critical' level: taken on its own, such a sample computes
     * to 100 * (vmpressure_win - 0) / vmpressure_win = 100, which is
     * above vmpressure_level_critical (95).
     */
    vmpressure(gfp, memcg, true, vmpressure_win, 0);
}

/**
 * vmpressure_register_event() - Bind vmpressure notifications to an eventfd
 * @memcg:  memcg that is interested in vmpressure notifications
 * @eventfd:    eventfd context to link notifications with
 * @args:   event arguments (used to set up a pressure level threshold)
 *
 * This function associates eventfd context with the vmpressure
 * infrastructure, so that the notifications will be delivered to the
 * @eventfd. The @args parameter is a string that denotes pressure level
 * threshold (one of vmpressure_str_levels, i.e. "low", "medium", or
 * "critical").
 *
 * To be used as memcg event method.
 */
int vmpressure_register_event(struct mem_cgroup *memcg,
                  struct eventfd_ctx *eventfd, const char *args)
{
    struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
    struct vmpressure_event *ev;
    int level;

    for (level = 0; level < VMPRESSURE_NUM_LEVELS; level++) {
        if (!strcmp(vmpressure_str_levels[level], args))
            break;
    }

    if (level >= VMPRESSURE_NUM_LEVELS)
        return -EINVAL;

    ev = kzalloc(sizeof(*ev), GFP_KERNEL);
    if (!ev)
        return -ENOMEM;

    ev->efd = eventfd;
    ev->level = level;

    mutex_lock(&vmpr->events_lock);
    list_add(&ev->node, &vmpr->events);
    mutex_unlock(&vmpr->events_lock);

    return 0;
}
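
/*
 * Illustrative userspace sketch (not part of this file; the paths
 * assume a cgroup-v1 memcg mounted at /sys/fs/cgroup/memory/foo).
 * The event is armed by writing "<event_fd> <pressure_fd> <level>"
 * to cgroup.event_control, which ends up calling the function above:
 *
 *     int efd = eventfd(0, 0);
 *     int pfd = open("/sys/fs/cgroup/memory/foo/memory.pressure_level",
 *                    O_RDONLY);
 *     int cfd = open("/sys/fs/cgroup/memory/foo/cgroup.event_control",
 *                    O_WRONLY);
 *     char buf[32];
 *     int len = snprintf(buf, sizeof(buf), "%d %d low", efd, pfd);
 *     uint64_t cnt;
 *
 *     write(cfd, buf, len);
 *     read(efd, &cnt, sizeof(cnt));  (blocks until pressure >= "low")
 */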

/**
 * vmpressure_unregister_event() - Unbind eventfd from vmpressure
 * @memcg:  memcg handle
 * @eventfd:    eventfd context that was used to link vmpressure with the @memcg
 *
 * This function does internal manipulations to detach the @eventfd from
 * the vmpressure notifications, and then frees internal resources
 * associated with the @eventfd (but the @eventfd itself is not freed).
 *
 * To be used as memcg event method.
 */
void vmpressure_unregister_event(struct mem_cgroup *memcg,
                 struct eventfd_ctx *eventfd)
{
    struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
    struct vmpressure_event *ev;

    mutex_lock(&vmpr->events_lock);
    list_for_each_entry(ev, &vmpr->events, node) {
        if (ev->efd != eventfd)
            continue;
        list_del(&ev->node);
        kfree(ev);
        break;
    }
    mutex_unlock(&vmpr->events_lock);
}

/**
 * vmpressure_init() - Initialize vmpressure control structure
 * @vmpr:   Structure to be initialized
 *
 * This function should be called on every allocated vmpressure structure
 * before any usage.
 */
void vmpressure_init(struct vmpressure *vmpr)
{
    spin_lock_init(&vmpr->sr_lock);
    mutex_init(&vmpr->events_lock);
    INIT_LIST_HEAD(&vmpr->events);
    INIT_WORK(&vmpr->work, vmpressure_work_fn);
}

/**
 * vmpressure_cleanup() - Shut down vmpressure control structure
 * @vmpr:   Structure to be cleaned up
 *
 * This function should be called before the structure in which it is
 * embedded is cleaned up.
 */
void vmpressure_cleanup(struct vmpressure *vmpr)
{
    /*
     * Make sure there is no pending work before the eventfd
     * infrastructure goes away.
     */
    flush_work(&vmpr->work);
}