0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012 #include <linux/cgroup.h>
0013 #include <linux/fs.h>
0014 #include <linux/log2.h>
0015 #include <linux/sched.h>
0016 #include <linux/mm.h>
0017 #include <linux/vmstat.h>
0018 #include <linux/eventfd.h>
0019 #include <linux/slab.h>
0020 #include <linux/swap.h>
0021 #include <linux/printk.h>
0022 #include <linux/vmpressure.h>
0023
0024
0025
0026
0027
0028
0029
0030
0031
0032
0033
0034
0035
0036
0037
0038 static const unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 16;
0039
0040
0041
0042
0043
0044
0045
0046 static const unsigned int vmpressure_level_med = 60;
0047 static const unsigned int vmpressure_level_critical = 95;
0048
0049
0050
0051
0052
0053
0054
0055
0056
0057
0058
0059
0060
0061
0062
0063
0064
0065
0066
0067
0068 static const unsigned int vmpressure_level_critical_prio = ilog2(100 / 10);
0069
0070 static struct vmpressure *work_to_vmpressure(struct work_struct *work)
0071 {
0072 return container_of(work, struct vmpressure, work);
0073 }
0074
0075 static struct vmpressure *vmpressure_parent(struct vmpressure *vmpr)
0076 {
0077 struct mem_cgroup *memcg = vmpressure_to_memcg(vmpr);
0078
0079 memcg = parent_mem_cgroup(memcg);
0080 if (!memcg)
0081 return NULL;
0082 return memcg_to_vmpressure(memcg);
0083 }
0084
/*
 * Pressure levels, ordered by increasing severity. The values start at 0
 * and are used as indices into vmpressure_str_levels[].
 */
enum vmpressure_levels {
	VMPRESSURE_LOW,			/* 0: below the medium threshold */
	VMPRESSURE_MEDIUM,		/* 1 */
	VMPRESSURE_CRITICAL,		/* 2 */
	VMPRESSURE_NUM_LEVELS,		/* number of levels, not a level */
};
0091
/*
 * Listener propagation modes. The values start at 0 and are used as
 * indices into vmpressure_str_modes[].
 */
enum vmpressure_modes {
	VMPRESSURE_NO_PASSTHROUGH,	/* 0: used when no mode is given */
	VMPRESSURE_HIERARCHY,		/* 1 */
	VMPRESSURE_LOCAL,		/* 2 */
	VMPRESSURE_NUM_MODES,		/* number of modes, not a mode */
};
0098
0099 static const char * const vmpressure_str_levels[] = {
0100 [VMPRESSURE_LOW] = "low",
0101 [VMPRESSURE_MEDIUM] = "medium",
0102 [VMPRESSURE_CRITICAL] = "critical",
0103 };
0104
0105 static const char * const vmpressure_str_modes[] = {
0106 [VMPRESSURE_NO_PASSTHROUGH] = "default",
0107 [VMPRESSURE_HIERARCHY] = "hierarchy",
0108 [VMPRESSURE_LOCAL] = "local",
0109 };
0110
0111 static enum vmpressure_levels vmpressure_level(unsigned long pressure)
0112 {
0113 if (pressure >= vmpressure_level_critical)
0114 return VMPRESSURE_CRITICAL;
0115 else if (pressure >= vmpressure_level_med)
0116 return VMPRESSURE_MEDIUM;
0117 return VMPRESSURE_LOW;
0118 }
0119
0120 static enum vmpressure_levels vmpressure_calc_level(unsigned long scanned,
0121 unsigned long reclaimed)
0122 {
0123 unsigned long scale = scanned + reclaimed;
0124 unsigned long pressure = 0;
0125
0126
0127
0128
0129
0130
0131 if (reclaimed >= scanned)
0132 goto out;
0133
0134
0135
0136
0137
0138
0139
0140 pressure = scale - (reclaimed * scale / scanned);
0141 pressure = pressure * 100 / scale;
0142
0143 out:
0144 pr_debug("%s: %3lu (s: %lu r: %lu)\n", __func__, pressure,
0145 scanned, reclaimed);
0146
0147 return vmpressure_level(pressure);
0148 }
0149
0150 struct vmpressure_event {
0151 struct eventfd_ctx *efd;
0152 enum vmpressure_levels level;
0153 enum vmpressure_modes mode;
0154 struct list_head node;
0155 };
0156
0157 static bool vmpressure_event(struct vmpressure *vmpr,
0158 const enum vmpressure_levels level,
0159 bool ancestor, bool signalled)
0160 {
0161 struct vmpressure_event *ev;
0162 bool ret = false;
0163
0164 mutex_lock(&vmpr->events_lock);
0165 list_for_each_entry(ev, &vmpr->events, node) {
0166 if (ancestor && ev->mode == VMPRESSURE_LOCAL)
0167 continue;
0168 if (signalled && ev->mode == VMPRESSURE_NO_PASSTHROUGH)
0169 continue;
0170 if (level < ev->level)
0171 continue;
0172 eventfd_signal(ev->efd, 1);
0173 ret = true;
0174 }
0175 mutex_unlock(&vmpr->events_lock);
0176
0177 return ret;
0178 }
0179
0180 static void vmpressure_work_fn(struct work_struct *work)
0181 {
0182 struct vmpressure *vmpr = work_to_vmpressure(work);
0183 unsigned long scanned;
0184 unsigned long reclaimed;
0185 enum vmpressure_levels level;
0186 bool ancestor = false;
0187 bool signalled = false;
0188
0189 spin_lock(&vmpr->sr_lock);
0190
0191
0192
0193
0194
0195
0196
0197
0198 scanned = vmpr->tree_scanned;
0199 if (!scanned) {
0200 spin_unlock(&vmpr->sr_lock);
0201 return;
0202 }
0203
0204 reclaimed = vmpr->tree_reclaimed;
0205 vmpr->tree_scanned = 0;
0206 vmpr->tree_reclaimed = 0;
0207 spin_unlock(&vmpr->sr_lock);
0208
0209 level = vmpressure_calc_level(scanned, reclaimed);
0210
0211 do {
0212 if (vmpressure_event(vmpr, level, ancestor, signalled))
0213 signalled = true;
0214 ancestor = true;
0215 } while ((vmpr = vmpressure_parent(vmpr)));
0216 }
0217
0218
0219
0220
0221
0222
0223
0224
0225
0226
0227
0228
0229
0230
0231
0232
0233
0234
0235
0236
0237
0238
0239 void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
0240 unsigned long scanned, unsigned long reclaimed)
0241 {
0242 struct vmpressure *vmpr;
0243
0244 if (mem_cgroup_disabled())
0245 return;
0246
0247 vmpr = memcg_to_vmpressure(memcg);
0248
0249
0250
0251
0252
0253
0254
0255
0256
0257
0258
0259
0260 if (!(gfp & (__GFP_HIGHMEM | __GFP_MOVABLE | __GFP_IO | __GFP_FS)))
0261 return;
0262
0263
0264
0265
0266
0267
0268
0269
0270
0271 if (!scanned)
0272 return;
0273
0274 if (tree) {
0275 spin_lock(&vmpr->sr_lock);
0276 scanned = vmpr->tree_scanned += scanned;
0277 vmpr->tree_reclaimed += reclaimed;
0278 spin_unlock(&vmpr->sr_lock);
0279
0280 if (scanned < vmpressure_win)
0281 return;
0282 schedule_work(&vmpr->work);
0283 } else {
0284 enum vmpressure_levels level;
0285
0286
0287 if (!memcg || mem_cgroup_is_root(memcg))
0288 return;
0289
0290 spin_lock(&vmpr->sr_lock);
0291 scanned = vmpr->scanned += scanned;
0292 reclaimed = vmpr->reclaimed += reclaimed;
0293 if (scanned < vmpressure_win) {
0294 spin_unlock(&vmpr->sr_lock);
0295 return;
0296 }
0297 vmpr->scanned = vmpr->reclaimed = 0;
0298 spin_unlock(&vmpr->sr_lock);
0299
0300 level = vmpressure_calc_level(scanned, reclaimed);
0301
0302 if (level > VMPRESSURE_LOW) {
0303
0304
0305
0306
0307
0308
0309
0310
0311 WRITE_ONCE(memcg->socket_pressure, jiffies + HZ);
0312 }
0313 }
0314 }
0315
0316
0317
0318
0319
0320
0321
0322
0323
0324
0325
0326
0327 void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio)
0328 {
0329
0330
0331
0332
0333 if (prio > vmpressure_level_critical_prio)
0334 return;
0335
0336
0337
0338
0339
0340
0341
0342
0343 vmpressure(gfp, memcg, true, vmpressure_win, 0);
0344 }
0345
/*
 * Upper bound on the number of argument characters copied by
 * vmpressure_register_event(). Each sizeof() counts the literal's
 * terminating NUL, so this is the two string lengths plus 2.
 */
#define MAX_VMPRESSURE_ARGS_LEN	(sizeof("critical") + sizeof("hierarchy"))
0347
0348
0349
0350
0351
0352
0353
0354
0355
0356
0357
0358
0359
0360
0361
0362
0363
0364
0365
0366 int vmpressure_register_event(struct mem_cgroup *memcg,
0367 struct eventfd_ctx *eventfd, const char *args)
0368 {
0369 struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
0370 struct vmpressure_event *ev;
0371 enum vmpressure_modes mode = VMPRESSURE_NO_PASSTHROUGH;
0372 enum vmpressure_levels level;
0373 char *spec, *spec_orig;
0374 char *token;
0375 int ret = 0;
0376
0377 spec_orig = spec = kstrndup(args, MAX_VMPRESSURE_ARGS_LEN, GFP_KERNEL);
0378 if (!spec)
0379 return -ENOMEM;
0380
0381
0382 token = strsep(&spec, ",");
0383 ret = match_string(vmpressure_str_levels, VMPRESSURE_NUM_LEVELS, token);
0384 if (ret < 0)
0385 goto out;
0386 level = ret;
0387
0388
0389 token = strsep(&spec, ",");
0390 if (token) {
0391 ret = match_string(vmpressure_str_modes, VMPRESSURE_NUM_MODES, token);
0392 if (ret < 0)
0393 goto out;
0394 mode = ret;
0395 }
0396
0397 ev = kzalloc(sizeof(*ev), GFP_KERNEL);
0398 if (!ev) {
0399 ret = -ENOMEM;
0400 goto out;
0401 }
0402
0403 ev->efd = eventfd;
0404 ev->level = level;
0405 ev->mode = mode;
0406
0407 mutex_lock(&vmpr->events_lock);
0408 list_add(&ev->node, &vmpr->events);
0409 mutex_unlock(&vmpr->events_lock);
0410 ret = 0;
0411 out:
0412 kfree(spec_orig);
0413 return ret;
0414 }
0415
0416
0417
0418
0419
0420
0421
0422
0423
0424
0425
0426
0427 void vmpressure_unregister_event(struct mem_cgroup *memcg,
0428 struct eventfd_ctx *eventfd)
0429 {
0430 struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
0431 struct vmpressure_event *ev;
0432
0433 mutex_lock(&vmpr->events_lock);
0434 list_for_each_entry(ev, &vmpr->events, node) {
0435 if (ev->efd != eventfd)
0436 continue;
0437 list_del(&ev->node);
0438 kfree(ev);
0439 break;
0440 }
0441 mutex_unlock(&vmpr->events_lock);
0442 }
0443
0444
0445
0446
0447
0448
0449
0450
0451 void vmpressure_init(struct vmpressure *vmpr)
0452 {
0453 spin_lock_init(&vmpr->sr_lock);
0454 mutex_init(&vmpr->events_lock);
0455 INIT_LIST_HEAD(&vmpr->events);
0456 INIT_WORK(&vmpr->work, vmpressure_work_fn);
0457 }
0458
0459
0460
0461
0462
0463
0464
0465
0466 void vmpressure_cleanup(struct vmpressure *vmpr)
0467 {
0468
0469
0470
0471
0472 flush_work(&vmpr->work);
0473 }