Back to home page

OSCL-LXR

 
 

    


0001 /* SPDX-License-Identifier: GPL-2.0 */
0002 #define _GNU_SOURCE
0003 
0004 #include <linux/limits.h>
0005 #include <linux/oom.h>
0006 #include <fcntl.h>
0007 #include <stdio.h>
0008 #include <stdlib.h>
0009 #include <string.h>
0010 #include <sys/stat.h>
0011 #include <sys/types.h>
0012 #include <unistd.h>
0013 #include <sys/socket.h>
0014 #include <sys/wait.h>
0015 #include <arpa/inet.h>
0016 #include <netinet/in.h>
0017 #include <netdb.h>
0018 #include <errno.h>
0019 #include <sys/mman.h>
0020 
0021 #include "../kselftest.h"
0022 #include "cgroup_util.h"
0023 
0024 static bool has_localevents;
0025 static bool has_recursiveprot;
0026 
0027 /*
0028  * This test creates two nested cgroups with and without enabling
0029  * the memory controller.
0030  */
0031 static int test_memcg_subtree_control(const char *root)
0032 {
0033     char *parent, *child, *parent2 = NULL, *child2 = NULL;
0034     int ret = KSFT_FAIL;
0035     char buf[PAGE_SIZE];
0036 
0037     /* Create two nested cgroups with the memory controller enabled */
0038     parent = cg_name(root, "memcg_test_0");
0039     child = cg_name(root, "memcg_test_0/memcg_test_1");
0040     if (!parent || !child)
0041         goto cleanup_free;
0042 
0043     if (cg_create(parent))
0044         goto cleanup_free;
0045 
0046     if (cg_write(parent, "cgroup.subtree_control", "+memory"))
0047         goto cleanup_parent;
0048 
0049     if (cg_create(child))
0050         goto cleanup_parent;
0051 
0052     if (cg_read_strstr(child, "cgroup.controllers", "memory"))
0053         goto cleanup_child;
0054 
0055     /* Create two nested cgroups without enabling memory controller */
0056     parent2 = cg_name(root, "memcg_test_1");
0057     child2 = cg_name(root, "memcg_test_1/memcg_test_1");
0058     if (!parent2 || !child2)
0059         goto cleanup_free2;
0060 
0061     if (cg_create(parent2))
0062         goto cleanup_free2;
0063 
0064     if (cg_create(child2))
0065         goto cleanup_parent2;
0066 
0067     if (cg_read(child2, "cgroup.controllers", buf, sizeof(buf)))
0068         goto cleanup_all;
0069 
0070     if (!cg_read_strstr(child2, "cgroup.controllers", "memory"))
0071         goto cleanup_all;
0072 
0073     ret = KSFT_PASS;
0074 
0075 cleanup_all:
0076     cg_destroy(child2);
0077 cleanup_parent2:
0078     cg_destroy(parent2);
0079 cleanup_free2:
0080     free(parent2);
0081     free(child2);
0082 cleanup_child:
0083     cg_destroy(child);
0084 cleanup_parent:
0085     cg_destroy(parent);
0086 cleanup_free:
0087     free(parent);
0088     free(child);
0089 
0090     return ret;
0091 }
0092 
0093 static int alloc_anon_50M_check(const char *cgroup, void *arg)
0094 {
0095     size_t size = MB(50);
0096     char *buf, *ptr;
0097     long anon, current;
0098     int ret = -1;
0099 
0100     buf = malloc(size);
0101     for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
0102         *ptr = 0;
0103 
0104     current = cg_read_long(cgroup, "memory.current");
0105     if (current < size)
0106         goto cleanup;
0107 
0108     if (!values_close(size, current, 3))
0109         goto cleanup;
0110 
0111     anon = cg_read_key_long(cgroup, "memory.stat", "anon ");
0112     if (anon < 0)
0113         goto cleanup;
0114 
0115     if (!values_close(anon, current, 3))
0116         goto cleanup;
0117 
0118     ret = 0;
0119 cleanup:
0120     free(buf);
0121     return ret;
0122 }
0123 
/*
 * Populate 50M of pagecache through a temporary file and verify that
 * memory.current and the "file" entry of memory.stat both reflect it.
 * Returns 0 on success, -1 on failure.
 */
static int alloc_pagecache_50M_check(const char *cgroup, void *arg)
{
	size_t size = MB(50);
	int ret = -1;
	long current, file;
	int fd;

	fd = get_temp_fd();
	if (fd < 0)
		return -1;

	if (alloc_pagecache(fd, size))
		goto cleanup;

	current = cg_read_long(cgroup, "memory.current");
	/*
	 * Check for a read error (<0) explicitly: comparing a negative long
	 * against a size_t would silently promote it to a huge unsigned value.
	 */
	if (current < 0 || (size_t)current < size)
		goto cleanup;

	file = cg_read_key_long(cgroup, "memory.stat", "file ");
	if (file < 0)
		goto cleanup;

	if (!values_close(file, current, 10))
		goto cleanup;

	ret = 0;

cleanup:
	close(fd);
	return ret;
}
0155 
0156 /*
0157  * This test create a memory cgroup, allocates
0158  * some anonymous memory and some pagecache
0159  * and check memory.current and some memory.stat values.
0160  */
0161 static int test_memcg_current(const char *root)
0162 {
0163     int ret = KSFT_FAIL;
0164     long current;
0165     char *memcg;
0166 
0167     memcg = cg_name(root, "memcg_test");
0168     if (!memcg)
0169         goto cleanup;
0170 
0171     if (cg_create(memcg))
0172         goto cleanup;
0173 
0174     current = cg_read_long(memcg, "memory.current");
0175     if (current != 0)
0176         goto cleanup;
0177 
0178     if (cg_run(memcg, alloc_anon_50M_check, NULL))
0179         goto cleanup;
0180 
0181     if (cg_run(memcg, alloc_pagecache_50M_check, NULL))
0182         goto cleanup;
0183 
0184     ret = KSFT_PASS;
0185 
0186 cleanup:
0187     cg_destroy(memcg);
0188     free(memcg);
0189 
0190     return ret;
0191 }
0192 
/*
 * Populate 50M of pagecache through the given fd, then linger until the
 * parent process goes away (detected via a changed getppid()), keeping
 * the charge alive in the cgroup meanwhile.
 */
static int alloc_pagecache_50M_noexit(const char *cgroup, void *arg)
{
	int fd = (long)arg;
	int orig_ppid = getppid();

	if (alloc_pagecache(fd, MB(50)))
		return -1;

	/* Stay alive for as long as our original parent is around. */
	while (getppid() == orig_ppid)
		sleep(1);

	return 0;
}
0206 
0207 static int alloc_anon_noexit(const char *cgroup, void *arg)
0208 {
0209     int ppid = getppid();
0210     size_t size = (unsigned long)arg;
0211     char *buf, *ptr;
0212 
0213     buf = malloc(size);
0214     for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
0215         *ptr = 0;
0216 
0217     while (getppid() == ppid)
0218         sleep(1);
0219 
0220     free(buf);
0221     return 0;
0222 }
0223 
/*
 * Wait until all processes in @cgroup have been killed asynchronously by
 * the OOM killer. Polls cgroup.procs every 100ms for up to ~1s.
 * Returns 0 once the cgroup is empty, -1 on timeout.
 */
static int cg_test_proc_killed(const char *cgroup)
{
	int attempts = 10;

	while (attempts--) {
		if (!cg_read_strcmp(cgroup, "cgroup.procs", ""))
			return 0;

		usleep(100000);
	}
	return -1;
}
0240 
/*
 * First, this test creates the following hierarchy:
 * A       memory.min = 0,    memory.max = 200M
 * A/B     memory.min = 50M
 * A/B/C   memory.min = 75M,  memory.current = 50M
 * A/B/D   memory.min = 25M,  memory.current = 50M
 * A/B/E   memory.min = 0,    memory.current = 50M
 * A/B/F   memory.min = 500M, memory.current = 0
 *
 * (or memory.low if we test soft protection)
 *
 * Usages are pagecache and the test keeps a running
 * process in every leaf cgroup.
 * Then it creates A/G and creates a significant
 * memory pressure in A.
 *
 * Then it checks actual memory usages and expects that:
 * A/B    memory.current ~= 50M
 * A/B/C  memory.current ~= 29M
 * A/B/D  memory.current ~= 21M
 * A/B/E  memory.current ~= 0
 * A/B/F  memory.current  = 0
 * (for origin of the numbers, see model in memcg_protection.m.)
 *
 * After that it tries to allocate more than there is
 * unprotected memory in A available, and checks that:
 * a) memory.min protects pagecache even in this case,
 * b) memory.low allows reclaiming page cache with low events.
 */
static int test_memcg_protection(const char *root, bool min)
{
	int ret = KSFT_FAIL, rc;
	char *parent[3] = {NULL};	/* parent[0]=A, parent[1]=A/B, parent[2]=A/G */
	char *children[4] = {NULL};	/* A/B/C .. A/B/F */
	const char *attribute = min ? "memory.min" : "memory.low";
	long c[4];			/* memory.current snapshot per child */
	int i, attempts;
	int fd;				/* temp file backing the pagecache usage */

	fd = get_temp_fd();
	if (fd < 0)
		goto cleanup;

	parent[0] = cg_name(root, "memcg_test_0");
	if (!parent[0])
		goto cleanup;

	parent[1] = cg_name(parent[0], "memcg_test_1");
	if (!parent[1])
		goto cleanup;

	parent[2] = cg_name(parent[0], "memcg_test_2");
	if (!parent[2])
		goto cleanup;

	if (cg_create(parent[0]))
		goto cleanup;

	if (cg_read_long(parent[0], attribute)) {
		/* No memory.min on older kernels is fine */
		if (min)
			ret = KSFT_SKIP;
		goto cleanup;
	}

	if (cg_write(parent[0], "cgroup.subtree_control", "+memory"))
		goto cleanup;

	/* Cap A at 200M and disable swap so reclaim has to hit pagecache */
	if (cg_write(parent[0], "memory.max", "200M"))
		goto cleanup;

	if (cg_write(parent[0], "memory.swap.max", "0"))
		goto cleanup;

	if (cg_create(parent[1]))
		goto cleanup;

	if (cg_write(parent[1], "cgroup.subtree_control", "+memory"))
		goto cleanup;

	if (cg_create(parent[2]))
		goto cleanup;

	for (i = 0; i < ARRAY_SIZE(children); i++) {
		children[i] = cg_name_indexed(parent[1], "child_memcg", i);
		if (!children[i])
			goto cleanup;

		if (cg_create(children[i]))
			goto cleanup;

		/* The last child (F) intentionally stays empty */
		if (i > 2)
			continue;

		cg_run_nowait(children[i], alloc_pagecache_50M_noexit,
			      (void *)(long)fd);
	}

	/* Apply the protection values from the diagram above */
	if (cg_write(parent[1],   attribute, "50M"))
		goto cleanup;
	if (cg_write(children[0], attribute, "75M"))
		goto cleanup;
	if (cg_write(children[1], attribute, "25M"))
		goto cleanup;
	if (cg_write(children[2], attribute, "0"))
		goto cleanup;
	if (cg_write(children[3], attribute, "500M"))
		goto cleanup;

	/* Give the children up to ~6s to populate 150M of pagecache */
	attempts = 0;
	while (!values_close(cg_read_long(parent[1], "memory.current"),
			     MB(150), 3)) {
		if (attempts++ > 5)
			break;
		sleep(1);
	}

	/* Create memory pressure in A with an anon allocation in A/G */
	if (cg_run(parent[2], alloc_anon, (void *)MB(148)))
		goto cleanup;

	if (!values_close(cg_read_long(parent[1], "memory.current"), MB(50), 3))
		goto cleanup;

	for (i = 0; i < ARRAY_SIZE(children); i++)
		c[i] = cg_read_long(children[i], "memory.current");

	if (!values_close(c[0], MB(29), 10))
		goto cleanup;

	if (!values_close(c[1], MB(21), 10))
		goto cleanup;

	/* F never allocated anything, so nothing may be charged to it */
	if (c[3] != 0)
		goto cleanup;

	/* Now allocate more than the unprotected memory available in A */
	rc = cg_run(parent[2], alloc_anon, (void *)MB(170));
	if (min && !rc)
		goto cleanup;
	else if (!min && rc) {
		fprintf(stderr,
			"memory.low prevents from allocating anon memory\n");
		goto cleanup;
	}

	/* The protected ~50M in A/B must have survived the pressure */
	if (!values_close(cg_read_long(parent[1], "memory.current"), MB(50), 3))
		goto cleanup;

	if (min) {
		ret = KSFT_PASS;
		goto cleanup;
	}

	/* For memory.low, check low events fired only where expected */
	for (i = 0; i < ARRAY_SIZE(children); i++) {
		int no_low_events_index = 1;
		long low, oom;

		oom = cg_read_key_long(children[i], "memory.events", "oom ");
		low = cg_read_key_long(children[i], "memory.events", "low ");

		if (oom)
			goto cleanup;
		if (i <= no_low_events_index && low <= 0)
			goto cleanup;
		if (i > no_low_events_index && low)
			goto cleanup;

	}

	ret = KSFT_PASS;

cleanup:
	/* Tear down children before parents; skip names never allocated */
	for (i = ARRAY_SIZE(children) - 1; i >= 0; i--) {
		if (!children[i])
			continue;

		cg_destroy(children[i]);
		free(children[i]);
	}

	for (i = ARRAY_SIZE(parent) - 1; i >= 0; i--) {
		if (!parent[i])
			continue;

		cg_destroy(parent[i]);
		free(parent[i]);
	}
	close(fd);
	return ret;
}
0430 
/* Exercise memory.min: hard memory protection of the hierarchy. */
static int test_memcg_min(const char *root)
{
	return test_memcg_protection(root, true);
}
0435 
/* Exercise memory.low: best-effort memory protection of the hierarchy. */
static int test_memcg_low(const char *root)
{
	return test_memcg_protection(root, false);
}
0440 
/*
 * Try to allocate 50M of pagecache in a cgroup whose memory.high or
 * memory.max is expected to be 30M, then verify that the resulting
 * usage was capped close to 30M. Returns 0 on success, -1 otherwise.
 */
static int alloc_pagecache_max_30M(const char *cgroup, void *arg)
{
	long limit_high, limit_max, usage;
	int tmp_fd, ret = -1;

	limit_high = cg_read_long(cgroup, "memory.high");
	limit_max = cg_read_long(cgroup, "memory.max");
	/* At least one of the two limits must already be set to 30M */
	if (limit_high != MB(30) && limit_max != MB(30))
		return -1;

	tmp_fd = get_temp_fd();
	if (tmp_fd < 0)
		return -1;

	if (alloc_pagecache(tmp_fd, MB(50)))
		goto out;

	usage = cg_read_long(cgroup, "memory.current");
	if (values_close(usage, MB(30), 5))
		ret = 0;

out:
	close(tmp_fd);
	return ret;
}
0471 
0472 /*
0473  * This test checks that memory.high limits the amount of
0474  * memory which can be consumed by either anonymous memory
0475  * or pagecache.
0476  */
0477 static int test_memcg_high(const char *root)
0478 {
0479     int ret = KSFT_FAIL;
0480     char *memcg;
0481     long high;
0482 
0483     memcg = cg_name(root, "memcg_test");
0484     if (!memcg)
0485         goto cleanup;
0486 
0487     if (cg_create(memcg))
0488         goto cleanup;
0489 
0490     if (cg_read_strcmp(memcg, "memory.high", "max\n"))
0491         goto cleanup;
0492 
0493     if (cg_write(memcg, "memory.swap.max", "0"))
0494         goto cleanup;
0495 
0496     if (cg_write(memcg, "memory.high", "30M"))
0497         goto cleanup;
0498 
0499     if (cg_run(memcg, alloc_anon, (void *)MB(31)))
0500         goto cleanup;
0501 
0502     if (!cg_run(memcg, alloc_pagecache_50M_check, NULL))
0503         goto cleanup;
0504 
0505     if (cg_run(memcg, alloc_pagecache_max_30M, NULL))
0506         goto cleanup;
0507 
0508     high = cg_read_key_long(memcg, "memory.events", "high ");
0509     if (high <= 0)
0510         goto cleanup;
0511 
0512     ret = KSFT_PASS;
0513 
0514 cleanup:
0515     cg_destroy(memcg);
0516     free(memcg);
0517 
0518     return ret;
0519 }
0520 
0521 static int alloc_anon_mlock(const char *cgroup, void *arg)
0522 {
0523     size_t size = (size_t)arg;
0524     void *buf;
0525 
0526     buf = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON,
0527            0, 0);
0528     if (buf == MAP_FAILED)
0529         return -1;
0530 
0531     mlock(buf, size);
0532     munmap(buf, size);
0533     return 0;
0534 }
0535 
/*
 * This test checks that memory.high is able to throttle big single shot
 * allocation i.e. large allocation within one kernel entry.
 */
static int test_memcg_high_sync(const char *root)
{
	int ret = KSFT_FAIL, pid, fd = -1;
	char *memcg;
	long pre_high, pre_max;
	long post_high, post_max;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	/* Snapshot event counters before any pressure is applied */
	pre_high = cg_read_key_long(memcg, "memory.events", "high ");
	pre_max = cg_read_key_long(memcg, "memory.events", "max ");
	if (pre_high < 0 || pre_max < 0)
		goto cleanup;

	if (cg_write(memcg, "memory.swap.max", "0"))
		goto cleanup;

	if (cg_write(memcg, "memory.high", "30M"))
		goto cleanup;

	if (cg_write(memcg, "memory.max", "140M"))
		goto cleanup;

	/* Arm the memory.events listener before starting the allocation */
	fd = memcg_prepare_for_wait(memcg);
	if (fd < 0)
		goto cleanup;

	/* One big mlocked mmap: a single allocation in one kernel entry */
	pid = cg_run_nowait(memcg, alloc_anon_mlock, (void *)MB(200));
	if (pid < 0)
		goto cleanup;

	cg_wait_for(fd);

	post_high = cg_read_key_long(memcg, "memory.events", "high ");
	post_max = cg_read_key_long(memcg, "memory.events", "max ");
	if (post_high < 0 || post_max < 0)
		goto cleanup;

	/*
	 * The high limit must have been hit ("high" grew) while the hard
	 * limit must not have been ("max" unchanged): throttling at high
	 * should keep the allocation from reaching memory.max.
	 */
	if (pre_high == post_high || pre_max != post_max)
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	if (fd >= 0)
		close(fd);
	cg_destroy(memcg);
	free(memcg);

	return ret;
}
0596 
0597 /*
0598  * This test checks that memory.max limits the amount of
0599  * memory which can be consumed by either anonymous memory
0600  * or pagecache.
0601  */
0602 static int test_memcg_max(const char *root)
0603 {
0604     int ret = KSFT_FAIL;
0605     char *memcg;
0606     long current, max;
0607 
0608     memcg = cg_name(root, "memcg_test");
0609     if (!memcg)
0610         goto cleanup;
0611 
0612     if (cg_create(memcg))
0613         goto cleanup;
0614 
0615     if (cg_read_strcmp(memcg, "memory.max", "max\n"))
0616         goto cleanup;
0617 
0618     if (cg_write(memcg, "memory.swap.max", "0"))
0619         goto cleanup;
0620 
0621     if (cg_write(memcg, "memory.max", "30M"))
0622         goto cleanup;
0623 
0624     /* Should be killed by OOM killer */
0625     if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
0626         goto cleanup;
0627 
0628     if (cg_run(memcg, alloc_pagecache_max_30M, NULL))
0629         goto cleanup;
0630 
0631     current = cg_read_long(memcg, "memory.current");
0632     if (current > MB(30) || !current)
0633         goto cleanup;
0634 
0635     max = cg_read_key_long(memcg, "memory.events", "max ");
0636     if (max <= 0)
0637         goto cleanup;
0638 
0639     ret = KSFT_PASS;
0640 
0641 cleanup:
0642     cg_destroy(memcg);
0643     free(memcg);
0644 
0645     return ret;
0646 }
0647 
0648 /*
0649  * This test checks that memory.reclaim reclaims the given
0650  * amount of memory (from both anon and file, if possible).
0651  */
0652 static int test_memcg_reclaim(const char *root)
0653 {
0654     int ret = KSFT_FAIL, fd, retries;
0655     char *memcg;
0656     long current, expected_usage, to_reclaim;
0657     char buf[64];
0658 
0659     memcg = cg_name(root, "memcg_test");
0660     if (!memcg)
0661         goto cleanup;
0662 
0663     if (cg_create(memcg))
0664         goto cleanup;
0665 
0666     current = cg_read_long(memcg, "memory.current");
0667     if (current != 0)
0668         goto cleanup;
0669 
0670     fd = get_temp_fd();
0671     if (fd < 0)
0672         goto cleanup;
0673 
0674     cg_run_nowait(memcg, alloc_pagecache_50M_noexit, (void *)(long)fd);
0675 
0676     /*
0677      * If swap is enabled, try to reclaim from both anon and file, else try
0678      * to reclaim from file only.
0679      */
0680     if (is_swap_enabled()) {
0681         cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(50));
0682         expected_usage = MB(100);
0683     } else
0684         expected_usage = MB(50);
0685 
0686     /*
0687      * Wait until current usage reaches the expected usage (or we run out of
0688      * retries).
0689      */
0690     retries = 5;
0691     while (!values_close(cg_read_long(memcg, "memory.current"),
0692                 expected_usage, 10)) {
0693         if (retries--) {
0694             sleep(1);
0695             continue;
0696         } else {
0697             fprintf(stderr,
0698                 "failed to allocate %ld for memcg reclaim test\n",
0699                 expected_usage);
0700             goto cleanup;
0701         }
0702     }
0703 
0704     /*
0705      * Reclaim until current reaches 30M, this makes sure we hit both anon
0706      * and file if swap is enabled.
0707      */
0708     retries = 5;
0709     while (true) {
0710         int err;
0711 
0712         current = cg_read_long(memcg, "memory.current");
0713         to_reclaim = current - MB(30);
0714 
0715         /*
0716          * We only keep looping if we get EAGAIN, which means we could
0717          * not reclaim the full amount.
0718          */
0719         if (to_reclaim <= 0)
0720             goto cleanup;
0721 
0722 
0723         snprintf(buf, sizeof(buf), "%ld", to_reclaim);
0724         err = cg_write(memcg, "memory.reclaim", buf);
0725         if (!err) {
0726             /*
0727              * If writing succeeds, then the written amount should have been
0728              * fully reclaimed (and maybe more).
0729              */
0730             current = cg_read_long(memcg, "memory.current");
0731             if (!values_close(current, MB(30), 3) && current > MB(30))
0732                 goto cleanup;
0733             break;
0734         }
0735 
0736         /* The kernel could not reclaim the full amount, try again. */
0737         if (err == -EAGAIN && retries--)
0738             continue;
0739 
0740         /* We got an unexpected error or ran out of retries. */
0741         goto cleanup;
0742     }
0743 
0744     ret = KSFT_PASS;
0745 cleanup:
0746     cg_destroy(memcg);
0747     free(memcg);
0748     close(fd);
0749 
0750     return ret;
0751 }
0752 
0753 static int alloc_anon_50M_check_swap(const char *cgroup, void *arg)
0754 {
0755     long mem_max = (long)arg;
0756     size_t size = MB(50);
0757     char *buf, *ptr;
0758     long mem_current, swap_current;
0759     int ret = -1;
0760 
0761     buf = malloc(size);
0762     for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
0763         *ptr = 0;
0764 
0765     mem_current = cg_read_long(cgroup, "memory.current");
0766     if (!mem_current || !values_close(mem_current, mem_max, 3))
0767         goto cleanup;
0768 
0769     swap_current = cg_read_long(cgroup, "memory.swap.current");
0770     if (!swap_current ||
0771         !values_close(mem_current + swap_current, size, 3))
0772         goto cleanup;
0773 
0774     ret = 0;
0775 cleanup:
0776     free(buf);
0777     return ret;
0778 }
0779 
0780 /*
0781  * This test checks that memory.swap.max limits the amount of
0782  * anonymous memory which can be swapped out.
0783  */
0784 static int test_memcg_swap_max(const char *root)
0785 {
0786     int ret = KSFT_FAIL;
0787     char *memcg;
0788     long max;
0789 
0790     if (!is_swap_enabled())
0791         return KSFT_SKIP;
0792 
0793     memcg = cg_name(root, "memcg_test");
0794     if (!memcg)
0795         goto cleanup;
0796 
0797     if (cg_create(memcg))
0798         goto cleanup;
0799 
0800     if (cg_read_long(memcg, "memory.swap.current")) {
0801         ret = KSFT_SKIP;
0802         goto cleanup;
0803     }
0804 
0805     if (cg_read_strcmp(memcg, "memory.max", "max\n"))
0806         goto cleanup;
0807 
0808     if (cg_read_strcmp(memcg, "memory.swap.max", "max\n"))
0809         goto cleanup;
0810 
0811     if (cg_write(memcg, "memory.swap.max", "30M"))
0812         goto cleanup;
0813 
0814     if (cg_write(memcg, "memory.max", "30M"))
0815         goto cleanup;
0816 
0817     /* Should be killed by OOM killer */
0818     if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
0819         goto cleanup;
0820 
0821     if (cg_read_key_long(memcg, "memory.events", "oom ") != 1)
0822         goto cleanup;
0823 
0824     if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 1)
0825         goto cleanup;
0826 
0827     if (cg_run(memcg, alloc_anon_50M_check_swap, (void *)MB(30)))
0828         goto cleanup;
0829 
0830     max = cg_read_key_long(memcg, "memory.events", "max ");
0831     if (max <= 0)
0832         goto cleanup;
0833 
0834     ret = KSFT_PASS;
0835 
0836 cleanup:
0837     cg_destroy(memcg);
0838     free(memcg);
0839 
0840     return ret;
0841 }
0842 
0843 /*
0844  * This test disables swapping and tries to allocate anonymous memory
0845  * up to OOM. Then it checks for oom and oom_kill events in
0846  * memory.events.
0847  */
0848 static int test_memcg_oom_events(const char *root)
0849 {
0850     int ret = KSFT_FAIL;
0851     char *memcg;
0852 
0853     memcg = cg_name(root, "memcg_test");
0854     if (!memcg)
0855         goto cleanup;
0856 
0857     if (cg_create(memcg))
0858         goto cleanup;
0859 
0860     if (cg_write(memcg, "memory.max", "30M"))
0861         goto cleanup;
0862 
0863     if (cg_write(memcg, "memory.swap.max", "0"))
0864         goto cleanup;
0865 
0866     if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
0867         goto cleanup;
0868 
0869     if (cg_read_strcmp(memcg, "cgroup.procs", ""))
0870         goto cleanup;
0871 
0872     if (cg_read_key_long(memcg, "memory.events", "oom ") != 1)
0873         goto cleanup;
0874 
0875     if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 1)
0876         goto cleanup;
0877 
0878     ret = KSFT_PASS;
0879 
0880 cleanup:
0881     cg_destroy(memcg);
0882     free(memcg);
0883 
0884     return ret;
0885 }
0886 
/*
 * Arguments for tcp_server(): the port to listen on and a pipe used to
 * report the bind() status back to the test process.
 */
struct tcp_server_args {
	unsigned short port;	/* TCP port to bind/listen on */
	int ctl[2];		/* control pipe: [0] read end, [1] write end */
};
0891 
/*
 * Listen on srv_args->port (IPv6, any address) and report the bind()
 * result to the parent through the control pipe. Once a client connects,
 * keep writing to it until the peer resets the connection.
 * Returns 0 on a clean ECONNRESET termination, -1 otherwise.
 */
static int tcp_server(const char *cgroup, void *arg)
{
	struct tcp_server_args *srv_args = arg;
	struct sockaddr_in6 saddr = { 0 };
	socklen_t slen = sizeof(saddr);
	int sk, client_sk, ctl_fd, yes = 1, ret = -1;

	/* The server only writes to the control pipe; close the read end. */
	close(srv_args->ctl[0]);
	ctl_fd = srv_args->ctl[1];

	saddr.sin6_family = AF_INET6;
	saddr.sin6_addr = in6addr_any;
	saddr.sin6_port = htons(srv_args->port);

	sk = socket(AF_INET6, SOCK_STREAM, 0);
	if (sk < 0)
		return ret;

	if (setsockopt(sk, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0)
		goto cleanup;

	if (bind(sk, (struct sockaddr *)&saddr, slen)) {
		/*
		 * Tell the parent which errno bind() failed with, e.g.
		 * EADDRINUSE so it can retry with another port.
		 * NOTE(review): taking &errno relies on the glibc definition
		 * being an lvalue of int — not guaranteed by ISO C; confirm
		 * if this ever needs to be portable beyond glibc.
		 */
		write(ctl_fd, &errno, sizeof(errno));
		goto cleanup;
	}

	if (listen(sk, 1))
		goto cleanup;

	/* Report success (0) to the parent before blocking in accept(). */
	ret = 0;
	if (write(ctl_fd, &ret, sizeof(ret)) != sizeof(ret)) {
		ret = -1;
		goto cleanup;
	}

	client_sk = accept(sk, NULL, NULL);
	if (client_sk < 0)
		goto cleanup;

	ret = -1;
	/* Flood the client until it drops the connection. */
	for (;;) {
		uint8_t buf[0x100000];

		if (write(client_sk, buf, sizeof(buf)) <= 0) {
			/* The client resetting the connection is expected. */
			if (errno == ECONNRESET)
				ret = 0;
			break;
		}
	}

	close(client_sk);

cleanup:
	close(sk);
	return ret;
}
0948 
/*
 * Connect to a local TCP server on @port and keep reading from the
 * socket while comparing the cgroup's memory.current with the "sock "
 * counter from memory.stat. Returns KSFT_PASS once they are close,
 * KSFT_FAIL (or a getaddrinfo error) otherwise.
 */
static int tcp_client(const char *cgroup, unsigned short port)
{
	const char server[] = "localhost";
	struct addrinfo *ai;
	char servport[6];
	int retries = 0x10; /* nice round number */
	int sk, ret;

	/*
	 * Use %hu: the port is unsigned and may exceed SHRT_MAX, in which
	 * case %hd would format it as a negative number and getaddrinfo()
	 * would fail on the resulting service string.
	 */
	snprintf(servport, sizeof(servport), "%hu", port);
	ret = getaddrinfo(server, servport, NULL, &ai);
	if (ret)
		return ret;

	sk = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol);
	if (sk < 0)
		goto free_ainfo;

	ret = connect(sk, ai->ai_addr, ai->ai_addrlen);
	if (ret < 0)
		goto close_sk;

	ret = KSFT_FAIL;
	while (retries--) {
		uint8_t buf[0x100000];
		long current, sock;

		if (read(sk, buf, sizeof(buf)) <= 0)
			goto close_sk;

		current = cg_read_long(cgroup, "memory.current");
		sock = cg_read_key_long(cgroup, "memory.stat", "sock ");

		if (current < 0 || sock < 0)
			goto close_sk;

		/* Socket buffers should dominate the cgroup's memory usage */
		if (values_close(current, sock, 10)) {
			ret = KSFT_PASS;
			break;
		}
	}

close_sk:
	close(sk);
free_ainfo:
	freeaddrinfo(ai);
	return ret;
}
0996 
0997 /*
0998  * This test checks socket memory accounting.
0999  * The test forks a TCP server listens on a random port between 1000
1000  * and 61000. Once it gets a client connection, it starts writing to
1001  * its socket.
1002  * The TCP client interleaves reads from the socket with check whether
1003  * memory.current and memory.stat.sock are similar.
1004  */
1005 static int test_memcg_sock(const char *root)
1006 {
1007     int bind_retries = 5, ret = KSFT_FAIL, pid, err;
1008     unsigned short port;
1009     char *memcg;
1010 
1011     memcg = cg_name(root, "memcg_test");
1012     if (!memcg)
1013         goto cleanup;
1014 
1015     if (cg_create(memcg))
1016         goto cleanup;
1017 
1018     while (bind_retries--) {
1019         struct tcp_server_args args;
1020 
1021         if (pipe(args.ctl))
1022             goto cleanup;
1023 
1024         port = args.port = 1000 + rand() % 60000;
1025 
1026         pid = cg_run_nowait(memcg, tcp_server, &args);
1027         if (pid < 0)
1028             goto cleanup;
1029 
1030         close(args.ctl[1]);
1031         if (read(args.ctl[0], &err, sizeof(err)) != sizeof(err))
1032             goto cleanup;
1033         close(args.ctl[0]);
1034 
1035         if (!err)
1036             break;
1037         if (err != EADDRINUSE)
1038             goto cleanup;
1039 
1040         waitpid(pid, NULL, 0);
1041     }
1042 
1043     if (err == EADDRINUSE) {
1044         ret = KSFT_SKIP;
1045         goto cleanup;
1046     }
1047 
1048     if (tcp_client(memcg, port) != KSFT_PASS)
1049         goto cleanup;
1050 
1051     waitpid(pid, &err, 0);
1052     if (WEXITSTATUS(err))
1053         goto cleanup;
1054 
1055     if (cg_read_long(memcg, "memory.current") < 0)
1056         goto cleanup;
1057 
1058     if (cg_read_key_long(memcg, "memory.stat", "sock "))
1059         goto cleanup;
1060 
1061     ret = KSFT_PASS;
1062 
1063 cleanup:
1064     cg_destroy(memcg);
1065     free(memcg);
1066 
1067     return ret;
1068 }
1069 
1070 /*
1071  * This test disables swapping and tries to allocate anonymous memory
1072  * up to OOM with memory.group.oom set. Then it checks that all
1073  * processes in the leaf were killed. It also checks that oom_events
1074  * were propagated to the parent level.
1075  */
1076 static int test_memcg_oom_group_leaf_events(const char *root)
1077 {
1078     int ret = KSFT_FAIL;
1079     char *parent, *child;
1080     long parent_oom_events;
1081 
1082     parent = cg_name(root, "memcg_test_0");
1083     child = cg_name(root, "memcg_test_0/memcg_test_1");
1084 
1085     if (!parent || !child)
1086         goto cleanup;
1087 
1088     if (cg_create(parent))
1089         goto cleanup;
1090 
1091     if (cg_create(child))
1092         goto cleanup;
1093 
1094     if (cg_write(parent, "cgroup.subtree_control", "+memory"))
1095         goto cleanup;
1096 
1097     if (cg_write(child, "memory.max", "50M"))
1098         goto cleanup;
1099 
1100     if (cg_write(child, "memory.swap.max", "0"))
1101         goto cleanup;
1102 
1103     if (cg_write(child, "memory.oom.group", "1"))
1104         goto cleanup;
1105 
1106     cg_run_nowait(parent, alloc_anon_noexit, (void *) MB(60));
1107     cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
1108     cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
1109     if (!cg_run(child, alloc_anon, (void *)MB(100)))
1110         goto cleanup;
1111 
1112     if (cg_test_proc_killed(child))
1113         goto cleanup;
1114 
1115     if (cg_read_key_long(child, "memory.events", "oom_kill ") <= 0)
1116         goto cleanup;
1117 
1118     parent_oom_events = cg_read_key_long(
1119             parent, "memory.events", "oom_kill ");
1120     /*
1121      * If memory_localevents is not enabled (the default), the parent should
1122      * count OOM events in its children groups. Otherwise, it should not
1123      * have observed any events.
1124      */
1125     if (has_localevents && parent_oom_events != 0)
1126         goto cleanup;
1127     else if (!has_localevents && parent_oom_events <= 0)
1128         goto cleanup;
1129 
1130     ret = KSFT_PASS;
1131 
1132 cleanup:
1133     if (child)
1134         cg_destroy(child);
1135     if (parent)
1136         cg_destroy(parent);
1137     free(child);
1138     free(parent);
1139 
1140     return ret;
1141 }
1142 
1143 /*
1144  * This test disables swapping and tries to allocate anonymous memory
1145  * up to OOM with memory.group.oom set. Then it checks that all
1146  * processes in the parent and leaf were killed.
1147  */
1148 static int test_memcg_oom_group_parent_events(const char *root)
1149 {
1150     int ret = KSFT_FAIL;
1151     char *parent, *child;
1152 
1153     parent = cg_name(root, "memcg_test_0");
1154     child = cg_name(root, "memcg_test_0/memcg_test_1");
1155 
1156     if (!parent || !child)
1157         goto cleanup;
1158 
1159     if (cg_create(parent))
1160         goto cleanup;
1161 
1162     if (cg_create(child))
1163         goto cleanup;
1164 
1165     if (cg_write(parent, "memory.max", "80M"))
1166         goto cleanup;
1167 
1168     if (cg_write(parent, "memory.swap.max", "0"))
1169         goto cleanup;
1170 
1171     if (cg_write(parent, "memory.oom.group", "1"))
1172         goto cleanup;
1173 
1174     cg_run_nowait(parent, alloc_anon_noexit, (void *) MB(60));
1175     cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
1176     cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
1177 
1178     if (!cg_run(child, alloc_anon, (void *)MB(100)))
1179         goto cleanup;
1180 
1181     if (cg_test_proc_killed(child))
1182         goto cleanup;
1183     if (cg_test_proc_killed(parent))
1184         goto cleanup;
1185 
1186     ret = KSFT_PASS;
1187 
1188 cleanup:
1189     if (child)
1190         cg_destroy(child);
1191     if (parent)
1192         cg_destroy(parent);
1193     free(child);
1194     free(parent);
1195 
1196     return ret;
1197 }
1198 
1199 /*
1200  * This test disables swapping and tries to allocate anonymous memory
1201  * up to OOM with memory.group.oom set. Then it checks that all
1202  * processes were killed except those set with OOM_SCORE_ADJ_MIN
1203  */
1204 static int test_memcg_oom_group_score_events(const char *root)
1205 {
1206     int ret = KSFT_FAIL;
1207     char *memcg;
1208     int safe_pid;
1209 
1210     memcg = cg_name(root, "memcg_test_0");
1211 
1212     if (!memcg)
1213         goto cleanup;
1214 
1215     if (cg_create(memcg))
1216         goto cleanup;
1217 
1218     if (cg_write(memcg, "memory.max", "50M"))
1219         goto cleanup;
1220 
1221     if (cg_write(memcg, "memory.swap.max", "0"))
1222         goto cleanup;
1223 
1224     if (cg_write(memcg, "memory.oom.group", "1"))
1225         goto cleanup;
1226 
1227     safe_pid = cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(1));
1228     if (set_oom_adj_score(safe_pid, OOM_SCORE_ADJ_MIN))
1229         goto cleanup;
1230 
1231     cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(1));
1232     if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
1233         goto cleanup;
1234 
1235     if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 3)
1236         goto cleanup;
1237 
1238     if (kill(safe_pid, SIGKILL))
1239         goto cleanup;
1240 
1241     ret = KSFT_PASS;
1242 
1243 cleanup:
1244     if (memcg)
1245         cg_destroy(memcg);
1246     free(memcg);
1247 
1248     return ret;
1249 }
1250 
/* T() pairs a test function with its stringified name for result reporting. */
#define T(x) { x, #x }
struct memcg_test {
    int (*fn)(const char *root);    /* test body; returns a KSFT_* status */
    const char *name;               /* function name shown in kselftest output */
} tests[] = {
    T(test_memcg_subtree_control),
    T(test_memcg_current),
    T(test_memcg_min),
    T(test_memcg_low),
    T(test_memcg_high),
    T(test_memcg_high_sync),
    T(test_memcg_max),
    T(test_memcg_reclaim),
    T(test_memcg_oom_events),
    T(test_memcg_swap_max),
    T(test_memcg_sock),
    T(test_memcg_oom_group_leaf_events),
    T(test_memcg_oom_group_parent_events),
    T(test_memcg_oom_group_score_events),
};
#undef T
1272 
1273 int main(int argc, char **argv)
1274 {
1275     char root[PATH_MAX];
1276     int i, proc_status, ret = EXIT_SUCCESS;
1277 
1278     if (cg_find_unified_root(root, sizeof(root)))
1279         ksft_exit_skip("cgroup v2 isn't mounted\n");
1280 
1281     /*
1282      * Check that memory controller is available:
1283      * memory is listed in cgroup.controllers
1284      */
1285     if (cg_read_strstr(root, "cgroup.controllers", "memory"))
1286         ksft_exit_skip("memory controller isn't available\n");
1287 
1288     if (cg_read_strstr(root, "cgroup.subtree_control", "memory"))
1289         if (cg_write(root, "cgroup.subtree_control", "+memory"))
1290             ksft_exit_skip("Failed to set memory controller\n");
1291 
1292     proc_status = proc_mount_contains("memory_recursiveprot");
1293     if (proc_status < 0)
1294         ksft_exit_skip("Failed to query cgroup mount option\n");
1295     has_recursiveprot = proc_status;
1296 
1297     proc_status = proc_mount_contains("memory_localevents");
1298     if (proc_status < 0)
1299         ksft_exit_skip("Failed to query cgroup mount option\n");
1300     has_localevents = proc_status;
1301 
1302     for (i = 0; i < ARRAY_SIZE(tests); i++) {
1303         switch (tests[i].fn(root)) {
1304         case KSFT_PASS:
1305             ksft_test_result_pass("%s\n", tests[i].name);
1306             break;
1307         case KSFT_SKIP:
1308             ksft_test_result_skip("%s\n", tests[i].name);
1309             break;
1310         default:
1311             ret = EXIT_FAILURE;
1312             ksft_test_result_fail("%s\n", tests[i].name);
1313             break;
1314         }
1315     }
1316 
1317     return ret;
1318 }