0001
0002
0003 #define _GNU_SOURCE
0004
0005 #include <errno.h>
0006 #include <fcntl.h>
0007 #include <linux/limits.h>
0008 #include <poll.h>
0009 #include <signal.h>
0010 #include <stdio.h>
0011 #include <stdlib.h>
0012 #include <string.h>
0013 #include <sys/inotify.h>
0014 #include <sys/stat.h>
0015 #include <sys/types.h>
0016 #include <sys/wait.h>
0017 #include <unistd.h>
0018
0019 #include "cgroup_util.h"
0020 #include "../clone3/clone3_selftests.h"
0021
0022
/*
 * Read the beginning of the file at @path into @buf as a C string.
 *
 * At most @max_len - 1 bytes are read with a single read() call and the
 * result is always NUL-terminated on success.
 *
 * Returns the number of bytes stored (0 for an empty file) or a negative
 * errno value when the file cannot be opened or read. A zero-sized buffer
 * is rejected with -EINVAL because there is no room for the terminator.
 */
static ssize_t read_text(const char *path, char *buf, size_t max_len)
{
	ssize_t len;
	int fd, err;

	/* max_len - 1 would wrap around to SIZE_MAX for an empty buffer. */
	if (!max_len)
		return -EINVAL;

	fd = open(path, O_RDONLY);
	if (fd < 0)
		return -errno;

	len = read(fd, buf, max_len - 1);
	/* Capture errno right away: close() below may overwrite it. */
	err = errno;

	if (len >= 0)
		buf[len] = 0;

	close(fd);
	return len < 0 ? -err : len;
}
0040
0041
/*
 * Append @len bytes from @buf to the existing file at @path.
 *
 * The file is opened write-only in append mode and is never created.
 *
 * Returns the number of bytes written by the single write() call, or a
 * negative errno value when the file cannot be opened or written.
 */
static ssize_t write_text(const char *path, const char *buf, ssize_t len)
{
	int fd, err;

	fd = open(path, O_WRONLY | O_APPEND);
	if (fd < 0)
		return -errno;

	len = write(fd, buf, len);
	/* Capture errno right away: close() below may overwrite it. */
	err = errno;

	close(fd);
	return len < 0 ? -err : len;
}
0054
/*
 * Build the path "<root>/<name>" in a freshly allocated buffer.
 *
 * Returns the new string, which the caller must free(), or NULL when the
 * allocation fails.
 */
char *cg_name(const char *root, const char *name)
{
	/* One byte for the '/' separator and one for the terminator. */
	size_t len = strlen(root) + strlen(name) + 2;
	char *ret = malloc(len);

	if (!ret)
		return NULL;

	snprintf(ret, len, "%s/%s", root, name);

	return ret;
}
0064
/*
 * Build the path "<root>/<name>_<index>" in a freshly allocated buffer.
 *
 * The buffer is sized from the formatted length, so any int value of
 * @index (including negative and 10-digit values) is rendered in full.
 *
 * Returns the new string, which the caller must free(), or NULL when
 * formatting or allocation fails.
 */
char *cg_name_indexed(const char *root, const char *name, int index)
{
	/* First pass only measures the formatted length. */
	int need = snprintf(NULL, 0, "%s/%s_%d", root, name, index);
	size_t len;
	char *ret;

	if (need < 0)
		return NULL;

	len = (size_t)need + 1;
	ret = malloc(len);
	if (!ret)
		return NULL;

	snprintf(ret, len, "%s/%s_%d", root, name, index);

	return ret;
}
0074
/*
 * Build the path "<cgroup>/<control>" in a freshly allocated buffer.
 *
 * Returns the new string, which the caller must free(), or NULL when the
 * allocation fails.
 */
char *cg_control(const char *cgroup, const char *control)
{
	/* One byte for the '/' separator and one for the terminator. */
	size_t len = strlen(cgroup) + strlen(control) + 2;
	char *ret = malloc(len);

	if (!ret)
		return NULL;

	snprintf(ret, len, "%s/%s", cgroup, control);

	return ret;
}
0084
0085
/*
 * Read the control file "<cgroup>/<control>" into @buf as a string.
 *
 * @len is the size of @buf; it is passed straight to read_text().
 *
 * Returns 0 when the read succeeded (even if nothing was read), otherwise
 * the negative value reported by read_text().
 */
int cg_read(const char *cgroup, const char *control, char *buf, size_t len)
{
	char file[PATH_MAX];
	ssize_t nread;

	snprintf(file, sizeof(file), "%s/%s", cgroup, control);

	nread = read_text(file, buf, len);
	if (nread < 0)
		return nread;

	return 0;
}
0096
/*
 * Compare the start of a control file with @expected.
 *
 * The read buffer holds exactly strlen(expected) bytes plus a terminator,
 * so only that many bytes of the file take part in the comparison.
 *
 * Returns the strcmp() result (0 on match), or -1 when @expected is NULL,
 * the buffer cannot be allocated, or the file cannot be read.
 */
int cg_read_strcmp(const char *cgroup, const char *control,
		   const char *expected)
{
	size_t bufsz;
	char *data;
	int cmp = -1;

	if (!expected)
		return -1;

	bufsz = strlen(expected) + 1;
	data = malloc(bufsz);
	if (!data)
		return -1;

	/* cmp keeps its -1 default when the read fails. */
	if (!cg_read(cgroup, control, data, bufsz))
		cmp = strcmp(expected, data);

	free(data);
	return cmp;
}
0123
0124 int cg_read_strstr(const char *cgroup, const char *control, const char *needle)
0125 {
0126 char buf[PAGE_SIZE];
0127
0128 if (cg_read(cgroup, control, buf, sizeof(buf)))
0129 return -1;
0130
0131 return strstr(buf, needle) ? 0 : -1;
0132 }
0133
/*
 * Read a control file and convert its leading text to a long with atol().
 *
 * Returns the converted value, or -1 when the file cannot be read. A
 * stored value of -1 is therefore indistinguishable from a failure.
 */
long cg_read_long(const char *cgroup, const char *control)
{
	char text[128];
	int err = cg_read(cgroup, control, text, sizeof(text));

	return err ? -1 : atol(text);
}
0143
0144 long cg_read_key_long(const char *cgroup, const char *control, const char *key)
0145 {
0146 char buf[PAGE_SIZE];
0147 char *ptr;
0148
0149 if (cg_read(cgroup, control, buf, sizeof(buf)))
0150 return -1;
0151
0152 ptr = strstr(buf, key);
0153 if (!ptr)
0154 return -1;
0155
0156 return atol(ptr + strlen(key));
0157 }
0158
0159 long cg_read_lc(const char *cgroup, const char *control)
0160 {
0161 char buf[PAGE_SIZE];
0162 const char delim[] = "\n";
0163 char *line;
0164 long cnt = 0;
0165
0166 if (cg_read(cgroup, control, buf, sizeof(buf)))
0167 return -1;
0168
0169 for (line = strtok(buf, delim); line; line = strtok(NULL, delim))
0170 cnt++;
0171
0172 return cnt;
0173 }
0174
0175
/*
 * Write the string @buf to the control file "<cgroup>/<control>".
 *
 * Returns 0 when every byte of @buf was written; otherwise returns
 * whatever write_text() reported (a negative errno value or a short
 * byte count).
 */
int cg_write(const char *cgroup, const char *control, char *buf)
{
	char file[PATH_MAX];
	ssize_t want, written;

	want = strlen(buf);
	snprintf(file, sizeof(file), "%s/%s", cgroup, control);

	written = write_text(file, buf, want);
	if (written == want)
		return 0;

	return written;
}
0185
/*
 * Write @value, formatted as a signed decimal number, to the control file
 * "<cgroup>/<control>".
 *
 * Returns 0 on success, the negative snprintf() result if formatting
 * fails, or otherwise whatever cg_write() returns.
 */
int cg_write_numeric(const char *cgroup, const char *control, long value)
{
	char buf[64];
	int ret;

	/* @value is a signed long, so it must be printed with %ld. */
	ret = snprintf(buf, sizeof(buf), "%ld", value);
	if (ret < 0)
		return ret;

	return cg_write(cgroup, control, buf);
}
0197
0198 int cg_find_unified_root(char *root, size_t len)
0199 {
0200 char buf[10 * PAGE_SIZE];
0201 char *fs, *mount, *type;
0202 const char delim[] = "\n\t ";
0203
0204 if (read_text("/proc/self/mounts", buf, sizeof(buf)) <= 0)
0205 return -1;
0206
0207
0208
0209
0210
0211 for (fs = strtok(buf, delim); fs; fs = strtok(NULL, delim)) {
0212 mount = strtok(NULL, delim);
0213 type = strtok(NULL, delim);
0214 strtok(NULL, delim);
0215 strtok(NULL, delim);
0216 strtok(NULL, delim);
0217
0218 if (strcmp(type, "cgroup2") == 0) {
0219 strncpy(root, mount, len);
0220 return 0;
0221 }
0222 }
0223
0224 return -1;
0225 }
0226
/*
 * Create the directory @cgroup with mode 0755.
 *
 * Returns the mkdir() result: 0 on success, -1 with errno set on failure.
 */
int cg_create(const char *cgroup)
{
	const mode_t mode = 0755;

	return mkdir(cgroup, mode);
}
0231
0232 int cg_wait_for_proc_count(const char *cgroup, int count)
0233 {
0234 char buf[10 * PAGE_SIZE] = {0};
0235 int attempts;
0236 char *ptr;
0237
0238 for (attempts = 10; attempts >= 0; attempts--) {
0239 int nr = 0;
0240
0241 if (cg_read(cgroup, "cgroup.procs", buf, sizeof(buf)))
0242 break;
0243
0244 for (ptr = buf; *ptr; ptr++)
0245 if (*ptr == '\n')
0246 nr++;
0247
0248 if (nr >= count)
0249 return 0;
0250
0251 usleep(100000);
0252 }
0253
0254 return -1;
0255 }
0256
0257 int cg_killall(const char *cgroup)
0258 {
0259 char buf[PAGE_SIZE];
0260 char *ptr = buf;
0261
0262
0263 if (!cg_write(cgroup, "cgroup.kill", "1"))
0264 return 0;
0265
0266 if (cg_read(cgroup, "cgroup.procs", buf, sizeof(buf)))
0267 return -1;
0268
0269 while (ptr < buf + sizeof(buf)) {
0270 int pid = strtol(ptr, &ptr, 10);
0271
0272 if (pid == 0)
0273 break;
0274 if (*ptr)
0275 ptr++;
0276 else
0277 break;
0278 if (kill(pid, SIGKILL))
0279 return -1;
0280 }
0281
0282 return 0;
0283 }
0284
/*
 * Remove the cgroup directory @cgroup.
 *
 * While rmdir() fails with EBUSY, the cgroup's processes are killed via
 * cg_killall() and the removal is retried after a 100 us pause; there is
 * no retry limit. A directory that does not exist counts as success.
 *
 * Returns 0 on success, otherwise the failing rmdir() result.
 */
int cg_destroy(const char *cgroup)
{
	int rc;

	for (;;) {
		rc = rmdir(cgroup);
		if (rc == 0 || errno != EBUSY)
			break;

		cg_killall(cgroup);
		usleep(100);
	}

	/* Already gone is as good as removed. */
	if (rc != 0 && errno == ENOENT)
		rc = 0;

	return rc;
}
0302
/*
 * Write @pid, formatted in decimal, to "<cgroup>/cgroup.procs".
 *
 * Returns the cg_write() result (0 on success).
 */
int cg_enter(const char *cgroup, int pid)
{
	char text[64];
	int rc;

	snprintf(text, sizeof(text), "%d", pid);
	rc = cg_write(cgroup, "cgroup.procs", text);

	return rc;
}
0310
/*
 * Write "0" to "<cgroup>/cgroup.procs".
 *
 * NOTE(review): presumably "0" stands for the writing process, as the
 * function name suggests - confirm against the cgroup v2 documentation.
 *
 * Returns the cg_write() result (0 on success).
 */
int cg_enter_current(const char *cgroup)
{
	char self[] = "0";

	return cg_write(cgroup, "cgroup.procs", self);
}
0315
/*
 * Write "0" to "<cgroup>/cgroup.threads".
 *
 * NOTE(review): presumably "0" stands for the writing thread, as the
 * function name suggests - confirm against the cgroup v2 documentation.
 *
 * Returns the cg_write() result (0 on success).
 */
int cg_enter_current_thread(const char *cgroup)
{
	char self[] = "0";

	return cg_write(cgroup, "cgroup.threads", self);
}
0320
/*
 * Run @fn(@cgroup, @arg) in a forked child that first writes its own PID
 * to "<cgroup>/cgroup.procs", then wait for the child to finish.
 *
 * The child exits with EXIT_FAILURE if it cannot write its PID, otherwise
 * with the value returned by @fn.
 *
 * Returns the child's exit status when it exited normally, the negative
 * fork() result when forking fails, or -1 when waiting fails or the child
 * did not exit normally.
 */
int cg_run(const char *cgroup,
	   int (*fn)(const char *cgroup, void *arg),
	   void *arg)
{
	int pid, retcode = 0;

	pid = fork();
	if (pid < 0)
		return pid;

	if (pid == 0) {
		char buf[64];

		snprintf(buf, sizeof(buf), "%d", getpid());
		if (cg_write(cgroup, "cgroup.procs", buf))
			exit(EXIT_FAILURE);
		exit(fn(cgroup, arg));
	}

	/*
	 * retcode is only meaningful after a successful waitpid(); retry
	 * when a signal interrupts the wait and give up on other errors.
	 */
	while (waitpid(pid, &retcode, 0) < 0) {
		if (errno != EINTR)
			return -1;
	}

	if (WIFEXITED(retcode))
		return WEXITSTATUS(retcode);

	return -1;
}
0345
/*
 * Create a child process directly inside the cgroup referred to by
 * @cgroup_fd, using sys_clone3() with CLONE_INTO_CGROUP.
 *
 * Returns the sys_clone3() result when the call was attempted and did not
 * fail with ENOSYS or E2BIG. In those two cases, and whenever
 * CLONE_ARGS_SIZE_VER2 is not defined at build time, errno is set to
 * ENOSYS and -ENOSYS is returned so callers can fall back to another
 * method.
 */
pid_t clone_into_cgroup(int cgroup_fd)
{
#ifdef CLONE_ARGS_SIZE_VER2
	struct __clone_args args = {
		.flags = CLONE_INTO_CGROUP,
		.exit_signal = SIGCHLD,
		.cgroup = cgroup_fd,
	};
	pid_t child;

	child = sys_clone3(&args, sizeof(struct __clone_args));

	/*
	 * ENOSYS and E2BIG are both folded into the ENOSYS report below.
	 * NOTE(review): presumably these mean "no clone3()" and "kernel
	 * does not know this argument size" - confirm against clone3(2).
	 */
	if (child >= 0 || (errno != ENOSYS && errno != E2BIG))
		return child;
#endif
	errno = ENOSYS;
	return -ENOSYS;
}
0373
/*
 * Wait for a state change of child @pid with waitid().
 *
 * @options selects the state changes of interest (WEXITED, WSTOPPED,
 * WCONTINUED); __WALL and __WNOTHREAD are always added. The wait is
 * restarted when interrupted by a signal.
 *
 * Returns the value decoded from info.si_status for the first requested
 * state that matches, or -1 when waitid() fails or nothing matches.
 *
 * NOTE(review): si_status is decoded here with the wait()-status macros
 * (WIFEXITED and friends). Confirm against waitid(2) whether si_code /
 * si_status should be inspected directly instead.
 */
int clone_reap(pid_t pid, int options)
{
	siginfo_t info = {
		.si_signo = 0,
	};
	int rc;

	do {
		rc = waitid(P_PID, pid, &info, options | __WALL | __WNOTHREAD);
	} while (rc < 0 && errno == EINTR);

	if (rc < 0)
		return -1;

	if ((options & WEXITED) && WIFEXITED(info.si_status))
		return WEXITSTATUS(info.si_status);

	if ((options & WSTOPPED) && WIFSTOPPED(info.si_status))
		return WSTOPSIG(info.si_status);

	if ((options & WCONTINUED) && WIFCONTINUED(info.si_status))
		return 0;

	return -1;
}
0406
/*
 * Open @dir with O_PATH, requiring it to be a directory, refusing a
 * trailing symlink and marking the descriptor close-on-exec.
 *
 * Returns the open() result: a descriptor, or -1 with errno set.
 */
int dirfd_open_opath(const char *dir)
{
	const int flags = O_DIRECTORY | O_CLOEXEC | O_NOFOLLOW | O_PATH;

	return open(dir, flags);
}
0411
/*
 * Close @fd if it is non-negative, leaving errno exactly as it was before
 * the call. @fd is evaluated more than once, so pass a plain variable.
 *
 * Wrapped in do { } while (0) so that the macro behaves as one statement,
 * including inside an unbraced if/else.
 */
#define close_prot_errno(fd)				\
	do {						\
		if ((fd) >= 0) {			\
			int _e_ = errno;		\
			close(fd);			\
			errno = _e_;			\
		}					\
	} while (0)
0418
/*
 * Start @fn(@cgroup, @arg) in a child created directly inside @cgroup via
 * clone_into_cgroup(), without waiting for it.
 *
 * The child exits with the value returned by @fn.
 *
 * Returns the child's PID in the parent, -1 when the cgroup directory
 * cannot be opened, or the negative clone_into_cgroup() result with errno
 * preserved across the close of the directory descriptor.
 */
static int clone_into_cgroup_run_nowait(const char *cgroup,
					int (*fn)(const char *cgroup, void *arg),
					void *arg)
{
	pid_t child;
	int dirfd = dirfd_open_opath(cgroup);

	if (dirfd < 0)
		return -1;

	child = clone_into_cgroup(dirfd);
	close_prot_errno(dirfd);

	/* Parent, or failed clone: hand the result back. */
	if (child != 0)
		return child;

	exit(fn(cgroup, arg));
}
0437
/*
 * Start @fn(@cgroup, @arg) in a child process placed in @cgroup, without
 * waiting for it.
 *
 * clone_into_cgroup_run_nowait() is tried first. If it fails with errno
 * set to ENOSYS, the function falls back to fork(): the child writes its
 * own PID to "<cgroup>/cgroup.procs" (exiting with EXIT_FAILURE if that
 * fails) and then exits with the value returned by @fn.
 *
 * Returns the child's PID in the parent, -1 when the clone attempt fails
 * for any reason other than ENOSYS, or the negative fork() result.
 */
int cg_run_nowait(const char *cgroup,
		  int (*fn)(const char *cgroup, void *arg),
		  void *arg)
{
	char pidstr[64];
	int child;

	child = clone_into_cgroup_run_nowait(cgroup, fn, arg);
	if (child > 0)
		return child;

	/* Only an ENOSYS failure is allowed to reach the fork() fallback. */
	if (child < 0 && errno != ENOSYS)
		return -1;

	child = fork();
	if (child != 0)
		return child;

	snprintf(pidstr, sizeof(pidstr), "%d", getpid());
	if (cg_write(cgroup, "cgroup.procs", pidstr))
		exit(EXIT_FAILURE);

	exit(fn(cgroup, arg));
}
0464
/*
 * Open an unnamed temporary file in the current directory for reading
 * and writing (O_TMPFILE | O_RDWR | O_EXCL).
 *
 * Returns the open() result: a descriptor, or -1 with errno set.
 */
int get_temp_fd(void)
{
	const int flags = O_TMPFILE | O_RDWR | O_EXCL;

	return open(".", flags);
}
0469
0470 int alloc_pagecache(int fd, size_t size)
0471 {
0472 char buf[PAGE_SIZE];
0473 struct stat st;
0474 int i;
0475
0476 if (fstat(fd, &st))
0477 goto cleanup;
0478
0479 size += st.st_size;
0480
0481 if (ftruncate(fd, size))
0482 goto cleanup;
0483
0484 for (i = 0; i < size; i += sizeof(buf))
0485 read(fd, buf, sizeof(buf));
0486
0487 return 0;
0488
0489 cleanup:
0490 return -1;
0491 }
0492
0493 int alloc_anon(const char *cgroup, void *arg)
0494 {
0495 size_t size = (unsigned long)arg;
0496 char *buf, *ptr;
0497
0498 buf = malloc(size);
0499 for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
0500 *ptr = 0;
0501
0502 free(buf);
0503 return 0;
0504 }
0505
0506 int is_swap_enabled(void)
0507 {
0508 char buf[PAGE_SIZE];
0509 const char delim[] = "\n";
0510 int cnt = 0;
0511 char *line;
0512
0513 if (read_text("/proc/swaps", buf, sizeof(buf)) <= 0)
0514 return -1;
0515
0516 for (line = strtok(buf, delim); line; line = strtok(NULL, delim))
0517 cnt++;
0518
0519 return cnt > 1;
0520 }
0521
/*
 * Write @score, in decimal, to /proc/<pid>/oom_score_adj.
 *
 * Returns 0 on success, or the negative result of open() or dprintf()
 * when either of them fails.
 */
int set_oom_adj_score(int pid, int score)
{
	char file[PATH_MAX];
	int fd, written, rc = 0;

	sprintf(file, "/proc/%d/oom_score_adj", pid);

	fd = open(file, O_WRONLY | O_APPEND);
	if (fd < 0)
		return fd;

	written = dprintf(fd, "%d", score);
	if (written < 0)
		rc = written;

	close(fd);
	return rc;
}
0542
0543 int proc_mount_contains(const char *option)
0544 {
0545 char buf[4 * PAGE_SIZE];
0546 ssize_t read;
0547
0548 read = read_text("/proc/mounts", buf, sizeof(buf));
0549 if (read < 0)
0550 return read;
0551
0552 return strstr(buf, option) != NULL;
0553 }
0554
/*
 * Read /proc/<pid>/<item> into @buf as a string.
 *
 * When @pid is 0 the path is /proc/thread-self/<item> if @thread is true
 * and /proc/self/<item> otherwise; @thread is ignored for non-zero @pid.
 *
 * Returns the number of bytes read, or -1 when the file cannot be read.
 */
ssize_t proc_read_text(int pid, bool thread, const char *item, char *buf, size_t size)
{
	char path[PATH_MAX];
	ssize_t ret;

	if (!pid)
		snprintf(path, sizeof(path), "/proc/%s/%s",
			 thread ? "thread-self" : "self", item);
	else
		snprintf(path, sizeof(path), "/proc/%d/%s", pid, item);

	/*
	 * Keep the result in a signed variable: stored in the unsigned
	 * @size, a negative error could never compare below zero.
	 */
	ret = read_text(path, buf, size);
	return ret < 0 ? -1 : ret;
}
0568
0569 int proc_read_strstr(int pid, bool thread, const char *item, const char *needle)
0570 {
0571 char buf[PAGE_SIZE];
0572
0573 if (proc_read_text(pid, thread, item, buf, sizeof(buf)) < 0)
0574 return -1;
0575
0576 return strstr(buf, needle) ? 0 : -1;
0577 }
0578
/*
 * Create a child directly inside @cgroup that exits immediately with
 * EXIT_SUCCESS, and reap it.
 *
 * Returns 0 when the child was created, -1 when the cgroup directory
 * cannot be opened or clone_into_cgroup() fails.
 */
int clone_into_cgroup_run_wait(const char *cgroup)
{
	pid_t child;
	int dirfd = dirfd_open_opath(cgroup);

	if (dirfd < 0)
		return -1;

	child = clone_into_cgroup(dirfd);
	close_prot_errno(dirfd);

	if (child < 0)
		return -1;

	if (!child)
		exit(EXIT_SUCCESS);

	/*
	 * The outcome of the reap is deliberately discarded: only the
	 * success of the clone decides the return value.
	 */
	(void)clone_reap(child, WEXITED);
	return 0;
}
0603
/*
 * Create an inotify instance watching "<cgroup>/<filename>" for
 * modifications (IN_MODIFY).
 *
 * Returns the inotify descriptor, which the caller owns, or -1 when the
 * path cannot be allocated, the inotify instance cannot be created, or
 * the watch cannot be added.
 */
static int __prepare_for_wait(const char *cgroup, const char *filename)
{
	char *path;
	int fd, ret;

	path = cg_control(cgroup, filename);
	if (!path)
		return -1;

	fd = inotify_init1(0);
	if (fd == -1) {
		free(path);
		return -1;
	}

	ret = inotify_add_watch(fd, path, IN_MODIFY);
	/* The path is only needed while the watch is being added. */
	free(path);

	if (ret == -1) {
		close(fd);
		fd = -1;
	}

	return fd;
}
0620
/*
 * Set up an inotify watch on "<cgroup>/cgroup.events".
 *
 * Returns the descriptor from __prepare_for_wait(), or -1 on failure.
 */
int cg_prepare_for_wait(const char *cgroup)
{
	static const char events_file[] = "cgroup.events";

	return __prepare_for_wait(cgroup, events_file);
}
0625
/*
 * Set up an inotify watch on "<cgroup>/memory.events".
 *
 * Returns the descriptor from __prepare_for_wait(), or -1 on failure.
 */
int memcg_prepare_for_wait(const char *cgroup)
{
	static const char events_file[] = "memory.events";

	return __prepare_for_wait(cgroup, events_file);
}
0630
/*
 * Block until @fd becomes readable.
 *
 * poll() is called with a 10 second timeout and simply called again when
 * it times out or is interrupted by a signal, so the wait itself has no
 * upper bound.
 *
 * Returns 0 once POLLIN is reported. Returns -1 when @fd is negative,
 * when poll() fails, or when poll() reports POLLERR, POLLHUP or POLLNVAL
 * without POLLIN (the descriptor can then never become readable, and
 * polling again would spin).
 */
int cg_wait_for(int fd)
{
	struct pollfd fds = {
		.fd = fd,
		.events = POLLIN,
	};
	int ret;

	/* poll() ignores negative descriptors, so this would never wake. */
	if (fd < 0)
		return -1;

	for (;;) {
		ret = poll(&fds, 1, 10000);

		if (ret == -1) {
			if (errno == EINTR)
				continue;

			return -1;
		}

		/* Timed out: keep waiting. */
		if (ret == 0)
			continue;

		if (fds.revents & POLLIN)
			return 0;

		if (fds.revents & (POLLERR | POLLHUP | POLLNVAL))
			return -1;
	}
}