Line data Source code
1 : /* SPDX-License-Identifier: LGPL-2.1+ */
2 :
3 : #include <dirent.h>
4 : #include <errno.h>
5 : #include <ftw.h>
6 : #include <limits.h>
7 : #include <signal.h>
8 : #include <stddef.h>
9 : #include <stdlib.h>
10 : #include <string.h>
11 : #include <sys/stat.h>
12 : #include <sys/statfs.h>
13 : #include <sys/types.h>
14 : #include <sys/utsname.h>
15 : #include <sys/xattr.h>
16 : #include <unistd.h>
17 :
18 : #include "alloc-util.h"
19 : #include "cgroup-util.h"
20 : #include "def.h"
21 : #include "dirent-util.h"
22 : #include "extract-word.h"
23 : #include "fd-util.h"
24 : #include "fileio.h"
25 : #include "format-util.h"
26 : #include "fs-util.h"
27 : #include "log.h"
28 : #include "login-util.h"
29 : #include "macro.h"
30 : #include "missing.h"
31 : #include "mkdir.h"
32 : #include "parse-util.h"
33 : #include "path-util.h"
34 : #include "proc-cmdline.h"
35 : #include "process-util.h"
36 : #include "set.h"
37 : #include "special.h"
38 : #include "stat-util.h"
39 : #include "stdio-util.h"
40 : #include "string-table.h"
41 : #include "string-util.h"
42 : #include "strv.h"
43 : #include "unit-name.h"
44 : #include "user-util.h"
45 :
46 6 : static int cg_enumerate_items(const char *controller, const char *path, FILE **_f, const char *item) {
47 6 : _cleanup_free_ char *fs = NULL;
48 : FILE *f;
49 : int r;
50 :
51 6 : assert(_f);
52 :
53 6 : r = cg_get_path(controller, path, item, &fs);
54 6 : if (r < 0)
55 0 : return r;
56 :
57 6 : f = fopen(fs, "re");
58 6 : if (!f)
59 6 : return -errno;
60 :
61 0 : *_f = f;
62 0 : return 0;
63 : }
64 :
65 0 : int cg_enumerate_processes(const char *controller, const char *path, FILE **_f) {
66 0 : return cg_enumerate_items(controller, path, _f, "cgroup.procs");
67 : }
68 :
69 0 : int cg_read_pid(FILE *f, pid_t *_pid) {
70 : unsigned long ul;
71 :
72 : /* Note that the cgroup.procs file might contain duplicates! See
73 : * cgroups.txt for details. */
74 :
75 0 : assert(f);
76 0 : assert(_pid);
77 :
78 0 : errno = 0;
79 0 : if (fscanf(f, "%lu", &ul) != 1) {
80 :
81 0 : if (feof(f))
82 0 : return 0;
83 :
84 0 : return errno_or_else(EIO);
85 : }
86 :
87 0 : if (ul <= 0)
88 0 : return -EIO;
89 :
90 0 : *_pid = (pid_t) ul;
91 0 : return 1;
92 : }
93 :
94 0 : int cg_read_event(
95 : const char *controller,
96 : const char *path,
97 : const char *event,
98 : char **ret) {
99 :
100 0 : _cleanup_free_ char *events = NULL, *content = NULL;
101 : int r;
102 :
103 0 : r = cg_get_path(controller, path, "cgroup.events", &events);
104 0 : if (r < 0)
105 0 : return r;
106 :
107 0 : r = read_full_file(events, &content, NULL);
108 0 : if (r < 0)
109 0 : return r;
110 :
111 0 : for (const char *p = content;;) {
112 0 : _cleanup_free_ char *line = NULL, *key = NULL, *val = NULL;
113 : const char *q;
114 :
115 0 : r = extract_first_word(&p, &line, "\n", 0);
116 0 : if (r < 0)
117 0 : return r;
118 0 : if (r == 0)
119 0 : return -ENOENT;
120 :
121 0 : q = line;
122 0 : r = extract_first_word(&q, &key, " ", 0);
123 0 : if (r < 0)
124 0 : return r;
125 0 : if (r == 0)
126 0 : return -EINVAL;
127 :
128 0 : if (!streq(key, event))
129 0 : continue;
130 :
131 0 : val = strdup(q);
132 0 : if (!val)
133 0 : return -ENOMEM;
134 :
135 0 : *ret = TAKE_PTR(val);
136 0 : return 0;
137 : }
138 : }
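
/* A minimal usage sketch (not from this file; the unit path is a made-up example) of how a
 * caller might query the cgroup v2 "populated" key via cg_read_event(), mirroring the check
 * done in cg_is_empty_recursive() further below:
 *
 *     _cleanup_free_ char *t = NULL;
 *     int r = cg_read_event(SYSTEMD_CGROUP_CONTROLLER, "/system.slice/foo.service",
 *                           "populated", &t);
 *     if (r >= 0 && streq(t, "0"))
 *             log_debug("cgroup subtree has no processes left");
 */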
139 :
140 0 : bool cg_ns_supported(void) {
141 : static thread_local int enabled = -1;
142 :
143 0 : if (enabled >= 0)
144 0 : return enabled;
145 :
146 0 : if (access("/proc/self/ns/cgroup", F_OK) < 0) {
147 0 : if (errno != ENOENT)
148 0 : log_debug_errno(errno, "Failed to check whether /proc/self/ns/cgroup is available, assuming not: %m");
149 0 : enabled = false;
150 : } else
151 0 : enabled = true;
152 :
153 0 : return enabled;
154 : }
155 :
156 6 : int cg_enumerate_subgroups(const char *controller, const char *path, DIR **_d) {
157 6 : _cleanup_free_ char *fs = NULL;
158 : int r;
159 : DIR *d;
160 :
161 6 : assert(_d);
162 :
163 : /* This is not recursive! */
164 :
165 6 : r = cg_get_path(controller, path, NULL, &fs);
166 6 : if (r < 0)
167 0 : return r;
168 :
169 6 : d = opendir(fs);
170 6 : if (!d)
171 6 : return -errno;
172 :
173 0 : *_d = d;
174 0 : return 0;
175 : }
176 :
177 0 : int cg_read_subgroup(DIR *d, char **fn) {
178 : struct dirent *de;
179 :
180 0 : assert(d);
181 0 : assert(fn);
182 :
183 0 : FOREACH_DIRENT_ALL(de, d, return -errno) {
184 : char *b;
185 :
186 0 : if (de->d_type != DT_DIR)
187 0 : continue;
188 :
189 0 : if (dot_or_dot_dot(de->d_name))
190 0 : continue;
191 :
192 0 : b = strdup(de->d_name);
193 0 : if (!b)
194 0 : return -ENOMEM;
195 :
196 0 : *fn = b;
197 0 : return 1;
198 : }
199 :
200 0 : return 0;
201 : }
202 :
203 0 : int cg_rmdir(const char *controller, const char *path) {
204 0 : _cleanup_free_ char *p = NULL;
205 : int r;
206 :
207 0 : r = cg_get_path(controller, path, NULL, &p);
208 0 : if (r < 0)
209 0 : return r;
210 :
211 0 : r = rmdir(p);
212 0 : if (r < 0 && errno != ENOENT)
213 0 : return -errno;
214 :
215 0 : r = cg_hybrid_unified();
216 0 : if (r <= 0)
217 0 : return r;
218 :
219 0 : if (streq(controller, SYSTEMD_CGROUP_CONTROLLER)) {
220 0 : r = cg_rmdir(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path);
221 0 : if (r < 0)
222 0 : log_warning_errno(r, "Failed to remove compat systemd cgroup %s: %m", path);
223 : }
224 :
225 0 : return 0;
226 : }
227 :
228 6 : static int cg_kill_items(
229 : const char *controller,
230 : const char *path,
231 : int sig,
232 : CGroupFlags flags,
233 : Set *s,
234 : cg_kill_log_func_t log_kill,
235 : void *userdata,
236 : const char *item) {
237 :
238 6 : _cleanup_set_free_ Set *allocated_set = NULL;
239 6 : bool done = false;
240 6 : int r, ret = 0, ret_log_kill = 0;
241 : pid_t my_pid;
242 :
243 6 : assert(sig >= 0);
244 :
245 : /* Don't send SIGCONT twice. Also, SIGKILL always works even when the process is suspended, hence don't send
246 : * SIGCONT on SIGKILL. */
247 6 : if (IN_SET(sig, SIGCONT, SIGKILL))
248 0 : flags &= ~CGROUP_SIGCONT;
249 :
250 : /* This goes through the tasks list and kills them all. This
251 : * is repeated until no further processes are added to the
252 : * tasks list, to properly handle forking processes */
253 :
254 6 : if (!s) {
255 0 : s = allocated_set = set_new(NULL);
256 0 : if (!s)
257 0 : return -ENOMEM;
258 : }
259 :
260 6 : my_pid = getpid_cached();
261 :
262 : do {
263 6 : _cleanup_fclose_ FILE *f = NULL;
264 6 : pid_t pid = 0;
265 6 : done = true;
266 :
267 6 : r = cg_enumerate_items(controller, path, &f, item);
268 6 : if (r < 0) {
269 6 : if (ret >= 0 && r != -ENOENT)
270 0 : return r;
271 :
272 6 : return ret;
273 : }
274 :
275 0 : while ((r = cg_read_pid(f, &pid)) > 0) {
276 :
277 0 : if ((flags & CGROUP_IGNORE_SELF) && pid == my_pid)
278 0 : continue;
279 :
280 0 : if (set_get(s, PID_TO_PTR(pid)) == PID_TO_PTR(pid))
281 0 : continue;
282 :
283 0 : if (log_kill)
284 0 : ret_log_kill = log_kill(pid, sig, userdata);
285 :
286 : /* If we haven't killed this process yet, kill
287 : * it */
288 0 : if (kill(pid, sig) < 0) {
289 0 : if (ret >= 0 && errno != ESRCH)
290 0 : ret = -errno;
291 : } else {
292 0 : if (flags & CGROUP_SIGCONT)
293 0 : (void) kill(pid, SIGCONT);
294 :
295 0 : if (ret == 0) {
296 0 : if (log_kill)
297 0 : ret = ret_log_kill;
298 : else
299 0 : ret = 1;
300 : }
301 : }
302 :
303 0 : done = false;
304 :
305 0 : r = set_put(s, PID_TO_PTR(pid));
306 0 : if (r < 0) {
307 0 : if (ret >= 0)
308 0 : return r;
309 :
310 0 : return ret;
311 : }
312 : }
313 :
314 0 : if (r < 0) {
315 0 : if (ret >= 0)
316 0 : return r;
317 :
318 0 : return ret;
319 : }
320 :
321 : /* To avoid racing against processes which fork
322 : * quicker than we can kill them we repeat this until
323 : * no new pids need to be killed. */
324 :
325 0 : } while (!done);
326 :
327 0 : return ret;
328 : }
329 :
330 6 : int cg_kill(
331 : const char *controller,
332 : const char *path,
333 : int sig,
334 : CGroupFlags flags,
335 : Set *s,
336 : cg_kill_log_func_t log_kill,
337 : void *userdata) {
338 : int r;
339 :
340 6 : r = cg_kill_items(controller, path, sig, flags, s, log_kill, userdata, "cgroup.procs");
341 6 : if (r < 0 || sig != SIGKILL)
342 6 : return r;
343 :
344 : /* Only when killing with SIGKILL and only on cgroup v2, kill the remaining threads manually as
345 : a workaround for a kernel bug. The bug was fixed in 5.2-rc5 (c03cd7738a83), backported to 4.19.66
346 : (4340d175b898) and 4.14.138 (feb6b123b7dd). */
347 0 : r = cg_unified_controller(controller);
348 0 : if (r < 0)
349 0 : return r;
350 0 : if (r == 0) /* doesn't apply to legacy hierarchy */
351 0 : return 0;
352 :
353 0 : return cg_kill_items(controller, path, sig, flags, s, log_kill, userdata, "cgroup.threads");
354 : }
355 :
356 6 : int cg_kill_recursive(
357 : const char *controller,
358 : const char *path,
359 : int sig,
360 : CGroupFlags flags,
361 : Set *s,
362 : cg_kill_log_func_t log_kill,
363 : void *userdata) {
364 :
365 6 : _cleanup_set_free_ Set *allocated_set = NULL;
366 6 : _cleanup_closedir_ DIR *d = NULL;
367 : int r, ret;
368 : char *fn;
369 :
370 6 : assert(path);
371 6 : assert(sig >= 0);
372 :
373 6 : if (!s) {
374 6 : s = allocated_set = set_new(NULL);
375 6 : if (!s)
376 0 : return -ENOMEM;
377 : }
378 :
379 6 : ret = cg_kill(controller, path, sig, flags, s, log_kill, userdata);
380 :
381 6 : r = cg_enumerate_subgroups(controller, path, &d);
382 6 : if (r < 0) {
383 6 : if (ret >= 0 && r != -ENOENT)
384 0 : return r;
385 :
386 6 : return ret;
387 : }
388 :
389 0 : while ((r = cg_read_subgroup(d, &fn)) > 0) {
390 0 : _cleanup_free_ char *p = NULL;
391 :
392 0 : p = path_join(empty_to_root(path), fn);
393 0 : free(fn);
394 0 : if (!p)
395 0 : return -ENOMEM;
396 :
397 0 : r = cg_kill_recursive(controller, p, sig, flags, s, log_kill, userdata);
398 0 : if (r != 0 && ret >= 0)
399 0 : ret = r;
400 : }
401 0 : if (ret >= 0 && r < 0)
402 0 : ret = r;
403 :
404 0 : if (flags & CGROUP_REMOVE) {
405 0 : r = cg_rmdir(controller, path);
406 0 : if (r < 0 && ret >= 0 && !IN_SET(r, -ENOENT, -EBUSY))
407 0 : return r;
408 : }
409 :
410 0 : return ret;
411 : }
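
/* A hedged usage sketch (hypothetical caller, made-up unit path): killing a whole unit cgroup
 * roughly the way a service manager would on stop. Passing NULL for the pid set, the log
 * callback and the userdata is valid per the signature above:
 *
 *     r = cg_kill_recursive(SYSTEMD_CGROUP_CONTROLLER, "/system.slice/foo.service",
 *                           SIGKILL, CGROUP_IGNORE_SELF|CGROUP_REMOVE,
 *                           NULL, NULL, NULL);
 */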
412 :
413 0 : int cg_migrate(
414 : const char *cfrom,
415 : const char *pfrom,
416 : const char *cto,
417 : const char *pto,
418 : CGroupFlags flags) {
419 :
420 0 : bool done = false;
421 0 : _cleanup_set_free_ Set *s = NULL;
422 0 : int r, ret = 0;
423 : pid_t my_pid;
424 :
425 0 : assert(cfrom);
426 0 : assert(pfrom);
427 0 : assert(cto);
428 0 : assert(pto);
429 :
430 0 : s = set_new(NULL);
431 0 : if (!s)
432 0 : return -ENOMEM;
433 :
434 0 : my_pid = getpid_cached();
435 :
436 : do {
437 0 : _cleanup_fclose_ FILE *f = NULL;
438 0 : pid_t pid = 0;
439 0 : done = true;
440 :
441 0 : r = cg_enumerate_processes(cfrom, pfrom, &f);
442 0 : if (r < 0) {
443 0 : if (ret >= 0 && r != -ENOENT)
444 0 : return r;
445 :
446 0 : return ret;
447 : }
448 :
449 0 : while ((r = cg_read_pid(f, &pid)) > 0) {
450 :
451 : /* This might do weird stuff if we aren't a
452 : * single-threaded program. However, we
453 : * luckily know we are not */
454 0 : if ((flags & CGROUP_IGNORE_SELF) && pid == my_pid)
455 0 : continue;
456 :
457 0 : if (set_get(s, PID_TO_PTR(pid)) == PID_TO_PTR(pid))
458 0 : continue;
459 :
460 : /* Ignore kernel threads. Since they can only
461 : * exist in the root cgroup, we only check for
462 : * them there. */
463 0 : if (cfrom &&
464 0 : empty_or_root(pfrom) &&
465 0 : is_kernel_thread(pid) > 0)
466 0 : continue;
467 :
468 0 : r = cg_attach(cto, pto, pid);
469 0 : if (r < 0) {
470 0 : if (ret >= 0 && r != -ESRCH)
471 0 : ret = r;
472 0 : } else if (ret == 0)
473 0 : ret = 1;
474 :
475 0 : done = false;
476 :
477 0 : r = set_put(s, PID_TO_PTR(pid));
478 0 : if (r < 0) {
479 0 : if (ret >= 0)
480 0 : return r;
481 :
482 0 : return ret;
483 : }
484 : }
485 :
486 0 : if (r < 0) {
487 0 : if (ret >= 0)
488 0 : return r;
489 :
490 0 : return ret;
491 : }
492 0 : } while (!done);
493 :
494 0 : return ret;
495 : }
496 :
497 0 : int cg_migrate_recursive(
498 : const char *cfrom,
499 : const char *pfrom,
500 : const char *cto,
501 : const char *pto,
502 : CGroupFlags flags) {
503 :
504 0 : _cleanup_closedir_ DIR *d = NULL;
505 0 : int r, ret = 0;
506 : char *fn;
507 :
508 0 : assert(cfrom);
509 0 : assert(pfrom);
510 0 : assert(cto);
511 0 : assert(pto);
512 :
513 0 : ret = cg_migrate(cfrom, pfrom, cto, pto, flags);
514 :
515 0 : r = cg_enumerate_subgroups(cfrom, pfrom, &d);
516 0 : if (r < 0) {
517 0 : if (ret >= 0 && r != -ENOENT)
518 0 : return r;
519 :
520 0 : return ret;
521 : }
522 :
523 0 : while ((r = cg_read_subgroup(d, &fn)) > 0) {
524 0 : _cleanup_free_ char *p = NULL;
525 :
526 0 : p = path_join(empty_to_root(pfrom), fn);
527 0 : free(fn);
528 0 : if (!p)
529 0 : return -ENOMEM;
530 :
531 0 : r = cg_migrate_recursive(cfrom, p, cto, pto, flags);
532 0 : if (r != 0 && ret >= 0)
533 0 : ret = r;
534 : }
535 :
536 0 : if (r < 0 && ret >= 0)
537 0 : ret = r;
538 :
539 0 : if (flags & CGROUP_REMOVE) {
540 0 : r = cg_rmdir(cfrom, pfrom);
541 0 : if (r < 0 && ret >= 0 && !IN_SET(r, -ENOENT, -EBUSY))
542 0 : return r;
543 : }
544 :
545 0 : return ret;
546 : }
547 :
548 0 : int cg_migrate_recursive_fallback(
549 : const char *cfrom,
550 : const char *pfrom,
551 : const char *cto,
552 : const char *pto,
553 : CGroupFlags flags) {
554 :
555 : int r;
556 :
557 0 : assert(cfrom);
558 0 : assert(pfrom);
559 0 : assert(cto);
560 0 : assert(pto);
561 :
562 0 : r = cg_migrate_recursive(cfrom, pfrom, cto, pto, flags);
563 0 : if (r < 0) {
564 0 : char prefix[strlen(pto) + 1];
565 :
566 : /* This didn't work? Then let's try all prefixes of the destination */
567 :
568 0 : PATH_FOREACH_PREFIX(prefix, pto) {
569 : int q;
570 :
571 0 : q = cg_migrate_recursive(cfrom, pfrom, cto, prefix, flags);
572 0 : if (q >= 0)
573 0 : return q;
574 : }
575 : }
576 :
577 0 : return r;
578 : }
579 :
580 492 : static const char *controller_to_dirname(const char *controller) {
581 : const char *e;
582 :
583 492 : assert(controller);
584 :
585 : /* Converts a controller name to the directory name below
586 : * /sys/fs/cgroup/ we want to mount it to. Effectively, this
587 : * just cuts off the name= prefix used for named
588 : * hierarchies, if it is specified. */
589 :
590 492 : if (streq(controller, SYSTEMD_CGROUP_CONTROLLER)) {
591 107 : if (cg_hybrid_unified() > 0)
592 107 : controller = SYSTEMD_CGROUP_CONTROLLER_HYBRID;
593 : else
594 0 : controller = SYSTEMD_CGROUP_CONTROLLER_LEGACY;
595 : }
596 :
597 492 : e = startswith(controller, "name=");
598 492 : if (e)
599 107 : return e;
600 :
601 385 : return controller;
602 : }
603 :
604 150 : static int join_path_legacy(const char *controller, const char *path, const char *suffix, char **fs) {
605 : const char *dn;
606 150 : char *t = NULL;
607 :
608 150 : assert(fs);
609 150 : assert(controller);
610 :
611 150 : dn = controller_to_dirname(controller);
612 :
613 150 : if (isempty(path) && isempty(suffix))
614 0 : t = path_join("/sys/fs/cgroup", dn);
615 150 : else if (isempty(path))
616 54 : t = path_join("/sys/fs/cgroup", dn, suffix);
617 96 : else if (isempty(suffix))
618 83 : t = path_join("/sys/fs/cgroup", dn, path);
619 : else
620 13 : t = path_join("/sys/fs/cgroup", dn, path, suffix);
621 150 : if (!t)
622 0 : return -ENOMEM;
623 :
624 150 : *fs = t;
625 150 : return 0;
626 : }
627 :
628 0 : static int join_path_unified(const char *path, const char *suffix, char **fs) {
629 : char *t;
630 :
631 0 : assert(fs);
632 :
633 0 : if (isempty(path) && isempty(suffix))
634 0 : t = strdup("/sys/fs/cgroup");
635 0 : else if (isempty(path))
636 0 : t = path_join("/sys/fs/cgroup", suffix);
637 0 : else if (isempty(suffix))
638 0 : t = path_join("/sys/fs/cgroup", path);
639 : else
640 0 : t = path_join("/sys/fs/cgroup", path, suffix);
641 0 : if (!t)
642 0 : return -ENOMEM;
643 :
644 0 : *fs = t;
645 0 : return 0;
646 : }
647 :
648 150 : int cg_get_path(const char *controller, const char *path, const char *suffix, char **fs) {
649 : int r;
650 :
651 150 : assert(fs);
652 :
653 150 : if (!controller) {
654 : char *t;
655 :
656 : /* If no controller is specified, we return the path
657 : * *below* the controllers, without any prefix. */
658 :
659 0 : if (!path && !suffix)
660 0 : return -EINVAL;
661 :
662 0 : if (!suffix)
663 0 : t = strdup(path);
664 0 : else if (!path)
665 0 : t = strdup(suffix);
666 : else
667 0 : t = path_join(path, suffix);
668 0 : if (!t)
669 0 : return -ENOMEM;
670 :
671 0 : *fs = path_simplify(t, false);
672 0 : return 0;
673 : }
674 :
675 150 : if (!cg_controller_is_valid(controller))
676 0 : return -EINVAL;
677 :
678 150 : r = cg_all_unified();
679 150 : if (r < 0)
680 0 : return r;
681 150 : if (r > 0)
682 0 : r = join_path_unified(path, suffix, fs);
683 : else
684 150 : r = join_path_legacy(controller, path, suffix, fs);
685 150 : if (r < 0)
686 0 : return r;
687 :
688 150 : path_simplify(*fs, false);
689 150 : return 0;
690 : }
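
/* Illustrative results (the concrete cgroup paths are made-up examples). On a legacy or
 * hybrid setup the path is built by join_path_legacy() above:
 *
 *     cg_get_path("cpu", "/foo.slice", "tasks", &fs)
 *             -> fs = "/sys/fs/cgroup/cpu/foo.slice/tasks"
 *
 * while on a fully unified host join_path_unified() is used instead:
 *
 *     cg_get_path("cpu", "/foo.slice", "cgroup.procs", &fs)
 *             -> fs = "/sys/fs/cgroup/foo.slice/cgroup.procs"
 */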
691 :
692 342 : static int controller_is_accessible(const char *controller) {
693 : int r;
694 :
695 342 : assert(controller);
696 :
697 : /* Checks whether a specific controller is accessible,
698 : * i.e. whether its hierarchy is mounted. In the unified hierarchy all
699 : * controllers are considered accessible, except for the named
700 : * hierarchies. */
701 :
702 342 : if (!cg_controller_is_valid(controller))
703 0 : return -EINVAL;
704 :
705 342 : r = cg_all_unified();
706 342 : if (r < 0)
707 0 : return r;
708 342 : if (r > 0) {
709 : /* We don't support named hierarchies if we are using
710 : * the unified hierarchy. */
711 :
712 0 : if (streq(controller, SYSTEMD_CGROUP_CONTROLLER))
713 0 : return 0;
714 :
715 0 : if (startswith(controller, "name="))
716 0 : return -EOPNOTSUPP;
717 :
718 : } else {
719 : const char *cc, *dn;
720 :
721 342 : dn = controller_to_dirname(controller);
722 1710 : cc = strjoina("/sys/fs/cgroup/", dn);
723 :
724 342 : if (laccess(cc, F_OK) < 0)
725 0 : return -errno;
726 : }
727 :
728 342 : return 0;
729 : }
730 :
731 54 : int cg_get_path_and_check(const char *controller, const char *path, const char *suffix, char **fs) {
732 : int r;
733 :
734 54 : assert(controller);
735 54 : assert(fs);
736 :
737 : /* Check if the specified controller is actually accessible */
738 54 : r = controller_is_accessible(controller);
739 54 : if (r < 0)
740 0 : return r;
741 :
742 54 : return cg_get_path(controller, path, suffix, fs);
743 : }
744 :
745 0 : static int trim_cb(const char *path, const struct stat *sb, int typeflag, struct FTW *ftwbuf) {
746 0 : assert(path);
747 0 : assert(sb);
748 0 : assert(ftwbuf);
749 :
750 0 : if (typeflag != FTW_DP)
751 0 : return 0;
752 :
753 0 : if (ftwbuf->level < 1)
754 0 : return 0;
755 :
756 0 : (void) rmdir(path);
757 0 : return 0;
758 : }
759 :
760 18 : int cg_trim(const char *controller, const char *path, bool delete_root) {
761 18 : _cleanup_free_ char *fs = NULL;
762 18 : int r = 0, q;
763 :
764 18 : assert(path);
765 :
766 18 : r = cg_get_path(controller, path, NULL, &fs);
767 18 : if (r < 0)
768 0 : return r;
769 :
770 18 : errno = 0;
771 18 : if (nftw(fs, trim_cb, 64, FTW_DEPTH|FTW_MOUNT|FTW_PHYS) != 0) {
772 18 : if (errno == ENOENT)
773 18 : r = 0;
774 : else
775 0 : r = errno_or_else(EIO);
776 : }
777 :
778 18 : if (delete_root) {
779 18 : if (rmdir(fs) < 0 && errno != ENOENT)
780 0 : return -errno;
781 : }
782 :
783 18 : q = cg_hybrid_unified();
784 18 : if (q < 0)
785 0 : return q;
786 18 : if (q > 0 && streq(controller, SYSTEMD_CGROUP_CONTROLLER)) {
787 0 : q = cg_trim(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path, delete_root);
788 0 : if (q < 0)
789 0 : log_warning_errno(q, "Failed to trim compat systemd cgroup %s: %m", path);
790 : }
791 :
792 18 : return r;
793 : }
794 :
795 : /* Create a cgroup in the hierarchy of controller.
796 : * Returns 0 if the group already existed, 1 on success, negative otherwise.
797 : */
798 48 : int cg_create(const char *controller, const char *path) {
799 48 : _cleanup_free_ char *fs = NULL;
800 : int r;
801 :
802 48 : r = cg_get_path_and_check(controller, path, NULL, &fs);
803 48 : if (r < 0)
804 0 : return r;
805 :
806 48 : r = mkdir_parents(fs, 0755);
807 48 : if (r < 0)
808 0 : return r;
809 :
810 48 : r = mkdir_errno_wrapper(fs, 0755);
811 48 : if (r == -EEXIST)
812 18 : return 0;
813 30 : if (r < 0)
814 30 : return r;
815 :
816 0 : r = cg_hybrid_unified();
817 0 : if (r < 0)
818 0 : return r;
819 :
820 0 : if (r > 0 && streq(controller, SYSTEMD_CGROUP_CONTROLLER)) {
821 0 : r = cg_create(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path);
822 0 : if (r < 0)
823 0 : log_warning_errno(r, "Failed to create compat systemd cgroup %s: %m", path);
824 : }
825 :
826 0 : return 1;
827 : }
828 :
829 11 : int cg_create_and_attach(const char *controller, const char *path, pid_t pid) {
830 : int r, q;
831 :
832 11 : assert(pid >= 0);
833 :
834 11 : r = cg_create(controller, path);
835 11 : if (r < 0)
836 11 : return r;
837 :
838 0 : q = cg_attach(controller, path, pid);
839 0 : if (q < 0)
840 0 : return q;
841 :
842 : /* This does not remove the cgroup on failure */
843 0 : return r;
844 : }
845 :
846 6 : int cg_attach(const char *controller, const char *path, pid_t pid) {
847 6 : _cleanup_free_ char *fs = NULL;
848 : char c[DECIMAL_STR_MAX(pid_t) + 2];
849 : int r;
850 :
851 6 : assert(path);
852 6 : assert(pid >= 0);
853 :
854 6 : r = cg_get_path_and_check(controller, path, "cgroup.procs", &fs);
855 6 : if (r < 0)
856 0 : return r;
857 :
858 6 : if (pid == 0)
859 0 : pid = getpid_cached();
860 :
861 6 : xsprintf(c, PID_FMT "\n", pid);
862 :
863 6 : r = write_string_file(fs, c, WRITE_STRING_FILE_DISABLE_BUFFER);
864 6 : if (r < 0)
865 6 : return r;
866 :
867 0 : r = cg_hybrid_unified();
868 0 : if (r < 0)
869 0 : return r;
870 :
871 0 : if (r > 0 && streq(controller, SYSTEMD_CGROUP_CONTROLLER)) {
872 0 : r = cg_attach(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path, pid);
873 0 : if (r < 0)
874 0 : log_warning_errno(r, "Failed to attach "PID_FMT" to compat systemd cgroup %s: %m", pid, path);
875 : }
876 :
877 0 : return 0;
878 : }
879 :
880 0 : int cg_attach_fallback(const char *controller, const char *path, pid_t pid) {
881 : int r;
882 :
883 0 : assert(controller);
884 0 : assert(path);
885 0 : assert(pid >= 0);
886 :
887 0 : r = cg_attach(controller, path, pid);
888 0 : if (r < 0) {
889 0 : char prefix[strlen(path) + 1];
890 :
891 : /* This didn't work? Then let's try all prefixes of
892 : * the destination */
893 :
894 0 : PATH_FOREACH_PREFIX(prefix, path) {
895 : int q;
896 :
897 0 : q = cg_attach(controller, prefix, pid);
898 0 : if (q >= 0)
899 0 : return q;
900 : }
901 : }
902 :
903 0 : return r;
904 : }
905 :
906 0 : int cg_set_access(
907 : const char *controller,
908 : const char *path,
909 : uid_t uid,
910 : gid_t gid) {
911 :
912 : struct Attribute {
913 : const char *name;
914 : bool fatal;
915 : };
916 :
917 : /* cgroup v1, aka legacy/non-unified */
918 : static const struct Attribute legacy_attributes[] = {
919 : { "cgroup.procs", true },
920 : { "tasks", false },
921 : { "cgroup.clone_children", false },
922 : {},
923 : };
924 :
925 : /* cgroup v2, aka unified */
926 : static const struct Attribute unified_attributes[] = {
927 : { "cgroup.procs", true },
928 : { "cgroup.subtree_control", true },
929 : { "cgroup.threads", false },
930 : {},
931 : };
932 :
933 : static const struct Attribute* const attributes[] = {
934 : [false] = legacy_attributes,
935 : [true] = unified_attributes,
936 : };
937 :
938 0 : _cleanup_free_ char *fs = NULL;
939 : const struct Attribute *i;
940 : int r, unified;
941 :
942 0 : assert(path);
943 :
944 0 : if (uid == UID_INVALID && gid == GID_INVALID)
945 0 : return 0;
946 :
947 0 : unified = cg_unified_controller(controller);
948 0 : if (unified < 0)
949 0 : return unified;
950 :
951 : /* Configure access to the cgroup itself */
952 0 : r = cg_get_path(controller, path, NULL, &fs);
953 0 : if (r < 0)
954 0 : return r;
955 :
956 0 : r = chmod_and_chown(fs, 0755, uid, gid);
957 0 : if (r < 0)
958 0 : return r;
959 :
960 : /* Configure access to the cgroup's attributes */
961 0 : for (i = attributes[unified]; i->name; i++) {
962 0 : fs = mfree(fs);
963 :
964 0 : r = cg_get_path(controller, path, i->name, &fs);
965 0 : if (r < 0)
966 0 : return r;
967 :
968 0 : r = chmod_and_chown(fs, 0644, uid, gid);
969 0 : if (r < 0) {
970 0 : if (i->fatal)
971 0 : return r;
972 :
973 0 : log_debug_errno(r, "Failed to set access on cgroup %s, ignoring: %m", fs);
974 : }
975 : }
976 :
977 0 : if (streq(controller, SYSTEMD_CGROUP_CONTROLLER)) {
978 0 : r = cg_hybrid_unified();
979 0 : if (r < 0)
980 0 : return r;
981 0 : if (r > 0) {
982 : /* Always propagate access mode from unified to legacy controller */
983 0 : r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path, uid, gid);
984 0 : if (r < 0)
985 0 : log_debug_errno(r, "Failed to set access on compatibility systemd cgroup %s, ignoring: %m", path);
986 : }
987 : }
988 :
989 0 : return 0;
990 : }
991 :
992 0 : int cg_set_xattr(const char *controller, const char *path, const char *name, const void *value, size_t size, int flags) {
993 0 : _cleanup_free_ char *fs = NULL;
994 : int r;
995 :
996 0 : assert(path);
997 0 : assert(name);
998 0 : assert(value || size <= 0);
999 :
1000 0 : r = cg_get_path(controller, path, NULL, &fs);
1001 0 : if (r < 0)
1002 0 : return r;
1003 :
1004 0 : if (setxattr(fs, name, value, size, flags) < 0)
1005 0 : return -errno;
1006 :
1007 0 : return 0;
1008 : }
1009 :
1010 0 : int cg_get_xattr(const char *controller, const char *path, const char *name, void *value, size_t size) {
1011 0 : _cleanup_free_ char *fs = NULL;
1012 : ssize_t n;
1013 : int r;
1014 :
1015 0 : assert(path);
1016 0 : assert(name);
1017 :
1018 0 : r = cg_get_path(controller, path, NULL, &fs);
1019 0 : if (r < 0)
1020 0 : return r;
1021 :
1022 0 : n = getxattr(fs, name, value, size);
1023 0 : if (n < 0)
1024 0 : return -errno;
1025 :
1026 0 : return (int) n;
1027 : }
1028 :
1029 2680 : int cg_pid_get_path(const char *controller, pid_t pid, char **path) {
1030 2680 : _cleanup_fclose_ FILE *f = NULL;
1031 : const char *fs, *controller_str;
1032 : int unified, r;
1033 2680 : size_t cs = 0;
1034 :
1035 2680 : assert(path);
1036 2680 : assert(pid >= 0);
1037 :
1038 2680 : if (controller) {
1039 2665 : if (!cg_controller_is_valid(controller))
1040 0 : return -EINVAL;
1041 : } else
1042 15 : controller = SYSTEMD_CGROUP_CONTROLLER;
1043 :
1044 2680 : unified = cg_unified_controller(controller);
1045 2680 : if (unified < 0)
1046 0 : return unified;
1047 2680 : if (unified == 0) {
1048 0 : if (streq(controller, SYSTEMD_CGROUP_CONTROLLER))
1049 0 : controller_str = SYSTEMD_CGROUP_CONTROLLER_LEGACY;
1050 : else
1051 0 : controller_str = controller;
1052 :
1053 0 : cs = strlen(controller_str);
1054 : }
1055 :
1056 2680 : fs = procfs_file_alloca(pid, "cgroup");
1057 2680 : r = fopen_unlocked(fs, "re", &f);
1058 2680 : if (r == -ENOENT)
1059 0 : return -ESRCH;
1060 2680 : if (r < 0)
1061 0 : return r;
1062 :
1063 29480 : for (;;) {
1064 32160 : _cleanup_free_ char *line = NULL;
1065 : char *e, *p;
1066 :
1067 32160 : r = read_line(f, LONG_LINE_MAX, &line);
1068 32160 : if (r < 0)
1069 0 : return r;
1070 32160 : if (r == 0)
1071 0 : break;
1072 :
1073 32160 : if (unified) {
1074 32160 : e = startswith(line, "0:");
1075 32160 : if (!e)
1076 29480 : continue;
1077 :
1078 2680 : e = strchr(e, ':');
1079 2680 : if (!e)
1080 0 : continue;
1081 : } else {
1082 : char *l;
1083 : size_t k;
1084 : const char *word, *state;
1085 0 : bool found = false;
1086 :
1087 0 : l = strchr(line, ':');
1088 0 : if (!l)
1089 0 : continue;
1090 :
1091 0 : l++;
1092 0 : e = strchr(l, ':');
1093 0 : if (!e)
1094 0 : continue;
1095 :
1096 0 : *e = 0;
1097 0 : FOREACH_WORD_SEPARATOR(word, k, l, ",", state)
1098 0 : if (k == cs && memcmp(word, controller_str, cs) == 0) {
1099 0 : found = true;
1100 0 : break;
1101 : }
1102 0 : if (!found)
1103 0 : continue;
1104 : }
1105 :
1106 2680 : p = strdup(e + 1);
1107 2680 : if (!p)
1108 0 : return -ENOMEM;
1109 :
1110 : /* Truncate suffix indicating the process is a zombie */
1111 2680 : e = endswith(p, " (deleted)");
1112 2680 : if (e)
1113 0 : *e = 0;
1114 :
1115 2680 : *path = p;
1116 2680 : return 0;
1117 : }
1118 :
1119 0 : return -ENODATA;
1120 : }
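
/* For reference, the /proc/<pid>/cgroup lines parsed above look like this (example contents,
 * with a made-up hierarchy number and cgroup path):
 *
 *     unified (v2):  0::/user.slice/user-1000.slice/session-2.scope
 *     legacy (v1):   7:memory:/user.slice/user-1000.slice/session-2.scope
 *
 * so on a unified host cg_pid_get_path() would return
 * "/user.slice/user-1000.slice/session-2.scope" here.
 */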
1121 :
1122 0 : int cg_install_release_agent(const char *controller, const char *agent) {
1123 0 : _cleanup_free_ char *fs = NULL, *contents = NULL;
1124 : const char *sc;
1125 : int r;
1126 :
1127 0 : assert(agent);
1128 :
1129 0 : r = cg_unified_controller(controller);
1130 0 : if (r < 0)
1131 0 : return r;
1132 0 : if (r > 0) /* doesn't apply to unified hierarchy */
1133 0 : return -EOPNOTSUPP;
1134 :
1135 0 : r = cg_get_path(controller, NULL, "release_agent", &fs);
1136 0 : if (r < 0)
1137 0 : return r;
1138 :
1139 0 : r = read_one_line_file(fs, &contents);
1140 0 : if (r < 0)
1141 0 : return r;
1142 :
1143 0 : sc = strstrip(contents);
1144 0 : if (isempty(sc)) {
1145 0 : r = write_string_file(fs, agent, WRITE_STRING_FILE_DISABLE_BUFFER);
1146 0 : if (r < 0)
1147 0 : return r;
1148 0 : } else if (!path_equal(sc, agent))
1149 0 : return -EEXIST;
1150 :
1151 0 : fs = mfree(fs);
1152 0 : r = cg_get_path(controller, NULL, "notify_on_release", &fs);
1153 0 : if (r < 0)
1154 0 : return r;
1155 :
1156 0 : contents = mfree(contents);
1157 0 : r = read_one_line_file(fs, &contents);
1158 0 : if (r < 0)
1159 0 : return r;
1160 :
1161 0 : sc = strstrip(contents);
1162 0 : if (streq(sc, "0")) {
1163 0 : r = write_string_file(fs, "1", WRITE_STRING_FILE_DISABLE_BUFFER);
1164 0 : if (r < 0)
1165 0 : return r;
1166 :
1167 0 : return 1;
1168 : }
1169 :
1170 0 : if (!streq(sc, "1"))
1171 0 : return -EIO;
1172 :
1173 0 : return 0;
1174 : }
1175 :
1176 0 : int cg_uninstall_release_agent(const char *controller) {
1177 0 : _cleanup_free_ char *fs = NULL;
1178 : int r;
1179 :
1180 0 : r = cg_unified_controller(controller);
1181 0 : if (r < 0)
1182 0 : return r;
1183 0 : if (r > 0) /* Doesn't apply to unified hierarchy */
1184 0 : return -EOPNOTSUPP;
1185 :
1186 0 : r = cg_get_path(controller, NULL, "notify_on_release", &fs);
1187 0 : if (r < 0)
1188 0 : return r;
1189 :
1190 0 : r = write_string_file(fs, "0", WRITE_STRING_FILE_DISABLE_BUFFER);
1191 0 : if (r < 0)
1192 0 : return r;
1193 :
1194 0 : fs = mfree(fs);
1195 :
1196 0 : r = cg_get_path(controller, NULL, "release_agent", &fs);
1197 0 : if (r < 0)
1198 0 : return r;
1199 :
1200 0 : r = write_string_file(fs, "", WRITE_STRING_FILE_DISABLE_BUFFER);
1201 0 : if (r < 0)
1202 0 : return r;
1203 :
1204 0 : return 0;
1205 : }
1206 :
1207 0 : int cg_is_empty(const char *controller, const char *path) {
1208 0 : _cleanup_fclose_ FILE *f = NULL;
1209 : pid_t pid;
1210 : int r;
1211 :
1212 0 : assert(path);
1213 :
1214 0 : r = cg_enumerate_processes(controller, path, &f);
1215 0 : if (r == -ENOENT)
1216 0 : return true;
1217 0 : if (r < 0)
1218 0 : return r;
1219 :
1220 0 : r = cg_read_pid(f, &pid);
1221 0 : if (r < 0)
1222 0 : return r;
1223 :
1224 0 : return r == 0;
1225 : }
1226 :
1227 0 : int cg_is_empty_recursive(const char *controller, const char *path) {
1228 : int r;
1229 :
1230 0 : assert(path);
1231 :
1232 : /* The root cgroup is always populated */
1233 0 : if (controller && empty_or_root(path))
1234 0 : return false;
1235 :
1236 0 : r = cg_unified_controller(controller);
1237 0 : if (r < 0)
1238 0 : return r;
1239 0 : if (r > 0) {
1240 0 : _cleanup_free_ char *t = NULL;
1241 :
1242 : /* On the unified hierarchy we can check empty state
1243 : * via the "populated" attribute of "cgroup.events". */
1244 :
1245 0 : r = cg_read_event(controller, path, "populated", &t);
1246 0 : if (r == -ENOENT)
1247 0 : return true;
1248 0 : if (r < 0)
1249 0 : return r;
1250 :
1251 0 : return streq(t, "0");
1252 : } else {
1253 0 : _cleanup_closedir_ DIR *d = NULL;
1254 : char *fn;
1255 :
1256 0 : r = cg_is_empty(controller, path);
1257 0 : if (r <= 0)
1258 0 : return r;
1259 :
1260 0 : r = cg_enumerate_subgroups(controller, path, &d);
1261 0 : if (r == -ENOENT)
1262 0 : return true;
1263 0 : if (r < 0)
1264 0 : return r;
1265 :
1266 0 : while ((r = cg_read_subgroup(d, &fn)) > 0) {
1267 0 : _cleanup_free_ char *p = NULL;
1268 :
1269 0 : p = path_join(path, fn);
1270 0 : free(fn);
1271 0 : if (!p)
1272 0 : return -ENOMEM;
1273 :
1274 0 : r = cg_is_empty_recursive(controller, p);
1275 0 : if (r <= 0)
1276 0 : return r;
1277 : }
1278 0 : if (r < 0)
1279 0 : return r;
1280 :
1281 0 : return true;
1282 : }
1283 : }
1284 :
1285 0 : int cg_split_spec(const char *spec, char **controller, char **path) {
1286 0 : char *t = NULL, *u = NULL;
1287 : const char *e;
1288 :
1289 0 : assert(spec);
1290 :
1291 0 : if (*spec == '/') {
1292 0 : if (!path_is_normalized(spec))
1293 0 : return -EINVAL;
1294 :
1295 0 : if (path) {
1296 0 : t = strdup(spec);
1297 0 : if (!t)
1298 0 : return -ENOMEM;
1299 :
1300 0 : *path = path_simplify(t, false);
1301 : }
1302 :
1303 0 : if (controller)
1304 0 : *controller = NULL;
1305 :
1306 0 : return 0;
1307 : }
1308 :
1309 0 : e = strchr(spec, ':');
1310 0 : if (!e) {
1311 0 : if (!cg_controller_is_valid(spec))
1312 0 : return -EINVAL;
1313 :
1314 0 : if (controller) {
1315 0 : t = strdup(spec);
1316 0 : if (!t)
1317 0 : return -ENOMEM;
1318 :
1319 0 : *controller = t;
1320 : }
1321 :
1322 0 : if (path)
1323 0 : *path = NULL;
1324 :
1325 0 : return 0;
1326 : }
1327 :
1328 0 : t = strndup(spec, e-spec);
1329 0 : if (!t)
1330 0 : return -ENOMEM;
1331 0 : if (!cg_controller_is_valid(t)) {
1332 0 : free(t);
1333 0 : return -EINVAL;
1334 : }
1335 :
1336 0 : if (isempty(e+1))
1337 0 : u = NULL;
1338 : else {
1339 0 : u = strdup(e+1);
1340 0 : if (!u) {
1341 0 : free(t);
1342 0 : return -ENOMEM;
1343 : }
1344 :
1345 0 : if (!path_is_normalized(u) ||
1346 0 : !path_is_absolute(u)) {
1347 0 : free(t);
1348 0 : free(u);
1349 0 : return -EINVAL;
1350 : }
1351 :
1352 0 : path_simplify(u, false);
1353 : }
1354 :
1355 0 : if (controller)
1356 0 : *controller = t;
1357 : else
1358 0 : free(t);
1359 :
1360 0 : if (path)
1361 0 : *path = u;
1362 : else
1363 0 : free(u);
1364 :
1365 0 : return 0;
1366 : }
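
/* Examples of accepted specs (hypothetical values, following the parsing above):
 *
 *     "/foo/bar"           -> controller = NULL,           path = "/foo/bar"
 *     "cpu"                -> controller = "cpu",          path = NULL
 *     "name=systemd:/foo"  -> controller = "name=systemd", path = "/foo"
 */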
1367 :
1368 0 : int cg_mangle_path(const char *path, char **result) {
1369 0 : _cleanup_free_ char *c = NULL, *p = NULL;
1370 : char *t;
1371 : int r;
1372 :
1373 0 : assert(path);
1374 0 : assert(result);
1375 :
1376 : /* First, check if it already is a filesystem path */
1377 0 : if (path_startswith(path, "/sys/fs/cgroup")) {
1378 :
1379 0 : t = strdup(path);
1380 0 : if (!t)
1381 0 : return -ENOMEM;
1382 :
1383 0 : *result = path_simplify(t, false);
1384 0 : return 0;
1385 : }
1386 :
1387 : /* Otherwise, treat it as cg spec */
1388 0 : r = cg_split_spec(path, &c, &p);
1389 0 : if (r < 0)
1390 0 : return r;
1391 :
1392 0 : return cg_get_path(c ?: SYSTEMD_CGROUP_CONTROLLER, p ?: "/", NULL, result);
1393 : }
1394 :
1395 1269 : int cg_get_root_path(char **path) {
1396 : char *p, *e;
1397 : int r;
1398 :
1399 1269 : assert(path);
1400 :
1401 1269 : r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 1, &p);
1402 1269 : if (r < 0)
1403 0 : return r;
1404 :
1405 1269 : e = endswith(p, "/" SPECIAL_INIT_SCOPE);
1406 1269 : if (!e)
1407 0 : e = endswith(p, "/" SPECIAL_SYSTEM_SLICE); /* legacy */
1408 1269 : if (!e)
1409 0 : e = endswith(p, "/system"); /* even more legacy */
1410 1269 : if (e)
1411 1269 : *e = 0;
1412 :
1413 1269 : *path = p;
1414 1269 : return 0;
1415 : }
1416 :
1417 1228 : int cg_shift_path(const char *cgroup, const char *root, const char **shifted) {
1418 1228 : _cleanup_free_ char *rt = NULL;
1419 : char *p;
1420 : int r;
1421 :
1422 1228 : assert(cgroup);
1423 1228 : assert(shifted);
1424 :
1425 1228 : if (!root) {
1426 : /* If the root was specified let's use that, otherwise
1427 : * let's determine it from PID 1 */
1428 :
1429 1212 : r = cg_get_root_path(&rt);
1430 1212 : if (r < 0)
1431 0 : return r;
1432 :
1433 1212 : root = rt;
1434 : }
1435 :
1436 1228 : p = path_startswith(cgroup, root);
1437 1228 : if (p && p > cgroup)
1438 2 : *shifted = p - 1;
1439 : else
1440 1226 : *shifted = cgroup;
1441 :
1442 1228 : return 0;
1443 : }
1444 :
1445 1212 : int cg_pid_get_path_shifted(pid_t pid, const char *root, char **cgroup) {
1446 1212 : _cleanup_free_ char *raw = NULL;
1447 : const char *c;
1448 : int r;
1449 :
1450 1212 : assert(pid >= 0);
1451 1212 : assert(cgroup);
1452 :
1453 1212 : r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &raw);
1454 1212 : if (r < 0)
1455 0 : return r;
1456 :
1457 1212 : r = cg_shift_path(raw, root, &c);
1458 1212 : if (r < 0)
1459 0 : return r;
1460 :
1461 1212 : if (c == raw)
1462 1212 : *cgroup = TAKE_PTR(raw);
1463 : else {
1464 : char *n;
1465 :
1466 0 : n = strdup(c);
1467 0 : if (!n)
1468 0 : return -ENOMEM;
1469 :
1470 0 : *cgroup = n;
1471 : }
1472 :
1473 1212 : return 0;
1474 : }
1475 :
1476 1031 : int cg_path_decode_unit(const char *cgroup, char **unit) {
1477 : char *c, *s;
1478 : size_t n;
1479 :
1480 1031 : assert(cgroup);
1481 1031 : assert(unit);
1482 :
1483 1031 : n = strcspn(cgroup, "/");
1484 1031 : if (n < 3)
1485 80 : return -ENXIO;
1486 :
1487 951 : c = strndupa(cgroup, n);
1488 951 : c = cg_unescape(c);
1489 :
1490 951 : if (!unit_name_is_valid(c, UNIT_NAME_PLAIN|UNIT_NAME_INSTANCE))
1491 8 : return -ENXIO;
1492 :
1493 943 : s = strdup(c);
1494 943 : if (!s)
1495 0 : return -ENOMEM;
1496 :
1497 943 : *unit = s;
1498 943 : return 0;
1499 : }
1500 :
1501 3063 : static bool valid_slice_name(const char *p, size_t n) {
1502 :
1503 3063 : if (!p)
1504 0 : return false;
1505 :
1506 3063 : if (n < STRLEN("x.slice"))
1507 99 : return false;
1508 :
1509 2964 : if (memcmp(p + n - 6, ".slice", 6) == 0) {
1510 1825 : char buf[n+1], *c;
1511 :
1512 1825 : memcpy(buf, p, n);
1513 1825 : buf[n] = 0;
1514 :
1515 1825 : c = cg_unescape(buf);
1516 :
1517 1825 : return unit_name_is_valid(c, UNIT_NAME_PLAIN);
1518 : }
1519 :
1520 1139 : return false;
1521 : }
1522 :
1523 871 : static const char *skip_slices(const char *p) {
1524 871 : assert(p);
1525 :
1526 : /* Skips over all slice assignments */
1527 :
1528 1228 : for (;;) {
1529 : size_t n;
1530 :
1531 2099 : p += strspn(p, "/");
1532 :
1533 2099 : n = strcspn(p, "/");
1534 2099 : if (!valid_slice_name(p, n))
1535 871 : return p;
1536 :
1537 1228 : p += n;
1538 : }
1539 : }
1540 :
1541 668 : int cg_path_get_unit(const char *path, char **ret) {
1542 : const char *e;
1543 : char *unit;
1544 : int r;
1545 :
1546 668 : assert(path);
1547 668 : assert(ret);
1548 :
1549 668 : e = skip_slices(path);
1550 :
1551 668 : r = cg_path_decode_unit(e, &unit);
1552 668 : if (r < 0)
1553 84 : return r;
1554 :
1555 : /* We skipped over the slices, don't accept any now */
1556 584 : if (endswith(unit, ".slice")) {
1557 0 : free(unit);
1558 0 : return -ENXIO;
1559 : }
1560 :
1561 584 : *ret = unit;
1562 584 : return 0;
1563 : }
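
/* Examples (made-up paths):
 *
 *     "/system.slice/foo.service"                    -> "foo.service"
 *     "/user.slice/user-1000.slice/session-2.scope"  -> "session-2.scope"
 *
 * whereas a path that ends in a slice (e.g. "/system.slice") yields -ENXIO, since all
 * leading slices are skipped first and no non-slice unit remains.
 */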
1564 :
1565 173 : int cg_pid_get_unit(pid_t pid, char **unit) {
1566 173 : _cleanup_free_ char *cgroup = NULL;
1567 : int r;
1568 :
1569 173 : assert(unit);
1570 :
1571 173 : r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1572 173 : if (r < 0)
1573 0 : return r;
1574 :
1575 173 : return cg_path_get_unit(cgroup, unit);
1576 : }
1577 :
1578 : /**
1579 : * Skip session-*.scope, but require it to be there.
1580 : */
1581 157 : static const char *skip_session(const char *p) {
1582 : size_t n;
1583 :
1584 157 : if (isempty(p))
1585 4 : return NULL;
1586 :
1587 153 : p += strspn(p, "/");
1588 :
1589 153 : n = strcspn(p, "/");
1590 153 : if (n < STRLEN("session-x.scope"))
1591 35 : return NULL;
1592 :
1593 118 : if (memcmp(p, "session-", 8) == 0 && memcmp(p + n - 6, ".scope", 6) == 0) {
1594 89 : char buf[n - 8 - 6 + 1];
1595 :
1596 89 : memcpy(buf, p + 8, n - 8 - 6);
1597 89 : buf[n - 8 - 6] = 0;
1598 :
1599 : /* Note that session scopes never need unescaping,
1600 : * since they cannot conflict with the kernel's own
1601 : * names, hence we don't need to call cg_unescape()
1602 : * here. */
1603 :
1604 89 : if (!session_id_valid(buf))
1605 0 : return false;
1606 :
1607 89 : p += n;
1608 89 : p += strspn(p, "/");
1609 89 : return p;
1610 : }
1611 :
1612 29 : return NULL;
1613 : }
1614 :
1615 : /**
1616 : * Skip user@*.service, but require it to be there.
1617 : */
1618 203 : static const char *skip_user_manager(const char *p) {
1619 : size_t n;
1620 :
1621 203 : if (isempty(p))
1622 4 : return NULL;
1623 :
1624 199 : p += strspn(p, "/");
1625 :
1626 199 : n = strcspn(p, "/");
1627 199 : if (n < STRLEN("user@x.service"))
1628 29 : return NULL;
1629 :
1630 170 : if (memcmp(p, "user@", 5) == 0 && memcmp(p + n - 8, ".service", 8) == 0) {
1631 46 : char buf[n - 5 - 8 + 1];
1632 :
1633 46 : memcpy(buf, p + 5, n - 5 - 8);
1634 46 : buf[n - 5 - 8] = 0;
1635 :
1636 : /* Note that user manager services never need unescaping,
1637 : * since they cannot conflict with the kernel's own
1638 : * names, hence we don't need to call cg_unescape()
1639 : * here. */
1640 :
1641 46 : if (parse_uid(buf, NULL) < 0)
1642 0 : return NULL;
1643 :
1644 46 : p += n;
1645 46 : p += strspn(p, "/");
1646 :
1647 46 : return p;
1648 : }
1649 :
1650 124 : return NULL;
1651 : }
1652 :
1653 203 : static const char *skip_user_prefix(const char *path) {
1654 : const char *e, *t;
1655 :
1656 203 : assert(path);
1657 :
1658 : /* Skip slices, if there are any */
1659 203 : e = skip_slices(path);
1660 :
1661 : /* Skip the user manager, if it's in the path now... */
1662 203 : t = skip_user_manager(e);
1663 203 : if (t)
1664 46 : return t;
1665 :
1666 : /* Alternatively skip the user session if it is in the path... */
1667 157 : return skip_session(e);
1668 : }
1669 :
1670 188 : int cg_path_get_user_unit(const char *path, char **ret) {
1671 : const char *t;
1672 :
1673 188 : assert(path);
1674 188 : assert(ret);
1675 :
1676 188 : t = skip_user_prefix(path);
1677 188 : if (!t)
1678 60 : return -ENXIO;
1679 :
1680 : /* And from here on it looks pretty much the same as for a system unit, hence let's use the same
1681 : * parser. */
1682 128 : return cg_path_get_unit(t, ret);
1683 : }
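
/* Example (made-up path): for
 *
 *     "/user.slice/user-1000.slice/user@1000.service/app.slice/foo.service"
 *
 * skip_user_prefix() strips everything up to and including "user@1000.service", and the
 * remaining "app.slice/foo.service" is parsed as above, yielding "foo.service".
 */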
1684 :
1685 173 : int cg_pid_get_user_unit(pid_t pid, char **unit) {
1686 173 : _cleanup_free_ char *cgroup = NULL;
1687 : int r;
1688 :
1689 173 : assert(unit);
1690 :
1691 173 : r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1692 173 : if (r < 0)
1693 0 : return r;
1694 :
1695 173 : return cg_path_get_user_unit(cgroup, unit);
1696 : }
1697 :
1698 172 : int cg_path_get_machine_name(const char *path, char **machine) {
1699 172 : _cleanup_free_ char *u = NULL;
1700 : const char *sl;
1701 : int r;
1702 :
1703 172 : r = cg_path_get_unit(path, &u);
1704 172 : if (r < 0)
1705 0 : return r;
1706 :
1707 860 : sl = strjoina("/run/systemd/machines/unit:", u);
1708 172 : return readlink_malloc(sl, machine);
1709 : }
1710 :
1711 172 : int cg_pid_get_machine_name(pid_t pid, char **machine) {
1712 172 : _cleanup_free_ char *cgroup = NULL;
1713 : int r;
1714 :
1715 172 : assert(machine);
1716 :
1717 172 : r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1718 172 : if (r < 0)
1719 0 : return r;
1720 :
1721 172 : return cg_path_get_machine_name(cgroup, machine);
1722 : }
1723 :
1724 182 : int cg_path_get_session(const char *path, char **session) {
1725 182 : _cleanup_free_ char *unit = NULL;
1726 : char *start, *end;
1727 : int r;
1728 :
1729 182 : assert(path);
1730 :
1731 182 : r = cg_path_get_unit(path, &unit);
1732 182 : if (r < 0)
1733 1 : return r;
1734 :
1735 181 : start = startswith(unit, "session-");
1736 181 : if (!start)
1737 96 : return -ENXIO;
1738 85 : end = endswith(start, ".scope");
1739 85 : if (!end)
1740 0 : return -ENXIO;
1741 :
1742 85 : *end = 0;
1743 85 : if (!session_id_valid(start))
1744 1 : return -ENXIO;
1745 :
1746 84 : if (session) {
1747 : char *rr;
1748 :
1749 84 : rr = strdup(start);
1750 84 : if (!rr)
1751 0 : return -ENOMEM;
1752 :
1753 84 : *session = rr;
1754 : }
1755 :
1756 84 : return 0;
1757 : }
1758 :
1759 176 : int cg_pid_get_session(pid_t pid, char **session) {
1760 176 : _cleanup_free_ char *cgroup = NULL;
1761 : int r;
1762 :
1763 176 : r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1764 176 : if (r < 0)
1765 0 : return r;
1766 :
1767 176 : return cg_path_get_session(cgroup, session);
1768 : }
1769 :
1770 178 : int cg_path_get_owner_uid(const char *path, uid_t *uid) {
1771 178 : _cleanup_free_ char *slice = NULL;
1772 : char *start, *end;
1773 : int r;
1774 :
1775 178 : assert(path);
1776 :
1777 178 : r = cg_path_get_slice(path, &slice);
1778 178 : if (r < 0)
1779 0 : return r;
1780 :
1781 178 : start = startswith(slice, "user-");
1782 178 : if (!start)
1783 59 : return -ENXIO;
1784 119 : end = endswith(start, ".slice");
1785 119 : if (!end)
1786 0 : return -ENXIO;
1787 :
1788 119 : *end = 0;
1789 119 : if (parse_uid(start, uid) < 0)
1790 0 : return -ENXIO;
1791 :
1792 119 : return 0;
1793 : }
1794 :
1795 173 : int cg_pid_get_owner_uid(pid_t pid, uid_t *uid) {
1796 173 : _cleanup_free_ char *cgroup = NULL;
1797 : int r;
1798 :
1799 173 : r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1800 173 : if (r < 0)
1801 0 : return r;
1802 :
1803 173 : return cg_path_get_owner_uid(cgroup, uid);
1804 : }
1805 :
1806 367 : int cg_path_get_slice(const char *p, char **slice) {
1807 367 : const char *e = NULL;
1808 :
1809 367 : assert(p);
1810 367 : assert(slice);
1811 :
1812 : /* Finds the right-most slice unit from the beginning, but
1813 : * stops before we come to the first non-slice unit. */
1814 :
1815 597 : for (;;) {
1816 : size_t n;
1817 :
1818 964 : p += strspn(p, "/");
1819 :
1820 964 : n = strcspn(p, "/");
1821 964 : if (!valid_slice_name(p, n)) {
1822 :
1823 367 : if (!e) {
1824 : char *s;
1825 :
1826 13 : s = strdup(SPECIAL_ROOT_SLICE);
1827 13 : if (!s)
1828 0 : return -ENOMEM;
1829 :
1830 13 : *slice = s;
1831 13 : return 0;
1832 : }
1833 :
1834 354 : return cg_path_decode_unit(e, slice);
1835 : }
1836 :
1837 597 : e = p;
1838 597 : p += n;
1839 : }
1840 : }
1841 :
1842 173 : int cg_pid_get_slice(pid_t pid, char **slice) {
1843 173 : _cleanup_free_ char *cgroup = NULL;
1844 : int r;
1845 :
1846 173 : assert(slice);
1847 :
1848 173 : r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1849 173 : if (r < 0)
1850 0 : return r;
1851 :
1852 173 : return cg_path_get_slice(cgroup, slice);
1853 : }
1854 :
1855 15 : int cg_path_get_user_slice(const char *p, char **slice) {
1856 : const char *t;
1857 15 : assert(p);
1858 15 : assert(slice);
1859 :
1860 15 : t = skip_user_prefix(p);
1861 15 : if (!t)
1862 8 : return -ENXIO;
1863 :
1864 : /* And now it looks pretty much the same as for a system
1865 : * slice, so let's just use the same parser from here on. */
1866 7 : return cg_path_get_slice(t, slice);
1867 : }
1868 :
1869 0 : int cg_pid_get_user_slice(pid_t pid, char **slice) {
1870 0 : _cleanup_free_ char *cgroup = NULL;
1871 : int r;
1872 :
1873 0 : assert(slice);
1874 :
1875 0 : r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1876 0 : if (r < 0)
1877 0 : return r;
1878 :
1879 0 : return cg_path_get_user_slice(cgroup, slice);
1880 : }
1881 :
1882 26 : char *cg_escape(const char *p) {
1883 26 : bool need_prefix = false;
1884 :
1885 : /* This implements very minimal escaping for names to be used
1886 : * as file names in the cgroup tree: any name which might
1887 : * conflict with a kernel name or is prefixed with '_' is
1888 : * prefixed with a '_'. That way, when reading cgroup names it
1889 : * is sufficient to remove a single prefixing underscore if
1890 : * there is one. */
1891 :
1892 : /* The return value of this function (unlike cg_unescape())
1893 : * needs free()! */
1894 :
1895 26 : if (IN_SET(p[0], 0, '_', '.') ||
1896 41 : STR_IN_SET(p, "notify_on_release", "release_agent", "tasks") ||
1897 20 : startswith(p, "cgroup."))
1898 7 : need_prefix = true;
1899 : else {
1900 : const char *dot;
1901 :
1902 19 : dot = strrchr(p, '.');
1903 19 : if (dot) {
1904 : CGroupController c;
1905 18 : size_t l = dot - p;
1906 :
1907 171 : for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
1908 : const char *n;
1909 :
1910 154 : n = cgroup_controller_to_string(c);
1911 :
1912 154 : if (l != strlen(n))
1913 139 : continue;
1914 :
1915 15 : if (memcmp(p, n, l) != 0)
1916 14 : continue;
1917 :
1918 1 : need_prefix = true;
1919 1 : break;
1920 : }
1921 : }
1922 : }
1923 :
1924 26 : if (need_prefix)
1925 8 : return strjoin("_", p);
1926 :
1927 18 : return strdup(p);
1928 : }
1929 :
1930 2786 : char *cg_unescape(const char *p) {
1931 2786 : assert(p);
1932 :
1933 : /* The return value of this function (unlike cg_escape())
1934 : * doesn't need free()! */
1935 :
1936 2786 : if (p[0] == '_')
1937 11 : return (char*) p+1;
1938 :
1939 2775 : return (char*) p;
1940 : }
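
/* A few illustrative examples of the escaping scheme (made-up names):
 *
 *     cg_escape("foo.service")  -> "foo.service"  (no conflict, unchanged)
 *     cg_escape("tasks")        -> "_tasks"       (clashes with a kernel attribute name)
 *     cg_escape("cpu.slice")    -> "_cpu.slice"   (prefix matches a controller name)
 *     cg_unescape("_tasks")     -> "tasks"
 */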
1941 :
1942 : #define CONTROLLER_VALID \
1943 : DIGITS LETTERS \
1944 : "_"
1945 :
1946 3167 : bool cg_controller_is_valid(const char *p) {
1947 : const char *t, *s;
1948 :
1949 3167 : if (!p)
1950 0 : return false;
1951 :
1952 3167 : if (streq(p, SYSTEMD_CGROUP_CONTROLLER))
1953 2772 : return true;
1954 :
1955 395 : s = startswith(p, "name=");
1956 395 : if (s)
1957 2 : p = s;
1958 :
1959 395 : if (IN_SET(*p, 0, '_'))
1960 4 : return false;
1961 :
1962 2454 : for (t = p; *t; t++)
1963 2066 : if (!strchr(CONTROLLER_VALID, *t))
1964 3 : return false;
1965 :
1966 388 : if (t - p > FILENAME_MAX)
1967 0 : return false;
1968 :
1969 388 : return true;
1970 : }
1971 :
1972 21 : int cg_slice_to_path(const char *unit, char **ret) {
1973 21 : _cleanup_free_ char *p = NULL, *s = NULL, *e = NULL;
1974 : const char *dash;
1975 : int r;
1976 :
1977 21 : assert(unit);
1978 21 : assert(ret);
1979 :
1980 21 : if (streq(unit, SPECIAL_ROOT_SLICE)) {
1981 : char *x;
1982 :
1983 1 : x = strdup("");
1984 1 : if (!x)
1985 0 : return -ENOMEM;
1986 1 : *ret = x;
1987 1 : return 0;
1988 : }
1989 :
1990 20 : if (!unit_name_is_valid(unit, UNIT_NAME_PLAIN))
1991 10 : return -EINVAL;
1992 :
1993 10 : if (!endswith(unit, ".slice"))
1994 1 : return -EINVAL;
1995 :
1996 9 : r = unit_name_to_prefix(unit, &p);
1997 9 : if (r < 0)
1998 0 : return r;
1999 :
2000 9 : dash = strchr(p, '-');
2001 :
2002 : /* Don't allow initial dashes */
2003 9 : if (dash == p)
2004 3 : return -EINVAL;
2005 :
2006 12 : while (dash) {
2007 8 : _cleanup_free_ char *escaped = NULL;
2008 8 : char n[dash - p + sizeof(".slice")];
2009 :
2010 : #if HAS_FEATURE_MEMORY_SANITIZER
2011 : /* msan doesn't instrument stpncpy, so it thinks
2012 : * n is later used uninitialized:
2013 : * https://github.com/google/sanitizers/issues/926
2014 : */
2015 : zero(n);
2016 : #endif
2017 :
2018 : /* Don't allow trailing or double dashes */
2019 8 : if (IN_SET(dash[1], 0, '-'))
2020 2 : return -EINVAL;
2021 :
2022 6 : strcpy(stpncpy(n, p, dash - p), ".slice");
2023 6 : if (!unit_name_is_valid(n, UNIT_NAME_PLAIN))
2024 0 : return -EINVAL;
2025 :
2026 6 : escaped = cg_escape(n);
2027 6 : if (!escaped)
2028 0 : return -ENOMEM;
2029 :
2030 6 : if (!strextend(&s, escaped, "/", NULL))
2031 0 : return -ENOMEM;
2032 :
2033 6 : dash = strchr(dash+1, '-');
2034 : }
2035 :
2036 4 : e = cg_escape(unit);
2037 4 : if (!e)
2038 0 : return -ENOMEM;
2039 :
2040 4 : if (!strextend(&s, e, NULL))
2041 0 : return -ENOMEM;
2042 :
2043 4 : *ret = TAKE_PTR(s);
2044 :
2045 4 : return 0;
2046 : }
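
/* Illustrative slice-to-path mappings (made-up unit names):
 *
 *     "-.slice"        -> ""                        (the root slice)
 *     "foo.slice"      -> "foo.slice"
 *     "foo-bar.slice"  -> "foo.slice/foo-bar.slice"
 */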
2047 :
2048 0 : int cg_set_attribute(const char *controller, const char *path, const char *attribute, const char *value) {
2049 0 : _cleanup_free_ char *p = NULL;
2050 : int r;
2051 :
2052 0 : r = cg_get_path(controller, path, attribute, &p);
2053 0 : if (r < 0)
2054 0 : return r;
2055 :
2056 0 : return write_string_file(p, value, WRITE_STRING_FILE_DISABLE_BUFFER);
2057 : }
2058 :
2059 54 : int cg_get_attribute(const char *controller, const char *path, const char *attribute, char **ret) {
2060 54 : _cleanup_free_ char *p = NULL;
2061 : int r;
2062 :
2063 54 : r = cg_get_path(controller, path, attribute, &p);
2064 54 : if (r < 0)
2065 0 : return r;
2066 :
2067 54 : return read_one_line_file(p, ret);
2068 : }
2069 :
2070 1 : int cg_get_keyed_attribute(
2071 : const char *controller,
2072 : const char *path,
2073 : const char *attribute,
2074 : char **keys,
2075 : char **ret_values) {
2076 :
2077 1 : _cleanup_free_ char *filename = NULL, *contents = NULL;
2078 : const char *p;
2079 1 : size_t n, i, n_done = 0;
2080 : char **v;
2081 : int r;
2082 :
2083 : /* Reads one or more fields of a cgroup v2 keyed attribute file. The 'keys' parameter should be an strv with
2084 : * all keys to retrieve. The 'ret_values' parameter should be passed as a string array with the same number of
2085 : * entries as 'keys'. On success each entry will be set to the value of the matching key.
2086 : *
2087 : * If the attribute file doesn't exist at all, -ENOENT is returned; if any key is not found, -ENXIO is returned. */
2088 :
2089 1 : r = cg_get_path(controller, path, attribute, &filename);
2090 1 : if (r < 0)
2091 0 : return r;
2092 :
2093 1 : r = read_full_file(filename, &contents, NULL);
2094 1 : if (r < 0)
2095 1 : return r;
2096 :
2097 0 : n = strv_length(keys);
2098 0 : if (n == 0) /* No keys to retrieve? That's easy, we are done then */
2099 0 : return 0;
2100 :
2101 : /* Let's build this up in a temporary array for now in order not to clobber the return parameter on failure */
2102 0 : v = newa0(char*, n);
2103 :
2104 0 : for (p = contents; *p;) {
2105 0 : const char *w = NULL;
2106 :
2107 0 : for (i = 0; i < n; i++)
2108 0 : if (!v[i]) {
2109 0 : w = first_word(p, keys[i]);
2110 0 : if (w)
2111 0 : break;
2112 : }
2113 :
2114 0 : if (w) {
2115 : size_t l;
2116 :
2117 0 : l = strcspn(w, NEWLINE);
2118 0 : v[i] = strndup(w, l);
2119 0 : if (!v[i]) {
2120 0 : r = -ENOMEM;
2121 0 : goto fail;
2122 : }
2123 :
2124 0 : n_done++;
2125 0 : if (n_done >= n)
2126 0 : goto done;
2127 :
2128 0 : p = w + l;
2129 : } else
2130 0 : p += strcspn(p, NEWLINE);
2131 :
2132 0 : p += strspn(p, NEWLINE);
2133 : }
2134 :
2135 0 : r = -ENXIO;
2136 :
2137 0 : fail:
2138 0 : for (i = 0; i < n; i++)
2139 0 : free(v[i]);
2140 :
2141 0 : return r;
2142 :
2143 0 : done:
2144 0 : memcpy(ret_values, v, sizeof(char*) * n);
2145 0 : return 0;
2146 :
2147 : }
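
/* A hedged usage sketch (hypothetical caller, made-up unit path), retrieving one key from the
 * cgroup v2 "cpu.stat" file; on success each returned value is strdup()'d and owned by the
 * caller:
 *
 *     char *values[1] = {};
 *     r = cg_get_keyed_attribute("cpu", "/system.slice/foo.service", "cpu.stat",
 *                                STRV_MAKE("usage_usec"), values);
 *     if (r >= 0) {
 *             log_debug("usage_usec=%s", values[0]);
 *             free(values[0]);
 *     }
 */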
2148 :
2149 25 : int cg_create_everywhere(CGroupMask supported, CGroupMask mask, const char *path) {
2150 : CGroupController c;
2151 : CGroupMask done;
2152 : bool created;
2153 : int r;
2154 :
2155 : /* This one will create a cgroup in our private tree, but also
2156 : * duplicate it in the trees specified in mask, and remove it
2157 : * in all others.
2158 : *
2159 : * Returns 0 if the group already existed in the systemd hierarchy,
2160 : * 1 on success, negative otherwise.
2161 : */
2162 :
2163 : /* First create the cgroup in our own hierarchy. */
2164 25 : r = cg_create(SYSTEMD_CGROUP_CONTROLLER, path);
2165 25 : if (r < 0)
2166 19 : return r;
2167 6 : created = r;
2168 :
2169 : /* If we are in the unified hierarchy, we are done now */
2170 6 : r = cg_all_unified();
2171 6 : if (r < 0)
2172 0 : return r;
2173 6 : if (r > 0)
2174 0 : return created;
2175 :
2176 6 : supported &= CGROUP_MASK_V1;
2177 6 : mask = CGROUP_MASK_EXTEND_JOINED(mask);
2178 6 : done = 0;
2179 :
2180 : /* Otherwise, do the same in the other hierarchies */
2181 60 : for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
2182 54 : CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
2183 : const char *n;
2184 :
2185 54 : if (!FLAGS_SET(supported, bit))
2186 18 : continue;
2187 :
2188 36 : if (FLAGS_SET(done, bit))
2189 6 : continue;
2190 :
2191 30 : n = cgroup_controller_to_string(c);
2192 30 : if (FLAGS_SET(mask, bit))
2193 12 : (void) cg_create(n, path);
2194 : else
2195 18 : (void) cg_trim(n, path, true);
2196 :
2197 30 : done |= CGROUP_MASK_EXTEND_JOINED(bit);
2198 : }
2199 :
2200 6 : return created;
2201 : }
2202 :
2203 0 : int cg_attach_everywhere(CGroupMask supported, const char *path, pid_t pid, cg_migrate_callback_t path_callback, void *userdata) {
2204 : CGroupController c;
2205 : CGroupMask done;
2206 : int r;
2207 :
2208 0 : r = cg_attach(SYSTEMD_CGROUP_CONTROLLER, path, pid);
2209 0 : if (r < 0)
2210 0 : return r;
2211 :
2212 0 : r = cg_all_unified();
2213 0 : if (r < 0)
2214 0 : return r;
2215 0 : if (r > 0)
2216 0 : return 0;
2217 :
2218 0 : supported &= CGROUP_MASK_V1;
2219 0 : done = 0;
2220 :
2221 0 : for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
2222 0 : CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
2223 0 : const char *p = NULL;
2224 :
2225 0 : if (!FLAGS_SET(supported, bit))
2226 0 : continue;
2227 :
2228 0 : if (FLAGS_SET(done, bit))
2229 0 : continue;
2230 :
2231 0 : if (path_callback)
2232 0 : p = path_callback(bit, userdata);
2233 0 : if (!p)
2234 0 : p = path;
2235 :
2236 0 : (void) cg_attach_fallback(cgroup_controller_to_string(c), p, pid);
2237 0 : done |= CGROUP_MASK_EXTEND_JOINED(bit);
2238 : }
2239 :
2240 0 : return 0;
2241 : }
2242 :
2243 0 : int cg_attach_many_everywhere(CGroupMask supported, const char *path, Set* pids, cg_migrate_callback_t path_callback, void *userdata) {
2244 : Iterator i;
2245 : void *pidp;
2246 0 : int r = 0;
2247 :
2248 0 : SET_FOREACH(pidp, pids, i) {
2249 0 : pid_t pid = PTR_TO_PID(pidp);
2250 : int q;
2251 :
2252 0 : q = cg_attach_everywhere(supported, path, pid, path_callback, userdata);
2253 0 : if (q < 0 && r >= 0)
2254 0 : r = q;
2255 : }
2256 :
2257 0 : return r;
2258 : }
2259 :
2260 0 : int cg_migrate_everywhere(CGroupMask supported, const char *from, const char *to, cg_migrate_callback_t to_callback, void *userdata) {
2261 : CGroupController c;
2262 : CGroupMask done;
2263 0 : int r = 0, q;
2264 :
2265 0 : if (!path_equal(from, to)) {
2266 0 : r = cg_migrate_recursive(SYSTEMD_CGROUP_CONTROLLER, from, SYSTEMD_CGROUP_CONTROLLER, to, CGROUP_REMOVE);
2267 0 : if (r < 0)
2268 0 : return r;
2269 : }
2270 :
2271 0 : q = cg_all_unified();
2272 0 : if (q < 0)
2273 0 : return q;
2274 0 : if (q > 0)
2275 0 : return r;
2276 :
2277 0 : supported &= CGROUP_MASK_V1;
2278 0 : done = 0;
2279 :
2280 0 : for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
2281 0 : CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
2282 0 : const char *p = NULL;
2283 :
2284 0 : if (!FLAGS_SET(supported, bit))
2285 0 : continue;
2286 :
2287 0 : if (FLAGS_SET(done, bit))
2288 0 : continue;
2289 :
2290 0 : if (to_callback)
2291 0 : p = to_callback(bit, userdata);
2292 0 : if (!p)
2293 0 : p = to;
2294 :
2295 0 : (void) cg_migrate_recursive_fallback(SYSTEMD_CGROUP_CONTROLLER, to, cgroup_controller_to_string(c), p, 0);
2296 0 : done |= CGROUP_MASK_EXTEND_JOINED(bit);
2297 : }
2298 :
2299 0 : return r;
2300 : }
2301 :
2302 0 : int cg_trim_everywhere(CGroupMask supported, const char *path, bool delete_root) {
2303 : CGroupController c;
2304 : CGroupMask done;
2305 : int r, q;
2306 :
2307 0 : r = cg_trim(SYSTEMD_CGROUP_CONTROLLER, path, delete_root);
2308 0 : if (r < 0)
2309 0 : return r;
2310 :
2311 0 : q = cg_all_unified();
2312 0 : if (q < 0)
2313 0 : return q;
2314 0 : if (q > 0)
2315 0 : return r;
2316 :
2317 0 : supported &= CGROUP_MASK_V1;
2318 0 : done = 0;
2319 :
2320 0 : for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
2321 0 : CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
2322 :
2323 0 : if (!FLAGS_SET(supported, bit))
2324 0 : continue;
2325 :
2326 0 : if (FLAGS_SET(done, bit))
2327 0 : continue;
2328 :
2329 0 : (void) cg_trim(cgroup_controller_to_string(c), path, delete_root);
2330 0 : done |= CGROUP_MASK_EXTEND_JOINED(bit);
2331 : }
2332 :
2333 0 : return r;
2334 : }
2335 :
2336 449 : int cg_mask_to_string(CGroupMask mask, char **ret) {
2337 449 : _cleanup_free_ char *s = NULL;
2338 449 : size_t n = 0, allocated = 0;
2339 449 : bool space = false;
2340 : CGroupController c;
2341 :
2342 449 : assert(ret);
2343 :
2344 449 : if (mask == 0) {
2345 203 : *ret = NULL;
2346 203 : return 0;
2347 : }
2348 :
2349 2460 : for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
2350 : const char *k;
2351 : size_t l;
2352 :
2353 2214 : if (!FLAGS_SET(mask, CGROUP_CONTROLLER_TO_MASK(c)))
2354 1674 : continue;
2355 :
2356 540 : k = cgroup_controller_to_string(c);
2357 540 : l = strlen(k);
2358 :
2359 540 : if (!GREEDY_REALLOC(s, allocated, n + space + l + 1))
2360 0 : return -ENOMEM;
2361 :
2362 540 : if (space)
2363 294 : s[n] = ' ';
2364 540 : memcpy(s + n + space, k, l);
2365 540 : n += space + l;
2366 :
2367 540 : space = true;
2368 : }
2369 :
2370 246 : assert(s);
2371 :
2372 246 : s[n] = 0;
2373 246 : *ret = TAKE_PTR(s);
2374 :
2375 246 : return 0;
2376 : }
2377 :
2378 23 : int cg_mask_from_string(const char *value, CGroupMask *ret) {
2379 23 : CGroupMask m = 0;
2380 :
2381 23 : assert(ret);
2382 23 : assert(value);
2383 :
2384 33 : for (;;) {
2385 56 : _cleanup_free_ char *n = NULL;
2386 : CGroupController v;
2387 : int r;
2388 :
2389 56 : r = extract_first_word(&value, &n, NULL, 0);
2390 56 : if (r < 0)
2391 0 : return r;
2392 56 : if (r == 0)
2393 23 : break;
2394 :
2395 33 : v = cgroup_controller_from_string(n);
2396 33 : if (v < 0)
2397 2 : continue;
2398 :
2399 31 : m |= CGROUP_CONTROLLER_TO_MASK(v);
2400 : }
2401 :
2402 23 : *ret = m;
2403 23 : return 0;
2404 : }
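
/* Editorial example (not part of the original source): a round trip through the two
 * helpers above. Serializing a mask and parsing it back yields the original mask, modulo
 * controller names the parser does not recognize (those are silently skipped). */
_unused_ static void example_mask_round_trip(void) {
        _cleanup_free_ char *s = NULL;
        CGroupMask m = CGROUP_CONTROLLER_TO_MASK(CGROUP_CONTROLLER_MEMORY) |
                       CGROUP_CONTROLLER_TO_MASK(CGROUP_CONTROLLER_PIDS);
        CGroupMask parsed;

        assert_se(cg_mask_to_string(m, &s) == 0);
        /* s is now "memory pids" (ordering follows the controller enum). */

        assert_se(cg_mask_from_string(s, &parsed) == 0);
        assert_se(parsed == m);
}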
2405 :
2406 48 : int cg_mask_supported(CGroupMask *ret) {
2407 : CGroupMask mask;
2408 : int r;
2409 :
2410 : /* Determines the mask of supported cgroup controllers. Only includes controllers we can make sense of and that
2411 : * are actually accessible. Only covers real controllers, i.e. not the CGROUP_CONTROLLER_BPF_xyz
2412 : * pseudo-controllers. */
2413 :
2414 48 : r = cg_all_unified();
2415 48 : if (r < 0)
2416 0 : return r;
2417 48 : if (r > 0) {
2418 0 : _cleanup_free_ char *root = NULL, *controllers = NULL, *path = NULL;
2419 :
2420 : /* In the unified hierarchy we can read the supported
2421 : * and accessible controllers from the top-level
2422 : * cgroup attribute */
2423 :
2424 0 : r = cg_get_root_path(&root);
2425 0 : if (r < 0)
2426 0 : return r;
2427 :
2428 0 : r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, root, "cgroup.controllers", &path);
2429 0 : if (r < 0)
2430 0 : return r;
2431 :
2432 0 : r = read_one_line_file(path, &controllers);
2433 0 : if (r < 0)
2434 0 : return r;
2435 :
2436 0 : r = cg_mask_from_string(controllers, &mask);
2437 0 : if (r < 0)
2438 0 : return r;
2439 :
2440 : /* Currently, we support the cpu, memory, io and pids controllers in the unified hierarchy; mask
2441 : * everything else off. */
2442 0 : mask &= CGROUP_MASK_V2;
2443 :
2444 : } else {
2445 : CGroupController c;
2446 :
2447 : /* In the legacy hierarchy, we check which hierarchies are mounted. */
2448 :
2449 48 : mask = 0;
2450 480 : for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
2451 432 : CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
2452 : const char *n;
2453 :
2454 432 : if (!FLAGS_SET(CGROUP_MASK_V1, bit))
2455 144 : continue;
2456 :
2457 288 : n = cgroup_controller_to_string(c);
2458 288 : if (controller_is_accessible(n) >= 0)
2459 288 : mask |= bit;
2460 : }
2461 : }
2462 :
2463 48 : *ret = mask;
2464 48 : return 0;
2465 : }
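
/* Editorial example (not part of the original source): querying the supported controller
 * mask at startup and logging it with cg_mask_to_string() from above; strna() is the
 * usual string-util.h fallback for the NULL returned for an empty mask. */
_unused_ static void example_log_supported_controllers(void) {
        _cleanup_free_ char *s = NULL;
        CGroupMask supported;

        if (cg_mask_supported(&supported) < 0)
                return;

        (void) cg_mask_to_string(supported, &s);
        log_debug("Supported cgroup controllers: %s", strna(s));
}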
2466 :
2467 0 : int cg_kernel_controllers(Set **ret) {
2468 0 : _cleanup_set_free_free_ Set *controllers = NULL;
2469 0 : _cleanup_fclose_ FILE *f = NULL;
2470 : int r;
2471 :
2472 0 : assert(ret);
2473 :
2474 : /* Determines the full list of kernel-known controllers. Might include controllers we don't actually support
2475 : * and controllers that aren't currently accessible (because not mounted). This does not include "name="
2476 : * pseudo-controllers. */
2477 :
2478 0 : controllers = set_new(&string_hash_ops);
2479 0 : if (!controllers)
2480 0 : return -ENOMEM;
2481 :
2482 0 : r = fopen_unlocked("/proc/cgroups", "re", &f);
2483 0 : if (r == -ENOENT) {
2484 0 : *ret = NULL;
2485 0 : return 0;
2486 : }
2487 0 : if (r < 0)
2488 0 : return r;
2489 :
2490 : /* Ignore the header line */
2491 0 : (void) read_line(f, (size_t) -1, NULL);
2492 :
2493 0 : for (;;) {
2494 : char *controller;
2495 0 : int enabled = 0;
2496 :
2497 0 : errno = 0;
2498 0 : if (fscanf(f, "%ms %*i %*i %i", &controller, &enabled) != 2) {
2499 :
2500 0 : if (feof(f))
2501 0 : break;
2502 :
2503 0 : if (ferror(f))
2504 0 : return errno_or_else(EIO);
2505 :
2506 0 : return -EBADMSG;
2507 : }
2508 :
2509 0 : if (!enabled) {
2510 0 : free(controller);
2511 0 : continue;
2512 : }
2513 :
2514 0 : if (!cg_controller_is_valid(controller)) {
2515 0 : free(controller);
2516 0 : return -EBADMSG;
2517 : }
2518 :
2519 0 : r = set_consume(controllers, controller);
2520 0 : if (r < 0)
2521 0 : return r;
2522 : }
2523 :
2524 0 : *ret = TAKE_PTR(controllers);
2525 :
2526 0 : return 0;
2527 : }
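
/* Editorial example (not part of the original source): iterating the kernel controller
 * set returned above, using the same SET_FOREACH()/Iterator pattern that
 * cg_attach_many_everywhere() uses earlier in this file. A NULL set (no /proc/cgroups)
 * simply iterates zero times. */
_unused_ static void example_list_kernel_controllers(void) {
        _cleanup_set_free_free_ Set *controllers = NULL;
        Iterator i;
        char *c;

        if (cg_kernel_controllers(&controllers) < 0)
                return;

        SET_FOREACH(c, controllers, i)
                log_debug("Kernel knows cgroup controller: %s", c);
}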
2528 :
2529 : static thread_local CGroupUnified unified_cache = CGROUP_UNIFIED_UNKNOWN;
2530 :
2531 : /* The hybrid mode was initially implemented in v232 and simply mounted cgroup2 on /sys/fs/cgroup/systemd. This
2532 : * unfortunately broke other tools (such as docker) which expected the v1 "name=systemd" hierarchy on
2533 : * /sys/fs/cgroup/systemd. From v233 on, the hybrid mode mounts v2 on /sys/fs/cgroup/unified and maintains the
2534 : * "name=systemd" hierarchy on /sys/fs/cgroup/systemd for compatibility with other tools.
2535 : *
2536 : * To keep live upgrade working, we detect and support v232 layout. When v232 layout is detected, to keep cgroup v2
2537 : * process management but disable the compat dual layout, we return %true on
2538 : * cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) and %false on cg_hybrid_unified().
2539 : */
2540 : static thread_local bool unified_systemd_v232;
2541 :
2542 3467 : static int cg_unified_update(void) {
2543 :
2544 : struct statfs fs;
2545 :
2546 : /* Checks if we support the unified hierarchy. Returns an
2547 : * error when the cgroup hierarchies aren't mounted yet or we
2548 : * have any other trouble determining if the unified hierarchy
2549 : * is supported. */
2550 :
2551 3467 : if (unified_cache >= CGROUP_UNIFIED_NONE)
2552 3440 : return 0;
2553 :
2554 27 : if (statfs("/sys/fs/cgroup/", &fs) < 0)
2555 0 : return log_debug_errno(errno, "statfs(\"/sys/fs/cgroup/\") failed: %m");
2556 :
2557 27 : if (F_TYPE_EQUAL(fs.f_type, CGROUP2_SUPER_MAGIC)) {
2558 0 : log_debug("Found cgroup2 on /sys/fs/cgroup/, full unified hierarchy");
2559 0 : unified_cache = CGROUP_UNIFIED_ALL;
2560 27 : } else if (F_TYPE_EQUAL(fs.f_type, TMPFS_MAGIC)) {
2561 27 : if (statfs("/sys/fs/cgroup/unified/", &fs) == 0 &&
2562 27 : F_TYPE_EQUAL(fs.f_type, CGROUP2_SUPER_MAGIC)) {
2563 27 : log_debug("Found cgroup2 on /sys/fs/cgroup/unified, unified hierarchy for systemd controller");
2564 27 : unified_cache = CGROUP_UNIFIED_SYSTEMD;
2565 27 : unified_systemd_v232 = false;
2566 : } else {
2567 0 : if (statfs("/sys/fs/cgroup/systemd/", &fs) < 0)
2568 0 : return log_debug_errno(errno, "statfs(\"/sys/fs/cgroup/systemd\") failed: %m");
2569 :
2570 0 : if (F_TYPE_EQUAL(fs.f_type, CGROUP2_SUPER_MAGIC)) {
2571 0 : log_debug("Found cgroup2 on /sys/fs/cgroup/systemd, unified hierarchy for systemd controller (v232 variant)");
2572 0 : unified_cache = CGROUP_UNIFIED_SYSTEMD;
2573 0 : unified_systemd_v232 = true;
2574 0 : } else if (F_TYPE_EQUAL(fs.f_type, CGROUP_SUPER_MAGIC)) {
2575 0 : log_debug("Found cgroup on /sys/fs/cgroup/systemd, legacy hierarchy");
2576 0 : unified_cache = CGROUP_UNIFIED_NONE;
2577 : } else {
2578 0 : log_debug("Unexpected filesystem type %llx mounted on /sys/fs/cgroup/systemd, assuming legacy hierarchy",
2579 : (unsigned long long) fs.f_type);
2580 0 : unified_cache = CGROUP_UNIFIED_NONE;
2581 : }
2582 : }
2583 : } else
2584 0 : return log_debug_errno(SYNTHETIC_ERRNO(ENOMEDIUM),
2585 : "Unknown filesystem type %llx mounted on /sys/fs/cgroup.",
2586 : (unsigned long long)fs.f_type);
2587 :
2588 27 : return 0;
2589 : }
2590 :
2591 2714 : int cg_unified_controller(const char *controller) {
2592 : int r;
2593 :
2594 2714 : r = cg_unified_update();
2595 2714 : if (r < 0)
2596 0 : return r;
2597 :
2598 2714 : if (unified_cache == CGROUP_UNIFIED_NONE)
2599 0 : return false;
2600 :
2601 2714 : if (unified_cache >= CGROUP_UNIFIED_ALL)
2602 0 : return true;
2603 :
2604 2714 : return streq_ptr(controller, SYSTEMD_CGROUP_CONTROLLER);
2605 : }
2606 :
2607 610 : int cg_all_unified(void) {
2608 : int r;
2609 :
2610 610 : r = cg_unified_update();
2611 610 : if (r < 0)
2612 0 : return r;
2613 :
2614 610 : return unified_cache >= CGROUP_UNIFIED_ALL;
2615 : }
2616 :
2617 126 : int cg_hybrid_unified(void) {
2618 : int r;
2619 :
2620 126 : r = cg_unified_update();
2621 126 : if (r < 0)
2622 0 : return r;
2623 :
2624 126 : return unified_cache == CGROUP_UNIFIED_SYSTEMD && !unified_systemd_v232;
2625 : }
2626 :
2627 17 : int cg_unified_flush(void) {
2628 17 : unified_cache = CGROUP_UNIFIED_UNKNOWN;
2629 :
2630 17 : return cg_unified_update();
2631 : }
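
/* Editorial example (not part of the original source): branching on the detected
 * hierarchy layout with the helpers above. The three cases mirror the full-unified,
 * hybrid and legacy setups described in the comment before cg_unified_update(). */
_unused_ static void example_report_hierarchy(void) {
        int r;

        r = cg_all_unified();
        if (r < 0) {
                log_debug_errno(r, "Failed to determine cgroup hierarchy: %m");
                return;
        }
        if (r > 0) {
                log_debug("Running on the full unified (cgroup v2) hierarchy.");
                return;
        }

        if (cg_hybrid_unified() > 0)
                log_debug("Running on the hybrid hierarchy (v2 for systemd, v1 controllers).");
        else
                log_debug("Running on the legacy (cgroup v1) hierarchy.");
}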
2632 :
2633 6 : int cg_enable_everywhere(
2634 : CGroupMask supported,
2635 : CGroupMask mask,
2636 : const char *p,
2637 : CGroupMask *ret_result_mask) {
2638 :
2639 6 : _cleanup_fclose_ FILE *f = NULL;
2640 6 : _cleanup_free_ char *fs = NULL;
2641 : CGroupController c;
2642 6 : CGroupMask ret = 0;
2643 : int r;
2644 :
2645 6 : assert(p);
2646 :
2647 6 : if (supported == 0) {
2648 0 : if (ret_result_mask)
2649 0 : *ret_result_mask = 0;
2650 0 : return 0;
2651 : }
2652 :
2653 6 : r = cg_all_unified();
2654 6 : if (r < 0)
2655 0 : return r;
2656 6 : if (r == 0) {
2657 : /* On the legacy hierarchy there's no concept of "enabling" controllers in defined cgroups. Let's claim
2658 : * complete success right away. (If you wonder why we return the full mask here, rather than zero: the
2659 : * caller tends to use the returned mask later on to compare if all controllers were properly joined,
2660 : * and if not requeues realization. This use is the primary purpose of the return value, hence let's
2661 : * minimize surprises here and reduce triggers for re-realization by always saying we fully
2662 : * succeeded.) */
2663 6 : if (ret_result_mask)
2664 6 : *ret_result_mask = mask & supported & CGROUP_MASK_V2; /* If you wonder why we mask this with
2665 : * CGROUP_MASK_V2: The 'supported' mask
2666 : * might contain pure-V1 or BPF
2667 : * controllers, and we never want to
2668 : * claim that we could enable those with
2669 : * cgroup.subtree_control */
2670 6 : return 0;
2671 : }
2672 :
2673 0 : r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, p, "cgroup.subtree_control", &fs);
2674 0 : if (r < 0)
2675 0 : return r;
2676 :
2677 0 : for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
2678 0 : CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
2679 : const char *n;
2680 :
2681 0 : if (!FLAGS_SET(CGROUP_MASK_V2, bit))
2682 0 : continue;
2683 :
2684 0 : if (!FLAGS_SET(supported, bit))
2685 0 : continue;
2686 :
2687 0 : n = cgroup_controller_to_string(c);
2688 0 : {
2689 0 : char s[1 + strlen(n) + 1];
2690 :
2691 0 : s[0] = FLAGS_SET(mask, bit) ? '+' : '-';
2692 0 : strcpy(s + 1, n);
2693 :
2694 0 : if (!f) {
2695 0 : f = fopen(fs, "we");
2696 0 : if (!f)
2697 0 : return log_debug_errno(errno, "Failed to open cgroup.subtree_control file of %s: %m", p);
2698 : }
2699 :
2700 0 : r = write_string_stream(f, s, WRITE_STRING_FILE_DISABLE_BUFFER);
2701 0 : if (r < 0) {
2702 0 : log_debug_errno(r, "Failed to %s controller %s for %s (%s): %m",
2703 : FLAGS_SET(mask, bit) ? "enable" : "disable", n, p, fs);
2704 0 : clearerr(f);
2705 :
2706 : /* If we can't turn off a controller, leave it on in the reported resulting mask. This
2707 : * happens for example when we attempt to turn off a controller up in the tree that is
2708 : * used down in the tree. */
2709 0 : if (!FLAGS_SET(mask, bit) && r == -EBUSY) /* You might wonder why we check for EBUSY
2710 : * only here, and not follow the same logic
2711 : * for other errors such as EINVAL or
2712 : * EOPNOTSUPP or anything else. That's
2713 : * because EBUSY indicates that the
2714 : * controller is currently enabled and
2715 : * cannot be disabled because something down
2716 : * the hierarchy is still using it. Any other
2717 : * error most likely means something like "I
2718 : * never heard of this controller" or
2719 : * similar. In the former case it's hence
2720 : * safe to assume the controller is still on
2721 : * after the failed operation, while in the
2722 : * latter case it's safer to assume the
2723 : * controller is unknown and hence certainly
2724 : * not enabled. */
2725 0 : ret |= bit;
2726 : } else {
2727 : /* Otherwise, if we managed to turn on a controller, set the bit reflecting that. */
2728 0 : if (FLAGS_SET(mask, bit))
2729 0 : ret |= bit;
2730 : }
2731 : }
2732 : }
2733 :
2734 : /* Let's return the precise set of controllers now enabled for the cgroup. */
2735 0 : if (ret_result_mask)
2736 0 : *ret_result_mask = ret;
2737 :
2738 0 : return 0;
2739 : }
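
/* Editorial example (not part of the original source): delegating the memory and pids
 * controllers to a subtree. On a pure v1 system this is a no-op that reports success, as
 * explained in the comment inside cg_enable_everywhere(); the path is whatever cgroup the
 * caller manages. */
_unused_ static int example_enable_memory_and_pids(const char *path) {
        CGroupMask supported, want, got;
        int r;

        r = cg_mask_supported(&supported);
        if (r < 0)
                return r;

        want = CGROUP_CONTROLLER_TO_MASK(CGROUP_CONTROLLER_MEMORY) |
               CGROUP_CONTROLLER_TO_MASK(CGROUP_CONTROLLER_PIDS);

        r = cg_enable_everywhere(supported, want, path, &got);
        if (r < 0)
                return r;

        /* 'got' reports what is enabled now; it may also contain bits we tried to turn off
         * but that are still busy further down the tree. */
        return FLAGS_SET(got, want & supported & CGROUP_MASK_V2);
}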
2740 :
2741 8 : bool cg_is_unified_wanted(void) {
2742 : static thread_local int wanted = -1;
2743 : int r;
2744 : bool b;
2745 8 : const bool is_default = DEFAULT_HIERARCHY == CGROUP_UNIFIED_ALL;
2746 8 : _cleanup_free_ char *c = NULL;
2747 :
2748 : /* If we have a cached value, return that. */
2749 8 : if (wanted >= 0)
2750 7 : return wanted;
2751 :
2752 : /* If the hierarchy is already mounted, then follow whatever
2753 : * was chosen for it. */
2754 1 : if (cg_unified_flush() >= 0)
2755 1 : return (wanted = unified_cache >= CGROUP_UNIFIED_ALL);
2756 :
2757 : /* If we were explicitly passed systemd.unified_cgroup_hierarchy,
2758 : * respect that. */
2759 0 : r = proc_cmdline_get_bool("systemd.unified_cgroup_hierarchy", &b);
2760 0 : if (r > 0)
2761 0 : return (wanted = b);
2762 :
2763 : /* If we passed cgroup_no_v1=all with no other instructions, it seems
2764 : * highly unlikely that we want to use hybrid or legacy hierarchy. */
2765 0 : r = proc_cmdline_get_key("cgroup_no_v1", 0, &c);
2766 0 : if (r > 0 && streq_ptr(c, "all"))
2767 0 : return (wanted = true);
2768 :
2769 0 : return (wanted = is_default);
2770 : }
2771 :
2772 8 : bool cg_is_legacy_wanted(void) {
2773 : static thread_local int wanted = -1;
2774 :
2775 : /* If we have a cached value, return that. */
2776 8 : if (wanted >= 0)
2777 7 : return wanted;
2778 :
2779 : /* Check if we have cgroup v2 already mounted. */
2780 1 : if (cg_unified_flush() >= 0 &&
2781 1 : unified_cache == CGROUP_UNIFIED_ALL)
2782 0 : return (wanted = false);
2783 :
2784 : /* Otherwise, assume that at least partial legacy is wanted,
2785 : * since cgroup v2 should already be mounted at this point. */
2786 1 : return (wanted = true);
2787 : }
2788 :
2789 8 : bool cg_is_hybrid_wanted(void) {
2790 : static thread_local int wanted = -1;
2791 : int r;
2792 : bool b;
2793 8 : const bool is_default = DEFAULT_HIERARCHY >= CGROUP_UNIFIED_SYSTEMD;
2794 : /* We default to true if the default is "hybrid", obviously,
2795 : * but also when the default is "unified", because if we get
2796 : * called, it means that unified hierarchy was not mounted. */
2797 :
2798 : /* If we have a cached value, return that. */
2799 8 : if (wanted >= 0)
2800 7 : return wanted;
2801 :
2802 : /* If the hierarchy is already mounted, then follow whatever
2803 : * was chosen for it. */
2804 1 : if (cg_unified_flush() >= 0 &&
2805 1 : unified_cache == CGROUP_UNIFIED_ALL)
2806 0 : return (wanted = false);
2807 :
2808 : /* Otherwise, let's see what the kernel command line has to say.
2809 : * Since checking is expensive, cache a non-error result. */
2810 1 : r = proc_cmdline_get_bool("systemd.legacy_systemd_cgroup_controller", &b);
2811 :
2812 : /* The meaning of the kernel option is reversed wrt. to the return value
2813 : * of this function, hence the negation. */
2814 1 : return (wanted = r > 0 ? !b : is_default);
2815 : }
2816 :
2817 2 : int cg_weight_parse(const char *s, uint64_t *ret) {
2818 : uint64_t u;
2819 : int r;
2820 :
2821 2 : if (isempty(s)) {
2822 0 : *ret = CGROUP_WEIGHT_INVALID;
2823 0 : return 0;
2824 : }
2825 :
2826 2 : r = safe_atou64(s, &u);
2827 2 : if (r < 0)
2828 0 : return r;
2829 :
2830 2 : if (u < CGROUP_WEIGHT_MIN || u > CGROUP_WEIGHT_MAX)
2831 0 : return -ERANGE;
2832 :
2833 2 : *ret = u;
2834 2 : return 0;
2835 : }
2836 :
2837 : const uint64_t cgroup_io_limit_defaults[_CGROUP_IO_LIMIT_TYPE_MAX] = {
2838 : [CGROUP_IO_RBPS_MAX] = CGROUP_LIMIT_MAX,
2839 : [CGROUP_IO_WBPS_MAX] = CGROUP_LIMIT_MAX,
2840 : [CGROUP_IO_RIOPS_MAX] = CGROUP_LIMIT_MAX,
2841 : [CGROUP_IO_WIOPS_MAX] = CGROUP_LIMIT_MAX,
2842 : };
2843 :
2844 : static const char* const cgroup_io_limit_type_table[_CGROUP_IO_LIMIT_TYPE_MAX] = {
2845 : [CGROUP_IO_RBPS_MAX] = "IOReadBandwidthMax",
2846 : [CGROUP_IO_WBPS_MAX] = "IOWriteBandwidthMax",
2847 : [CGROUP_IO_RIOPS_MAX] = "IOReadIOPSMax",
2848 : [CGROUP_IO_WIOPS_MAX] = "IOWriteIOPSMax",
2849 : };
2850 :
2851 12 : DEFINE_STRING_TABLE_LOOKUP(cgroup_io_limit_type, CGroupIOLimitType);
2852 :
2853 1 : int cg_cpu_shares_parse(const char *s, uint64_t *ret) {
2854 : uint64_t u;
2855 : int r;
2856 :
2857 1 : if (isempty(s)) {
2858 0 : *ret = CGROUP_CPU_SHARES_INVALID;
2859 0 : return 0;
2860 : }
2861 :
2862 1 : r = safe_atou64(s, &u);
2863 1 : if (r < 0)
2864 0 : return r;
2865 :
2866 1 : if (u < CGROUP_CPU_SHARES_MIN || u > CGROUP_CPU_SHARES_MAX)
2867 0 : return -ERANGE;
2868 :
2869 1 : *ret = u;
2870 1 : return 0;
2871 : }
2872 :
2873 0 : int cg_blkio_weight_parse(const char *s, uint64_t *ret) {
2874 : uint64_t u;
2875 : int r;
2876 :
2877 0 : if (isempty(s)) {
2878 0 : *ret = CGROUP_BLKIO_WEIGHT_INVALID;
2879 0 : return 0;
2880 : }
2881 :
2882 0 : r = safe_atou64(s, &u);
2883 0 : if (r < 0)
2884 0 : return r;
2885 :
2886 0 : if (u < CGROUP_BLKIO_WEIGHT_MIN || u > CGROUP_BLKIO_WEIGHT_MAX)
2887 0 : return -ERANGE;
2888 :
2889 0 : *ret = u;
2890 0 : return 0;
2891 : }
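
/* Editorial example (not part of the original source): the three parsers above share the
 * same shape. An empty string maps to the respective *_INVALID sentinel, and out-of-range
 * values yield -ERANGE. */
_unused_ static void example_parse_weight(void) {
        uint64_t w;

        assert_se(cg_weight_parse("100", &w) == 0 && w == 100);
        assert_se(cg_weight_parse("", &w) == 0 && w == CGROUP_WEIGHT_INVALID);
        assert_se(cg_weight_parse("999999999", &w) == -ERANGE);
}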
2892 :
2893 2 : bool is_cgroup_fs(const struct statfs *s) {
2894 2 : return is_fs_type(s, CGROUP_SUPER_MAGIC) ||
2895 0 : is_fs_type(s, CGROUP2_SUPER_MAGIC);
2896 : }
2897 :
2898 1 : bool fd_is_cgroup_fs(int fd) {
2899 : struct statfs s;
2900 :
2901 1 : if (fstatfs(fd, &s) < 0)
2902 0 : return false; /* on error, report "not a cgroup fs" instead of coercing -errno to bool */
2903 :
2904 1 : return is_cgroup_fs(&s);
2905 : }
2906 :
2907 : static const char *const cgroup_controller_table[_CGROUP_CONTROLLER_MAX] = {
2908 : [CGROUP_CONTROLLER_CPU] = "cpu",
2909 : [CGROUP_CONTROLLER_CPUACCT] = "cpuacct",
2910 : [CGROUP_CONTROLLER_IO] = "io",
2911 : [CGROUP_CONTROLLER_BLKIO] = "blkio",
2912 : [CGROUP_CONTROLLER_MEMORY] = "memory",
2913 : [CGROUP_CONTROLLER_DEVICES] = "devices",
2914 : [CGROUP_CONTROLLER_PIDS] = "pids",
2915 : [CGROUP_CONTROLLER_BPF_FIREWALL] = "bpf-firewall",
2916 : [CGROUP_CONTROLLER_BPF_DEVICES] = "bpf-devices",
2917 : };
2918 :
2919 1112 : DEFINE_STRING_TABLE_LOOKUP(cgroup_controller, CGroupController);
2920 :
2921 12 : CGroupMask get_cpu_accounting_mask(void) {
2922 : static CGroupMask needed_mask = (CGroupMask) -1;
2923 :
2924 : /* On kernel ≥4.15 with unified hierarchy, cpu.stat's usage_usec is
2925 : * provided externally from the CPU controller, which means we don't
2926 : * need to enable the CPU controller just to get metrics. This is good,
2927 : * because enabling the CPU controller comes at a minor performance
2928 : * hit, especially when it's propagated deep into large hierarchies.
2929 : * There's also no separate CPU accounting controller available within
2930 : * a unified hierarchy.
2931 : *
2932 : * This combination of factors results in the desired cgroup mask to
2933 : * enable for CPU accounting varying as follows:
2934 : *
2935 : * ╔═════════════════════╤═════════════════════╗
2936 : * ║ Linux ≥4.15 │ Linux <4.15 ║
2937 : * ╔═══════════════╬═════════════════════╪═════════════════════╣
2938 : * ║ Unified ║ nothing │ CGROUP_MASK_CPU ║
2939 : * ╟───────────────╫─────────────────────┼─────────────────────╢
2940 : * ║ Hybrid/Legacy ║ CGROUP_MASK_CPUACCT │ CGROUP_MASK_CPUACCT ║
2941 : * ╚═══════════════╩═════════════════════╧═════════════════════╝
2942 : *
2943 : * We check kernel version here instead of manually checking whether
2944 : * cpu.stat is present for every cgroup, as that check in itself would
2945 : * already be fairly expensive.
2946 : *
2947 : * Kernels where this patch has been backported will therefore have the
2948 : * CPU controller enabled unnecessarily. This is more expensive than
2949 : * necessary, but harmless. ☺️
2950 : */
2951 :
2952 12 : if (needed_mask == (CGroupMask) -1) {
2953 2 : if (cg_all_unified()) {
2954 : struct utsname u;
2955 0 : assert_se(uname(&u) >= 0);
2956 :
2957 0 : if (str_verscmp(u.release, "4.15") < 0)
2958 0 : needed_mask = CGROUP_MASK_CPU;
2959 : else
2960 0 : needed_mask = 0;
2961 : } else
2962 2 : needed_mask = CGROUP_MASK_CPUACCT;
2963 : }
2964 :
2965 12 : return needed_mask;
2966 : }
2967 :
2968 0 : bool cpu_accounting_is_cheap(void) {
2969 0 : return get_cpu_accounting_mask() == 0;
2970 : }
|