Line data Source code
1 : /* SPDX-License-Identifier: LGPL-2.1+ */
2 :
3 : #include <fcntl.h>
4 : #include <fnmatch.h>
5 :
6 : #include "sd-messages.h"
7 :
8 : #include "alloc-util.h"
9 : #include "blockdev-util.h"
10 : #include "bpf-devices.h"
11 : #include "bpf-firewall.h"
12 : #include "btrfs-util.h"
13 : #include "bus-error.h"
14 : #include "cgroup-util.h"
15 : #include "cgroup.h"
16 : #include "fd-util.h"
17 : #include "fileio.h"
18 : #include "fs-util.h"
19 : #include "nulstr-util.h"
20 : #include "parse-util.h"
21 : #include "path-util.h"
22 : #include "process-util.h"
23 : #include "procfs-util.h"
24 : #include "special.h"
25 : #include "stat-util.h"
26 : #include "stdio-util.h"
27 : #include "string-table.h"
28 : #include "string-util.h"
29 : #include "virt.h"
30 :
31 : #define CGROUP_CPU_QUOTA_DEFAULT_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)
32 :
33 : /* Returns the log level to use when cgroup attribute writes fail. When an attribute is missing or we have access
34 : * problems we downgrade to LOG_DEBUG. This is supposed to be nice to container managers and kernels which want to mask
35 : * out specific attributes from us. */
36 : #define LOG_LEVEL_CGROUP_WRITE(r) (IN_SET(abs(r), ENOENT, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING)
37 :
38 29 : bool manager_owns_host_root_cgroup(Manager *m) {
39 29 : assert(m);
40 :
41 : /* Returns true if we are managing the root cgroup. Note that it isn't sufficient to just check whether the
42 : * cgroup root path equals "/" since that will also be the case if CLONE_NEWCGROUP is in the mix. Since there
43 : * appears to be no nice way to detect whether we are in a CLONE_NEWCGROUP namespace, we instead just check if
44 : * we run in any kind of container virtualization. */
45 :
46 29 : if (MANAGER_IS_USER(m))
47 29 : return false;
48 :
49 0 : if (detect_container() > 0)
50 0 : return false;
51 :
52 0 : return empty_or_root(m->cgroup_root);
53 : }
54 :
55 18 : bool unit_has_host_root_cgroup(Unit *u) {
56 18 : assert(u);
57 :
58 : /* Returns whether this unit manages the root cgroup. This will return true if this unit is the root slice and
59 : * the manager manages the root cgroup. */
60 :
61 18 : if (!manager_owns_host_root_cgroup(u->manager))
62 18 : return false;
63 :
64 0 : return unit_has_name(u, SPECIAL_ROOT_SLICE);
65 : }
66 :
67 0 : static int set_attribute_and_warn(Unit *u, const char *controller, const char *attribute, const char *value) {
68 : int r;
69 :
70 0 : r = cg_set_attribute(controller, u->cgroup_path, attribute, value);
71 0 : if (r < 0)
72 0 : log_unit_full(u, LOG_LEVEL_CGROUP_WRITE(r), r, "Failed to set '%s' attribute on '%s' to '%.*s': %m",
73 : strna(attribute), isempty(u->cgroup_path) ? "/" : u->cgroup_path, (int) strcspn(value, NEWLINE), value);
74 :
75 0 : return r;
76 : }
77 :
78 0 : static void cgroup_compat_warn(void) {
79 : static bool cgroup_compat_warned = false;
80 :
81 0 : if (cgroup_compat_warned)
82 0 : return;
83 :
84 0 : log_warning("cgroup compatibility translation between legacy and unified hierarchy settings activated. "
85 : "See cgroup-compat debug messages for details.");
86 :
87 0 : cgroup_compat_warned = true;
88 : }
89 :
90 : #define log_cgroup_compat(unit, fmt, ...) do { \
91 : cgroup_compat_warn(); \
92 : log_unit_debug(unit, "cgroup-compat: " fmt, ##__VA_ARGS__); \
93 : } while (false)
94 :
95 589 : void cgroup_context_init(CGroupContext *c) {
96 589 : assert(c);
97 :
98 : /* Initialize everything to the kernel defaults. */
99 :
100 589 : *c = (CGroupContext) {
101 : .cpu_weight = CGROUP_WEIGHT_INVALID,
102 : .startup_cpu_weight = CGROUP_WEIGHT_INVALID,
103 : .cpu_quota_per_sec_usec = USEC_INFINITY,
104 : .cpu_quota_period_usec = USEC_INFINITY,
105 :
106 : .cpu_shares = CGROUP_CPU_SHARES_INVALID,
107 : .startup_cpu_shares = CGROUP_CPU_SHARES_INVALID,
108 :
109 : .memory_high = CGROUP_LIMIT_MAX,
110 : .memory_max = CGROUP_LIMIT_MAX,
111 : .memory_swap_max = CGROUP_LIMIT_MAX,
112 :
113 : .memory_limit = CGROUP_LIMIT_MAX,
114 :
115 : .io_weight = CGROUP_WEIGHT_INVALID,
116 : .startup_io_weight = CGROUP_WEIGHT_INVALID,
117 :
118 : .blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID,
119 : .startup_blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID,
120 :
121 : .tasks_max = CGROUP_LIMIT_MAX,
122 : };
123 589 : }
124 :
125 0 : void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
126 0 : assert(c);
127 0 : assert(a);
128 :
129 0 : LIST_REMOVE(device_allow, c->device_allow, a);
130 0 : free(a->path);
131 0 : free(a);
132 0 : }
133 :
134 0 : void cgroup_context_free_io_device_weight(CGroupContext *c, CGroupIODeviceWeight *w) {
135 0 : assert(c);
136 0 : assert(w);
137 :
138 0 : LIST_REMOVE(device_weights, c->io_device_weights, w);
139 0 : free(w->path);
140 0 : free(w);
141 0 : }
142 :
143 0 : void cgroup_context_free_io_device_latency(CGroupContext *c, CGroupIODeviceLatency *l) {
144 0 : assert(c);
145 0 : assert(l);
146 :
147 0 : LIST_REMOVE(device_latencies, c->io_device_latencies, l);
148 0 : free(l->path);
149 0 : free(l);
150 0 : }
151 :
152 0 : void cgroup_context_free_io_device_limit(CGroupContext *c, CGroupIODeviceLimit *l) {
153 0 : assert(c);
154 0 : assert(l);
155 :
156 0 : LIST_REMOVE(device_limits, c->io_device_limits, l);
157 0 : free(l->path);
158 0 : free(l);
159 0 : }
160 :
161 0 : void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
162 0 : assert(c);
163 0 : assert(w);
164 :
165 0 : LIST_REMOVE(device_weights, c->blockio_device_weights, w);
166 0 : free(w->path);
167 0 : free(w);
168 0 : }
169 :
170 0 : void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
171 0 : assert(c);
172 0 : assert(b);
173 :
174 0 : LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
175 0 : free(b->path);
176 0 : free(b);
177 0 : }
178 :
179 589 : void cgroup_context_done(CGroupContext *c) {
180 589 : assert(c);
181 :
182 589 : while (c->io_device_weights)
183 0 : cgroup_context_free_io_device_weight(c, c->io_device_weights);
184 :
185 589 : while (c->io_device_latencies)
186 0 : cgroup_context_free_io_device_latency(c, c->io_device_latencies);
187 :
188 589 : while (c->io_device_limits)
189 0 : cgroup_context_free_io_device_limit(c, c->io_device_limits);
190 :
191 589 : while (c->blockio_device_weights)
192 0 : cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);
193 :
194 589 : while (c->blockio_device_bandwidths)
195 0 : cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);
196 :
197 589 : while (c->device_allow)
198 0 : cgroup_context_free_device_allow(c, c->device_allow);
199 :
200 589 : c->ip_address_allow = ip_address_access_free_all(c->ip_address_allow);
201 589 : c->ip_address_deny = ip_address_access_free_all(c->ip_address_deny);
202 :
203 589 : c->ip_filters_ingress = strv_free(c->ip_filters_ingress);
204 589 : c->ip_filters_egress = strv_free(c->ip_filters_egress);
205 589 : }
206 :
207 182 : void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
208 182 : _cleanup_free_ char *disable_controllers_str = NULL;
209 : CGroupIODeviceLimit *il;
210 : CGroupIODeviceWeight *iw;
211 : CGroupIODeviceLatency *l;
212 : CGroupBlockIODeviceBandwidth *b;
213 : CGroupBlockIODeviceWeight *w;
214 : CGroupDeviceAllow *a;
215 : IPAddressAccessItem *iaai;
216 : char **path;
217 : char u[FORMAT_TIMESPAN_MAX];
218 : char v[FORMAT_TIMESPAN_MAX];
219 :
220 182 : assert(c);
221 182 : assert(f);
222 :
223 182 : prefix = strempty(prefix);
224 :
225 182 : (void) cg_mask_to_string(c->disable_controllers, &disable_controllers_str);
226 :
227 364 : fprintf(f,
228 : "%sCPUAccounting=%s\n"
229 : "%sIOAccounting=%s\n"
230 : "%sBlockIOAccounting=%s\n"
231 : "%sMemoryAccounting=%s\n"
232 : "%sTasksAccounting=%s\n"
233 : "%sIPAccounting=%s\n"
234 : "%sCPUWeight=%" PRIu64 "\n"
235 : "%sStartupCPUWeight=%" PRIu64 "\n"
236 : "%sCPUShares=%" PRIu64 "\n"
237 : "%sStartupCPUShares=%" PRIu64 "\n"
238 : "%sCPUQuotaPerSecSec=%s\n"
239 : "%sCPUQuotaPeriodSec=%s\n"
240 : "%sIOWeight=%" PRIu64 "\n"
241 : "%sStartupIOWeight=%" PRIu64 "\n"
242 : "%sBlockIOWeight=%" PRIu64 "\n"
243 : "%sStartupBlockIOWeight=%" PRIu64 "\n"
244 : "%sDefaultMemoryMin=%" PRIu64 "\n"
245 : "%sDefaultMemoryLow=%" PRIu64 "\n"
246 : "%sMemoryMin=%" PRIu64 "\n"
247 : "%sMemoryLow=%" PRIu64 "\n"
248 : "%sMemoryHigh=%" PRIu64 "\n"
249 : "%sMemoryMax=%" PRIu64 "\n"
250 : "%sMemorySwapMax=%" PRIu64 "\n"
251 : "%sMemoryLimit=%" PRIu64 "\n"
252 : "%sTasksMax=%" PRIu64 "\n"
253 : "%sDevicePolicy=%s\n"
254 : "%sDisableControllers=%s\n"
255 : "%sDelegate=%s\n",
256 182 : prefix, yes_no(c->cpu_accounting),
257 182 : prefix, yes_no(c->io_accounting),
258 182 : prefix, yes_no(c->blockio_accounting),
259 182 : prefix, yes_no(c->memory_accounting),
260 182 : prefix, yes_no(c->tasks_accounting),
261 182 : prefix, yes_no(c->ip_accounting),
262 : prefix, c->cpu_weight,
263 : prefix, c->startup_cpu_weight,
264 : prefix, c->cpu_shares,
265 : prefix, c->startup_cpu_shares,
266 : prefix, format_timespan(u, sizeof(u), c->cpu_quota_per_sec_usec, 1),
267 : prefix, format_timespan(v, sizeof(v), c->cpu_quota_period_usec, 1),
268 : prefix, c->io_weight,
269 : prefix, c->startup_io_weight,
270 : prefix, c->blockio_weight,
271 : prefix, c->startup_blockio_weight,
272 : prefix, c->default_memory_min,
273 : prefix, c->default_memory_low,
274 : prefix, c->memory_min,
275 : prefix, c->memory_low,
276 : prefix, c->memory_high,
277 : prefix, c->memory_max,
278 : prefix, c->memory_swap_max,
279 : prefix, c->memory_limit,
280 : prefix, c->tasks_max,
281 : prefix, cgroup_device_policy_to_string(c->device_policy),
282 : prefix, strempty(disable_controllers_str),
283 182 : prefix, yes_no(c->delegate));
284 :
285 182 : if (c->delegate) {
286 0 : _cleanup_free_ char *t = NULL;
287 :
288 0 : (void) cg_mask_to_string(c->delegate_controllers, &t);
289 :
290 0 : fprintf(f, "%sDelegateControllers=%s\n",
291 : prefix,
292 : strempty(t));
293 : }
294 :
295 182 : LIST_FOREACH(device_allow, a, c->device_allow)
296 0 : fprintf(f,
297 : "%sDeviceAllow=%s %s%s%s\n",
298 : prefix,
299 : a->path,
300 0 : a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");
301 :
302 182 : LIST_FOREACH(device_weights, iw, c->io_device_weights)
303 0 : fprintf(f,
304 : "%sIODeviceWeight=%s %" PRIu64 "\n",
305 : prefix,
306 : iw->path,
307 : iw->weight);
308 :
309 182 : LIST_FOREACH(device_latencies, l, c->io_device_latencies)
310 0 : fprintf(f,
311 : "%sIODeviceLatencyTargetSec=%s %s\n",
312 : prefix,
313 : l->path,
314 : format_timespan(u, sizeof(u), l->target_usec, 1));
315 :
316 182 : LIST_FOREACH(device_limits, il, c->io_device_limits) {
317 : char buf[FORMAT_BYTES_MAX];
318 : CGroupIOLimitType type;
319 :
320 0 : for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
321 0 : if (il->limits[type] != cgroup_io_limit_defaults[type])
322 0 : fprintf(f,
323 : "%s%s=%s %s\n",
324 : prefix,
325 : cgroup_io_limit_type_to_string(type),
326 : il->path,
327 : format_bytes(buf, sizeof(buf), il->limits[type]));
328 : }
329 :
330 182 : LIST_FOREACH(device_weights, w, c->blockio_device_weights)
331 0 : fprintf(f,
332 : "%sBlockIODeviceWeight=%s %" PRIu64,
333 : prefix,
334 : w->path,
335 : w->weight);
336 :
337 182 : LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
338 : char buf[FORMAT_BYTES_MAX];
339 :
340 0 : if (b->rbps != CGROUP_LIMIT_MAX)
341 0 : fprintf(f,
342 : "%sBlockIOReadBandwidth=%s %s\n",
343 : prefix,
344 : b->path,
345 : format_bytes(buf, sizeof(buf), b->rbps));
346 0 : if (b->wbps != CGROUP_LIMIT_MAX)
347 0 : fprintf(f,
348 : "%sBlockIOWriteBandwidth=%s %s\n",
349 : prefix,
350 : b->path,
351 : format_bytes(buf, sizeof(buf), b->wbps));
352 : }
353 :
354 182 : LIST_FOREACH(items, iaai, c->ip_address_allow) {
355 0 : _cleanup_free_ char *k = NULL;
356 :
357 0 : (void) in_addr_to_string(iaai->family, &iaai->address, &k);
358 0 : fprintf(f, "%sIPAddressAllow=%s/%u\n", prefix, strnull(k), iaai->prefixlen);
359 : }
360 :
361 182 : LIST_FOREACH(items, iaai, c->ip_address_deny) {
362 0 : _cleanup_free_ char *k = NULL;
363 :
364 0 : (void) in_addr_to_string(iaai->family, &iaai->address, &k);
365 0 : fprintf(f, "%sIPAddressDeny=%s/%u\n", prefix, strnull(k), iaai->prefixlen);
366 : }
367 :
368 182 : STRV_FOREACH(path, c->ip_filters_ingress)
369 0 : fprintf(f, "%sIPIngressFilterPath=%s\n", prefix, *path);
370 :
371 182 : STRV_FOREACH(path, c->ip_filters_egress)
372 0 : fprintf(f, "%sIPEgressFilterPath=%s\n", prefix, *path);
373 182 : }
374 :
375 0 : int cgroup_add_device_allow(CGroupContext *c, const char *dev, const char *mode) {
376 0 : _cleanup_free_ CGroupDeviceAllow *a = NULL;
377 0 : _cleanup_free_ char *d = NULL;
378 :
379 0 : assert(c);
380 0 : assert(dev);
381 0 : assert(isempty(mode) || in_charset(mode, "rwm"));
382 :
383 0 : a = new(CGroupDeviceAllow, 1);
384 0 : if (!a)
385 0 : return -ENOMEM;
386 :
387 0 : d = strdup(dev);
388 0 : if (!d)
389 0 : return -ENOMEM;
390 :
391 0 : *a = (CGroupDeviceAllow) {
392 0 : .path = TAKE_PTR(d),
393 0 : .r = isempty(mode) || strchr(mode, 'r'),
394 0 : .w = isempty(mode) || strchr(mode, 'w'),
395 0 : .m = isempty(mode) || strchr(mode, 'm'),
396 : };
397 :
398 0 : LIST_PREPEND(device_allow, c->device_allow, a);
399 0 : TAKE_PTR(a);
400 :
401 0 : return 0;
402 : }
403 :
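/* Illustrative usage (editor's sketch, not part of the original file): entries added here are resolved
 * later by cgroup_context_apply(), which accepts plain /dev/ nodes as well as the "block-" and "char-"
 * prefixes that end up in whitelist_major():
 *
 *     (void) cgroup_add_device_allow(c, "/dev/null", "rwm");
 *     (void) cgroup_add_device_allow(c, "char-pts", "rw");
 *     (void) cgroup_add_device_allow(c, "block-loop", NULL);   // empty/NULL mode means "rwm"
 */
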
404 : #define UNIT_DEFINE_ANCESTOR_MEMORY_LOOKUP(entry) \
405 : uint64_t unit_get_ancestor_##entry(Unit *u) { \
406 : CGroupContext *c; \
407 : \
408 : /* 1. Is entry set in this unit? If so, use that. \
409 : * 2. Is the default for this entry set in any \
410 : * ancestor? If so, use that. \
411 : * 3. Otherwise, return CGROUP_LIMIT_MIN. */ \
412 : \
413 : assert(u); \
414 : \
415 : c = unit_get_cgroup_context(u); \
416 : if (c && c->entry##_set) \
417 : return c->entry; \
418 : \
419 : while ((u = UNIT_DEREF(u->slice))) { \
420 : c = unit_get_cgroup_context(u); \
421 : if (c && c->default_##entry##_set) \
422 : return c->default_##entry; \
423 : } \
424 : \
425 : /* We've reached the root, but nobody had default for \
426 : * this entry set, so set it to the kernel default. */ \
427 : return CGROUP_LIMIT_MIN; \
428 : }
429 :
430 136 : UNIT_DEFINE_ANCESTOR_MEMORY_LOOKUP(memory_low);
431 0 : UNIT_DEFINE_ANCESTOR_MEMORY_LOOKUP(memory_min);
432 :
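/* For reference (editor's sketch, not part of the original file): expanding the macro above for
 * "memory_low" yields roughly the following lookup, i.e. the unit's own MemoryLow= wins if set, otherwise
 * the closest ancestor's DefaultMemoryLow=, otherwise the kernel default CGROUP_LIMIT_MIN:
 *
 *     uint64_t unit_get_ancestor_memory_low(Unit *u) {
 *             CGroupContext *c;
 *
 *             assert(u);
 *
 *             c = unit_get_cgroup_context(u);
 *             if (c && c->memory_low_set)
 *                     return c->memory_low;
 *
 *             while ((u = UNIT_DEREF(u->slice))) {
 *                     c = unit_get_cgroup_context(u);
 *                     if (c && c->default_memory_low_set)
 *                             return c->default_memory_low;
 *             }
 *
 *             return CGROUP_LIMIT_MIN;
 *     }
 */
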
433 6 : static void cgroup_xattr_apply(Unit *u) {
434 : char ids[SD_ID128_STRING_MAX];
435 : int r;
436 :
437 6 : assert(u);
438 :
439 6 : if (!MANAGER_IS_SYSTEM(u->manager))
440 6 : return;
441 :
442 0 : if (sd_id128_is_null(u->invocation_id))
443 0 : return;
444 :
445 0 : r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path,
446 : "trusted.invocation_id",
447 0 : sd_id128_to_string(u->invocation_id, ids), 32,
448 : 0);
449 0 : if (r < 0)
450 0 : log_unit_debug_errno(u, r, "Failed to set invocation ID on control group %s, ignoring: %m", u->cgroup_path);
451 : }
452 :
453 0 : static int lookup_block_device(const char *p, dev_t *ret) {
454 0 : dev_t rdev, dev = 0;
455 : mode_t mode;
456 : int r;
457 :
458 0 : assert(p);
459 0 : assert(ret);
460 :
461 0 : r = device_path_parse_major_minor(p, &mode, &rdev);
462 0 : if (r == -ENODEV) { /* not a parsable device node, need to go to disk */
463 : struct stat st;
464 0 : if (stat(p, &st) < 0)
465 0 : return log_warning_errno(errno, "Couldn't stat device '%s': %m", p);
466 0 : rdev = (dev_t)st.st_rdev;
467 0 : dev = (dev_t)st.st_dev;
468 0 : mode = st.st_mode;
469 0 : } else if (r < 0)
470 0 : return log_warning_errno(r, "Failed to parse major/minor from path '%s': %m", p);
471 :
472 0 : if (S_ISCHR(mode)) {
473 0 : log_warning("Device node '%s' is a character device, but block device needed.", p);
474 0 : return -ENOTBLK;
475 0 : } else if (S_ISBLK(mode))
476 0 : *ret = rdev;
477 0 : else if (major(dev) != 0)
478 0 : *ret = dev; /* If this is not a device node then use the block device this file is stored on */
479 : else {
480 : /* If this is btrfs, getting the backing block device is a bit harder */
481 0 : r = btrfs_get_block_device(p, ret);
482 0 : if (r < 0 && r != -ENOTTY)
483 0 : return log_warning_errno(r, "Failed to determine block device backing btrfs file system '%s': %m", p);
484 0 : if (r == -ENOTTY) {
485 0 : log_warning("'%s' is not a block device node, and file system block device cannot be determined or is not local.", p);
486 0 : return -ENODEV;
487 : }
488 : }
489 :
490 : /* If this is a LUKS device, try to get the originating block device */
491 0 : (void) block_get_originating(*ret, ret);
492 :
493 : /* If this is a partition, try to get the originating block device */
494 0 : (void) block_get_whole_disk(*ret, ret);
495 0 : return 0;
496 : }
497 :
498 0 : static int whitelist_device(BPFProgram *prog, const char *path, const char *node, const char *acc) {
499 : dev_t rdev;
500 : mode_t mode;
501 : int r;
502 :
503 0 : assert(path);
504 0 : assert(acc);
505 :
506 : /* Some special handling for /dev/block/%u:%u, /dev/char/%u:%u, /run/systemd/inaccessible/chr and
507 : * /run/systemd/inaccessible/blk paths. Instead of stat()ing these we parse out the major/minor directly. This
508 : * means clients can use these paths without the device node actually being around. */
509 0 : r = device_path_parse_major_minor(node, &mode, &rdev);
510 0 : if (r < 0) {
511 0 : if (r != -ENODEV)
512 0 : return log_warning_errno(r, "Couldn't parse major/minor from device path '%s': %m", node);
513 :
514 : struct stat st;
515 0 : if (stat(node, &st) < 0)
516 0 : return log_warning_errno(errno, "Couldn't stat device %s: %m", node);
517 :
518 0 : if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
519 0 : log_warning("%s is not a device.", node);
520 0 : return -ENODEV;
521 : }
522 0 : rdev = (dev_t) st.st_rdev;
523 0 : mode = st.st_mode;
524 : }
525 :
526 0 : if (cg_all_unified() > 0) {
527 0 : if (!prog)
528 0 : return 0;
529 :
530 0 : return cgroup_bpf_whitelist_device(prog, S_ISCHR(mode) ? BPF_DEVCG_DEV_CHAR : BPF_DEVCG_DEV_BLOCK,
531 0 : major(rdev), minor(rdev), acc);
532 :
533 : } else {
534 : char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
535 :
536 0 : sprintf(buf,
537 : "%c %u:%u %s",
538 0 : S_ISCHR(mode) ? 'c' : 'b',
539 : major(rdev), minor(rdev),
540 : acc);
541 :
542 : /* Changing the devices list of a populated cgroup might result in EINVAL, hence ignore EINVAL here. */
543 :
544 0 : r = cg_set_attribute("devices", path, "devices.allow", buf);
545 0 : if (r < 0)
546 0 : return log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING,
547 : r, "Failed to set devices.allow on %s: %m", path);
548 :
549 0 : return 0;
550 : }
551 : }
552 :
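/* Example (editor's note, not part of the original file): on the legacy (cgroup v1) hierarchy,
 * whitelisting "/dev/null" with access "rwm" makes the sprintf() above produce "c 1:3 rwm"
 * (/dev/null is a character device with major 1, minor 3), which is then written to the cgroup's
 * "devices.allow" attribute; on the unified hierarchy the same information is encoded into the BPF
 * device program instead. */
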
553 0 : static int whitelist_major(BPFProgram *prog, const char *path, const char *name, char type, const char *acc) {
554 0 : _cleanup_fclose_ FILE *f = NULL;
555 : char buf[2+DECIMAL_STR_MAX(unsigned)+3+4];
556 0 : bool good = false;
557 : unsigned maj;
558 : int r;
559 :
560 0 : assert(path);
561 0 : assert(acc);
562 0 : assert(IN_SET(type, 'b', 'c'));
563 :
564 0 : if (streq(name, "*")) {
565 : /* If the name is a wildcard, then apply this list to all devices of this type */
566 :
567 0 : if (cg_all_unified() > 0) {
568 0 : if (!prog)
569 0 : return 0;
570 :
571 0 : (void) cgroup_bpf_whitelist_class(prog, type == 'c' ? BPF_DEVCG_DEV_CHAR : BPF_DEVCG_DEV_BLOCK, acc);
572 : } else {
573 0 : xsprintf(buf, "%c *:* %s", type, acc);
574 :
575 0 : r = cg_set_attribute("devices", path, "devices.allow", buf);
576 0 : if (r < 0)
577 0 : log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
578 : "Failed to set devices.allow on %s: %m", path);
579 0 : return 0;
580 : }
581 : }
582 :
583 0 : if (safe_atou(name, &maj) >= 0 && DEVICE_MAJOR_VALID(maj)) {
584 : /* The name is numeric and suitable as a major number. In that case, let's take it as the major, and
585 : * create the entry directly. */
586 :
587 0 : if (cg_all_unified() > 0) {
588 0 : if (!prog)
589 0 : return 0;
590 :
591 0 : (void) cgroup_bpf_whitelist_major(prog,
592 : type == 'c' ? BPF_DEVCG_DEV_CHAR : BPF_DEVCG_DEV_BLOCK,
593 : maj, acc);
594 : } else {
595 0 : xsprintf(buf, "%c %u:* %s", type, maj, acc);
596 :
597 0 : r = cg_set_attribute("devices", path, "devices.allow", buf);
598 0 : if (r < 0)
599 0 : log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
600 : "Failed to set devices.allow on %s: %m", path);
601 : }
602 :
603 0 : return 0;
604 : }
605 :
606 0 : f = fopen("/proc/devices", "re");
607 0 : if (!f)
608 0 : return log_warning_errno(errno, "Cannot open /proc/devices to resolve %s (%c): %m", name, type);
609 :
610 0 : for (;;) {
611 0 : _cleanup_free_ char *line = NULL;
612 : char *w, *p;
613 :
614 0 : r = read_line(f, LONG_LINE_MAX, &line);
615 0 : if (r < 0)
616 0 : return log_warning_errno(r, "Failed to read /proc/devices: %m");
617 0 : if (r == 0)
618 0 : break;
619 :
620 0 : if (type == 'c' && streq(line, "Character devices:")) {
621 0 : good = true;
622 0 : continue;
623 : }
624 :
625 0 : if (type == 'b' && streq(line, "Block devices:")) {
626 0 : good = true;
627 0 : continue;
628 : }
629 :
630 0 : if (isempty(line)) {
631 0 : good = false;
632 0 : continue;
633 : }
634 :
635 0 : if (!good)
636 0 : continue;
637 :
638 0 : p = strstrip(line);
639 :
640 0 : w = strpbrk(p, WHITESPACE);
641 0 : if (!w)
642 0 : continue;
643 0 : *w = 0;
644 :
645 0 : r = safe_atou(p, &maj);
646 0 : if (r < 0)
647 0 : continue;
648 0 : if (maj <= 0)
649 0 : continue;
650 :
651 0 : w++;
652 0 : w += strspn(w, WHITESPACE);
653 :
654 0 : if (fnmatch(name, w, 0) != 0)
655 0 : continue;
656 :
657 0 : if (cg_all_unified() > 0) {
658 0 : if (!prog)
659 0 : continue;
660 :
661 0 : (void) cgroup_bpf_whitelist_major(prog,
662 : type == 'c' ? BPF_DEVCG_DEV_CHAR : BPF_DEVCG_DEV_BLOCK,
663 : maj, acc);
664 : } else {
665 0 : sprintf(buf,
666 : "%c %u:* %s",
667 : type,
668 : maj,
669 : acc);
670 :
671 : /* Changing the devices list of a populated cgroup might result in EINVAL, hence ignore EINVAL
672 : * here. */
673 :
674 0 : r = cg_set_attribute("devices", path, "devices.allow", buf);
675 0 : if (r < 0)
676 0 : log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING,
677 : r, "Failed to set devices.allow on %s: %m", path);
678 : }
679 : }
680 :
681 0 : return 0;
682 : }
683 :
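/* For orientation (editor's note, not part of the original file), the /proc/devices parser above
 * expects the kernel's usual layout, for example:
 *
 *     Character devices:
 *       1 mem
 *       4 tty
 *     136 pts
 *
 *     Block devices:
 *       8 sd
 *     253 device-mapper
 *
 * A name such as "pts" (e.g. from DeviceAllow=char-pts) is fnmatch()ed against the second column, and
 * the matching major is then whitelisted for all its minors, i.e. "c 136:* rw" on the legacy hierarchy.
 */
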
684 579 : static bool cgroup_context_has_cpu_weight(CGroupContext *c) {
685 1158 : return c->cpu_weight != CGROUP_WEIGHT_INVALID ||
686 579 : c->startup_cpu_weight != CGROUP_WEIGHT_INVALID;
687 : }
688 :
689 579 : static bool cgroup_context_has_cpu_shares(CGroupContext *c) {
690 1153 : return c->cpu_shares != CGROUP_CPU_SHARES_INVALID ||
691 574 : c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID;
692 : }
693 :
694 20 : static uint64_t cgroup_context_cpu_weight(CGroupContext *c, ManagerState state) {
695 20 : if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
696 4 : c->startup_cpu_weight != CGROUP_WEIGHT_INVALID)
697 0 : return c->startup_cpu_weight;
698 20 : else if (c->cpu_weight != CGROUP_WEIGHT_INVALID)
699 0 : return c->cpu_weight;
700 : else
701 20 : return CGROUP_WEIGHT_DEFAULT;
702 : }
703 :
704 0 : static uint64_t cgroup_context_cpu_shares(CGroupContext *c, ManagerState state) {
705 0 : if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
706 0 : c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID)
707 0 : return c->startup_cpu_shares;
708 0 : else if (c->cpu_shares != CGROUP_CPU_SHARES_INVALID)
709 0 : return c->cpu_shares;
710 : else
711 0 : return CGROUP_CPU_SHARES_DEFAULT;
712 : }
713 :
714 12 : usec_t cgroup_cpu_adjust_period(usec_t period, usec_t quota, usec_t resolution, usec_t max_period) {
715 : /* The kernel uses a minimum resolution of 1ms, so both the period and the effective quota per period
716 : * (quota * period / USEC_PER_SEC) need to be at least that large. quota is specified in usec of CPU
717 : * time per second of wall-clock time. Additionally, period must be at most max_period. */
718 12 : assert(quota > 0);
719 :
720 12 : return MIN(MAX3(period, resolution, resolution * USEC_PER_SEC / quota), max_period);
721 : }
722 :
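/* Worked example (editor's note, not part of the original file): with the 1ms resolution and 1s ceiling
 * passed in by cgroup_cpu_adjust_period_and_log() below, a 5% quota (50ms of CPU time per second)
 * combined with a requested 10ms period gets clamped up to 20ms, because 1ms * USEC_PER_SEC / 50ms is
 * 20ms; that keeps the per-period budget (5% of 20ms = 1ms) at the kernel's minimum:
 *
 *     assert_se(cgroup_cpu_adjust_period(10 * USEC_PER_MSEC, 50 * USEC_PER_MSEC,
 *                                        USEC_PER_MSEC, USEC_PER_SEC) == 20 * USEC_PER_MSEC);
 */
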
723 0 : static usec_t cgroup_cpu_adjust_period_and_log(Unit *u, usec_t period, usec_t quota) {
724 : usec_t new_period;
725 :
726 0 : if (quota == USEC_INFINITY)
727 : /* Always use default period for infinity quota. */
728 0 : return CGROUP_CPU_QUOTA_DEFAULT_PERIOD_USEC;
729 :
730 0 : if (period == USEC_INFINITY)
731 : /* Default period was requested. */
732 0 : period = CGROUP_CPU_QUOTA_DEFAULT_PERIOD_USEC;
733 :
734 : /* Clamp to interval [1ms, 1s] */
735 0 : new_period = cgroup_cpu_adjust_period(period, quota, USEC_PER_MSEC, USEC_PER_SEC);
736 :
737 0 : if (new_period != period) {
738 : char v[FORMAT_TIMESPAN_MAX];
739 0 : log_unit_full(u, u->warned_clamping_cpu_quota_period ? LOG_DEBUG : LOG_WARNING, 0,
740 : "Clamping CPU interval for cpu.max: period is now %s",
741 : format_timespan(v, sizeof(v), new_period, 1));
742 0 : u->warned_clamping_cpu_quota_period = true;
743 : }
744 :
745 0 : return new_period;
746 : }
747 :
748 0 : static void cgroup_apply_unified_cpu_weight(Unit *u, uint64_t weight) {
749 : char buf[DECIMAL_STR_MAX(uint64_t) + 2];
750 :
751 0 : xsprintf(buf, "%" PRIu64 "\n", weight);
752 0 : (void) set_attribute_and_warn(u, "cpu", "cpu.weight", buf);
753 0 : }
754 :
755 0 : static void cgroup_apply_unified_cpu_quota(Unit *u, usec_t quota, usec_t period) {
756 : char buf[(DECIMAL_STR_MAX(usec_t) + 1) * 2 + 1];
757 :
758 0 : period = cgroup_cpu_adjust_period_and_log(u, period, quota);
759 0 : if (quota != USEC_INFINITY)
760 0 : xsprintf(buf, USEC_FMT " " USEC_FMT "\n",
761 : MAX(quota * period / USEC_PER_SEC, USEC_PER_MSEC), period);
762 : else
763 0 : xsprintf(buf, "max " USEC_FMT "\n", period);
764 0 : (void) set_attribute_and_warn(u, "cpu", "cpu.max", buf);
765 0 : }
766 :
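/* Example (editor's note, not part of the original file): for CPUQuota=50% (i.e. a quota of 500ms of CPU
 * time per second) and the default 100ms period, the string written to cpu.max is "50000 100000\n"; for
 * an infinite quota it is "max 100000\n". */
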
767 0 : static void cgroup_apply_legacy_cpu_shares(Unit *u, uint64_t shares) {
768 : char buf[DECIMAL_STR_MAX(uint64_t) + 2];
769 :
770 0 : xsprintf(buf, "%" PRIu64 "\n", shares);
771 0 : (void) set_attribute_and_warn(u, "cpu", "cpu.shares", buf);
772 0 : }
773 :
774 0 : static void cgroup_apply_legacy_cpu_quota(Unit *u, usec_t quota, usec_t period) {
775 : char buf[DECIMAL_STR_MAX(usec_t) + 2];
776 :
777 0 : period = cgroup_cpu_adjust_period_and_log(u, period, quota);
778 :
779 0 : xsprintf(buf, USEC_FMT "\n", period);
780 0 : (void) set_attribute_and_warn(u, "cpu", "cpu.cfs_period_us", buf);
781 :
782 0 : if (quota != USEC_INFINITY) {
783 0 : xsprintf(buf, USEC_FMT "\n", MAX(quota * period / USEC_PER_SEC, USEC_PER_MSEC));
784 0 : (void) set_attribute_and_warn(u, "cpu", "cpu.cfs_quota_us", buf);
785 : } else
786 0 : (void) set_attribute_and_warn(u, "cpu", "cpu.cfs_quota_us", "-1\n");
787 0 : }
788 :
789 0 : static uint64_t cgroup_cpu_shares_to_weight(uint64_t shares) {
790 0 : return CLAMP(shares * CGROUP_WEIGHT_DEFAULT / CGROUP_CPU_SHARES_DEFAULT,
791 : CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX);
792 : }
793 :
794 0 : static uint64_t cgroup_cpu_weight_to_shares(uint64_t weight) {
795 0 : return CLAMP(weight * CGROUP_CPU_SHARES_DEFAULT / CGROUP_WEIGHT_DEFAULT,
796 : CGROUP_CPU_SHARES_MIN, CGROUP_CPU_SHARES_MAX);
797 : }
798 :
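/* Example (editor's note, not part of the original file; assumes the usual defaults from cgroup-util.h,
 * i.e. CGROUP_WEIGHT_DEFAULT == 100 and CGROUP_CPU_SHARES_DEFAULT == 1024): CPUShares=2048 translates to
 * a cpu.weight of 2048 * 100 / 1024 == 200, and CPUWeight=50 translates back to cpu.shares of
 * 50 * 1024 / 100 == 512, with both results clamped to the valid weight/shares ranges. */
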
799 579 : static bool cgroup_context_has_io_config(CGroupContext *c) {
800 1158 : return c->io_accounting ||
801 579 : c->io_weight != CGROUP_WEIGHT_INVALID ||
802 569 : c->startup_io_weight != CGROUP_WEIGHT_INVALID ||
803 569 : c->io_device_weights ||
804 1727 : c->io_device_latencies ||
805 569 : c->io_device_limits;
806 : }
807 :
808 569 : static bool cgroup_context_has_blockio_config(CGroupContext *c) {
809 1138 : return c->blockio_accounting ||
810 569 : c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
811 569 : c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
812 1707 : c->blockio_device_weights ||
813 569 : c->blockio_device_bandwidths;
814 : }
815 :
816 0 : static uint64_t cgroup_context_io_weight(CGroupContext *c, ManagerState state) {
817 0 : if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
818 0 : c->startup_io_weight != CGROUP_WEIGHT_INVALID)
819 0 : return c->startup_io_weight;
820 0 : else if (c->io_weight != CGROUP_WEIGHT_INVALID)
821 0 : return c->io_weight;
822 : else
823 0 : return CGROUP_WEIGHT_DEFAULT;
824 : }
825 :
826 0 : static uint64_t cgroup_context_blkio_weight(CGroupContext *c, ManagerState state) {
827 0 : if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
828 0 : c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID)
829 0 : return c->startup_blockio_weight;
830 0 : else if (c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID)
831 0 : return c->blockio_weight;
832 : else
833 0 : return CGROUP_BLKIO_WEIGHT_DEFAULT;
834 : }
835 :
836 0 : static uint64_t cgroup_weight_blkio_to_io(uint64_t blkio_weight) {
837 0 : return CLAMP(blkio_weight * CGROUP_WEIGHT_DEFAULT / CGROUP_BLKIO_WEIGHT_DEFAULT,
838 : CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX);
839 : }
840 :
841 0 : static uint64_t cgroup_weight_io_to_blkio(uint64_t io_weight) {
842 0 : return CLAMP(io_weight * CGROUP_BLKIO_WEIGHT_DEFAULT / CGROUP_WEIGHT_DEFAULT,
843 : CGROUP_BLKIO_WEIGHT_MIN, CGROUP_BLKIO_WEIGHT_MAX);
844 : }
845 :
846 0 : static void cgroup_apply_io_device_weight(Unit *u, const char *dev_path, uint64_t io_weight) {
847 : char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
848 : dev_t dev;
849 : int r;
850 :
851 0 : r = lookup_block_device(dev_path, &dev);
852 0 : if (r < 0)
853 0 : return;
854 :
855 0 : xsprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), io_weight);
856 0 : (void) set_attribute_and_warn(u, "io", "io.weight", buf);
857 : }
858 :
859 0 : static void cgroup_apply_blkio_device_weight(Unit *u, const char *dev_path, uint64_t blkio_weight) {
860 : char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
861 : dev_t dev;
862 : int r;
863 :
864 0 : r = lookup_block_device(dev_path, &dev);
865 0 : if (r < 0)
866 0 : return;
867 :
868 0 : xsprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), blkio_weight);
869 0 : (void) set_attribute_and_warn(u, "blkio", "blkio.weight_device", buf);
870 : }
871 :
872 0 : static void cgroup_apply_io_device_latency(Unit *u, const char *dev_path, usec_t target) {
873 : char buf[DECIMAL_STR_MAX(dev_t)*2+2+7+DECIMAL_STR_MAX(uint64_t)+1];
874 : dev_t dev;
875 : int r;
876 :
877 0 : r = lookup_block_device(dev_path, &dev);
878 0 : if (r < 0)
879 0 : return;
880 :
881 0 : if (target != USEC_INFINITY)
882 0 : xsprintf(buf, "%u:%u target=%" PRIu64 "\n", major(dev), minor(dev), target);
883 : else
884 0 : xsprintf(buf, "%u:%u target=max\n", major(dev), minor(dev));
885 :
886 0 : (void) set_attribute_and_warn(u, "io", "io.latency", buf);
887 : }
888 :
889 0 : static void cgroup_apply_io_device_limit(Unit *u, const char *dev_path, uint64_t *limits) {
890 : char limit_bufs[_CGROUP_IO_LIMIT_TYPE_MAX][DECIMAL_STR_MAX(uint64_t)];
891 : char buf[DECIMAL_STR_MAX(dev_t)*2+2+(6+DECIMAL_STR_MAX(uint64_t)+1)*4];
892 : CGroupIOLimitType type;
893 : dev_t dev;
894 : int r;
895 :
896 0 : r = lookup_block_device(dev_path, &dev);
897 0 : if (r < 0)
898 0 : return;
899 :
900 0 : for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
901 0 : if (limits[type] != cgroup_io_limit_defaults[type])
902 0 : xsprintf(limit_bufs[type], "%" PRIu64, limits[type]);
903 : else
904 0 : xsprintf(limit_bufs[type], "%s", limits[type] == CGROUP_LIMIT_MAX ? "max" : "0");
905 :
906 0 : xsprintf(buf, "%u:%u rbps=%s wbps=%s riops=%s wiops=%s\n", major(dev), minor(dev),
907 : limit_bufs[CGROUP_IO_RBPS_MAX], limit_bufs[CGROUP_IO_WBPS_MAX],
908 : limit_bufs[CGROUP_IO_RIOPS_MAX], limit_bufs[CGROUP_IO_WIOPS_MAX]);
909 0 : (void) set_attribute_and_warn(u, "io", "io.max", buf);
910 : }
911 :
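/* Example (editor's note, not part of the original file): a read-bandwidth limit of 1000000 bytes/s on a
 * device with major:minor 8:0, with the other three limit types left at their defaults, yields
 *
 *     8:0 rbps=1000000 wbps=max riops=max wiops=max
 *
 * written to io.max; limit types left at their defaults are rendered as "max" (or "0" if the default
 * itself is not the maximum). */
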
912 0 : static void cgroup_apply_blkio_device_limit(Unit *u, const char *dev_path, uint64_t rbps, uint64_t wbps) {
913 : char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
914 : dev_t dev;
915 : int r;
916 :
917 0 : r = lookup_block_device(dev_path, &dev);
918 0 : if (r < 0)
919 0 : return;
920 :
921 0 : sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), rbps);
922 0 : (void) set_attribute_and_warn(u, "blkio", "blkio.throttle.read_bps_device", buf);
923 :
924 0 : sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), wbps);
925 0 : (void) set_attribute_and_warn(u, "blkio", "blkio.throttle.write_bps_device", buf);
926 : }
927 :
928 55 : static bool unit_has_unified_memory_config(Unit *u) {
929 : CGroupContext *c;
930 :
931 55 : assert(u);
932 :
933 55 : c = unit_get_cgroup_context(u);
934 55 : assert(c);
935 :
936 55 : return c->memory_min > 0 || unit_get_ancestor_memory_low(u) > 0 ||
937 165 : c->memory_high != CGROUP_LIMIT_MAX || c->memory_max != CGROUP_LIMIT_MAX ||
938 55 : c->memory_swap_max != CGROUP_LIMIT_MAX;
939 : }
940 :
941 0 : static void cgroup_apply_unified_memory_limit(Unit *u, const char *file, uint64_t v) {
942 0 : char buf[DECIMAL_STR_MAX(uint64_t) + 1] = "max\n";
943 :
944 0 : if (v != CGROUP_LIMIT_MAX)
945 0 : xsprintf(buf, "%" PRIu64 "\n", v);
946 :
947 0 : (void) set_attribute_and_warn(u, "memory", file, buf);
948 0 : }
949 :
950 0 : static void cgroup_apply_firewall(Unit *u) {
951 0 : assert(u);
952 :
953 : /* Best-effort: let's apply IP firewalling and/or accounting if that's enabled */
954 :
955 0 : if (bpf_firewall_compile(u) < 0)
956 0 : return;
957 :
958 0 : (void) bpf_firewall_load_custom(u);
959 0 : (void) bpf_firewall_install(u);
960 : }
961 :
962 6 : static void cgroup_context_apply(
963 : Unit *u,
964 : CGroupMask apply_mask,
965 : ManagerState state) {
966 :
967 : const char *path;
968 : CGroupContext *c;
969 : bool is_host_root, is_local_root;
970 : int r;
971 :
972 6 : assert(u);
973 :
974 : /* Nothing to do? Exit early! */
975 6 : if (apply_mask == 0)
976 0 : return;
977 :
978 : /* Some cgroup attributes are not supported on the host root cgroup, hence silently ignore them here. And other
979 : * attributes should only be managed for cgroups further down the tree. */
980 6 : is_local_root = unit_has_name(u, SPECIAL_ROOT_SLICE);
981 6 : is_host_root = unit_has_host_root_cgroup(u);
982 :
983 6 : assert_se(c = unit_get_cgroup_context(u));
984 6 : assert_se(path = u->cgroup_path);
985 :
986 6 : if (is_local_root) /* Make sure we don't try to display messages with an empty path. */
987 6 : path = "/";
988 :
989 : /* We generally ignore errors caused by read-only mounted cgroup trees (assuming we are running in a container
990 : * then), and missing cgroups, i.e. EROFS and ENOENT. */
991 :
992 : /* In fully unified mode these attributes don't exist on the host cgroup root. On legacy the weights exist, but
993 : * setting the weight makes very little sense on the host root cgroup, as there are no other cgroups at this
994 : * level. The quota exists there too, but any attempt to write to it is refused with EINVAL. Inside of
995 : * containers we want to leave control of these to the container manager (and if cgroup v2 delegation is used
996 : * we couldn't even write to them if we wanted to). */
997 6 : if ((apply_mask & CGROUP_MASK_CPU) && !is_local_root) {
998 :
999 0 : if (cg_all_unified() > 0) {
1000 : uint64_t weight;
1001 :
1002 0 : if (cgroup_context_has_cpu_weight(c))
1003 0 : weight = cgroup_context_cpu_weight(c, state);
1004 0 : else if (cgroup_context_has_cpu_shares(c)) {
1005 : uint64_t shares;
1006 :
1007 0 : shares = cgroup_context_cpu_shares(c, state);
1008 0 : weight = cgroup_cpu_shares_to_weight(shares);
1009 :
1010 0 : log_cgroup_compat(u, "Applying [Startup]CPUShares=%" PRIu64 " as [Startup]CPUWeight=%" PRIu64 " on %s",
1011 : shares, weight, path);
1012 : } else
1013 0 : weight = CGROUP_WEIGHT_DEFAULT;
1014 :
1015 0 : cgroup_apply_unified_cpu_weight(u, weight);
1016 0 : cgroup_apply_unified_cpu_quota(u, c->cpu_quota_per_sec_usec, c->cpu_quota_period_usec);
1017 :
1018 : } else {
1019 : uint64_t shares;
1020 :
1021 0 : if (cgroup_context_has_cpu_weight(c)) {
1022 : uint64_t weight;
1023 :
1024 0 : weight = cgroup_context_cpu_weight(c, state);
1025 0 : shares = cgroup_cpu_weight_to_shares(weight);
1026 :
1027 0 : log_cgroup_compat(u, "Applying [Startup]CPUWeight=%" PRIu64 " as [Startup]CPUShares=%" PRIu64 " on %s",
1028 : weight, shares, path);
1029 0 : } else if (cgroup_context_has_cpu_shares(c))
1030 0 : shares = cgroup_context_cpu_shares(c, state);
1031 : else
1032 0 : shares = CGROUP_CPU_SHARES_DEFAULT;
1033 :
1034 0 : cgroup_apply_legacy_cpu_shares(u, shares);
1035 0 : cgroup_apply_legacy_cpu_quota(u, c->cpu_quota_per_sec_usec, c->cpu_quota_period_usec);
1036 : }
1037 : }
1038 :
1039 : /* The 'io' controller attributes are not exported on the host's root cgroup (being a pure cgroup v2
1040 : * controller), and in case of containers we want to leave control of these attributes to the container manager
1041 : * (and we couldn't access that stuff anyway, even if we tried if proper delegation is used). */
1042 6 : if ((apply_mask & CGROUP_MASK_IO) && !is_local_root) {
1043 : char buf[8+DECIMAL_STR_MAX(uint64_t)+1];
1044 : bool has_io, has_blockio;
1045 : uint64_t weight;
1046 :
1047 0 : has_io = cgroup_context_has_io_config(c);
1048 0 : has_blockio = cgroup_context_has_blockio_config(c);
1049 :
1050 0 : if (has_io)
1051 0 : weight = cgroup_context_io_weight(c, state);
1052 0 : else if (has_blockio) {
1053 : uint64_t blkio_weight;
1054 :
1055 0 : blkio_weight = cgroup_context_blkio_weight(c, state);
1056 0 : weight = cgroup_weight_blkio_to_io(blkio_weight);
1057 :
1058 0 : log_cgroup_compat(u, "Applying [Startup]BlockIOWeight=%" PRIu64 " as [Startup]IOWeight=%" PRIu64,
1059 : blkio_weight, weight);
1060 : } else
1061 0 : weight = CGROUP_WEIGHT_DEFAULT;
1062 :
1063 0 : xsprintf(buf, "default %" PRIu64 "\n", weight);
1064 0 : (void) set_attribute_and_warn(u, "io", "io.weight", buf);
1065 :
1066 : /* FIXME: drop this when distro kernels properly support BFQ through "io.weight"
1067 : * See also: https://github.com/systemd/systemd/pull/13335 */
1068 0 : xsprintf(buf, "%" PRIu64 "\n", weight);
1069 0 : (void) set_attribute_and_warn(u, "io", "io.bfq.weight", buf);
1070 :
1071 0 : if (has_io) {
1072 : CGroupIODeviceLatency *latency;
1073 : CGroupIODeviceLimit *limit;
1074 : CGroupIODeviceWeight *w;
1075 :
1076 0 : LIST_FOREACH(device_weights, w, c->io_device_weights)
1077 0 : cgroup_apply_io_device_weight(u, w->path, w->weight);
1078 :
1079 0 : LIST_FOREACH(device_limits, limit, c->io_device_limits)
1080 0 : cgroup_apply_io_device_limit(u, limit->path, limit->limits);
1081 :
1082 0 : LIST_FOREACH(device_latencies, latency, c->io_device_latencies)
1083 0 : cgroup_apply_io_device_latency(u, latency->path, latency->target_usec);
1084 :
1085 0 : } else if (has_blockio) {
1086 : CGroupBlockIODeviceWeight *w;
1087 : CGroupBlockIODeviceBandwidth *b;
1088 :
1089 0 : LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
1090 0 : weight = cgroup_weight_blkio_to_io(w->weight);
1091 :
1092 0 : log_cgroup_compat(u, "Applying BlockIODeviceWeight=%" PRIu64 " as IODeviceWeight=%" PRIu64 " for %s",
1093 : w->weight, weight, w->path);
1094 :
1095 0 : cgroup_apply_io_device_weight(u, w->path, weight);
1096 : }
1097 :
1098 0 : LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
1099 : uint64_t limits[_CGROUP_IO_LIMIT_TYPE_MAX];
1100 : CGroupIOLimitType type;
1101 :
1102 0 : for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
1103 0 : limits[type] = cgroup_io_limit_defaults[type];
1104 :
1105 0 : limits[CGROUP_IO_RBPS_MAX] = b->rbps;
1106 0 : limits[CGROUP_IO_WBPS_MAX] = b->wbps;
1107 :
1108 0 : log_cgroup_compat(u, "Applying BlockIO{Read|Write}Bandwidth=%" PRIu64 " %" PRIu64 " as IO{Read|Write}BandwidthMax= for %s",
1109 : b->rbps, b->wbps, b->path);
1110 :
1111 0 : cgroup_apply_io_device_limit(u, b->path, limits);
1112 : }
1113 : }
1114 : }
1115 :
1116 6 : if (apply_mask & CGROUP_MASK_BLKIO) {
1117 : bool has_io, has_blockio;
1118 :
1119 0 : has_io = cgroup_context_has_io_config(c);
1120 0 : has_blockio = cgroup_context_has_blockio_config(c);
1121 :
1122 : /* Applying a 'weight' never makes sense for the host root cgroup, and for containers this should be
1123 : * left to our container manager, too. */
1124 0 : if (!is_local_root) {
1125 : char buf[DECIMAL_STR_MAX(uint64_t)+1];
1126 : uint64_t weight;
1127 :
1128 0 : if (has_io) {
1129 : uint64_t io_weight;
1130 :
1131 0 : io_weight = cgroup_context_io_weight(c, state);
1132 0 : weight = cgroup_weight_io_to_blkio(io_weight);
1133 :
1134 0 : log_cgroup_compat(u, "Applying [Startup]IOWeight=%" PRIu64 " as [Startup]BlockIOWeight=%" PRIu64,
1135 : io_weight, weight);
1136 0 : } else if (has_blockio)
1137 0 : weight = cgroup_context_blkio_weight(c, state);
1138 : else
1139 0 : weight = CGROUP_BLKIO_WEIGHT_DEFAULT;
1140 :
1141 0 : xsprintf(buf, "%" PRIu64 "\n", weight);
1142 0 : (void) set_attribute_and_warn(u, "blkio", "blkio.weight", buf);
1143 :
1144 0 : if (has_io) {
1145 : CGroupIODeviceWeight *w;
1146 :
1147 0 : LIST_FOREACH(device_weights, w, c->io_device_weights) {
1148 0 : weight = cgroup_weight_io_to_blkio(w->weight);
1149 :
1150 0 : log_cgroup_compat(u, "Applying IODeviceWeight=%" PRIu64 " as BlockIODeviceWeight=%" PRIu64 " for %s",
1151 : w->weight, weight, w->path);
1152 :
1153 0 : cgroup_apply_blkio_device_weight(u, w->path, weight);
1154 : }
1155 0 : } else if (has_blockio) {
1156 : CGroupBlockIODeviceWeight *w;
1157 :
1158 0 : LIST_FOREACH(device_weights, w, c->blockio_device_weights)
1159 0 : cgroup_apply_blkio_device_weight(u, w->path, w->weight);
1160 : }
1161 : }
1162 :
1163 : /* The bandwidth limits are something that makes sense to apply to the host's root cgroup, but not to
1164 : * container roots, as there we want the container manager to handle them. */
1165 0 : if (is_host_root || !is_local_root) {
1166 0 : if (has_io) {
1167 : CGroupIODeviceLimit *l;
1168 :
1169 0 : LIST_FOREACH(device_limits, l, c->io_device_limits) {
1170 0 : log_cgroup_compat(u, "Applying IO{Read|Write}Bandwidth=%" PRIu64 " %" PRIu64 " as BlockIO{Read|Write}BandwidthMax= for %s",
1171 : l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX], l->path);
1172 :
1173 0 : cgroup_apply_blkio_device_limit(u, l->path, l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX]);
1174 : }
1175 0 : } else if (has_blockio) {
1176 : CGroupBlockIODeviceBandwidth *b;
1177 :
1178 0 : LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths)
1179 0 : cgroup_apply_blkio_device_limit(u, b->path, b->rbps, b->wbps);
1180 : }
1181 : }
1182 : }
1183 :
1184 : /* In unified mode 'memory' attributes do not exist on the root cgroup. In legacy mode 'memory.limit_in_bytes'
1185 : * exists on the root cgroup, but any writes to it are refused with EINVAL. And if we run in a container we
1186 : * want to leave control to the container manager (and if proper cgroup v2 delegation is used we couldn't even
1187 : * write to this if we wanted to.) */
1188 6 : if ((apply_mask & CGROUP_MASK_MEMORY) && !is_local_root) {
1189 :
1190 0 : if (cg_all_unified() > 0) {
1191 0 : uint64_t max, swap_max = CGROUP_LIMIT_MAX;
1192 :
1193 0 : if (unit_has_unified_memory_config(u)) {
1194 0 : max = c->memory_max;
1195 0 : swap_max = c->memory_swap_max;
1196 : } else {
1197 0 : max = c->memory_limit;
1198 :
1199 0 : if (max != CGROUP_LIMIT_MAX)
1200 0 : log_cgroup_compat(u, "Applying MemoryLimit=%" PRIu64 " as MemoryMax=", max);
1201 : }
1202 :
1203 0 : cgroup_apply_unified_memory_limit(u, "memory.min", c->memory_min);
1204 0 : cgroup_apply_unified_memory_limit(u, "memory.low", unit_get_ancestor_memory_low(u));
1205 0 : cgroup_apply_unified_memory_limit(u, "memory.high", c->memory_high);
1206 0 : cgroup_apply_unified_memory_limit(u, "memory.max", max);
1207 0 : cgroup_apply_unified_memory_limit(u, "memory.swap.max", swap_max);
1208 :
1209 0 : (void) set_attribute_and_warn(u, "memory", "memory.oom.group", one_zero(c->memory_oom_group));
1210 :
1211 : } else {
1212 : char buf[DECIMAL_STR_MAX(uint64_t) + 1];
1213 : uint64_t val;
1214 :
1215 0 : if (unit_has_unified_memory_config(u)) {
1216 0 : val = c->memory_max;
1217 0 : log_cgroup_compat(u, "Applying MemoryMax=%" PRIi64 " as MemoryLimit=", val);
1218 : } else
1219 0 : val = c->memory_limit;
1220 :
1221 0 : if (val == CGROUP_LIMIT_MAX)
1222 0 : strncpy(buf, "-1\n", sizeof(buf));
1223 : else
1224 0 : xsprintf(buf, "%" PRIu64 "\n", val);
1225 :
1226 0 : (void) set_attribute_and_warn(u, "memory", "memory.limit_in_bytes", buf);
1227 : }
1228 : }
1229 :
1230 : /* On cgroup v2 we can apply BPF everywhere. On cgroup v1 we apply it everywhere except for the root of
1231 : * containers, where we leave this to the manager */
1232 6 : if ((apply_mask & (CGROUP_MASK_DEVICES | CGROUP_MASK_BPF_DEVICES)) &&
1233 0 : (is_host_root || cg_all_unified() > 0 || !is_local_root)) {
1234 0 : _cleanup_(bpf_program_unrefp) BPFProgram *prog = NULL;
1235 : CGroupDeviceAllow *a;
1236 :
1237 0 : if (cg_all_unified() > 0) {
1238 0 : r = cgroup_init_device_bpf(&prog, c->device_policy, c->device_allow);
1239 0 : if (r < 0)
1240 0 : log_unit_warning_errno(u, r, "Failed to initialize device control bpf program: %m");
1241 : } else {
1242 : /* Changing the devices list of a populated cgroup might result in EINVAL, hence ignore EINVAL
1243 : * here. */
1244 :
1245 0 : if (c->device_allow || c->device_policy != CGROUP_AUTO)
1246 0 : r = cg_set_attribute("devices", path, "devices.deny", "a");
1247 : else
1248 0 : r = cg_set_attribute("devices", path, "devices.allow", "a");
1249 0 : if (r < 0)
1250 0 : log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
1251 : "Failed to reset devices.allow/devices.deny: %m");
1252 : }
1253 :
1254 0 : if (c->device_policy == CGROUP_CLOSED ||
1255 0 : (c->device_policy == CGROUP_AUTO && c->device_allow)) {
1256 : static const char auto_devices[] =
1257 : "/dev/null\0" "rwm\0"
1258 : "/dev/zero\0" "rwm\0"
1259 : "/dev/full\0" "rwm\0"
1260 : "/dev/random\0" "rwm\0"
1261 : "/dev/urandom\0" "rwm\0"
1262 : "/dev/tty\0" "rwm\0"
1263 : "/dev/ptmx\0" "rwm\0"
1264 : /* Allow /run/systemd/inaccessible/{chr,blk} devices for mapping InaccessiblePaths */
1265 : "/run/systemd/inaccessible/chr\0" "rwm\0"
1266 : "/run/systemd/inaccessible/blk\0" "rwm\0";
1267 :
1268 : const char *x, *y;
1269 :
1270 0 : NULSTR_FOREACH_PAIR(x, y, auto_devices)
1271 0 : (void) whitelist_device(prog, path, x, y);
1272 :
1273 : /* PTS (/dev/pts) devices may not be created anew (no 'm'), but may be read and written */
1274 0 : (void) whitelist_major(prog, path, "pts", 'c', "rw");
1275 : }
1276 :
1277 0 : LIST_FOREACH(device_allow, a, c->device_allow) {
1278 : char acc[4], *val;
1279 0 : unsigned k = 0;
1280 :
1281 0 : if (a->r)
1282 0 : acc[k++] = 'r';
1283 0 : if (a->w)
1284 0 : acc[k++] = 'w';
1285 0 : if (a->m)
1286 0 : acc[k++] = 'm';
1287 :
1288 0 : if (k == 0)
1289 0 : continue;
1290 :
1291 0 : acc[k++] = 0;
1292 :
1293 0 : if (path_startswith(a->path, "/dev/"))
1294 0 : (void) whitelist_device(prog, path, a->path, acc);
1295 0 : else if ((val = startswith(a->path, "block-")))
1296 0 : (void) whitelist_major(prog, path, val, 'b', acc);
1297 0 : else if ((val = startswith(a->path, "char-")))
1298 0 : (void) whitelist_major(prog, path, val, 'c', acc);
1299 : else
1300 0 : log_unit_debug(u, "Ignoring device '%s' while writing cgroup attribute.", a->path);
1301 : }
1302 :
1303 0 : r = cgroup_apply_device_bpf(u, prog, c->device_policy, c->device_allow);
1304 0 : if (r < 0) {
1305 : static bool warned = false;
1306 :
1307 0 : log_full_errno(warned ? LOG_DEBUG : LOG_WARNING, r,
1308 : "Unit %s configures device ACL, but the local system doesn't seem to support the BPF-based device controller.\n"
1309 : "Proceeding WITHOUT applying ACL (all devices will be accessible)!\n"
1310 : "(This warning is only shown for the first loaded unit using device ACL.)", u->id);
1311 :
1312 0 : warned = true;
1313 : }
1314 : }
1315 :
1316 6 : if (apply_mask & CGROUP_MASK_PIDS) {
1317 :
1318 6 : if (is_host_root) {
1319 : /* So, the "pids" controller does not expose anything on the root cgroup, in order not to
1320 : * replicate knobs exposed elsewhere needlessly. We abstract this away here however, and when
1321 : * the knobs of the root cgroup are modified propagate this to the relevant sysctls. There's a
1322 : * non-obvious asymmetry however: unlike the cgroup properties we don't really want to take
1323 : * exclusive ownership of the sysctls, but we still want to honour things if the user sets
1324 : * limits. Hence we employ sort of a one-way strategy: when the user sets a bounded limit
1325 : * through us it counts. When the user afterwards unsets it again (i.e. sets it to unbounded)
1326 : * it also counts. But if the user never set a limit through us (i.e. we are the default of
1327 : * "unbounded") we leave things unmodified. For this we manage a global boolean that we turn on
1328 : * the first time we set a limit. Note that this boolean is flushed out on manager reload,
1329 : * which is desirable so that there's an official way to release control of the sysctl from
1330 : * systemd: set the limit to unbounded and reload. */
1331 :
1332 0 : if (c->tasks_max != CGROUP_LIMIT_MAX) {
1333 0 : u->manager->sysctl_pid_max_changed = true;
1334 0 : r = procfs_tasks_set_limit(c->tasks_max);
1335 0 : } else if (u->manager->sysctl_pid_max_changed)
1336 0 : r = procfs_tasks_set_limit(TASKS_MAX);
1337 : else
1338 0 : r = 0;
1339 0 : if (r < 0)
1340 0 : log_unit_full(u, LOG_LEVEL_CGROUP_WRITE(r), r,
1341 : "Failed to write to tasks limit sysctls: %m");
1342 : }
1343 :
1344 : /* The attribute itself is not available on the host root cgroup, and in the container case we want to
1345 : * leave it for the container manager. */
1346 6 : if (!is_local_root) {
1347 0 : if (c->tasks_max != CGROUP_LIMIT_MAX) {
1348 : char buf[DECIMAL_STR_MAX(uint64_t) + 2];
1349 :
1350 0 : sprintf(buf, "%" PRIu64 "\n", c->tasks_max);
1351 0 : (void) set_attribute_and_warn(u, "pids", "pids.max", buf);
1352 : } else
1353 0 : (void) set_attribute_and_warn(u, "pids", "pids.max", "max\n");
1354 : }
1355 : }
1356 :
1357 6 : if (apply_mask & CGROUP_MASK_BPF_FIREWALL)
1358 0 : cgroup_apply_firewall(u);
1359 : }
1360 :
1361 579 : static bool unit_get_needs_bpf_firewall(Unit *u) {
1362 : CGroupContext *c;
1363 : Unit *p;
1364 579 : assert(u);
1365 :
1366 579 : c = unit_get_cgroup_context(u);
1367 579 : if (!c)
1368 0 : return false;
1369 :
1370 579 : if (c->ip_accounting ||
1371 579 : c->ip_address_allow ||
1372 579 : c->ip_address_deny ||
1373 579 : c->ip_filters_ingress ||
1374 579 : c->ip_filters_egress)
1375 0 : return true;
1376 :
1377 : /* If any parent slice has an IP access list defined, it applies too */
1378 1162 : for (p = UNIT_DEREF(u->slice); p; p = UNIT_DEREF(p->slice)) {
1379 583 : c = unit_get_cgroup_context(p);
1380 583 : if (!c)
1381 0 : return false;
1382 :
1383 583 : if (c->ip_address_allow ||
1384 583 : c->ip_address_deny)
1385 0 : return true;
1386 : }
1387 :
1388 579 : return false;
1389 : }
1390 :
1391 579 : static CGroupMask unit_get_cgroup_mask(Unit *u) {
1392 579 : CGroupMask mask = 0;
1393 : CGroupContext *c;
1394 :
1395 579 : assert(u);
1396 :
1397 579 : c = unit_get_cgroup_context(u);
1398 :
1399 579 : assert(c);
1400 :
1401 : /* Figure out which controllers we need, based on the cgroup context object */
1402 :
1403 579 : if (c->cpu_accounting)
1404 5 : mask |= get_cpu_accounting_mask();
1405 :
1406 1158 : if (cgroup_context_has_cpu_weight(c) ||
1407 579 : cgroup_context_has_cpu_shares(c) ||
1408 574 : c->cpu_quota_per_sec_usec != USEC_INFINITY)
1409 5 : mask |= CGROUP_MASK_CPU;
1410 :
1411 579 : if (cgroup_context_has_io_config(c) || cgroup_context_has_blockio_config(c))
1412 10 : mask |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO;
1413 :
1414 579 : if (c->memory_accounting ||
1415 115 : c->memory_limit != CGROUP_LIMIT_MAX ||
1416 55 : unit_has_unified_memory_config(u))
1417 524 : mask |= CGROUP_MASK_MEMORY;
1418 :
1419 579 : if (c->device_allow ||
1420 579 : c->device_policy != CGROUP_AUTO)
1421 0 : mask |= CGROUP_MASK_DEVICES | CGROUP_MASK_BPF_DEVICES;
1422 :
1423 579 : if (c->tasks_accounting ||
1424 65 : c->tasks_max != CGROUP_LIMIT_MAX)
1425 514 : mask |= CGROUP_MASK_PIDS;
1426 :
1427 579 : return CGROUP_MASK_EXTEND_JOINED(mask);
1428 : }
1429 :
1430 579 : static CGroupMask unit_get_bpf_mask(Unit *u) {
1431 579 : CGroupMask mask = 0;
1432 :
1433 : /* Figure out which controllers we need, based on the cgroup context, possibly taking into account children
1434 : * too. */
1435 :
1436 579 : if (unit_get_needs_bpf_firewall(u))
1437 0 : mask |= CGROUP_MASK_BPF_FIREWALL;
1438 :
1439 579 : return mask;
1440 : }
1441 :
1442 1587 : CGroupMask unit_get_own_mask(Unit *u) {
1443 : CGroupContext *c;
1444 :
1445 : /* Returns the mask of controllers the unit needs for itself. If a unit is not properly loaded, return an empty
1446 : * mask, as we shouldn't reflect it in the cgroup hierarchy then. */
1447 :
1448 1587 : if (u->load_state != UNIT_LOADED)
1449 186 : return 0;
1450 :
1451 1401 : c = unit_get_cgroup_context(u);
1452 1401 : if (!c)
1453 822 : return 0;
1454 :
1455 579 : return (unit_get_cgroup_mask(u) | unit_get_bpf_mask(u) | unit_get_delegate_mask(u)) & ~unit_get_ancestor_disable_mask(u);
1456 : }
1457 :
1458 1769 : CGroupMask unit_get_delegate_mask(Unit *u) {
1459 : CGroupContext *c;
1460 :
1461 : /* If delegation is turned on, then turn on selected controllers, unless we are on the legacy hierarchy and the
1462 : * process we fork into is known to drop privileges, and hence shouldn't get access to the controllers.
1463 : *
1464 : * Note that on the unified hierarchy it is safe to delegate controllers to unprivileged services. */
1465 :
1466 1769 : if (!unit_cgroup_delegate(u))
1467 1769 : return 0;
1468 :
1469 0 : if (cg_all_unified() <= 0) {
1470 : ExecContext *e;
1471 :
1472 0 : e = unit_get_exec_context(u);
1473 0 : if (e && !exec_context_maintains_privileges(e))
1474 0 : return 0;
1475 : }
1476 :
1477 0 : assert_se(c = unit_get_cgroup_context(u));
1478 0 : return CGROUP_MASK_EXTEND_JOINED(c->delegate_controllers);
1479 : }
1480 :
1481 1633 : CGroupMask unit_get_members_mask(Unit *u) {
1482 1633 : assert(u);
1483 :
1484 : /* Returns the mask of controllers all of the unit's children require, merged */
1485 :
1486 1633 : if (u->cgroup_members_mask_valid)
1487 1253 : return u->cgroup_members_mask; /* Use cached value if possible */
1488 :
1489 380 : u->cgroup_members_mask = 0;
1490 :
1491 380 : if (u->type == UNIT_SLICE) {
1492 : void *v;
1493 : Unit *member;
1494 : Iterator i;
1495 :
1496 362 : HASHMAP_FOREACH_KEY(v, member, u->dependencies[UNIT_BEFORE], i) {
1497 346 : if (UNIT_DEREF(member->slice) == u)
1498 343 : u->cgroup_members_mask |= unit_get_subtree_mask(member); /* note that this calls ourselves again, for the children */
1499 : }
1500 : }
1501 :
1502 380 : u->cgroup_members_mask_valid = true;
1503 380 : return u->cgroup_members_mask;
1504 : }
1505 :
1506 44 : CGroupMask unit_get_siblings_mask(Unit *u) {
1507 44 : assert(u);
1508 :
1509 : /* Returns the mask of controllers all of the unit's siblings
1510 : * require, i.e. the members mask of the unit's parent slice
1511 : * if there is one. */
1512 :
1513 44 : if (UNIT_ISSET(u->slice))
1514 34 : return unit_get_members_mask(UNIT_DEREF(u->slice));
1515 :
1516 10 : return unit_get_subtree_mask(u); /* we are the top-level slice */
1517 : }
1518 :
1519 1240 : CGroupMask unit_get_disable_mask(Unit *u) {
1520 : CGroupContext *c;
1521 :
1522 1240 : c = unit_get_cgroup_context(u);
1523 1240 : if (!c)
1524 0 : return 0;
1525 :
1526 1240 : return c->disable_controllers;
1527 : }
1528 :
1529 1240 : CGroupMask unit_get_ancestor_disable_mask(Unit *u) {
1530 : CGroupMask mask;
1531 :
1532 1240 : assert(u);
1533 1240 : mask = unit_get_disable_mask(u);
1534 :
1535 : /* Returns the mask of controllers which are marked as forcibly
1536 : * disabled in any ancestor unit or the unit in question. */
1537 :
1538 1240 : if (UNIT_ISSET(u->slice))
1539 621 : mask |= unit_get_ancestor_disable_mask(UNIT_DEREF(u->slice));
1540 :
1541 1240 : return mask;
1542 : }
1543 :
1544 353 : CGroupMask unit_get_subtree_mask(Unit *u) {
1545 :
1546 : /* Returns the mask of this subtree, meaning of the group
1547 : * itself and its children. */
1548 :
1549 353 : return unit_get_own_mask(u) | unit_get_members_mask(u);
1550 : }
1551 :
1552 28 : CGroupMask unit_get_target_mask(Unit *u) {
1553 : CGroupMask mask;
1554 :
1555 : /* This returns the cgroup mask of all controllers to enable
1556 : * for a specific cgroup, i.e. everything it needs itself,
1557 : * plus all that its children need, plus all that its siblings
1558 : * need. This is primarily useful on the legacy cgroup
1559 : * hierarchy, where we need to duplicate each cgroup in each
1560 : * hierarchy that shall be enabled for it. */
1561 :
1562 28 : mask = unit_get_own_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
1563 :
1564 28 : if (mask & CGROUP_MASK_BPF_FIREWALL & ~u->manager->cgroup_supported)
1565 0 : emit_bpf_firewall_warning(u);
1566 :
1567 28 : mask &= u->manager->cgroup_supported;
1568 28 : mask &= ~unit_get_ancestor_disable_mask(u);
1569 :
1570 28 : return mask;
1571 : }
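            : /* Illustrative standalone sketch (editorial addition, not part of the upstream
            :  * file): how a target mask is composed from the own/members/siblings masks and
            :  * then filtered by what the manager supports and what ancestors disable. The
            :  * flag values below are made up for the example; they are not the real
            :  * CGroupMask constants. Compile separately, e.g. `cc -o mask-demo mask-demo.c`. */
            : #include <inttypes.h>
            : #include <stdint.h>
            : #include <stdio.h>
            :
            : enum { DEMO_CPU = 1u << 0, DEMO_MEMORY = 1u << 1, DEMO_PIDS = 1u << 2 };
            :
            : int main(void) {
            :         uint32_t own       = DEMO_CPU,                /* what this unit itself needs        */
            :                  members   = DEMO_MEMORY,             /* what its children need             */
            :                  siblings  = DEMO_PIDS,               /* what units in the same slice need  */
            :                  supported = DEMO_CPU | DEMO_MEMORY | DEMO_PIDS,
            :                  disabled  = DEMO_PIDS;               /* DisableControllers= up the tree    */
            :
            :         uint32_t target = (own | members | siblings) & supported & ~disabled;
            :         printf("target mask: %#" PRIx32 "\n", target);  /* 0x3 == DEMO_CPU|DEMO_MEMORY */
            :         return 0;
            : }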
1572 :
1573 12 : CGroupMask unit_get_enable_mask(Unit *u) {
1574 : CGroupMask mask;
1575 :
1576 : /* This returns the cgroup mask of all controllers to enable
1577 : * for the children of a specific cgroup. This is primarily
1578 : * useful for the unified cgroup hierarchy, where each cgroup
1579 : * controls which controllers are enabled for its children. */
1580 :
1581 12 : mask = unit_get_members_mask(u);
1582 12 : mask &= u->manager->cgroup_supported;
1583 12 : mask &= ~unit_get_ancestor_disable_mask(u);
1584 :
1585 12 : return mask;
1586 : }
1587 :
1588 2289 : void unit_invalidate_cgroup_members_masks(Unit *u) {
1589 2289 : assert(u);
1590 :
1591 : /* Recursively invalidate the members mask cache all the way up the tree */
1592 2289 : u->cgroup_members_mask_valid = false;
1593 :
1594 2289 : if (UNIT_ISSET(u->slice))
1595 313 : unit_invalidate_cgroup_members_masks(UNIT_DEREF(u->slice));
1596 2289 : }
1597 :
1598 0 : const char *unit_get_realized_cgroup_path(Unit *u, CGroupMask mask) {
1599 :
1600 : /* Returns the realized cgroup path of the specified unit where all specified controllers are available. */
1601 :
1602 0 : while (u) {
1603 :
1604 0 : if (u->cgroup_path &&
1605 0 : u->cgroup_realized &&
1606 0 : FLAGS_SET(u->cgroup_realized_mask, mask))
1607 0 : return u->cgroup_path;
1608 :
1609 0 : u = UNIT_DEREF(u->slice);
1610 : }
1611 :
1612 0 : return NULL;
1613 : }
1614 :
1615 0 : static const char *migrate_callback(CGroupMask mask, void *userdata) {
1616 0 : return unit_get_realized_cgroup_path(userdata, mask);
1617 : }
1618 :
1619 12 : char *unit_default_cgroup_path(const Unit *u) {
1620 12 : _cleanup_free_ char *escaped = NULL, *slice = NULL;
1621 : int r;
1622 :
1623 12 : assert(u);
1624 :
1625 12 : if (unit_has_name(u, SPECIAL_ROOT_SLICE))
1626 6 : return strdup(u->manager->cgroup_root);
1627 :
1628 6 : if (UNIT_ISSET(u->slice) && !unit_has_name(UNIT_DEREF(u->slice), SPECIAL_ROOT_SLICE)) {
1629 0 : r = cg_slice_to_path(UNIT_DEREF(u->slice)->id, &slice);
1630 0 : if (r < 0)
1631 0 : return NULL;
1632 : }
1633 :
1634 6 : escaped = cg_escape(u->id);
1635 6 : if (!escaped)
1636 0 : return NULL;
1637 :
1638 6 : return path_join(empty_to_root(u->manager->cgroup_root), slice, escaped);
1639 : }
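            : /* Worked example (editorial addition): for a hypothetical unit "foo.service"
            :  * living in "bar-baz.slice" under an empty cgroup root, cg_slice_to_path()
            :  * would expand the slice name into the nested "bar.slice/bar-baz.slice"
            :  * directory, the unit name would be escaped, and the resulting default path
            :  * would be "/bar.slice/bar-baz.slice/foo.service". */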
1640 :
1641 12 : int unit_set_cgroup_path(Unit *u, const char *path) {
1642 12 : _cleanup_free_ char *p = NULL;
1643 : int r;
1644 :
1645 12 : assert(u);
1646 :
1647 12 : if (streq_ptr(u->cgroup_path, path))
1648 0 : return 0;
1649 :
1650 12 : if (path) {
1651 12 : p = strdup(path);
1652 12 : if (!p)
1653 0 : return -ENOMEM;
1654 : }
1655 :
1656 12 : if (p) {
1657 12 : r = hashmap_put(u->manager->cgroup_unit, p, u);
1658 12 : if (r < 0)
1659 0 : return r;
1660 : }
1661 :
1662 12 : unit_release_cgroup(u);
1663 12 : u->cgroup_path = TAKE_PTR(p);
1664 :
1665 12 : return 1;
1666 : }
1667 :
1668 6 : int unit_watch_cgroup(Unit *u) {
1669 6 : _cleanup_free_ char *events = NULL;
1670 : int r;
1671 :
1672 6 : assert(u);
1673 :
1674 : /* Watches the "cgroup.events" attribute of this unit's cgroup for "empty" events, but only if
1675 : * cgroupv2 is available. */
1676 :
1677 6 : if (!u->cgroup_path)
1678 0 : return 0;
1679 :
1680 6 : if (u->cgroup_control_inotify_wd >= 0)
1681 0 : return 0;
1682 :
1683 : /* Only applies to the unified hierarchy */
1684 6 : r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
1685 6 : if (r < 0)
1686 0 : return log_error_errno(r, "Failed to determine whether the name=systemd hierarchy is unified: %m");
1687 6 : if (r == 0)
1688 0 : return 0;
1689 :
1690 : /* No point in watching the top-level slice, it's never going to run empty. */
1691 6 : if (unit_has_name(u, SPECIAL_ROOT_SLICE))
1692 6 : return 0;
1693 :
1694 0 : r = hashmap_ensure_allocated(&u->manager->cgroup_control_inotify_wd_unit, &trivial_hash_ops);
1695 0 : if (r < 0)
1696 0 : return log_oom();
1697 :
1698 0 : r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.events", &events);
1699 0 : if (r < 0)
1700 0 : return log_oom();
1701 :
1702 0 : u->cgroup_control_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, events, IN_MODIFY);
1703 0 : if (u->cgroup_control_inotify_wd < 0) {
1704 :
1705 0 : if (errno == ENOENT) /* If the directory is already gone we don't need to track it, so this
1706 : * is not an error */
1707 0 : return 0;
1708 :
1709 0 : return log_unit_error_errno(u, errno, "Failed to add control inotify watch descriptor for control group %s: %m", u->cgroup_path);
1710 : }
1711 :
1712 0 : r = hashmap_put(u->manager->cgroup_control_inotify_wd_unit, INT_TO_PTR(u->cgroup_control_inotify_wd), u);
1713 0 : if (r < 0)
1714 0 : return log_unit_error_errno(u, r, "Failed to add control inotify watch descriptor to hash map: %m");
1715 :
1716 0 : return 0;
1717 : }
1718 :
1719 6 : int unit_watch_cgroup_memory(Unit *u) {
1720 6 : _cleanup_free_ char *events = NULL;
1721 : CGroupContext *c;
1722 : int r;
1723 :
1724 6 : assert(u);
1725 :
1726 : /* Watches the "memory.events" attribute of this unit's cgroup for "oom_kill" events, but only if
1727 : * cgroupv2 is available. */
1728 :
1729 6 : if (!u->cgroup_path)
1730 0 : return 0;
1731 :
1732 6 : c = unit_get_cgroup_context(u);
1733 6 : if (!c)
1734 0 : return 0;
1735 :
1736 : /* The "memory.events" attribute is only available if the memory controller is on. Let's hence tie
1737 : * this to memory accounting; in a way, watching for OOM kills is a form of memory accounting after
1738 : * all. */
1739 6 : if (!c->memory_accounting)
1740 0 : return 0;
1741 :
1742 : /* Don't watch inner nodes, as the kernel doesn't report oom_kill events recursively currently, and
1743 : * we also don't want to generate a log message for each parent cgroup of a process. */
1744 6 : if (u->type == UNIT_SLICE)
1745 6 : return 0;
1746 :
1747 0 : if (u->cgroup_memory_inotify_wd >= 0)
1748 0 : return 0;
1749 :
1750 : /* Only applies to the unified hierarchy */
1751 0 : r = cg_all_unified();
1752 0 : if (r < 0)
1753 0 : return log_error_errno(r, "Failed to determine whether the memory controller is unified: %m");
1754 0 : if (r == 0)
1755 0 : return 0;
1756 :
1757 0 : r = hashmap_ensure_allocated(&u->manager->cgroup_memory_inotify_wd_unit, &trivial_hash_ops);
1758 0 : if (r < 0)
1759 0 : return log_oom();
1760 :
1761 0 : r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "memory.events", &events);
1762 0 : if (r < 0)
1763 0 : return log_oom();
1764 :
1765 0 : u->cgroup_memory_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, events, IN_MODIFY);
1766 0 : if (u->cgroup_memory_inotify_wd < 0) {
1767 :
1768 0 : if (errno == ENOENT) /* If the directory is already gone we don't need to track it, so this
1769 : * is not an error */
1770 0 : return 0;
1771 :
1772 0 : return log_unit_error_errno(u, errno, "Failed to add memory inotify watch descriptor for control group %s: %m", u->cgroup_path);
1773 : }
1774 :
1775 0 : r = hashmap_put(u->manager->cgroup_memory_inotify_wd_unit, INT_TO_PTR(u->cgroup_memory_inotify_wd), u);
1776 0 : if (r < 0)
1777 0 : return log_unit_error_errno(u, r, "Failed to add memory inotify watch descriptor to hash map: %m");
1778 :
1779 0 : return 0;
1780 : }
1781 :
1782 18 : int unit_pick_cgroup_path(Unit *u) {
1783 18 : _cleanup_free_ char *path = NULL;
1784 : int r;
1785 :
1786 18 : assert(u);
1787 :
1788 18 : if (u->cgroup_path)
1789 6 : return 0;
1790 :
1791 12 : if (!UNIT_HAS_CGROUP_CONTEXT(u))
1792 0 : return -EINVAL;
1793 :
1794 12 : path = unit_default_cgroup_path(u);
1795 12 : if (!path)
1796 0 : return log_oom();
1797 :
1798 12 : r = unit_set_cgroup_path(u, path);
1799 12 : if (r == -EEXIST)
1800 0 : return log_unit_error_errno(u, r, "Control group %s exists already.", path);
1801 12 : if (r < 0)
1802 0 : return log_unit_error_errno(u, r, "Failed to set unit's control group path to %s: %m", path);
1803 :
1804 12 : return 0;
1805 : }
1806 :
1807 12 : static int unit_create_cgroup(
1808 : Unit *u,
1809 : CGroupMask target_mask,
1810 : CGroupMask enable_mask,
1811 : ManagerState state) {
1812 :
1813 : bool created;
1814 : int r;
1815 :
1816 12 : assert(u);
1817 :
1818 12 : if (!UNIT_HAS_CGROUP_CONTEXT(u))
1819 0 : return 0;
1820 :
1821 : /* Figure out our cgroup path */
1822 12 : r = unit_pick_cgroup_path(u);
1823 12 : if (r < 0)
1824 0 : return r;
1825 :
1826 : /* First, create our own group */
1827 12 : r = cg_create_everywhere(u->manager->cgroup_supported, target_mask, u->cgroup_path);
1828 12 : if (r < 0)
1829 6 : return log_unit_error_errno(u, r, "Failed to create cgroup %s: %m", u->cgroup_path);
1830 6 : created = r;
1831 :
1832 : /* Start watching it */
1833 6 : (void) unit_watch_cgroup(u);
1834 6 : (void) unit_watch_cgroup_memory(u);
1835 :
1836 : /* Preserve enabled controllers in delegated units, adjust others. */
1837 6 : if (created || !u->cgroup_realized || !unit_cgroup_delegate(u)) {
1838 6 : CGroupMask result_mask = 0;
1839 :
1840 : /* Enable all controllers we need */
1841 6 : r = cg_enable_everywhere(u->manager->cgroup_supported, enable_mask, u->cgroup_path, &result_mask);
1842 6 : if (r < 0)
1843 0 : log_unit_warning_errno(u, r, "Failed to enable/disable controllers on cgroup %s, ignoring: %m", u->cgroup_path);
1844 :
1845 : /* If we just turned off a controller, this might release the controller for our parent too, let's
1846 : * enqueue the parent for re-realization in that case again. */
1847 6 : if (UNIT_ISSET(u->slice)) {
1848 : CGroupMask turned_off;
1849 :
1850 0 : turned_off = (u->cgroup_realized ? u->cgroup_enabled_mask & ~result_mask : 0);
1851 0 : if (turned_off != 0) {
1852 : Unit *parent;
1853 :
1854 : /* Force the parent to propagate the enable mask to the kernel again, by invalidating
1855 : * the controller we just turned off. */
1856 :
1857 0 : for (parent = UNIT_DEREF(u->slice); parent; parent = UNIT_DEREF(parent->slice))
1858 0 : unit_invalidate_cgroup(parent, turned_off);
1859 : }
1860 : }
1861 :
1862 : /* Remember what's actually enabled now */
1863 6 : u->cgroup_enabled_mask = result_mask;
1864 : }
1865 :
1866 : /* Keep track that this is now realized */
1867 6 : u->cgroup_realized = true;
1868 6 : u->cgroup_realized_mask = target_mask;
1869 :
1870 6 : if (u->type != UNIT_SLICE && !unit_cgroup_delegate(u)) {
1871 :
1872 : /* Then, possibly move things over, but not if
1873 : * subgroups may contain processes, which is the case
1874 : * for slice and delegation units. */
1875 0 : r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
1876 0 : if (r < 0)
1877 0 : log_unit_warning_errno(u, r, "Failed to migrate cgroup from to %s, ignoring: %m", u->cgroup_path);
1878 : }
1879 :
1880 : /* Set attributes */
1881 6 : cgroup_context_apply(u, target_mask, state);
1882 6 : cgroup_xattr_apply(u);
1883 :
1884 6 : return 0;
1885 : }
1886 :
1887 0 : static int unit_attach_pid_to_cgroup_via_bus(Unit *u, pid_t pid, const char *suffix_path) {
1888 0 : _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
1889 : char *pp;
1890 : int r;
1891 :
1892 0 : assert(u);
1893 :
1894 0 : if (MANAGER_IS_SYSTEM(u->manager))
1895 0 : return -EINVAL;
1896 :
1897 0 : if (!u->manager->system_bus)
1898 0 : return -EIO;
1899 :
1900 0 : if (!u->cgroup_path)
1901 0 : return -EINVAL;
1902 :
1903 : /* Determine this unit's cgroup path relative to our cgroup root */
1904 0 : pp = path_startswith(u->cgroup_path, u->manager->cgroup_root);
1905 0 : if (!pp)
1906 0 : return -EINVAL;
1907 :
1908 0 : pp = strjoina("/", pp, suffix_path);
1909 0 : path_simplify(pp, false);
1910 :
1911 0 : r = sd_bus_call_method(u->manager->system_bus,
1912 : "org.freedesktop.systemd1",
1913 : "/org/freedesktop/systemd1",
1914 : "org.freedesktop.systemd1.Manager",
1915 : "AttachProcessesToUnit",
1916 : &error, NULL,
1917 : "ssau",
1918 : NULL /* empty unit name means client's unit, i.e. us */, pp, 1, (uint32_t) pid);
1919 0 : if (r < 0)
1920 0 : return log_unit_debug_errno(u, r, "Failed to attach unit process " PID_FMT " via the bus: %s", pid, bus_error_message(&error, r));
1921 :
1922 0 : return 0;
1923 : }
1924 :
1925 0 : int unit_attach_pids_to_cgroup(Unit *u, Set *pids, const char *suffix_path) {
1926 : CGroupMask delegated_mask;
1927 : const char *p;
1928 : Iterator i;
1929 : void *pidp;
1930 : int r, q;
1931 :
1932 0 : assert(u);
1933 :
1934 0 : if (!UNIT_HAS_CGROUP_CONTEXT(u))
1935 0 : return -EINVAL;
1936 :
1937 0 : if (set_isempty(pids))
1938 0 : return 0;
1939 :
1940 : /* Load any custom firewall BPF programs here once, to test whether they exist and are actually loadable.
1941 : * Fail early here, since later errors in the call chain from unit_realize_cgroup() to cgroup_context_apply() are ignored. */
1942 0 : r = bpf_firewall_load_custom(u);
1943 0 : if (r < 0)
1944 0 : return r;
1945 :
1946 0 : r = unit_realize_cgroup(u);
1947 0 : if (r < 0)
1948 0 : return r;
1949 :
1950 0 : if (isempty(suffix_path))
1951 0 : p = u->cgroup_path;
1952 : else
1953 0 : p = prefix_roota(u->cgroup_path, suffix_path);
1954 :
1955 0 : delegated_mask = unit_get_delegate_mask(u);
1956 :
1957 0 : r = 0;
1958 0 : SET_FOREACH(pidp, pids, i) {
1959 0 : pid_t pid = PTR_TO_PID(pidp);
1960 : CGroupController c;
1961 :
1962 : /* First, attach the PID to the main cgroup hierarchy */
1963 0 : q = cg_attach(SYSTEMD_CGROUP_CONTROLLER, p, pid);
1964 0 : if (q < 0) {
1965 0 : log_unit_debug_errno(u, q, "Couldn't move process " PID_FMT " to requested cgroup '%s': %m", pid, p);
1966 :
1967 0 : if (MANAGER_IS_USER(u->manager) && IN_SET(q, -EPERM, -EACCES)) {
1968 : int z;
1969 :
1970 : /* If we are in a user instance, and we can't move the process ourselves due to
1971 : * permission problems, let's ask the system instance about it instead. Since it's more
1972 : * privileged it might be able to move the process across the leaves of a subtree whose
1973 : * top node is not owned by us. */
1974 :
1975 0 : z = unit_attach_pid_to_cgroup_via_bus(u, pid, suffix_path);
1976 0 : if (z < 0)
1977 0 : log_unit_debug_errno(u, z, "Couldn't move process " PID_FMT " to requested cgroup '%s' via the system bus either: %m", pid, p);
1978 : else
1979 0 : continue; /* When the bus thing worked via the bus we are fully done for this PID. */
1980 : }
1981 :
1982 0 : if (r >= 0)
1983 0 : r = q; /* Remember first error */
1984 :
1985 0 : continue;
1986 : }
1987 :
1988 0 : q = cg_all_unified();
1989 0 : if (q < 0)
1990 0 : return q;
1991 0 : if (q > 0)
1992 0 : continue;
1993 :
1994 : /* In the legacy hierarchy, attach the process to the requested cgroup if possible, and if not, to the
1995 : * innermost realized one */
1996 :
1997 0 : for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
1998 0 : CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
1999 : const char *realized;
2000 :
2001 0 : if (!(u->manager->cgroup_supported & bit))
2002 0 : continue;
2003 :
2004 : /* If this controller is delegated and realized, honour the caller's request for the cgroup suffix. */
2005 0 : if (delegated_mask & u->cgroup_realized_mask & bit) {
2006 0 : q = cg_attach(cgroup_controller_to_string(c), p, pid);
2007 0 : if (q >= 0)
2008 0 : continue; /* Success! */
2009 :
2010 0 : log_unit_debug_errno(u, q, "Failed to attach PID " PID_FMT " to requested cgroup %s in controller %s, falling back to unit's cgroup: %m",
2011 : pid, p, cgroup_controller_to_string(c));
2012 : }
2013 :
2014 : /* So this controller is either not delegated or not realized, or something else weird happened. In
2015 : * that case let's attach the PID at least to the closest cgroup up the tree that is
2016 : * realized. */
2017 0 : realized = unit_get_realized_cgroup_path(u, bit);
2018 0 : if (!realized)
2019 0 : continue; /* Not even realized in the root slice? Then let's not bother */
2020 :
2021 0 : q = cg_attach(cgroup_controller_to_string(c), realized, pid);
2022 0 : if (q < 0)
2023 0 : log_unit_debug_errno(u, q, "Failed to attach PID " PID_FMT " to realized cgroup %s in controller %s, ignoring: %m",
2024 : pid, realized, cgroup_controller_to_string(c));
2025 : }
2026 : }
2027 :
2028 0 : return r;
2029 : }
2030 :
2031 6 : static bool unit_has_mask_realized(
2032 : Unit *u,
2033 : CGroupMask target_mask,
2034 : CGroupMask enable_mask) {
2035 :
2036 6 : assert(u);
2037 :
2038 : /* Returns true if this unit is fully realized. We check four things:
2039 : *
2040 : * 1. Whether the cgroup was created at all
2041 : * 2. Whether the cgroup was created in all the hierarchies we need it to be created in (in case of cgroup v1)
2042 : * 3. Whether the cgroup has all the right controllers enabled (in case of cgroup v2)
2043 : * 4. Whether the invalidation mask is currently zero
2044 : *
2045 : * If you wonder why we mask the target realization and enable mask with CGROUP_MASK_V1/CGROUP_MASK_V2: note
2046 : * that there are three sets of bitmasks: CGROUP_MASK_V1 (for real cgroup v1 controllers), CGROUP_MASK_V2 (for
2047 : * real cgroup v2 controllers) and CGROUP_MASK_BPF (for BPF-based pseudo-controllers). Now, cgroup_realized_mask
2048 : * only matters for cgroup v1 controllers, and cgroup_enabled_mask is only used for cgroup v2, and if they
2049 : * differ in the others, we don't really care. (After all, the cgroup_enabled_mask tracks which controllers are
2050 : * enabled through cgroup.subtree_control, and since the BPF pseudo-controllers don't show up there, they
2051 : * simply don't matter.) */
2052 :
2053 6 : return u->cgroup_realized &&
2054 0 : ((u->cgroup_realized_mask ^ target_mask) & CGROUP_MASK_V1) == 0 &&
2055 6 : ((u->cgroup_enabled_mask ^ enable_mask) & CGROUP_MASK_V2) == 0 &&
2056 0 : u->cgroup_invalidated_mask == 0;
2057 : }
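            : /* Illustrative standalone sketch (editorial addition, not part of the upstream
            :  * file): the XOR checks above only compare the bits relevant for each
            :  * hierarchy, so stray differences in the other bit ranges are ignored. The
            :  * constants below are placeholders, not the real CGROUP_MASK_V1/V2 values. */
            : #include <stdbool.h>
            : #include <stdint.h>
            :
            : #define DEMO_MASK_V1 0x00ffu   /* pretend: bits owned by cgroup v1 controllers */
            : #define DEMO_MASK_V2 0xff00u   /* pretend: bits owned by cgroup v2 controllers */
            :
            : static inline bool demo_masks_realized(uint32_t realized, uint32_t target,
            :                                        uint32_t enabled, uint32_t enable) {
            :         /* A bit that differs outside the range it belongs to does not count. */
            :         return ((realized ^ target) & DEMO_MASK_V1) == 0 &&
            :                ((enabled ^ enable) & DEMO_MASK_V2) == 0;
            : }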
2058 :
2059 0 : static bool unit_has_mask_disables_realized(
2060 : Unit *u,
2061 : CGroupMask target_mask,
2062 : CGroupMask enable_mask) {
2063 :
2064 0 : assert(u);
2065 :
2066 : /* Returns true if all controllers which should be disabled are indeed disabled.
2067 : *
2068 : * Unlike unit_has_mask_realized, we don't care what was enabled, only that anything we want to remove is
2069 : * already removed. */
2070 :
2071 0 : return !u->cgroup_realized ||
2072 0 : (FLAGS_SET(u->cgroup_realized_mask, target_mask & CGROUP_MASK_V1) &&
2073 0 : FLAGS_SET(u->cgroup_enabled_mask, enable_mask & CGROUP_MASK_V2));
2074 : }
2075 :
2076 6 : static bool unit_has_mask_enables_realized(
2077 : Unit *u,
2078 : CGroupMask target_mask,
2079 : CGroupMask enable_mask) {
2080 :
2081 6 : assert(u);
2082 :
2083 : /* Returns true if all controllers which should be enabled are indeed enabled.
2084 : *
2085 : * Unlike unit_has_mask_realized, we don't care about the controllers that are not present, only that anything
2086 : * we want to add is already added. */
2087 :
2088 6 : return u->cgroup_realized &&
2089 6 : ((u->cgroup_realized_mask | target_mask) & CGROUP_MASK_V1) == (u->cgroup_realized_mask & CGROUP_MASK_V1) &&
2090 0 : ((u->cgroup_enabled_mask | enable_mask) & CGROUP_MASK_V2) == (u->cgroup_enabled_mask & CGROUP_MASK_V2);
2091 : }
2092 :
2093 183 : void unit_add_to_cgroup_realize_queue(Unit *u) {
2094 183 : assert(u);
2095 :
2096 183 : if (u->in_cgroup_realize_queue)
2097 167 : return;
2098 :
2099 16 : LIST_PREPEND(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
2100 16 : u->in_cgroup_realize_queue = true;
2101 : }
2102 :
2103 6 : static void unit_remove_from_cgroup_realize_queue(Unit *u) {
2104 6 : assert(u);
2105 :
2106 6 : if (!u->in_cgroup_realize_queue)
2107 6 : return;
2108 :
2109 0 : LIST_REMOVE(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
2110 0 : u->in_cgroup_realize_queue = false;
2111 : }
2112 :
2113 : /* Controllers can only be enabled breadth-first, from the root of the
2114 : * hierarchy downwards to the unit in question. */
2115 6 : static int unit_realize_cgroup_now_enable(Unit *u, ManagerState state) {
2116 : CGroupMask target_mask, enable_mask, new_target_mask, new_enable_mask;
2117 : int r;
2118 :
2119 6 : assert(u);
2120 :
2121 : /* First go deal with this unit's parent, or we won't be able to enable
2122 : * any new controllers at this layer. */
2123 6 : if (UNIT_ISSET(u->slice)) {
2124 0 : r = unit_realize_cgroup_now_enable(UNIT_DEREF(u->slice), state);
2125 0 : if (r < 0)
2126 0 : return r;
2127 : }
2128 :
2129 6 : target_mask = unit_get_target_mask(u);
2130 6 : enable_mask = unit_get_enable_mask(u);
2131 :
2132 : /* We can only enable in this direction, don't try to disable anything.
2133 : */
2134 6 : if (unit_has_mask_enables_realized(u, target_mask, enable_mask))
2135 0 : return 0;
2136 :
2137 6 : new_target_mask = u->cgroup_realized_mask | target_mask;
2138 6 : new_enable_mask = u->cgroup_enabled_mask | enable_mask;
2139 :
2140 6 : return unit_create_cgroup(u, new_target_mask, new_enable_mask, state);
2141 : }
2142 :
2143 : /* Controllers can only be disabled depth-first, from the leaves of the
2144 : * hierarchy upwards to the unit in question. */
2145 6 : static int unit_realize_cgroup_now_disable(Unit *u, ManagerState state) {
2146 : Iterator i;
2147 : Unit *m;
2148 : void *v;
2149 :
2150 6 : assert(u);
2151 :
2152 6 : if (u->type != UNIT_SLICE)
2153 6 : return 0;
2154 :
2155 0 : HASHMAP_FOREACH_KEY(v, m, u->dependencies[UNIT_BEFORE], i) {
2156 : CGroupMask target_mask, enable_mask, new_target_mask, new_enable_mask;
2157 : int r;
2158 :
2159 0 : if (UNIT_DEREF(m->slice) != u)
2160 0 : continue;
2161 :
2162 : /* The cgroup for this unit might not actually be fully
2163 : * realised yet, in which case it isn't holding any controllers
2164 : * open anyway. */
2165 0 : if (!m->cgroup_path)
2166 0 : continue;
2167 :
2168 : /* We must disable those below us first in order to release the
2169 : * controller. */
2170 0 : if (m->type == UNIT_SLICE)
2171 0 : (void) unit_realize_cgroup_now_disable(m, state);
2172 :
2173 0 : target_mask = unit_get_target_mask(m);
2174 0 : enable_mask = unit_get_enable_mask(m);
2175 :
2176 : /* We can only disable in this direction, don't try to enable
2177 : * anything. */
2178 0 : if (unit_has_mask_disables_realized(m, target_mask, enable_mask))
2179 0 : continue;
2180 :
2181 0 : new_target_mask = m->cgroup_realized_mask & target_mask;
2182 0 : new_enable_mask = m->cgroup_enabled_mask & enable_mask;
2183 :
2184 0 : r = unit_create_cgroup(m, new_target_mask, new_enable_mask, state);
2185 0 : if (r < 0)
2186 0 : return r;
2187 : }
2188 :
2189 0 : return 0;
2190 : }
2191 :
2192 : /* Check if necessary controllers and attributes for a unit are in place.
2193 : *
2194 : * - If so, do nothing.
2195 : * - If not, create paths, move processes over, and set attributes.
2196 : *
2197 : * Controllers can only be *enabled* in a breadth-first way, and *disabled* in
2198 : * a depth-first way. As such the process looks like this:
2199 : *
2200 : * Suppose we have a cgroup hierarchy which looks like this:
2201 : *
2202 : * root
2203 : * / \
2204 : * / \
2205 : * / \
2206 : * a b
2207 : * / \ / \
2208 : * / \ / \
2209 : * c d e f
2210 : * / \ / \ / \ / \
2211 : * h i j k l m n o
2212 : *
2213 : * 1. We want to realise cgroup "d" now.
2214 : * 2. cgroup "a" has DisableControllers=cpu in the associated unit.
2215 : * 3. cgroup "k" just started requesting the memory controller.
2216 : *
2217 : * To make this work we must do the following in order:
2218 : *
2219 : * 1. Disable CPU controller in k, j
2220 : * 2. Disable CPU controller in d
2221 : * 3. Enable memory controller in root
2222 : * 4. Enable memory controller in a
2223 : * 5. Enable memory controller in d
2224 : * 6. Enable memory controller in k
2225 : *
2226 : * Notice that we need to touch j in one direction, but not the other. We also
2227 : * don't go beyond d when disabling -- it's up to "a" to get realized if it
2228 : * wants to disable further. The basic rules are therefore:
2229 : *
2230 : * - If you're disabling something, you need to realise all of the cgroups from
2231 : * your recursive descendants to the root. This starts from the leaves.
2232 : * - If you're enabling something, you need to realise from the root cgroup
2233 : * downwards, but you don't need to iterate your recursive descendants.
2234 : *
2235 : * Returns 0 on success and < 0 on failure. */
2236 6 : static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
2237 : CGroupMask target_mask, enable_mask;
2238 : int r;
2239 :
2240 6 : assert(u);
2241 :
2242 6 : unit_remove_from_cgroup_realize_queue(u);
2243 :
2244 6 : target_mask = unit_get_target_mask(u);
2245 6 : enable_mask = unit_get_enable_mask(u);
2246 :
2247 6 : if (unit_has_mask_realized(u, target_mask, enable_mask))
2248 0 : return 0;
2249 :
2250 : /* Disable controllers below us, if there are any */
2251 6 : r = unit_realize_cgroup_now_disable(u, state);
2252 6 : if (r < 0)
2253 0 : return r;
2254 :
2255 : /* Enable controllers above us, if there are any */
2256 6 : if (UNIT_ISSET(u->slice)) {
2257 6 : r = unit_realize_cgroup_now_enable(UNIT_DEREF(u->slice), state);
2258 6 : if (r < 0)
2259 0 : return r;
2260 : }
2261 :
2262 : /* Now actually deal with the cgroup we were trying to realise and set attributes */
2263 6 : r = unit_create_cgroup(u, target_mask, enable_mask, state);
2264 6 : if (r < 0)
2265 6 : return r;
2266 :
2267 : /* Now, reset the invalidation mask */
2268 0 : u->cgroup_invalidated_mask = 0;
2269 0 : return 0;
2270 : }
2271 :
2272 0 : unsigned manager_dispatch_cgroup_realize_queue(Manager *m) {
2273 : ManagerState state;
2274 0 : unsigned n = 0;
2275 : Unit *i;
2276 : int r;
2277 :
2278 0 : assert(m);
2279 :
2280 0 : state = manager_state(m);
2281 :
2282 0 : while ((i = m->cgroup_realize_queue)) {
2283 0 : assert(i->in_cgroup_realize_queue);
2284 :
2285 0 : if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(i))) {
2286 : /* Maybe things changed, and the unit is not actually active anymore? */
2287 0 : unit_remove_from_cgroup_realize_queue(i);
2288 0 : continue;
2289 : }
2290 :
2291 0 : r = unit_realize_cgroup_now(i, state);
2292 0 : if (r < 0)
2293 0 : log_warning_errno(r, "Failed to realize cgroups for queued unit %s, ignoring: %m", i->id);
2294 :
2295 0 : n++;
2296 : }
2297 :
2298 0 : return n;
2299 : }
2300 :
2301 6 : static void unit_add_siblings_to_cgroup_realize_queue(Unit *u) {
2302 : Unit *slice;
2303 :
2304 : /* This adds the siblings of the specified unit and the
2305 : * siblings of all parent units to the cgroup queue. (But
2306 : * neither the specified unit itself nor the parents.) */
2307 :
2308 12 : while ((slice = UNIT_DEREF(u->slice))) {
2309 : Iterator i;
2310 : Unit *m;
2311 : void *v;
2312 :
2313 12 : HASHMAP_FOREACH_KEY(v, m, u->dependencies[UNIT_BEFORE], i) {
2314 : /* Skip units that have a dependency on the slice
2315 : * but aren't actually in it. */
2316 6 : if (UNIT_DEREF(m->slice) != slice)
2317 6 : continue;
2318 :
2319 : /* No point in doing cgroup application for units
2320 : * without active processes. */
2321 0 : if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
2322 0 : continue;
2323 :
2324 : /* If the unit doesn't need any new controllers
2325 : * and has current ones realized, it doesn't need
2326 : * any changes. */
2327 0 : if (unit_has_mask_realized(m,
2328 : unit_get_target_mask(m),
2329 : unit_get_enable_mask(m)))
2330 0 : continue;
2331 :
2332 0 : unit_add_to_cgroup_realize_queue(m);
2333 : }
2334 :
2335 6 : u = slice;
2336 : }
2337 6 : }
2338 :
2339 6 : int unit_realize_cgroup(Unit *u) {
2340 6 : assert(u);
2341 :
2342 6 : if (!UNIT_HAS_CGROUP_CONTEXT(u))
2343 0 : return 0;
2344 :
2345 : /* So, here's the deal: when realizing the cgroups for this
2346 : * unit, we need to first create all parents, but there's more
2347 : * actually: for the weight-based controllers we also need to
2348 : * make sure that all our siblings (i.e. units that are in the
2349 : * same slice as we are) have cgroups, too. Otherwise, things
2350 : * would become very uneven as each of their processes would
2351 : * get as many resources as our whole group together. This call
2352 : * will synchronously create the parent cgroups, but will
2353 : * defer work on the siblings to the next event loop
2354 : * iteration. */
2355 :
2356 : /* Add all sibling slices to the cgroup queue. */
2357 6 : unit_add_siblings_to_cgroup_realize_queue(u);
2358 :
2359 : /* And realize this one now (and apply the values) */
2360 6 : return unit_realize_cgroup_now(u, manager_state(u->manager));
2361 : }
2362 :
2363 2172 : void unit_release_cgroup(Unit *u) {
2364 2172 : assert(u);
2365 :
2366 : /* Forgets all cgroup details for this cgroup — but does *not* destroy the cgroup. This is hence OK to call
2367 : * when we close down everything for reexecution, where we really want to leave the cgroup in place. */
2368 :
2369 2172 : if (u->cgroup_path) {
2370 12 : (void) hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
2371 12 : u->cgroup_path = mfree(u->cgroup_path);
2372 : }
2373 :
2374 2172 : if (u->cgroup_control_inotify_wd >= 0) {
2375 0 : if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_control_inotify_wd) < 0)
2376 0 : log_unit_debug_errno(u, errno, "Failed to remove cgroup control inotify watch %i for %s, ignoring: %m", u->cgroup_control_inotify_wd, u->id);
2377 :
2378 0 : (void) hashmap_remove(u->manager->cgroup_control_inotify_wd_unit, INT_TO_PTR(u->cgroup_control_inotify_wd));
2379 0 : u->cgroup_control_inotify_wd = -1;
2380 : }
2381 :
2382 2172 : if (u->cgroup_memory_inotify_wd >= 0) {
2383 0 : if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_memory_inotify_wd) < 0)
2384 0 : log_unit_debug_errno(u, errno, "Failed to remove cgroup memory inotify watch %i for %s, ignoring: %m", u->cgroup_memory_inotify_wd, u->id);
2385 :
2386 0 : (void) hashmap_remove(u->manager->cgroup_memory_inotify_wd_unit, INT_TO_PTR(u->cgroup_memory_inotify_wd));
2387 0 : u->cgroup_memory_inotify_wd = -1;
2388 : }
2389 2172 : }
2390 :
2391 7 : void unit_prune_cgroup(Unit *u) {
2392 : int r;
2393 : bool is_root_slice;
2394 :
2395 7 : assert(u);
2396 :
2397 : /* Removes the cgroup, if empty and possible, and stops watching it. */
2398 :
2399 7 : if (!u->cgroup_path)
2400 7 : return;
2401 :
2402 0 : (void) unit_get_cpu_usage(u, NULL); /* Cache the last CPU usage value before we destroy the cgroup */
2403 :
2404 0 : is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE);
2405 :
2406 0 : r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !is_root_slice);
2407 0 : if (r < 0)
2408 : /* One reason we could have failed here is, that the cgroup still contains a process.
2409 : * However, if the cgroup becomes removable at a later time, it might be removed when
2410 : * the containing slice is stopped. So even if we failed now, this unit shouldn't assume
2411 : * that the cgroup is still realized the next time it is started. Do not return early
2412 : * on error, continue cleanup. */
2413 0 : log_unit_full(u, r == -EBUSY ? LOG_DEBUG : LOG_WARNING, r, "Failed to destroy cgroup %s, ignoring: %m", u->cgroup_path);
2414 :
2415 0 : if (is_root_slice)
2416 0 : return;
2417 :
2418 0 : unit_release_cgroup(u);
2419 :
2420 0 : u->cgroup_realized = false;
2421 0 : u->cgroup_realized_mask = 0;
2422 0 : u->cgroup_enabled_mask = 0;
2423 :
2424 0 : u->bpf_device_control_installed = bpf_program_unref(u->bpf_device_control_installed);
2425 : }
2426 :
2427 0 : int unit_search_main_pid(Unit *u, pid_t *ret) {
2428 0 : _cleanup_fclose_ FILE *f = NULL;
2429 0 : pid_t pid = 0, npid;
2430 : int r;
2431 :
2432 0 : assert(u);
2433 0 : assert(ret);
2434 :
2435 0 : if (!u->cgroup_path)
2436 0 : return -ENXIO;
2437 :
2438 0 : r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f);
2439 0 : if (r < 0)
2440 0 : return r;
2441 :
2442 0 : while (cg_read_pid(f, &npid) > 0) {
2443 :
2444 0 : if (npid == pid)
2445 0 : continue;
2446 :
2447 0 : if (pid_is_my_child(npid) == 0)
2448 0 : continue;
2449 :
2450 0 : if (pid != 0)
2451 : /* Dang, there's more than one daemonized PID
2452 : in this group, so we don't know what process
2453 : is the main process. */
2454 :
2455 0 : return -ENODATA;
2456 :
2457 0 : pid = npid;
2458 : }
2459 :
2460 0 : *ret = pid;
2461 0 : return 0;
2462 : }
2463 :
2464 0 : static int unit_watch_pids_in_path(Unit *u, const char *path) {
2465 0 : _cleanup_closedir_ DIR *d = NULL;
2466 0 : _cleanup_fclose_ FILE *f = NULL;
2467 0 : int ret = 0, r;
2468 :
2469 0 : assert(u);
2470 0 : assert(path);
2471 :
2472 0 : r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, path, &f);
2473 0 : if (r < 0)
2474 0 : ret = r;
2475 : else {
2476 : pid_t pid;
2477 :
2478 0 : while ((r = cg_read_pid(f, &pid)) > 0) {
2479 0 : r = unit_watch_pid(u, pid, false);
2480 0 : if (r < 0 && ret >= 0)
2481 0 : ret = r;
2482 : }
2483 :
2484 0 : if (r < 0 && ret >= 0)
2485 0 : ret = r;
2486 : }
2487 :
2488 0 : r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d);
2489 0 : if (r < 0) {
2490 0 : if (ret >= 0)
2491 0 : ret = r;
2492 : } else {
2493 : char *fn;
2494 :
2495 0 : while ((r = cg_read_subgroup(d, &fn)) > 0) {
2496 0 : _cleanup_free_ char *p = NULL;
2497 :
2498 0 : p = path_join(empty_to_root(path), fn);
2499 0 : free(fn);
2500 :
2501 0 : if (!p)
2502 0 : return -ENOMEM;
2503 :
2504 0 : r = unit_watch_pids_in_path(u, p);
2505 0 : if (r < 0 && ret >= 0)
2506 0 : ret = r;
2507 : }
2508 :
2509 0 : if (r < 0 && ret >= 0)
2510 0 : ret = r;
2511 : }
2512 :
2513 0 : return ret;
2514 : }
2515 :
2516 0 : int unit_synthesize_cgroup_empty_event(Unit *u) {
2517 : int r;
2518 :
2519 0 : assert(u);
2520 :
2521 : /* Enqueue a synthetic cgroup empty event if this unit doesn't watch any PIDs anymore. This is compatibility
2522 : * support for non-unified systems where notifications aren't reliable, and hence need to take whatever we can
2523 : * get as notification source as soon as we stopped having any useful PIDs to watch for. */
2524 :
2525 0 : if (!u->cgroup_path)
2526 0 : return -ENOENT;
2527 :
2528 0 : r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
2529 0 : if (r < 0)
2530 0 : return r;
2531 0 : if (r > 0) /* On unified we have reliable notifications, and don't need this */
2532 0 : return 0;
2533 :
2534 0 : if (!set_isempty(u->pids))
2535 0 : return 0;
2536 :
2537 0 : unit_add_to_cgroup_empty_queue(u);
2538 0 : return 0;
2539 : }
2540 :
2541 0 : int unit_watch_all_pids(Unit *u) {
2542 : int r;
2543 :
2544 0 : assert(u);
2545 :
2546 : /* Adds all PIDs from our cgroup to the set of PIDs we
2547 : * watch. This is fallback logic for cases where we do not
2548 : * get reliable cgroup empty notifications: we try to use
2549 : * SIGCHLD as replacement. */
2550 :
2551 0 : if (!u->cgroup_path)
2552 0 : return -ENOENT;
2553 :
2554 0 : r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
2555 0 : if (r < 0)
2556 0 : return r;
2557 0 : if (r > 0) /* On unified we can use proper notifications */
2558 0 : return 0;
2559 :
2560 0 : return unit_watch_pids_in_path(u, u->cgroup_path);
2561 : }
2562 :
2563 0 : static int on_cgroup_empty_event(sd_event_source *s, void *userdata) {
2564 0 : Manager *m = userdata;
2565 : Unit *u;
2566 : int r;
2567 :
2568 0 : assert(s);
2569 0 : assert(m);
2570 :
2571 0 : u = m->cgroup_empty_queue;
2572 0 : if (!u)
2573 0 : return 0;
2574 :
2575 0 : assert(u->in_cgroup_empty_queue);
2576 0 : u->in_cgroup_empty_queue = false;
2577 0 : LIST_REMOVE(cgroup_empty_queue, m->cgroup_empty_queue, u);
2578 :
2579 0 : if (m->cgroup_empty_queue) {
2580 : /* More stuff queued, let's make sure we remain enabled */
2581 0 : r = sd_event_source_set_enabled(s, SD_EVENT_ONESHOT);
2582 0 : if (r < 0)
2583 0 : log_debug_errno(r, "Failed to reenable cgroup empty event source, ignoring: %m");
2584 : }
2585 :
2586 0 : unit_add_to_gc_queue(u);
2587 :
2588 0 : if (UNIT_VTABLE(u)->notify_cgroup_empty)
2589 0 : UNIT_VTABLE(u)->notify_cgroup_empty(u);
2590 :
2591 0 : return 0;
2592 : }
2593 :
2594 0 : void unit_add_to_cgroup_empty_queue(Unit *u) {
2595 : int r;
2596 :
2597 0 : assert(u);
2598 :
2599 : /* Note that there are four different ways in which cgroup empty events reach us:
2600 : *
2601 : * 1. On the unified hierarchy we get an inotify event on the cgroup
2602 : *
2603 : * 2. On the legacy hierarchy, when running in system mode, we get a datagram on the cgroup agent socket
2604 : *
2605 : * 3. On the legacy hierarchy, when running in user mode, we get a D-Bus signal on the system bus
2606 : *
2607 : * 4. On the legacy hierarchy, in service units we start watching all processes of the cgroup for SIGCHLD as
2608 : * soon as we get one SIGCHLD, to deal with unreliable cgroup notifications.
2609 : *
2610 : * Regardless which way we got the notification, we'll verify it here, and then add it to a separate
2611 : * queue. This queue will be dispatched at a lower priority than the SIGCHLD handler, so that we always use
2612 : * SIGCHLD if we can get it first, and only use the cgroup empty notifications if there's no SIGCHLD pending
2613 : * (which might happen if the cgroup doesn't contain processes that are our own child, which is typically the
2614 : * case for scope units). */
2615 :
2616 0 : if (u->in_cgroup_empty_queue)
2617 0 : return;
2618 :
2619 : /* Let's verify that the cgroup is really empty */
2620 0 : if (!u->cgroup_path)
2621 0 : return;
2622 0 : r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
2623 0 : if (r < 0) {
2624 0 : log_unit_debug_errno(u, r, "Failed to determine whether cgroup %s is empty: %m", u->cgroup_path);
2625 0 : return;
2626 : }
2627 0 : if (r == 0)
2628 0 : return;
2629 :
2630 0 : LIST_PREPEND(cgroup_empty_queue, u->manager->cgroup_empty_queue, u);
2631 0 : u->in_cgroup_empty_queue = true;
2632 :
2633 : /* Trigger the defer event */
2634 0 : r = sd_event_source_set_enabled(u->manager->cgroup_empty_event_source, SD_EVENT_ONESHOT);
2635 0 : if (r < 0)
2636 0 : log_debug_errno(r, "Failed to enable cgroup empty event source: %m");
2637 : }
2638 :
2639 0 : int unit_check_oom(Unit *u) {
2640 0 : _cleanup_free_ char *oom_kill = NULL;
2641 : bool increased;
2642 : uint64_t c;
2643 : int r;
2644 :
2645 0 : if (!u->cgroup_path)
2646 0 : return 0;
2647 :
2648 0 : r = cg_get_keyed_attribute("memory", u->cgroup_path, "memory.events", STRV_MAKE("oom_kill"), &oom_kill);
2649 0 : if (r < 0)
2650 0 : return log_unit_debug_errno(u, r, "Failed to read oom_kill field of memory.events cgroup attribute: %m");
2651 :
2652 0 : r = safe_atou64(oom_kill, &c);
2653 0 : if (r < 0)
2654 0 : return log_unit_debug_errno(u, r, "Failed to parse oom_kill field: %m");
2655 :
2656 0 : increased = c > u->oom_kill_last;
2657 0 : u->oom_kill_last = c;
2658 :
2659 0 : if (!increased)
2660 0 : return 0;
2661 :
2662 0 : log_struct(LOG_NOTICE,
2663 : "MESSAGE_ID=" SD_MESSAGE_UNIT_OUT_OF_MEMORY_STR,
2664 : LOG_UNIT_ID(u),
2665 : LOG_UNIT_INVOCATION_ID(u),
2666 : LOG_UNIT_MESSAGE(u, "A process of this unit has been killed by the OOM killer."));
2667 :
2668 0 : if (UNIT_VTABLE(u)->notify_cgroup_oom)
2669 0 : UNIT_VTABLE(u)->notify_cgroup_oom(u);
2670 :
2671 0 : return 1;
2672 : }
2673 :
2674 0 : static int on_cgroup_oom_event(sd_event_source *s, void *userdata) {
2675 0 : Manager *m = userdata;
2676 : Unit *u;
2677 : int r;
2678 :
2679 0 : assert(s);
2680 0 : assert(m);
2681 :
2682 0 : u = m->cgroup_oom_queue;
2683 0 : if (!u)
2684 0 : return 0;
2685 :
2686 0 : assert(u->in_cgroup_oom_queue);
2687 0 : u->in_cgroup_oom_queue = false;
2688 0 : LIST_REMOVE(cgroup_oom_queue, m->cgroup_oom_queue, u);
2689 :
2690 0 : if (m->cgroup_oom_queue) {
2691 : /* More stuff queued, let's make sure we remain enabled */
2692 0 : r = sd_event_source_set_enabled(s, SD_EVENT_ONESHOT);
2693 0 : if (r < 0)
2694 0 : log_debug_errno(r, "Failed to reenable cgroup oom event source, ignoring: %m");
2695 : }
2696 :
2697 0 : (void) unit_check_oom(u);
2698 0 : return 0;
2699 : }
2700 :
2701 0 : static void unit_add_to_cgroup_oom_queue(Unit *u) {
2702 : int r;
2703 :
2704 0 : assert(u);
2705 :
2706 0 : if (u->in_cgroup_oom_queue)
2707 0 : return;
2708 0 : if (!u->cgroup_path)
2709 0 : return;
2710 :
2711 0 : LIST_PREPEND(cgroup_oom_queue, u->manager->cgroup_oom_queue, u);
2712 0 : u->in_cgroup_oom_queue = true;
2713 :
2714 : /* Trigger the defer event */
2715 0 : if (!u->manager->cgroup_oom_event_source) {
2716 0 : _cleanup_(sd_event_source_unrefp) sd_event_source *s = NULL;
2717 :
2718 0 : r = sd_event_add_defer(u->manager->event, &s, on_cgroup_oom_event, u->manager);
2719 0 : if (r < 0) {
2720 0 : log_error_errno(r, "Failed to create cgroup oom event source: %m");
2721 0 : return;
2722 : }
2723 :
2724 0 : r = sd_event_source_set_priority(s, SD_EVENT_PRIORITY_NORMAL-8);
2725 0 : if (r < 0) {
2726 0 : log_error_errno(r, "Failed to set priority of cgroup oom event source: %m");
2727 0 : return;
2728 : }
2729 :
2730 0 : (void) sd_event_source_set_description(s, "cgroup-oom");
2731 0 : u->manager->cgroup_oom_event_source = TAKE_PTR(s);
2732 : }
2733 :
2734 0 : r = sd_event_source_set_enabled(u->manager->cgroup_oom_event_source, SD_EVENT_ONESHOT);
2735 0 : if (r < 0)
2736 0 : log_error_errno(r, "Failed to enable cgroup oom event source: %m");
2737 : }
2738 :
2739 0 : static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
2740 0 : Manager *m = userdata;
2741 :
2742 0 : assert(s);
2743 0 : assert(fd >= 0);
2744 0 : assert(m);
2745 :
2746 0 : for (;;) {
2747 : union inotify_event_buffer buffer;
2748 : struct inotify_event *e;
2749 : ssize_t l;
2750 :
2751 0 : l = read(fd, &buffer, sizeof(buffer));
2752 0 : if (l < 0) {
2753 0 : if (IN_SET(errno, EINTR, EAGAIN))
2754 0 : return 0;
2755 :
2756 0 : return log_error_errno(errno, "Failed to read control group inotify events: %m");
2757 : }
2758 :
2759 0 : FOREACH_INOTIFY_EVENT(e, buffer, l) {
2760 : Unit *u;
2761 :
2762 0 : if (e->wd < 0)
2763 : /* Queue overflow has no watch descriptor */
2764 0 : continue;
2765 :
2766 0 : if (e->mask & IN_IGNORED)
2767 : /* The watch was just removed */
2768 0 : continue;
2769 :
2770 : /* Note that inotify might deliver events for a watch even after it was removed,
2771 : * because it was queued before the removal. Let's ignore this here safely. */
2772 :
2773 0 : u = hashmap_get(m->cgroup_control_inotify_wd_unit, INT_TO_PTR(e->wd));
2774 0 : if (u)
2775 0 : unit_add_to_cgroup_empty_queue(u);
2776 :
2777 0 : u = hashmap_get(m->cgroup_memory_inotify_wd_unit, INT_TO_PTR(e->wd));
2778 0 : if (u)
2779 0 : unit_add_to_cgroup_oom_queue(u);
2780 : }
2781 : }
2782 : }
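            : /* Illustrative standalone sketch (editorial addition, not part of the upstream
            :  * file): the same non-blocking inotify drain pattern with plain libc calls,
            :  * watching a single file for modifications. The watched path is a placeholder.
            :  * Compile separately on Linux, e.g. `cc -o inotify-demo inotify-demo.c`. */
            : #include <errno.h>
            : #include <stdio.h>
            : #include <sys/inotify.h>
            : #include <unistd.h>
            :
            : int main(void) {
            :         char buffer[4096] __attribute__((aligned(__alignof__(struct inotify_event))));
            :         int fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
            :         if (fd < 0)
            :                 return 1;
            :         if (inotify_add_watch(fd, "/tmp/watched-file", IN_MODIFY) < 0)  /* placeholder path */
            :                 return 1;
            :
            :         for (;;) {
            :                 ssize_t l = read(fd, buffer, sizeof(buffer));
            :                 if (l < 0) {
            :                         if (errno == EAGAIN || errno == EINTR)
            :                                 break;          /* queue drained for now */
            :                         return 1;
            :                 }
            :                 for (char *p = buffer; p < buffer + l; ) {
            :                         struct inotify_event *e = (struct inotify_event *) p;
            :                         printf("event: wd=%d mask=%#x\n", e->wd, (unsigned) e->mask);
            :                         p += sizeof(struct inotify_event) + e->len;
            :                 }
            :         }
            :         close(fd);
            :         return 0;
            : }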
2783 :
2784 11 : static int cg_bpf_mask_supported(CGroupMask *ret) {
2785 11 : CGroupMask mask = 0;
2786 : int r;
2787 :
2788 : /* BPF-based firewall */
2789 11 : r = bpf_firewall_supported();
2790 11 : if (r > 0)
2791 0 : mask |= CGROUP_MASK_BPF_FIREWALL;
2792 :
2793 : /* BPF-based device access control */
2794 11 : r = bpf_devices_supported();
2795 11 : if (r > 0)
2796 0 : mask |= CGROUP_MASK_BPF_DEVICES;
2797 :
2798 11 : *ret = mask;
2799 11 : return 0;
2800 : }
2801 :
2802 11 : int manager_setup_cgroup(Manager *m) {
2803 11 : _cleanup_free_ char *path = NULL;
2804 : const char *scope_path;
2805 : CGroupController c;
2806 : int r, all_unified;
2807 : CGroupMask mask;
2808 : char *e;
2809 :
2810 11 : assert(m);
2811 :
2812 : /* 1. Determine hierarchy */
2813 11 : m->cgroup_root = mfree(m->cgroup_root);
2814 11 : r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
2815 11 : if (r < 0)
2816 0 : return log_error_errno(r, "Cannot determine cgroup we are running in: %m");
2817 :
2818 : /* Chop off the init scope, if we are already located in it */
2819 11 : e = endswith(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
2820 :
2821 : /* LEGACY: Also chop off the system slice if we are in
2822 : * it. This is to support live upgrades from older systemd
2823 : * versions where PID 1 was moved there. Also see
2824 : * cg_get_root_path(). */
2825 11 : if (!e && MANAGER_IS_SYSTEM(m)) {
2826 0 : e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
2827 0 : if (!e)
2828 0 : e = endswith(m->cgroup_root, "/system"); /* even more legacy */
2829 : }
2830 11 : if (e)
2831 0 : *e = 0;
2832 :
2833 : /* And make sure to store away the root value without trailing slash, even for the root dir, so that we can
2834 : * easily prepend it everywhere. */
2835 11 : delete_trailing_chars(m->cgroup_root, "/");
2836 :
2837 : /* 2. Show data */
2838 11 : r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
2839 11 : if (r < 0)
2840 0 : return log_error_errno(r, "Cannot find cgroup mount point: %m");
2841 :
2842 11 : r = cg_unified_flush();
2843 11 : if (r < 0)
2844 0 : return log_error_errno(r, "Couldn't determine if we are running in the unified hierarchy: %m");
2845 :
2846 11 : all_unified = cg_all_unified();
2847 11 : if (all_unified < 0)
2848 0 : return log_error_errno(all_unified, "Couldn't determine whether we are in all unified mode: %m");
2849 11 : if (all_unified > 0)
2850 0 : log_debug("Unified cgroup hierarchy is located at %s.", path);
2851 : else {
2852 11 : r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
2853 11 : if (r < 0)
2854 0 : return log_error_errno(r, "Failed to determine whether systemd's own controller is in unified mode: %m");
2855 11 : if (r > 0)
2856 11 : log_debug("Unified cgroup hierarchy is located at %s. Controllers are on legacy hierarchies.", path);
2857 : else
2858 0 : log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER_LEGACY ". File system hierarchy is at %s.", path);
2859 : }
2860 :
2861 : /* 3. Allocate cgroup empty defer event source */
2862 11 : m->cgroup_empty_event_source = sd_event_source_unref(m->cgroup_empty_event_source);
2863 11 : r = sd_event_add_defer(m->event, &m->cgroup_empty_event_source, on_cgroup_empty_event, m);
2864 11 : if (r < 0)
2865 0 : return log_error_errno(r, "Failed to create cgroup empty event source: %m");
2866 :
2867 : /* Schedule cgroup empty checks early, but after having processed service notification messages or
2868 : * SIGCHLD signals, so that a cgroup running empty is always just the last safety net of
2869 : * notification, and we have first collected the metadata that the notification and SIGCHLD machinery offers. */
2870 11 : r = sd_event_source_set_priority(m->cgroup_empty_event_source, SD_EVENT_PRIORITY_NORMAL-5);
2871 11 : if (r < 0)
2872 0 : return log_error_errno(r, "Failed to set priority of cgroup empty event source: %m");
2873 :
2874 11 : r = sd_event_source_set_enabled(m->cgroup_empty_event_source, SD_EVENT_OFF);
2875 11 : if (r < 0)
2876 0 : return log_error_errno(r, "Failed to disable cgroup empty event source: %m");
2877 :
2878 11 : (void) sd_event_source_set_description(m->cgroup_empty_event_source, "cgroup-empty");
2879 :
2880 : /* 4. Install notifier inotify object, or agent */
2881 11 : if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
2882 :
2883 : /* In the unified hierarchy we can get cgroup empty notifications via inotify. */
2884 :
2885 11 : m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
2886 11 : safe_close(m->cgroup_inotify_fd);
2887 :
2888 11 : m->cgroup_inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
2889 11 : if (m->cgroup_inotify_fd < 0)
2890 0 : return log_error_errno(errno, "Failed to create control group inotify object: %m");
2891 :
2892 11 : r = sd_event_add_io(m->event, &m->cgroup_inotify_event_source, m->cgroup_inotify_fd, EPOLLIN, on_cgroup_inotify_event, m);
2893 11 : if (r < 0)
2894 0 : return log_error_errno(r, "Failed to watch control group inotify object: %m");
2895 :
2896 : /* Process cgroup empty notifications early. Note that when this event is dispatched it'll
2897 : * just add the unit to a cgroup empty queue, hence let's run earlier than that. Also see
2898 : * handling of cgroup agent notifications, for the classic cgroup hierarchy support. */
2899 11 : r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_NORMAL-9);
2900 11 : if (r < 0)
2901 0 : return log_error_errno(r, "Failed to set priority of inotify event source: %m");
2902 :
2903 11 : (void) sd_event_source_set_description(m->cgroup_inotify_event_source, "cgroup-inotify");
2904 :
2905 0 : } else if (MANAGER_IS_SYSTEM(m) && manager_owns_host_root_cgroup(m) && !MANAGER_IS_TEST_RUN(m)) {
2906 :
2907 : /* On the legacy hierarchy we only get notifications via cgroup agents. (Which isn't really reliable,
2908 : * since it does not generate events when control groups with children run empty.) */
2909 :
2910 0 : r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH);
2911 0 : if (r < 0)
2912 0 : log_warning_errno(r, "Failed to install release agent, ignoring: %m");
2913 0 : else if (r > 0)
2914 0 : log_debug("Installed release agent.");
2915 0 : else if (r == 0)
2916 0 : log_debug("Release agent already installed.");
2917 : }
2918 :
2919 : /* 5. Make sure we are in the special "init.scope" unit in the root slice. */
2920 55 : scope_path = strjoina(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
2921 11 : r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
2922 11 : if (r >= 0) {
2923 : /* Also, move all other userspace processes remaining in the root cgroup into that scope. */
2924 0 : r = cg_migrate(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
2925 0 : if (r < 0)
2926 0 : log_warning_errno(r, "Couldn't move remaining userspace processes, ignoring: %m");
2927 :
2928 : /* 6. And pin it, so that it cannot be unmounted */
2929 0 : safe_close(m->pin_cgroupfs_fd);
2930 0 : m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
2931 0 : if (m->pin_cgroupfs_fd < 0)
2932 0 : return log_error_errno(errno, "Failed to open pin file: %m");
2933 :
2934 11 : } else if (!MANAGER_IS_TEST_RUN(m))
2935 0 : return log_error_errno(r, "Failed to create %s control group: %m", scope_path);
2936 :
2937 : /* 7. Always enable hierarchical support if it exists... */
2938 11 : if (!all_unified && !MANAGER_IS_TEST_RUN(m))
2939 0 : (void) cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
2940 :
2941 : /* 8. Figure out which controllers are supported */
2942 11 : r = cg_mask_supported(&m->cgroup_supported);
2943 11 : if (r < 0)
2944 0 : return log_error_errno(r, "Failed to determine supported controllers: %m");
2945 :
2946 : /* 9. Figure out which bpf-based pseudo-controllers are supported */
2947 11 : r = cg_bpf_mask_supported(&mask);
2948 11 : if (r < 0)
2949 0 : return log_error_errno(r, "Failed to determine supported bpf-based pseudo-controllers: %m");
2950 11 : m->cgroup_supported |= mask;
2951 :
2952 : /* 10. Log which controllers are supported */
2953 110 : for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++)
2954 99 : log_debug("Controller '%s' supported: %s", cgroup_controller_to_string(c), yes_no(m->cgroup_supported & CGROUP_CONTROLLER_TO_MASK(c)));
2955 :
2956 11 : return 0;
2957 : }
2958 :
2959 14 : void manager_shutdown_cgroup(Manager *m, bool delete) {
2960 14 : assert(m);
2961 :
2962 : /* We can't really delete the group, since we are in it. But
2963 : * let's trim it. */
2964 14 : if (delete && m->cgroup_root && m->test_run_flags != MANAGER_TEST_RUN_MINIMAL)
2965 0 : (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);
2966 :
2967 14 : m->cgroup_empty_event_source = sd_event_source_unref(m->cgroup_empty_event_source);
2968 :
2969 14 : m->cgroup_control_inotify_wd_unit = hashmap_free(m->cgroup_control_inotify_wd_unit);
2970 14 : m->cgroup_memory_inotify_wd_unit = hashmap_free(m->cgroup_memory_inotify_wd_unit);
2971 :
2972 14 : m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
2973 14 : m->cgroup_inotify_fd = safe_close(m->cgroup_inotify_fd);
2974 :
2975 14 : m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);
2976 :
2977 14 : m->cgroup_root = mfree(m->cgroup_root);
2978 14 : }
2979 :
2980 0 : Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
2981 : char *p;
2982 : Unit *u;
2983 :
2984 0 : assert(m);
2985 0 : assert(cgroup);
2986 :
2987 0 : u = hashmap_get(m->cgroup_unit, cgroup);
2988 0 : if (u)
2989 0 : return u;
2990 :
2991 0 : p = strdupa(cgroup);
2992 0 : for (;;) {
2993 : char *e;
2994 :
2995 0 : e = strrchr(p, '/');
2996 0 : if (!e || e == p)
2997 0 : return hashmap_get(m->cgroup_unit, SPECIAL_ROOT_SLICE);
2998 :
2999 0 : *e = 0;
3000 :
3001 0 : u = hashmap_get(m->cgroup_unit, p);
3002 0 : if (u)
3003 0 : return u;
3004 : }
3005 : }
3006 :
3007 0 : Unit *manager_get_unit_by_pid_cgroup(Manager *m, pid_t pid) {
3008 0 : _cleanup_free_ char *cgroup = NULL;
3009 :
3010 0 : assert(m);
3011 :
3012 0 : if (!pid_is_valid(pid))
3013 0 : return NULL;
3014 :
3015 0 : if (cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup) < 0)
3016 0 : return NULL;
3017 :
3018 0 : return manager_get_unit_by_cgroup(m, cgroup);
3019 : }
3020 :
3021 0 : Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
3022 : Unit *u, **array;
3023 :
3024 0 : assert(m);
3025 :
3026 : /* Note that a process might be owned by multiple units; we return only one here, which is good enough for most
3027 : * cases, though not strictly correct. We prefer the one reported by cgroup membership, as that's the most
3028 : * relevant one: children of the process will be assigned to that one, too, before all else. */
3029 :
3030 0 : if (!pid_is_valid(pid))
3031 0 : return NULL;
3032 :
3033 0 : if (pid == getpid_cached())
3034 0 : return hashmap_get(m->units, SPECIAL_INIT_SCOPE);
3035 :
3036 0 : u = manager_get_unit_by_pid_cgroup(m, pid);
3037 0 : if (u)
3038 0 : return u;
3039 :
3040 0 : u = hashmap_get(m->watch_pids, PID_TO_PTR(pid));
3041 0 : if (u)
3042 0 : return u;
3043 :
3044 0 : array = hashmap_get(m->watch_pids, PID_TO_PTR(-pid));
3045 0 : if (array)
3046 0 : return array[0];
3047 :
3048 0 : return NULL;
3049 : }
3050 :
3051 0 : int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
3052 : Unit *u;
3053 :
3054 0 : assert(m);
3055 0 : assert(cgroup);
3056 :
3057 : /* Called on the legacy hierarchy whenever we get an explicit cgroup notification from the cgroup agent process
3058 : * or from the --system instance */
3059 :
3060 0 : log_debug("Got cgroup empty notification for: %s", cgroup);
3061 :
3062 0 : u = manager_get_unit_by_cgroup(m, cgroup);
3063 0 : if (!u)
3064 0 : return 0;
3065 :
3066 0 : unit_add_to_cgroup_empty_queue(u);
3067 0 : return 1;
3068 : }
3069 :
3070 0 : int unit_get_memory_current(Unit *u, uint64_t *ret) {
3071 0 : _cleanup_free_ char *v = NULL;
3072 : int r;
3073 :
3074 0 : assert(u);
3075 0 : assert(ret);
3076 :
3077 0 : if (!UNIT_CGROUP_BOOL(u, memory_accounting))
3078 0 : return -ENODATA;
3079 :
3080 0 : if (!u->cgroup_path)
3081 0 : return -ENODATA;
3082 :
3083 : /* The root cgroup doesn't expose this information, let's get it from /proc instead */
3084 0 : if (unit_has_host_root_cgroup(u))
3085 0 : return procfs_memory_get_used(ret);
3086 :
3087 0 : if ((u->cgroup_realized_mask & CGROUP_MASK_MEMORY) == 0)
3088 0 : return -ENODATA;
3089 :
3090 0 : r = cg_all_unified();
3091 0 : if (r < 0)
3092 0 : return r;
3093 0 : if (r > 0)
3094 0 : r = cg_get_attribute("memory", u->cgroup_path, "memory.current", &v);
3095 : else
3096 0 : r = cg_get_attribute("memory", u->cgroup_path, "memory.usage_in_bytes", &v);
3097 0 : if (r == -ENOENT)
3098 0 : return -ENODATA;
3099 0 : if (r < 0)
3100 0 : return r;
3101 :
3102 0 : return safe_atou64(v, ret);
3103 : }
3104 :
3105 0 : int unit_get_tasks_current(Unit *u, uint64_t *ret) {
3106 0 : _cleanup_free_ char *v = NULL;
3107 : int r;
3108 :
3109 0 : assert(u);
3110 0 : assert(ret);
3111 :
3112 0 : if (!UNIT_CGROUP_BOOL(u, tasks_accounting))
3113 0 : return -ENODATA;
3114 :
3115 0 : if (!u->cgroup_path)
3116 0 : return -ENODATA;
3117 :
3118 : /* The root cgroup doesn't expose this information, let's get it from /proc instead */
3119 0 : if (unit_has_host_root_cgroup(u))
3120 0 : return procfs_tasks_get_current(ret);
3121 :
3122 0 : if ((u->cgroup_realized_mask & CGROUP_MASK_PIDS) == 0)
3123 0 : return -ENODATA;
3124 :
3125 0 : r = cg_get_attribute("pids", u->cgroup_path, "pids.current", &v);
3126 0 : if (r == -ENOENT)
3127 0 : return -ENODATA;
3128 0 : if (r < 0)
3129 0 : return r;
3130 :
3131 0 : return safe_atou64(v, ret);
3132 : }
3133 :
3134 6 : static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) {
3135 6 : _cleanup_free_ char *v = NULL;
3136 : uint64_t ns;
3137 : int r;
3138 :
3139 6 : assert(u);
3140 6 : assert(ret);
3141 :
3142 6 : if (!u->cgroup_path)
3143 0 : return -ENODATA;
3144 :
3145 : /* The root cgroup doesn't expose this information, let's get it from /proc instead */
3146 6 : if (unit_has_host_root_cgroup(u))
3147 0 : return procfs_cpu_get_usage(ret);
3148 :
3149 : /* Requisite controllers for CPU accounting are not enabled */
3150 6 : if ((get_cpu_accounting_mask() & ~u->cgroup_realized_mask) != 0)
3151 6 : return -ENODATA;
3152 :
3153 0 : r = cg_all_unified();
3154 0 : if (r < 0)
3155 0 : return r;
3156 0 : if (r > 0) {
3157 0 : _cleanup_free_ char *val = NULL;
3158 : uint64_t us;
3159 :
3160 0 : r = cg_get_keyed_attribute("cpu", u->cgroup_path, "cpu.stat", STRV_MAKE("usage_usec"), &val);
3161 0 : if (IN_SET(r, -ENOENT, -ENXIO))
3162 0 : return -ENODATA;
3163 0 : if (r < 0)
3164 0 : return r;
3165 :
3166 0 : r = safe_atou64(val, &us);
3167 0 : if (r < 0)
3168 0 : return r;
3169 :
3170 0 : ns = us * NSEC_PER_USEC;
3171 : } else {
3172 0 : r = cg_get_attribute("cpuacct", u->cgroup_path, "cpuacct.usage", &v);
3173 0 : if (r == -ENOENT)
3174 0 : return -ENODATA;
3175 0 : if (r < 0)
3176 0 : return r;
3177 :
3178 0 : r = safe_atou64(v, &ns);
3179 0 : if (r < 0)
3180 0 : return r;
3181 : }
3182 :
3183 0 : *ret = ns;
3184 0 : return 0;
3185 : }
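/* For reference, the unified-hierarchy "cpu.stat" file is keyed, for example (values hypothetical):
 *
 *     usage_usec 473296
 *     user_usec 321042
 *     system_usec 152254
 *
 * cg_get_keyed_attribute() picks out the "usage_usec" field, which is then scaled to nanoseconds
 * above, while the legacy "cpuacct.usage" attribute already holds a single nanosecond counter. */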
3186 :
3187 7 : int unit_get_cpu_usage(Unit *u, nsec_t *ret) {
3188 : nsec_t ns;
3189 : int r;
3190 :
3191 7 : assert(u);
3192 :
3193 : /* Retrieve the current CPU usage counter. This will subtract the CPU counter taken when the unit was
3194 : * started. If the cgroup has been removed already, returns the last cached value. To cache the value, simply
 3195              :          * call this function with NULL as the return parameter. */
3196 :
3197 7 : if (!UNIT_CGROUP_BOOL(u, cpu_accounting))
3198 7 : return -ENODATA;
3199 :
3200 0 : r = unit_get_cpu_usage_raw(u, &ns);
3201 0 : if (r == -ENODATA && u->cpu_usage_last != NSEC_INFINITY) {
3202 : /* If we can't get the CPU usage anymore (because the cgroup was already removed, for example), use our
3203 : * cached value. */
3204 :
3205 0 : if (ret)
3206 0 : *ret = u->cpu_usage_last;
3207 0 : return 0;
3208 : }
3209 0 : if (r < 0)
3210 0 : return r;
3211 :
3212 0 : if (ns > u->cpu_usage_base)
3213 0 : ns -= u->cpu_usage_base;
3214 : else
3215 0 : ns = 0;
3216 :
3217 0 : u->cpu_usage_last = ns;
3218 0 : if (ret)
3219 0 : *ret = ns;
3220 :
3221 0 : return 0;
3222 : }
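/* Worked example of the saturated subtraction above (hypothetical numbers): with a raw counter of
 * 5000000 ns and a cpu_usage_base of 2000000 ns (sampled when the unit was started), 3000000 ns is
 * reported and cached in cpu_usage_last; should the raw value ever fall below the base (e.g. after a
 * counter reset), 0 is reported instead of a wrapped-around value. */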
3223 :
3224 28 : int unit_get_ip_accounting(
3225 : Unit *u,
3226 : CGroupIPAccountingMetric metric,
3227 : uint64_t *ret) {
3228 :
3229 : uint64_t value;
3230 : int fd, r;
3231 :
3232 28 : assert(u);
3233 28 : assert(metric >= 0);
3234 28 : assert(metric < _CGROUP_IP_ACCOUNTING_METRIC_MAX);
3235 28 : assert(ret);
3236 :
3237 28 : if (!UNIT_CGROUP_BOOL(u, ip_accounting))
3238 28 : return -ENODATA;
3239 :
3240 0 : fd = IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_INGRESS_PACKETS) ?
3241 0 : u->ip_accounting_ingress_map_fd :
3242 : u->ip_accounting_egress_map_fd;
3243 0 : if (fd < 0)
3244 0 : return -ENODATA;
3245 :
3246 0 : if (IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_EGRESS_BYTES))
3247 0 : r = bpf_firewall_read_accounting(fd, &value, NULL);
3248 : else
3249 0 : r = bpf_firewall_read_accounting(fd, NULL, &value);
3250 0 : if (r < 0)
3251 0 : return r;
3252 :
3253 : /* Add in additional metrics from a previous runtime. Note that when reexecing/reloading the daemon we compile
3254 : * all BPF programs and maps anew, but serialize the old counters. When deserializing we store them in the
3255 : * ip_accounting_extra[] field, and add them in here transparently. */
3256 :
3257 0 : *ret = value + u->ip_accounting_extra[metric];
3258 :
3259 0 : return r;
3260 : }
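/* Illustrative caller sketch (hypothetical use): each metric is read separately from the ingress or
 * egress BPF accounting map, with counters serialized over daemon reload/reexec folded in via
 * ip_accounting_extra[]:
 *
 *     uint64_t rx_bytes;
 *     if (unit_get_ip_accounting(u, CGROUP_IP_INGRESS_BYTES, &rx_bytes) >= 0)
 *         log_unit_debug(u, "IP traffic received: %" PRIu64 " bytes", rx_bytes);
 */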
3261 :
3262 6 : static int unit_get_io_accounting_raw(Unit *u, uint64_t ret[static _CGROUP_IO_ACCOUNTING_METRIC_MAX]) {
3263 : static const char *const field_names[_CGROUP_IO_ACCOUNTING_METRIC_MAX] = {
3264 : [CGROUP_IO_READ_BYTES] = "rbytes=",
3265 : [CGROUP_IO_WRITE_BYTES] = "wbytes=",
3266 : [CGROUP_IO_READ_OPERATIONS] = "rios=",
3267 : [CGROUP_IO_WRITE_OPERATIONS] = "wios=",
3268 : };
3269 6 : uint64_t acc[_CGROUP_IO_ACCOUNTING_METRIC_MAX] = {};
3270 6 : _cleanup_free_ char *path = NULL;
3271 6 : _cleanup_fclose_ FILE *f = NULL;
3272 : int r;
3273 :
3274 6 : assert(u);
3275 :
3276 6 : if (!u->cgroup_path)
3277 0 : return -ENODATA;
3278 :
3279 6 : if (unit_has_host_root_cgroup(u))
3280 0 : return -ENODATA; /* TODO: return useful data for the top-level cgroup */
3281 :
3282 6 : r = cg_all_unified();
3283 6 : if (r < 0)
3284 0 : return r;
3285 6 : if (r == 0) /* TODO: support cgroupv1 */
3286 6 : return -ENODATA;
3287 :
3288 0 : if (!FLAGS_SET(u->cgroup_realized_mask, CGROUP_MASK_IO))
3289 0 : return -ENODATA;
3290 :
3291 0 : r = cg_get_path("io", u->cgroup_path, "io.stat", &path);
3292 0 : if (r < 0)
3293 0 : return r;
3294 :
3295 0 : f = fopen(path, "re");
3296 0 : if (!f)
3297 0 : return -errno;
3298 :
3299 0 : for (;;) {
3300 0 : _cleanup_free_ char *line = NULL;
3301 : const char *p;
3302 :
3303 0 : r = read_line(f, LONG_LINE_MAX, &line);
3304 0 : if (r < 0)
3305 0 : return r;
3306 0 : if (r == 0)
3307 0 : break;
3308 :
3309 0 : p = line;
3310 0 : p += strcspn(p, WHITESPACE); /* Skip over device major/minor */
3311 0 : p += strspn(p, WHITESPACE); /* Skip over following whitespace */
3312 :
3313 0 : for (;;) {
3314 0 : _cleanup_free_ char *word = NULL;
3315 :
3316 0 : r = extract_first_word(&p, &word, NULL, EXTRACT_RETAIN_ESCAPE);
3317 0 : if (r < 0)
3318 0 : return r;
3319 0 : if (r == 0)
3320 0 : break;
3321 :
3322 0 : for (CGroupIOAccountingMetric i = 0; i < _CGROUP_IO_ACCOUNTING_METRIC_MAX; i++) {
3323 : const char *x;
3324 :
3325 0 : x = startswith(word, field_names[i]);
3326 0 : if (x) {
3327 : uint64_t w;
3328 :
3329 0 : r = safe_atou64(x, &w);
3330 0 : if (r < 0)
3331 0 : return r;
3332 :
3333 : /* Sum up the stats of all devices */
3334 0 : acc[i] += w;
3335 0 : break;
3336 : }
3337 : }
3338 : }
3339 : }
3340 :
3341 0 : memcpy(ret, acc, sizeof(acc));
3342 0 : return 0;
3343 : }
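/* Example "io.stat" contents the parser above copes with (device numbers and values hypothetical):
 *
 *     8:0 rbytes=1459200 wbytes=314773504 rios=192 wios=353 dbytes=0 dios=0
 *     8:16 rbytes=90112 wbytes=299008 rios=8 wios=20 dbytes=0 dios=0
 *
 * The leading major:minor device field is skipped, every "key=value" word is matched against
 * field_names[], and matching values are summed across all devices into acc[]. */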
3344 :
3345 28 : int unit_get_io_accounting(
3346 : Unit *u,
3347 : CGroupIOAccountingMetric metric,
3348 : bool allow_cache,
3349 : uint64_t *ret) {
3350 :
3351 : uint64_t raw[_CGROUP_IO_ACCOUNTING_METRIC_MAX];
3352 : int r;
 3353              :         /* Retrieve an IO accounting parameter. This will subtract the counter taken when the unit was started. */
3354 : /* Retrieve an IO account parameter. This will subtract the counter when the unit was started. */
3355 :
3356 28 : if (!UNIT_CGROUP_BOOL(u, io_accounting))
3357 28 : return -ENODATA;
3358 :
3359 0 : if (allow_cache && u->io_accounting_last[metric] != UINT64_MAX)
3360 0 : goto done;
3361 :
3362 0 : r = unit_get_io_accounting_raw(u, raw);
3363 0 : if (r == -ENODATA && u->io_accounting_last[metric] != UINT64_MAX)
3364 0 : goto done;
3365 0 : if (r < 0)
3366 0 : return r;
3367 :
3368 0 : for (CGroupIOAccountingMetric i = 0; i < _CGROUP_IO_ACCOUNTING_METRIC_MAX; i++) {
3369 : /* Saturated subtraction */
3370 0 : if (raw[i] > u->io_accounting_base[i])
3371 0 : u->io_accounting_last[i] = raw[i] - u->io_accounting_base[i];
3372 : else
3373 0 : u->io_accounting_last[i] = 0;
3374 : }
3375 :
3376 0 : done:
3377 0 : if (ret)
3378 0 : *ret = u->io_accounting_last[metric];
3379 :
3380 0 : return 0;
3381 : }
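/* A minimal usage sketch (hypothetical): pass allow_cache=false to force a fresh parse of io.stat,
 * or true to accept the last value computed for the metric:
 *
 *     uint64_t written;
 *     if (unit_get_io_accounting(u, CGROUP_IO_WRITE_BYTES, false, &written) >= 0)
 *         log_unit_debug(u, "IO bytes written: %" PRIu64, written);
 */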
3382 :
3383 6 : int unit_reset_cpu_accounting(Unit *u) {
3384 : int r;
3385 :
3386 6 : assert(u);
3387 :
3388 6 : u->cpu_usage_last = NSEC_INFINITY;
3389 :
3390 6 : r = unit_get_cpu_usage_raw(u, &u->cpu_usage_base);
3391 6 : if (r < 0) {
3392 6 : u->cpu_usage_base = 0;
3393 6 : return r;
3394 : }
3395 :
3396 0 : return 0;
3397 : }
3398 :
3399 6 : int unit_reset_ip_accounting(Unit *u) {
3400 6 : int r = 0, q = 0;
3401 :
3402 6 : assert(u);
3403 :
3404 6 : if (u->ip_accounting_ingress_map_fd >= 0)
3405 0 : r = bpf_firewall_reset_accounting(u->ip_accounting_ingress_map_fd);
3406 :
3407 6 : if (u->ip_accounting_egress_map_fd >= 0)
3408 0 : q = bpf_firewall_reset_accounting(u->ip_accounting_egress_map_fd);
3409 :
3410 6 : zero(u->ip_accounting_extra);
3411 :
3412 6 : return r < 0 ? r : q;
3413 : }
3414 :
3415 6 : int unit_reset_io_accounting(Unit *u) {
3416 : int r;
3417 :
3418 6 : assert(u);
3419 :
3420 30 : for (CGroupIOAccountingMetric i = 0; i < _CGROUP_IO_ACCOUNTING_METRIC_MAX; i++)
3421 24 : u->io_accounting_last[i] = UINT64_MAX;
3422 :
3423 6 : r = unit_get_io_accounting_raw(u, u->io_accounting_base);
3424 6 : if (r < 0) {
3425 6 : zero(u->io_accounting_base);
3426 6 : return r;
3427 : }
3428 :
3429 0 : return 0;
3430 : }
3431 :
3432 6 : int unit_reset_accounting(Unit *u) {
3433 : int r, q, v;
3434 :
3435 6 : assert(u);
3436 :
3437 6 : r = unit_reset_cpu_accounting(u);
3438 6 : q = unit_reset_io_accounting(u);
3439 6 : v = unit_reset_ip_accounting(u);
3440 :
3441 6 : return r < 0 ? r : q < 0 ? q : v;
3442 : }
3443 :
3444 0 : void unit_invalidate_cgroup(Unit *u, CGroupMask m) {
3445 0 : assert(u);
3446 :
3447 0 : if (!UNIT_HAS_CGROUP_CONTEXT(u))
3448 0 : return;
3449 :
3450 0 : if (m == 0)
3451 0 : return;
3452 :
3453 : /* always invalidate compat pairs together */
3454 0 : if (m & (CGROUP_MASK_IO | CGROUP_MASK_BLKIO))
3455 0 : m |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO;
3456 :
3457 0 : if (m & (CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT))
3458 0 : m |= CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT;
3459 :
3460 0 : if (FLAGS_SET(u->cgroup_invalidated_mask, m)) /* NOP? */
3461 0 : return;
3462 :
3463 0 : u->cgroup_invalidated_mask |= m;
3464 0 : unit_add_to_cgroup_realize_queue(u);
3465 : }
3466 :
3467 0 : void unit_invalidate_cgroup_bpf(Unit *u) {
3468 0 : assert(u);
3469 :
3470 0 : if (!UNIT_HAS_CGROUP_CONTEXT(u))
3471 0 : return;
3472 :
3473 0 : if (u->cgroup_invalidated_mask & CGROUP_MASK_BPF_FIREWALL) /* NOP? */
3474 0 : return;
3475 :
3476 0 : u->cgroup_invalidated_mask |= CGROUP_MASK_BPF_FIREWALL;
3477 0 : unit_add_to_cgroup_realize_queue(u);
3478 :
 3479            0 :         /* If we are a slice unit, we also need to compile a new BPF program for all our children, as the IP access
3480 : * list of our children includes our own. */
3481 0 : if (u->type == UNIT_SLICE) {
3482 : Unit *member;
3483 : Iterator i;
3484 : void *v;
3485 :
3486 0 : HASHMAP_FOREACH_KEY(v, member, u->dependencies[UNIT_BEFORE], i) {
3487 0 : if (UNIT_DEREF(member->slice) == u)
3488 0 : unit_invalidate_cgroup_bpf(member);
3489 : }
3490 : }
3491 : }
3492 :
3493 1775 : bool unit_cgroup_delegate(Unit *u) {
3494 : CGroupContext *c;
3495 :
3496 1775 : assert(u);
3497 :
3498 1775 : if (!UNIT_VTABLE(u)->can_delegate)
3499 1574 : return false;
3500 :
3501 201 : c = unit_get_cgroup_context(u);
3502 201 : if (!c)
3503 0 : return false;
3504 :
3505 201 : return c->delegate;
3506 : }
3507 :
3508 1 : void manager_invalidate_startup_units(Manager *m) {
3509 : Iterator i;
3510 : Unit *u;
3511 :
3512 1 : assert(m);
3513 :
3514 1 : SET_FOREACH(u, m->startup_units, i)
3515 0 : unit_invalidate_cgroup(u, CGROUP_MASK_CPU|CGROUP_MASK_IO|CGROUP_MASK_BLKIO);
3516 1 : }
3517 :
3518 102 : static int unit_get_nice(Unit *u) {
3519 : ExecContext *ec;
3520 :
3521 102 : ec = unit_get_exec_context(u);
3522 102 : return ec ? ec->nice : 0;
3523 : }
3524 :
3525 102 : static uint64_t unit_get_cpu_weight(Unit *u) {
3526 102 : ManagerState state = manager_state(u->manager);
3527 : CGroupContext *cc;
3528 :
3529 102 : cc = unit_get_cgroup_context(u);
3530 102 : return cc ? cgroup_context_cpu_weight(cc, state) : CGROUP_WEIGHT_DEFAULT;
3531 : }
3532 :
3533 122 : int compare_job_priority(const void *a, const void *b) {
3534 122 : const Job *x = a, *y = b;
3535 : int nice_x, nice_y;
3536 : uint64_t weight_x, weight_y;
3537 : int ret;
3538 :
3539 122 : if ((ret = CMP(x->unit->type, y->unit->type)) != 0)
3540 71 : return -ret;
3541 :
3542 51 : weight_x = unit_get_cpu_weight(x->unit);
3543 51 : weight_y = unit_get_cpu_weight(y->unit);
3544 :
3545 51 : if ((ret = CMP(weight_x, weight_y)) != 0)
3546 0 : return -ret;
3547 :
3548 51 : nice_x = unit_get_nice(x->unit);
3549 51 : nice_y = unit_get_nice(y->unit);
3550 :
3551 51 : if ((ret = CMP(nice_x, nice_y)) != 0)
3552 0 : return ret;
3553 :
3554 51 : return strcmp(x->unit->id, y->unit->id);
3555 : }
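/* Net effect of the comparisons above: jobs sort first by unit type (higher-numbered types first),
 * then with a higher CPU weight earlier, then with a lower nice value earlier, and finally by unit
 * id so that the resulting order is stable. */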
3556 :
3557 : static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
3558 : [CGROUP_AUTO] = "auto",
3559 : [CGROUP_CLOSED] = "closed",
3560 : [CGROUP_STRICT] = "strict",
3561 : };
3562 :
3563 192 : DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);
|