Branch data Line data Source code
1 : : /* SPDX-License-Identifier: LGPL-2.1+ */
2 : :
3 : : #include <fcntl.h>
4 : : #include <fnmatch.h>
5 : :
6 : : #include "sd-messages.h"
7 : :
8 : : #include "alloc-util.h"
9 : : #include "blockdev-util.h"
10 : : #include "bpf-devices.h"
11 : : #include "bpf-firewall.h"
12 : : #include "btrfs-util.h"
13 : : #include "bus-error.h"
14 : : #include "cgroup-util.h"
15 : : #include "cgroup.h"
16 : : #include "fd-util.h"
17 : : #include "fileio.h"
18 : : #include "fs-util.h"
19 : : #include "nulstr-util.h"
20 : : #include "parse-util.h"
21 : : #include "path-util.h"
22 : : #include "process-util.h"
23 : : #include "procfs-util.h"
24 : : #include "special.h"
25 : : #include "stat-util.h"
26 : : #include "stdio-util.h"
27 : : #include "string-table.h"
28 : : #include "string-util.h"
29 : : #include "virt.h"
30 : :
31 : : #define CGROUP_CPU_QUOTA_DEFAULT_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)
32 : :
33 : : /* Returns the log level to use when cgroup attribute writes fail. When an attribute is missing or we have access
34 : : * problems, we downgrade to LOG_DEBUG. This is supposed to be nice to container managers and kernels that want to mask
35 : : * out specific attributes from us. */
36 : : #define LOG_LEVEL_CGROUP_WRITE(r) (IN_SET(abs(r), ENOENT, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING)
37 : :
38 : 116 : bool manager_owns_host_root_cgroup(Manager *m) {
39 [ - + ]: 116 : assert(m);
40 : :
41 : : /* Returns true if we are managing the root cgroup. Note that it isn't sufficient to just check whether the
42 : : * group root path equals "/" since that will also be the case if CLONE_NEWCGROUP is in the mix. Since there
43 : : * appears to be no nice way to detect whether we are in a CLONE_NEWCGROUP namespace, we instead just check if
44 : : * we run in any kind of container virtualization. */
45 : :
46 [ + - ]: 116 : if (MANAGER_IS_USER(m))
47 : 116 : return false;
48 : :
49 [ # # ]: 0 : if (detect_container() > 0)
50 : 0 : return false;
51 : :
52 : 0 : return empty_or_root(m->cgroup_root);
53 : : }
54 : :
55 : 72 : bool unit_has_host_root_cgroup(Unit *u) {
56 [ - + ]: 72 : assert(u);
57 : :
58 : : /* Returns whether this unit manages the root cgroup. This will return true if this unit is the root slice and
59 : : * the manager manages the root cgroup. */
60 : :
61 [ + - ]: 72 : if (!manager_owns_host_root_cgroup(u->manager))
62 : 72 : return false;
63 : :
64 : 0 : return unit_has_name(u, SPECIAL_ROOT_SLICE);
65 : : }
66 : :
67 : 0 : static int set_attribute_and_warn(Unit *u, const char *controller, const char *attribute, const char *value) {
68 : : int r;
69 : :
70 : 0 : r = cg_set_attribute(controller, u->cgroup_path, attribute, value);
71 [ # # ]: 0 : if (r < 0)
72 [ # # # # : 0 : log_unit_full(u, LOG_LEVEL_CGROUP_WRITE(r), r, "Failed to set '%s' attribute on '%s' to '%.*s': %m",
# # # # #
# # # #
# ]
73 : : strna(attribute), isempty(u->cgroup_path) ? "/" : u->cgroup_path, (int) strcspn(value, NEWLINE), value);
74 : :
75 : 0 : return r;
76 : : }
77 : :
78 : 0 : static void cgroup_compat_warn(void) {
79 : : static bool cgroup_compat_warned = false;
80 : :
81 [ # # ]: 0 : if (cgroup_compat_warned)
82 : 0 : return;
83 : :
84 [ # # ]: 0 : log_warning("cgroup compatibility translation between legacy and unified hierarchy settings activated. "
85 : : "See cgroup-compat debug messages for details.");
86 : :
87 : 0 : cgroup_compat_warned = true;
88 : : }
89 : :
90 : : #define log_cgroup_compat(unit, fmt, ...) do { \
91 : : cgroup_compat_warn(); \
92 : : log_unit_debug(unit, "cgroup-compat: " fmt, ##__VA_ARGS__); \
93 : : } while (false)
94 : :
95 : 2356 : void cgroup_context_init(CGroupContext *c) {
96 [ - + ]: 2356 : assert(c);
97 : :
98 : : /* Initialize everything to the kernel defaults. */
99 : :
100 : 2356 : *c = (CGroupContext) {
101 : : .cpu_weight = CGROUP_WEIGHT_INVALID,
102 : : .startup_cpu_weight = CGROUP_WEIGHT_INVALID,
103 : : .cpu_quota_per_sec_usec = USEC_INFINITY,
104 : : .cpu_quota_period_usec = USEC_INFINITY,
105 : :
106 : : .cpu_shares = CGROUP_CPU_SHARES_INVALID,
107 : : .startup_cpu_shares = CGROUP_CPU_SHARES_INVALID,
108 : :
109 : : .memory_high = CGROUP_LIMIT_MAX,
110 : : .memory_max = CGROUP_LIMIT_MAX,
111 : : .memory_swap_max = CGROUP_LIMIT_MAX,
112 : :
113 : : .memory_limit = CGROUP_LIMIT_MAX,
114 : :
115 : : .io_weight = CGROUP_WEIGHT_INVALID,
116 : : .startup_io_weight = CGROUP_WEIGHT_INVALID,
117 : :
118 : : .blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID,
119 : : .startup_blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID,
120 : :
121 : : .tasks_max = CGROUP_LIMIT_MAX,
122 : : };
123 : 2356 : }
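/* Usage sketch (illustrative only, not part of this file): a freshly
 * initialized context carries nothing but kernel defaults and can be
 * torn down again safely, since all list members start out NULL. */
#if 0   /* example, not compiled */
static void example_context_lifecycle(void) {
        CGroupContext c;

        cgroup_context_init(&c);
        assert(c.tasks_max == CGROUP_LIMIT_MAX);             /* i.e. unlimited */
        assert(c.cpu_quota_per_sec_usec == USEC_INFINITY);   /* i.e. no quota */
        cgroup_context_done(&c);                             /* frees nothing here */
}
#endif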
124 : :
125 : 0 : void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
126 [ # # ]: 0 : assert(c);
127 [ # # ]: 0 : assert(a);
128 : :
129 [ # # # # : 0 : LIST_REMOVE(device_allow, c->device_allow, a);
# # # # ]
130 : 0 : free(a->path);
131 : 0 : free(a);
132 : 0 : }
133 : :
134 : 0 : void cgroup_context_free_io_device_weight(CGroupContext *c, CGroupIODeviceWeight *w) {
135 [ # # ]: 0 : assert(c);
136 [ # # ]: 0 : assert(w);
137 : :
138 [ # # # # : 0 : LIST_REMOVE(device_weights, c->io_device_weights, w);
# # # # ]
139 : 0 : free(w->path);
140 : 0 : free(w);
141 : 0 : }
142 : :
143 : 0 : void cgroup_context_free_io_device_latency(CGroupContext *c, CGroupIODeviceLatency *l) {
144 [ # # ]: 0 : assert(c);
145 [ # # ]: 0 : assert(l);
146 : :
147 [ # # # # : 0 : LIST_REMOVE(device_latencies, c->io_device_latencies, l);
# # # # ]
148 : 0 : free(l->path);
149 : 0 : free(l);
150 : 0 : }
151 : :
152 : 0 : void cgroup_context_free_io_device_limit(CGroupContext *c, CGroupIODeviceLimit *l) {
153 [ # # ]: 0 : assert(c);
154 [ # # ]: 0 : assert(l);
155 : :
156 [ # # # # : 0 : LIST_REMOVE(device_limits, c->io_device_limits, l);
# # # # ]
157 : 0 : free(l->path);
158 : 0 : free(l);
159 : 0 : }
160 : :
161 : 0 : void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
162 [ # # ]: 0 : assert(c);
163 [ # # ]: 0 : assert(w);
164 : :
165 [ # # # # : 0 : LIST_REMOVE(device_weights, c->blockio_device_weights, w);
# # # # ]
166 : 0 : free(w->path);
167 : 0 : free(w);
168 : 0 : }
169 : :
170 : 0 : void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
171 [ # # ]: 0 : assert(c);
172 [ # # ]: 0 : assert(b);
173 : :
174 [ # # # # : 0 : LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
# # # # ]
175 : 0 : free(b->path);
176 : 0 : free(b);
177 : 0 : }
178 : :
179 : 2356 : void cgroup_context_done(CGroupContext *c) {
180 [ - + ]: 2356 : assert(c);
181 : :
182 [ - + ]: 2356 : while (c->io_device_weights)
183 : 0 : cgroup_context_free_io_device_weight(c, c->io_device_weights);
184 : :
185 [ - + ]: 2356 : while (c->io_device_latencies)
186 : 0 : cgroup_context_free_io_device_latency(c, c->io_device_latencies);
187 : :
188 [ - + ]: 2356 : while (c->io_device_limits)
189 : 0 : cgroup_context_free_io_device_limit(c, c->io_device_limits);
190 : :
191 [ - + ]: 2356 : while (c->blockio_device_weights)
192 : 0 : cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);
193 : :
194 [ - + ]: 2356 : while (c->blockio_device_bandwidths)
195 : 0 : cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);
196 : :
197 [ - + ]: 2356 : while (c->device_allow)
198 : 0 : cgroup_context_free_device_allow(c, c->device_allow);
199 : :
200 : 2356 : c->ip_address_allow = ip_address_access_free_all(c->ip_address_allow);
201 : 2356 : c->ip_address_deny = ip_address_access_free_all(c->ip_address_deny);
202 : :
203 : 2356 : c->ip_filters_ingress = strv_free(c->ip_filters_ingress);
204 : 2356 : c->ip_filters_egress = strv_free(c->ip_filters_egress);
205 : 2356 : }
206 : :
207 : 728 : void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
208 : 728 : _cleanup_free_ char *disable_controllers_str = NULL;
209 : : CGroupIODeviceLimit *il;
210 : : CGroupIODeviceWeight *iw;
211 : : CGroupIODeviceLatency *l;
212 : : CGroupBlockIODeviceBandwidth *b;
213 : : CGroupBlockIODeviceWeight *w;
214 : : CGroupDeviceAllow *a;
215 : : IPAddressAccessItem *iaai;
216 : : char **path;
217 : : char u[FORMAT_TIMESPAN_MAX];
218 : : char v[FORMAT_TIMESPAN_MAX];
219 : :
220 [ - + ]: 728 : assert(c);
221 [ - + ]: 728 : assert(f);
222 : :
223 : 728 : prefix = strempty(prefix);
224 : :
225 : 728 : (void) cg_mask_to_string(c->disable_controllers, &disable_controllers_str);
226 : :
227 : 1456 : fprintf(f,
228 : : "%sCPUAccounting=%s\n"
229 : : "%sIOAccounting=%s\n"
230 : : "%sBlockIOAccounting=%s\n"
231 : : "%sMemoryAccounting=%s\n"
232 : : "%sTasksAccounting=%s\n"
233 : : "%sIPAccounting=%s\n"
234 : : "%sCPUWeight=%" PRIu64 "\n"
235 : : "%sStartupCPUWeight=%" PRIu64 "\n"
236 : : "%sCPUShares=%" PRIu64 "\n"
237 : : "%sStartupCPUShares=%" PRIu64 "\n"
238 : : "%sCPUQuotaPerSecSec=%s\n"
239 : : "%sCPUQuotaPeriodSec=%s\n"
240 : : "%sIOWeight=%" PRIu64 "\n"
241 : : "%sStartupIOWeight=%" PRIu64 "\n"
242 : : "%sBlockIOWeight=%" PRIu64 "\n"
243 : : "%sStartupBlockIOWeight=%" PRIu64 "\n"
244 : : "%sDefaultMemoryMin=%" PRIu64 "\n"
245 : : "%sDefaultMemoryLow=%" PRIu64 "\n"
246 : : "%sMemoryMin=%" PRIu64 "\n"
247 : : "%sMemoryLow=%" PRIu64 "\n"
248 : : "%sMemoryHigh=%" PRIu64 "\n"
249 : : "%sMemoryMax=%" PRIu64 "\n"
250 : : "%sMemorySwapMax=%" PRIu64 "\n"
251 : : "%sMemoryLimit=%" PRIu64 "\n"
252 : : "%sTasksMax=%" PRIu64 "\n"
253 : : "%sDevicePolicy=%s\n"
254 : : "%sDisableControllers=%s\n"
255 : : "%sDelegate=%s\n",
256 : 728 : prefix, yes_no(c->cpu_accounting),
257 : 728 : prefix, yes_no(c->io_accounting),
258 : 728 : prefix, yes_no(c->blockio_accounting),
259 : 728 : prefix, yes_no(c->memory_accounting),
260 : 728 : prefix, yes_no(c->tasks_accounting),
261 : 728 : prefix, yes_no(c->ip_accounting),
262 : : prefix, c->cpu_weight,
263 : : prefix, c->startup_cpu_weight,
264 : : prefix, c->cpu_shares,
265 : : prefix, c->startup_cpu_shares,
266 : : prefix, format_timespan(u, sizeof(u), c->cpu_quota_per_sec_usec, 1),
267 : : prefix, format_timespan(v, sizeof(v), c->cpu_quota_period_usec, 1),
268 : : prefix, c->io_weight,
269 : : prefix, c->startup_io_weight,
270 : : prefix, c->blockio_weight,
271 : : prefix, c->startup_blockio_weight,
272 : : prefix, c->default_memory_min,
273 : : prefix, c->default_memory_low,
274 : : prefix, c->memory_min,
275 : : prefix, c->memory_low,
276 : : prefix, c->memory_high,
277 : : prefix, c->memory_max,
278 : : prefix, c->memory_swap_max,
279 : : prefix, c->memory_limit,
280 : : prefix, c->tasks_max,
281 : : prefix, cgroup_device_policy_to_string(c->device_policy),
282 : : prefix, strempty(disable_controllers_str),
283 : 728 : prefix, yes_no(c->delegate));
284 : :
285 [ - + ]: 728 : if (c->delegate) {
286 : 0 : _cleanup_free_ char *t = NULL;
287 : :
288 : 0 : (void) cg_mask_to_string(c->delegate_controllers, &t);
289 : :
290 : 0 : fprintf(f, "%sDelegateControllers=%s\n",
291 : : prefix,
292 : : strempty(t));
293 : : }
294 : :
295 [ - + ]: 728 : LIST_FOREACH(device_allow, a, c->device_allow)
296 : 0 : fprintf(f,
297 : : "%sDeviceAllow=%s %s%s%s\n",
298 : : prefix,
299 : : a->path,
300 [ # # # # : 0 : a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");
# # ]
301 : :
302 [ - + ]: 728 : LIST_FOREACH(device_weights, iw, c->io_device_weights)
303 : 0 : fprintf(f,
304 : : "%sIODeviceWeight=%s %" PRIu64 "\n",
305 : : prefix,
306 : : iw->path,
307 : : iw->weight);
308 : :
309 [ - + ]: 728 : LIST_FOREACH(device_latencies, l, c->io_device_latencies)
310 : 0 : fprintf(f,
311 : : "%sIODeviceLatencyTargetSec=%s %s\n",
312 : : prefix,
313 : : l->path,
314 : : format_timespan(u, sizeof(u), l->target_usec, 1));
315 : :
316 [ - + ]: 728 : LIST_FOREACH(device_limits, il, c->io_device_limits) {
317 : : char buf[FORMAT_BYTES_MAX];
318 : : CGroupIOLimitType type;
319 : :
320 [ # # ]: 0 : for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
321 [ # # ]: 0 : if (il->limits[type] != cgroup_io_limit_defaults[type])
322 : 0 : fprintf(f,
323 : : "%s%s=%s %s\n",
324 : : prefix,
325 : : cgroup_io_limit_type_to_string(type),
326 : : il->path,
327 : : format_bytes(buf, sizeof(buf), il->limits[type]));
328 : : }
329 : :
330 [ - + ]: 728 : LIST_FOREACH(device_weights, w, c->blockio_device_weights)
331 : 0 : fprintf(f,
332 : : "%sBlockIODeviceWeight=%s %" PRIu64,
333 : : prefix,
334 : : w->path,
335 : : w->weight);
336 : :
337 [ - + ]: 728 : LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
338 : : char buf[FORMAT_BYTES_MAX];
339 : :
340 [ # # ]: 0 : if (b->rbps != CGROUP_LIMIT_MAX)
341 : 0 : fprintf(f,
342 : : "%sBlockIOReadBandwidth=%s %s\n",
343 : : prefix,
344 : : b->path,
345 : : format_bytes(buf, sizeof(buf), b->rbps));
346 [ # # ]: 0 : if (b->wbps != CGROUP_LIMIT_MAX)
347 : 0 : fprintf(f,
348 : : "%sBlockIOWriteBandwidth=%s %s\n",
349 : : prefix,
350 : : b->path,
351 : : format_bytes(buf, sizeof(buf), b->wbps));
352 : : }
353 : :
354 [ - + ]: 728 : LIST_FOREACH(items, iaai, c->ip_address_allow) {
355 : 0 : _cleanup_free_ char *k = NULL;
356 : :
357 : 0 : (void) in_addr_to_string(iaai->family, &iaai->address, &k);
358 : 0 : fprintf(f, "%sIPAddressAllow=%s/%u\n", prefix, strnull(k), iaai->prefixlen);
359 : : }
360 : :
361 [ - + ]: 728 : LIST_FOREACH(items, iaai, c->ip_address_deny) {
362 : 0 : _cleanup_free_ char *k = NULL;
363 : :
364 : 0 : (void) in_addr_to_string(iaai->family, &iaai->address, &k);
365 : 0 : fprintf(f, "%sIPAddressDeny=%s/%u\n", prefix, strnull(k), iaai->prefixlen);
366 : : }
367 : :
368 [ - + # # ]: 728 : STRV_FOREACH(path, c->ip_filters_ingress)
369 : 0 : fprintf(f, "%sIPIngressFilterPath=%s\n", prefix, *path);
370 : :
371 [ - + # # ]: 728 : STRV_FOREACH(path, c->ip_filters_egress)
372 : 0 : fprintf(f, "%sIPEgressFilterPath=%s\n", prefix, *path);
373 : 728 : }
374 : :
375 : 0 : int cgroup_add_device_allow(CGroupContext *c, const char *dev, const char *mode) {
376 : 0 : _cleanup_free_ CGroupDeviceAllow *a = NULL;
377 : 0 : _cleanup_free_ char *d = NULL;
378 : :
379 [ # # ]: 0 : assert(c);
380 [ # # ]: 0 : assert(dev);
381 [ # # # # ]: 0 : assert(isempty(mode) || in_charset(mode, "rwm"));
382 : :
383 : 0 : a = new(CGroupDeviceAllow, 1);
384 [ # # ]: 0 : if (!a)
385 : 0 : return -ENOMEM;
386 : :
387 : 0 : d = strdup(dev);
388 [ # # ]: 0 : if (!d)
389 : 0 : return -ENOMEM;
390 : :
391 : 0 : *a = (CGroupDeviceAllow) {
392 : 0 : .path = TAKE_PTR(d),
393 [ # # # # ]: 0 : .r = isempty(mode) || strchr(mode, 'r'),
394 [ # # # # ]: 0 : .w = isempty(mode) || strchr(mode, 'w'),
395 [ # # # # ]: 0 : .m = isempty(mode) || strchr(mode, 'm'),
396 : : };
397 : :
398 [ # # # # ]: 0 : LIST_PREPEND(device_allow, c->device_allow, a);
399 : 0 : TAKE_PTR(a);
400 : :
401 : 0 : return 0;
402 : : }
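/* Usage sketch (illustrative): this is roughly how a unit file line such as
 * "DeviceAllow=/dev/null rw" lands in the context; an empty mode string is
 * treated as "rwm". */
#if 0   /* example, not compiled */
static int example_device_allow(CGroupContext *c) {
        return cgroup_add_device_allow(c, "/dev/null", "rw");
}
#endif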
403 : :
404 : : #define UNIT_DEFINE_ANCESTOR_MEMORY_LOOKUP(entry) \
405 : : uint64_t unit_get_ancestor_##entry(Unit *u) { \
406 : : CGroupContext *c; \
407 : : \
408 : : /* 1. Is entry set in this unit? If so, use that. \
409 : : * 2. Is the default for this entry set in any \
410 : : * ancestor? If so, use that. \
411 : : * 3. Otherwise, return CGROUP_LIMIT_MIN. */ \
412 : : \
413 : : assert(u); \
414 : : \
415 : : c = unit_get_cgroup_context(u); \
416 : : if (c && c->entry##_set) \
417 : : return c->entry; \
418 : : \
419 : : while ((u = UNIT_DEREF(u->slice))) { \
420 : : c = unit_get_cgroup_context(u); \
421 : : if (c && c->default_##entry##_set) \
422 : : return c->default_##entry; \
423 : : } \
424 : : \
425 : : /* We've reached the root, but nobody had default for \
426 : : * this entry set, so set it to the kernel default. */ \
427 : : return CGROUP_LIMIT_MIN; \
428 : : }
429 : :
430 [ - + + - : 544 : UNIT_DEFINE_ANCESTOR_MEMORY_LOOKUP(memory_low);
+ + + - +
+ + + ]
431 [ # # # # : 0 : UNIT_DEFINE_ANCESTOR_MEMORY_LOOKUP(memory_min);
# # # # #
# # # ]
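/* Resolution sketch (hypothetical hierarchy, for illustration): given
 * a.slice/b.slice/foo.service where only a.slice sets DefaultMemoryLow=,
 * unit_get_ancestor_memory_low(foo.service) resolves as follows:
 *
 *   foo.service: memory_low_set == false          -> walk up the slices
 *   b.slice:     default_memory_low_set == false  -> keep walking
 *   a.slice:     default_memory_low_set == true   -> return its default
 *
 * If no ancestor sets a default either, CGROUP_LIMIT_MIN (the kernel
 * default) is returned. */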
432 : :
433 : 24 : static void cgroup_xattr_apply(Unit *u) {
434 : : char ids[SD_ID128_STRING_MAX];
435 : : int r;
436 : :
437 [ - + ]: 24 : assert(u);
438 : :
439 [ + - ]: 24 : if (!MANAGER_IS_SYSTEM(u->manager))
440 : 24 : return;
441 : :
442 [ # # ]: 0 : if (sd_id128_is_null(u->invocation_id))
443 : 0 : return;
444 : :
445 : 0 : r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path,
446 : : "trusted.invocation_id",
447 : 0 : sd_id128_to_string(u->invocation_id, ids), 32,
448 : : 0);
449 [ # # ]: 0 : if (r < 0)
450 [ # # ]: 0 : log_unit_debug_errno(u, r, "Failed to set invocation ID on control group %s, ignoring: %m", u->cgroup_path);
451 : : }
452 : :
453 : 0 : static int lookup_block_device(const char *p, dev_t *ret) {
454 : 0 : dev_t rdev, dev = 0;
455 : : mode_t mode;
456 : : int r;
457 : :
458 [ # # ]: 0 : assert(p);
459 [ # # ]: 0 : assert(ret);
460 : :
461 : 0 : r = device_path_parse_major_minor(p, &mode, &rdev);
462 [ # # ]: 0 : if (r == -ENODEV) { /* not a parsable device node, need to go to disk */
463 : : struct stat st;
464 [ # # ]: 0 : if (stat(p, &st) < 0)
465 [ # # ]: 0 : return log_warning_errno(errno, "Couldn't stat device '%s': %m", p);
466 : 0 : rdev = (dev_t)st.st_rdev;
467 : 0 : dev = (dev_t)st.st_dev;
468 : 0 : mode = st.st_mode;
469 [ # # ]: 0 : } else if (r < 0)
470 [ # # ]: 0 : return log_warning_errno(r, "Failed to parse major/minor from path '%s': %m", p);
471 : :
472 [ # # ]: 0 : if (S_ISCHR(mode)) {
473 [ # # ]: 0 : log_warning("Device node '%s' is a character device, but block device needed.", p);
474 : 0 : return -ENOTBLK;
475 [ # # ]: 0 : } else if (S_ISBLK(mode))
476 : 0 : *ret = rdev;
477 [ # # ]: 0 : else if (major(dev) != 0)
478 : 0 : *ret = dev; /* If this is not a device node then use the block device this file is stored on */
479 : : else {
480 : : /* If this is btrfs, getting the backing block device is a bit harder */
481 : 0 : r = btrfs_get_block_device(p, ret);
482 [ # # # # ]: 0 : if (r < 0 && r != -ENOTTY)
483 [ # # ]: 0 : return log_warning_errno(r, "Failed to determine block device backing btrfs file system '%s': %m", p);
484 [ # # ]: 0 : if (r == -ENOTTY) {
485 [ # # ]: 0 : log_warning("'%s' is not a block device node, and file system block device cannot be determined or is not local.", p);
486 : 0 : return -ENODEV;
487 : : }
488 : : }
489 : :
490 : : /* If this is a LUKS device, try to get the originating block device */
491 : 0 : (void) block_get_originating(*ret, ret);
492 : :
493 : : /* If this is a partition, try to get the originating block device */
494 : 0 : (void) block_get_whole_disk(*ret, ret);
495 : 0 : return 0;
496 : : }
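/* Usage sketch (illustrative): resolving the block device backing an
 * arbitrary path before formatting a "major:minor" attribute value. For
 * btrfs the backing device is queried via the btrfs helper, and for LUKS
 * devices and partitions the originating/whole disk is substituted. */
#if 0   /* example, not compiled */
static void example_lookup(void) {
        dev_t dev;

        if (lookup_block_device("/var/lib/example", &dev) >= 0)
                printf("%u:%u\n", major(dev), minor(dev));
}
#endif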
497 : :
498 : 0 : static int whitelist_device(BPFProgram *prog, const char *path, const char *node, const char *acc) {
499 : : dev_t rdev;
500 : : mode_t mode;
501 : : int r;
502 : :
503 [ # # ]: 0 : assert(path);
504 [ # # ]: 0 : assert(acc);
505 : :
506 : : /* Some special handling for /dev/block/%u:%u, /dev/char/%u:%u, /run/systemd/inaccessible/chr and
507 : : * /run/systemd/inaccessible/blk paths. Instead of stat()ing these we parse out the major/minor directly. This
508 : : * means clients can use these paths without the device node actually being around. */
509 : 0 : r = device_path_parse_major_minor(node, &mode, &rdev);
510 [ # # ]: 0 : if (r < 0) {
511 [ # # ]: 0 : if (r != -ENODEV)
512 [ # # ]: 0 : return log_warning_errno(r, "Couldn't parse major/minor from device path '%s': %m", node);
513 : :
514 : : struct stat st;
515 [ # # ]: 0 : if (stat(node, &st) < 0)
516 [ # # ]: 0 : return log_warning_errno(errno, "Couldn't stat device %s: %m", node);
517 : :
518 [ # # # # ]: 0 : if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
519 [ # # ]: 0 : log_warning("%s is not a device.", node);
520 : 0 : return -ENODEV;
521 : : }
522 : 0 : rdev = (dev_t) st.st_rdev;
523 : 0 : mode = st.st_mode;
524 : : }
525 : :
526 [ # # ]: 0 : if (cg_all_unified() > 0) {
527 [ # # ]: 0 : if (!prog)
528 : 0 : return 0;
529 : :
530 : 0 : return cgroup_bpf_whitelist_device(prog, S_ISCHR(mode) ? BPF_DEVCG_DEV_CHAR : BPF_DEVCG_DEV_BLOCK,
531 [ # # ]: 0 : major(rdev), minor(rdev), acc);
532 : :
533 : : } else {
534 : : char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
535 : :
536 : 0 : sprintf(buf,
537 : : "%c %u:%u %s",
538 [ # # ]: 0 : S_ISCHR(mode) ? 'c' : 'b',
539 : : major(rdev), minor(rdev),
540 : : acc);
541 : :
542 : : /* Changing the devices list of a populated cgroup might result in EINVAL, hence ignore EINVAL here. */
543 : :
544 : 0 : r = cg_set_attribute("devices", path, "devices.allow", buf);
545 [ # # ]: 0 : if (r < 0)
546 [ # # # # : 0 : return log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING,
# # ]
547 : : r, "Failed to set devices.allow on %s: %m", path);
548 : :
549 : 0 : return 0;
550 : : }
551 : : }
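/* On cgroup v1 the attribute write above is equivalent to (illustration):
 *
 *   echo "c 1:3 rwm" > /sys/fs/cgroup/devices/<path>/devices.allow
 *
 * i.e. permit read, write and mknod on character device 1:3 (/dev/null).
 * On cgroup v2 the same rule is compiled into the BPF device program
 * instead of being written to an attribute. */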
552 : :
553 : 0 : static int whitelist_major(BPFProgram *prog, const char *path, const char *name, char type, const char *acc) {
554 : 0 : _cleanup_fclose_ FILE *f = NULL;
555 : : char buf[2+DECIMAL_STR_MAX(unsigned)+3+4];
556 : 0 : bool good = false;
557 : : unsigned maj;
558 : : int r;
559 : :
560 [ # # ]: 0 : assert(path);
561 [ # # ]: 0 : assert(acc);
562 [ # # # # ]: 0 : assert(IN_SET(type, 'b', 'c'));
563 : :
564 [ # # ]: 0 : if (streq(name, "*")) {
565 : : /* If the name is a wildcard, then apply this list to all devices of this type */
566 : :
567 [ # # ]: 0 : if (cg_all_unified() > 0) {
568 [ # # ]: 0 : if (!prog)
569 : 0 : return 0;
570 : :
571 [ # # ]: 0 : (void) cgroup_bpf_whitelist_class(prog, type == 'c' ? BPF_DEVCG_DEV_CHAR : BPF_DEVCG_DEV_BLOCK, acc);
572 : : } else {
573 [ # # ]: 0 : xsprintf(buf, "%c *:* %s", type, acc);
574 : :
575 : 0 : r = cg_set_attribute("devices", path, "devices.allow", buf);
576 [ # # ]: 0 : if (r < 0)
577 [ # # # # : 0 : log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
# # ]
578 : : "Failed to set devices.allow on %s: %m", path);
579 : 0 : return 0;
580 : : }
581 : : }
582 : :
583 [ # # # # : 0 : if (safe_atou(name, &maj) >= 0 && DEVICE_MAJOR_VALID(maj)) {
# # # # ]
584 : : /* The name is numeric and suitable as a major number. In that case, let's take it as the major and create the entry
585 : : * directly */
586 : :
587 [ # # ]: 0 : if (cg_all_unified() > 0) {
588 [ # # ]: 0 : if (!prog)
589 : 0 : return 0;
590 : :
591 [ # # ]: 0 : (void) cgroup_bpf_whitelist_major(prog,
592 : : type == 'c' ? BPF_DEVCG_DEV_CHAR : BPF_DEVCG_DEV_BLOCK,
593 : : maj, acc);
594 : : } else {
595 [ # # ]: 0 : xsprintf(buf, "%c %u:* %s", type, maj, acc);
596 : :
597 : 0 : r = cg_set_attribute("devices", path, "devices.allow", buf);
598 [ # # ]: 0 : if (r < 0)
599 [ # # # # : 0 : log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
# # ]
600 : : "Failed to set devices.allow on %s: %m", path);
601 : : }
602 : :
603 : 0 : return 0;
604 : : }
605 : :
606 : 0 : f = fopen("/proc/devices", "re");
607 [ # # ]: 0 : if (!f)
608 [ # # ]: 0 : return log_warning_errno(errno, "Cannot open /proc/devices to resolve %s (%c): %m", name, type);
609 : :
610 : 0 : for (;;) {
611 [ # # # # ]: 0 : _cleanup_free_ char *line = NULL;
612 : : char *w, *p;
613 : :
614 : 0 : r = read_line(f, LONG_LINE_MAX, &line);
615 [ # # ]: 0 : if (r < 0)
616 [ # # ]: 0 : return log_warning_errno(r, "Failed to read /proc/devices: %m");
617 [ # # ]: 0 : if (r == 0)
618 : 0 : break;
619 : :
620 [ # # # # ]: 0 : if (type == 'c' && streq(line, "Character devices:")) {
621 : 0 : good = true;
622 : 0 : continue;
623 : : }
624 : :
625 [ # # # # ]: 0 : if (type == 'b' && streq(line, "Block devices:")) {
626 : 0 : good = true;
627 : 0 : continue;
628 : : }
629 : :
630 [ # # ]: 0 : if (isempty(line)) {
631 : 0 : good = false;
632 : 0 : continue;
633 : : }
634 : :
635 [ # # ]: 0 : if (!good)
636 : 0 : continue;
637 : :
638 : 0 : p = strstrip(line);
639 : :
640 : 0 : w = strpbrk(p, WHITESPACE);
641 [ # # ]: 0 : if (!w)
642 : 0 : continue;
643 : 0 : *w = 0;
644 : :
645 : 0 : r = safe_atou(p, &maj);
646 [ # # ]: 0 : if (r < 0)
647 : 0 : continue;
648 [ # # ]: 0 : if (maj <= 0)
649 : 0 : continue;
650 : :
651 : 0 : w++;
652 : 0 : w += strspn(w, WHITESPACE);
653 : :
654 [ # # ]: 0 : if (fnmatch(name, w, 0) != 0)
655 : 0 : continue;
656 : :
657 [ # # ]: 0 : if (cg_all_unified() > 0) {
658 [ # # ]: 0 : if (!prog)
659 : 0 : continue;
660 : :
661 [ # # ]: 0 : (void) cgroup_bpf_whitelist_major(prog,
662 : : type == 'c' ? BPF_DEVCG_DEV_CHAR : BPF_DEVCG_DEV_BLOCK,
663 : : maj, acc);
664 : : } else {
665 : 0 : sprintf(buf,
666 : : "%c %u:* %s",
667 : : type,
668 : : maj,
669 : : acc);
670 : :
671 : : /* Changing the devices list of a populated cgroup might result in EINVAL, hence ignore EINVAL
672 : : * here. */
673 : :
674 : 0 : r = cg_set_attribute("devices", path, "devices.allow", buf);
675 [ # # ]: 0 : if (r < 0)
676 [ # # # # : 0 : log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING,
# # ]
677 : : r, "Failed to set devices.allow on %s: %m", path);
678 : : }
679 : : }
680 : :
681 : 0 : return 0;
682 : : }
683 : :
684 : 2316 : static bool cgroup_context_has_cpu_weight(CGroupContext *c) {
685 [ + - ]: 4632 : return c->cpu_weight != CGROUP_WEIGHT_INVALID ||
686 [ - + ]: 2316 : c->startup_cpu_weight != CGROUP_WEIGHT_INVALID;
687 : : }
688 : :
689 : 2316 : static bool cgroup_context_has_cpu_shares(CGroupContext *c) {
690 [ + + ]: 4612 : return c->cpu_shares != CGROUP_CPU_SHARES_INVALID ||
691 [ - + ]: 2296 : c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID;
692 : : }
693 : :
694 : 122 : static uint64_t cgroup_context_cpu_weight(CGroupContext *c, ManagerState state) {
695 [ + + + + ]: 122 : if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
696 [ - + ]: 54 : c->startup_cpu_weight != CGROUP_WEIGHT_INVALID)
697 : 0 : return c->startup_cpu_weight;
698 [ - + ]: 122 : else if (c->cpu_weight != CGROUP_WEIGHT_INVALID)
699 : 0 : return c->cpu_weight;
700 : : else
701 : 122 : return CGROUP_WEIGHT_DEFAULT;
702 : : }
703 : :
704 : 0 : static uint64_t cgroup_context_cpu_shares(CGroupContext *c, ManagerState state) {
705 [ # # # # ]: 0 : if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
706 [ # # ]: 0 : c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID)
707 : 0 : return c->startup_cpu_shares;
708 [ # # ]: 0 : else if (c->cpu_shares != CGROUP_CPU_SHARES_INVALID)
709 : 0 : return c->cpu_shares;
710 : : else
711 : 0 : return CGROUP_CPU_SHARES_DEFAULT;
712 : : }
713 : :
714 : 48 : usec_t cgroup_cpu_adjust_period(usec_t period, usec_t quota, usec_t resolution, usec_t max_period) {
715 : : /* The kernel uses a minimum resolution of 1ms, so both the period and the effective quota
716 : : * (quota * period / USEC_PER_SEC) need to be at least that boundary. The quota is specified in
717 : : * usec per sec. Additionally, the period must be at most max_period. */
718 [ - + ]: 48 : assert(quota > 0);
719 : :
720 : 48 : return MIN(MAX3(period, resolution, resolution * USEC_PER_SEC / quota), max_period);
721 : : }
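/* Worked example (assumed values, for illustration): with quota = 50ms of
 * CPU time per second (50000 us), resolution = 1ms and max_period = 1s,
 * the lower bound on the period is
 *
 *   resolution * USEC_PER_SEC / quota = 1000 * 1000000 / 50000 = 20ms,
 *
 * which keeps the effective quota (quota * period / USEC_PER_SEC) at or
 * above the kernel's 1ms resolution. A requested 10ms period is thus
 * raised to 20ms, and a requested 2s period is capped at max_period. */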
722 : :
723 : 0 : static usec_t cgroup_cpu_adjust_period_and_log(Unit *u, usec_t period, usec_t quota) {
724 : : usec_t new_period;
725 : :
726 [ # # ]: 0 : if (quota == USEC_INFINITY)
727 : : /* Always use default period for infinity quota. */
728 : 0 : return CGROUP_CPU_QUOTA_DEFAULT_PERIOD_USEC;
729 : :
730 [ # # ]: 0 : if (period == USEC_INFINITY)
731 : : /* Default period was requested. */
732 : 0 : period = CGROUP_CPU_QUOTA_DEFAULT_PERIOD_USEC;
733 : :
734 : : /* Clamp to interval [1ms, 1s] */
735 : 0 : new_period = cgroup_cpu_adjust_period(period, quota, USEC_PER_MSEC, USEC_PER_SEC);
736 : :
737 [ # # ]: 0 : if (new_period != period) {
738 : : char v[FORMAT_TIMESPAN_MAX];
739 [ # # # # : 0 : log_unit_full(u, u->warned_clamping_cpu_quota_period ? LOG_DEBUG : LOG_WARNING, 0,
# # ]
740 : : "Clamping CPU interval for cpu.max: period is now %s",
741 : : format_timespan(v, sizeof(v), new_period, 1));
742 : 0 : u->warned_clamping_cpu_quota_period = true;
743 : : }
744 : :
745 : 0 : return new_period;
746 : : }
747 : :
748 : 0 : static void cgroup_apply_unified_cpu_weight(Unit *u, uint64_t weight) {
749 : : char buf[DECIMAL_STR_MAX(uint64_t) + 2];
750 : :
751 [ # # ]: 0 : xsprintf(buf, "%" PRIu64 "\n", weight);
752 : 0 : (void) set_attribute_and_warn(u, "cpu", "cpu.weight", buf);
753 : 0 : }
754 : :
755 : 0 : static void cgroup_apply_unified_cpu_quota(Unit *u, usec_t quota, usec_t period) {
756 : : char buf[(DECIMAL_STR_MAX(usec_t) + 1) * 2 + 1];
757 : :
758 : 0 : period = cgroup_cpu_adjust_period_and_log(u, period, quota);
759 [ # # ]: 0 : if (quota != USEC_INFINITY)
760 [ # # ]: 0 : xsprintf(buf, USEC_FMT " " USEC_FMT "\n",
761 : : MAX(quota * period / USEC_PER_SEC, USEC_PER_MSEC), period);
762 : : else
763 [ # # ]: 0 : xsprintf(buf, "max " USEC_FMT "\n", period);
764 : 0 : (void) set_attribute_and_warn(u, "cpu", "cpu.max", buf);
765 : 0 : }
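/* The resulting cpu.max value (illustrative): CPUQuota=50% with the
 * default 100ms period is written as
 *
 *   50000 100000
 *
 * i.e. 50ms of CPU time per 100ms period, while an infinite quota is
 * written as "max 100000", lifting the limit but keeping the period. */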
766 : :
767 : 0 : static void cgroup_apply_legacy_cpu_shares(Unit *u, uint64_t shares) {
768 : : char buf[DECIMAL_STR_MAX(uint64_t) + 2];
769 : :
770 [ # # ]: 0 : xsprintf(buf, "%" PRIu64 "\n", shares);
771 : 0 : (void) set_attribute_and_warn(u, "cpu", "cpu.shares", buf);
772 : 0 : }
773 : :
774 : 0 : static void cgroup_apply_legacy_cpu_quota(Unit *u, usec_t quota, usec_t period) {
775 : : char buf[DECIMAL_STR_MAX(usec_t) + 2];
776 : :
777 : 0 : period = cgroup_cpu_adjust_period_and_log(u, period, quota);
778 : :
779 [ # # ]: 0 : xsprintf(buf, USEC_FMT "\n", period);
780 : 0 : (void) set_attribute_and_warn(u, "cpu", "cpu.cfs_period_us", buf);
781 : :
782 [ # # ]: 0 : if (quota != USEC_INFINITY) {
783 [ # # ]: 0 : xsprintf(buf, USEC_FMT "\n", MAX(quota * period / USEC_PER_SEC, USEC_PER_MSEC));
784 : 0 : (void) set_attribute_and_warn(u, "cpu", "cpu.cfs_quota_us", buf);
785 : : } else
786 : 0 : (void) set_attribute_and_warn(u, "cpu", "cpu.cfs_quota_us", "-1\n");
787 : 0 : }
788 : :
789 : 0 : static uint64_t cgroup_cpu_shares_to_weight(uint64_t shares) {
790 [ # # ]: 0 : return CLAMP(shares * CGROUP_WEIGHT_DEFAULT / CGROUP_CPU_SHARES_DEFAULT,
791 : : CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX);
792 : : }
793 : :
794 : 0 : static uint64_t cgroup_cpu_weight_to_shares(uint64_t weight) {
795 [ # # ]: 0 : return CLAMP(weight * CGROUP_CPU_SHARES_DEFAULT / CGROUP_WEIGHT_DEFAULT,
796 : : CGROUP_CPU_SHARES_MIN, CGROUP_CPU_SHARES_MAX);
797 : : }
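/* Worked conversions (assuming the usual defaults of 1024 for CPUShares=
 * and 100 for CPUWeight=): shares 1024 map to weight 100, 2048 to 200 and
 * 512 to 50; the result is clamped to the valid weight range. The reverse
 * direction scales by 1024/100 and clamps to the valid shares range. */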
798 : :
799 : 2316 : static bool cgroup_context_has_io_config(CGroupContext *c) {
800 : 4632 : return c->io_accounting ||
801 [ + + ]: 2316 : c->io_weight != CGROUP_WEIGHT_INVALID ||
802 [ + - ]: 2276 : c->startup_io_weight != CGROUP_WEIGHT_INVALID ||
803 [ + - ]: 2276 : c->io_device_weights ||
804 [ + - + - ]: 6908 : c->io_device_latencies ||
805 [ - + ]: 2276 : c->io_device_limits;
806 : : }
807 : :
808 : 2276 : static bool cgroup_context_has_blockio_config(CGroupContext *c) {
809 : 4552 : return c->blockio_accounting ||
810 [ + - ]: 2276 : c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
811 [ + - ]: 2276 : c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
812 [ + - + - ]: 6828 : c->blockio_device_weights ||
813 [ - + ]: 2276 : c->blockio_device_bandwidths;
814 : : }
815 : :
816 : 0 : static uint64_t cgroup_context_io_weight(CGroupContext *c, ManagerState state) {
817 [ # # # # ]: 0 : if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
818 [ # # ]: 0 : c->startup_io_weight != CGROUP_WEIGHT_INVALID)
819 : 0 : return c->startup_io_weight;
820 [ # # ]: 0 : else if (c->io_weight != CGROUP_WEIGHT_INVALID)
821 : 0 : return c->io_weight;
822 : : else
823 : 0 : return CGROUP_WEIGHT_DEFAULT;
824 : : }
825 : :
826 : 0 : static uint64_t cgroup_context_blkio_weight(CGroupContext *c, ManagerState state) {
827 [ # # # # ]: 0 : if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
828 [ # # ]: 0 : c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID)
829 : 0 : return c->startup_blockio_weight;
830 [ # # ]: 0 : else if (c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID)
831 : 0 : return c->blockio_weight;
832 : : else
833 : 0 : return CGROUP_BLKIO_WEIGHT_DEFAULT;
834 : : }
835 : :
836 : 0 : static uint64_t cgroup_weight_blkio_to_io(uint64_t blkio_weight) {
837 [ # # ]: 0 : return CLAMP(blkio_weight * CGROUP_WEIGHT_DEFAULT / CGROUP_BLKIO_WEIGHT_DEFAULT,
838 : : CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX);
839 : : }
840 : :
841 : 0 : static uint64_t cgroup_weight_io_to_blkio(uint64_t io_weight) {
842 [ # # ]: 0 : return CLAMP(io_weight * CGROUP_BLKIO_WEIGHT_DEFAULT / CGROUP_WEIGHT_DEFAULT,
843 : : CGROUP_BLKIO_WEIGHT_MIN, CGROUP_BLKIO_WEIGHT_MAX);
844 : : }
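/* Worked conversions (assuming the usual defaults of 500 for BlockIOWeight=
 * and 100 for IOWeight=): blkio weight 500 maps to io weight 100, 1000 to
 * 200 and 10 to 2, each clamped to the respective valid range. */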
845 : :
846 : 0 : static void cgroup_apply_io_device_weight(Unit *u, const char *dev_path, uint64_t io_weight) {
847 : : char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
848 : : dev_t dev;
849 : : int r;
850 : :
851 : 0 : r = lookup_block_device(dev_path, &dev);
852 [ # # ]: 0 : if (r < 0)
853 : 0 : return;
854 : :
855 [ # # ]: 0 : xsprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), io_weight);
856 : 0 : (void) set_attribute_and_warn(u, "io", "io.weight", buf);
857 : : }
858 : :
859 : 0 : static void cgroup_apply_blkio_device_weight(Unit *u, const char *dev_path, uint64_t blkio_weight) {
860 : : char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
861 : : dev_t dev;
862 : : int r;
863 : :
864 : 0 : r = lookup_block_device(dev_path, &dev);
865 [ # # ]: 0 : if (r < 0)
866 : 0 : return;
867 : :
868 [ # # ]: 0 : xsprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), blkio_weight);
869 : 0 : (void) set_attribute_and_warn(u, "blkio", "blkio.weight_device", buf);
870 : : }
871 : :
872 : 0 : static void cgroup_apply_io_device_latency(Unit *u, const char *dev_path, usec_t target) {
873 : : char buf[DECIMAL_STR_MAX(dev_t)*2+2+7+DECIMAL_STR_MAX(uint64_t)+1];
874 : : dev_t dev;
875 : : int r;
876 : :
877 : 0 : r = lookup_block_device(dev_path, &dev);
878 [ # # ]: 0 : if (r < 0)
879 : 0 : return;
880 : :
881 [ # # ]: 0 : if (target != USEC_INFINITY)
882 [ # # ]: 0 : xsprintf(buf, "%u:%u target=%" PRIu64 "\n", major(dev), minor(dev), target);
883 : : else
884 [ # # ]: 0 : xsprintf(buf, "%u:%u target=max\n", major(dev), minor(dev));
885 : :
886 : 0 : (void) set_attribute_and_warn(u, "io", "io.latency", buf);
887 : : }
888 : :
889 : 0 : static void cgroup_apply_io_device_limit(Unit *u, const char *dev_path, uint64_t *limits) {
890 : : char limit_bufs[_CGROUP_IO_LIMIT_TYPE_MAX][DECIMAL_STR_MAX(uint64_t)];
891 : : char buf[DECIMAL_STR_MAX(dev_t)*2+2+(6+DECIMAL_STR_MAX(uint64_t)+1)*4];
892 : : CGroupIOLimitType type;
893 : : dev_t dev;
894 : : int r;
895 : :
896 : 0 : r = lookup_block_device(dev_path, &dev);
897 [ # # ]: 0 : if (r < 0)
898 : 0 : return;
899 : :
900 [ # # ]: 0 : for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
901 [ # # ]: 0 : if (limits[type] != cgroup_io_limit_defaults[type])
902 [ # # ]: 0 : xsprintf(limit_bufs[type], "%" PRIu64, limits[type]);
903 : : else
904 [ # # # # ]: 0 : xsprintf(limit_bufs[type], "%s", limits[type] == CGROUP_LIMIT_MAX ? "max" : "0");
905 : :
906 [ # # ]: 0 : xsprintf(buf, "%u:%u rbps=%s wbps=%s riops=%s wiops=%s\n", major(dev), minor(dev),
907 : : limit_bufs[CGROUP_IO_RBPS_MAX], limit_bufs[CGROUP_IO_WBPS_MAX],
908 : : limit_bufs[CGROUP_IO_RIOPS_MAX], limit_bufs[CGROUP_IO_WIOPS_MAX]);
909 : 0 : (void) set_attribute_and_warn(u, "io", "io.max", buf);
910 : : }
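/* Example of the resulting io.max line (illustrative): device 8:0 with only
 * a read bandwidth limit of 1 MB/s configured, all other limit types left
 * at their defaults, is written as
 *
 *   8:0 rbps=1000000 wbps=max riops=max wiops=max
 */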
911 : :
912 : 0 : static void cgroup_apply_blkio_device_limit(Unit *u, const char *dev_path, uint64_t rbps, uint64_t wbps) {
913 : : char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
914 : : dev_t dev;
915 : : int r;
916 : :
917 : 0 : r = lookup_block_device(dev_path, &dev);
918 [ # # ]: 0 : if (r < 0)
919 : 0 : return;
920 : :
921 : 0 : sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), rbps);
922 : 0 : (void) set_attribute_and_warn(u, "blkio", "blkio.throttle.read_bps_device", buf);
923 : :
924 : 0 : sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), wbps);
925 : 0 : (void) set_attribute_and_warn(u, "blkio", "blkio.throttle.write_bps_device", buf);
926 : : }
927 : :
928 : 220 : static bool unit_has_unified_memory_config(Unit *u) {
929 : : CGroupContext *c;
930 : :
931 [ - + ]: 220 : assert(u);
932 : :
933 : 220 : c = unit_get_cgroup_context(u);
934 [ - + ]: 220 : assert(c);
935 : :
936 [ + - ]: 220 : return c->memory_min > 0 || unit_get_ancestor_memory_low(u) > 0 ||
937 [ + - + - : 660 : c->memory_high != CGROUP_LIMIT_MAX || c->memory_max != CGROUP_LIMIT_MAX ||
+ - ]
938 [ - + ]: 220 : c->memory_swap_max != CGROUP_LIMIT_MAX;
939 : : }
940 : :
941 : 0 : static void cgroup_apply_unified_memory_limit(Unit *u, const char *file, uint64_t v) {
942 : 0 : char buf[DECIMAL_STR_MAX(uint64_t) + 1] = "max\n";
943 : :
944 [ # # ]: 0 : if (v != CGROUP_LIMIT_MAX)
945 [ # # ]: 0 : xsprintf(buf, "%" PRIu64 "\n", v);
946 : :
947 : 0 : (void) set_attribute_and_warn(u, "memory", file, buf);
948 : 0 : }
949 : :
950 : 0 : static void cgroup_apply_firewall(Unit *u) {
951 [ # # ]: 0 : assert(u);
952 : :
953 : : /* Best-effort: let's apply IP firewalling and/or accounting if that's enabled */
954 : :
955 [ # # ]: 0 : if (bpf_firewall_compile(u) < 0)
956 : 0 : return;
957 : :
958 : 0 : (void) bpf_firewall_load_custom(u);
959 : 0 : (void) bpf_firewall_install(u);
960 : : }
961 : :
962 : 24 : static void cgroup_context_apply(
963 : : Unit *u,
964 : : CGroupMask apply_mask,
965 : : ManagerState state) {
966 : :
967 : : const char *path;
968 : : CGroupContext *c;
969 : : bool is_host_root, is_local_root;
970 : : int r;
971 : :
972 [ - + ]: 24 : assert(u);
973 : :
974 : : /* Nothing to do? Exit early! */
975 [ - + ]: 24 : if (apply_mask == 0)
976 : 0 : return;
977 : :
978 : : /* Some cgroup attributes are not supported on the host root cgroup, hence silently ignore them here. Other
979 : : * attributes should only be managed for cgroups further down the tree. */
980 : 24 : is_local_root = unit_has_name(u, SPECIAL_ROOT_SLICE);
981 : 24 : is_host_root = unit_has_host_root_cgroup(u);
982 : :
983 [ - + ]: 24 : assert_se(c = unit_get_cgroup_context(u));
984 [ - + ]: 24 : assert_se(path = u->cgroup_path);
985 : :
986 [ + - ]: 24 : if (is_local_root) /* Make sure we don't try to display messages with an empty path. */
987 : 24 : path = "/";
988 : :
989 : : /* We generally ignore errors caused by read-only mounted cgroup trees (assuming we are running in a container
990 : : * then), and missing cgroups, i.e. EROFS and ENOENT. */
991 : :
992 : : /* In fully unified mode these attributes don't exist on the host cgroup root. On legacy the weights exist, but
993 : : * setting the weight makes very little sense on the host root cgroup, as there are no other cgroups at this
994 : : * level. The quota exists there too, but any attempt to write to it is refused with EINVAL. Inside of
995 : : * containers we want to leave control of these to the container manager (and if cgroup v2 delegation is used
996 : : * we couldn't even write to them if we wanted to). */
997 [ - + # # ]: 24 : if ((apply_mask & CGROUP_MASK_CPU) && !is_local_root) {
998 : :
999 [ # # ]: 0 : if (cg_all_unified() > 0) {
1000 : : uint64_t weight;
1001 : :
1002 [ # # ]: 0 : if (cgroup_context_has_cpu_weight(c))
1003 : 0 : weight = cgroup_context_cpu_weight(c, state);
1004 [ # # ]: 0 : else if (cgroup_context_has_cpu_shares(c)) {
1005 : : uint64_t shares;
1006 : :
1007 : 0 : shares = cgroup_context_cpu_shares(c, state);
1008 : 0 : weight = cgroup_cpu_shares_to_weight(shares);
1009 : :
1010 [ # # ]: 0 : log_cgroup_compat(u, "Applying [Startup]CPUShares=%" PRIu64 " as [Startup]CPUWeight=%" PRIu64 " on %s",
1011 : : shares, weight, path);
1012 : : } else
1013 : 0 : weight = CGROUP_WEIGHT_DEFAULT;
1014 : :
1015 : 0 : cgroup_apply_unified_cpu_weight(u, weight);
1016 : 0 : cgroup_apply_unified_cpu_quota(u, c->cpu_quota_per_sec_usec, c->cpu_quota_period_usec);
1017 : :
1018 : : } else {
1019 : : uint64_t shares;
1020 : :
1021 [ # # ]: 0 : if (cgroup_context_has_cpu_weight(c)) {
1022 : : uint64_t weight;
1023 : :
1024 : 0 : weight = cgroup_context_cpu_weight(c, state);
1025 : 0 : shares = cgroup_cpu_weight_to_shares(weight);
1026 : :
1027 [ # # ]: 0 : log_cgroup_compat(u, "Applying [Startup]CPUWeight=%" PRIu64 " as [Startup]CPUShares=%" PRIu64 " on %s",
1028 : : weight, shares, path);
1029 [ # # ]: 0 : } else if (cgroup_context_has_cpu_shares(c))
1030 : 0 : shares = cgroup_context_cpu_shares(c, state);
1031 : : else
1032 : 0 : shares = CGROUP_CPU_SHARES_DEFAULT;
1033 : :
1034 : 0 : cgroup_apply_legacy_cpu_shares(u, shares);
1035 : 0 : cgroup_apply_legacy_cpu_quota(u, c->cpu_quota_per_sec_usec, c->cpu_quota_period_usec);
1036 : : }
1037 : : }
1038 : :
1039 : : /* The 'io' controller attributes are not exported on the host's root cgroup (being a pure cgroup v2
1040 : : * controller), and in case of containers we want to leave control of these attributes to the container manager
1041 : : * (and we couldn't access that stuff anyway, even if we tried, when proper delegation is used). */
1042 [ - + # # ]: 24 : if ((apply_mask & CGROUP_MASK_IO) && !is_local_root) {
1043 : : char buf[8+DECIMAL_STR_MAX(uint64_t)+1];
1044 : : bool has_io, has_blockio;
1045 : : uint64_t weight;
1046 : :
1047 : 0 : has_io = cgroup_context_has_io_config(c);
1048 : 0 : has_blockio = cgroup_context_has_blockio_config(c);
1049 : :
1050 [ # # ]: 0 : if (has_io)
1051 : 0 : weight = cgroup_context_io_weight(c, state);
1052 [ # # ]: 0 : else if (has_blockio) {
1053 : : uint64_t blkio_weight;
1054 : :
1055 : 0 : blkio_weight = cgroup_context_blkio_weight(c, state);
1056 : 0 : weight = cgroup_weight_blkio_to_io(blkio_weight);
1057 : :
1058 [ # # ]: 0 : log_cgroup_compat(u, "Applying [Startup]BlockIOWeight=%" PRIu64 " as [Startup]IOWeight=%" PRIu64,
1059 : : blkio_weight, weight);
1060 : : } else
1061 : 0 : weight = CGROUP_WEIGHT_DEFAULT;
1062 : :
1063 [ # # ]: 0 : xsprintf(buf, "default %" PRIu64 "\n", weight);
1064 : 0 : (void) set_attribute_and_warn(u, "io", "io.weight", buf);
1065 : :
1066 : : /* FIXME: drop this when distro kernels properly support BFQ through "io.weight"
1067 : : * See also: https://github.com/systemd/systemd/pull/13335 */
1068 [ # # ]: 0 : xsprintf(buf, "%" PRIu64 "\n", weight);
1069 : 0 : (void) set_attribute_and_warn(u, "io", "io.bfq.weight", buf);
1070 : :
1071 [ # # ]: 0 : if (has_io) {
1072 : : CGroupIODeviceLatency *latency;
1073 : : CGroupIODeviceLimit *limit;
1074 : : CGroupIODeviceWeight *w;
1075 : :
1076 [ # # ]: 0 : LIST_FOREACH(device_weights, w, c->io_device_weights)
1077 : 0 : cgroup_apply_io_device_weight(u, w->path, w->weight);
1078 : :
1079 [ # # ]: 0 : LIST_FOREACH(device_limits, limit, c->io_device_limits)
1080 : 0 : cgroup_apply_io_device_limit(u, limit->path, limit->limits);
1081 : :
1082 [ # # ]: 0 : LIST_FOREACH(device_latencies, latency, c->io_device_latencies)
1083 : 0 : cgroup_apply_io_device_latency(u, latency->path, latency->target_usec);
1084 : :
1085 [ # # ]: 0 : } else if (has_blockio) {
1086 : : CGroupBlockIODeviceWeight *w;
1087 : : CGroupBlockIODeviceBandwidth *b;
1088 : :
1089 [ # # ]: 0 : LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
1090 : 0 : weight = cgroup_weight_blkio_to_io(w->weight);
1091 : :
1092 [ # # ]: 0 : log_cgroup_compat(u, "Applying BlockIODeviceWeight=%" PRIu64 " as IODeviceWeight=%" PRIu64 " for %s",
1093 : : w->weight, weight, w->path);
1094 : :
1095 : 0 : cgroup_apply_io_device_weight(u, w->path, weight);
1096 : : }
1097 : :
1098 [ # # ]: 0 : LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
1099 : : uint64_t limits[_CGROUP_IO_LIMIT_TYPE_MAX];
1100 : : CGroupIOLimitType type;
1101 : :
1102 [ # # ]: 0 : for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
1103 : 0 : limits[type] = cgroup_io_limit_defaults[type];
1104 : :
1105 : 0 : limits[CGROUP_IO_RBPS_MAX] = b->rbps;
1106 : 0 : limits[CGROUP_IO_WBPS_MAX] = b->wbps;
1107 : :
1108 [ # # ]: 0 : log_cgroup_compat(u, "Applying BlockIO{Read|Write}Bandwidth=%" PRIu64 " %" PRIu64 " as IO{Read|Write}BandwidthMax= for %s",
1109 : : b->rbps, b->wbps, b->path);
1110 : :
1111 : 0 : cgroup_apply_io_device_limit(u, b->path, limits);
1112 : : }
1113 : : }
1114 : : }
1115 : :
1116 [ - + ]: 24 : if (apply_mask & CGROUP_MASK_BLKIO) {
1117 : : bool has_io, has_blockio;
1118 : :
1119 : 0 : has_io = cgroup_context_has_io_config(c);
1120 : 0 : has_blockio = cgroup_context_has_blockio_config(c);
1121 : :
1122 : : /* Applying a 'weight' never makes sense for the host root cgroup, and for containers this should be
1123 : : * left to our container manager, too. */
1124 [ # # ]: 0 : if (!is_local_root) {
1125 : : char buf[DECIMAL_STR_MAX(uint64_t)+1];
1126 : : uint64_t weight;
1127 : :
1128 [ # # ]: 0 : if (has_io) {
1129 : : uint64_t io_weight;
1130 : :
1131 : 0 : io_weight = cgroup_context_io_weight(c, state);
1132 : 0 : weight = cgroup_weight_io_to_blkio(io_weight);
1133 : :
1134 [ # # ]: 0 : log_cgroup_compat(u, "Applying [Startup]IOWeight=%" PRIu64 " as [Startup]BlockIOWeight=%" PRIu64,
1135 : : io_weight, weight);
1136 [ # # ]: 0 : } else if (has_blockio)
1137 : 0 : weight = cgroup_context_blkio_weight(c, state);
1138 : : else
1139 : 0 : weight = CGROUP_BLKIO_WEIGHT_DEFAULT;
1140 : :
1141 [ # # ]: 0 : xsprintf(buf, "%" PRIu64 "\n", weight);
1142 : 0 : (void) set_attribute_and_warn(u, "blkio", "blkio.weight", buf);
1143 : :
1144 [ # # ]: 0 : if (has_io) {
1145 : : CGroupIODeviceWeight *w;
1146 : :
1147 [ # # ]: 0 : LIST_FOREACH(device_weights, w, c->io_device_weights) {
1148 : 0 : weight = cgroup_weight_io_to_blkio(w->weight);
1149 : :
1150 [ # # ]: 0 : log_cgroup_compat(u, "Applying IODeviceWeight=%" PRIu64 " as BlockIODeviceWeight=%" PRIu64 " for %s",
1151 : : w->weight, weight, w->path);
1152 : :
1153 : 0 : cgroup_apply_blkio_device_weight(u, w->path, weight);
1154 : : }
1155 [ # # ]: 0 : } else if (has_blockio) {
1156 : : CGroupBlockIODeviceWeight *w;
1157 : :
1158 [ # # ]: 0 : LIST_FOREACH(device_weights, w, c->blockio_device_weights)
1159 : 0 : cgroup_apply_blkio_device_weight(u, w->path, w->weight);
1160 : : }
1161 : : }
1162 : :
1163 : : /* The bandwidth limits are something that make sense to be applied to the host's root but not container
1164 : : * roots, as there we want the container manager to handle it */
1165 [ # # # # ]: 0 : if (is_host_root || !is_local_root) {
1166 [ # # ]: 0 : if (has_io) {
1167 : : CGroupIODeviceLimit *l;
1168 : :
1169 [ # # ]: 0 : LIST_FOREACH(device_limits, l, c->io_device_limits) {
1170 [ # # ]: 0 : log_cgroup_compat(u, "Applying IO{Read|Write}Bandwidth=%" PRIu64 " %" PRIu64 " as BlockIO{Read|Write}BandwidthMax= for %s",
1171 : : l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX], l->path);
1172 : :
1173 : 0 : cgroup_apply_blkio_device_limit(u, l->path, l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX]);
1174 : : }
1175 [ # # ]: 0 : } else if (has_blockio) {
1176 : : CGroupBlockIODeviceBandwidth *b;
1177 : :
1178 [ # # ]: 0 : LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths)
1179 : 0 : cgroup_apply_blkio_device_limit(u, b->path, b->rbps, b->wbps);
1180 : : }
1181 : : }
1182 : : }
1183 : :
1184 : : /* In unified mode 'memory' attributes do not exist on the root cgroup. In legacy mode 'memory.limit_in_bytes'
1185 : : * exists on the root cgroup, but any writes to it are refused with EINVAL. And if we run in a container we
1186 : : * want to leave control to the container manager (and if proper cgroup v2 delegation is used we couldn't even
1187 : : * write to this if we wanted to). */
1188 [ + - - + ]: 24 : if ((apply_mask & CGROUP_MASK_MEMORY) && !is_local_root) {
1189 : :
1190 [ # # ]: 0 : if (cg_all_unified() > 0) {
1191 : 0 : uint64_t max, swap_max = CGROUP_LIMIT_MAX;
1192 : :
1193 [ # # ]: 0 : if (unit_has_unified_memory_config(u)) {
1194 : 0 : max = c->memory_max;
1195 : 0 : swap_max = c->memory_swap_max;
1196 : : } else {
1197 : 0 : max = c->memory_limit;
1198 : :
1199 [ # # ]: 0 : if (max != CGROUP_LIMIT_MAX)
1200 [ # # ]: 0 : log_cgroup_compat(u, "Applying MemoryLimit=%" PRIu64 " as MemoryMax=", max);
1201 : : }
1202 : :
1203 : 0 : cgroup_apply_unified_memory_limit(u, "memory.min", c->memory_min);
1204 : 0 : cgroup_apply_unified_memory_limit(u, "memory.low", unit_get_ancestor_memory_low(u));
1205 : 0 : cgroup_apply_unified_memory_limit(u, "memory.high", c->memory_high);
1206 : 0 : cgroup_apply_unified_memory_limit(u, "memory.max", max);
1207 : 0 : cgroup_apply_unified_memory_limit(u, "memory.swap.max", swap_max);
1208 : :
1209 : 0 : (void) set_attribute_and_warn(u, "memory", "memory.oom.group", one_zero(c->memory_oom_group));
1210 : :
1211 : : } else {
1212 : : char buf[DECIMAL_STR_MAX(uint64_t) + 1];
1213 : : uint64_t val;
1214 : :
1215 [ # # ]: 0 : if (unit_has_unified_memory_config(u)) {
1216 : 0 : val = c->memory_max;
1217 [ # # ]: 0 : log_cgroup_compat(u, "Applying MemoryMax=%" PRIu64 " as MemoryLimit=", val);
1218 : : } else
1219 : 0 : val = c->memory_limit;
1220 : :
1221 [ # # ]: 0 : if (val == CGROUP_LIMIT_MAX)
1222 : 0 : strncpy(buf, "-1\n", sizeof(buf));
1223 : : else
1224 [ # # ]: 0 : xsprintf(buf, "%" PRIu64 "\n", val);
1225 : :
1226 : 0 : (void) set_attribute_and_warn(u, "memory", "memory.limit_in_bytes", buf);
1227 : : }
1228 : : }
1229 : :
1230 : : /* On cgroup v2 we can apply BPF everywhere. On cgroup v1 we apply it everywhere except for the root of
1231 : : * containers, where we leave this to the manager */
1232 [ - + # # ]: 24 : if ((apply_mask & (CGROUP_MASK_DEVICES | CGROUP_MASK_BPF_DEVICES)) &&
1233 [ # # # # ]: 0 : (is_host_root || cg_all_unified() > 0 || !is_local_root)) {
1234 : 0 : _cleanup_(bpf_program_unrefp) BPFProgram *prog = NULL;
1235 : : CGroupDeviceAllow *a;
1236 : :
1237 [ # # ]: 0 : if (cg_all_unified() > 0) {
1238 : 0 : r = cgroup_init_device_bpf(&prog, c->device_policy, c->device_allow);
1239 [ # # ]: 0 : if (r < 0)
1240 [ # # ]: 0 : log_unit_warning_errno(u, r, "Failed to initialize device control bpf program: %m");
1241 : : } else {
1242 : : /* Changing the devices list of a populated cgroup might result in EINVAL, hence ignore EINVAL
1243 : : * here. */
1244 : :
1245 [ # # # # ]: 0 : if (c->device_allow || c->device_policy != CGROUP_AUTO)
1246 : 0 : r = cg_set_attribute("devices", path, "devices.deny", "a");
1247 : : else
1248 : 0 : r = cg_set_attribute("devices", path, "devices.allow", "a");
1249 [ # # ]: 0 : if (r < 0)
1250 [ # # # # : 0 : log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
# # # # #
# ]
1251 : : "Failed to reset devices.allow/devices.deny: %m");
1252 : : }
1253 : :
1254 [ # # ]: 0 : if (c->device_policy == CGROUP_CLOSED ||
1255 [ # # # # ]: 0 : (c->device_policy == CGROUP_AUTO && c->device_allow)) {
1256 : : static const char auto_devices[] =
1257 : : "/dev/null\0" "rwm\0"
1258 : : "/dev/zero\0" "rwm\0"
1259 : : "/dev/full\0" "rwm\0"
1260 : : "/dev/random\0" "rwm\0"
1261 : : "/dev/urandom\0" "rwm\0"
1262 : : "/dev/tty\0" "rwm\0"
1263 : : "/dev/ptmx\0" "rwm\0"
1264 : : /* Allow /run/systemd/inaccessible/{chr,blk} devices for mapping InaccessiblePaths */
1265 : : "/run/systemd/inaccessible/chr\0" "rwm\0"
1266 : : "/run/systemd/inaccessible/blk\0" "rwm\0";
1267 : :
1268 : : const char *x, *y;
1269 : :
1270 [ # # # # : 0 : NULSTR_FOREACH_PAIR(x, y, auto_devices)
# # ]
1271 : 0 : (void) whitelist_device(prog, path, x, y);
1272 : :
1273 : : /* PTS (/dev/pts) devices may not be duplicated, but accessed */
1274 : 0 : (void) whitelist_major(prog, path, "pts", 'c', "rw");
1275 : : }
1276 : :
1277 [ # # ]: 0 : LIST_FOREACH(device_allow, a, c->device_allow) {
1278 : : char acc[4], *val;
1279 : 0 : unsigned k = 0;
1280 : :
1281 [ # # ]: 0 : if (a->r)
1282 : 0 : acc[k++] = 'r';
1283 [ # # ]: 0 : if (a->w)
1284 : 0 : acc[k++] = 'w';
1285 [ # # ]: 0 : if (a->m)
1286 : 0 : acc[k++] = 'm';
1287 : :
1288 [ # # ]: 0 : if (k == 0)
1289 : 0 : continue;
1290 : :
1291 : 0 : acc[k++] = 0;
1292 : :
1293 [ # # ]: 0 : if (path_startswith(a->path, "/dev/"))
1294 : 0 : (void) whitelist_device(prog, path, a->path, acc);
1295 [ # # ]: 0 : else if ((val = startswith(a->path, "block-")))
1296 : 0 : (void) whitelist_major(prog, path, val, 'b', acc);
1297 [ # # ]: 0 : else if ((val = startswith(a->path, "char-")))
1298 : 0 : (void) whitelist_major(prog, path, val, 'c', acc);
1299 : : else
1300 [ # # ]: 0 : log_unit_debug(u, "Ignoring device '%s' while writing cgroup attribute.", a->path);
1301 : : }
1302 : :
1303 : 0 : r = cgroup_apply_device_bpf(u, prog, c->device_policy, c->device_allow);
1304 [ # # ]: 0 : if (r < 0) {
1305 : : static bool warned = false;
1306 : :
1307 [ # # # # ]: 0 : log_full_errno(warned ? LOG_DEBUG : LOG_WARNING, r,
1308 : : "Unit %s configures device ACL, but the local system doesn't seem to support the BPF-based device controller.\n"
1309 : : "Proceeding WITHOUT applying ACL (all devices will be accessible)!\n"
1310 : : "(This warning is only shown for the first loaded unit using device ACL.)", u->id);
1311 : :
1312 : 0 : warned = true;
1313 : : }
1314 : : }
1315 : :
1316 [ + - ]: 24 : if (apply_mask & CGROUP_MASK_PIDS) {
1317 : :
1318 [ - + ]: 24 : if (is_host_root) {
1319 : : /* So, the "pids" controller does not expose anything on the root cgroup, in order not to
1320 : : * replicate knobs exposed elsewhere needlessly. We abstract this away here however, and when
1321 : : * the knobs of the root cgroup are modified propagate this to the relevant sysctls. There's a
1322 : : * non-obvious asymmetry however: unlike the cgroup properties we don't really want to take
1323 : : * exclusive ownership of the sysctls, but we still want to honour things if the user sets
1324 : : * limits. Hence we employ sort of a one-way strategy: when the user sets a bounded limit
1325 : : * through us it counts. When the user afterwards unsets it again (i.e. sets it to unbounded)
1326 : : * it also counts. But if the user never set a limit through us (i.e. we are the default of
1327 : : * "unbounded") we leave things unmodified. For this we manage a global boolean that we turn on
1328 : : * the first time we set a limit. Note that this boolean is flushed out on manager reload,
1329 : : * which is desirable so that there's an official way to release control of the sysctl from
1330 : : * systemd: set the limit to unbounded and reload. */
1331 : :
1332 [ # # ]: 0 : if (c->tasks_max != CGROUP_LIMIT_MAX) {
1333 : 0 : u->manager->sysctl_pid_max_changed = true;
1334 : 0 : r = procfs_tasks_set_limit(c->tasks_max);
1335 [ # # ]: 0 : } else if (u->manager->sysctl_pid_max_changed)
1336 : 0 : r = procfs_tasks_set_limit(TASKS_MAX);
1337 : : else
1338 : 0 : r = 0;
1339 [ # # ]: 0 : if (r < 0)
1340 [ # # # # : 0 : log_unit_full(u, LOG_LEVEL_CGROUP_WRITE(r), r,
# # # # #
# ]
1341 : : "Failed to write to tasks limit sysctls: %m");
1342 : : }
1343 : :
1344 : : /* The attribute itself is not available on the host root cgroup, and in the container case we want to
1345 : : * leave it for the container manager. */
1346 [ - + ]: 24 : if (!is_local_root) {
1347 [ # # ]: 0 : if (c->tasks_max != CGROUP_LIMIT_MAX) {
1348 : : char buf[DECIMAL_STR_MAX(uint64_t) + 2];
1349 : :
1350 : 0 : sprintf(buf, "%" PRIu64 "\n", c->tasks_max);
1351 : 0 : (void) set_attribute_and_warn(u, "pids", "pids.max", buf);
1352 : : } else
1353 : 0 : (void) set_attribute_and_warn(u, "pids", "pids.max", "max\n");
1354 : : }
1355 : : }
1356 : :
1357 [ - + ]: 24 : if (apply_mask & CGROUP_MASK_BPF_FIREWALL)
1358 : 0 : cgroup_apply_firewall(u);
1359 : : }
1360 : :
1361 : 2316 : static bool unit_get_needs_bpf_firewall(Unit *u) {
1362 : : CGroupContext *c;
1363 : : Unit *p;
1364 [ - + ]: 2316 : assert(u);
1365 : :
1366 : 2316 : c = unit_get_cgroup_context(u);
1367 [ - + ]: 2316 : if (!c)
1368 : 0 : return false;
1369 : :
1370 [ + - ]: 2316 : if (c->ip_accounting ||
1371 [ + - ]: 2316 : c->ip_address_allow ||
1372 [ + - ]: 2316 : c->ip_address_deny ||
1373 [ + - ]: 2316 : c->ip_filters_ingress ||
1374 [ - + ]: 2316 : c->ip_filters_egress)
1375 : 0 : return true;
1376 : :
1377 : : /* If any parent slice has an IP access list defined, it applies too */
1378 [ + + ]: 4648 : for (p = UNIT_DEREF(u->slice); p; p = UNIT_DEREF(p->slice)) {
1379 : 2332 : c = unit_get_cgroup_context(p);
1380 [ - + ]: 2332 : if (!c)
1381 : 0 : return false;
1382 : :
1383 [ + - ]: 2332 : if (c->ip_address_allow ||
1384 [ - + ]: 2332 : c->ip_address_deny)
1385 : 0 : return true;
1386 : : }
1387 : :
1388 : 2316 : return false;
1389 : : }
1390 : :
1391 : 2316 : static CGroupMask unit_get_cgroup_mask(Unit *u) {
1392 : 2316 : CGroupMask mask = 0;
1393 : : CGroupContext *c;
1394 : :
1395 [ - + ]: 2316 : assert(u);
1396 : :
1397 : 2316 : c = unit_get_cgroup_context(u);
1398 : :
1399 [ - + ]: 2316 : assert(c);
1400 : :
1401 : : /* Figure out which controllers we need, based on the cgroup context object */
1402 : :
1403 [ + + ]: 2316 : if (c->cpu_accounting)
1404 : 20 : mask |= get_cpu_accounting_mask();
1405 : :
1406 [ + - + + ]: 4632 : if (cgroup_context_has_cpu_weight(c) ||
1407 : 2316 : cgroup_context_has_cpu_shares(c) ||
1408 [ - + ]: 2296 : c->cpu_quota_per_sec_usec != USEC_INFINITY)
1409 : 20 : mask |= CGROUP_MASK_CPU;
1410 : :
1411 [ + + - + ]: 2316 : if (cgroup_context_has_io_config(c) || cgroup_context_has_blockio_config(c))
1412 : 40 : mask |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO;
1413 : :
1414 [ + + ]: 2316 : if (c->memory_accounting ||
1415 [ + + - + ]: 460 : c->memory_limit != CGROUP_LIMIT_MAX ||
1416 : 220 : unit_has_unified_memory_config(u))
1417 : 2096 : mask |= CGROUP_MASK_MEMORY;
1418 : :
1419 [ + - ]: 2316 : if (c->device_allow ||
1420 [ - + ]: 2316 : c->device_policy != CGROUP_AUTO)
1421 : 0 : mask |= CGROUP_MASK_DEVICES | CGROUP_MASK_BPF_DEVICES;
1422 : :
1423 [ + + ]: 2316 : if (c->tasks_accounting ||
1424 [ - + ]: 260 : c->tasks_max != CGROUP_LIMIT_MAX)
1425 : 2056 : mask |= CGROUP_MASK_PIDS;
1426 : :
1427 : 2316 : return CGROUP_MASK_EXTEND_JOINED(mask);
1428 : : }
1429 : :
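/* Worked example (illustrative): a service with MemoryMax=1G and TasksMax=512 but no CPU or
 * IO settings yields CGROUP_MASK_MEMORY|CGROUP_MASK_PIDS here, before
 * CGROUP_MASK_EXTEND_JOINED() adds any controllers that are joined with these on the running
 * hierarchy. */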
1430 : 2316 : static CGroupMask unit_get_bpf_mask(Unit *u) {
1431 : 2316 : CGroupMask mask = 0;
1432 : :
 1433                 :            :         /* Figure out which BPF-based pseudo-controllers we need, based on the cgroup context, possibly taking
 1434                 :            :          * the unit's parent slices into account too. */
1435 : :
1436 [ - + ]: 2316 : if (unit_get_needs_bpf_firewall(u))
1437 : 0 : mask |= CGROUP_MASK_BPF_FIREWALL;
1438 : :
1439 : 2316 : return mask;
1440 : : }
1441 : :
1442 : 6396 : CGroupMask unit_get_own_mask(Unit *u) {
1443 : : CGroupContext *c;
1444 : :
1445 : : /* Returns the mask of controllers the unit needs for itself. If a unit is not properly loaded, return an empty
1446 : : * mask, as we shouldn't reflect it in the cgroup hierarchy then. */
1447 : :
1448 [ + + ]: 6396 : if (u->load_state != UNIT_LOADED)
1449 : 744 : return 0;
1450 : :
1451 : 5652 : c = unit_get_cgroup_context(u);
1452 [ + + ]: 5652 : if (!c)
1453 : 3336 : return 0;
1454 : :
1455 : 2316 : return (unit_get_cgroup_mask(u) | unit_get_bpf_mask(u) | unit_get_delegate_mask(u)) & ~unit_get_ancestor_disable_mask(u);
1456 : : }
1457 : :
1458 : 7124 : CGroupMask unit_get_delegate_mask(Unit *u) {
1459 : : CGroupContext *c;
1460 : :
1461 : : /* If delegation is turned on, then turn on selected controllers, unless we are on the legacy hierarchy and the
1462 : : * process we fork into is known to drop privileges, and hence shouldn't get access to the controllers.
1463 : : *
1464 : : * Note that on the unified hierarchy it is safe to delegate controllers to unprivileged services. */
1465 : :
1466 [ + - ]: 7124 : if (!unit_cgroup_delegate(u))
1467 : 7124 : return 0;
1468 : :
1469 [ # # ]: 0 : if (cg_all_unified() <= 0) {
1470 : : ExecContext *e;
1471 : :
1472 : 0 : e = unit_get_exec_context(u);
1473 [ # # # # ]: 0 : if (e && !exec_context_maintains_privileges(e))
1474 : 0 : return 0;
1475 : : }
1476 : :
1477 [ # # ]: 0 : assert_se(c = unit_get_cgroup_context(u));
1478 : 0 : return CGROUP_MASK_EXTEND_JOINED(c->delegate_controllers);
1479 : : }
1480 : :
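/* For illustration: a unit with "Delegate=pids memory" in its [Service] section gets
 * c->delegate_controllers = CGROUP_MASK_PIDS|CGROUP_MASK_MEMORY, which the function above
 * extends with joined controllers, unless we are on the legacy hierarchy and the service
 * drops privileges, in which case delegation is suppressed entirely. */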
1481 : 6580 : CGroupMask unit_get_members_mask(Unit *u) {
1482 [ - + ]: 6580 : assert(u);
1483 : :
1484 : : /* Returns the mask of controllers all of the unit's children require, merged */
1485 : :
1486 [ + + ]: 6580 : if (u->cgroup_members_mask_valid)
1487 : 5052 : return u->cgroup_members_mask; /* Use cached value if possible */
1488 : :
1489 : 1528 : u->cgroup_members_mask = 0;
1490 : :
1491 [ + + ]: 1528 : if (u->type == UNIT_SLICE) {
1492 : : void *v;
1493 : : Unit *member;
1494 : : Iterator i;
1495 : :
1496 [ + + ]: 1448 : HASHMAP_FOREACH_KEY(v, member, u->dependencies[UNIT_BEFORE], i) {
1497 [ + + ]: 1384 : if (UNIT_DEREF(member->slice) == u)
1498 : 1372 : u->cgroup_members_mask |= unit_get_subtree_mask(member); /* note that this calls ourselves again, for the children */
1499 : : }
1500 : : }
1501 : :
1502 : 1528 : u->cgroup_members_mask_valid = true;
1503 : 1528 : return u->cgroup_members_mask;
1504 : : }
1505 : :
1506 : 176 : CGroupMask unit_get_siblings_mask(Unit *u) {
1507 [ - + ]: 176 : assert(u);
1508 : :
1509 : : /* Returns the mask of controllers all of the unit's siblings
1510 : : * require, i.e. the members mask of the unit's parent slice
1511 : : * if there is one. */
1512 : :
1513 [ + + ]: 176 : if (UNIT_ISSET(u->slice))
1514 : 136 : return unit_get_members_mask(UNIT_DEREF(u->slice));
1515 : :
1516 : 40 : return unit_get_subtree_mask(u); /* we are the top-level slice */
1517 : : }
1518 : :
1519 : 4960 : CGroupMask unit_get_disable_mask(Unit *u) {
1520 : : CGroupContext *c;
1521 : :
1522 : 4960 : c = unit_get_cgroup_context(u);
1523 [ - + ]: 4960 : if (!c)
1524 : 0 : return 0;
1525 : :
1526 : 4960 : return c->disable_controllers;
1527 : : }
1528 : :
1529 : 4960 : CGroupMask unit_get_ancestor_disable_mask(Unit *u) {
1530 : : CGroupMask mask;
1531 : :
1532 [ - + ]: 4960 : assert(u);
1533 : 4960 : mask = unit_get_disable_mask(u);
1534 : :
1535 : : /* Returns the mask of controllers which are marked as forcibly
1536 : : * disabled in any ancestor unit or the unit in question. */
1537 : :
1538 [ + + ]: 4960 : if (UNIT_ISSET(u->slice))
1539 : 2484 : mask |= unit_get_ancestor_disable_mask(UNIT_DEREF(u->slice));
1540 : :
1541 : 4960 : return mask;
1542 : : }
1543 : :
1544 : 1412 : CGroupMask unit_get_subtree_mask(Unit *u) {
1545 : :
1546 : : /* Returns the mask of this subtree, meaning of the group
1547 : : * itself and its children. */
1548 : :
1549 : 1412 : return unit_get_own_mask(u) | unit_get_members_mask(u);
1550 : : }
1551 : :
1552 : 112 : CGroupMask unit_get_target_mask(Unit *u) {
1553 : : CGroupMask mask;
1554 : :
1555 : : /* This returns the cgroup mask of all controllers to enable
1556 : : * for a specific cgroup, i.e. everything it needs itself,
1557 : : * plus all that its children need, plus all that its siblings
1558 : : * need. This is primarily useful on the legacy cgroup
1559 : : * hierarchy, where we need to duplicate each cgroup in each
1560 : : * hierarchy that shall be enabled for it. */
1561 : :
1562 : 112 : mask = unit_get_own_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
1563 : :
1564 [ - + ]: 112 : if (mask & CGROUP_MASK_BPF_FIREWALL & ~u->manager->cgroup_supported)
1565 : 0 : emit_bpf_firewall_warning(u);
1566 : :
1567 : 112 : mask &= u->manager->cgroup_supported;
1568 : 112 : mask &= ~unit_get_ancestor_disable_mask(u);
1569 : :
1570 : 112 : return mask;
1571 : : }
1572 : :
1573 : 48 : CGroupMask unit_get_enable_mask(Unit *u) {
1574 : : CGroupMask mask;
1575 : :
1576 : : /* This returns the cgroup mask of all controllers to enable
1577 : : * for the children of a specific cgroup. This is primarily
1578 : : * useful for the unified cgroup hierarchy, where each cgroup
1579 : : * controls which controllers are enabled for its children. */
1580 : :
1581 : 48 : mask = unit_get_members_mask(u);
1582 : 48 : mask &= u->manager->cgroup_supported;
1583 : 48 : mask &= ~unit_get_ancestor_disable_mask(u);
1584 : :
1585 : 48 : return mask;
1586 : : }
1587 : :
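/* Worked example (illustrative): take a slice "s.slice" containing "a.service" (which wants
 * the memory controller) and "b.service" (which wants cpu). Then, roughly:
 *
 *   own(a)      = memory
 *   members(s)  = memory|cpu
 *   target(a)   = own(a) | members(a) | siblings(a) = memory | 0 | (memory|cpu)
 *   enable(s)   = members(s) & supported & ~disabled = memory|cpu
 *
 * i.e. a.service's cgroup is realized with its siblings' controllers too, and s.slice
 * enables memory and cpu for its children via cgroup.subtree_control. */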
1588 : 9028 : void unit_invalidate_cgroup_members_masks(Unit *u) {
1589 [ - + ]: 9028 : assert(u);
1590 : :
 1591                 :            :         /* Recursively invalidate the members mask cache all the way up the tree */
1592 : 9028 : u->cgroup_members_mask_valid = false;
1593 : :
1594 [ + + ]: 9028 : if (UNIT_ISSET(u->slice))
1595 : 1231 : unit_invalidate_cgroup_members_masks(UNIT_DEREF(u->slice));
1596 : 9028 : }
1597 : :
1598 : 0 : const char *unit_get_realized_cgroup_path(Unit *u, CGroupMask mask) {
1599 : :
1600 : : /* Returns the realized cgroup path of the specified unit where all specified controllers are available. */
1601 : :
1602 [ # # ]: 0 : while (u) {
1603 : :
1604 [ # # # # ]: 0 : if (u->cgroup_path &&
1605 : 0 : u->cgroup_realized &&
1606 [ # # ]: 0 : FLAGS_SET(u->cgroup_realized_mask, mask))
1607 : 0 : return u->cgroup_path;
1608 : :
1609 : 0 : u = UNIT_DEREF(u->slice);
1610 : : }
1611 : :
1612 : 0 : return NULL;
1613 : : }
1614 : :
1615 : 0 : static const char *migrate_callback(CGroupMask mask, void *userdata) {
1616 : 0 : return unit_get_realized_cgroup_path(userdata, mask);
1617 : : }
1618 : :
1619 : 48 : char *unit_default_cgroup_path(const Unit *u) {
1620 : 48 : _cleanup_free_ char *escaped = NULL, *slice = NULL;
1621 : : int r;
1622 : :
1623 [ - + ]: 48 : assert(u);
1624 : :
1625 [ + + ]: 48 : if (unit_has_name(u, SPECIAL_ROOT_SLICE))
1626 : 24 : return strdup(u->manager->cgroup_root);
1627 : :
1628 [ + - - + ]: 24 : if (UNIT_ISSET(u->slice) && !unit_has_name(UNIT_DEREF(u->slice), SPECIAL_ROOT_SLICE)) {
1629 : 0 : r = cg_slice_to_path(UNIT_DEREF(u->slice)->id, &slice);
1630 [ # # ]: 0 : if (r < 0)
1631 : 0 : return NULL;
1632 : : }
1633 : :
1634 : 24 : escaped = cg_escape(u->id);
1635 [ - + ]: 24 : if (!escaped)
1636 : 0 : return NULL;
1637 : :
1638 : 24 : return path_join(empty_to_root(u->manager->cgroup_root), slice, escaped);
1639 : : }
1640 : :
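/* Worked example (illustrative): for a unit "foo.service" in slice "bar-baz.slice",
 * cg_slice_to_path() expands the dash-encoded nesting to "bar.slice/bar-baz.slice", so with
 * an empty cgroup root the result is "/bar.slice/bar-baz.slice/foo.service". cg_escape()
 * additionally prefixes names that would otherwise collide with reserved cgroup attribute
 * names with "_". */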
1641 : 48 : int unit_set_cgroup_path(Unit *u, const char *path) {
1642 : 48 : _cleanup_free_ char *p = NULL;
1643 : : int r;
1644 : :
1645 [ - + ]: 48 : assert(u);
1646 : :
1647 [ - + ]: 48 : if (streq_ptr(u->cgroup_path, path))
1648 : 0 : return 0;
1649 : :
1650 [ + - ]: 48 : if (path) {
1651 : 48 : p = strdup(path);
1652 [ - + ]: 48 : if (!p)
1653 : 0 : return -ENOMEM;
1654 : : }
1655 : :
1656 [ + - ]: 48 : if (p) {
1657 : 48 : r = hashmap_put(u->manager->cgroup_unit, p, u);
1658 [ - + ]: 48 : if (r < 0)
1659 : 0 : return r;
1660 : : }
1661 : :
1662 : 48 : unit_release_cgroup(u);
1663 : 48 : u->cgroup_path = TAKE_PTR(p);
1664 : :
1665 : 48 : return 1;
1666 : : }
1667 : :
1668 : 24 : int unit_watch_cgroup(Unit *u) {
1669 : 24 : _cleanup_free_ char *events = NULL;
1670 : : int r;
1671 : :
1672 [ - + ]: 24 : assert(u);
1673 : :
 1674                 :            :         /* Watches the "cgroup.events" attribute of this unit's cgroup for "empty" events, but only if
1675 : : * cgroupv2 is available. */
1676 : :
1677 [ - + ]: 24 : if (!u->cgroup_path)
1678 : 0 : return 0;
1679 : :
1680 [ - + ]: 24 : if (u->cgroup_control_inotify_wd >= 0)
1681 : 0 : return 0;
1682 : :
1683 : : /* Only applies to the unified hierarchy */
1684 : 24 : r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
1685 [ - + ]: 24 : if (r < 0)
1686 [ # # ]: 0 : return log_error_errno(r, "Failed to determine whether the name=systemd hierarchy is unified: %m");
1687 [ - + ]: 24 : if (r == 0)
1688 : 0 : return 0;
1689 : :
 1690                 :            :         /* No point in watching the top-level slice, it's never going to run empty. */
1691 [ + - ]: 24 : if (unit_has_name(u, SPECIAL_ROOT_SLICE))
1692 : 24 : return 0;
1693 : :
1694 : 0 : r = hashmap_ensure_allocated(&u->manager->cgroup_control_inotify_wd_unit, &trivial_hash_ops);
1695 [ # # ]: 0 : if (r < 0)
1696 : 0 : return log_oom();
1697 : :
1698 : 0 : r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.events", &events);
1699 [ # # ]: 0 : if (r < 0)
1700 : 0 : return log_oom();
1701 : :
1702 : 0 : u->cgroup_control_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, events, IN_MODIFY);
1703 [ # # ]: 0 : if (u->cgroup_control_inotify_wd < 0) {
1704 : :
1705 [ # # ]: 0 : if (errno == ENOENT) /* If the directory is already gone we don't need to track it, so this
1706 : : * is not an error */
1707 : 0 : return 0;
1708 : :
1709 [ # # ]: 0 : return log_unit_error_errno(u, errno, "Failed to add control inotify watch descriptor for control group %s: %m", u->cgroup_path);
1710 : : }
1711 : :
1712 : 0 : r = hashmap_put(u->manager->cgroup_control_inotify_wd_unit, INT_TO_PTR(u->cgroup_control_inotify_wd), u);
1713 [ # # ]: 0 : if (r < 0)
1714 [ # # ]: 0 : return log_unit_error_errno(u, r, "Failed to add control inotify watch descriptor to hash map: %m");
1715 : :
1716 : 0 : return 0;
1717 : : }
1718 : :
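/* Aside: a self-contained sketch (not part of the original file; the cgroup path is
 * hypothetical) of the same mechanism: watching a unified-hierarchy "cgroup.events" file for
 * modifications. A real caller would poll() the returned fd and read() inotify events from
 * it, as the manager's event loop does. */
#include <sys/inotify.h>
#include <unistd.h>

static int watch_cgroup_events_example(void) {
        int fd, wd;

        fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
        if (fd < 0)
                return -1;

        /* The "populated" field of this file flips to 0 when the cgroup runs empty. */
        wd = inotify_add_watch(fd, "/sys/fs/cgroup/foo.slice/cgroup.events", IN_MODIFY);
        if (wd < 0) {
                close(fd);
                return -1; /* ENOENT would mean the cgroup is already gone */
        }

        return fd;
}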
1719 : 24 : int unit_watch_cgroup_memory(Unit *u) {
1720 : 24 : _cleanup_free_ char *events = NULL;
1721 : : CGroupContext *c;
1722 : : int r;
1723 : :
1724 [ - + ]: 24 : assert(u);
1725 : :
1726 : : /* Watches the "memory.events" attribute of this unit's cgroup for "oom_kill" events, but only if
1727 : : * cgroupv2 is available. */
1728 : :
1729 [ - + ]: 24 : if (!u->cgroup_path)
1730 : 0 : return 0;
1731 : :
1732 : 24 : c = unit_get_cgroup_context(u);
1733 [ - + ]: 24 : if (!c)
1734 : 0 : return 0;
1735 : :
1736 : : /* The "memory.events" attribute is only available if the memory controller is on. Let's hence tie
 1737                 :            :          * this to memory accounting; in a way, watching for OOM kills is a form of memory accounting after
 1738                 :            :          * all. */
1739 [ - + ]: 24 : if (!c->memory_accounting)
1740 : 0 : return 0;
1741 : :
 1742                 :            :         /* Don't watch inner nodes, as the kernel doesn't currently report oom_kill events recursively, and
1743 : : * we also don't want to generate a log message for each parent cgroup of a process. */
1744 [ + - ]: 24 : if (u->type == UNIT_SLICE)
1745 : 24 : return 0;
1746 : :
1747 [ # # ]: 0 : if (u->cgroup_memory_inotify_wd >= 0)
1748 : 0 : return 0;
1749 : :
1750 : : /* Only applies to the unified hierarchy */
1751 : 0 : r = cg_all_unified();
1752 [ # # ]: 0 : if (r < 0)
1753 [ # # ]: 0 : return log_error_errno(r, "Failed to determine whether the memory controller is unified: %m");
1754 [ # # ]: 0 : if (r == 0)
1755 : 0 : return 0;
1756 : :
1757 : 0 : r = hashmap_ensure_allocated(&u->manager->cgroup_memory_inotify_wd_unit, &trivial_hash_ops);
1758 [ # # ]: 0 : if (r < 0)
1759 : 0 : return log_oom();
1760 : :
1761 : 0 : r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "memory.events", &events);
1762 [ # # ]: 0 : if (r < 0)
1763 : 0 : return log_oom();
1764 : :
1765 : 0 : u->cgroup_memory_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, events, IN_MODIFY);
1766 [ # # ]: 0 : if (u->cgroup_memory_inotify_wd < 0) {
1767 : :
1768 [ # # ]: 0 : if (errno == ENOENT) /* If the directory is already gone we don't need to track it, so this
1769 : : * is not an error */
1770 : 0 : return 0;
1771 : :
1772 [ # # ]: 0 : return log_unit_error_errno(u, errno, "Failed to add memory inotify watch descriptor for control group %s: %m", u->cgroup_path);
1773 : : }
1774 : :
1775 : 0 : r = hashmap_put(u->manager->cgroup_memory_inotify_wd_unit, INT_TO_PTR(u->cgroup_memory_inotify_wd), u);
1776 [ # # ]: 0 : if (r < 0)
1777 [ # # ]: 0 : return log_unit_error_errno(u, r, "Failed to add memory inotify watch descriptor to hash map: %m");
1778 : :
1779 : 0 : return 0;
1780 : : }
1781 : :
1782 : 72 : int unit_pick_cgroup_path(Unit *u) {
1783 : 72 : _cleanup_free_ char *path = NULL;
1784 : : int r;
1785 : :
1786 [ - + ]: 72 : assert(u);
1787 : :
1788 [ + + ]: 72 : if (u->cgroup_path)
1789 : 24 : return 0;
1790 : :
1791 [ - + ]: 48 : if (!UNIT_HAS_CGROUP_CONTEXT(u))
1792 : 0 : return -EINVAL;
1793 : :
1794 : 48 : path = unit_default_cgroup_path(u);
1795 [ - + ]: 48 : if (!path)
1796 : 0 : return log_oom();
1797 : :
1798 : 48 : r = unit_set_cgroup_path(u, path);
1799 [ - + ]: 48 : if (r == -EEXIST)
1800 [ # # ]: 0 : return log_unit_error_errno(u, r, "Control group %s exists already.", path);
1801 [ - + ]: 48 : if (r < 0)
1802 [ # # ]: 0 : return log_unit_error_errno(u, r, "Failed to set unit's control group path to %s: %m", path);
1803 : :
1804 : 48 : return 0;
1805 : : }
1806 : :
1807 : 48 : static int unit_create_cgroup(
1808 : : Unit *u,
1809 : : CGroupMask target_mask,
1810 : : CGroupMask enable_mask,
1811 : : ManagerState state) {
1812 : :
1813 : : bool created;
1814 : : int r;
1815 : :
1816 [ - + ]: 48 : assert(u);
1817 : :
1818 [ - + ]: 48 : if (!UNIT_HAS_CGROUP_CONTEXT(u))
1819 : 0 : return 0;
1820 : :
1821 : : /* Figure out our cgroup path */
1822 : 48 : r = unit_pick_cgroup_path(u);
1823 [ - + ]: 48 : if (r < 0)
1824 : 0 : return r;
1825 : :
1826 : : /* First, create our own group */
1827 : 48 : r = cg_create_everywhere(u->manager->cgroup_supported, target_mask, u->cgroup_path);
1828 [ + + ]: 48 : if (r < 0)
1829 [ + - ]: 24 : return log_unit_error_errno(u, r, "Failed to create cgroup %s: %m", u->cgroup_path);
1830 : 24 : created = r;
1831 : :
1832 : : /* Start watching it */
1833 : 24 : (void) unit_watch_cgroup(u);
1834 : 24 : (void) unit_watch_cgroup_memory(u);
1835 : :
1836 : : /* Preserve enabled controllers in delegated units, adjust others. */
1837 [ + - - + : 24 : if (created || !u->cgroup_realized || !unit_cgroup_delegate(u)) {
# # ]
1838 : 24 : CGroupMask result_mask = 0;
1839 : :
1840 : : /* Enable all controllers we need */
1841 : 24 : r = cg_enable_everywhere(u->manager->cgroup_supported, enable_mask, u->cgroup_path, &result_mask);
1842 [ - + ]: 24 : if (r < 0)
1843 [ # # ]: 0 : log_unit_warning_errno(u, r, "Failed to enable/disable controllers on cgroup %s, ignoring: %m", u->cgroup_path);
1844 : :
1845 : : /* If we just turned off a controller, this might release the controller for our parent too, let's
 1846                 :            :                  * enqueue the parent for re-realization again in that case. */
1847 [ - + ]: 24 : if (UNIT_ISSET(u->slice)) {
1848 : : CGroupMask turned_off;
1849 : :
1850 [ # # ]: 0 : turned_off = (u->cgroup_realized ? u->cgroup_enabled_mask & ~result_mask : 0);
1851 [ # # ]: 0 : if (turned_off != 0) {
1852 : : Unit *parent;
1853 : :
1854 : : /* Force the parent to propagate the enable mask to the kernel again, by invalidating
1855 : : * the controller we just turned off. */
1856 : :
1857 [ # # ]: 0 : for (parent = UNIT_DEREF(u->slice); parent; parent = UNIT_DEREF(parent->slice))
1858 : 0 : unit_invalidate_cgroup(parent, turned_off);
1859 : : }
1860 : : }
1861 : :
1862 : : /* Remember what's actually enabled now */
1863 : 24 : u->cgroup_enabled_mask = result_mask;
1864 : : }
1865 : :
1866 : : /* Keep track that this is now realized */
1867 : 24 : u->cgroup_realized = true;
1868 : 24 : u->cgroup_realized_mask = target_mask;
1869 : :
1870 [ - + # # ]: 24 : if (u->type != UNIT_SLICE && !unit_cgroup_delegate(u)) {
1871 : :
1872 : : /* Then, possibly move things over, but not if
1873 : : * subgroups may contain processes, which is the case
1874 : : * for slice and delegation units. */
1875 : 0 : r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
1876 [ # # ]: 0 : if (r < 0)
 1877 [ #  # ]:          0 :                         log_unit_warning_errno(u, r, "Failed to migrate cgroup processes to %s, ignoring: %m", u->cgroup_path);
1878 : : }
1879 : :
1880 : : /* Set attributes */
1881 : 24 : cgroup_context_apply(u, target_mask, state);
1882 : 24 : cgroup_xattr_apply(u);
1883 : :
1884 : 24 : return 0;
1885 : : }
1886 : :
1887 : 0 : static int unit_attach_pid_to_cgroup_via_bus(Unit *u, pid_t pid, const char *suffix_path) {
1888 : 0 : _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
1889 : : char *pp;
1890 : : int r;
1891 : :
1892 [ # # ]: 0 : assert(u);
1893 : :
1894 [ # # ]: 0 : if (MANAGER_IS_SYSTEM(u->manager))
1895 : 0 : return -EINVAL;
1896 : :
1897 [ # # ]: 0 : if (!u->manager->system_bus)
1898 : 0 : return -EIO;
1899 : :
1900 [ # # ]: 0 : if (!u->cgroup_path)
1901 : 0 : return -EINVAL;
1902 : :
1903 : : /* Determine this unit's cgroup path relative to our cgroup root */
1904 : 0 : pp = path_startswith(u->cgroup_path, u->manager->cgroup_root);
1905 [ # # ]: 0 : if (!pp)
1906 : 0 : return -EINVAL;
1907 : :
1908 [ # # # # : 0 : pp = strjoina("/", pp, suffix_path);
# # # # #
# # # ]
1909 : 0 : path_simplify(pp, false);
1910 : :
1911 : 0 : r = sd_bus_call_method(u->manager->system_bus,
1912 : : "org.freedesktop.systemd1",
1913 : : "/org/freedesktop/systemd1",
1914 : : "org.freedesktop.systemd1.Manager",
1915 : : "AttachProcessesToUnit",
1916 : : &error, NULL,
1917 : : "ssau",
1918 : : NULL /* empty unit name means client's unit, i.e. us */, pp, 1, (uint32_t) pid);
1919 [ # # ]: 0 : if (r < 0)
1920 [ # # ]: 0 : return log_unit_debug_errno(u, r, "Failed to attach unit process " PID_FMT " via the bus: %s", pid, bus_error_message(&error, r));
1921 : :
1922 : 0 : return 0;
1923 : : }
1924 : :
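/* For illustration, the same D-Bus call can be issued manually (the cgroup subpath and PID
 * here are hypothetical); the empty unit name means "the caller's own unit", matching the
 * NULL argument above:
 *
 *   busctl call org.freedesktop.systemd1 /org/freedesktop/systemd1 \
 *       org.freedesktop.systemd1.Manager AttachProcessesToUnit \
 *       ssau "" /sub 1 4711
 */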
1925 : 0 : int unit_attach_pids_to_cgroup(Unit *u, Set *pids, const char *suffix_path) {
1926 : : CGroupMask delegated_mask;
1927 : : const char *p;
1928 : : Iterator i;
1929 : : void *pidp;
1930 : : int r, q;
1931 : :
1932 [ # # ]: 0 : assert(u);
1933 : :
1934 [ # # ]: 0 : if (!UNIT_HAS_CGROUP_CONTEXT(u))
1935 : 0 : return -EINVAL;
1936 : :
1937 [ # # ]: 0 : if (set_isempty(pids))
1938 : 0 : return 0;
1939 : :
 1940                 :            :         /* Load any custom firewall BPF programs here once, to test whether they exist and are actually loadable.
 1941                 :            :          * Fail early here, since later errors in the call chain from unit_realize_cgroup() to cgroup_context_apply() are ignored. */
1942 : 0 : r = bpf_firewall_load_custom(u);
1943 [ # # ]: 0 : if (r < 0)
1944 : 0 : return r;
1945 : :
1946 : 0 : r = unit_realize_cgroup(u);
1947 [ # # ]: 0 : if (r < 0)
1948 : 0 : return r;
1949 : :
1950 [ # # ]: 0 : if (isempty(suffix_path))
1951 : 0 : p = u->cgroup_path;
1952 : : else
1953 [ # # # # : 0 : p = prefix_roota(u->cgroup_path, suffix_path);
# # # # #
# # # # #
# # ]
1954 : :
1955 : 0 : delegated_mask = unit_get_delegate_mask(u);
1956 : :
1957 : 0 : r = 0;
1958 [ # # ]: 0 : SET_FOREACH(pidp, pids, i) {
1959 : 0 : pid_t pid = PTR_TO_PID(pidp);
1960 : : CGroupController c;
1961 : :
1962 : : /* First, attach the PID to the main cgroup hierarchy */
1963 : 0 : q = cg_attach(SYSTEMD_CGROUP_CONTROLLER, p, pid);
1964 [ # # ]: 0 : if (q < 0) {
1965 [ # # ]: 0 : log_unit_debug_errno(u, q, "Couldn't move process " PID_FMT " to requested cgroup '%s': %m", pid, p);
1966 : :
1967 [ # # # # : 0 : if (MANAGER_IS_USER(u->manager) && IN_SET(q, -EPERM, -EACCES)) {
# # ]
1968 : : int z;
1969 : :
1970 : : /* If we are in a user instance, and we can't move the process ourselves due to
1971 : : * permission problems, let's ask the system instance about it instead. Since it's more
 1972                 :            :                          * privileged it might be able to move the process across the leaves of a subtree whose
 1973                 :            :                          * top node is not owned by us. */
1974 : :
1975 : 0 : z = unit_attach_pid_to_cgroup_via_bus(u, pid, suffix_path);
1976 [ # # ]: 0 : if (z < 0)
1977 [ # # ]: 0 : log_unit_debug_errno(u, z, "Couldn't move process " PID_FMT " to requested cgroup '%s' via the system bus either: %m", pid, p);
1978 : : else
1979 : 0 : continue; /* When the bus thing worked via the bus we are fully done for this PID. */
1980 : : }
1981 : :
1982 [ # # ]: 0 : if (r >= 0)
1983 : 0 : r = q; /* Remember first error */
1984 : :
1985 : 0 : continue;
1986 : : }
1987 : :
1988 : 0 : q = cg_all_unified();
1989 [ # # ]: 0 : if (q < 0)
1990 : 0 : return q;
1991 [ # # ]: 0 : if (q > 0)
1992 : 0 : continue;
1993 : :
 1994                 :            :                 /* In the legacy hierarchy, attach the process to the requested cgroup if possible, and if not to the
1995 : : * innermost realized one */
1996 : :
1997 [ # # ]: 0 : for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
1998 : 0 : CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
1999 : : const char *realized;
2000 : :
2001 [ # # ]: 0 : if (!(u->manager->cgroup_supported & bit))
2002 : 0 : continue;
2003 : :
2004 : : /* If this controller is delegated and realized, honour the caller's request for the cgroup suffix. */
2005 [ # # ]: 0 : if (delegated_mask & u->cgroup_realized_mask & bit) {
2006 : 0 : q = cg_attach(cgroup_controller_to_string(c), p, pid);
2007 [ # # ]: 0 : if (q >= 0)
2008 : 0 : continue; /* Success! */
2009 : :
2010 [ # # ]: 0 : log_unit_debug_errno(u, q, "Failed to attach PID " PID_FMT " to requested cgroup %s in controller %s, falling back to unit's cgroup: %m",
2011 : : pid, p, cgroup_controller_to_string(c));
2012 : : }
2013 : :
 2014                 :            :                         /* So this controller is either not delegated or not realized, or something else weird happened. In
2015 : : * that case let's attach the PID at least to the closest cgroup up the tree that is
2016 : : * realized. */
2017 : 0 : realized = unit_get_realized_cgroup_path(u, bit);
2018 [ # # ]: 0 : if (!realized)
2019 : 0 : continue; /* Not even realized in the root slice? Then let's not bother */
2020 : :
2021 : 0 : q = cg_attach(cgroup_controller_to_string(c), realized, pid);
2022 [ # # ]: 0 : if (q < 0)
2023 [ # # ]: 0 : log_unit_debug_errno(u, q, "Failed to attach PID " PID_FMT " to realized cgroup %s in controller %s, ignoring: %m",
2024 : : pid, realized, cgroup_controller_to_string(c));
2025 : : }
2026 : : }
2027 : :
2028 : 0 : return r;
2029 : : }
2030 : :
2031 : 24 : static bool unit_has_mask_realized(
2032 : : Unit *u,
2033 : : CGroupMask target_mask,
2034 : : CGroupMask enable_mask) {
2035 : :
2036 [ - + ]: 24 : assert(u);
2037 : :
2038 : : /* Returns true if this unit is fully realized. We check four things:
2039 : : *
2040 : : * 1. Whether the cgroup was created at all
2041 : : * 2. Whether the cgroup was created in all the hierarchies we need it to be created in (in case of cgroup v1)
2042 : : * 3. Whether the cgroup has all the right controllers enabled (in case of cgroup v2)
2043 : : * 4. Whether the invalidation mask is currently zero
2044 : : *
2045 : : * If you wonder why we mask the target realization and enable mask with CGROUP_MASK_V1/CGROUP_MASK_V2: note
2046 : : * that there are three sets of bitmasks: CGROUP_MASK_V1 (for real cgroup v1 controllers), CGROUP_MASK_V2 (for
2047 : : * real cgroup v2 controllers) and CGROUP_MASK_BPF (for BPF-based pseudo-controllers). Now, cgroup_realized_mask
 2048                 :            :          * only matters for cgroup v1 controllers, and cgroup_enabled_mask is only used for cgroup v2, and if they
 2049                 :            :          * differ in the others, we don't really care. (After all, the cgroup_enabled_mask tracks which controllers are
 2050                 :            :          * enabled through cgroup.subtree_control, and since the BPF pseudo-controllers don't show up there, they
 2051                 :            :          * simply don't matter.) */
2052 : :
2053 : 24 : return u->cgroup_realized &&
2054 [ # # ]: 0 : ((u->cgroup_realized_mask ^ target_mask) & CGROUP_MASK_V1) == 0 &&
2055 [ - + # # ]: 24 : ((u->cgroup_enabled_mask ^ enable_mask) & CGROUP_MASK_V2) == 0 &&
2056 [ # # ]: 0 : u->cgroup_invalidated_mask == 0;
2057 : : }
2058 : :
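/* Worked example (illustrative): if cgroup_realized_mask has the v1 cpu and memory bits set
 * while target_mask requests only cpu, the XOR leaves the memory bit set within
 * CGROUP_MASK_V1, so the unit counts as not realized and will be re-realized to drop the
 * controller. Differing bits outside CGROUP_MASK_V1/CGROUP_MASK_V2 (i.e. the BPF
 * pseudo-controllers) are deliberately ignored. */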
2059 : 0 : static bool unit_has_mask_disables_realized(
2060 : : Unit *u,
2061 : : CGroupMask target_mask,
2062 : : CGroupMask enable_mask) {
2063 : :
2064 [ # # ]: 0 : assert(u);
2065 : :
2066 : : /* Returns true if all controllers which should be disabled are indeed disabled.
2067 : : *
2068 : : * Unlike unit_has_mask_realized, we don't care what was enabled, only that anything we want to remove is
2069 : : * already removed. */
2070 : :
2071 [ # # ]: 0 : return !u->cgroup_realized ||
2072 [ # # ]: 0 : (FLAGS_SET(u->cgroup_realized_mask, target_mask & CGROUP_MASK_V1) &&
2073 [ # # ]: 0 : FLAGS_SET(u->cgroup_enabled_mask, enable_mask & CGROUP_MASK_V2));
2074 : : }
2075 : :
2076 : 24 : static bool unit_has_mask_enables_realized(
2077 : : Unit *u,
2078 : : CGroupMask target_mask,
2079 : : CGroupMask enable_mask) {
2080 : :
2081 [ - + ]: 24 : assert(u);
2082 : :
2083 : : /* Returns true if all controllers which should be enabled are indeed enabled.
2084 : : *
2085 : : * Unlike unit_has_mask_realized, we don't care about the controllers that are not present, only that anything
2086 : : * we want to add is already added. */
2087 : :
2088 : 24 : return u->cgroup_realized &&
2089 [ - + # # ]: 24 : ((u->cgroup_realized_mask | target_mask) & CGROUP_MASK_V1) == (u->cgroup_realized_mask & CGROUP_MASK_V1) &&
2090 [ # # ]: 0 : ((u->cgroup_enabled_mask | enable_mask) & CGROUP_MASK_V2) == (u->cgroup_enabled_mask & CGROUP_MASK_V2);
2091 : : }
2092 : :
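/* Worked example (illustrative): with cgroup_enabled_mask containing only memory and
 * enable_mask requesting memory and pids, (enabled | enable_mask) differs from enabled, so
 * the "enables" check above fails and pids still needs turning on. The "disables" variant,
 * by contrast, only verifies that nothing which should be off remains on. */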
2093 : 537 : void unit_add_to_cgroup_realize_queue(Unit *u) {
2094 [ - + ]: 537 : assert(u);
2095 : :
2096 [ + + ]: 537 : if (u->in_cgroup_realize_queue)
2097 : 477 : return;
2098 : :
2099 [ - + + + ]: 60 : LIST_PREPEND(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
2100 : 60 : u->in_cgroup_realize_queue = true;
2101 : : }
2102 : :
2103 : 24 : static void unit_remove_from_cgroup_realize_queue(Unit *u) {
2104 [ - + ]: 24 : assert(u);
2105 : :
2106 [ + - ]: 24 : if (!u->in_cgroup_realize_queue)
2107 : 24 : return;
2108 : :
2109 [ # # # # : 0 : LIST_REMOVE(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
# # # # ]
2110 : 0 : u->in_cgroup_realize_queue = false;
2111 : : }
2112 : :
2113 : : /* Controllers can only be enabled breadth-first, from the root of the
2114 : : * hierarchy downwards to the unit in question. */
2115 : 24 : static int unit_realize_cgroup_now_enable(Unit *u, ManagerState state) {
2116 : : CGroupMask target_mask, enable_mask, new_target_mask, new_enable_mask;
2117 : : int r;
2118 : :
2119 [ - + ]: 24 : assert(u);
2120 : :
2121 : : /* First go deal with this unit's parent, or we won't be able to enable
2122 : : * any new controllers at this layer. */
2123 [ - + ]: 24 : if (UNIT_ISSET(u->slice)) {
2124 : 0 : r = unit_realize_cgroup_now_enable(UNIT_DEREF(u->slice), state);
2125 [ # # ]: 0 : if (r < 0)
2126 : 0 : return r;
2127 : : }
2128 : :
2129 : 24 : target_mask = unit_get_target_mask(u);
2130 : 24 : enable_mask = unit_get_enable_mask(u);
2131 : :
2132 : : /* We can only enable in this direction, don't try to disable anything.
2133 : : */
2134 [ - + ]: 24 : if (unit_has_mask_enables_realized(u, target_mask, enable_mask))
2135 : 0 : return 0;
2136 : :
2137 : 24 : new_target_mask = u->cgroup_realized_mask | target_mask;
2138 : 24 : new_enable_mask = u->cgroup_enabled_mask | enable_mask;
2139 : :
2140 : 24 : return unit_create_cgroup(u, new_target_mask, new_enable_mask, state);
2141 : : }
2142 : :
2143 : : /* Controllers can only be disabled depth-first, from the leaves of the
2144 : : * hierarchy upwards to the unit in question. */
2145 : 24 : static int unit_realize_cgroup_now_disable(Unit *u, ManagerState state) {
2146 : : Iterator i;
2147 : : Unit *m;
2148 : : void *v;
2149 : :
2150 [ - + ]: 24 : assert(u);
2151 : :
2152 [ + - ]: 24 : if (u->type != UNIT_SLICE)
2153 : 24 : return 0;
2154 : :
2155 [ # # ]: 0 : HASHMAP_FOREACH_KEY(v, m, u->dependencies[UNIT_BEFORE], i) {
2156 : : CGroupMask target_mask, enable_mask, new_target_mask, new_enable_mask;
2157 : : int r;
2158 : :
2159 [ # # ]: 0 : if (UNIT_DEREF(m->slice) != u)
2160 : 0 : continue;
2161 : :
2162 : : /* The cgroup for this unit might not actually be fully
 2163                 :            :                  * realized yet, in which case it isn't holding any controllers
2164 : : * open anyway. */
2165 [ # # ]: 0 : if (!m->cgroup_path)
2166 : 0 : continue;
2167 : :
2168 : : /* We must disable those below us first in order to release the
2169 : : * controller. */
2170 [ # # ]: 0 : if (m->type == UNIT_SLICE)
2171 : 0 : (void) unit_realize_cgroup_now_disable(m, state);
2172 : :
2173 : 0 : target_mask = unit_get_target_mask(m);
2174 : 0 : enable_mask = unit_get_enable_mask(m);
2175 : :
2176 : : /* We can only disable in this direction, don't try to enable
2177 : : * anything. */
2178 [ # # ]: 0 : if (unit_has_mask_disables_realized(m, target_mask, enable_mask))
2179 : 0 : continue;
2180 : :
2181 : 0 : new_target_mask = m->cgroup_realized_mask & target_mask;
2182 : 0 : new_enable_mask = m->cgroup_enabled_mask & enable_mask;
2183 : :
2184 : 0 : r = unit_create_cgroup(m, new_target_mask, new_enable_mask, state);
2185 [ # # ]: 0 : if (r < 0)
2186 : 0 : return r;
2187 : : }
2188 : :
2189 : 0 : return 0;
2190 : : }
2191 : :
2192 : : /* Check if necessary controllers and attributes for a unit are in place.
2193 : : *
2194 : : * - If so, do nothing.
2195 : : * - If not, create paths, move processes over, and set attributes.
2196 : : *
2197 : : * Controllers can only be *enabled* in a breadth-first way, and *disabled* in
2198 : : * a depth-first way. As such the process looks like this:
2199 : : *
2200 : : * Suppose we have a cgroup hierarchy which looks like this:
2201 : : *
2202 : : * root
2203 : : * / \
2204 : : * / \
2205 : : * / \
2206 : : * a b
2207 : : * / \ / \
2208 : : * / \ / \
2209 : : * c d e f
2210 : : * / \ / \ / \ / \
2211 : : * h i j k l m n o
2212 : : *
 2213                 :            :  * 1. We want to realize cgroup "d" now.
2214 : : * 2. cgroup "a" has DisableControllers=cpu in the associated unit.
2215 : : * 3. cgroup "k" just started requesting the memory controller.
2216 : : *
2217 : : * To make this work we must do the following in order:
2218 : : *
2219 : : * 1. Disable CPU controller in k, j
2220 : : * 2. Disable CPU controller in d
2221 : : * 3. Enable memory controller in root
2222 : : * 4. Enable memory controller in a
2223 : : * 5. Enable memory controller in d
2224 : : * 6. Enable memory controller in k
2225 : : *
2226 : : * Notice that we need to touch j in one direction, but not the other. We also
2227 : : * don't go beyond d when disabling -- it's up to "a" to get realized if it
2228 : : * wants to disable further. The basic rules are therefore:
2229 : : *
 2230                 :            :  * - If you're disabling something, you need to realize all of the cgroups from
2231 : : * your recursive descendants to the root. This starts from the leaves.
 2232                 :            :  * - If you're enabling something, you need to realize from the root cgroup
2233 : : * downwards, but you don't need to iterate your recursive descendants.
2234 : : *
2235 : : * Returns 0 on success and < 0 on failure. */
2236 : 24 : static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
2237 : : CGroupMask target_mask, enable_mask;
2238 : : int r;
2239 : :
2240 [ - + ]: 24 : assert(u);
2241 : :
2242 : 24 : unit_remove_from_cgroup_realize_queue(u);
2243 : :
2244 : 24 : target_mask = unit_get_target_mask(u);
2245 : 24 : enable_mask = unit_get_enable_mask(u);
2246 : :
2247 [ - + ]: 24 : if (unit_has_mask_realized(u, target_mask, enable_mask))
2248 : 0 : return 0;
2249 : :
2250 : : /* Disable controllers below us, if there are any */
2251 : 24 : r = unit_realize_cgroup_now_disable(u, state);
2252 [ - + ]: 24 : if (r < 0)
2253 : 0 : return r;
2254 : :
2255 : : /* Enable controllers above us, if there are any */
2256 [ + - ]: 24 : if (UNIT_ISSET(u->slice)) {
2257 : 24 : r = unit_realize_cgroup_now_enable(UNIT_DEREF(u->slice), state);
2258 [ - + ]: 24 : if (r < 0)
2259 : 0 : return r;
2260 : : }
2261 : :
2262 : : /* Now actually deal with the cgroup we were trying to realise and set attributes */
2263 : 24 : r = unit_create_cgroup(u, target_mask, enable_mask, state);
2264 [ + - ]: 24 : if (r < 0)
2265 : 24 : return r;
2266 : :
2267 : : /* Now, reset the invalidation mask */
2268 : 0 : u->cgroup_invalidated_mask = 0;
2269 : 0 : return 0;
2270 : : }
2271 : :
2272 : 0 : unsigned manager_dispatch_cgroup_realize_queue(Manager *m) {
2273 : : ManagerState state;
2274 : 0 : unsigned n = 0;
2275 : : Unit *i;
2276 : : int r;
2277 : :
2278 [ # # ]: 0 : assert(m);
2279 : :
2280 : 0 : state = manager_state(m);
2281 : :
2282 [ # # ]: 0 : while ((i = m->cgroup_realize_queue)) {
2283 [ # # ]: 0 : assert(i->in_cgroup_realize_queue);
2284 : :
2285 [ # # ]: 0 : if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(i))) {
2286 : : /* Maybe things changed, and the unit is not actually active anymore? */
2287 : 0 : unit_remove_from_cgroup_realize_queue(i);
2288 : 0 : continue;
2289 : : }
2290 : :
2291 : 0 : r = unit_realize_cgroup_now(i, state);
2292 [ # # ]: 0 : if (r < 0)
2293 [ # # ]: 0 : log_warning_errno(r, "Failed to realize cgroups for queued unit %s, ignoring: %m", i->id);
2294 : :
2295 : 0 : n++;
2296 : : }
2297 : :
2298 : 0 : return n;
2299 : : }
2300 : :
2301 : 24 : static void unit_add_siblings_to_cgroup_realize_queue(Unit *u) {
2302 : : Unit *slice;
2303 : :
2304 : : /* This adds the siblings of the specified unit and the
2305 : : * siblings of all parent units to the cgroup queue. (But
2306 : : * neither the specified unit itself nor the parents.) */
2307 : :
2308 [ + + ]: 48 : while ((slice = UNIT_DEREF(u->slice))) {
2309 : : Iterator i;
2310 : : Unit *m;
2311 : : void *v;
2312 : :
2313 [ + + ]: 48 : HASHMAP_FOREACH_KEY(v, m, u->dependencies[UNIT_BEFORE], i) {
2314 : : /* Skip units that have a dependency on the slice
2315 : : * but aren't actually in it. */
2316 [ + - ]: 24 : if (UNIT_DEREF(m->slice) != slice)
2317 : 24 : continue;
2318 : :
2319 : : /* No point in doing cgroup application for units
2320 : : * without active processes. */
2321 [ # # ]: 0 : if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
2322 : 0 : continue;
2323 : :
2324 : : /* If the unit doesn't need any new controllers
2325 : : * and has current ones realized, it doesn't need
2326 : : * any changes. */
2327 [ # # ]: 0 : if (unit_has_mask_realized(m,
2328 : : unit_get_target_mask(m),
2329 : : unit_get_enable_mask(m)))
2330 : 0 : continue;
2331 : :
2332 : 0 : unit_add_to_cgroup_realize_queue(m);
2333 : : }
2334 : :
2335 : 24 : u = slice;
2336 : : }
2337 : 24 : }
2338 : :
2339 : 24 : int unit_realize_cgroup(Unit *u) {
2340 [ - + ]: 24 : assert(u);
2341 : :
2342 [ - + ]: 24 : if (!UNIT_HAS_CGROUP_CONTEXT(u))
2343 : 0 : return 0;
2344 : :
2345 : : /* So, here's the deal: when realizing the cgroups for this
2346 : : * unit, we need to first create all parents, but there's more
2347 : : * actually: for the weight-based controllers we also need to
2348 : : * make sure that all our siblings (i.e. units that are in the
2349 : : * same slice as we are) have cgroups, too. Otherwise, things
2350 : : * would become very uneven as each of their processes would
2351 : : * get as much resources as all our group together. This call
 2352                 :            :          * get as many resources as our whole group together. This call
2353 : : * defer work on the siblings to the next event loop
2354 : : * iteration. */
2355 : :
2356 : : /* Add all sibling slices to the cgroup queue. */
2357 : 24 : unit_add_siblings_to_cgroup_realize_queue(u);
2358 : :
2359 : : /* And realize this one now (and apply the values) */
2360 : 24 : return unit_realize_cgroup_now(u, manager_state(u->manager));
2361 : : }
2362 : :
2363 : 8776 : void unit_release_cgroup(Unit *u) {
2364 [ - + ]: 8776 : assert(u);
2365 : :
2366 : : /* Forgets all cgroup details for this cgroup — but does *not* destroy the cgroup. This is hence OK to call
2367 : : * when we close down everything for reexecution, where we really want to leave the cgroup in place. */
2368 : :
2369 [ + + ]: 8776 : if (u->cgroup_path) {
2370 : 48 : (void) hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
2371 : 48 : u->cgroup_path = mfree(u->cgroup_path);
2372 : : }
2373 : :
2374 [ - + ]: 8776 : if (u->cgroup_control_inotify_wd >= 0) {
2375 [ # # ]: 0 : if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_control_inotify_wd) < 0)
2376 [ # # ]: 0 : log_unit_debug_errno(u, errno, "Failed to remove cgroup control inotify watch %i for %s, ignoring: %m", u->cgroup_control_inotify_wd, u->id);
2377 : :
2378 : 0 : (void) hashmap_remove(u->manager->cgroup_control_inotify_wd_unit, INT_TO_PTR(u->cgroup_control_inotify_wd));
2379 : 0 : u->cgroup_control_inotify_wd = -1;
2380 : : }
2381 : :
2382 [ - + ]: 8776 : if (u->cgroup_memory_inotify_wd >= 0) {
2383 [ # # ]: 0 : if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_memory_inotify_wd) < 0)
2384 [ # # ]: 0 : log_unit_debug_errno(u, errno, "Failed to remove cgroup memory inotify watch %i for %s, ignoring: %m", u->cgroup_memory_inotify_wd, u->id);
2385 : :
2386 : 0 : (void) hashmap_remove(u->manager->cgroup_memory_inotify_wd_unit, INT_TO_PTR(u->cgroup_memory_inotify_wd));
2387 : 0 : u->cgroup_memory_inotify_wd = -1;
2388 : : }
2389 : 8776 : }
2390 : :
2391 : 28 : void unit_prune_cgroup(Unit *u) {
2392 : : int r;
2393 : : bool is_root_slice;
2394 : :
2395 [ - + ]: 28 : assert(u);
2396 : :
2397 : : /* Removes the cgroup, if empty and possible, and stops watching it. */
2398 : :
2399 [ + - ]: 28 : if (!u->cgroup_path)
2400 : 28 : return;
2401 : :
2402 : 0 : (void) unit_get_cpu_usage(u, NULL); /* Cache the last CPU usage value before we destroy the cgroup */
2403 : :
2404 : 0 : is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE);
2405 : :
2406 : 0 : r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !is_root_slice);
2407 [ # # ]: 0 : if (r < 0)
 2408                 :            :                 /* One reason we could have failed here is that the cgroup still contains a process.
2409 : : * However, if the cgroup becomes removable at a later time, it might be removed when
2410 : : * the containing slice is stopped. So even if we failed now, this unit shouldn't assume
2411 : : * that the cgroup is still realized the next time it is started. Do not return early
2412 : : * on error, continue cleanup. */
2413 [ # # # # : 0 : log_unit_full(u, r == -EBUSY ? LOG_DEBUG : LOG_WARNING, r, "Failed to destroy cgroup %s, ignoring: %m", u->cgroup_path);
# # ]
2414 : :
2415 [ # # ]: 0 : if (is_root_slice)
2416 : 0 : return;
2417 : :
2418 : 0 : unit_release_cgroup(u);
2419 : :
2420 : 0 : u->cgroup_realized = false;
2421 : 0 : u->cgroup_realized_mask = 0;
2422 : 0 : u->cgroup_enabled_mask = 0;
2423 : :
2424 : 0 : u->bpf_device_control_installed = bpf_program_unref(u->bpf_device_control_installed);
2425 : : }
2426 : :
2427 : 0 : int unit_search_main_pid(Unit *u, pid_t *ret) {
2428 : 0 : _cleanup_fclose_ FILE *f = NULL;
2429 : 0 : pid_t pid = 0, npid;
2430 : : int r;
2431 : :
2432 [ # # ]: 0 : assert(u);
2433 [ # # ]: 0 : assert(ret);
2434 : :
2435 [ # # ]: 0 : if (!u->cgroup_path)
2436 : 0 : return -ENXIO;
2437 : :
2438 : 0 : r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f);
2439 [ # # ]: 0 : if (r < 0)
2440 : 0 : return r;
2441 : :
2442 [ # # ]: 0 : while (cg_read_pid(f, &npid) > 0) {
2443 : :
2444 [ # # ]: 0 : if (npid == pid)
2445 : 0 : continue;
2446 : :
2447 [ # # ]: 0 : if (pid_is_my_child(npid) == 0)
2448 : 0 : continue;
2449 : :
2450 [ # # ]: 0 : if (pid != 0)
2451 : : /* Dang, there's more than one daemonized PID
2452 : : in this group, so we don't know what process
2453 : : is the main process. */
2454 : :
2455 : 0 : return -ENODATA;
2456 : :
2457 : 0 : pid = npid;
2458 : : }
2459 : :
2460 : 0 : *ret = pid;
2461 : 0 : return 0;
2462 : : }
2463 : :
2464 : 0 : static int unit_watch_pids_in_path(Unit *u, const char *path) {
2465 : 0 : _cleanup_closedir_ DIR *d = NULL;
2466 : 0 : _cleanup_fclose_ FILE *f = NULL;
2467 : 0 : int ret = 0, r;
2468 : :
2469 [ # # ]: 0 : assert(u);
2470 [ # # ]: 0 : assert(path);
2471 : :
2472 : 0 : r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, path, &f);
2473 [ # # ]: 0 : if (r < 0)
2474 : 0 : ret = r;
2475 : : else {
2476 : : pid_t pid;
2477 : :
2478 [ # # ]: 0 : while ((r = cg_read_pid(f, &pid)) > 0) {
2479 : 0 : r = unit_watch_pid(u, pid, false);
2480 [ # # # # ]: 0 : if (r < 0 && ret >= 0)
2481 : 0 : ret = r;
2482 : : }
2483 : :
2484 [ # # # # ]: 0 : if (r < 0 && ret >= 0)
2485 : 0 : ret = r;
2486 : : }
2487 : :
2488 : 0 : r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d);
2489 [ # # ]: 0 : if (r < 0) {
2490 [ # # ]: 0 : if (ret >= 0)
2491 : 0 : ret = r;
2492 : : } else {
2493 : : char *fn;
2494 : :
2495 [ # # ]: 0 : while ((r = cg_read_subgroup(d, &fn)) > 0) {
2496 [ # # ]: 0 : _cleanup_free_ char *p = NULL;
2497 : :
2498 : 0 : p = path_join(empty_to_root(path), fn);
2499 : 0 : free(fn);
2500 : :
2501 [ # # ]: 0 : if (!p)
2502 : 0 : return -ENOMEM;
2503 : :
2504 : 0 : r = unit_watch_pids_in_path(u, p);
2505 [ # # # # ]: 0 : if (r < 0 && ret >= 0)
2506 : 0 : ret = r;
2507 : : }
2508 : :
2509 [ # # # # ]: 0 : if (r < 0 && ret >= 0)
2510 : 0 : ret = r;
2511 : : }
2512 : :
2513 : 0 : return ret;
2514 : : }
2515 : :
2516 : 0 : int unit_synthesize_cgroup_empty_event(Unit *u) {
2517 : : int r;
2518 : :
2519 [ # # ]: 0 : assert(u);
2520 : :
2521 : : /* Enqueue a synthetic cgroup empty event if this unit doesn't watch any PIDs anymore. This is compatibility
 2522                 :            :          * support for non-unified systems where notifications aren't reliable, and we hence need to take whatever we can
2523 : : * get as notification source as soon as we stopped having any useful PIDs to watch for. */
2524 : :
2525 [ # # ]: 0 : if (!u->cgroup_path)
2526 : 0 : return -ENOENT;
2527 : :
2528 : 0 : r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
2529 [ # # ]: 0 : if (r < 0)
2530 : 0 : return r;
2531 [ # # ]: 0 : if (r > 0) /* On unified we have reliable notifications, and don't need this */
2532 : 0 : return 0;
2533 : :
2534 [ # # ]: 0 : if (!set_isempty(u->pids))
2535 : 0 : return 0;
2536 : :
2537 : 0 : unit_add_to_cgroup_empty_queue(u);
2538 : 0 : return 0;
2539 : : }
2540 : :
2541 : 0 : int unit_watch_all_pids(Unit *u) {
2542 : : int r;
2543 : :
2544 [ # # ]: 0 : assert(u);
2545 : :
2546 : : /* Adds all PIDs from our cgroup to the set of PIDs we
 2547                 :            :          * watch. This is fallback logic for cases where we do not
2548 : : * get reliable cgroup empty notifications: we try to use
2549 : : * SIGCHLD as replacement. */
2550 : :
2551 [ # # ]: 0 : if (!u->cgroup_path)
2552 : 0 : return -ENOENT;
2553 : :
2554 : 0 : r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
2555 [ # # ]: 0 : if (r < 0)
2556 : 0 : return r;
2557 [ # # ]: 0 : if (r > 0) /* On unified we can use proper notifications */
2558 : 0 : return 0;
2559 : :
2560 : 0 : return unit_watch_pids_in_path(u, u->cgroup_path);
2561 : : }
2562 : :
2563 : 0 : static int on_cgroup_empty_event(sd_event_source *s, void *userdata) {
2564 : 0 : Manager *m = userdata;
2565 : : Unit *u;
2566 : : int r;
2567 : :
2568 [ # # ]: 0 : assert(s);
2569 [ # # ]: 0 : assert(m);
2570 : :
2571 : 0 : u = m->cgroup_empty_queue;
2572 [ # # ]: 0 : if (!u)
2573 : 0 : return 0;
2574 : :
2575 [ # # ]: 0 : assert(u->in_cgroup_empty_queue);
2576 : 0 : u->in_cgroup_empty_queue = false;
2577 [ # # # # : 0 : LIST_REMOVE(cgroup_empty_queue, m->cgroup_empty_queue, u);
# # # # ]
2578 : :
2579 [ # # ]: 0 : if (m->cgroup_empty_queue) {
2580 : : /* More stuff queued, let's make sure we remain enabled */
2581 : 0 : r = sd_event_source_set_enabled(s, SD_EVENT_ONESHOT);
2582 [ # # ]: 0 : if (r < 0)
2583 [ # # ]: 0 : log_debug_errno(r, "Failed to reenable cgroup empty event source, ignoring: %m");
2584 : : }
2585 : :
2586 : 0 : unit_add_to_gc_queue(u);
2587 : :
2588 [ # # ]: 0 : if (UNIT_VTABLE(u)->notify_cgroup_empty)
2589 : 0 : UNIT_VTABLE(u)->notify_cgroup_empty(u);
2590 : :
2591 : 0 : return 0;
2592 : : }
2593 : :
2594 : 0 : void unit_add_to_cgroup_empty_queue(Unit *u) {
2595 : : int r;
2596 : :
2597 [ # # ]: 0 : assert(u);
2598 : :
2599 : : /* Note that there are four different ways how cgroup empty events reach us:
2600 : : *
2601 : : * 1. On the unified hierarchy we get an inotify event on the cgroup
2602 : : *
2603 : : * 2. On the legacy hierarchy, when running in system mode, we get a datagram on the cgroup agent socket
2604 : : *
2605 : : * 3. On the legacy hierarchy, when running in user mode, we get a D-Bus signal on the system bus
2606 : : *
2607 : : * 4. On the legacy hierarchy, in service units we start watching all processes of the cgroup for SIGCHLD as
2608 : : * soon as we get one SIGCHLD, to deal with unreliable cgroup notifications.
2609 : : *
2610 : : * Regardless which way we got the notification, we'll verify it here, and then add it to a separate
2611 : : * queue. This queue will be dispatched at a lower priority than the SIGCHLD handler, so that we always use
2612 : : * SIGCHLD if we can get it first, and only use the cgroup empty notifications if there's no SIGCHLD pending
2613 : : * (which might happen if the cgroup doesn't contain processes that are our own child, which is typically the
2614 : : * case for scope units). */
2615 : :
2616 [ # # ]: 0 : if (u->in_cgroup_empty_queue)
2617 : 0 : return;
2618 : :
2619 : : /* Let's verify that the cgroup is really empty */
2620 [ # # ]: 0 : if (!u->cgroup_path)
2621 : 0 : return;
2622 : 0 : r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
2623 [ # # ]: 0 : if (r < 0) {
2624 [ # # ]: 0 : log_unit_debug_errno(u, r, "Failed to determine whether cgroup %s is empty: %m", u->cgroup_path);
2625 : 0 : return;
2626 : : }
2627 [ # # ]: 0 : if (r == 0)
2628 : 0 : return;
2629 : :
2630 [ # # # # ]: 0 : LIST_PREPEND(cgroup_empty_queue, u->manager->cgroup_empty_queue, u);
2631 : 0 : u->in_cgroup_empty_queue = true;
2632 : :
2633 : : /* Trigger the defer event */
2634 : 0 : r = sd_event_source_set_enabled(u->manager->cgroup_empty_event_source, SD_EVENT_ONESHOT);
2635 [ # # ]: 0 : if (r < 0)
2636 [ # # ]: 0 : log_debug_errno(r, "Failed to enable cgroup empty event source: %m");
2637 : : }
2638 : :
2639 : 0 : int unit_check_oom(Unit *u) {
2640 : 0 : _cleanup_free_ char *oom_kill = NULL;
2641 : : bool increased;
2642 : : uint64_t c;
2643 : : int r;
2644 : :
2645 [ # # ]: 0 : if (!u->cgroup_path)
2646 : 0 : return 0;
2647 : :
2648 : 0 : r = cg_get_keyed_attribute("memory", u->cgroup_path, "memory.events", STRV_MAKE("oom_kill"), &oom_kill);
2649 [ # # ]: 0 : if (r < 0)
2650 [ # # ]: 0 : return log_unit_debug_errno(u, r, "Failed to read oom_kill field of memory.events cgroup attribute: %m");
2651 : :
2652 : 0 : r = safe_atou64(oom_kill, &c);
2653 [ # # ]: 0 : if (r < 0)
2654 [ # # ]: 0 : return log_unit_debug_errno(u, r, "Failed to parse oom_kill field: %m");
2655 : :
2656 : 0 : increased = c > u->oom_kill_last;
2657 : 0 : u->oom_kill_last = c;
2658 : :
2659 [ # # ]: 0 : if (!increased)
2660 : 0 : return 0;
2661 : :
2662 : 0 : log_struct(LOG_NOTICE,
2663 : : "MESSAGE_ID=" SD_MESSAGE_UNIT_OUT_OF_MEMORY_STR,
2664 : : LOG_UNIT_ID(u),
2665 : : LOG_UNIT_INVOCATION_ID(u),
2666 : : LOG_UNIT_MESSAGE(u, "A process of this unit has been killed by the OOM killer."));
2667 : :
2668 [ # # ]: 0 : if (UNIT_VTABLE(u)->notify_cgroup_oom)
2669 : 0 : UNIT_VTABLE(u)->notify_cgroup_oom(u);
2670 : :
2671 : 0 : return 1;
2672 : : }
2673 : :
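/* Aside: a minimal sketch (not part of the original file; the cgroup path is hypothetical)
 * of reading the oom_kill counter from a unified-hierarchy "memory.events" file, which the
 * function above does via cg_get_keyed_attribute(). The file consists of "key value" lines. */
#include <inttypes.h>
#include <stdio.h>

static int read_oom_kill_example(uint64_t *ret) {
        char line[256];
        FILE *f;

        f = fopen("/sys/fs/cgroup/foo.service/memory.events", "re");
        if (!f)
                return -1;

        while (fgets(line, sizeof line, f))
                if (sscanf(line, "oom_kill %" SCNu64, ret) == 1) {
                        fclose(f);
                        return 0;
                }

        fclose(f);
        return -1;
}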
2674 : 0 : static int on_cgroup_oom_event(sd_event_source *s, void *userdata) {
2675 : 0 : Manager *m = userdata;
2676 : : Unit *u;
2677 : : int r;
2678 : :
2679 [ # # ]: 0 : assert(s);
2680 [ # # ]: 0 : assert(m);
2681 : :
2682 : 0 : u = m->cgroup_oom_queue;
2683 [ # # ]: 0 : if (!u)
2684 : 0 : return 0;
2685 : :
2686 [ # # ]: 0 : assert(u->in_cgroup_oom_queue);
2687 : 0 : u->in_cgroup_oom_queue = false;
2688 [ # # # # : 0 : LIST_REMOVE(cgroup_oom_queue, m->cgroup_oom_queue, u);
# # # # ]
2689 : :
2690 [ # # ]: 0 : if (m->cgroup_oom_queue) {
2691 : : /* More stuff queued, let's make sure we remain enabled */
2692 : 0 : r = sd_event_source_set_enabled(s, SD_EVENT_ONESHOT);
2693 [ # # ]: 0 : if (r < 0)
2694 [ # # ]: 0 : log_debug_errno(r, "Failed to reenable cgroup oom event source, ignoring: %m");
2695 : : }
2696 : :
2697 : 0 : (void) unit_check_oom(u);
2698 : 0 : return 0;
2699 : : }
2700 : :
2701 : 0 : static void unit_add_to_cgroup_oom_queue(Unit *u) {
2702 : : int r;
2703 : :
2704 [ # # ]: 0 : assert(u);
2705 : :
2706 [ # # ]: 0 : if (u->in_cgroup_oom_queue)
2707 : 0 : return;
2708 [ # # ]: 0 : if (!u->cgroup_path)
2709 : 0 : return;
2710 : :
2711 [ # # # # ]: 0 : LIST_PREPEND(cgroup_oom_queue, u->manager->cgroup_oom_queue, u);
2712 : 0 : u->in_cgroup_oom_queue = true;
2713 : :
2714 : : /* Trigger the defer event */
2715 [ # # ]: 0 : if (!u->manager->cgroup_oom_event_source) {
2716 [ # # ]: 0 : _cleanup_(sd_event_source_unrefp) sd_event_source *s = NULL;
2717 : :
2718 : 0 : r = sd_event_add_defer(u->manager->event, &s, on_cgroup_oom_event, u->manager);
2719 [ # # ]: 0 : if (r < 0) {
2720 [ # # ]: 0 : log_error_errno(r, "Failed to create cgroup oom event source: %m");
2721 : 0 : return;
2722 : : }
2723 : :
2724 : 0 : r = sd_event_source_set_priority(s, SD_EVENT_PRIORITY_NORMAL-8);
2725 [ # # ]: 0 : if (r < 0) {
2726 [ # # ]: 0 : log_error_errno(r, "Failed to set priority of cgroup oom event source: %m");
2727 : 0 : return;
2728 : : }
2729 : :
2730 : 0 : (void) sd_event_source_set_description(s, "cgroup-oom");
2731 : 0 : u->manager->cgroup_oom_event_source = TAKE_PTR(s);
2732 : : }
2733 : :
2734 : 0 : r = sd_event_source_set_enabled(u->manager->cgroup_oom_event_source, SD_EVENT_ONESHOT);
2735 [ # # ]: 0 : if (r < 0)
2736 [ # # ]: 0 : log_error_errno(r, "Failed to enable cgroup oom event source: %m");
2737 : : }
2738 : :
2739 : 0 : static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
2740 : 0 : Manager *m = userdata;
2741 : :
2742 [ # # ]: 0 : assert(s);
2743 [ # # ]: 0 : assert(fd >= 0);
2744 [ # # ]: 0 : assert(m);
2745 : :
2746 : 0 : for (;;) {
2747 : : union inotify_event_buffer buffer;
2748 : : struct inotify_event *e;
2749 : : ssize_t l;
2750 : :
2751 : 0 : l = read(fd, &buffer, sizeof(buffer));
2752 [ # # ]: 0 : if (l < 0) {
2753 [ # # # # ]: 0 : if (IN_SET(errno, EINTR, EAGAIN))
2754 : 0 : return 0;
2755 : :
2756 [ # # ]: 0 : return log_error_errno(errno, "Failed to read control group inotify events: %m");
2757 : : }
2758 : :
2759 [ # # ]: 0 : FOREACH_INOTIFY_EVENT(e, buffer, l) {
2760 : : Unit *u;
2761 : :
2762 [ # # ]: 0 : if (e->wd < 0)
2763 : : /* Queue overflow has no watch descriptor */
2764 : 0 : continue;
2765 : :
2766 [ # # ]: 0 : if (e->mask & IN_IGNORED)
2767 : : /* The watch was just removed */
2768 : 0 : continue;
2769 : :
2770 : : /* Note that inotify might deliver events for a watch even after it was removed,
2771 : : * because it was queued before the removal. Let's ignore this here safely. */
2772 : :
2773 : 0 : u = hashmap_get(m->cgroup_control_inotify_wd_unit, INT_TO_PTR(e->wd));
2774 [ # # ]: 0 : if (u)
2775 : 0 : unit_add_to_cgroup_empty_queue(u);
2776 : :
2777 : 0 : u = hashmap_get(m->cgroup_memory_inotify_wd_unit, INT_TO_PTR(e->wd));
2778 [ # # ]: 0 : if (u)
2779 : 0 : unit_add_to_cgroup_oom_queue(u);
2780 : : }
2781 : : }
2782 : : }
2783 : :
2784 : 44 : static int cg_bpf_mask_supported(CGroupMask *ret) {
2785 : 44 : CGroupMask mask = 0;
2786 : : int r;
2787 : :
2788 : : /* BPF-based firewall */
2789 : 44 : r = bpf_firewall_supported();
2790 [ - + ]: 44 : if (r > 0)
2791 : 0 : mask |= CGROUP_MASK_BPF_FIREWALL;
2792 : :
2793 : : /* BPF-based device access control */
2794 : 44 : r = bpf_devices_supported();
2795 [ - + ]: 44 : if (r > 0)
2796 : 0 : mask |= CGROUP_MASK_BPF_DEVICES;
2797 : :
2798 : 44 : *ret = mask;
2799 : 44 : return 0;
2800 : : }
2801 : :
2802 : 44 : int manager_setup_cgroup(Manager *m) {
2803 : 44 : _cleanup_free_ char *path = NULL;
2804 : : const char *scope_path;
2805 : : CGroupController c;
2806 : : int r, all_unified;
2807 : : CGroupMask mask;
2808 : : char *e;
2809 : :
2810 [ - + ]: 44 : assert(m);
2811 : :
2812 : : /* 1. Determine hierarchy */
2813 : 44 : m->cgroup_root = mfree(m->cgroup_root);
2814 : 44 : r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
2815 [ - + ]: 44 : if (r < 0)
2816 [ # # ]: 0 : return log_error_errno(r, "Cannot determine cgroup we are running in: %m");
2817 : :
2818 : : /* Chop off the init scope, if we are already located in it */
2819 : 44 : e = endswith(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
2820 : :
2821 : : /* LEGACY: Also chop off the system slice if we are in
2822 : : * it. This is to support live upgrades from older systemd
2823 : : * versions where PID 1 was moved there. Also see
2824 : : * cg_get_root_path(). */
2825 [ + - - + ]: 44 : if (!e && MANAGER_IS_SYSTEM(m)) {
2826 : 0 : e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
2827 [ # # ]: 0 : if (!e)
2828 : 0 : e = endswith(m->cgroup_root, "/system"); /* even more legacy */
2829 : : }
2830 [ - + ]: 44 : if (e)
2831 : 0 : *e = 0;
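                /* For example (illustrative): a root of "/init.scope" is chopped to "", i.e. the
                 * top of the hierarchy. */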
2832 : :
2833 : : /* And make sure to store away the root value without trailing slash, even for the root dir, so that we can
2834 : : * easily prepend it everywhere. */
2835 : 44 : delete_trailing_chars(m->cgroup_root, "/");
2836 : :
2837 : : /* 2. Show data */
2838 : 44 : r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
2839 [ - + ]: 44 : if (r < 0)
2840 [ # # ]: 0 : return log_error_errno(r, "Cannot find cgroup mount point: %m");
2841 : :
2842 : 44 : r = cg_unified_flush();
2843 [ - + ]: 44 : if (r < 0)
2844 [ # # ]: 0 : return log_error_errno(r, "Couldn't determine if we are running in the unified hierarchy: %m");
2845 : :
2846 : 44 : all_unified = cg_all_unified();
2847 [ - + ]: 44 : if (all_unified < 0)
2848 [ # # ]: 0 : return log_error_errno(all_unified, "Couldn't determine whether we are in all unified mode: %m");
2849 [ - + ]: 44 : if (all_unified > 0)
2850 [ # # ]: 0 : log_debug("Unified cgroup hierarchy is located at %s.", path);
2851 : : else {
2852 : 44 : r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
2853 [ - + ]: 44 : if (r < 0)
2854 [ # # ]: 0 : return log_error_errno(r, "Failed to determine whether systemd's own controller is in unified mode: %m");
2855 [ + - ]: 44 : if (r > 0)
2856 [ + + ]: 44 : log_debug("Unified cgroup hierarchy is located at %s. Controllers are on legacy hierarchies.", path);
2857 : : else
2858 [ # # ]: 0 : log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER_LEGACY ". File system hierarchy is at %s.", path);
2859 : : }
2860 : :
2861 : : /* 3. Allocate cgroup empty defer event source */
2862 : 44 : m->cgroup_empty_event_source = sd_event_source_unref(m->cgroup_empty_event_source);
2863 : 44 : r = sd_event_add_defer(m->event, &m->cgroup_empty_event_source, on_cgroup_empty_event, m);
2864 [ - + ]: 44 : if (r < 0)
2865 [ # # ]: 0 : return log_error_errno(r, "Failed to create cgroup empty event source: %m");
2866 : :
2867 : : /* Schedule cgroup empty checks early, but after having processed service notification messages or
 2868                 :            :          * notification, after we have collected the metadata that notifications and SIGCHLD offer. */
2869 : : * notification, and we collected the metadata the notification and SIGCHLD stuff offers first. */
2870 : 44 : r = sd_event_source_set_priority(m->cgroup_empty_event_source, SD_EVENT_PRIORITY_NORMAL-5);
2871 [ - + ]: 44 : if (r < 0)
2872 [ # # ]: 0 : return log_error_errno(r, "Failed to set priority of cgroup empty event source: %m");
2873 : :
2874 : 44 : r = sd_event_source_set_enabled(m->cgroup_empty_event_source, SD_EVENT_OFF);
2875 [ - + ]: 44 : if (r < 0)
2876 [ # # ]: 0 : return log_error_errno(r, "Failed to disable cgroup empty event source: %m");
2877 : :
2878 : 44 : (void) sd_event_source_set_description(m->cgroup_empty_event_source, "cgroup-empty");
2879 : :
2880 : : /* 4. Install notifier inotify object, or agent */
2881 [ + - ]: 44 : if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
2882 : :
2883 : : /* In the unified hierarchy we can get cgroup empty notifications via inotify. */
2884 : :
2885 : 44 : m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
2886 : 44 : safe_close(m->cgroup_inotify_fd);
2887 : :
2888 : 44 : m->cgroup_inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
2889 [ - + ]: 44 : if (m->cgroup_inotify_fd < 0)
2890 [ # # ]: 0 : return log_error_errno(errno, "Failed to create control group inotify object: %m");
2891 : :
2892 : 44 : r = sd_event_add_io(m->event, &m->cgroup_inotify_event_source, m->cgroup_inotify_fd, EPOLLIN, on_cgroup_inotify_event, m);
2893 [ - + ]: 44 : if (r < 0)
2894 [ # # ]: 0 : return log_error_errno(r, "Failed to watch control group inotify object: %m");
2895 : :
2896 : : /* Process cgroup empty notifications early. Note that when this event is dispatched it'll
2897 : : * just add the unit to a cgroup empty queue, hence let's run earlier than that. Also see
2898 : : * handling of cgroup agent notifications, for the classic cgroup hierarchy support. */
2899 : 44 : r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_NORMAL-9);
2900 [ - + ]: 44 : if (r < 0)
2901 [ # # ]: 0 : return log_error_errno(r, "Failed to set priority of inotify event source: %m");
2902 : :
2903 : 44 : (void) sd_event_source_set_description(m->cgroup_inotify_event_source, "cgroup-inotify");
2904 : :
2905 [ # # # # : 0 : } else if (MANAGER_IS_SYSTEM(m) && manager_owns_host_root_cgroup(m) && !MANAGER_IS_TEST_RUN(m)) {
# # ]
2906 : :
2907 : : /* On the legacy hierarchy we only get notifications via cgroup agents, which aren't really reliable,
2908 : : * since they do not generate events when control groups with children run empty. */
2909 : :
2910 : 0 : r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH);
2911 [ # # ]: 0 : if (r < 0)
2912 [ # # ]: 0 : log_warning_errno(r, "Failed to install release agent, ignoring: %m");
2913 [ # # ]: 0 : else if (r > 0)
2914 [ # # ]: 0 : log_debug("Installed release agent.");
2915 [ # # ]: 0 : else if (r == 0)
2916 [ # # ]: 0 : log_debug("Release agent already installed.");
2917 : : }
2918 : :
2919 : : /* 5. Make sure we are in the special "init.scope" unit in the root slice. */
2920 [ + + + - : 220 : scope_path = strjoina(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
- + - + +
+ + - ]
2921 : 44 : r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
2922 [ - + ]: 44 : if (r >= 0) {
2923 : : /* Also, move all other userspace processes remaining in the root cgroup into that scope. */
2924 : 0 : r = cg_migrate(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
2925 [ # # ]: 0 : if (r < 0)
2926 [ # # ]: 0 : log_warning_errno(r, "Couldn't move remaining userspace processes, ignoring: %m");
2927 : :
2928 : : /* 6. And pin it, so that it cannot be unmounted */
2929 : 0 : safe_close(m->pin_cgroupfs_fd);
2930 : 0 : m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
2931 [ # # ]: 0 : if (m->pin_cgroupfs_fd < 0)
2932 [ # # ]: 0 : return log_error_errno(errno, "Failed to open pin file: %m");
2933 : :
2934 [ - + ]: 44 : } else if (!MANAGER_IS_TEST_RUN(m))
2935 [ # # ]: 0 : return log_error_errno(r, "Failed to create %s control group: %m", scope_path);
2936 : :
2937 : : /* 7. Always enable hierarchical support if it exists... */
2938 [ + - - + ]: 44 : if (!all_unified && !MANAGER_IS_TEST_RUN(m))
2939 : 0 : (void) cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
2940 : :
2941 : : /* 8. Figure out which controllers are supported */
2942 : 44 : r = cg_mask_supported(&m->cgroup_supported);
2943 [ - + ]: 44 : if (r < 0)
2944 [ # # ]: 0 : return log_error_errno(r, "Failed to determine supported controllers: %m");
2945 : :
2946 : : /* 9. Figure out which bpf-based pseudo-controllers are supported */
2947 : 44 : r = cg_bpf_mask_supported(&mask);
2948 [ - + ]: 44 : if (r < 0)
2949 [ # # ]: 0 : return log_error_errno(r, "Failed to determine supported bpf-based pseudo-controllers: %m");
2950 : 44 : m->cgroup_supported |= mask;
2951 : :
2952 : : /* 10. Log which controllers are supported */
2953 [ + + ]: 440 : for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++)
2954 [ + + ]: 396 : log_debug("Controller '%s' supported: %s", cgroup_controller_to_string(c), yes_no(m->cgroup_supported & CGROUP_CONTROLLER_TO_MASK(c)));
2955 : :
2956 : 44 : return 0;
2957 : : }
2958 : :
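For illustration, a stand-alone sketch of the path lookup that cg_pid_get_path() performs above, assuming the unified cgroup hierarchy where /proc/self/cgroup contains a single "0::<path>" entry; the helper name is ours, not systemd's:

        /* Resolve our own cgroup path by parsing /proc/self/cgroup. */
        #include <stdio.h>
        #include <string.h>

        static int get_own_cgroup(char *buf, size_t size) {
                FILE *f = fopen("/proc/self/cgroup", "re");
                if (!f)
                        return -1;

                int r = -1;
                char line[4096];
                while (fgets(line, sizeof(line), f))
                        if (strncmp(line, "0::", 3) == 0) {
                                line[strcspn(line, "\n")] = 0; /* drop trailing newline */
                                snprintf(buf, size, "%s", line + 3);
                                r = 0;
                                break;
                        }

                fclose(f);
                return r;
        }

On legacy setups /proc/self/cgroup carries one line per hierarchy instead, which is why the real helper takes a controller argument.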
2959 : 56 : void manager_shutdown_cgroup(Manager *m, bool delete) {
2960 [ - + ]: 56 : assert(m);
2961 : :
2962 : : /* We can't really delete the group, since we are in it. But
2963 : : * let's trim it. */
2964 [ - + # # : 56 : if (delete && m->cgroup_root && m->test_run_flags != MANAGER_TEST_RUN_MINIMAL)
# # ]
2965 : 0 : (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);
2966 : :
2967 : 56 : m->cgroup_empty_event_source = sd_event_source_unref(m->cgroup_empty_event_source);
2968 : :
2969 : 56 : m->cgroup_control_inotify_wd_unit = hashmap_free(m->cgroup_control_inotify_wd_unit);
2970 : 56 : m->cgroup_memory_inotify_wd_unit = hashmap_free(m->cgroup_memory_inotify_wd_unit);
2971 : :
2972 : 56 : m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
2973 : 56 : m->cgroup_inotify_fd = safe_close(m->cgroup_inotify_fd);
2974 : :
2975 : 56 : m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);
2976 : :
2977 : 56 : m->cgroup_root = mfree(m->cgroup_root);
2978 : 56 : }
2979 : :
2980 : 8 : Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
2981 : : char *p;
2982 : : Unit *u;
2983 : :
2984 [ - + ]: 8 : assert(m);
2985 [ - + ]: 8 : assert(cgroup);
2986 : :
2987 : 8 : u = hashmap_get(m->cgroup_unit, cgroup);
2988 [ - + ]: 8 : if (u)
2989 : 0 : return u;
2990 : :
2991 : 8 : p = strdupa(cgroup);
2992 : 16 : for (;;) {
2993 : : char *e;
2994 : :
2995 : 24 : e = strrchr(p, '/');
2996 [ + - + + ]: 24 : if (!e || e == p)
2997 : 8 : return hashmap_get(m->cgroup_unit, SPECIAL_ROOT_SLICE);
2998 : :
2999 : 16 : *e = 0;
3000 : :
3001 : 16 : u = hashmap_get(m->cgroup_unit, p);
3002 [ - + ]: 16 : if (u)
3003 : 0 : return u;
3004 : : }
3005 : : }
3006 : :
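The loop above is a longest-prefix match: try the full cgroup path, then chop one trailing component at a time until only the root remains, which falls back to the root slice. A stand-alone sketch of just the walk, printing each prefix that would be tried as a hashmap key (function name hypothetical):

        #include <stdio.h>
        #include <string.h>

        static void walk_cgroup_prefixes(const char *cgroup) {
                char p[4096];
                snprintf(p, sizeof(p), "%s", cgroup);

                for (;;) {
                        char *e = strrchr(p, '/');
                        if (!e || e == p) {
                                puts("-.slice"); /* SPECIAL_ROOT_SLICE, the final fallback */
                                return;
                        }
                        *e = 0; /* chop the last component */
                        printf("trying %s\n", p);
                }
        }

E.g. for "/system.slice/foo.service/payload" this tries "/system.slice/foo.service", then "/system.slice", and finally gives up to the root slice.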
3007 : 8 : Unit *manager_get_unit_by_pid_cgroup(Manager *m, pid_t pid) {
3008 : 8 : _cleanup_free_ char *cgroup = NULL;
3009 : :
3010 [ - + ]: 8 : assert(m);
3011 : :
3012 [ - + ]: 8 : if (!pid_is_valid(pid))
3013 : 0 : return NULL;
3014 : :
3015 [ - + ]: 8 : if (cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup) < 0)
3016 : 0 : return NULL;
3017 : :
3018 : 8 : return manager_get_unit_by_cgroup(m, cgroup);
3019 : : }
3020 : :
3021 : 0 : Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
3022 : : Unit *u, **array;
3023 : :
3024 [ # # ]: 0 : assert(m);
3025 : :
3026 : : /* Note that a process might be owned by multiple units; we return only one here, which is good enough for most
3027 : : * cases, though not strictly correct. We prefer the one reported by cgroup membership, as that's the most
3028 : : * relevant one: children of the process will be assigned to it, too, before all else. */
3029 : :
3030 [ # # ]: 0 : if (!pid_is_valid(pid))
3031 : 0 : return NULL;
3032 : :
3033 [ # # ]: 0 : if (pid == getpid_cached())
3034 : 0 : return hashmap_get(m->units, SPECIAL_INIT_SCOPE);
3035 : :
3036 : 0 : u = manager_get_unit_by_pid_cgroup(m, pid);
3037 [ # # ]: 0 : if (u)
3038 : 0 : return u;
3039 : :
3040 : 0 : u = hashmap_get(m->watch_pids, PID_TO_PTR(pid));
3041 [ # # ]: 0 : if (u)
3042 : 0 : return u;
3043 : :
3044 : 0 : array = hashmap_get(m->watch_pids, PID_TO_PTR(-pid));
3045 [ # # ]: 0 : if (array)
3046 : 0 : return array[0];
3047 : :
3048 : 0 : return NULL;
3049 : : }
3050 : :
3051 : 0 : int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
3052 : : Unit *u;
3053 : :
3054 [ # # ]: 0 : assert(m);
3055 [ # # ]: 0 : assert(cgroup);
3056 : :
3057 : : /* Called on the legacy hierarchy whenever we get an explicit cgroup notification from the cgroup agent process
3058 : : * or from the --system instance */
3059 : :
3060 [ # # ]: 0 : log_debug("Got cgroup empty notification for: %s", cgroup);
3061 : :
3062 : 0 : u = manager_get_unit_by_cgroup(m, cgroup);
3063 [ # # ]: 0 : if (!u)
3064 : 0 : return 0;
3065 : :
3066 : 0 : unit_add_to_cgroup_empty_queue(u);
3067 : 0 : return 1;
3068 : : }
3069 : :
3070 : 0 : int unit_get_memory_current(Unit *u, uint64_t *ret) {
3071 : 0 : _cleanup_free_ char *v = NULL;
3072 : : int r;
3073 : :
3074 [ # # ]: 0 : assert(u);
3075 [ # # ]: 0 : assert(ret);
3076 : :
3077 [ # # # # ]: 0 : if (!UNIT_CGROUP_BOOL(u, memory_accounting))
3078 : 0 : return -ENODATA;
3079 : :
3080 [ # # ]: 0 : if (!u->cgroup_path)
3081 : 0 : return -ENODATA;
3082 : :
3083 : : /* The root cgroup doesn't expose this information; let's get it from /proc instead. */
3084 [ # # ]: 0 : if (unit_has_host_root_cgroup(u))
3085 : 0 : return procfs_memory_get_used(ret);
3086 : :
3087 [ # # ]: 0 : if ((u->cgroup_realized_mask & CGROUP_MASK_MEMORY) == 0)
3088 : 0 : return -ENODATA;
3089 : :
3090 : 0 : r = cg_all_unified();
3091 [ # # ]: 0 : if (r < 0)
3092 : 0 : return r;
3093 [ # # ]: 0 : if (r > 0)
3094 : 0 : r = cg_get_attribute("memory", u->cgroup_path, "memory.current", &v);
3095 : : else
3096 : 0 : r = cg_get_attribute("memory", u->cgroup_path, "memory.usage_in_bytes", &v);
3097 [ # # ]: 0 : if (r == -ENOENT)
3098 : 0 : return -ENODATA;
3099 [ # # ]: 0 : if (r < 0)
3100 : 0 : return r;
3101 : :
3102 : 0 : return safe_atou64(v, ret);
3103 : : }
3104 : :
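A rough stand-alone equivalent of the unified-hierarchy branch above, without the libshared helpers: read memory.current for a cgroup into a uint64_t. The /sys/fs/cgroup mount point is assumed here; the real code resolves it via cg_get_path().

        #include <inttypes.h>
        #include <stdio.h>

        static int read_memory_current(const char *cgroup, uint64_t *ret) {
                char path[4096];
                snprintf(path, sizeof(path), "/sys/fs/cgroup%s/memory.current", cgroup);

                FILE *f = fopen(path, "re");
                if (!f)
                        return -1; /* e.g. attribute missing: treat as no data */

                int r = fscanf(f, "%" SCNu64, ret) == 1 ? 0 : -1;
                fclose(f);
                return r;
        }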
3105 : 0 : int unit_get_tasks_current(Unit *u, uint64_t *ret) {
3106 : 0 : _cleanup_free_ char *v = NULL;
3107 : : int r;
3108 : :
3109 [ # # ]: 0 : assert(u);
3110 [ # # ]: 0 : assert(ret);
3111 : :
3112 [ # # # # ]: 0 : if (!UNIT_CGROUP_BOOL(u, tasks_accounting))
3113 : 0 : return -ENODATA;
3114 : :
3115 [ # # ]: 0 : if (!u->cgroup_path)
3116 : 0 : return -ENODATA;
3117 : :
3118 : : /* The root cgroup doesn't expose this information; let's get it from /proc instead. */
3119 [ # # ]: 0 : if (unit_has_host_root_cgroup(u))
3120 : 0 : return procfs_tasks_get_current(ret);
3121 : :
3122 [ # # ]: 0 : if ((u->cgroup_realized_mask & CGROUP_MASK_PIDS) == 0)
3123 : 0 : return -ENODATA;
3124 : :
3125 : 0 : r = cg_get_attribute("pids", u->cgroup_path, "pids.current", &v);
3126 [ # # ]: 0 : if (r == -ENOENT)
3127 : 0 : return -ENODATA;
3128 [ # # ]: 0 : if (r < 0)
3129 : 0 : return r;
3130 : :
3131 : 0 : return safe_atou64(v, ret);
3132 : : }
3133 : :
3134 : 24 : static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) {
3135 : 24 : _cleanup_free_ char *v = NULL;
3136 : : uint64_t ns;
3137 : : int r;
3138 : :
3139 [ - + ]: 24 : assert(u);
3140 [ - + ]: 24 : assert(ret);
3141 : :
3142 [ - + ]: 24 : if (!u->cgroup_path)
3143 : 0 : return -ENODATA;
3144 : :
3145 : : /* The root cgroup doesn't expose this information; let's get it from /proc instead. */
3146 [ - + ]: 24 : if (unit_has_host_root_cgroup(u))
3147 : 0 : return procfs_cpu_get_usage(ret);
3148 : :
3149 : : /* Requisite controllers for CPU accounting are not enabled */
3150 [ + - ]: 24 : if ((get_cpu_accounting_mask() & ~u->cgroup_realized_mask) != 0)
3151 : 24 : return -ENODATA;
3152 : :
3153 : 0 : r = cg_all_unified();
3154 [ # # ]: 0 : if (r < 0)
3155 : 0 : return r;
3156 [ # # ]: 0 : if (r > 0) {
3157 [ # # ]: 0 : _cleanup_free_ char *val = NULL;
3158 : : uint64_t us;
3159 : :
3160 : 0 : r = cg_get_keyed_attribute("cpu", u->cgroup_path, "cpu.stat", STRV_MAKE("usage_usec"), &val);
3161 [ # # # # ]: 0 : if (IN_SET(r, -ENOENT, -ENXIO))
3162 : 0 : return -ENODATA;
3163 [ # # ]: 0 : if (r < 0)
3164 : 0 : return r;
3165 : :
3166 : 0 : r = safe_atou64(val, &us);
3167 [ # # ]: 0 : if (r < 0)
3168 : 0 : return r;
3169 : :
3170 : 0 : ns = us * NSEC_PER_USEC;
3171 : : } else {
3172 : 0 : r = cg_get_attribute("cpuacct", u->cgroup_path, "cpuacct.usage", &v);
3173 [ # # ]: 0 : if (r == -ENOENT)
3174 : 0 : return -ENODATA;
3175 [ # # ]: 0 : if (r < 0)
3176 : 0 : return r;
3177 : :
3178 : 0 : r = safe_atou64(v, &ns);
3179 [ # # ]: 0 : if (r < 0)
3180 : 0 : return r;
3181 : : }
3182 : :
3183 : 0 : *ret = ns;
3184 : 0 : return 0;
3185 : : }
3186 : :
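The unified-hierarchy branch above reads a keyed attribute: cpu.stat is a list of "key value" lines, and only usage_usec is wanted. A stand-alone sketch of that scan, again assuming the standard /sys/fs/cgroup mount:

        #include <inttypes.h>
        #include <stdio.h>

        static int read_cpu_usage_nsec(const char *cgroup, uint64_t *ret_nsec) {
                char path[4096], line[256];
                snprintf(path, sizeof(path), "/sys/fs/cgroup%s/cpu.stat", cgroup);

                FILE *f = fopen(path, "re");
                if (!f)
                        return -1;

                int r = -1;
                while (fgets(line, sizeof(line), f)) {
                        uint64_t us;
                        if (sscanf(line, "usage_usec %" SCNu64, &us) == 1) {
                                *ret_nsec = us * 1000; /* usec -> nsec, i.e. NSEC_PER_USEC */
                                r = 0;
                                break;
                        }
                }
                fclose(f);
                return r;
        }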
3187 : 28 : int unit_get_cpu_usage(Unit *u, nsec_t *ret) {
3188 : : nsec_t ns;
3189 : : int r;
3190 : :
3191 [ - + ]: 28 : assert(u);
3192 : :
3193 : : /* Retrieve the current CPU usage counter. This will subtract the CPU counter taken when the unit was
3194 : : * started. If the cgroup has been removed already, this returns the last cached value. To cache the
3195 : : * current value, simply call this function with ret == NULL. */
3196 : :
3197 [ - + + - ]: 28 : if (!UNIT_CGROUP_BOOL(u, cpu_accounting))
3198 : 28 : return -ENODATA;
3199 : :
3200 : 0 : r = unit_get_cpu_usage_raw(u, &ns);
3201 [ # # # # ]: 0 : if (r == -ENODATA && u->cpu_usage_last != NSEC_INFINITY) {
3202 : : /* If we can't get the CPU usage anymore (because the cgroup was already removed, for example), use our
3203 : : * cached value. */
3204 : :
3205 [ # # ]: 0 : if (ret)
3206 : 0 : *ret = u->cpu_usage_last;
3207 : 0 : return 0;
3208 : : }
3209 [ # # ]: 0 : if (r < 0)
3210 : 0 : return r;
3211 : :
3212 [ # # ]: 0 : if (ns > u->cpu_usage_base)
3213 : 0 : ns -= u->cpu_usage_base;
3214 : : else
3215 : 0 : ns = 0;
3216 : :
3217 : 0 : u->cpu_usage_last = ns;
3218 [ # # ]: 0 : if (ret)
3219 : 0 : *ret = ns;
3220 : :
3221 : 0 : return 0;
3222 : : }
3223 : :
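The counter arithmetic above, in isolation: the base sample taken at unit start is subtracted with saturation, so a counter that went backwards (e.g. after the cgroup was recreated) yields zero rather than an enormous unsigned wrap-around:

        #include <stdint.h>

        static inline uint64_t saturated_sub(uint64_t current, uint64_t base) {
                return current > base ? current - base : 0;
        }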
3224 : 112 : int unit_get_ip_accounting(
3225 : : Unit *u,
3226 : : CGroupIPAccountingMetric metric,
3227 : : uint64_t *ret) {
3228 : :
3229 : : uint64_t value;
3230 : : int fd, r;
3231 : :
3232 [ - + ]: 112 : assert(u);
3233 [ - + ]: 112 : assert(metric >= 0);
3234 [ - + ]: 112 : assert(metric < _CGROUP_IP_ACCOUNTING_METRIC_MAX);
3235 [ - + ]: 112 : assert(ret);
3236 : :
3237 [ - + + - ]: 112 : if (!UNIT_CGROUP_BOOL(u, ip_accounting))
3238 : 112 : return -ENODATA;
3239 : :
3240 [ # # ]: 0 : fd = IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_INGRESS_PACKETS) ?
3241 [ # # ]: 0 : u->ip_accounting_ingress_map_fd :
3242 : : u->ip_accounting_egress_map_fd;
3243 [ # # ]: 0 : if (fd < 0)
3244 : 0 : return -ENODATA;
3245 : :
3246 [ # # # # ]: 0 : if (IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_EGRESS_BYTES))
3247 : 0 : r = bpf_firewall_read_accounting(fd, &value, NULL);
3248 : : else
3249 : 0 : r = bpf_firewall_read_accounting(fd, NULL, &value);
3250 [ # # ]: 0 : if (r < 0)
3251 : 0 : return r;
3252 : :
3253 : : /* Add in additional metrics from a previous runtime. Note that when reexecing/reloading the daemon we compile
3254 : : * all BPF programs and maps anew, but serialize the old counters. When deserializing we store them in the
3255 : : * ip_accounting_extra[] field, and add them in here transparently. */
3256 : :
3257 : 0 : *ret = value + u->ip_accounting_extra[metric];
3258 : :
3259 : 0 : return r;
3260 : : }
3261 : :
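The carry-over through ip_accounting_extra[] can be pictured with a small stand-in (types and names ours, not systemd's): live counters come from BPF maps that are recreated on daemon reexec, so the previous totals are folded into an "extra" field that keeps the reported value monotonic across reloads.

        #include <stdint.h>

        typedef struct IPCounter {
                uint64_t live;  /* current reading from the (recreated) BPF map */
                uint64_t extra; /* totals deserialized from the previous runtime */
        } IPCounter;

        static uint64_t ip_counter_total(const IPCounter *c) {
                return c->live + c->extra;
        }

        /* On reexec: extra += live, the map is recreated and live restarts
         * at 0, so ip_counter_total() never appears to go backwards. */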
3262 : 24 : static int unit_get_io_accounting_raw(Unit *u, uint64_t ret[static _CGROUP_IO_ACCOUNTING_METRIC_MAX]) {
3263 : : static const char *const field_names[_CGROUP_IO_ACCOUNTING_METRIC_MAX] = {
3264 : : [CGROUP_IO_READ_BYTES] = "rbytes=",
3265 : : [CGROUP_IO_WRITE_BYTES] = "wbytes=",
3266 : : [CGROUP_IO_READ_OPERATIONS] = "rios=",
3267 : : [CGROUP_IO_WRITE_OPERATIONS] = "wios=",
3268 : : };
3269 : 24 : uint64_t acc[_CGROUP_IO_ACCOUNTING_METRIC_MAX] = {};
3270 : 24 : _cleanup_free_ char *path = NULL;
3271 : 24 : _cleanup_fclose_ FILE *f = NULL;
3272 : : int r;
3273 : :
3274 [ - + ]: 24 : assert(u);
3275 : :
3276 [ - + ]: 24 : if (!u->cgroup_path)
3277 : 0 : return -ENODATA;
3278 : :
3279 [ - + ]: 24 : if (unit_has_host_root_cgroup(u))
3280 : 0 : return -ENODATA; /* TODO: return useful data for the top-level cgroup */
3281 : :
3282 : 24 : r = cg_all_unified();
3283 [ - + ]: 24 : if (r < 0)
3284 : 0 : return r;
3285 [ + - ]: 24 : if (r == 0) /* TODO: support cgroupv1 */
3286 : 24 : return -ENODATA;
3287 : :
3288 [ # # ]: 0 : if (!FLAGS_SET(u->cgroup_realized_mask, CGROUP_MASK_IO))
3289 : 0 : return -ENODATA;
3290 : :
3291 : 0 : r = cg_get_path("io", u->cgroup_path, "io.stat", &path);
3292 [ # # ]: 0 : if (r < 0)
3293 : 0 : return r;
3294 : :
3295 : 0 : f = fopen(path, "re");
3296 [ # # ]: 0 : if (!f)
3297 : 0 : return -errno;
3298 : :
3299 : 0 : for (;;) {
3300 [ # # # ]: 0 : _cleanup_free_ char *line = NULL;
3301 : : const char *p;
3302 : :
3303 : 0 : r = read_line(f, LONG_LINE_MAX, &line);
3304 [ # # ]: 0 : if (r < 0)
3305 : 0 : return r;
3306 [ # # ]: 0 : if (r == 0)
3307 : 0 : break;
3308 : :
3309 : 0 : p = line;
3310 : 0 : p += strcspn(p, WHITESPACE); /* Skip over device major/minor */
3311 : 0 : p += strspn(p, WHITESPACE); /* Skip over following whitespace */
3312 : :
3313 : 0 : for (;;) {
3314 [ # # # ]: 0 : _cleanup_free_ char *word = NULL;
3315 : :
3316 : 0 : r = extract_first_word(&p, &word, NULL, EXTRACT_RETAIN_ESCAPE);
3317 [ # # ]: 0 : if (r < 0)
3318 : 0 : return r;
3319 [ # # ]: 0 : if (r == 0)
3320 : 0 : break;
3321 : :
3322 [ # # ]: 0 : for (CGroupIOAccountingMetric i = 0; i < _CGROUP_IO_ACCOUNTING_METRIC_MAX; i++) {
3323 : : const char *x;
3324 : :
3325 : 0 : x = startswith(word, field_names[i]);
3326 [ # # ]: 0 : if (x) {
3327 : : uint64_t w;
3328 : :
3329 : 0 : r = safe_atou64(x, &w);
3330 [ # # ]: 0 : if (r < 0)
3331 : 0 : return r;
3332 : :
3333 : : /* Sum up the stats of all devices */
3334 : 0 : acc[i] += w;
3335 : 0 : break;
3336 : : }
3337 : : }
3338 : : }
3339 : : }
3340 : :
3341 : 0 : memcpy(ret, acc, sizeof(acc));
3342 : 0 : return 0;
3343 : : }
3344 : :
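io.stat lines look roughly like "8:0 rbytes=1459200 wbytes=314773504 rios=192 wios=1021 dbytes=0 dios=0", one per device. A stand-alone sketch of parsing a single such line the way the loop above does, summing the four counters of interest (function name ours):

        #include <inttypes.h>
        #include <stdio.h>
        #include <string.h>

        static void parse_io_stat_line(const char *line, uint64_t acc[4]) {
                static const char *const names[4] = { "rbytes=", "wbytes=", "rios=", "wios=" };

                const char *p = line + strcspn(line, " \t"); /* skip device major:minor */
                char word[64];
                int n;

                while (sscanf(p, "%63s%n", word, &n) == 1) {
                        p += n;
                        for (size_t i = 0; i < 4; i++)
                                if (strncmp(word, names[i], strlen(names[i])) == 0) {
                                        uint64_t w;
                                        if (sscanf(word + strlen(names[i]), "%" SCNu64, &w) == 1)
                                                acc[i] += w; /* sum across all devices */
                                }
                }
        }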
3345 : 112 : int unit_get_io_accounting(
3346 : : Unit *u,
3347 : : CGroupIOAccountingMetric metric,
3348 : : bool allow_cache,
3349 : : uint64_t *ret) {
3350 : :
3351 : : uint64_t raw[_CGROUP_IO_ACCOUNTING_METRIC_MAX];
3352 : : int r;
3353 : :
3354 : : /* Retrieve an IO accounting parameter. This will subtract the counter taken when the unit was started. */
3355 : :
3356 [ - + + - ]: 112 : if (!UNIT_CGROUP_BOOL(u, io_accounting))
3357 : 112 : return -ENODATA;
3358 : :
3359 [ # # # # ]: 0 : if (allow_cache && u->io_accounting_last[metric] != UINT64_MAX)
3360 : 0 : goto done;
3361 : :
3362 : 0 : r = unit_get_io_accounting_raw(u, raw);
3363 [ # # # # ]: 0 : if (r == -ENODATA && u->io_accounting_last[metric] != UINT64_MAX)
3364 : 0 : goto done;
3365 [ # # ]: 0 : if (r < 0)
3366 : 0 : return r;
3367 : :
3368 [ # # ]: 0 : for (CGroupIOAccountingMetric i = 0; i < _CGROUP_IO_ACCOUNTING_METRIC_MAX; i++) {
3369 : : /* Saturated subtraction */
3370 [ # # ]: 0 : if (raw[i] > u->io_accounting_base[i])
3371 : 0 : u->io_accounting_last[i] = raw[i] - u->io_accounting_base[i];
3372 : : else
3373 : 0 : u->io_accounting_last[i] = 0;
3374 : : }
3375 : :
3376 : 0 : done:
3377 [ # # ]: 0 : if (ret)
3378 : 0 : *ret = u->io_accounting_last[metric];
3379 : :
3380 : 0 : return 0;
3381 : : }
3382 : :
3383 : 24 : int unit_reset_cpu_accounting(Unit *u) {
3384 : : int r;
3385 : :
3386 [ - + ]: 24 : assert(u);
3387 : :
3388 : 24 : u->cpu_usage_last = NSEC_INFINITY;
3389 : :
3390 : 24 : r = unit_get_cpu_usage_raw(u, &u->cpu_usage_base);
3391 [ + - ]: 24 : if (r < 0) {
3392 : 24 : u->cpu_usage_base = 0;
3393 : 24 : return r;
3394 : : }
3395 : :
3396 : 0 : return 0;
3397 : : }
3398 : :
3399 : 24 : int unit_reset_ip_accounting(Unit *u) {
3400 : 24 : int r = 0, q = 0;
3401 : :
3402 [ - + ]: 24 : assert(u);
3403 : :
3404 [ - + ]: 24 : if (u->ip_accounting_ingress_map_fd >= 0)
3405 : 0 : r = bpf_firewall_reset_accounting(u->ip_accounting_ingress_map_fd);
3406 : :
3407 [ - + ]: 24 : if (u->ip_accounting_egress_map_fd >= 0)
3408 : 0 : q = bpf_firewall_reset_accounting(u->ip_accounting_egress_map_fd);
3409 : :
3410 [ + - ]: 24 : zero(u->ip_accounting_extra);
3411 : :
3412 [ - + ]: 24 : return r < 0 ? r : q;
3413 : : }
3414 : :
3415 : 24 : int unit_reset_io_accounting(Unit *u) {
3416 : : int r;
3417 : :
3418 [ - + ]: 24 : assert(u);
3419 : :
3420 [ + + ]: 120 : for (CGroupIOAccountingMetric i = 0; i < _CGROUP_IO_ACCOUNTING_METRIC_MAX; i++)
3421 : 96 : u->io_accounting_last[i] = UINT64_MAX;
3422 : :
3423 : 24 : r = unit_get_io_accounting_raw(u, u->io_accounting_base);
3424 [ + - ]: 24 : if (r < 0) {
3425 [ + - ]: 24 : zero(u->io_accounting_base);
3426 : 24 : return r;
3427 : : }
3428 : :
3429 : 0 : return 0;
3430 : : }
3431 : :
3432 : 24 : int unit_reset_accounting(Unit *u) {
3433 : : int r, q, v;
3434 : :
3435 [ - + ]: 24 : assert(u);
3436 : :
3437 : 24 : r = unit_reset_cpu_accounting(u);
3438 : 24 : q = unit_reset_io_accounting(u);
3439 : 24 : v = unit_reset_ip_accounting(u);
3440 : :
3441 [ - + # # ]: 24 : return r < 0 ? r : q < 0 ? q : v;
3442 : : }
3443 : :
3444 : 0 : void unit_invalidate_cgroup(Unit *u, CGroupMask m) {
3445 [ # # ]: 0 : assert(u);
3446 : :
3447 [ # # ]: 0 : if (!UNIT_HAS_CGROUP_CONTEXT(u))
3448 : 0 : return;
3449 : :
3450 [ # # ]: 0 : if (m == 0)
3451 : 0 : return;
3452 : :
3453 : : /* always invalidate compat pairs together */
3454 [ # # ]: 0 : if (m & (CGROUP_MASK_IO | CGROUP_MASK_BLKIO))
3455 : 0 : m |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO;
3456 : :
3457 [ # # ]: 0 : if (m & (CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT))
3458 : 0 : m |= CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT;
3459 : :
3460 [ # # ]: 0 : if (FLAGS_SET(u->cgroup_invalidated_mask, m)) /* NOP? */
3461 : 0 : return;
3462 : :
3463 : 0 : u->cgroup_invalidated_mask |= m;
3464 : 0 : unit_add_to_cgroup_realize_queue(u);
3465 : : }
3466 : :
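The pair-widening above ensures that v1/v2 compat siblings are always invalidated together. In isolation, with illustrative mask values (the real CGROUP_MASK_* constants come from cgroup-util.h):

        #include <stdint.h>

        enum {
                MASK_CPU     = 1 << 0,
                MASK_CPUACCT = 1 << 1, /* v1 counterpart of cpu */
                MASK_IO      = 1 << 2,
                MASK_BLKIO   = 1 << 3, /* v1 counterpart of io */
        };

        static uint32_t widen_compat_pairs(uint32_t m) {
                if (m & (MASK_IO | MASK_BLKIO))
                        m |= MASK_IO | MASK_BLKIO;
                if (m & (MASK_CPU | MASK_CPUACCT))
                        m |= MASK_CPU | MASK_CPUACCT;
                return m;
        }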
3467 : 0 : void unit_invalidate_cgroup_bpf(Unit *u) {
3468 [ # # ]: 0 : assert(u);
3469 : :
3470 [ # # ]: 0 : if (!UNIT_HAS_CGROUP_CONTEXT(u))
3471 : 0 : return;
3472 : :
3473 [ # # ]: 0 : if (u->cgroup_invalidated_mask & CGROUP_MASK_BPF_FIREWALL) /* NOP? */
3474 : 0 : return;
3475 : :
3476 : 0 : u->cgroup_invalidated_mask |= CGROUP_MASK_BPF_FIREWALL;
3477 : 0 : unit_add_to_cgroup_realize_queue(u);
3478 : :
3479 : : /* If we are a slice unit, we also need to compile a new BPF program for all our children, as the IP access
3480 : : * lists of our children include our own. */
3481 [ # # ]: 0 : if (u->type == UNIT_SLICE) {
3482 : : Unit *member;
3483 : : Iterator i;
3484 : : void *v;
3485 : :
3486 [ # # ]: 0 : HASHMAP_FOREACH_KEY(v, member, u->dependencies[UNIT_BEFORE], i) {
3487 [ # # ]: 0 : if (UNIT_DEREF(member->slice) == u)
3488 : 0 : unit_invalidate_cgroup_bpf(member);
3489 : : }
3490 : : }
3491 : : }
3492 : :
3493 : 7148 : bool unit_cgroup_delegate(Unit *u) {
3494 : : CGroupContext *c;
3495 : :
3496 [ - + ]: 7148 : assert(u);
3497 : :
3498 [ + + ]: 7148 : if (!UNIT_VTABLE(u)->can_delegate)
3499 : 6344 : return false;
3500 : :
3501 : 804 : c = unit_get_cgroup_context(u);
3502 [ - + ]: 804 : if (!c)
3503 : 0 : return false;
3504 : :
3505 : 804 : return c->delegate;
3506 : : }
3507 : :
3508 : 4 : void manager_invalidate_startup_units(Manager *m) {
3509 : : Iterator i;
3510 : : Unit *u;
3511 : :
3512 [ - + ]: 4 : assert(m);
3513 : :
3514 [ - + ]: 4 : SET_FOREACH(u, m->startup_units, i)
3515 : 0 : unit_invalidate_cgroup(u, CGROUP_MASK_CPU|CGROUP_MASK_IO|CGROUP_MASK_BLKIO);
3516 : 4 : }
3517 : :
3518 : 448 : static int unit_get_nice(Unit *u) {
3519 : : ExecContext *ec;
3520 : :
3521 : 448 : ec = unit_get_exec_context(u);
3522 [ + + ]: 448 : return ec ? ec->nice : 0;
3523 : : }
3524 : :
3525 : 448 : static uint64_t unit_get_cpu_weight(Unit *u) {
3526 : 448 : ManagerState state = manager_state(u->manager);
3527 : : CGroupContext *cc;
3528 : :
3529 : 448 : cc = unit_get_cgroup_context(u);
3530 [ + + ]: 448 : return cc ? cgroup_context_cpu_weight(cc, state) : CGROUP_WEIGHT_DEFAULT;
3531 : : }
3532 : :
3533 : 505 : int compare_job_priority(const void *a, const void *b) {
3534 : 505 : const Job *x = a, *y = b;
3535 : : int nice_x, nice_y;
3536 : : uint64_t weight_x, weight_y;
3537 : : int ret;
3538 : :
3539 [ + + + + ]: 505 : if ((ret = CMP(x->unit->type, y->unit->type)) != 0)
3540 : 281 : return -ret;
3541 : :
3542 : 224 : weight_x = unit_get_cpu_weight(x->unit);
3543 : 224 : weight_y = unit_get_cpu_weight(y->unit);
3544 : :
3545 [ + - - + ]: 224 : if ((ret = CMP(weight_x, weight_y)) != 0)
3546 : 0 : return -ret;
3547 : :
3548 : 224 : nice_x = unit_get_nice(x->unit);
3549 : 224 : nice_y = unit_get_nice(y->unit);
3550 : :
3551 [ + - - + ]: 224 : if ((ret = CMP(nice_x, nice_y)) != 0)
3552 : 0 : return ret;
3553 : :
3554 : 224 : return strcmp(x->unit->id, y->unit->id);
3555 : : }
3556 : :
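Note the comparator orders jobs by unit type and CPU weight descending (the negated CMP() results), then by nice level ascending, with the unit name as a stable tie-breaker. A minimal stand-in for the CMP() helper (the real one lives in macro.h) is a three-way comparison, which avoids the overflow pitfalls of subtraction-based comparators:

        #define CMP(a, b) ((a) < (b) ? -1 : (a) > (b) ? 1 : 0)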
3557 : : static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
3558 : : [CGROUP_AUTO] = "auto",
3559 : : [CGROUP_CLOSED] = "closed",
3560 : : [CGROUP_STRICT] = "strict",
3561 : : };
3562 : :
3563 [ + + + + ]: 768 : DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);
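Conceptually, DEFINE_STRING_TABLE_LOOKUP() generates both directions of the mapping from the table above; a simplified hand-written expansion (the real macro goes through the shared string-table helpers) would look roughly like:

        #include <string.h>

        const char *cgroup_device_policy_to_string(CGroupDevicePolicy p) {
                if (p < 0 || p >= _CGROUP_DEVICE_POLICY_MAX)
                        return NULL;
                return cgroup_device_policy_table[p];
        }

        CGroupDevicePolicy cgroup_device_policy_from_string(const char *s) {
                if (!s)
                        return _CGROUP_DEVICE_POLICY_INVALID;
                for (CGroupDevicePolicy p = 0; p < _CGROUP_DEVICE_POLICY_MAX; p++)
                        if (cgroup_device_policy_table[p] && strcmp(cgroup_device_policy_table[p], s) == 0)
                                return p;
                return _CGROUP_DEVICE_POLICY_INVALID;
        }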