LCOV - code coverage report
Current view: top level - core - cgroup.c (source / functions)
Test: main_coverage.info
Date: 2019-08-22 15:41:25

                    Hit      Total    Coverage
Lines:              512       1704      30.0 %
Functions:           64        120      53.3 %

          Line data    Source code
       1             : /* SPDX-License-Identifier: LGPL-2.1+ */
       2             : 
       3             : #include <fcntl.h>
       4             : #include <fnmatch.h>
       5             : 
       6             : #include "sd-messages.h"
       7             : 
       8             : #include "alloc-util.h"
       9             : #include "blockdev-util.h"
      10             : #include "bpf-devices.h"
      11             : #include "bpf-firewall.h"
      12             : #include "btrfs-util.h"
      13             : #include "bus-error.h"
      14             : #include "cgroup-util.h"
      15             : #include "cgroup.h"
      16             : #include "fd-util.h"
      17             : #include "fileio.h"
      18             : #include "fs-util.h"
      19             : #include "nulstr-util.h"
      20             : #include "parse-util.h"
      21             : #include "path-util.h"
      22             : #include "process-util.h"
      23             : #include "procfs-util.h"
      24             : #include "special.h"
      25             : #include "stat-util.h"
      26             : #include "stdio-util.h"
      27             : #include "string-table.h"
      28             : #include "string-util.h"
      29             : #include "virt.h"
      30             : 
      31             : #define CGROUP_CPU_QUOTA_DEFAULT_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)
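                      : /* i.e. 100ms, matching the kernel's default CFS period (cpu.cfs_period_us). */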
      32             : 
       33             : /* Returns the log level to use when cgroup attribute writes fail. When an attribute is missing, or when we have
       34             :  * access problems, we downgrade to LOG_DEBUG. This is supposed to be nice to container managers and kernels which
       35             :  * want to mask out specific attributes from us. */
      36             : #define LOG_LEVEL_CGROUP_WRITE(r) (IN_SET(abs(r), ENOENT, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING)
      37             : 
      38          29 : bool manager_owns_host_root_cgroup(Manager *m) {
      39          29 :         assert(m);
      40             : 
      41             :         /* Returns true if we are managing the root cgroup. Note that it isn't sufficient to just check whether the
       42             :  * group root path equals "/" since that will also be the case if CLONE_NEWCGROUP is in the mix. Since there
       43             :  * appears to be no nice way to detect whether we are in a CLONE_NEWCGROUP namespace, we instead just check if
      44             :          * we run in any kind of container virtualization. */
      45             : 
      46          29 :         if (MANAGER_IS_USER(m))
      47          29 :                 return false;
      48             : 
      49           0 :         if (detect_container() > 0)
      50           0 :                 return false;
      51             : 
      52           0 :         return empty_or_root(m->cgroup_root);
      53             : }
      54             : 
      55          18 : bool unit_has_host_root_cgroup(Unit *u) {
      56          18 :         assert(u);
      57             : 
      58             :         /* Returns whether this unit manages the root cgroup. This will return true if this unit is the root slice and
      59             :          * the manager manages the root cgroup. */
      60             : 
      61          18 :         if (!manager_owns_host_root_cgroup(u->manager))
      62          18 :                 return false;
      63             : 
      64           0 :         return unit_has_name(u, SPECIAL_ROOT_SLICE);
      65             : }
      66             : 
      67           0 : static int set_attribute_and_warn(Unit *u, const char *controller, const char *attribute, const char *value) {
      68             :         int r;
      69             : 
      70           0 :         r = cg_set_attribute(controller, u->cgroup_path, attribute, value);
      71           0 :         if (r < 0)
      72           0 :                 log_unit_full(u, LOG_LEVEL_CGROUP_WRITE(r), r, "Failed to set '%s' attribute on '%s' to '%.*s': %m",
      73             :                               strna(attribute), isempty(u->cgroup_path) ? "/" : u->cgroup_path, (int) strcspn(value, NEWLINE), value);
      74             : 
      75           0 :         return r;
      76             : }
      77             : 
      78           0 : static void cgroup_compat_warn(void) {
      79             :         static bool cgroup_compat_warned = false;
      80             : 
      81           0 :         if (cgroup_compat_warned)
      82           0 :                 return;
      83             : 
      84           0 :         log_warning("cgroup compatibility translation between legacy and unified hierarchy settings activated. "
      85             :                     "See cgroup-compat debug messages for details.");
      86             : 
      87           0 :         cgroup_compat_warned = true;
      88             : }
      89             : 
      90             : #define log_cgroup_compat(unit, fmt, ...) do {                                  \
      91             :                 cgroup_compat_warn();                                           \
      92             :                 log_unit_debug(unit, "cgroup-compat: " fmt, ##__VA_ARGS__);     \
      93             :         } while (false)
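                      : /* Editor's note: illustrative use of the macro above (values hypothetical):
                      :  *
                      :  *     log_cgroup_compat(u, "Applying MemoryLimit=%" PRIu64 " as MemoryMax=", c->memory_limit);
                      :  *
                      :  * The first such call also emits the one-time warning from cgroup_compat_warn(). */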
      94             : 
      95         589 : void cgroup_context_init(CGroupContext *c) {
      96         589 :         assert(c);
      97             : 
      98             :         /* Initialize everything to the kernel defaults. */
      99             : 
     100         589 :         *c = (CGroupContext) {
     101             :                 .cpu_weight = CGROUP_WEIGHT_INVALID,
     102             :                 .startup_cpu_weight = CGROUP_WEIGHT_INVALID,
     103             :                 .cpu_quota_per_sec_usec = USEC_INFINITY,
     104             :                 .cpu_quota_period_usec = USEC_INFINITY,
     105             : 
     106             :                 .cpu_shares = CGROUP_CPU_SHARES_INVALID,
     107             :                 .startup_cpu_shares = CGROUP_CPU_SHARES_INVALID,
     108             : 
     109             :                 .memory_high = CGROUP_LIMIT_MAX,
     110             :                 .memory_max = CGROUP_LIMIT_MAX,
     111             :                 .memory_swap_max = CGROUP_LIMIT_MAX,
     112             : 
     113             :                 .memory_limit = CGROUP_LIMIT_MAX,
     114             : 
     115             :                 .io_weight = CGROUP_WEIGHT_INVALID,
     116             :                 .startup_io_weight = CGROUP_WEIGHT_INVALID,
     117             : 
     118             :                 .blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID,
     119             :                 .startup_blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID,
     120             : 
     121             :                 .tasks_max = CGROUP_LIMIT_MAX,
     122             :         };
     123         589 : }
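                      : /* Editor's note: the compound-literal assignment above relies on C99 semantics: every field
                      :  * not named explicitly is zero-initialized, so all booleans default to false and all pointers
                      :  * to NULL. A minimal standalone illustration of the idiom (hypothetical types):
                      :  *
                      :  *     struct point { int x, y; };
                      :  *     struct point p = { .x = 1, .y = 2 };
                      :  *     p = (struct point) { .x = 5 };   // p.y is reset to 0
                      :  */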
     124             : 
     125           0 : void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
     126           0 :         assert(c);
     127           0 :         assert(a);
     128             : 
     129           0 :         LIST_REMOVE(device_allow, c->device_allow, a);
     130           0 :         free(a->path);
     131           0 :         free(a);
     132           0 : }
     133             : 
     134           0 : void cgroup_context_free_io_device_weight(CGroupContext *c, CGroupIODeviceWeight *w) {
     135           0 :         assert(c);
     136           0 :         assert(w);
     137             : 
     138           0 :         LIST_REMOVE(device_weights, c->io_device_weights, w);
     139           0 :         free(w->path);
     140           0 :         free(w);
     141           0 : }
     142             : 
     143           0 : void cgroup_context_free_io_device_latency(CGroupContext *c, CGroupIODeviceLatency *l) {
     144           0 :         assert(c);
     145           0 :         assert(l);
     146             : 
     147           0 :         LIST_REMOVE(device_latencies, c->io_device_latencies, l);
     148           0 :         free(l->path);
     149           0 :         free(l);
     150           0 : }
     151             : 
     152           0 : void cgroup_context_free_io_device_limit(CGroupContext *c, CGroupIODeviceLimit *l) {
     153           0 :         assert(c);
     154           0 :         assert(l);
     155             : 
     156           0 :         LIST_REMOVE(device_limits, c->io_device_limits, l);
     157           0 :         free(l->path);
     158           0 :         free(l);
     159           0 : }
     160             : 
     161           0 : void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
     162           0 :         assert(c);
     163           0 :         assert(w);
     164             : 
     165           0 :         LIST_REMOVE(device_weights, c->blockio_device_weights, w);
     166           0 :         free(w->path);
     167           0 :         free(w);
     168           0 : }
     169             : 
     170           0 : void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
     171           0 :         assert(c);
     172           0 :         assert(b);
     173             : 
     174           0 :         LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
     175           0 :         free(b->path);
     176           0 :         free(b);
     177           0 : }
     178             : 
     179         589 : void cgroup_context_done(CGroupContext *c) {
     180         589 :         assert(c);
     181             : 
     182         589 :         while (c->io_device_weights)
     183           0 :                 cgroup_context_free_io_device_weight(c, c->io_device_weights);
     184             : 
     185         589 :         while (c->io_device_latencies)
     186           0 :                 cgroup_context_free_io_device_latency(c, c->io_device_latencies);
     187             : 
     188         589 :         while (c->io_device_limits)
     189           0 :                 cgroup_context_free_io_device_limit(c, c->io_device_limits);
     190             : 
     191         589 :         while (c->blockio_device_weights)
     192           0 :                 cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);
     193             : 
     194         589 :         while (c->blockio_device_bandwidths)
     195           0 :                 cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);
     196             : 
     197         589 :         while (c->device_allow)
     198           0 :                 cgroup_context_free_device_allow(c, c->device_allow);
     199             : 
     200         589 :         c->ip_address_allow = ip_address_access_free_all(c->ip_address_allow);
     201         589 :         c->ip_address_deny = ip_address_access_free_all(c->ip_address_deny);
     202             : 
     203         589 :         c->ip_filters_ingress = strv_free(c->ip_filters_ingress);
     204         589 :         c->ip_filters_egress = strv_free(c->ip_filters_egress);
     205         589 : }
     206             : 
     207         182 : void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
     208         182 :         _cleanup_free_ char *disable_controllers_str = NULL;
     209             :         CGroupIODeviceLimit *il;
     210             :         CGroupIODeviceWeight *iw;
     211             :         CGroupIODeviceLatency *l;
     212             :         CGroupBlockIODeviceBandwidth *b;
     213             :         CGroupBlockIODeviceWeight *w;
     214             :         CGroupDeviceAllow *a;
     215             :         IPAddressAccessItem *iaai;
     216             :         char **path;
     217             :         char u[FORMAT_TIMESPAN_MAX];
     218             :         char v[FORMAT_TIMESPAN_MAX];
     219             : 
     220         182 :         assert(c);
     221         182 :         assert(f);
     222             : 
     223         182 :         prefix = strempty(prefix);
     224             : 
     225         182 :         (void) cg_mask_to_string(c->disable_controllers, &disable_controllers_str);
     226             : 
     227         364 :         fprintf(f,
     228             :                 "%sCPUAccounting=%s\n"
     229             :                 "%sIOAccounting=%s\n"
     230             :                 "%sBlockIOAccounting=%s\n"
     231             :                 "%sMemoryAccounting=%s\n"
     232             :                 "%sTasksAccounting=%s\n"
     233             :                 "%sIPAccounting=%s\n"
     234             :                 "%sCPUWeight=%" PRIu64 "\n"
     235             :                 "%sStartupCPUWeight=%" PRIu64 "\n"
     236             :                 "%sCPUShares=%" PRIu64 "\n"
     237             :                 "%sStartupCPUShares=%" PRIu64 "\n"
     238             :                 "%sCPUQuotaPerSecSec=%s\n"
     239             :                 "%sCPUQuotaPeriodSec=%s\n"
     240             :                 "%sIOWeight=%" PRIu64 "\n"
     241             :                 "%sStartupIOWeight=%" PRIu64 "\n"
     242             :                 "%sBlockIOWeight=%" PRIu64 "\n"
     243             :                 "%sStartupBlockIOWeight=%" PRIu64 "\n"
     244             :                 "%sDefaultMemoryMin=%" PRIu64 "\n"
     245             :                 "%sDefaultMemoryLow=%" PRIu64 "\n"
     246             :                 "%sMemoryMin=%" PRIu64 "\n"
     247             :                 "%sMemoryLow=%" PRIu64 "\n"
     248             :                 "%sMemoryHigh=%" PRIu64 "\n"
     249             :                 "%sMemoryMax=%" PRIu64 "\n"
     250             :                 "%sMemorySwapMax=%" PRIu64 "\n"
     251             :                 "%sMemoryLimit=%" PRIu64 "\n"
     252             :                 "%sTasksMax=%" PRIu64 "\n"
     253             :                 "%sDevicePolicy=%s\n"
     254             :                 "%sDisableControllers=%s\n"
     255             :                 "%sDelegate=%s\n",
     256         182 :                 prefix, yes_no(c->cpu_accounting),
     257         182 :                 prefix, yes_no(c->io_accounting),
     258         182 :                 prefix, yes_no(c->blockio_accounting),
     259         182 :                 prefix, yes_no(c->memory_accounting),
     260         182 :                 prefix, yes_no(c->tasks_accounting),
     261         182 :                 prefix, yes_no(c->ip_accounting),
     262             :                 prefix, c->cpu_weight,
     263             :                 prefix, c->startup_cpu_weight,
     264             :                 prefix, c->cpu_shares,
     265             :                 prefix, c->startup_cpu_shares,
     266             :                 prefix, format_timespan(u, sizeof(u), c->cpu_quota_per_sec_usec, 1),
     267             :                 prefix, format_timespan(v, sizeof(v), c->cpu_quota_period_usec, 1),
     268             :                 prefix, c->io_weight,
     269             :                 prefix, c->startup_io_weight,
     270             :                 prefix, c->blockio_weight,
     271             :                 prefix, c->startup_blockio_weight,
     272             :                 prefix, c->default_memory_min,
     273             :                 prefix, c->default_memory_low,
     274             :                 prefix, c->memory_min,
     275             :                 prefix, c->memory_low,
     276             :                 prefix, c->memory_high,
     277             :                 prefix, c->memory_max,
     278             :                 prefix, c->memory_swap_max,
     279             :                 prefix, c->memory_limit,
     280             :                 prefix, c->tasks_max,
     281             :                 prefix, cgroup_device_policy_to_string(c->device_policy),
     282             :                 prefix, strempty(disable_controllers_str),
     283         182 :                 prefix, yes_no(c->delegate));
     284             : 
     285         182 :         if (c->delegate) {
     286           0 :                 _cleanup_free_ char *t = NULL;
     287             : 
     288           0 :                 (void) cg_mask_to_string(c->delegate_controllers, &t);
     289             : 
     290           0 :                 fprintf(f, "%sDelegateControllers=%s\n",
     291             :                         prefix,
     292             :                         strempty(t));
     293             :         }
     294             : 
     295         182 :         LIST_FOREACH(device_allow, a, c->device_allow)
     296           0 :                 fprintf(f,
     297             :                         "%sDeviceAllow=%s %s%s%s\n",
     298             :                         prefix,
     299             :                         a->path,
     300           0 :                         a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");
     301             : 
     302         182 :         LIST_FOREACH(device_weights, iw, c->io_device_weights)
     303           0 :                 fprintf(f,
     304             :                         "%sIODeviceWeight=%s %" PRIu64 "\n",
     305             :                         prefix,
     306             :                         iw->path,
     307             :                         iw->weight);
     308             : 
     309         182 :         LIST_FOREACH(device_latencies, l, c->io_device_latencies)
     310           0 :                 fprintf(f,
     311             :                         "%sIODeviceLatencyTargetSec=%s %s\n",
     312             :                         prefix,
     313             :                         l->path,
     314             :                         format_timespan(u, sizeof(u), l->target_usec, 1));
     315             : 
     316         182 :         LIST_FOREACH(device_limits, il, c->io_device_limits) {
     317             :                 char buf[FORMAT_BYTES_MAX];
     318             :                 CGroupIOLimitType type;
     319             : 
     320           0 :                 for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
     321           0 :                         if (il->limits[type] != cgroup_io_limit_defaults[type])
     322           0 :                                 fprintf(f,
     323             :                                         "%s%s=%s %s\n",
     324             :                                         prefix,
     325             :                                         cgroup_io_limit_type_to_string(type),
     326             :                                         il->path,
     327             :                                         format_bytes(buf, sizeof(buf), il->limits[type]));
     328             :         }
     329             : 
     330         182 :         LIST_FOREACH(device_weights, w, c->blockio_device_weights)
     331           0 :                 fprintf(f,
     332             :                         "%sBlockIODeviceWeight=%s %" PRIu64,
     333             :                         prefix,
     334             :                         w->path,
     335             :                         w->weight);
     336             : 
     337         182 :         LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
     338             :                 char buf[FORMAT_BYTES_MAX];
     339             : 
     340           0 :                 if (b->rbps != CGROUP_LIMIT_MAX)
     341           0 :                         fprintf(f,
     342             :                                 "%sBlockIOReadBandwidth=%s %s\n",
     343             :                                 prefix,
     344             :                                 b->path,
     345             :                                 format_bytes(buf, sizeof(buf), b->rbps));
     346           0 :                 if (b->wbps != CGROUP_LIMIT_MAX)
     347           0 :                         fprintf(f,
     348             :                                 "%sBlockIOWriteBandwidth=%s %s\n",
     349             :                                 prefix,
     350             :                                 b->path,
     351             :                                 format_bytes(buf, sizeof(buf), b->wbps));
     352             :         }
     353             : 
     354         182 :         LIST_FOREACH(items, iaai, c->ip_address_allow) {
     355           0 :                 _cleanup_free_ char *k = NULL;
     356             : 
     357           0 :                 (void) in_addr_to_string(iaai->family, &iaai->address, &k);
     358           0 :                 fprintf(f, "%sIPAddressAllow=%s/%u\n", prefix, strnull(k), iaai->prefixlen);
     359             :         }
     360             : 
     361         182 :         LIST_FOREACH(items, iaai, c->ip_address_deny) {
     362           0 :                 _cleanup_free_ char *k = NULL;
     363             : 
     364           0 :                 (void) in_addr_to_string(iaai->family, &iaai->address, &k);
     365           0 :                 fprintf(f, "%sIPAddressDeny=%s/%u\n", prefix, strnull(k), iaai->prefixlen);
     366             :         }
     367             : 
     368         182 :         STRV_FOREACH(path, c->ip_filters_ingress)
     369           0 :                 fprintf(f, "%sIPIngressFilterPath=%s\n", prefix, *path);
     370             : 
     371         182 :         STRV_FOREACH(path, c->ip_filters_egress)
     372           0 :                 fprintf(f, "%sIPEgressFilterPath=%s\n", prefix, *path);
     373         182 : }
     374             : 
     375           0 : int cgroup_add_device_allow(CGroupContext *c, const char *dev, const char *mode) {
     376           0 :         _cleanup_free_ CGroupDeviceAllow *a = NULL;
     377           0 :         _cleanup_free_ char *d = NULL;
     378             : 
     379           0 :         assert(c);
     380           0 :         assert(dev);
     381           0 :         assert(isempty(mode) || in_charset(mode, "rwm"));
     382             : 
     383           0 :         a = new(CGroupDeviceAllow, 1);
     384           0 :         if (!a)
     385           0 :                 return -ENOMEM;
     386             : 
     387           0 :         d = strdup(dev);
     388           0 :         if (!d)
     389           0 :                 return -ENOMEM;
     390             : 
     391           0 :         *a = (CGroupDeviceAllow) {
     392           0 :                 .path = TAKE_PTR(d),
     393           0 :                 .r = isempty(mode) || strchr(mode, 'r'),
     394           0 :                 .w = isempty(mode) || strchr(mode, 'w'),
     395           0 :                 .m = isempty(mode) || strchr(mode, 'm'),
     396             :         };
     397             : 
     398           0 :         LIST_PREPEND(device_allow, c->device_allow, a);
     399           0 :         TAKE_PTR(a);
     400             : 
     401           0 :         return 0;
     402             : }
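                      : /* Editor's note: the TAKE_PTR() calls above implement the usual systemd ownership hand-over:
                      :  * d is moved into the struct when it is assigned, and once LIST_PREPEND() has linked the
                      :  * entry in, TAKE_PTR(a) clears the _cleanup_free_ variable so neither allocation is freed on
                      :  * the success path. */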
     403             : 
     404             : #define UNIT_DEFINE_ANCESTOR_MEMORY_LOOKUP(entry)                       \
     405             :         uint64_t unit_get_ancestor_##entry(Unit *u) {                   \
     406             :                 CGroupContext *c;                                       \
     407             :                                                                         \
     408             :                 /* 1. Is entry set in this unit? If so, use that.       \
     409             :                  * 2. Is the default for this entry set in any          \
     410             :                  *    ancestor? If so, use that.                        \
     411             :                  * 3. Otherwise, return CGROUP_LIMIT_MIN. */            \
     412             :                                                                         \
     413             :                 assert(u);                                              \
     414             :                                                                         \
     415             :                 c = unit_get_cgroup_context(u);                         \
     416             :                 if (c && c->entry##_set)                                \
     417             :                         return c->entry;                                \
     418             :                                                                         \
     419             :                 while ((u = UNIT_DEREF(u->slice))) {                    \
     420             :                         c = unit_get_cgroup_context(u);                 \
     421             :                         if (c && c->default_##entry##_set)              \
     422             :                                 return c->default_##entry;              \
     423             :                 }                                                       \
     424             :                                                                         \
     425             :                 /* We've reached the root, but nobody had default for   \
     426             :                  * this entry set, so set it to the kernel default. */  \
     427             :                 return CGROUP_LIMIT_MIN;                                \
     428             : }
     429             : 
     430         136 : UNIT_DEFINE_ANCESTOR_MEMORY_LOOKUP(memory_low);
     431           0 : UNIT_DEFINE_ANCESTOR_MEMORY_LOOKUP(memory_min);
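                      : /* Editor's note: the two instantiations above expand to the functions
                      :  * unit_get_ancestor_memory_low() and unit_get_ancestor_memory_min(), each implementing the
                      :  * three-step lookup documented in the macro: the unit's own value if set, the nearest
                      :  * ancestor's default_* value if set, else CGROUP_LIMIT_MIN. */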
     432             : 
     433           6 : static void cgroup_xattr_apply(Unit *u) {
     434             :         char ids[SD_ID128_STRING_MAX];
     435             :         int r;
     436             : 
     437           6 :         assert(u);
     438             : 
     439           6 :         if (!MANAGER_IS_SYSTEM(u->manager))
     440           6 :                 return;
     441             : 
     442           0 :         if (sd_id128_is_null(u->invocation_id))
     443           0 :                 return;
     444             : 
     445           0 :         r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path,
     446             :                          "trusted.invocation_id",
     447           0 :                          sd_id128_to_string(u->invocation_id, ids), 32,
     448             :                          0);
     449           0 :         if (r < 0)
     450           0 :                 log_unit_debug_errno(u, r, "Failed to set invocation ID on control group %s, ignoring: %m", u->cgroup_path);
     451             : }
     452             : 
     453           0 : static int lookup_block_device(const char *p, dev_t *ret) {
     454           0 :         dev_t rdev, dev = 0;
     455             :         mode_t mode;
     456             :         int r;
     457             : 
     458           0 :         assert(p);
     459           0 :         assert(ret);
     460             : 
     461           0 :         r = device_path_parse_major_minor(p, &mode, &rdev);
     462           0 :         if (r == -ENODEV) { /* not a parsable device node, need to go to disk */
     463             :                 struct stat st;
     464           0 :                 if (stat(p, &st) < 0)
     465           0 :                         return log_warning_errno(errno, "Couldn't stat device '%s': %m", p);
     466           0 :                 rdev = (dev_t)st.st_rdev;
     467           0 :                 dev = (dev_t)st.st_dev;
     468           0 :                 mode = st.st_mode;
     469           0 :         } else if (r < 0)
     470           0 :                 return log_warning_errno(r, "Failed to parse major/minor from path '%s': %m", p);
     471             : 
     472           0 :         if (S_ISCHR(mode)) {
     473           0 :                 log_warning("Device node '%s' is a character device, but block device needed.", p);
     474           0 :                 return -ENOTBLK;
     475           0 :         } else if (S_ISBLK(mode))
     476           0 :                 *ret = rdev;
     477           0 :         else if (major(dev) != 0)
     478           0 :                 *ret = dev; /* If this is not a device node then use the block device this file is stored on */
     479             :         else {
     480             :                 /* If this is btrfs, getting the backing block device is a bit harder */
     481           0 :                 r = btrfs_get_block_device(p, ret);
     482           0 :                 if (r < 0 && r != -ENOTTY)
     483           0 :                         return log_warning_errno(r, "Failed to determine block device backing btrfs file system '%s': %m", p);
     484           0 :                 if (r == -ENOTTY) {
     485           0 :                         log_warning("'%s' is not a block device node, and file system block device cannot be determined or is not local.", p);
     486           0 :                         return -ENODEV;
     487             :                 }
     488             :         }
     489             : 
     490             :         /* If this is a LUKS device, try to get the originating block device */
     491           0 :         (void) block_get_originating(*ret, ret);
     492             : 
     493             :         /* If this is a partition, try to get the originating block device */
     494           0 :         (void) block_get_whole_disk(*ret, ret);
     495           0 :         return 0;
     496             : }
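                      : /* Editor's sketch (not part of this file): the stat()-based fallback above in isolation,
                      :  * using only POSIX interfaces. For a block device node, st_rdev carries the device's own
                      :  * major/minor; for a regular file, st_dev identifies the block device backing the containing
                      :  * file system. Names below are hypothetical. */
                      : #include <stdio.h>
                      : #include <sys/stat.h>
                      : #include <sys/sysmacros.h>
                      : 
                      : static int backing_device(const char *path, dev_t *ret) {
                      :         struct stat st;
                      : 
                      :         if (stat(path, &st) < 0)
                      :                 return -1;
                      : 
                      :         if (S_ISBLK(st.st_mode))
                      :                 *ret = st.st_rdev;  /* the node itself names a block device */
                      :         else
                      :                 *ret = st.st_dev;   /* the device the file is stored on */
                      : 
                      :         printf("%s -> %u:%u\n", path, major(*ret), minor(*ret));
                      :         return 0;
                      : }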
     497             : 
     498           0 : static int whitelist_device(BPFProgram *prog, const char *path, const char *node, const char *acc) {
     499             :         dev_t rdev;
     500             :         mode_t mode;
     501             :         int r;
     502             : 
     503           0 :         assert(path);
     504           0 :         assert(acc);
     505             : 
     506             :         /* Some special handling for /dev/block/%u:%u, /dev/char/%u:%u, /run/systemd/inaccessible/chr and
      507             :          * /run/systemd/inaccessible/blk paths. Instead of stat()ing these, we parse out the major/minor directly. This
      508             :          * means clients can use these paths without the device node actually being around. */
     509           0 :         r = device_path_parse_major_minor(node, &mode, &rdev);
     510           0 :         if (r < 0) {
     511           0 :                 if (r != -ENODEV)
     512           0 :                         return log_warning_errno(r, "Couldn't parse major/minor from device path '%s': %m", node);
     513             : 
     514             :                 struct stat st;
     515           0 :                 if (stat(node, &st) < 0)
     516           0 :                         return log_warning_errno(errno, "Couldn't stat device %s: %m", node);
     517             : 
     518           0 :                 if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
     519           0 :                         log_warning("%s is not a device.", node);
     520           0 :                         return -ENODEV;
     521             :                 }
     522           0 :                 rdev = (dev_t) st.st_rdev;
     523           0 :                 mode = st.st_mode;
     524             :         }
     525             : 
     526           0 :         if (cg_all_unified() > 0) {
     527           0 :                 if (!prog)
     528           0 :                         return 0;
     529             : 
     530           0 :                 return cgroup_bpf_whitelist_device(prog, S_ISCHR(mode) ? BPF_DEVCG_DEV_CHAR : BPF_DEVCG_DEV_BLOCK,
     531           0 :                                                    major(rdev), minor(rdev), acc);
     532             : 
     533             :         } else {
     534             :                 char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
     535             : 
     536           0 :                 sprintf(buf,
     537             :                         "%c %u:%u %s",
     538           0 :                         S_ISCHR(mode) ? 'c' : 'b',
     539             :                         major(rdev), minor(rdev),
     540             :                         acc);
     541             : 
     542             :                 /* Changing the devices list of a populated cgroup might result in EINVAL, hence ignore EINVAL here. */
     543             : 
     544           0 :                 r = cg_set_attribute("devices", path, "devices.allow", buf);
     545           0 :                 if (r < 0)
     546           0 :                         return log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING,
     547             :                                               r, "Failed to set devices.allow on %s: %m", path);
     548             : 
     549           0 :                 return 0;
     550             :         }
     551             : }
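                      : /* Editor's sketch (not part of this file): on cgroup v1 the "devices" controller is
                      :  * programmed with lines of the form "<type> <major>:<minor> <access>"; e.g. "c 1:3 rwm"
                      :  * allows read, write and mknod on /dev/null. A minimal standalone write, with a hypothetical
                      :  * cgroup path: */
                      : #include <stdio.h>
                      : 
                      : static int allow_dev_null(const char *cgroup_dir) {
                      :         char p[256];
                      :         FILE *f;
                      : 
                      :         snprintf(p, sizeof(p), "%s/devices.allow", cgroup_dir);
                      :         f = fopen(p, "we");  /* "e" sets O_CLOEXEC (glibc extension) */
                      :         if (!f)
                      :                 return -1;
                      : 
                      :         fputs("c 1:3 rwm", f);
                      :         return fclose(f);
                      : }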
     552             : 
     553           0 : static int whitelist_major(BPFProgram *prog, const char *path, const char *name, char type, const char *acc) {
     554           0 :         _cleanup_fclose_ FILE *f = NULL;
     555             :         char buf[2+DECIMAL_STR_MAX(unsigned)+3+4];
     556           0 :         bool good = false;
     557             :         unsigned maj;
     558             :         int r;
     559             : 
     560           0 :         assert(path);
     561           0 :         assert(acc);
     562           0 :         assert(IN_SET(type, 'b', 'c'));
     563             : 
     564           0 :         if (streq(name, "*")) {
     565             :                 /* If the name is a wildcard, then apply this list to all devices of this type */
     566             : 
     567           0 :                 if (cg_all_unified() > 0) {
     568           0 :                         if (!prog)
     569           0 :                                 return 0;
     570             : 
     571           0 :                         (void) cgroup_bpf_whitelist_class(prog, type == 'c' ? BPF_DEVCG_DEV_CHAR : BPF_DEVCG_DEV_BLOCK, acc);
     572             :                 } else {
     573           0 :                         xsprintf(buf, "%c *:* %s", type, acc);
     574             : 
     575           0 :                         r = cg_set_attribute("devices", path, "devices.allow", buf);
     576           0 :                         if (r < 0)
     577           0 :                                 log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
     578             :                                                "Failed to set devices.allow on %s: %m", path);
     579           0 :                         return 0;
     580             :                 }
     581             :         }
     582             : 
     583           0 :         if (safe_atou(name, &maj) >= 0 && DEVICE_MAJOR_VALID(maj)) {
      584             :                 /* The name is numeric and suitable as major. In that case, let's take it as the major, and create the entry
     585             :                  * directly */
     586             : 
     587           0 :                 if (cg_all_unified() > 0) {
     588           0 :                         if (!prog)
     589           0 :                                 return 0;
     590             : 
     591           0 :                         (void) cgroup_bpf_whitelist_major(prog,
     592             :                                                           type == 'c' ? BPF_DEVCG_DEV_CHAR : BPF_DEVCG_DEV_BLOCK,
     593             :                                                           maj, acc);
     594             :                 } else {
     595           0 :                         xsprintf(buf, "%c %u:* %s", type, maj, acc);
     596             : 
     597           0 :                         r = cg_set_attribute("devices", path, "devices.allow", buf);
     598           0 :                         if (r < 0)
     599           0 :                                 log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
     600             :                                                "Failed to set devices.allow on %s: %m", path);
     601             :                 }
     602             : 
     603           0 :                 return 0;
     604             :         }
     605             : 
     606           0 :         f = fopen("/proc/devices", "re");
     607           0 :         if (!f)
     608           0 :                 return log_warning_errno(errno, "Cannot open /proc/devices to resolve %s (%c): %m", name, type);
     609             : 
     610           0 :         for (;;) {
     611           0 :                 _cleanup_free_ char *line = NULL;
     612             :                 char *w, *p;
     613             : 
     614           0 :                 r = read_line(f, LONG_LINE_MAX, &line);
     615           0 :                 if (r < 0)
     616           0 :                         return log_warning_errno(r, "Failed to read /proc/devices: %m");
     617           0 :                 if (r == 0)
     618           0 :                         break;
     619             : 
     620           0 :                 if (type == 'c' && streq(line, "Character devices:")) {
     621           0 :                         good = true;
     622           0 :                         continue;
     623             :                 }
     624             : 
     625           0 :                 if (type == 'b' && streq(line, "Block devices:")) {
     626           0 :                         good = true;
     627           0 :                         continue;
     628             :                 }
     629             : 
     630           0 :                 if (isempty(line)) {
     631           0 :                         good = false;
     632           0 :                         continue;
     633             :                 }
     634             : 
     635           0 :                 if (!good)
     636           0 :                         continue;
     637             : 
     638           0 :                 p = strstrip(line);
     639             : 
     640           0 :                 w = strpbrk(p, WHITESPACE);
     641           0 :                 if (!w)
     642           0 :                         continue;
     643           0 :                 *w = 0;
     644             : 
     645           0 :                 r = safe_atou(p, &maj);
     646           0 :                 if (r < 0)
     647           0 :                         continue;
     648           0 :                 if (maj <= 0)
     649           0 :                         continue;
     650             : 
     651           0 :                 w++;
     652           0 :                 w += strspn(w, WHITESPACE);
     653             : 
     654           0 :                 if (fnmatch(name, w, 0) != 0)
     655           0 :                         continue;
     656             : 
     657           0 :                 if (cg_all_unified() > 0) {
     658           0 :                         if (!prog)
     659           0 :                                 continue;
     660             : 
     661           0 :                         (void) cgroup_bpf_whitelist_major(prog,
     662             :                                                           type == 'c' ? BPF_DEVCG_DEV_CHAR : BPF_DEVCG_DEV_BLOCK,
     663             :                                                           maj, acc);
     664             :                 } else {
     665           0 :                         sprintf(buf,
     666             :                                 "%c %u:* %s",
     667             :                                 type,
     668             :                                 maj,
     669             :                                 acc);
     670             : 
     671             :                         /* Changing the devices list of a populated cgroup might result in EINVAL, hence ignore EINVAL
     672             :                          * here. */
     673             : 
     674           0 :                         r = cg_set_attribute("devices", path, "devices.allow", buf);
     675           0 :                         if (r < 0)
     676           0 :                                 log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING,
     677             :                                                r, "Failed to set devices.allow on %s: %m", path);
     678             :                 }
     679             :         }
     680             : 
     681           0 :         return 0;
     682             : }
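                      : /* Editor's sketch (not part of this file): /proc/devices lists majors in two sections headed
                      :  * "Character devices:" and "Block devices:", one "<major> <name>" pair per line. A minimal
                      :  * lookup of a character-device major by driver name (names hypothetical): */
                      : #include <stdio.h>
                      : #include <string.h>
                      : 
                      : static int find_char_major(const char *name, unsigned *ret) {
                      :         char line[256];
                      :         int in_char_section = 0;
                      :         FILE *f;
                      : 
                      :         f = fopen("/proc/devices", "re");
                      :         if (!f)
                      :                 return -1;
                      : 
                      :         while (fgets(line, sizeof(line), f)) {
                      :                 unsigned maj;
                      :                 char n[64];
                      : 
                      :                 if (strncmp(line, "Character devices:", 18) == 0)
                      :                         in_char_section = 1;
                      :                 else if (strncmp(line, "Block devices:", 14) == 0)
                      :                         in_char_section = 0;
                      :                 else if (in_char_section &&
                      :                          sscanf(line, "%u %63s", &maj, n) == 2 &&
                      :                          strcmp(n, name) == 0) {
                      :                         *ret = maj;
                      :                         fclose(f);
                      :                         return 0;
                      :                 }
                      :         }
                      : 
                      :         fclose(f);
                      :         return -1;
                      : }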
     683             : 
     684         579 : static bool cgroup_context_has_cpu_weight(CGroupContext *c) {
     685        1158 :         return c->cpu_weight != CGROUP_WEIGHT_INVALID ||
     686         579 :                 c->startup_cpu_weight != CGROUP_WEIGHT_INVALID;
     687             : }
     688             : 
     689         579 : static bool cgroup_context_has_cpu_shares(CGroupContext *c) {
     690        1153 :         return c->cpu_shares != CGROUP_CPU_SHARES_INVALID ||
     691         574 :                 c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID;
     692             : }
     693             : 
     694          20 : static uint64_t cgroup_context_cpu_weight(CGroupContext *c, ManagerState state) {
     695          20 :         if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
     696           4 :             c->startup_cpu_weight != CGROUP_WEIGHT_INVALID)
     697           0 :                 return c->startup_cpu_weight;
     698          20 :         else if (c->cpu_weight != CGROUP_WEIGHT_INVALID)
     699           0 :                 return c->cpu_weight;
     700             :         else
     701          20 :                 return CGROUP_WEIGHT_DEFAULT;
     702             : }
     703             : 
     704           0 : static uint64_t cgroup_context_cpu_shares(CGroupContext *c, ManagerState state) {
     705           0 :         if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
     706           0 :             c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID)
     707           0 :                 return c->startup_cpu_shares;
     708           0 :         else if (c->cpu_shares != CGROUP_CPU_SHARES_INVALID)
     709           0 :                 return c->cpu_shares;
     710             :         else
     711           0 :                 return CGROUP_CPU_SHARES_DEFAULT;
     712             : }
     713             : 
     714          12 : usec_t cgroup_cpu_adjust_period(usec_t period, usec_t quota, usec_t resolution, usec_t max_period) {
      715             :         /* The kernel uses a minimum resolution of 1ms, so both the period and the effective per-period budget
      716             :          * (quota * period / USEC_PER_SEC) need to be at least that large. quota is specified in usec of CPU time
      717             :          * per second. Additionally, period must be at most max_period. */
     718          12 :         assert(quota > 0);
     719             : 
     720          12 :         return MIN(MAX3(period, resolution, resolution * USEC_PER_SEC / quota), max_period);
     721             : }
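                      : /* Worked example: with quota = 10000 (10ms of CPU time per second) and the kernel's 1ms
                      :  * resolution, the minimum period is 1000 * USEC_PER_SEC / 10000 = 100000us, i.e. 100ms; only
                      :  * then does the per-period budget, quota * period / USEC_PER_SEC, reach the 1ms floor. */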
     722             : 
     723           0 : static usec_t cgroup_cpu_adjust_period_and_log(Unit *u, usec_t period, usec_t quota) {
     724             :         usec_t new_period;
     725             : 
     726           0 :         if (quota == USEC_INFINITY)
     727             :                 /* Always use default period for infinity quota. */
     728           0 :                 return CGROUP_CPU_QUOTA_DEFAULT_PERIOD_USEC;
     729             : 
     730           0 :         if (period == USEC_INFINITY)
     731             :                 /* Default period was requested. */
     732           0 :                 period = CGROUP_CPU_QUOTA_DEFAULT_PERIOD_USEC;
     733             : 
     734             :         /* Clamp to interval [1ms, 1s] */
     735           0 :         new_period = cgroup_cpu_adjust_period(period, quota, USEC_PER_MSEC, USEC_PER_SEC);
     736             : 
     737           0 :         if (new_period != period) {
     738             :                 char v[FORMAT_TIMESPAN_MAX];
     739           0 :                 log_unit_full(u, u->warned_clamping_cpu_quota_period ? LOG_DEBUG : LOG_WARNING, 0,
     740             :                               "Clamping CPU interval for cpu.max: period is now %s",
     741             :                               format_timespan(v, sizeof(v), new_period, 1));
     742           0 :                 u->warned_clamping_cpu_quota_period = true;
     743             :         }
     744             : 
     745           0 :         return new_period;
     746             : }
     747             : 
     748           0 : static void cgroup_apply_unified_cpu_weight(Unit *u, uint64_t weight) {
     749             :         char buf[DECIMAL_STR_MAX(uint64_t) + 2];
     750             : 
     751           0 :         xsprintf(buf, "%" PRIu64 "\n", weight);
     752           0 :         (void) set_attribute_and_warn(u, "cpu", "cpu.weight", buf);
     753           0 : }
     754             : 
     755           0 : static void cgroup_apply_unified_cpu_quota(Unit *u, usec_t quota, usec_t period) {
     756             :         char buf[(DECIMAL_STR_MAX(usec_t) + 1) * 2 + 1];
     757             : 
     758           0 :         period = cgroup_cpu_adjust_period_and_log(u, period, quota);
     759           0 :         if (quota != USEC_INFINITY)
     760           0 :                 xsprintf(buf, USEC_FMT " " USEC_FMT "\n",
     761             :                          MAX(quota * period / USEC_PER_SEC, USEC_PER_MSEC), period);
     762             :         else
     763           0 :                 xsprintf(buf, "max " USEC_FMT "\n", period);
     764           0 :         (void) set_attribute_and_warn(u, "cpu", "cpu.max", buf);
     765           0 : }
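                      : /* Editor's note: example cpu.max contents produced above: CPUQuota=200% (i.e. a quota of
                      :  * 2 * USEC_PER_SEC) with the default 100ms period is written as "200000 100000" (both in us);
                      :  * with no quota set, the file gets "max 100000". */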
     766             : 
     767           0 : static void cgroup_apply_legacy_cpu_shares(Unit *u, uint64_t shares) {
     768             :         char buf[DECIMAL_STR_MAX(uint64_t) + 2];
     769             : 
     770           0 :         xsprintf(buf, "%" PRIu64 "\n", shares);
     771           0 :         (void) set_attribute_and_warn(u, "cpu", "cpu.shares", buf);
     772           0 : }
     773             : 
     774           0 : static void cgroup_apply_legacy_cpu_quota(Unit *u, usec_t quota, usec_t period) {
     775             :         char buf[DECIMAL_STR_MAX(usec_t) + 2];
     776             : 
     777           0 :         period = cgroup_cpu_adjust_period_and_log(u, period, quota);
     778             : 
     779           0 :         xsprintf(buf, USEC_FMT "\n", period);
     780           0 :         (void) set_attribute_and_warn(u, "cpu", "cpu.cfs_period_us", buf);
     781             : 
     782           0 :         if (quota != USEC_INFINITY) {
     783           0 :                 xsprintf(buf, USEC_FMT "\n", MAX(quota * period / USEC_PER_SEC, USEC_PER_MSEC));
     784           0 :                 (void) set_attribute_and_warn(u, "cpu", "cpu.cfs_quota_us", buf);
     785             :         } else
     786           0 :                 (void) set_attribute_and_warn(u, "cpu", "cpu.cfs_quota_us", "-1\n");
     787           0 : }
     788             : 
     789           0 : static uint64_t cgroup_cpu_shares_to_weight(uint64_t shares) {
     790           0 :         return CLAMP(shares * CGROUP_WEIGHT_DEFAULT / CGROUP_CPU_SHARES_DEFAULT,
     791             :                      CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX);
     792             : }
     793             : 
     794           0 : static uint64_t cgroup_cpu_weight_to_shares(uint64_t weight) {
     795           0 :         return CLAMP(weight * CGROUP_CPU_SHARES_DEFAULT / CGROUP_WEIGHT_DEFAULT,
     796             :                      CGROUP_CPU_SHARES_MIN, CGROUP_CPU_SHARES_MAX);
     797             : }
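                      : /* Worked example: the legacy default of 1024 shares maps to the unified default weight of
                      :  * 100 (1024 * 100 / 1024), and 2048 shares map to weight 200; results are clamped to the
                      :  * valid cpu.weight range of [1, 10000]. */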
     798             : 
     799         579 : static bool cgroup_context_has_io_config(CGroupContext *c) {
     800        1158 :         return c->io_accounting ||
     801         579 :                 c->io_weight != CGROUP_WEIGHT_INVALID ||
     802         569 :                 c->startup_io_weight != CGROUP_WEIGHT_INVALID ||
     803         569 :                 c->io_device_weights ||
     804        1727 :                 c->io_device_latencies ||
     805         569 :                 c->io_device_limits;
     806             : }
     807             : 
     808         569 : static bool cgroup_context_has_blockio_config(CGroupContext *c) {
     809        1138 :         return c->blockio_accounting ||
     810         569 :                 c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
     811         569 :                 c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
     812        1707 :                 c->blockio_device_weights ||
     813         569 :                 c->blockio_device_bandwidths;
     814             : }
     815             : 
     816           0 : static uint64_t cgroup_context_io_weight(CGroupContext *c, ManagerState state) {
     817           0 :         if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
     818           0 :             c->startup_io_weight != CGROUP_WEIGHT_INVALID)
     819           0 :                 return c->startup_io_weight;
     820           0 :         else if (c->io_weight != CGROUP_WEIGHT_INVALID)
     821           0 :                 return c->io_weight;
     822             :         else
     823           0 :                 return CGROUP_WEIGHT_DEFAULT;
     824             : }
     825             : 
     826           0 : static uint64_t cgroup_context_blkio_weight(CGroupContext *c, ManagerState state) {
     827           0 :         if (IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) &&
     828           0 :             c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID)
     829           0 :                 return c->startup_blockio_weight;
     830           0 :         else if (c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID)
     831           0 :                 return c->blockio_weight;
     832             :         else
     833           0 :                 return CGROUP_BLKIO_WEIGHT_DEFAULT;
     834             : }
     835             : 
     836           0 : static uint64_t cgroup_weight_blkio_to_io(uint64_t blkio_weight) {
     837           0 :         return CLAMP(blkio_weight * CGROUP_WEIGHT_DEFAULT / CGROUP_BLKIO_WEIGHT_DEFAULT,
     838             :                      CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX);
     839             : }
     840             : 
     841           0 : static uint64_t cgroup_weight_io_to_blkio(uint64_t io_weight) {
     842           0 :         return CLAMP(io_weight * CGROUP_BLKIO_WEIGHT_DEFAULT / CGROUP_WEIGHT_DEFAULT,
     843             :                      CGROUP_BLKIO_WEIGHT_MIN, CGROUP_BLKIO_WEIGHT_MAX);
     844             : }
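                      : /* Worked example: the legacy blkio default weight of 500 maps to the unified io.weight
                      :  * default of 100 (500 * 100 / 500), and the legacy maximum of 1000 maps to 200. */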
     845             : 
     846           0 : static void cgroup_apply_io_device_weight(Unit *u, const char *dev_path, uint64_t io_weight) {
     847             :         char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
     848             :         dev_t dev;
     849             :         int r;
     850             : 
     851           0 :         r = lookup_block_device(dev_path, &dev);
     852           0 :         if (r < 0)
     853           0 :                 return;
     854             : 
     855           0 :         xsprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), io_weight);
     856           0 :         (void) set_attribute_and_warn(u, "io", "io.weight", buf);
     857             : }
     858             : 
     859           0 : static void cgroup_apply_blkio_device_weight(Unit *u, const char *dev_path, uint64_t blkio_weight) {
     860             :         char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
     861             :         dev_t dev;
     862             :         int r;
     863             : 
     864           0 :         r = lookup_block_device(dev_path, &dev);
     865           0 :         if (r < 0)
     866           0 :                 return;
     867             : 
     868           0 :         xsprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), blkio_weight);
     869           0 :         (void) set_attribute_and_warn(u, "blkio", "blkio.weight_device", buf);
     870             : }
     871             : 
     872           0 : static void cgroup_apply_io_device_latency(Unit *u, const char *dev_path, usec_t target) {
     873             :         char buf[DECIMAL_STR_MAX(dev_t)*2+2+7+DECIMAL_STR_MAX(uint64_t)+1];
     874             :         dev_t dev;
     875             :         int r;
     876             : 
     877           0 :         r = lookup_block_device(dev_path, &dev);
     878           0 :         if (r < 0)
     879           0 :                 return;
     880             : 
     881           0 :         if (target != USEC_INFINITY)
     882           0 :                 xsprintf(buf, "%u:%u target=%" PRIu64 "\n", major(dev), minor(dev), target);
     883             :         else
     884           0 :                 xsprintf(buf, "%u:%u target=max\n", major(dev), minor(dev));
     885             : 
     886           0 :         (void) set_attribute_and_warn(u, "io", "io.latency", buf);
     887             : }
     888             : 
     889           0 : static void cgroup_apply_io_device_limit(Unit *u, const char *dev_path, uint64_t *limits) {
     890             :         char limit_bufs[_CGROUP_IO_LIMIT_TYPE_MAX][DECIMAL_STR_MAX(uint64_t)];
     891             :         char buf[DECIMAL_STR_MAX(dev_t)*2+2+(6+DECIMAL_STR_MAX(uint64_t)+1)*4];
     892             :         CGroupIOLimitType type;
     893             :         dev_t dev;
     894             :         int r;
     895             : 
     896           0 :         r = lookup_block_device(dev_path, &dev);
     897           0 :         if (r < 0)
     898           0 :                 return;
     899             : 
     900           0 :         for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
     901           0 :                 if (limits[type] != cgroup_io_limit_defaults[type])
     902           0 :                         xsprintf(limit_bufs[type], "%" PRIu64, limits[type]);
     903             :                 else
     904           0 :                         xsprintf(limit_bufs[type], "%s", limits[type] == CGROUP_LIMIT_MAX ? "max" : "0");
     905             : 
     906           0 :         xsprintf(buf, "%u:%u rbps=%s wbps=%s riops=%s wiops=%s\n", major(dev), minor(dev),
     907             :                  limit_bufs[CGROUP_IO_RBPS_MAX], limit_bufs[CGROUP_IO_WBPS_MAX],
     908             :                  limit_bufs[CGROUP_IO_RIOPS_MAX], limit_bufs[CGROUP_IO_WIOPS_MAX]);
     909           0 :         (void) set_attribute_and_warn(u, "io", "io.max", buf);
     910             : }
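                      : /* Editor's note: example io.max line produced above for device 8:0 with a 1MiB/s read limit
                      :  * and everything else left unlimited: "8:0 rbps=1048576 wbps=max riops=max wiops=max". */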
     911             : 
     912           0 : static void cgroup_apply_blkio_device_limit(Unit *u, const char *dev_path, uint64_t rbps, uint64_t wbps) {
     913             :         char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
     914             :         dev_t dev;
     915             :         int r;
     916             : 
     917           0 :         r = lookup_block_device(dev_path, &dev);
     918           0 :         if (r < 0)
     919           0 :                 return;
     920             : 
      921           0 :         xsprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), rbps);
     922           0 :         (void) set_attribute_and_warn(u, "blkio", "blkio.throttle.read_bps_device", buf);
     923             : 
      924           0 :         xsprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), wbps);
     925           0 :         (void) set_attribute_and_warn(u, "blkio", "blkio.throttle.write_bps_device", buf);
     926             : }
     927             : 
     928          55 : static bool unit_has_unified_memory_config(Unit *u) {
     929             :         CGroupContext *c;
     930             : 
     931          55 :         assert(u);
     932             : 
     933          55 :         c = unit_get_cgroup_context(u);
     934          55 :         assert(c);
     935             : 
     936          55 :         return c->memory_min > 0 || unit_get_ancestor_memory_low(u) > 0 ||
     937         165 :                c->memory_high != CGROUP_LIMIT_MAX || c->memory_max != CGROUP_LIMIT_MAX ||
     938          55 :                c->memory_swap_max != CGROUP_LIMIT_MAX;
     939             : }
     940             : 
     941           0 : static void cgroup_apply_unified_memory_limit(Unit *u, const char *file, uint64_t v) {
     942           0 :         char buf[DECIMAL_STR_MAX(uint64_t) + 1] = "max\n";
     943             : 
     944           0 :         if (v != CGROUP_LIMIT_MAX)
     945           0 :                 xsprintf(buf, "%" PRIu64 "\n", v);
     946             : 
     947           0 :         (void) set_attribute_and_warn(u, "memory", file, buf);
     948           0 : }
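
The helper above implements the cgroup v2 convention that the memory knobs (memory.min, memory.low, memory.high, memory.max, memory.swap.max) take either a byte count or the literal "max". A small sketch of that formatting, using UINT64_MAX as a stand-in for CGROUP_LIMIT_MAX and a hypothetical 512 MiB limit:

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    static void format_memory_limit(char *buf, size_t n, uint64_t v) {
            if (v == UINT64_MAX)                 /* stand-in for CGROUP_LIMIT_MAX */
                    snprintf(buf, n, "max\n");
            else
                    snprintf(buf, n, "%" PRIu64 "\n", v);
    }

    int main(void) {
            char buf[32];

            format_memory_limit(buf, sizeof(buf), UINT64_C(512) * 1024 * 1024);
            fputs(buf, stdout);                  /* "536870912" */

            format_memory_limit(buf, sizeof(buf), UINT64_MAX);
            fputs(buf, stdout);                  /* "max" */
            return 0;
    }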
     949             : 
     950           0 : static void cgroup_apply_firewall(Unit *u) {
     951           0 :         assert(u);
     952             : 
     953             :         /* Best-effort: let's apply IP firewalling and/or accounting if that's enabled */
     954             : 
     955           0 :         if (bpf_firewall_compile(u) < 0)
     956           0 :                 return;
     957             : 
     958           0 :         (void) bpf_firewall_load_custom(u);
     959           0 :         (void) bpf_firewall_install(u);
     960             : }
     961             : 
     962           6 : static void cgroup_context_apply(
     963             :                 Unit *u,
     964             :                 CGroupMask apply_mask,
     965             :                 ManagerState state) {
     966             : 
     967             :         const char *path;
     968             :         CGroupContext *c;
     969             :         bool is_host_root, is_local_root;
     970             :         int r;
     971             : 
     972           6 :         assert(u);
     973             : 
     974             :         /* Nothing to do? Exit early! */
     975           6 :         if (apply_mask == 0)
     976           0 :                 return;
     977             : 
     978             :         /* Some cgroup attributes are not supported on the host root cgroup, hence silently ignore them here. And other
     979             :          * attributes should only be managed for cgroups further down the tree. */
     980           6 :         is_local_root = unit_has_name(u, SPECIAL_ROOT_SLICE);
     981           6 :         is_host_root = unit_has_host_root_cgroup(u);
     982             : 
     983           6 :         assert_se(c = unit_get_cgroup_context(u));
     984           6 :         assert_se(path = u->cgroup_path);
     985             : 
     986           6 :         if (is_local_root) /* Make sure we don't try to display messages with an empty path. */
     987           6 :                 path = "/";
     988             : 
      989             :         /* We generally ignore errors caused by read-only mounted cgroup trees (in which case we are likely
      990             :          * running in a container) and missing cgroups, i.e. EROFS and ENOENT. */
     991             : 
     992             :         /* In fully unified mode these attributes don't exist on the host cgroup root. On legacy the weights exist, but
     993             :          * setting the weight makes very little sense on the host root cgroup, as there are no other cgroups at this
     994             :          * level. The quota exists there too, but any attempt to write to it is refused with EINVAL. Inside of
     995             :          * containers we want to leave control of these to the container manager (and if cgroup v2 delegation is used
     996             :          * we couldn't even write to them if we wanted to). */
     997           6 :         if ((apply_mask & CGROUP_MASK_CPU) && !is_local_root) {
     998             : 
     999           0 :                 if (cg_all_unified() > 0) {
    1000             :                         uint64_t weight;
    1001             : 
    1002           0 :                         if (cgroup_context_has_cpu_weight(c))
    1003           0 :                                 weight = cgroup_context_cpu_weight(c, state);
    1004           0 :                         else if (cgroup_context_has_cpu_shares(c)) {
    1005             :                                 uint64_t shares;
    1006             : 
    1007           0 :                                 shares = cgroup_context_cpu_shares(c, state);
    1008           0 :                                 weight = cgroup_cpu_shares_to_weight(shares);
    1009             : 
    1010           0 :                                 log_cgroup_compat(u, "Applying [Startup]CPUShares=%" PRIu64 " as [Startup]CPUWeight=%" PRIu64 " on %s",
    1011             :                                                   shares, weight, path);
    1012             :                         } else
    1013           0 :                                 weight = CGROUP_WEIGHT_DEFAULT;
    1014             : 
    1015           0 :                         cgroup_apply_unified_cpu_weight(u, weight);
    1016           0 :                         cgroup_apply_unified_cpu_quota(u, c->cpu_quota_per_sec_usec, c->cpu_quota_period_usec);
    1017             : 
    1018             :                 } else {
    1019             :                         uint64_t shares;
    1020             : 
    1021           0 :                         if (cgroup_context_has_cpu_weight(c)) {
    1022             :                                 uint64_t weight;
    1023             : 
    1024           0 :                                 weight = cgroup_context_cpu_weight(c, state);
    1025           0 :                                 shares = cgroup_cpu_weight_to_shares(weight);
    1026             : 
    1027           0 :                                 log_cgroup_compat(u, "Applying [Startup]CPUWeight=%" PRIu64 " as [Startup]CPUShares=%" PRIu64 " on %s",
    1028             :                                                   weight, shares, path);
    1029           0 :                         } else if (cgroup_context_has_cpu_shares(c))
    1030           0 :                                 shares = cgroup_context_cpu_shares(c, state);
    1031             :                         else
    1032           0 :                                 shares = CGROUP_CPU_SHARES_DEFAULT;
    1033             : 
    1034           0 :                         cgroup_apply_legacy_cpu_shares(u, shares);
    1035           0 :                         cgroup_apply_legacy_cpu_quota(u, c->cpu_quota_per_sec_usec, c->cpu_quota_period_usec);
    1036             :                 }
    1037             :         }
    1038             : 
    1039             :         /* The 'io' controller attributes are not exported on the host's root cgroup (being a pure cgroup v2
    1040             :          * controller), and in case of containers we want to leave control of these attributes to the container manager
     1041             :          * (and we couldn't access these attributes anyway if proper delegation is used, even if we tried). */
    1042           6 :         if ((apply_mask & CGROUP_MASK_IO) && !is_local_root) {
    1043             :                 char buf[8+DECIMAL_STR_MAX(uint64_t)+1];
    1044             :                 bool has_io, has_blockio;
    1045             :                 uint64_t weight;
    1046             : 
    1047           0 :                 has_io = cgroup_context_has_io_config(c);
    1048           0 :                 has_blockio = cgroup_context_has_blockio_config(c);
    1049             : 
    1050           0 :                 if (has_io)
    1051           0 :                         weight = cgroup_context_io_weight(c, state);
    1052           0 :                 else if (has_blockio) {
    1053             :                         uint64_t blkio_weight;
    1054             : 
    1055           0 :                         blkio_weight = cgroup_context_blkio_weight(c, state);
    1056           0 :                         weight = cgroup_weight_blkio_to_io(blkio_weight);
    1057             : 
    1058           0 :                         log_cgroup_compat(u, "Applying [Startup]BlockIOWeight=%" PRIu64 " as [Startup]IOWeight=%" PRIu64,
    1059             :                                           blkio_weight, weight);
    1060             :                 } else
    1061           0 :                         weight = CGROUP_WEIGHT_DEFAULT;
    1062             : 
    1063           0 :                 xsprintf(buf, "default %" PRIu64 "\n", weight);
    1064           0 :                 (void) set_attribute_and_warn(u, "io", "io.weight", buf);
    1065             : 
    1066             :                 /* FIXME: drop this when distro kernels properly support BFQ through "io.weight"
    1067             :                  * See also: https://github.com/systemd/systemd/pull/13335 */
    1068           0 :                 xsprintf(buf, "%" PRIu64 "\n", weight);
    1069           0 :                 (void) set_attribute_and_warn(u, "io", "io.bfq.weight", buf);
    1070             : 
    1071           0 :                 if (has_io) {
    1072             :                         CGroupIODeviceLatency *latency;
    1073             :                         CGroupIODeviceLimit *limit;
    1074             :                         CGroupIODeviceWeight *w;
    1075             : 
    1076           0 :                         LIST_FOREACH(device_weights, w, c->io_device_weights)
    1077           0 :                                 cgroup_apply_io_device_weight(u, w->path, w->weight);
    1078             : 
    1079           0 :                         LIST_FOREACH(device_limits, limit, c->io_device_limits)
    1080           0 :                                 cgroup_apply_io_device_limit(u, limit->path, limit->limits);
    1081             : 
    1082           0 :                         LIST_FOREACH(device_latencies, latency, c->io_device_latencies)
    1083           0 :                                 cgroup_apply_io_device_latency(u, latency->path, latency->target_usec);
    1084             : 
    1085           0 :                 } else if (has_blockio) {
    1086             :                         CGroupBlockIODeviceWeight *w;
    1087             :                         CGroupBlockIODeviceBandwidth *b;
    1088             : 
    1089           0 :                         LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
    1090           0 :                                 weight = cgroup_weight_blkio_to_io(w->weight);
    1091             : 
    1092           0 :                                 log_cgroup_compat(u, "Applying BlockIODeviceWeight=%" PRIu64 " as IODeviceWeight=%" PRIu64 " for %s",
    1093             :                                                   w->weight, weight, w->path);
    1094             : 
    1095           0 :                                 cgroup_apply_io_device_weight(u, w->path, weight);
    1096             :                         }
    1097             : 
    1098           0 :                         LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
    1099             :                                 uint64_t limits[_CGROUP_IO_LIMIT_TYPE_MAX];
    1100             :                                 CGroupIOLimitType type;
    1101             : 
    1102           0 :                                 for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
    1103           0 :                                         limits[type] = cgroup_io_limit_defaults[type];
    1104             : 
    1105           0 :                                 limits[CGROUP_IO_RBPS_MAX] = b->rbps;
    1106           0 :                                 limits[CGROUP_IO_WBPS_MAX] = b->wbps;
    1107             : 
    1108           0 :                                 log_cgroup_compat(u, "Applying BlockIO{Read|Write}Bandwidth=%" PRIu64 " %" PRIu64 " as IO{Read|Write}BandwidthMax= for %s",
    1109             :                                                   b->rbps, b->wbps, b->path);
    1110             : 
    1111           0 :                                 cgroup_apply_io_device_limit(u, b->path, limits);
    1112             :                         }
    1113             :                 }
    1114             :         }
    1115             : 
    1116           6 :         if (apply_mask & CGROUP_MASK_BLKIO) {
    1117             :                 bool has_io, has_blockio;
    1118             : 
    1119           0 :                 has_io = cgroup_context_has_io_config(c);
    1120           0 :                 has_blockio = cgroup_context_has_blockio_config(c);
    1121             : 
    1122             :                 /* Applying a 'weight' never makes sense for the host root cgroup, and for containers this should be
    1123             :                  * left to our container manager, too. */
    1124           0 :                 if (!is_local_root) {
    1125             :                         char buf[DECIMAL_STR_MAX(uint64_t)+1];
    1126             :                         uint64_t weight;
    1127             : 
    1128           0 :                         if (has_io) {
    1129             :                                 uint64_t io_weight;
    1130             : 
    1131           0 :                                 io_weight = cgroup_context_io_weight(c, state);
    1132           0 :                                 weight = cgroup_weight_io_to_blkio(cgroup_context_io_weight(c, state));
    1133             : 
    1134           0 :                                 log_cgroup_compat(u, "Applying [Startup]IOWeight=%" PRIu64 " as [Startup]BlockIOWeight=%" PRIu64,
    1135             :                                                   io_weight, weight);
    1136           0 :                         } else if (has_blockio)
    1137           0 :                                 weight = cgroup_context_blkio_weight(c, state);
    1138             :                         else
    1139           0 :                                 weight = CGROUP_BLKIO_WEIGHT_DEFAULT;
    1140             : 
    1141           0 :                         xsprintf(buf, "%" PRIu64 "\n", weight);
    1142           0 :                         (void) set_attribute_and_warn(u, "blkio", "blkio.weight", buf);
    1143             : 
    1144           0 :                         if (has_io) {
    1145             :                                 CGroupIODeviceWeight *w;
    1146             : 
    1147           0 :                                 LIST_FOREACH(device_weights, w, c->io_device_weights) {
    1148           0 :                                         weight = cgroup_weight_io_to_blkio(w->weight);
    1149             : 
    1150           0 :                                         log_cgroup_compat(u, "Applying IODeviceWeight=%" PRIu64 " as BlockIODeviceWeight=%" PRIu64 " for %s",
    1151             :                                                           w->weight, weight, w->path);
    1152             : 
    1153           0 :                                         cgroup_apply_blkio_device_weight(u, w->path, weight);
    1154             :                                 }
    1155           0 :                         } else if (has_blockio) {
    1156             :                                 CGroupBlockIODeviceWeight *w;
    1157             : 
    1158           0 :                                 LIST_FOREACH(device_weights, w, c->blockio_device_weights)
    1159           0 :                                         cgroup_apply_blkio_device_weight(u, w->path, w->weight);
    1160             :                         }
    1161             :                 }
    1162             : 
     1163             :         /* The bandwidth limits make sense to apply to the host's root cgroup, but not to container roots,
     1164             :          * as there we want the container manager to handle them */
    1165           0 :                 if (is_host_root || !is_local_root) {
    1166           0 :                         if (has_io) {
    1167             :                                 CGroupIODeviceLimit *l;
    1168             : 
    1169           0 :                                 LIST_FOREACH(device_limits, l, c->io_device_limits) {
    1170           0 :                                         log_cgroup_compat(u, "Applying IO{Read|Write}Bandwidth=%" PRIu64 " %" PRIu64 " as BlockIO{Read|Write}BandwidthMax= for %s",
    1171             :                                                           l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX], l->path);
    1172             : 
    1173           0 :                                         cgroup_apply_blkio_device_limit(u, l->path, l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX]);
    1174             :                                 }
    1175           0 :                         } else if (has_blockio) {
    1176             :                                 CGroupBlockIODeviceBandwidth *b;
    1177             : 
    1178           0 :                                 LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths)
    1179           0 :                                         cgroup_apply_blkio_device_limit(u, b->path, b->rbps, b->wbps);
    1180             :                         }
    1181             :                 }
    1182             :         }
    1183             : 
    1184             :         /* In unified mode 'memory' attributes do not exist on the root cgroup. In legacy mode 'memory.limit_in_bytes'
    1185             :          * exists on the root cgroup, but any writes to it are refused with EINVAL. And if we run in a container we
    1186             :          * want to leave control to the container manager (and if proper cgroup v2 delegation is used we couldn't even
    1187             :          * write to this if we wanted to.) */
    1188           6 :         if ((apply_mask & CGROUP_MASK_MEMORY) && !is_local_root) {
    1189             : 
    1190           0 :                 if (cg_all_unified() > 0) {
    1191           0 :                         uint64_t max, swap_max = CGROUP_LIMIT_MAX;
    1192             : 
    1193           0 :                         if (unit_has_unified_memory_config(u)) {
    1194           0 :                                 max = c->memory_max;
    1195           0 :                                 swap_max = c->memory_swap_max;
    1196             :                         } else {
    1197           0 :                                 max = c->memory_limit;
    1198             : 
    1199           0 :                                 if (max != CGROUP_LIMIT_MAX)
    1200           0 :                                         log_cgroup_compat(u, "Applying MemoryLimit=%" PRIu64 " as MemoryMax=", max);
    1201             :                         }
    1202             : 
    1203           0 :                         cgroup_apply_unified_memory_limit(u, "memory.min", c->memory_min);
    1204           0 :                         cgroup_apply_unified_memory_limit(u, "memory.low", unit_get_ancestor_memory_low(u));
    1205           0 :                         cgroup_apply_unified_memory_limit(u, "memory.high", c->memory_high);
    1206           0 :                         cgroup_apply_unified_memory_limit(u, "memory.max", max);
    1207           0 :                         cgroup_apply_unified_memory_limit(u, "memory.swap.max", swap_max);
    1208             : 
    1209           0 :                         (void) set_attribute_and_warn(u, "memory", "memory.oom.group", one_zero(c->memory_oom_group));
    1210             : 
    1211             :                 } else {
    1212             :                         char buf[DECIMAL_STR_MAX(uint64_t) + 1];
    1213             :                         uint64_t val;
    1214             : 
    1215           0 :                         if (unit_has_unified_memory_config(u)) {
    1216           0 :                                 val = c->memory_max;
     1217           0 :                                 log_cgroup_compat(u, "Applying MemoryMax=%" PRIu64 " as MemoryLimit=", val);
    1218             :                         } else
    1219           0 :                                 val = c->memory_limit;
    1220             : 
    1221           0 :                         if (val == CGROUP_LIMIT_MAX)
    1222           0 :                                 strncpy(buf, "-1\n", sizeof(buf));
    1223             :                         else
    1224           0 :                                 xsprintf(buf, "%" PRIu64 "\n", val);
    1225             : 
    1226           0 :                         (void) set_attribute_and_warn(u, "memory", "memory.limit_in_bytes", buf);
    1227             :                 }
    1228             :         }
    1229             : 
    1230             :         /* On cgroup v2 we can apply BPF everywhere. On cgroup v1 we apply it everywhere except for the root of
    1231             :          * containers, where we leave this to the manager */
    1232           6 :         if ((apply_mask & (CGROUP_MASK_DEVICES | CGROUP_MASK_BPF_DEVICES)) &&
    1233           0 :             (is_host_root || cg_all_unified() > 0 || !is_local_root)) {
    1234           0 :                 _cleanup_(bpf_program_unrefp) BPFProgram *prog = NULL;
    1235             :                 CGroupDeviceAllow *a;
    1236             : 
    1237           0 :                 if (cg_all_unified() > 0) {
    1238           0 :                         r = cgroup_init_device_bpf(&prog, c->device_policy, c->device_allow);
    1239           0 :                         if (r < 0)
    1240           0 :                                 log_unit_warning_errno(u, r, "Failed to initialize device control bpf program: %m");
    1241             :                 } else {
    1242             :                         /* Changing the devices list of a populated cgroup might result in EINVAL, hence ignore EINVAL
    1243             :                          * here. */
    1244             : 
    1245           0 :                         if (c->device_allow || c->device_policy != CGROUP_AUTO)
    1246           0 :                                 r = cg_set_attribute("devices", path, "devices.deny", "a");
    1247             :                         else
    1248           0 :                                 r = cg_set_attribute("devices", path, "devices.allow", "a");
    1249           0 :                         if (r < 0)
    1250           0 :                                 log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
    1251             :                                               "Failed to reset devices.allow/devices.deny: %m");
    1252             :                 }
    1253             : 
    1254           0 :                 if (c->device_policy == CGROUP_CLOSED ||
    1255           0 :                     (c->device_policy == CGROUP_AUTO && c->device_allow)) {
    1256             :                         static const char auto_devices[] =
    1257             :                                 "/dev/null\0" "rwm\0"
    1258             :                                 "/dev/zero\0" "rwm\0"
    1259             :                                 "/dev/full\0" "rwm\0"
    1260             :                                 "/dev/random\0" "rwm\0"
    1261             :                                 "/dev/urandom\0" "rwm\0"
    1262             :                                 "/dev/tty\0" "rwm\0"
    1263             :                                 "/dev/ptmx\0" "rwm\0"
    1264             :                                 /* Allow /run/systemd/inaccessible/{chr,blk} devices for mapping InaccessiblePaths */
    1265             :                                 "/run/systemd/inaccessible/chr\0" "rwm\0"
    1266             :                                 "/run/systemd/inaccessible/blk\0" "rwm\0";
    1267             : 
    1268             :                         const char *x, *y;
    1269             : 
    1270           0 :                         NULSTR_FOREACH_PAIR(x, y, auto_devices)
    1271           0 :                                 (void) whitelist_device(prog, path, x, y);
    1272             : 
    1273             :                         /* PTS (/dev/pts) devices may not be duplicated, but accessed */
    1274           0 :                         (void) whitelist_major(prog, path, "pts", 'c', "rw");
    1275             :                 }
    1276             : 
    1277           0 :                 LIST_FOREACH(device_allow, a, c->device_allow) {
    1278             :                         char acc[4], *val;
    1279           0 :                         unsigned k = 0;
    1280             : 
    1281           0 :                         if (a->r)
    1282           0 :                                 acc[k++] = 'r';
    1283           0 :                         if (a->w)
    1284           0 :                                 acc[k++] = 'w';
    1285           0 :                         if (a->m)
    1286           0 :                                 acc[k++] = 'm';
    1287             : 
    1288           0 :                         if (k == 0)
    1289           0 :                                 continue;
    1290             : 
    1291           0 :                         acc[k++] = 0;
    1292             : 
    1293           0 :                         if (path_startswith(a->path, "/dev/"))
    1294           0 :                                 (void) whitelist_device(prog, path, a->path, acc);
    1295           0 :                         else if ((val = startswith(a->path, "block-")))
    1296           0 :                                 (void) whitelist_major(prog, path, val, 'b', acc);
    1297           0 :                         else if ((val = startswith(a->path, "char-")))
    1298           0 :                                 (void) whitelist_major(prog, path, val, 'c', acc);
    1299             :                         else
    1300           0 :                                 log_unit_debug(u, "Ignoring device '%s' while writing cgroup attribute.", a->path);
    1301             :                 }
    1302             : 
    1303           0 :                 r = cgroup_apply_device_bpf(u, prog, c->device_policy, c->device_allow);
    1304           0 :                 if (r < 0) {
    1305             :                         static bool warned = false;
    1306             : 
    1307           0 :                         log_full_errno(warned ? LOG_DEBUG : LOG_WARNING, r,
    1308             :                                  "Unit %s configures device ACL, but the local system doesn't seem to support the BPF-based device controller.\n"
    1309             :                                  "Proceeding WITHOUT applying ACL (all devices will be accessible)!\n"
    1310             :                                  "(This warning is only shown for the first loaded unit using device ACL.)", u->id);
    1311             : 
    1312           0 :                         warned = true;
    1313             :                 }
    1314             :         }
    1315             : 
    1316           6 :         if (apply_mask & CGROUP_MASK_PIDS) {
    1317             : 
    1318           6 :                 if (is_host_root) {
    1319             :                         /* So, the "pids" controller does not expose anything on the root cgroup, in order not to
    1320             :                          * replicate knobs exposed elsewhere needlessly. We abstract this away here however, and when
    1321             :                          * the knobs of the root cgroup are modified propagate this to the relevant sysctls. There's a
    1322             :                          * non-obvious asymmetry however: unlike the cgroup properties we don't really want to take
    1323             :                          * exclusive ownership of the sysctls, but we still want to honour things if the user sets
    1324             :                          * limits. Hence we employ sort of a one-way strategy: when the user sets a bounded limit
    1325             :                          * through us it counts. When the user afterwards unsets it again (i.e. sets it to unbounded)
    1326             :                          * it also counts. But if the user never set a limit through us (i.e. we are the default of
    1327             :                          * "unbounded") we leave things unmodified. For this we manage a global boolean that we turn on
    1328             :                          * the first time we set a limit. Note that this boolean is flushed out on manager reload,
    1329             :                          * which is desirable so that there's an official way to release control of the sysctl from
    1330             :                          * systemd: set the limit to unbounded and reload. */
    1331             : 
    1332           0 :                         if (c->tasks_max != CGROUP_LIMIT_MAX) {
    1333           0 :                                 u->manager->sysctl_pid_max_changed = true;
    1334           0 :                                 r = procfs_tasks_set_limit(c->tasks_max);
    1335           0 :                         } else if (u->manager->sysctl_pid_max_changed)
    1336           0 :                                 r = procfs_tasks_set_limit(TASKS_MAX);
    1337             :                         else
    1338           0 :                                 r = 0;
    1339           0 :                         if (r < 0)
    1340           0 :                                 log_unit_full(u, LOG_LEVEL_CGROUP_WRITE(r), r,
    1341             :                                               "Failed to write to tasks limit sysctls: %m");
    1342             :                 }
    1343             : 
    1344             :                 /* The attribute itself is not available on the host root cgroup, and in the container case we want to
    1345             :                  * leave it for the container manager. */
    1346           6 :                 if (!is_local_root) {
    1347           0 :                         if (c->tasks_max != CGROUP_LIMIT_MAX) {
    1348             :                                 char buf[DECIMAL_STR_MAX(uint64_t) + 2];
    1349             : 
    1350           0 :                                 sprintf(buf, "%" PRIu64 "\n", c->tasks_max);
    1351           0 :                                 (void) set_attribute_and_warn(u, "pids", "pids.max", buf);
    1352             :                         } else
    1353           0 :                                 (void) set_attribute_and_warn(u, "pids", "pids.max", "max\n");
    1354             :                 }
    1355             :         }
    1356             : 
    1357           6 :         if (apply_mask & CGROUP_MASK_BPF_FIREWALL)
    1358           0 :                 cgroup_apply_firewall(u);
    1359             : }
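
The compat log messages in cgroup_context_apply() translate legacy CPUShares= (range 2..262144, default 1024) into unified CPUWeight= (range 1..10000, default 100) and back. The snippet below is only a sketch of such a conversion, a rescale through the two default values, clamped to the target range; the authoritative helpers (cgroup_cpu_shares_to_weight() and its inverse) live elsewhere in the tree:

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    #define CLAMP(x, lo, hi) ((x) < (lo) ? (lo) : ((x) > (hi) ? (hi) : (x)))

    static uint64_t shares_to_weight(uint64_t shares) {
            /* 1024 shares (the default) map to weight 100 (the default) */
            return CLAMP(shares * 100 / 1024, UINT64_C(1), UINT64_C(10000));
    }

    int main(void) {
            printf("CPUShares=1024 -> CPUWeight=%" PRIu64 "\n", shares_to_weight(1024)); /* 100 */
            printf("CPUShares=2    -> CPUWeight=%" PRIu64 "\n", shares_to_weight(2));    /* clamped to 1 */
            return 0;
    }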
    1360             : 
    1361         579 : static bool unit_get_needs_bpf_firewall(Unit *u) {
    1362             :         CGroupContext *c;
    1363             :         Unit *p;
    1364         579 :         assert(u);
    1365             : 
    1366         579 :         c = unit_get_cgroup_context(u);
    1367         579 :         if (!c)
    1368           0 :                 return false;
    1369             : 
    1370         579 :         if (c->ip_accounting ||
    1371         579 :             c->ip_address_allow ||
    1372         579 :             c->ip_address_deny ||
    1373         579 :             c->ip_filters_ingress ||
    1374         579 :             c->ip_filters_egress)
    1375           0 :                 return true;
    1376             : 
    1377             :         /* If any parent slice has an IP access list defined, it applies too */
    1378        1162 :         for (p = UNIT_DEREF(u->slice); p; p = UNIT_DEREF(p->slice)) {
    1379         583 :                 c = unit_get_cgroup_context(p);
    1380         583 :                 if (!c)
    1381           0 :                         return false;
    1382             : 
    1383         583 :                 if (c->ip_address_allow ||
    1384         583 :                     c->ip_address_deny)
    1385           0 :                         return true;
    1386             :         }
    1387             : 
    1388         579 :         return false;
    1389             : }
    1390             : 
    1391         579 : static CGroupMask unit_get_cgroup_mask(Unit *u) {
    1392         579 :         CGroupMask mask = 0;
    1393             :         CGroupContext *c;
    1394             : 
    1395         579 :         assert(u);
    1396             : 
    1397         579 :         c = unit_get_cgroup_context(u);
    1398             : 
    1399         579 :         assert(c);
    1400             : 
    1401             :         /* Figure out which controllers we need, based on the cgroup context object */
    1402             : 
    1403         579 :         if (c->cpu_accounting)
    1404           5 :                 mask |= get_cpu_accounting_mask();
    1405             : 
    1406        1158 :         if (cgroup_context_has_cpu_weight(c) ||
    1407         579 :             cgroup_context_has_cpu_shares(c) ||
    1408         574 :             c->cpu_quota_per_sec_usec != USEC_INFINITY)
    1409           5 :                 mask |= CGROUP_MASK_CPU;
    1410             : 
    1411         579 :         if (cgroup_context_has_io_config(c) || cgroup_context_has_blockio_config(c))
    1412          10 :                 mask |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO;
    1413             : 
    1414         579 :         if (c->memory_accounting ||
    1415         115 :             c->memory_limit != CGROUP_LIMIT_MAX ||
    1416          55 :             unit_has_unified_memory_config(u))
    1417         524 :                 mask |= CGROUP_MASK_MEMORY;
    1418             : 
    1419         579 :         if (c->device_allow ||
    1420         579 :             c->device_policy != CGROUP_AUTO)
    1421           0 :                 mask |= CGROUP_MASK_DEVICES | CGROUP_MASK_BPF_DEVICES;
    1422             : 
    1423         579 :         if (c->tasks_accounting ||
    1424          65 :             c->tasks_max != CGROUP_LIMIT_MAX)
    1425         514 :                 mask |= CGROUP_MASK_PIDS;
    1426             : 
    1427         579 :         return CGROUP_MASK_EXTEND_JOINED(mask);
    1428             : }
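
CGroupMask is a bit mask with one bit per controller, so unit_get_cgroup_mask() is essentially a series of "if configured, OR in the controller's bit" steps. A toy illustration of the pattern, with invented bit values:

    #include <stdio.h>

    enum {
            MASK_CPU    = 1 << 0,
            MASK_IO     = 1 << 1,
            MASK_MEMORY = 1 << 2,
            MASK_PIDS   = 1 << 3,
    };

    int main(void) {
            int cpu_accounting = 0, memory_accounting = 1, tasks_accounting = 1;
            unsigned mask = 0;

            if (cpu_accounting)
                    mask |= MASK_CPU;
            if (memory_accounting)
                    mask |= MASK_MEMORY;
            if (tasks_accounting)
                    mask |= MASK_PIDS;

            printf("mask=%#x (memory=%d pids=%d)\n",
                   mask, !!(mask & MASK_MEMORY), !!(mask & MASK_PIDS));
            return 0;
    }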
    1429             : 
    1430         579 : static CGroupMask unit_get_bpf_mask(Unit *u) {
    1431         579 :         CGroupMask mask = 0;
    1432             : 
     1433             :         /* Figure out which controllers we need, based on the cgroup context, possibly taking into account the IP
     1434             :          * access lists of ancestor units too. */
    1435             : 
    1436         579 :         if (unit_get_needs_bpf_firewall(u))
    1437           0 :                 mask |= CGROUP_MASK_BPF_FIREWALL;
    1438             : 
    1439         579 :         return mask;
    1440             : }
    1441             : 
    1442        1587 : CGroupMask unit_get_own_mask(Unit *u) {
    1443             :         CGroupContext *c;
    1444             : 
    1445             :         /* Returns the mask of controllers the unit needs for itself. If a unit is not properly loaded, return an empty
    1446             :          * mask, as we shouldn't reflect it in the cgroup hierarchy then. */
    1447             : 
    1448        1587 :         if (u->load_state != UNIT_LOADED)
    1449         186 :                 return 0;
    1450             : 
    1451        1401 :         c = unit_get_cgroup_context(u);
    1452        1401 :         if (!c)
    1453         822 :                 return 0;
    1454             : 
    1455         579 :         return (unit_get_cgroup_mask(u) | unit_get_bpf_mask(u) | unit_get_delegate_mask(u)) & ~unit_get_ancestor_disable_mask(u);
    1456             : }
    1457             : 
    1458        1769 : CGroupMask unit_get_delegate_mask(Unit *u) {
    1459             :         CGroupContext *c;
    1460             : 
    1461             :         /* If delegation is turned on, then turn on selected controllers, unless we are on the legacy hierarchy and the
    1462             :          * process we fork into is known to drop privileges, and hence shouldn't get access to the controllers.
    1463             :          *
    1464             :          * Note that on the unified hierarchy it is safe to delegate controllers to unprivileged services. */
    1465             : 
    1466        1769 :         if (!unit_cgroup_delegate(u))
    1467        1769 :                 return 0;
    1468             : 
    1469           0 :         if (cg_all_unified() <= 0) {
    1470             :                 ExecContext *e;
    1471             : 
    1472           0 :                 e = unit_get_exec_context(u);
    1473           0 :                 if (e && !exec_context_maintains_privileges(e))
    1474           0 :                         return 0;
    1475             :         }
    1476             : 
    1477           0 :         assert_se(c = unit_get_cgroup_context(u));
    1478           0 :         return CGROUP_MASK_EXTEND_JOINED(c->delegate_controllers);
    1479             : }
    1480             : 
    1481        1633 : CGroupMask unit_get_members_mask(Unit *u) {
    1482        1633 :         assert(u);
    1483             : 
    1484             :         /* Returns the mask of controllers all of the unit's children require, merged */
    1485             : 
    1486        1633 :         if (u->cgroup_members_mask_valid)
    1487        1253 :                 return u->cgroup_members_mask; /* Use cached value if possible */
    1488             : 
    1489         380 :         u->cgroup_members_mask = 0;
    1490             : 
    1491         380 :         if (u->type == UNIT_SLICE) {
    1492             :                 void *v;
    1493             :                 Unit *member;
    1494             :                 Iterator i;
    1495             : 
    1496         362 :                 HASHMAP_FOREACH_KEY(v, member, u->dependencies[UNIT_BEFORE], i) {
    1497         346 :                         if (UNIT_DEREF(member->slice) == u)
    1498         343 :                                 u->cgroup_members_mask |= unit_get_subtree_mask(member); /* note that this calls ourselves again, for the children */
    1499             :                 }
    1500             :         }
    1501             : 
    1502         380 :         u->cgroup_members_mask_valid = true;
    1503         380 :         return u->cgroup_members_mask;
    1504             : }
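
unit_get_members_mask() memoizes its result and recomputes only after unit_invalidate_cgroup_members_masks() (further below) clears the cache. A generic sketch of that memoized subtree aggregation over a toy tree; all names here are invented:

    #include <stdbool.h>
    #include <stdio.h>

    typedef struct Node {
            unsigned own_mask;
            struct Node *children[4];
            unsigned cached;
            bool cache_valid;
    } Node;

    static unsigned subtree_mask(Node *n) {
            if (n->cache_valid)
                    return n->cached;                        /* fast path: cached value */

            unsigned m = n->own_mask;
            for (size_t i = 0; i < 4 && n->children[i]; i++)
                    m |= subtree_mask(n->children[i]);       /* recurse into children */

            n->cached = m;
            n->cache_valid = true;
            return m;
    }

    int main(void) {
            Node leaf = { .own_mask = 0x2 };
            Node root = { .own_mask = 0x1, .children = { &leaf } };

            printf("subtree mask: %#x\n", subtree_mask(&root)); /* 0x3 */
            return 0;
    }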
    1505             : 
    1506          44 : CGroupMask unit_get_siblings_mask(Unit *u) {
    1507          44 :         assert(u);
    1508             : 
    1509             :         /* Returns the mask of controllers all of the unit's siblings
    1510             :          * require, i.e. the members mask of the unit's parent slice
    1511             :          * if there is one. */
    1512             : 
    1513          44 :         if (UNIT_ISSET(u->slice))
    1514          34 :                 return unit_get_members_mask(UNIT_DEREF(u->slice));
    1515             : 
    1516          10 :         return unit_get_subtree_mask(u); /* we are the top-level slice */
    1517             : }
    1518             : 
    1519        1240 : CGroupMask unit_get_disable_mask(Unit *u) {
    1520             :         CGroupContext *c;
    1521             : 
    1522        1240 :         c = unit_get_cgroup_context(u);
    1523        1240 :         if (!c)
    1524           0 :                 return 0;
    1525             : 
    1526        1240 :         return c->disable_controllers;
    1527             : }
    1528             : 
    1529        1240 : CGroupMask unit_get_ancestor_disable_mask(Unit *u) {
    1530             :         CGroupMask mask;
    1531             : 
    1532        1240 :         assert(u);
    1533        1240 :         mask = unit_get_disable_mask(u);
    1534             : 
    1535             :         /* Returns the mask of controllers which are marked as forcibly
    1536             :          * disabled in any ancestor unit or the unit in question. */
    1537             : 
    1538        1240 :         if (UNIT_ISSET(u->slice))
    1539         621 :                 mask |= unit_get_ancestor_disable_mask(UNIT_DEREF(u->slice));
    1540             : 
    1541        1240 :         return mask;
    1542             : }
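
The recursion above simply ORs the unit's own disable mask with the disable mask of every ancestor slice. The same accumulation, written iteratively over a toy parent-linked structure:

    #include <stdio.h>

    typedef struct Node {
            unsigned disable_mask;
            struct Node *parent;
    } Node;

    static unsigned ancestor_disable_mask(const Node *n) {
            unsigned m = 0;

            for (; n; n = n->parent)   /* walk up to the root, OR-ing as we go */
                    m |= n->disable_mask;
            return m;
    }

    int main(void) {
            Node root = { .disable_mask = 0x4 };
            Node child = { .disable_mask = 0x1, .parent = &root };

            printf("%#x\n", ancestor_disable_mask(&child)); /* 0x5 */
            return 0;
    }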
    1543             : 
    1544         353 : CGroupMask unit_get_subtree_mask(Unit *u) {
    1545             : 
     1546             :         /* Returns the mask of this subtree, i.e. of the cgroup
     1547             :          * itself and its children. */
    1548             : 
    1549         353 :         return unit_get_own_mask(u) | unit_get_members_mask(u);
    1550             : }
    1551             : 
    1552          28 : CGroupMask unit_get_target_mask(Unit *u) {
    1553             :         CGroupMask mask;
    1554             : 
    1555             :         /* This returns the cgroup mask of all controllers to enable
    1556             :          * for a specific cgroup, i.e. everything it needs itself,
    1557             :          * plus all that its children need, plus all that its siblings
    1558             :          * need. This is primarily useful on the legacy cgroup
    1559             :          * hierarchy, where we need to duplicate each cgroup in each
    1560             :          * hierarchy that shall be enabled for it. */
    1561             : 
    1562          28 :         mask = unit_get_own_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
    1563             : 
    1564          28 :         if (mask & CGROUP_MASK_BPF_FIREWALL & ~u->manager->cgroup_supported)
    1565           0 :                 emit_bpf_firewall_warning(u);
    1566             : 
    1567          28 :         mask &= u->manager->cgroup_supported;
    1568          28 :         mask &= ~unit_get_ancestor_disable_mask(u);
    1569             : 
    1570          28 :         return mask;
    1571             : }
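
Put together, the target mask is (own | members | siblings), filtered down by what the kernel supports and by forcibly disabled controllers. A worked example with invented mask values:

    #include <stdio.h>

    int main(void) {
            unsigned own = 0x1, members = 0x6, siblings = 0x8;
            unsigned supported = 0xb;       /* pretend the kernel lacks the 0x4 controller */
            unsigned disabled = 0x8;        /* DisableControllers= on some ancestor */

            unsigned target = (own | members | siblings) & supported & ~disabled;
            printf("target=%#x\n", target); /* (0xf & 0xb) & ~0x8 = 0x3 */
            return 0;
    }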
    1572             : 
    1573          12 : CGroupMask unit_get_enable_mask(Unit *u) {
    1574             :         CGroupMask mask;
    1575             : 
    1576             :         /* This returns the cgroup mask of all controllers to enable
    1577             :          * for the children of a specific cgroup. This is primarily
    1578             :          * useful for the unified cgroup hierarchy, where each cgroup
    1579             :          * controls which controllers are enabled for its children. */
    1580             : 
    1581          12 :         mask = unit_get_members_mask(u);
    1582          12 :         mask &= u->manager->cgroup_supported;
    1583          12 :         mask &= ~unit_get_ancestor_disable_mask(u);
    1584             : 
    1585          12 :         return mask;
    1586             : }
    1587             : 
    1588        2289 : void unit_invalidate_cgroup_members_masks(Unit *u) {
    1589        2289 :         assert(u);
    1590             : 
     1591             :         /* Recursively invalidate the member masks cache all the way up the tree */
    1592        2289 :         u->cgroup_members_mask_valid = false;
    1593             : 
    1594        2289 :         if (UNIT_ISSET(u->slice))
    1595         313 :                 unit_invalidate_cgroup_members_masks(UNIT_DEREF(u->slice));
    1596        2289 : }
    1597             : 
    1598           0 : const char *unit_get_realized_cgroup_path(Unit *u, CGroupMask mask) {
    1599             : 
    1600             :         /* Returns the realized cgroup path of the specified unit where all specified controllers are available. */
    1601             : 
    1602           0 :         while (u) {
    1603             : 
    1604           0 :                 if (u->cgroup_path &&
    1605           0 :                     u->cgroup_realized &&
    1606           0 :                     FLAGS_SET(u->cgroup_realized_mask, mask))
    1607           0 :                         return u->cgroup_path;
    1608             : 
    1609           0 :                 u = UNIT_DEREF(u->slice);
    1610             :         }
    1611             : 
    1612           0 :         return NULL;
    1613             : }
    1614             : 
    1615           0 : static const char *migrate_callback(CGroupMask mask, void *userdata) {
    1616           0 :         return unit_get_realized_cgroup_path(userdata, mask);
    1617             : }
    1618             : 
    1619          12 : char *unit_default_cgroup_path(const Unit *u) {
    1620          12 :         _cleanup_free_ char *escaped = NULL, *slice = NULL;
    1621             :         int r;
    1622             : 
    1623          12 :         assert(u);
    1624             : 
    1625          12 :         if (unit_has_name(u, SPECIAL_ROOT_SLICE))
    1626           6 :                 return strdup(u->manager->cgroup_root);
    1627             : 
    1628           6 :         if (UNIT_ISSET(u->slice) && !unit_has_name(UNIT_DEREF(u->slice), SPECIAL_ROOT_SLICE)) {
    1629           0 :                 r = cg_slice_to_path(UNIT_DEREF(u->slice)->id, &slice);
    1630           0 :                 if (r < 0)
    1631           0 :                         return NULL;
    1632             :         }
    1633             : 
    1634           6 :         escaped = cg_escape(u->id);
    1635           6 :         if (!escaped)
    1636           0 :                 return NULL;
    1637             : 
    1638           6 :         return path_join(empty_to_root(u->manager->cgroup_root), slice, escaped);
    1639             : }
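
Under the usual defaults (a cgroup_root of "/"), this yields paths like "/system.slice/foo.service". Nested slices expand to one directory per dash, e.g. "a-b.slice" becomes "a.slice/a-b.slice", which is what cg_slice_to_path() computes. A trivial sketch of the final join for a hypothetical unit:

    #include <stdio.h>

    int main(void) {
            const char *root = "/", *slice = "system.slice", *escaped = "foo.service";
            char path[256];

            snprintf(path, sizeof(path), "%s%s/%s", root, slice, escaped);
            puts(path); /* "/system.slice/foo.service" */
            return 0;
    }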
    1640             : 
    1641          12 : int unit_set_cgroup_path(Unit *u, const char *path) {
    1642          12 :         _cleanup_free_ char *p = NULL;
    1643             :         int r;
    1644             : 
    1645          12 :         assert(u);
    1646             : 
    1647          12 :         if (streq_ptr(u->cgroup_path, path))
    1648           0 :                 return 0;
    1649             : 
    1650          12 :         if (path) {
    1651          12 :                 p = strdup(path);
    1652          12 :                 if (!p)
    1653           0 :                         return -ENOMEM;
    1654             :         }
    1655             : 
    1656          12 :         if (p) {
    1657          12 :                 r = hashmap_put(u->manager->cgroup_unit, p, u);
    1658          12 :                 if (r < 0)
    1659           0 :                         return r;
    1660             :         }
    1661             : 
    1662          12 :         unit_release_cgroup(u);
    1663          12 :         u->cgroup_path = TAKE_PTR(p);
    1664             : 
    1665          12 :         return 1;
    1666             : }
    1667             : 
    1668           6 : int unit_watch_cgroup(Unit *u) {
    1669           6 :         _cleanup_free_ char *events = NULL;
    1670             :         int r;
    1671             : 
    1672           6 :         assert(u);
    1673             : 
    1674             :         /* Watches the "cgroups.events" attribute of this unit's cgroup for "empty" events, but only if
    1675             :          * cgroupv2 is available. */
    1676             : 
    1677           6 :         if (!u->cgroup_path)
    1678           0 :                 return 0;
    1679             : 
    1680           6 :         if (u->cgroup_control_inotify_wd >= 0)
    1681           0 :                 return 0;
    1682             : 
    1683             :         /* Only applies to the unified hierarchy */
    1684           6 :         r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
    1685           6 :         if (r < 0)
    1686           0 :                 return log_error_errno(r, "Failed to determine whether the name=systemd hierarchy is unified: %m");
    1687           6 :         if (r == 0)
    1688           0 :                 return 0;
    1689             : 
     1690             :         /* No point in watching the top-level slice, it's never going to run empty. */
    1691           6 :         if (unit_has_name(u, SPECIAL_ROOT_SLICE))
    1692           6 :                 return 0;
    1693             : 
    1694           0 :         r = hashmap_ensure_allocated(&u->manager->cgroup_control_inotify_wd_unit, &trivial_hash_ops);
    1695           0 :         if (r < 0)
    1696           0 :                 return log_oom();
    1697             : 
    1698           0 :         r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.events", &events);
    1699           0 :         if (r < 0)
    1700           0 :                 return log_oom();
    1701             : 
    1702           0 :         u->cgroup_control_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, events, IN_MODIFY);
    1703           0 :         if (u->cgroup_control_inotify_wd < 0) {
    1704             : 
    1705           0 :                 if (errno == ENOENT) /* If the directory is already gone we don't need to track it, so this
    1706             :                                       * is not an error */
    1707           0 :                         return 0;
    1708             : 
    1709           0 :                 return log_unit_error_errno(u, errno, "Failed to add control inotify watch descriptor for control group %s: %m", u->cgroup_path);
    1710             :         }
    1711             : 
    1712           0 :         r = hashmap_put(u->manager->cgroup_control_inotify_wd_unit, INT_TO_PTR(u->cgroup_control_inotify_wd), u);
    1713           0 :         if (r < 0)
    1714           0 :                 return log_unit_error_errno(u, r, "Failed to add control inotify watch descriptor to hash map: %m");
    1715             : 
    1716           0 :         return 0;
    1717             : }
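
A self-contained sketch of the watch being set up above: an IN_MODIFY inotify watch on a cgroup's "cgroup.events" file, whose "populated 0|1" line changes when the cgroup runs empty. The path is hypothetical and error handling is reduced to perror():

    #include <stdio.h>
    #include <sys/inotify.h>
    #include <unistd.h>

    int main(void) {
            int fd = inotify_init1(IN_NONBLOCK | IN_CLOEXEC);
            if (fd < 0) {
                    perror("inotify_init1");
                    return 1;
            }

            int wd = inotify_add_watch(fd, "/sys/fs/cgroup/foo.service/cgroup.events", IN_MODIFY);
            if (wd < 0)
                    perror("inotify_add_watch"); /* ENOENT: the cgroup is already gone */

            close(fd);
            return wd < 0;
    }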
    1718             : 
    1719           6 : int unit_watch_cgroup_memory(Unit *u) {
    1720           6 :         _cleanup_free_ char *events = NULL;
    1721             :         CGroupContext *c;
    1722             :         int r;
    1723             : 
    1724           6 :         assert(u);
    1725             : 
    1726             :         /* Watches the "memory.events" attribute of this unit's cgroup for "oom_kill" events, but only if
    1727             :          * cgroupv2 is available. */
    1728             : 
    1729           6 :         if (!u->cgroup_path)
    1730           0 :                 return 0;
    1731             : 
    1732           6 :         c = unit_get_cgroup_context(u);
    1733           6 :         if (!c)
    1734           0 :                 return 0;
    1735             : 
    1736             :         /* The "memory.events" attribute is only available if the memory controller is on. Let's hence tie
    1737             :          * this to memory accounting, in a way watching for OOM kills is a form of memory accounting after
    1738             :          * all. */
    1739           6 :         if (!c->memory_accounting)
    1740           0 :                 return 0;
    1741             : 
    1742             :         /* Don't watch inner nodes, as the kernel doesn't report oom_kill events recursively currently, and
    1743             :          * we also don't want to generate a log message for each parent cgroup of a process. */
    1744           6 :         if (u->type == UNIT_SLICE)
    1745           6 :                 return 0;
    1746             : 
    1747           0 :         if (u->cgroup_memory_inotify_wd >= 0)
    1748           0 :                 return 0;
    1749             : 
    1750             :         /* Only applies to the unified hierarchy */
    1751           0 :         r = cg_all_unified();
    1752           0 :         if (r < 0)
    1753           0 :                 return log_error_errno(r, "Failed to determine whether the memory controller is unified: %m");
    1754           0 :         if (r == 0)
    1755           0 :                 return 0;
    1756             : 
    1757           0 :         r = hashmap_ensure_allocated(&u->manager->cgroup_memory_inotify_wd_unit, &trivial_hash_ops);
    1758           0 :         if (r < 0)
    1759           0 :                 return log_oom();
    1760             : 
    1761           0 :         r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "memory.events", &events);
    1762           0 :         if (r < 0)
    1763           0 :                 return log_oom();
    1764             : 
    1765           0 :         u->cgroup_memory_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, events, IN_MODIFY);
    1766           0 :         if (u->cgroup_memory_inotify_wd < 0) {
    1767             : 
    1768           0 :                 if (errno == ENOENT) /* If the directory is already gone we don't need to track it, so this
    1769             :                                       * is not an error */
    1770           0 :                         return 0;
    1771             : 
    1772           0 :                 return log_unit_error_errno(u, errno, "Failed to add memory inotify watch descriptor for control group %s: %m", u->cgroup_path);
    1773             :         }
    1774             : 
    1775           0 :         r = hashmap_put(u->manager->cgroup_memory_inotify_wd_unit, INT_TO_PTR(u->cgroup_memory_inotify_wd), u);
    1776           0 :         if (r < 0)
    1777           0 :                 return log_unit_error_errno(u, r, "Failed to add memory inotify watch descriptor to hash map: %m");
    1778             : 
    1779           0 :         return 0;
    1780             : }
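
When the memory watch fires, the manager re-reads "memory.events", a file of "<key> <count>" lines that includes an "oom_kill" counter. A sketch of scanning it, with a hypothetical path:

    #include <stdio.h>
    #include <string.h>

    int main(void) {
            FILE *f = fopen("/sys/fs/cgroup/foo.service/memory.events", "re");
            if (!f) {
                    perror("fopen");
                    return 1;
            }

            char key[32];
            unsigned long long count;

            while (fscanf(f, "%31s %llu", key, &count) == 2)
                    if (strcmp(key, "oom_kill") == 0)
                            printf("oom_kill count: %llu\n", count);

            fclose(f);
            return 0;
    }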
    1781             : 
    1782          18 : int unit_pick_cgroup_path(Unit *u) {
    1783          18 :         _cleanup_free_ char *path = NULL;
    1784             :         int r;
    1785             : 
    1786          18 :         assert(u);
    1787             : 
    1788          18 :         if (u->cgroup_path)
    1789           6 :                 return 0;
    1790             : 
    1791          12 :         if (!UNIT_HAS_CGROUP_CONTEXT(u))
    1792           0 :                 return -EINVAL;
    1793             : 
    1794          12 :         path = unit_default_cgroup_path(u);
    1795          12 :         if (!path)
    1796           0 :                 return log_oom();
    1797             : 
    1798          12 :         r = unit_set_cgroup_path(u, path);
    1799          12 :         if (r == -EEXIST)
    1800           0 :                 return log_unit_error_errno(u, r, "Control group %s exists already.", path);
    1801          12 :         if (r < 0)
    1802           0 :                 return log_unit_error_errno(u, r, "Failed to set unit's control group path to %s: %m", path);
    1803             : 
    1804          12 :         return 0;
    1805             : }
    1806             : 
    1807          12 : static int unit_create_cgroup(
    1808             :                 Unit *u,
    1809             :                 CGroupMask target_mask,
    1810             :                 CGroupMask enable_mask,
    1811             :                 ManagerState state) {
    1812             : 
    1813             :         bool created;
    1814             :         int r;
    1815             : 
    1816          12 :         assert(u);
    1817             : 
    1818          12 :         if (!UNIT_HAS_CGROUP_CONTEXT(u))
    1819           0 :                 return 0;
    1820             : 
    1821             :         /* Figure out our cgroup path */
    1822          12 :         r = unit_pick_cgroup_path(u);
    1823          12 :         if (r < 0)
    1824           0 :                 return r;
    1825             : 
    1826             :         /* First, create our own group */
    1827          12 :         r = cg_create_everywhere(u->manager->cgroup_supported, target_mask, u->cgroup_path);
    1828          12 :         if (r < 0)
    1829           6 :                 return log_unit_error_errno(u, r, "Failed to create cgroup %s: %m", u->cgroup_path);
    1830           6 :         created = r;
    1831             : 
    1832             :         /* Start watching it */
    1833           6 :         (void) unit_watch_cgroup(u);
    1834           6 :         (void) unit_watch_cgroup_memory(u);
    1835             : 
    1836             :         /* Preserve enabled controllers in delegated units, adjust others. */
    1837           6 :         if (created || !u->cgroup_realized || !unit_cgroup_delegate(u)) {
    1838           6 :                 CGroupMask result_mask = 0;
    1839             : 
    1840             :                 /* Enable all controllers we need */
    1841           6 :                 r = cg_enable_everywhere(u->manager->cgroup_supported, enable_mask, u->cgroup_path, &result_mask);
    1842           6 :                 if (r < 0)
    1843           0 :                         log_unit_warning_errno(u, r, "Failed to enable/disable controllers on cgroup %s, ignoring: %m", u->cgroup_path);
    1844             : 
    1845             :                 /* If we just turned off a controller, this might release the controller for our parent too; let's
    1846             :                  * enqueue the parent for re-realization again in that case. */
    1847           6 :                 if (UNIT_ISSET(u->slice)) {
    1848             :                         CGroupMask turned_off;
    1849             : 
    1850           0 :                         turned_off = (u->cgroup_realized ? u->cgroup_enabled_mask & ~result_mask : 0);
    1851           0 :                         if (turned_off != 0) {
    1852             :                                 Unit *parent;
    1853             : 
    1854             :                                 /* Force the parent to propagate the enable mask to the kernel again, by invalidating
    1855             :                                  * the controller we just turned off. */
    1856             : 
    1857           0 :                                 for (parent = UNIT_DEREF(u->slice); parent; parent = UNIT_DEREF(parent->slice))
    1858           0 :                                         unit_invalidate_cgroup(parent, turned_off);
    1859             :                         }
    1860             :                 }
    1861             : 
    1862             :                 /* Remember what's actually enabled now */
    1863           6 :                 u->cgroup_enabled_mask = result_mask;
    1864             :         }
    1865             : 
    1866             :         /* Keep track that this is now realized */
    1867           6 :         u->cgroup_realized = true;
    1868           6 :         u->cgroup_realized_mask = target_mask;
    1869             : 
    1870           6 :         if (u->type != UNIT_SLICE && !unit_cgroup_delegate(u)) {
    1871             : 
    1872             :                 /* Then, possibly move things over, but not if
    1873             :                  * subgroups may contain processes, which is the case
    1874             :                  * for slice and delegation units. */
    1875           0 :                 r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
    1876           0 :                 if (r < 0)
    1877           0 :                         log_unit_warning_errno(u, r, "Failed to migrate cgroup to %s, ignoring: %m", u->cgroup_path);
    1878             :         }
    1879             : 
    1880             :         /* Set attributes */
    1881           6 :         cgroup_context_apply(u, target_mask, state);
    1882           6 :         cgroup_xattr_apply(u);
    1883             : 
    1884           6 :         return 0;
    1885             : }
    1886             : 
    1887           0 : static int unit_attach_pid_to_cgroup_via_bus(Unit *u, pid_t pid, const char *suffix_path) {
    1888           0 :         _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
    1889             :         char *pp;
    1890             :         int r;
    1891             : 
    1892           0 :         assert(u);
    1893             : 
    1894           0 :         if (MANAGER_IS_SYSTEM(u->manager))
    1895           0 :                 return -EINVAL;
    1896             : 
    1897           0 :         if (!u->manager->system_bus)
    1898           0 :                 return -EIO;
    1899             : 
    1900           0 :         if (!u->cgroup_path)
    1901           0 :                 return -EINVAL;
    1902             : 
    1903             :         /* Determine this unit's cgroup path relative to our cgroup root */
    1904           0 :         pp = path_startswith(u->cgroup_path, u->manager->cgroup_root);
    1905           0 :         if (!pp)
    1906           0 :                 return -EINVAL;
    1907             : 
    1908           0 :         pp = strjoina("/", pp, suffix_path);
    1909           0 :         path_simplify(pp, false);
    1910             : 
    1911           0 :         r = sd_bus_call_method(u->manager->system_bus,
    1912             :                                "org.freedesktop.systemd1",
    1913             :                                "/org/freedesktop/systemd1",
    1914             :                                "org.freedesktop.systemd1.Manager",
    1915             :                                "AttachProcessesToUnit",
    1916             :                                &error, NULL,
    1917             :                                "ssau",
    1918             :                                NULL /* empty unit name means client's unit, i.e. us */, pp, 1, (uint32_t) pid);
    1919           0 :         if (r < 0)
    1920           0 :                 return log_unit_debug_errno(u, r, "Failed to attach unit process " PID_FMT " via the bus: %s", pid, bus_error_message(&error, r));
    1921             : 
    1922           0 :         return 0;
    1923             : }
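
For illustration, the same Manager method can be invoked by any standalone sd-bus client. A rough sketch (not from this codebase; the empty unit name means "the unit the calling process belongs to", and the "/" subcgroup is a placeholder):

#include <stdint.h>
#include <sys/types.h>
#include <systemd/sd-bus.h>

static int attach_pid_sketch(pid_t pid) {
        sd_bus_error error = SD_BUS_ERROR_NULL;
        sd_bus *bus = NULL;
        int r;

        r = sd_bus_open_system(&bus);
        if (r < 0)
                return r;

        r = sd_bus_call_method(bus,
                               "org.freedesktop.systemd1",
                               "/org/freedesktop/systemd1",
                               "org.freedesktop.systemd1.Manager",
                               "AttachProcessesToUnit",
                               &error, NULL,
                               "ssau", "", "/", 1, (uint32_t) pid);

        sd_bus_error_free(&error);
        sd_bus_unref(bus);
        return r;
}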
    1924             : 
    1925           0 : int unit_attach_pids_to_cgroup(Unit *u, Set *pids, const char *suffix_path) {
    1926             :         CGroupMask delegated_mask;
    1927             :         const char *p;
    1928             :         Iterator i;
    1929             :         void *pidp;
    1930             :         int r, q;
    1931             : 
    1932           0 :         assert(u);
    1933             : 
    1934           0 :         if (!UNIT_HAS_CGROUP_CONTEXT(u))
    1935           0 :                 return -EINVAL;
    1936             : 
    1937           0 :         if (set_isempty(pids))
    1938           0 :                 return 0;
    1939             : 
    1940             :         /* Load any custom firewall BPF programs here once, to test whether they exist and are actually loadable.
    1941             :          * Fail early here, since later errors in the call chain from unit_realize_cgroup to cgroup_context_apply are ignored. */
    1942           0 :         r = bpf_firewall_load_custom(u);
    1943           0 :         if (r < 0)
    1944           0 :                 return r;
    1945             : 
    1946           0 :         r = unit_realize_cgroup(u);
    1947           0 :         if (r < 0)
    1948           0 :                 return r;
    1949             : 
    1950           0 :         if (isempty(suffix_path))
    1951           0 :                 p = u->cgroup_path;
    1952             :         else
    1953           0 :                 p = prefix_roota(u->cgroup_path, suffix_path);
    1954             : 
    1955           0 :         delegated_mask = unit_get_delegate_mask(u);
    1956             : 
    1957           0 :         r = 0;
    1958           0 :         SET_FOREACH(pidp, pids, i) {
    1959           0 :                 pid_t pid = PTR_TO_PID(pidp);
    1960             :                 CGroupController c;
    1961             : 
    1962             :                 /* First, attach the PID to the main cgroup hierarchy */
    1963           0 :                 q = cg_attach(SYSTEMD_CGROUP_CONTROLLER, p, pid);
    1964           0 :                 if (q < 0) {
    1965           0 :                         log_unit_debug_errno(u, q, "Couldn't move process " PID_FMT " to requested cgroup '%s': %m", pid, p);
    1966             : 
    1967           0 :                         if (MANAGER_IS_USER(u->manager) && IN_SET(q, -EPERM, -EACCES)) {
    1968             :                                 int z;
    1969             : 
    1970             :                                 /* If we are in a user instance, and we can't move the process ourselves due to
    1971             :                                  * permission problems, let's ask the system instance about it instead. Since it's more
    1972             :                                  * privileged it might be able to move the process across the leaves of a subtree whose
    1973             :                                  * top node is not owned by us. */
    1974             : 
    1975           0 :                                 z = unit_attach_pid_to_cgroup_via_bus(u, pid, suffix_path);
    1976           0 :                                 if (z < 0)
    1977           0 :                                         log_unit_debug_errno(u, z, "Couldn't move process " PID_FMT " to requested cgroup '%s' via the system bus either: %m", pid, p);
    1978             :                                 else
    1979           0 :                                         continue; /* If attaching via the bus worked, we are fully done for this PID. */
    1980             :                         }
    1981             : 
    1982           0 :                         if (r >= 0)
    1983           0 :                                 r = q; /* Remember first error */
    1984             : 
    1985           0 :                         continue;
    1986             :                 }
    1987             : 
    1988           0 :                 q = cg_all_unified();
    1989           0 :                 if (q < 0)
    1990           0 :                         return q;
    1991           0 :                 if (q > 0)
    1992           0 :                         continue;
    1993             : 
    1994             :                 /* In the legacy hierarchy, attach the process to the requested cgroup if possible, and if not, to the
    1995             :                  * innermost realized one */
    1996             : 
    1997           0 :                 for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
    1998           0 :                         CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
    1999             :                         const char *realized;
    2000             : 
    2001           0 :                         if (!(u->manager->cgroup_supported & bit))
    2002           0 :                                 continue;
    2003             : 
    2004             :                         /* If this controller is delegated and realized, honour the caller's request for the cgroup suffix. */
    2005           0 :                         if (delegated_mask & u->cgroup_realized_mask & bit) {
    2006           0 :                                 q = cg_attach(cgroup_controller_to_string(c), p, pid);
    2007           0 :                                 if (q >= 0)
    2008           0 :                                         continue; /* Success! */
    2009             : 
    2010           0 :                                 log_unit_debug_errno(u, q, "Failed to attach PID " PID_FMT " to requested cgroup %s in controller %s, falling back to unit's cgroup: %m",
    2011             :                                                      pid, p, cgroup_controller_to_string(c));
    2012             :                         }
    2013             : 
    2014             :                         /* So this controller is either not delegated or not realized, or something else weird happened. In
    2015             :                          * that case let's attach the PID at least to the closest cgroup up the tree that is
    2016             :                          * realized. */
    2017           0 :                         realized = unit_get_realized_cgroup_path(u, bit);
    2018           0 :                         if (!realized)
    2019           0 :                                 continue; /* Not even realized in the root slice? Then let's not bother */
    2020             : 
    2021           0 :                         q = cg_attach(cgroup_controller_to_string(c), realized, pid);
    2022           0 :                         if (q < 0)
    2023           0 :                                 log_unit_debug_errno(u, q, "Failed to attach PID " PID_FMT " to realized cgroup %s in controller %s, ignoring: %m",
    2024             :                                                      pid, realized, cgroup_controller_to_string(c));
    2025             :                 }
    2026             :         }
    2027             : 
    2028           0 :         return r;
    2029             : }
    2030             : 
    2031           6 : static bool unit_has_mask_realized(
    2032             :                 Unit *u,
    2033             :                 CGroupMask target_mask,
    2034             :                 CGroupMask enable_mask) {
    2035             : 
    2036           6 :         assert(u);
    2037             : 
    2038             :         /* Returns true if this unit is fully realized. We check four things:
    2039             :          *
    2040             :          * 1. Whether the cgroup was created at all
    2041             :          * 2. Whether the cgroup was created in all the hierarchies we need it to be created in (in case of cgroup v1)
    2042             :          * 3. Whether the cgroup has all the right controllers enabled (in case of cgroup v2)
    2043             :          * 4. Whether the invalidation mask is currently zero
    2044             :          *
    2045             :          * If you wonder why we mask the target realization and enable mask with CGROUP_MASK_V1/CGROUP_MASK_V2: note
    2046             :          * that there are three sets of bitmasks: CGROUP_MASK_V1 (for real cgroup v1 controllers), CGROUP_MASK_V2 (for
    2047             :          * real cgroup v2 controllers) and CGROUP_MASK_BPF (for BPF-based pseudo-controllers). Now, cgroup_realized_mask
    2048             :          * only matters for cgroup v1 controllers, and cgroup_enabled_mask is only used for cgroup v2, and if they
    2049             :          * differ in the others, we don't really care. (After all, cgroup_enabled_mask tracks which controllers are
    2050             :          * enabled through cgroup.subtree_control, and since the BPF pseudo-controllers don't show up there, they
    2051             :          * simply don't matter.) */
    2052             : 
    2053           6 :         return u->cgroup_realized &&
    2054           0 :                 ((u->cgroup_realized_mask ^ target_mask) & CGROUP_MASK_V1) == 0 &&
    2055           6 :                 ((u->cgroup_enabled_mask ^ enable_mask) & CGROUP_MASK_V2) == 0 &&
    2056           0 :                 u->cgroup_invalidated_mask == 0;
    2057             : }
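
To make the masked XOR tests concrete, a tiny self-contained example with made-up stand-in bit values (illustrative only, not the real controller bits):

#include <assert.h>
#include <stdint.h>

int main(void) {
        enum { EX_V1_CPU = 1 << 0, EX_V1_MEMORY = 1 << 1, EX_BPF_FIREWALL = 1 << 8 };
        const uint64_t EX_MASK_V1 = EX_V1_CPU | EX_V1_MEMORY;

        /* The realized v1 mask lacks the newly wanted memory bit, so the
         * unit doesn't count as realized and will be re-created: */
        uint64_t realized = EX_V1_CPU, target = EX_V1_CPU | EX_V1_MEMORY;
        assert(((realized ^ target) & EX_MASK_V1) != 0);

        /* Masks differing only in a BPF pseudo-controller bit count as
         * equal, since that bit is masked away: */
        uint64_t a = EX_V1_CPU, b = EX_V1_CPU | EX_BPF_FIREWALL;
        assert(((a ^ b) & EX_MASK_V1) == 0);

        return 0;
}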
    2058             : 
    2059           0 : static bool unit_has_mask_disables_realized(
    2060             :                 Unit *u,
    2061             :                 CGroupMask target_mask,
    2062             :                 CGroupMask enable_mask) {
    2063             : 
    2064           0 :         assert(u);
    2065             : 
    2066             :         /* Returns true if all controllers which should be disabled are indeed disabled.
    2067             :          *
    2068             :          * Unlike unit_has_mask_realized, we don't care what was enabled, only that anything we want to remove is
    2069             :          * already removed. */
    2070             : 
    2071           0 :         return !u->cgroup_realized ||
    2072           0 :                 (FLAGS_SET(u->cgroup_realized_mask, target_mask & CGROUP_MASK_V1) &&
    2073           0 :                  FLAGS_SET(u->cgroup_enabled_mask, enable_mask & CGROUP_MASK_V2));
    2074             : }
    2075             : 
    2076           6 : static bool unit_has_mask_enables_realized(
    2077             :                 Unit *u,
    2078             :                 CGroupMask target_mask,
    2079             :                 CGroupMask enable_mask) {
    2080             : 
    2081           6 :         assert(u);
    2082             : 
    2083             :         /* Returns true if all controllers which should be enabled are indeed enabled.
    2084             :          *
    2085             :          * Unlike unit_has_mask_realized, we don't care about the controllers that are not present, only that anything
    2086             :          * we want to add is already added. */
    2087             : 
    2088           6 :         return u->cgroup_realized &&
    2089           6 :                 ((u->cgroup_realized_mask | target_mask) & CGROUP_MASK_V1) == (u->cgroup_realized_mask & CGROUP_MASK_V1) &&
    2090           0 :                 ((u->cgroup_enabled_mask | enable_mask) & CGROUP_MASK_V2) == (u->cgroup_enabled_mask & CGROUP_MASK_V2);
    2091             : }
    2092             : 
    2093         183 : void unit_add_to_cgroup_realize_queue(Unit *u) {
    2094         183 :         assert(u);
    2095             : 
    2096         183 :         if (u->in_cgroup_realize_queue)
    2097         167 :                 return;
    2098             : 
    2099          16 :         LIST_PREPEND(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
    2100          16 :         u->in_cgroup_realize_queue = true;
    2101             : }
    2102             : 
    2103           6 : static void unit_remove_from_cgroup_realize_queue(Unit *u) {
    2104           6 :         assert(u);
    2105             : 
    2106           6 :         if (!u->in_cgroup_realize_queue)
    2107           6 :                 return;
    2108             : 
    2109           0 :         LIST_REMOVE(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
    2110           0 :         u->in_cgroup_realize_queue = false;
    2111             : }
    2112             : 
    2113             : /* Controllers can only be enabled breadth-first, from the root of the
    2114             :  * hierarchy downwards to the unit in question. */
    2115           6 : static int unit_realize_cgroup_now_enable(Unit *u, ManagerState state) {
    2116             :         CGroupMask target_mask, enable_mask, new_target_mask, new_enable_mask;
    2117             :         int r;
    2118             : 
    2119           6 :         assert(u);
    2120             : 
    2121             :         /* First go deal with this unit's parent, or we won't be able to enable
    2122             :          * any new controllers at this layer. */
    2123           6 :         if (UNIT_ISSET(u->slice)) {
    2124           0 :                 r = unit_realize_cgroup_now_enable(UNIT_DEREF(u->slice), state);
    2125           0 :                 if (r < 0)
    2126           0 :                         return r;
    2127             :         }
    2128             : 
    2129           6 :         target_mask = unit_get_target_mask(u);
    2130           6 :         enable_mask = unit_get_enable_mask(u);
    2131             : 
    2132             :         /* We can only enable in this direction, don't try to disable anything.
    2133             :          */
    2134           6 :         if (unit_has_mask_enables_realized(u, target_mask, enable_mask))
    2135           0 :                 return 0;
    2136             : 
    2137           6 :         new_target_mask = u->cgroup_realized_mask | target_mask;
    2138           6 :         new_enable_mask = u->cgroup_enabled_mask | enable_mask;
    2139             : 
    2140           6 :         return unit_create_cgroup(u, new_target_mask, new_enable_mask, state);
    2141             : }
    2142             : 
    2143             : /* Controllers can only be disabled depth-first, from the leaves of the
    2144             :  * hierarchy upwards to the unit in question. */
    2145           6 : static int unit_realize_cgroup_now_disable(Unit *u, ManagerState state) {
    2146             :         Iterator i;
    2147             :         Unit *m;
    2148             :         void *v;
    2149             : 
    2150           6 :         assert(u);
    2151             : 
    2152           6 :         if (u->type != UNIT_SLICE)
    2153           6 :                 return 0;
    2154             : 
    2155           0 :         HASHMAP_FOREACH_KEY(v, m, u->dependencies[UNIT_BEFORE], i) {
    2156             :                 CGroupMask target_mask, enable_mask, new_target_mask, new_enable_mask;
    2157             :                 int r;
    2158             : 
    2159           0 :                 if (UNIT_DEREF(m->slice) != u)
    2160           0 :                         continue;
    2161             : 
    2162             :                 /* The cgroup for this unit might not actually be fully
    2163             :                  * realised yet, in which case it isn't holding any controllers
    2164             :                  * open anyway. */
    2165           0 :                 if (!m->cgroup_path)
    2166           0 :                         continue;
    2167             : 
    2168             :                 /* We must disable those below us first in order to release the
    2169             :                  * controller. */
    2170           0 :                 if (m->type == UNIT_SLICE)
    2171           0 :                         (void) unit_realize_cgroup_now_disable(m, state);
    2172             : 
    2173           0 :                 target_mask = unit_get_target_mask(m);
    2174           0 :                 enable_mask = unit_get_enable_mask(m);
    2175             : 
    2176             :                 /* We can only disable in this direction, don't try to enable
    2177             :                  * anything. */
    2178           0 :                 if (unit_has_mask_disables_realized(m, target_mask, enable_mask))
    2179           0 :                         continue;
    2180             : 
    2181           0 :                 new_target_mask = m->cgroup_realized_mask & target_mask;
    2182           0 :                 new_enable_mask = m->cgroup_enabled_mask & enable_mask;
    2183             : 
    2184           0 :                 r = unit_create_cgroup(m, new_target_mask, new_enable_mask, state);
    2185           0 :                 if (r < 0)
    2186           0 :                         return r;
    2187             :         }
    2188             : 
    2189           0 :         return 0;
    2190             : }
    2191             : 
    2192             : /* Check if necessary controllers and attributes for a unit are in place.
    2193             :  *
    2194             :  * - If so, do nothing.
    2195             :  * - If not, create paths, move processes over, and set attributes.
    2196             :  *
    2197             :  * Controllers can only be *enabled* in a breadth-first way, and *disabled* in
    2198             :  * a depth-first way. As such the process looks like this:
    2199             :  *
    2200             :  * Suppose we have a cgroup hierarchy which looks like this:
    2201             :  *
    2202             :  *             root
    2203             :  *            /    \
    2204             :  *           /      \
    2205             :  *          /        \
    2206             :  *         a          b
    2207             :  *        / \        / \
    2208             :  *       /   \      /   \
    2209             :  *      c     d    e     f
    2210             :  *     / \   / \  / \   / \
    2211             :  *     h i   j k  l m   n o
    2212             :  *
    2213             :  * 1. We want to realise cgroup "d" now.
    2214             :  * 2. cgroup "a" has DisableControllers=cpu in the associated unit.
    2215             :  * 3. cgroup "k" just started requesting the memory controller.
    2216             :  *
    2217             :  * To make this work we must do the following in order:
    2218             :  *
    2219             :  * 1. Disable CPU controller in k, j
    2220             :  * 2. Disable CPU controller in d
    2221             :  * 3. Enable memory controller in root
    2222             :  * 4. Enable memory controller in a
    2223             :  * 5. Enable memory controller in d
    2224             :  * 6. Enable memory controller in k
    2225             :  *
    2226             :  * Notice that we need to touch j in one direction, but not the other. We also
    2227             :  * don't go beyond d when disabling -- it's up to "a" to get realized if it
    2228             :  * wants to disable further. The basic rules are therefore:
    2229             :  *
    2230             :  * - If you're disabling something, you need to realise all of the cgroups from
    2231             :  *   your recursive descendants to the root. This starts from the leaves.
    2232             :  * - If you're enabling something, you need to realise from the root cgroup
    2233             :  *   downwards, but you don't need to iterate your recursive descendants.
    2234             :  *
    2235             :  * Returns 0 on success and < 0 on failure. */
    2236           6 : static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
    2237             :         CGroupMask target_mask, enable_mask;
    2238             :         int r;
    2239             : 
    2240           6 :         assert(u);
    2241             : 
    2242           6 :         unit_remove_from_cgroup_realize_queue(u);
    2243             : 
    2244           6 :         target_mask = unit_get_target_mask(u);
    2245           6 :         enable_mask = unit_get_enable_mask(u);
    2246             : 
    2247           6 :         if (unit_has_mask_realized(u, target_mask, enable_mask))
    2248           0 :                 return 0;
    2249             : 
    2250             :         /* Disable controllers below us, if there are any */
    2251           6 :         r = unit_realize_cgroup_now_disable(u, state);
    2252           6 :         if (r < 0)
    2253           0 :                 return r;
    2254             : 
    2255             :         /* Enable controllers above us, if there are any */
    2256           6 :         if (UNIT_ISSET(u->slice)) {
    2257           6 :                 r = unit_realize_cgroup_now_enable(UNIT_DEREF(u->slice), state);
    2258           6 :                 if (r < 0)
    2259           0 :                         return r;
    2260             :         }
    2261             : 
    2262             :         /* Now actually deal with the cgroup we were trying to realise and set attributes */
    2263           6 :         r = unit_create_cgroup(u, target_mask, enable_mask, state);
    2264           6 :         if (r < 0)
    2265           6 :                 return r;
    2266             : 
    2267             :         /* Now, reset the invalidation mask */
    2268           0 :         u->cgroup_invalidated_mask = 0;
    2269           0 :         return 0;
    2270             : }
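
The two traversal directions described above reduce to a pair of small recursions. A schematic sketch with hypothetical types, not the real Unit machinery:

#include <stddef.h>

typedef struct ExNode ExNode;
struct ExNode {
        ExNode *parent;
        ExNode **children;
        size_t n_children;
};

/* Depth-first: descendants must release a controller before their parent can. */
static void ex_disable(ExNode *n) {
        for (size_t i = 0; i < n->n_children; i++)
                ex_disable(n->children[i]);
        /* ...drop the controllers no longer wanted at n... */
}

/* Breadth-first, via recursion up the parent chain: the root must enable a
 * controller before any descendant may use it. */
static void ex_enable(ExNode *n) {
        if (n->parent)
                ex_enable(n->parent);
        /* ...turn on the newly wanted controllers at n... */
}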
    2271             : 
    2272           0 : unsigned manager_dispatch_cgroup_realize_queue(Manager *m) {
    2273             :         ManagerState state;
    2274           0 :         unsigned n = 0;
    2275             :         Unit *i;
    2276             :         int r;
    2277             : 
    2278           0 :         assert(m);
    2279             : 
    2280           0 :         state = manager_state(m);
    2281             : 
    2282           0 :         while ((i = m->cgroup_realize_queue)) {
    2283           0 :                 assert(i->in_cgroup_realize_queue);
    2284             : 
    2285           0 :                 if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(i))) {
    2286             :                         /* Maybe things changed, and the unit is not actually active anymore? */
    2287           0 :                         unit_remove_from_cgroup_realize_queue(i);
    2288           0 :                         continue;
    2289             :                 }
    2290             : 
    2291           0 :                 r = unit_realize_cgroup_now(i, state);
    2292           0 :                 if (r < 0)
    2293           0 :                         log_warning_errno(r, "Failed to realize cgroups for queued unit %s, ignoring: %m", i->id);
    2294             : 
    2295           0 :                 n++;
    2296             :         }
    2297             : 
    2298           0 :         return n;
    2299             : }
    2300             : 
    2301           6 : static void unit_add_siblings_to_cgroup_realize_queue(Unit *u) {
    2302             :         Unit *slice;
    2303             : 
    2304             :         /* This adds the siblings of the specified unit and the
    2305             :          * siblings of all parent units to the cgroup queue. (But
    2306             :          * neither the specified unit itself nor the parents.) */
    2307             : 
    2308          12 :         while ((slice = UNIT_DEREF(u->slice))) {
    2309             :                 Iterator i;
    2310             :                 Unit *m;
    2311             :                 void *v;
    2312             : 
    2313          12 :                 HASHMAP_FOREACH_KEY(v, m, u->dependencies[UNIT_BEFORE], i) {
    2314             :                         /* Skip units that have a dependency on the slice
    2315             :                          * but aren't actually in it. */
    2316           6 :                         if (UNIT_DEREF(m->slice) != slice)
    2317           6 :                                 continue;
    2318             : 
    2319             :                         /* No point in doing cgroup application for units
    2320             :                          * without active processes. */
    2321           0 :                         if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
    2322           0 :                                 continue;
    2323             : 
    2324             :                         /* If the unit doesn't need any new controllers
    2325             :                          * and has current ones realized, it doesn't need
    2326             :                          * any changes. */
    2327           0 :                         if (unit_has_mask_realized(m,
    2328             :                                                    unit_get_target_mask(m),
    2329             :                                                    unit_get_enable_mask(m)))
    2330           0 :                                 continue;
    2331             : 
    2332           0 :                         unit_add_to_cgroup_realize_queue(m);
    2333             :                 }
    2334             : 
    2335           6 :                 u = slice;
    2336             :         }
    2337           6 : }
    2338             : 
    2339           6 : int unit_realize_cgroup(Unit *u) {
    2340           6 :         assert(u);
    2341             : 
    2342           6 :         if (!UNIT_HAS_CGROUP_CONTEXT(u))
    2343           0 :                 return 0;
    2344             : 
    2345             :         /* So, here's the deal: when realizing the cgroups for this
    2346             :          * unit, we need to first create all parents, but there's
    2347             :          * actually more: for the weight-based controllers we also need to
    2348             :          * make sure that all our siblings (i.e. units that are in the
    2349             :          * same slice as we are) have cgroups, too. Otherwise, things
    2350             :          * would become very uneven as each of their processes would
    2351             :          * get as many resources as all of our group together. This call
    2352             :          * will synchronously create the parent cgroups, but will
    2353             :          * defer work on the siblings to the next event loop
    2354             :          * iteration. */
    2355             : 
    2356             :         /* Add all sibling slices to the cgroup queue. */
    2357           6 :         unit_add_siblings_to_cgroup_realize_queue(u);
    2358             : 
    2359             :         /* And realize this one now (and apply the values) */
    2360           6 :         return unit_realize_cgroup_now(u, manager_state(u->manager));
    2361             : }
    2362             : 
    2363        2172 : void unit_release_cgroup(Unit *u) {
    2364        2172 :         assert(u);
    2365             : 
    2366             :         /* Forgets all cgroup details for this cgroup — but does *not* destroy the cgroup. This is hence OK to call
    2367             :          * when we close down everything for reexecution, where we really want to leave the cgroup in place. */
    2368             : 
    2369        2172 :         if (u->cgroup_path) {
    2370          12 :                 (void) hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
    2371          12 :                 u->cgroup_path = mfree(u->cgroup_path);
    2372             :         }
    2373             : 
    2374        2172 :         if (u->cgroup_control_inotify_wd >= 0) {
    2375           0 :                 if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_control_inotify_wd) < 0)
    2376           0 :                         log_unit_debug_errno(u, errno, "Failed to remove cgroup control inotify watch %i for %s, ignoring: %m", u->cgroup_control_inotify_wd, u->id);
    2377             : 
    2378           0 :                 (void) hashmap_remove(u->manager->cgroup_control_inotify_wd_unit, INT_TO_PTR(u->cgroup_control_inotify_wd));
    2379           0 :                 u->cgroup_control_inotify_wd = -1;
    2380             :         }
    2381             : 
    2382        2172 :         if (u->cgroup_memory_inotify_wd >= 0) {
    2383           0 :                 if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_memory_inotify_wd) < 0)
    2384           0 :                         log_unit_debug_errno(u, errno, "Failed to remove cgroup memory inotify watch %i for %s, ignoring: %m", u->cgroup_memory_inotify_wd, u->id);
    2385             : 
    2386           0 :                 (void) hashmap_remove(u->manager->cgroup_memory_inotify_wd_unit, INT_TO_PTR(u->cgroup_memory_inotify_wd));
    2387           0 :                 u->cgroup_memory_inotify_wd = -1;
    2388             :         }
    2389        2172 : }
    2390             : 
    2391           7 : void unit_prune_cgroup(Unit *u) {
    2392             :         int r;
    2393             :         bool is_root_slice;
    2394             : 
    2395           7 :         assert(u);
    2396             : 
    2397             :         /* Removes the cgroup, if empty and possible, and stops watching it. */
    2398             : 
    2399           7 :         if (!u->cgroup_path)
    2400           7 :                 return;
    2401             : 
    2402           0 :         (void) unit_get_cpu_usage(u, NULL); /* Cache the last CPU usage value before we destroy the cgroup */
    2403             : 
    2404           0 :         is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE);
    2405             : 
    2406           0 :         r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !is_root_slice);
    2407           0 :         if (r < 0)
    2408             :                 /* One reason we could have failed here is that the cgroup still contains a process.
    2409             :                  * However, if the cgroup becomes removable at a later time, it might be removed when
    2410             :                  * the containing slice is stopped. So even if we failed now, this unit shouldn't assume
    2411             :                  * that the cgroup is still realized the next time it is started. Do not return early
    2412             :                  * on error, continue cleanup. */
    2413           0 :                 log_unit_full(u, r == -EBUSY ? LOG_DEBUG : LOG_WARNING, r, "Failed to destroy cgroup %s, ignoring: %m", u->cgroup_path);
    2414             : 
    2415           0 :         if (is_root_slice)
    2416           0 :                 return;
    2417             : 
    2418           0 :         unit_release_cgroup(u);
    2419             : 
    2420           0 :         u->cgroup_realized = false;
    2421           0 :         u->cgroup_realized_mask = 0;
    2422           0 :         u->cgroup_enabled_mask = 0;
    2423             : 
    2424           0 :         u->bpf_device_control_installed = bpf_program_unref(u->bpf_device_control_installed);
    2425             : }
    2426             : 
    2427           0 : int unit_search_main_pid(Unit *u, pid_t *ret) {
    2428           0 :         _cleanup_fclose_ FILE *f = NULL;
    2429           0 :         pid_t pid = 0, npid;
    2430             :         int r;
    2431             : 
    2432           0 :         assert(u);
    2433           0 :         assert(ret);
    2434             : 
    2435           0 :         if (!u->cgroup_path)
    2436           0 :                 return -ENXIO;
    2437             : 
    2438           0 :         r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f);
    2439           0 :         if (r < 0)
    2440           0 :                 return r;
    2441             : 
    2442           0 :         while (cg_read_pid(f, &npid) > 0)  {
    2443             : 
    2444           0 :                 if (npid == pid)
    2445           0 :                         continue;
    2446             : 
    2447           0 :                 if (pid_is_my_child(npid) == 0)
    2448           0 :                         continue;
    2449             : 
    2450           0 :                 if (pid != 0)
    2451             :                         /* Dang, there's more than one daemonized PID
    2452             :                          * in this group, so we don't know what process
    2453             :                          * is the main process. */
    2454             : 
    2455           0 :                         return -ENODATA;
    2456             : 
    2457           0 :                 pid = npid;
    2458             :         }
    2459             : 
    2460           0 :         *ret = pid;
    2461           0 :         return 0;
    2462             : }
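
cg_enumerate_processes() and cg_read_pid() effectively iterate over the cgroup's cgroup.procs file, which lists one member PID per line. A hand-rolled sketch (path handling simplified; illustrative only):

#include <stdio.h>

static int count_pids_sketch(const char *cgroup_dir) {
        char path[4096];
        FILE *f;
        int pid, n = 0;

        snprintf(path, sizeof(path), "%s/cgroup.procs", cgroup_dir);
        f = fopen(path, "re"); /* "e" sets O_CLOEXEC (glibc) */
        if (!f)
                return -1;

        /* one PID per line; on Linux a PID fits in an int */
        while (fscanf(f, "%d", &pid) == 1)
                n++;

        fclose(f);
        return n;
}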
    2463             : 
    2464           0 : static int unit_watch_pids_in_path(Unit *u, const char *path) {
    2465           0 :         _cleanup_closedir_ DIR *d = NULL;
    2466           0 :         _cleanup_fclose_ FILE *f = NULL;
    2467           0 :         int ret = 0, r;
    2468             : 
    2469           0 :         assert(u);
    2470           0 :         assert(path);
    2471             : 
    2472           0 :         r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, path, &f);
    2473           0 :         if (r < 0)
    2474           0 :                 ret = r;
    2475             :         else {
    2476             :                 pid_t pid;
    2477             : 
    2478           0 :                 while ((r = cg_read_pid(f, &pid)) > 0) {
    2479           0 :                         r = unit_watch_pid(u, pid, false);
    2480           0 :                         if (r < 0 && ret >= 0)
    2481           0 :                                 ret = r;
    2482             :                 }
    2483             : 
    2484           0 :                 if (r < 0 && ret >= 0)
    2485           0 :                         ret = r;
    2486             :         }
    2487             : 
    2488           0 :         r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d);
    2489           0 :         if (r < 0) {
    2490           0 :                 if (ret >= 0)
    2491           0 :                         ret = r;
    2492             :         } else {
    2493             :                 char *fn;
    2494             : 
    2495           0 :                 while ((r = cg_read_subgroup(d, &fn)) > 0) {
    2496           0 :                         _cleanup_free_ char *p = NULL;
    2497             : 
    2498           0 :                         p = path_join(empty_to_root(path), fn);
    2499           0 :                         free(fn);
    2500             : 
    2501           0 :                         if (!p)
    2502           0 :                                 return -ENOMEM;
    2503             : 
    2504           0 :                         r = unit_watch_pids_in_path(u, p);
    2505           0 :                         if (r < 0 && ret >= 0)
    2506           0 :                                 ret = r;
    2507             :                 }
    2508             : 
    2509           0 :                 if (r < 0 && ret >= 0)
    2510           0 :                         ret = r;
    2511             :         }
    2512             : 
    2513           0 :         return ret;
    2514             : }
    2515             : 
    2516           0 : int unit_synthesize_cgroup_empty_event(Unit *u) {
    2517             :         int r;
    2518             : 
    2519           0 :         assert(u);
    2520             : 
    2521             :         /* Enqueue a synthetic cgroup empty event if this unit doesn't watch any PIDs anymore. This is compatibility
    2522             :          * support for non-unified systems where notifications aren't reliable, and where we hence need to take whatever
    2523             :          * we can get as a notification source as soon as we stop having any useful PIDs to watch for. */
    2524             : 
    2525           0 :         if (!u->cgroup_path)
    2526           0 :                 return -ENOENT;
    2527             : 
    2528           0 :         r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
    2529           0 :         if (r < 0)
    2530           0 :                 return r;
    2531           0 :         if (r > 0) /* On unified we have reliable notifications, and don't need this */
    2532           0 :                 return 0;
    2533             : 
    2534           0 :         if (!set_isempty(u->pids))
    2535           0 :                 return 0;
    2536             : 
    2537           0 :         unit_add_to_cgroup_empty_queue(u);
    2538           0 :         return 0;
    2539             : }
    2540             : 
    2541           0 : int unit_watch_all_pids(Unit *u) {
    2542             :         int r;
    2543             : 
    2544           0 :         assert(u);
    2545             : 
    2546             :         /* Adds all PIDs from our cgroup to the set of PIDs we
    2547             :          * watch. This is fallback logic for cases where we do not
    2548             :          * get reliable cgroup empty notifications: we try to use
    2549             :          * SIGCHLD as replacement. */
    2550             : 
    2551           0 :         if (!u->cgroup_path)
    2552           0 :                 return -ENOENT;
    2553             : 
    2554           0 :         r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
    2555           0 :         if (r < 0)
    2556           0 :                 return r;
    2557           0 :         if (r > 0) /* On unified we can use proper notifications */
    2558           0 :                 return 0;
    2559             : 
    2560           0 :         return unit_watch_pids_in_path(u, u->cgroup_path);
    2561             : }
    2562             : 
    2563           0 : static int on_cgroup_empty_event(sd_event_source *s, void *userdata) {
    2564           0 :         Manager *m = userdata;
    2565             :         Unit *u;
    2566             :         int r;
    2567             : 
    2568           0 :         assert(s);
    2569           0 :         assert(m);
    2570             : 
    2571           0 :         u = m->cgroup_empty_queue;
    2572           0 :         if (!u)
    2573           0 :                 return 0;
    2574             : 
    2575           0 :         assert(u->in_cgroup_empty_queue);
    2576           0 :         u->in_cgroup_empty_queue = false;
    2577           0 :         LIST_REMOVE(cgroup_empty_queue, m->cgroup_empty_queue, u);
    2578             : 
    2579           0 :         if (m->cgroup_empty_queue) {
    2580             :                 /* More stuff queued, let's make sure we remain enabled */
    2581           0 :                 r = sd_event_source_set_enabled(s, SD_EVENT_ONESHOT);
    2582           0 :                 if (r < 0)
    2583           0 :                         log_debug_errno(r, "Failed to reenable cgroup empty event source, ignoring: %m");
    2584             :         }
    2585             : 
    2586           0 :         unit_add_to_gc_queue(u);
    2587             : 
    2588           0 :         if (UNIT_VTABLE(u)->notify_cgroup_empty)
    2589           0 :                 UNIT_VTABLE(u)->notify_cgroup_empty(u);
    2590             : 
    2591           0 :         return 0;
    2592             : }
    2593             : 
    2594           0 : void unit_add_to_cgroup_empty_queue(Unit *u) {
    2595             :         int r;
    2596             : 
    2597           0 :         assert(u);
    2598             : 
    2599             :         /* Note that there are four different ways in which cgroup empty events reach us:
    2600             :          *
    2601             :          * 1. On the unified hierarchy we get an inotify event on the cgroup
    2602             :          *
    2603             :          * 2. On the legacy hierarchy, when running in system mode, we get a datagram on the cgroup agent socket
    2604             :          *
    2605             :          * 3. On the legacy hierarchy, when running in user mode, we get a D-Bus signal on the system bus
    2606             :          *
    2607             :          * 4. On the legacy hierarchy, in service units we start watching all processes of the cgroup for SIGCHLD as
    2608             :          *    soon as we get one SIGCHLD, to deal with unreliable cgroup notifications.
    2609             :          *
    2610             :          * Regardless of which way we got the notification, we'll verify it here, and then add it to a separate
    2611             :          * queue. This queue will be dispatched at a lower priority than the SIGCHLD handler, so that we always use
    2612             :          * SIGCHLD if we can get it first, and only use the cgroup empty notifications if there's no SIGCHLD pending
    2613             :          * (which might happen if the cgroup doesn't contain processes that are our own children, which is typically the
    2614             :          * case for scope units). */
    2615             : 
    2616           0 :         if (u->in_cgroup_empty_queue)
    2617           0 :                 return;
    2618             : 
    2619             :         /* Let's verify that the cgroup is really empty */
    2620           0 :         if (!u->cgroup_path)
    2621           0 :                 return;
    2622           0 :         r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
    2623           0 :         if (r < 0) {
    2624           0 :                 log_unit_debug_errno(u, r, "Failed to determine whether cgroup %s is empty: %m", u->cgroup_path);
    2625           0 :                 return;
    2626             :         }
    2627           0 :         if (r == 0)
    2628           0 :                 return;
    2629             : 
    2630           0 :         LIST_PREPEND(cgroup_empty_queue, u->manager->cgroup_empty_queue, u);
    2631           0 :         u->in_cgroup_empty_queue = true;
    2632             : 
    2633             :         /* Trigger the defer event */
    2634           0 :         r = sd_event_source_set_enabled(u->manager->cgroup_empty_event_source, SD_EVENT_ONESHOT);
    2635           0 :         if (r < 0)
    2636           0 :                 log_debug_errno(r, "Failed to enable cgroup empty event source: %m");
    2637             : }
    2638             : 
    2639           0 : int unit_check_oom(Unit *u) {
    2640           0 :         _cleanup_free_ char *oom_kill = NULL;
    2641             :         bool increased;
    2642             :         uint64_t c;
    2643             :         int r;
    2644             : 
    2645           0 :         if (!u->cgroup_path)
    2646           0 :                 return 0;
    2647             : 
    2648           0 :         r = cg_get_keyed_attribute("memory", u->cgroup_path, "memory.events", STRV_MAKE("oom_kill"), &oom_kill);
    2649           0 :         if (r < 0)
    2650           0 :                 return log_unit_debug_errno(u, r, "Failed to read oom_kill field of memory.events cgroup attribute: %m");
    2651             : 
    2652           0 :         r = safe_atou64(oom_kill, &c);
    2653           0 :         if (r < 0)
    2654           0 :                 return log_unit_debug_errno(u, r, "Failed to parse oom_kill field: %m");
    2655             : 
    2656           0 :         increased = c > u->oom_kill_last;
    2657           0 :         u->oom_kill_last = c;
    2658             : 
    2659           0 :         if (!increased)
    2660           0 :                 return 0;
    2661             : 
    2662           0 :         log_struct(LOG_NOTICE,
    2663             :                    "MESSAGE_ID=" SD_MESSAGE_UNIT_OUT_OF_MEMORY_STR,
    2664             :                    LOG_UNIT_ID(u),
    2665             :                    LOG_UNIT_INVOCATION_ID(u),
    2666             :                    LOG_UNIT_MESSAGE(u, "A process of this unit has been killed by the OOM killer."));
    2667             : 
    2668           0 :         if (UNIT_VTABLE(u)->notify_cgroup_oom)
    2669           0 :                 UNIT_VTABLE(u)->notify_cgroup_oom(u);
    2670             : 
    2671           0 :         return 1;
    2672             : }
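
memory.events is a flat "key value" file, which is what cg_get_keyed_attribute() parses above. A rough standalone sketch for extracting the single oom_kill field (hypothetical path, not the systemd helper):

#include <inttypes.h>
#include <stdio.h>
#include <string.h>

static int read_oom_kill_sketch(const char *path, uint64_t *ret) {
        char line[256];
        FILE *f;

        f = fopen(path, "re");
        if (!f)
                return -1;

        /* each line is "<key> <value>\n"; we only care about oom_kill */
        while (fgets(line, sizeof(line), f))
                if (strncmp(line, "oom_kill ", 9) == 0) {
                        int k = sscanf(line + 9, "%" SCNu64, ret);
                        fclose(f);
                        return k == 1 ? 0 : -1;
                }

        fclose(f);
        return -1; /* field not present */
}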
    2673             : 
    2674           0 : static int on_cgroup_oom_event(sd_event_source *s, void *userdata) {
    2675           0 :         Manager *m = userdata;
    2676             :         Unit *u;
    2677             :         int r;
    2678             : 
    2679           0 :         assert(s);
    2680           0 :         assert(m);
    2681             : 
    2682           0 :         u = m->cgroup_oom_queue;
    2683           0 :         if (!u)
    2684           0 :                 return 0;
    2685             : 
    2686           0 :         assert(u->in_cgroup_oom_queue);
    2687           0 :         u->in_cgroup_oom_queue = false;
    2688           0 :         LIST_REMOVE(cgroup_oom_queue, m->cgroup_oom_queue, u);
    2689             : 
    2690           0 :         if (m->cgroup_oom_queue) {
    2691             :                 /* More stuff queued, let's make sure we remain enabled */
    2692           0 :                 r = sd_event_source_set_enabled(s, SD_EVENT_ONESHOT);
    2693           0 :                 if (r < 0)
    2694           0 :                         log_debug_errno(r, "Failed to reenable cgroup oom event source, ignoring: %m");
    2695             :         }
    2696             : 
    2697           0 :         (void) unit_check_oom(u);
    2698           0 :         return 0;
    2699             : }
    2700             : 
    2701           0 : static void unit_add_to_cgroup_oom_queue(Unit *u) {
    2702             :         int r;
    2703             : 
    2704           0 :         assert(u);
    2705             : 
    2706           0 :         if (u->in_cgroup_oom_queue)
    2707           0 :                 return;
    2708           0 :         if (!u->cgroup_path)
    2709           0 :                 return;
    2710             : 
    2711           0 :         LIST_PREPEND(cgroup_oom_queue, u->manager->cgroup_oom_queue, u);
    2712           0 :         u->in_cgroup_oom_queue = true;
    2713             : 
    2714             :         /* Trigger the defer event */
    2715           0 :         if (!u->manager->cgroup_oom_event_source) {
    2716           0 :                 _cleanup_(sd_event_source_unrefp) sd_event_source *s = NULL;
    2717             : 
    2718           0 :                 r = sd_event_add_defer(u->manager->event, &s, on_cgroup_oom_event, u->manager);
    2719           0 :                 if (r < 0) {
    2720           0 :                         log_error_errno(r, "Failed to create cgroup oom event source: %m");
    2721           0 :                         return;
    2722             :                 }
    2723             : 
    2724           0 :                 r = sd_event_source_set_priority(s, SD_EVENT_PRIORITY_NORMAL-8);
    2725           0 :                 if (r < 0) {
    2726           0 :                         log_error_errno(r, "Failed to set priority of cgroup oom event source: %m");
    2727           0 :                         return;
    2728             :                 }
    2729             : 
    2730           0 :                 (void) sd_event_source_set_description(s, "cgroup-oom");
    2731           0 :                 u->manager->cgroup_oom_event_source = TAKE_PTR(s);
    2732             :         }
    2733             : 
    2734           0 :         r = sd_event_source_set_enabled(u->manager->cgroup_oom_event_source, SD_EVENT_ONESHOT);
    2735           0 :         if (r < 0)
    2736           0 :                 log_error_errno(r, "Failed to enable cgroup oom event source: %m");
    2737             : }
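
The lazily-allocated one-shot defer source used above is a generic sd-event pattern. A minimal sketch with placeholder names:

#include <systemd/sd-event.h>

static int example_cb(sd_event_source *s, void *userdata) {
        /* dispatch one queued item; re-enable ONESHOT if more remain */
        return 0;
}

static int arm_defer_sketch(sd_event *e, sd_event_source **cached) {
        int r;

        if (!*cached) {
                r = sd_event_add_defer(e, cached, example_cb, NULL);
                if (r < 0)
                        return r;

                /* run after more urgent dispatchers, mirroring the negative
                 * priority offset used above */
                (void) sd_event_source_set_priority(*cached, SD_EVENT_PRIORITY_NORMAL - 8);
                (void) sd_event_source_set_description(*cached, "example-defer");
        }

        /* ONESHOT: fire once on the next loop iteration, then auto-disable */
        return sd_event_source_set_enabled(*cached, SD_EVENT_ONESHOT);
}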
    2738             : 
    2739           0 : static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
    2740           0 :         Manager *m = userdata;
    2741             : 
    2742           0 :         assert(s);
    2743           0 :         assert(fd >= 0);
    2744           0 :         assert(m);
    2745             : 
    2746           0 :         for (;;) {
    2747             :                 union inotify_event_buffer buffer;
    2748             :                 struct inotify_event *e;
    2749             :                 ssize_t l;
    2750             : 
    2751           0 :                 l = read(fd, &buffer, sizeof(buffer));
    2752           0 :                 if (l < 0) {
    2753           0 :                         if (IN_SET(errno, EINTR, EAGAIN))
    2754           0 :                                 return 0;
    2755             : 
    2756           0 :                         return log_error_errno(errno, "Failed to read control group inotify events: %m");
    2757             :                 }
    2758             : 
    2759           0 :                 FOREACH_INOTIFY_EVENT(e, buffer, l) {
    2760             :                         Unit *u;
    2761             : 
    2762           0 :                         if (e->wd < 0)
    2763             :                                 /* Queue overflow has no watch descriptor */
    2764           0 :                                 continue;
    2765             : 
    2766           0 :                         if (e->mask & IN_IGNORED)
    2767             :                                 /* The watch was just removed */
    2768           0 :                                 continue;
    2769             : 
    2770             :                         /* Note that inotify might deliver events for a watch even after it was removed,
     2771             :                          * because it was queued before the removal. We can safely ignore such events here. */
    2772             : 
    2773           0 :                         u = hashmap_get(m->cgroup_control_inotify_wd_unit, INT_TO_PTR(e->wd));
    2774           0 :                         if (u)
    2775           0 :                                 unit_add_to_cgroup_empty_queue(u);
    2776             : 
    2777           0 :                         u = hashmap_get(m->cgroup_memory_inotify_wd_unit, INT_TO_PTR(e->wd));
    2778           0 :                         if (u)
    2779           0 :                                 unit_add_to_cgroup_oom_queue(u);
    2780             :                 }
    2781             :         }
    2782             : }
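
/* Editor's sketch (not part of cgroup.c): the drain loop that FOREACH_INOTIFY_EVENT wraps,
 * written against the plain kernel API (see inotify(7)); 'fd' is assumed to be a
 * non-blocking inotify file descriptor. */
#include <errno.h>
#include <stdio.h>
#include <sys/inotify.h>
#include <unistd.h>

static int drain_inotify(int fd) {
        char buf[4096] __attribute__((aligned(__alignof__(struct inotify_event))));

        for (;;) {
                ssize_t l = read(fd, buf, sizeof(buf));
                if (l < 0)
                        return errno == EAGAIN || errno == EINTR ? 0 : -errno;

                for (char *p = buf; p < buf + l; ) {
                        const struct inotify_event *e = (const struct inotify_event *) p;

                        /* Same filtering as above: skip queue overflow (wd < 0) and
                         * watch-removal events (IN_IGNORED). */
                        if (e->wd >= 0 && !(e->mask & IN_IGNORED))
                                printf("event on wd %i, mask %#x\n", e->wd, e->mask);

                        p += sizeof(struct inotify_event) + e->len;  /* events are variable-sized */
                }
        }
}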
    2783             : 
    2784          11 : static int cg_bpf_mask_supported(CGroupMask *ret) {
    2785          11 :         CGroupMask mask = 0;
    2786             :         int r;
    2787             : 
    2788             :         /* BPF-based firewall */
    2789          11 :         r = bpf_firewall_supported();
    2790          11 :         if (r > 0)
    2791           0 :                 mask |= CGROUP_MASK_BPF_FIREWALL;
    2792             : 
    2793             :         /* BPF-based device access control */
    2794          11 :         r = bpf_devices_supported();
    2795          11 :         if (r > 0)
    2796           0 :                 mask |= CGROUP_MASK_BPF_DEVICES;
    2797             : 
    2798          11 :         *ret = mask;
    2799          11 :         return 0;
    2800             : }
    2801             : 
    2802          11 : int manager_setup_cgroup(Manager *m) {
    2803          11 :         _cleanup_free_ char *path = NULL;
    2804             :         const char *scope_path;
    2805             :         CGroupController c;
    2806             :         int r, all_unified;
    2807             :         CGroupMask mask;
    2808             :         char *e;
    2809             : 
    2810          11 :         assert(m);
    2811             : 
    2812             :         /* 1. Determine hierarchy */
    2813          11 :         m->cgroup_root = mfree(m->cgroup_root);
    2814          11 :         r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
    2815          11 :         if (r < 0)
    2816           0 :                 return log_error_errno(r, "Cannot determine cgroup we are running in: %m");
    2817             : 
    2818             :         /* Chop off the init scope, if we are already located in it */
    2819          11 :         e = endswith(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
    2820             : 
    2821             :         /* LEGACY: Also chop off the system slice if we are in
    2822             :          * it. This is to support live upgrades from older systemd
    2823             :          * versions where PID 1 was moved there. Also see
    2824             :          * cg_get_root_path(). */
    2825          11 :         if (!e && MANAGER_IS_SYSTEM(m)) {
    2826           0 :                 e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
    2827           0 :                 if (!e)
    2828           0 :                         e = endswith(m->cgroup_root, "/system"); /* even more legacy */
    2829             :         }
    2830          11 :         if (e)
    2831           0 :                 *e = 0;
    2832             : 
    2833             :         /* And make sure to store away the root value without trailing slash, even for the root dir, so that we can
    2834             :          * easily prepend it everywhere. */
    2835          11 :         delete_trailing_chars(m->cgroup_root, "/");
    2836             : 
    2837             :         /* 2. Show data */
    2838          11 :         r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
    2839          11 :         if (r < 0)
    2840           0 :                 return log_error_errno(r, "Cannot find cgroup mount point: %m");
    2841             : 
    2842          11 :         r = cg_unified_flush();
    2843          11 :         if (r < 0)
    2844           0 :                 return log_error_errno(r, "Couldn't determine if we are running in the unified hierarchy: %m");
    2845             : 
    2846          11 :         all_unified = cg_all_unified();
    2847          11 :         if (all_unified < 0)
    2848           0 :                 return log_error_errno(all_unified, "Couldn't determine whether we are in all unified mode: %m");
    2849          11 :         if (all_unified > 0)
    2850           0 :                 log_debug("Unified cgroup hierarchy is located at %s.", path);
    2851             :         else {
    2852          11 :                 r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
    2853          11 :                 if (r < 0)
    2854           0 :                         return log_error_errno(r, "Failed to determine whether systemd's own controller is in unified mode: %m");
    2855          11 :                 if (r > 0)
    2856          11 :                         log_debug("Unified cgroup hierarchy is located at %s. Controllers are on legacy hierarchies.", path);
    2857             :                 else
    2858           0 :                         log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER_LEGACY ". File system hierarchy is at %s.", path);
    2859             :         }
    2860             : 
    2861             :         /* 3. Allocate cgroup empty defer event source */
    2862          11 :         m->cgroup_empty_event_source = sd_event_source_unref(m->cgroup_empty_event_source);
    2863          11 :         r = sd_event_add_defer(m->event, &m->cgroup_empty_event_source, on_cgroup_empty_event, m);
    2864          11 :         if (r < 0)
    2865           0 :                 return log_error_errno(r, "Failed to create cgroup empty event source: %m");
    2866             : 
     2867             :         /* Schedule cgroup empty checks early, but after having processed service notification messages and
     2868             :          * SIGCHLD signals, so that a cgroup running empty is only ever the last safety net of state change
     2869             :          * notification, after the metadata that notification messages and SIGCHLD provide has been collected. */
    2870          11 :         r = sd_event_source_set_priority(m->cgroup_empty_event_source, SD_EVENT_PRIORITY_NORMAL-5);
    2871          11 :         if (r < 0)
    2872           0 :                 return log_error_errno(r, "Failed to set priority of cgroup empty event source: %m");
    2873             : 
    2874          11 :         r = sd_event_source_set_enabled(m->cgroup_empty_event_source, SD_EVENT_OFF);
    2875          11 :         if (r < 0)
    2876           0 :                 return log_error_errno(r, "Failed to disable cgroup empty event source: %m");
    2877             : 
    2878          11 :         (void) sd_event_source_set_description(m->cgroup_empty_event_source, "cgroup-empty");
    2879             : 
    2880             :         /* 4. Install notifier inotify object, or agent */
    2881          11 :         if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
    2882             : 
    2883             :                 /* In the unified hierarchy we can get cgroup empty notifications via inotify. */
    2884             : 
    2885          11 :                 m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
    2886          11 :                 safe_close(m->cgroup_inotify_fd);
    2887             : 
    2888          11 :                 m->cgroup_inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
    2889          11 :                 if (m->cgroup_inotify_fd < 0)
    2890           0 :                         return log_error_errno(errno, "Failed to create control group inotify object: %m");
    2891             : 
    2892          11 :                 r = sd_event_add_io(m->event, &m->cgroup_inotify_event_source, m->cgroup_inotify_fd, EPOLLIN, on_cgroup_inotify_event, m);
    2893          11 :                 if (r < 0)
    2894           0 :                         return log_error_errno(r, "Failed to watch control group inotify object: %m");
    2895             : 
    2896             :                 /* Process cgroup empty notifications early. Note that when this event is dispatched it'll
    2897             :                  * just add the unit to a cgroup empty queue, hence let's run earlier than that. Also see
    2898             :                  * handling of cgroup agent notifications, for the classic cgroup hierarchy support. */
    2899          11 :                 r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_NORMAL-9);
    2900          11 :                 if (r < 0)
    2901           0 :                         return log_error_errno(r, "Failed to set priority of inotify event source: %m");
    2902             : 
    2903          11 :                 (void) sd_event_source_set_description(m->cgroup_inotify_event_source, "cgroup-inotify");
    2904             : 
    2905           0 :         } else if (MANAGER_IS_SYSTEM(m) && manager_owns_host_root_cgroup(m) && !MANAGER_IS_TEST_RUN(m)) {
    2906             : 
     2907             :                 /* On the legacy hierarchy we only get notifications via cgroup agents. (Which isn't really reliable,
     2908             :                  * since it does not generate events when control groups with children run empty.) */
    2909             : 
    2910           0 :                 r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH);
    2911           0 :                 if (r < 0)
    2912           0 :                         log_warning_errno(r, "Failed to install release agent, ignoring: %m");
    2913           0 :                 else if (r > 0)
    2914           0 :                         log_debug("Installed release agent.");
    2915           0 :                 else if (r == 0)
    2916           0 :                         log_debug("Release agent already installed.");
    2917             :         }
    2918             : 
    2919             :         /* 5. Make sure we are in the special "init.scope" unit in the root slice. */
    2920          55 :         scope_path = strjoina(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
    2921          11 :         r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
    2922          11 :         if (r >= 0) {
    2923             :                 /* Also, move all other userspace processes remaining in the root cgroup into that scope. */
    2924           0 :                 r = cg_migrate(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
    2925           0 :                 if (r < 0)
    2926           0 :                         log_warning_errno(r, "Couldn't move remaining userspace processes, ignoring: %m");
    2927             : 
    2928             :                 /* 6. And pin it, so that it cannot be unmounted */
    2929           0 :                 safe_close(m->pin_cgroupfs_fd);
    2930           0 :                 m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
    2931           0 :                 if (m->pin_cgroupfs_fd < 0)
    2932           0 :                         return log_error_errno(errno, "Failed to open pin file: %m");
    2933             : 
    2934          11 :         } else if (!MANAGER_IS_TEST_RUN(m))
    2935           0 :                 return log_error_errno(r, "Failed to create %s control group: %m", scope_path);
    2936             : 
    2937             :         /* 7. Always enable hierarchical support if it exists... */
    2938          11 :         if (!all_unified && !MANAGER_IS_TEST_RUN(m))
    2939           0 :                 (void) cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
    2940             : 
    2941             :         /* 8. Figure out which controllers are supported */
    2942          11 :         r = cg_mask_supported(&m->cgroup_supported);
    2943          11 :         if (r < 0)
    2944           0 :                 return log_error_errno(r, "Failed to determine supported controllers: %m");
    2945             : 
    2946             :         /* 9. Figure out which bpf-based pseudo-controllers are supported */
    2947          11 :         r = cg_bpf_mask_supported(&mask);
    2948          11 :         if (r < 0)
    2949           0 :                 return log_error_errno(r, "Failed to determine supported bpf-based pseudo-controllers: %m");
    2950          11 :         m->cgroup_supported |= mask;
    2951             : 
    2952             :         /* 10. Log which controllers are supported */
    2953         110 :         for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++)
    2954          99 :                 log_debug("Controller '%s' supported: %s", cgroup_controller_to_string(c), yes_no(m->cgroup_supported & CGROUP_CONTROLLER_TO_MASK(c)));
    2955             : 
    2956          11 :         return 0;
    2957             : }
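
/* Editor's sketch (not part of cgroup.c): distinguishing the unified (cgroup v2) hierarchy
 * from the legacy one — roughly what the cg_unified_flush()/cg_all_unified() probing above
 * boils down to — can be done by checking the filesystem magic of /sys/fs/cgroup: */
#include <errno.h>
#include <linux/magic.h>        /* CGROUP2_SUPER_MAGIC */
#include <sys/statfs.h>

static int is_all_unified(void) {
        struct statfs fs;

        if (statfs("/sys/fs/cgroup", &fs) < 0)
                return -errno;

        return fs.f_type == CGROUP2_SUPER_MAGIC;  /* cgroup2 mounted at the top level */
}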
    2958             : 
    2959          14 : void manager_shutdown_cgroup(Manager *m, bool delete) {
    2960          14 :         assert(m);
    2961             : 
    2962             :         /* We can't really delete the group, since we are in it. But
    2963             :          * let's trim it. */
    2964          14 :         if (delete && m->cgroup_root && m->test_run_flags != MANAGER_TEST_RUN_MINIMAL)
    2965           0 :                 (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);
    2966             : 
    2967          14 :         m->cgroup_empty_event_source = sd_event_source_unref(m->cgroup_empty_event_source);
    2968             : 
    2969          14 :         m->cgroup_control_inotify_wd_unit = hashmap_free(m->cgroup_control_inotify_wd_unit);
    2970          14 :         m->cgroup_memory_inotify_wd_unit = hashmap_free(m->cgroup_memory_inotify_wd_unit);
    2971             : 
    2972          14 :         m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
    2973          14 :         m->cgroup_inotify_fd = safe_close(m->cgroup_inotify_fd);
    2974             : 
    2975          14 :         m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);
    2976             : 
    2977          14 :         m->cgroup_root = mfree(m->cgroup_root);
    2978          14 : }
    2979             : 
    2980           0 : Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
    2981             :         char *p;
    2982             :         Unit *u;
    2983             : 
    2984           0 :         assert(m);
    2985           0 :         assert(cgroup);
    2986             : 
    2987           0 :         u = hashmap_get(m->cgroup_unit, cgroup);
    2988           0 :         if (u)
    2989           0 :                 return u;
    2990             : 
    2991           0 :         p = strdupa(cgroup);
    2992           0 :         for (;;) {
    2993             :                 char *e;
    2994             : 
    2995           0 :                 e = strrchr(p, '/');
    2996           0 :                 if (!e || e == p)
    2997           0 :                         return hashmap_get(m->cgroup_unit, SPECIAL_ROOT_SLICE);
    2998             : 
    2999           0 :                 *e = 0;
    3000             : 
    3001           0 :                 u = hashmap_get(m->cgroup_unit, p);
    3002           0 :                 if (u)
    3003           0 :                         return u;
    3004             :         }
    3005             : }
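
/* Editor's sketch (not part of cgroup.c): the prefix walk above in isolation. Each pass
 * drops the last path component and retries the lookup, so the deepest unit owning a
 * prefix of the cgroup path wins; the printf() stands in for hashmap_get(). */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void) {
        char *p = strdup("/system.slice/foo.service/payload");
        if (!p)
                return 1;

        for (;;) {
                printf("lookup: %s\n", p);

                char *e = strrchr(p, '/');
                if (!e || e == p)
                        break;          /* here the code above falls back to the root slice */
                *e = 0;
        }

        free(p);
        return 0;
}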
    3006             : 
    3007           0 : Unit *manager_get_unit_by_pid_cgroup(Manager *m, pid_t pid) {
    3008           0 :         _cleanup_free_ char *cgroup = NULL;
    3009             : 
    3010           0 :         assert(m);
    3011             : 
    3012           0 :         if (!pid_is_valid(pid))
    3013           0 :                 return NULL;
    3014             : 
    3015           0 :         if (cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup) < 0)
    3016           0 :                 return NULL;
    3017             : 
    3018           0 :         return manager_get_unit_by_cgroup(m, cgroup);
    3019             : }
    3020             : 
    3021           0 : Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
    3022             :         Unit *u, **array;
    3023             : 
    3024           0 :         assert(m);
    3025             : 
     3026             :         /* Note that a process might be owned by multiple units; we return only one here, which is good enough
     3027             :          * for most cases, though not strictly correct. We prefer the one reported by cgroup membership, as that's
     3028             :          * the most relevant one: children of the process will be assigned to it, too, before all else. */
    3029             : 
    3030           0 :         if (!pid_is_valid(pid))
    3031           0 :                 return NULL;
    3032             : 
    3033           0 :         if (pid == getpid_cached())
    3034           0 :                 return hashmap_get(m->units, SPECIAL_INIT_SCOPE);
    3035             : 
    3036           0 :         u = manager_get_unit_by_pid_cgroup(m, pid);
    3037           0 :         if (u)
    3038           0 :                 return u;
    3039             : 
    3040           0 :         u = hashmap_get(m->watch_pids, PID_TO_PTR(pid));
    3041           0 :         if (u)
    3042           0 :                 return u;
    3043             : 
    3044           0 :         array = hashmap_get(m->watch_pids, PID_TO_PTR(-pid));
    3045           0 :         if (array)
    3046           0 :                 return array[0];
    3047             : 
    3048           0 :         return NULL;
    3049             : }
    3050             : 
    3051           0 : int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
    3052             :         Unit *u;
    3053             : 
    3054           0 :         assert(m);
    3055           0 :         assert(cgroup);
    3056             : 
    3057             :         /* Called on the legacy hierarchy whenever we get an explicit cgroup notification from the cgroup agent process
     3058             :          * or from the --system instance. */
    3059             : 
    3060           0 :         log_debug("Got cgroup empty notification for: %s", cgroup);
    3061             : 
    3062           0 :         u = manager_get_unit_by_cgroup(m, cgroup);
    3063           0 :         if (!u)
    3064           0 :                 return 0;
    3065             : 
    3066           0 :         unit_add_to_cgroup_empty_queue(u);
    3067           0 :         return 1;
    3068             : }
    3069             : 
    3070           0 : int unit_get_memory_current(Unit *u, uint64_t *ret) {
    3071           0 :         _cleanup_free_ char *v = NULL;
    3072             :         int r;
    3073             : 
    3074           0 :         assert(u);
    3075           0 :         assert(ret);
    3076             : 
    3077           0 :         if (!UNIT_CGROUP_BOOL(u, memory_accounting))
    3078           0 :                 return -ENODATA;
    3079             : 
    3080           0 :         if (!u->cgroup_path)
    3081           0 :                 return -ENODATA;
    3082             : 
    3083             :         /* The root cgroup doesn't expose this information, let's get it from /proc instead */
    3084           0 :         if (unit_has_host_root_cgroup(u))
    3085           0 :                 return procfs_memory_get_used(ret);
    3086             : 
    3087           0 :         if ((u->cgroup_realized_mask & CGROUP_MASK_MEMORY) == 0)
    3088           0 :                 return -ENODATA;
    3089             : 
    3090           0 :         r = cg_all_unified();
    3091           0 :         if (r < 0)
    3092           0 :                 return r;
    3093           0 :         if (r > 0)
    3094           0 :                 r = cg_get_attribute("memory", u->cgroup_path, "memory.current", &v);
    3095             :         else
    3096           0 :                 r = cg_get_attribute("memory", u->cgroup_path, "memory.usage_in_bytes", &v);
    3097           0 :         if (r == -ENOENT)
    3098           0 :                 return -ENODATA;
    3099           0 :         if (r < 0)
    3100           0 :                 return r;
    3101             : 
    3102           0 :         return safe_atou64(v, ret);
    3103             : }
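
/* Editor's sketch (not part of cgroup.c): attributes such as memory.current hold a single
 * decimal integer, so reading one reduces to the following; the real code goes through
 * cg_get_attribute() + safe_atou64(), and the path in the usage note is hypothetical. */
#include <inttypes.h>
#include <stdio.h>

static int read_u64_attribute(const char *path, uint64_t *ret) {
        FILE *f = fopen(path, "re");
        if (!f)
                return -1;

        int k = fscanf(f, "%" SCNu64, ret);
        fclose(f);

        return k == 1 ? 0 : -1;
}

/* e.g.: read_u64_attribute("/sys/fs/cgroup/foo.service/memory.current", &v); */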
    3104             : 
    3105           0 : int unit_get_tasks_current(Unit *u, uint64_t *ret) {
    3106           0 :         _cleanup_free_ char *v = NULL;
    3107             :         int r;
    3108             : 
    3109           0 :         assert(u);
    3110           0 :         assert(ret);
    3111             : 
    3112           0 :         if (!UNIT_CGROUP_BOOL(u, tasks_accounting))
    3113           0 :                 return -ENODATA;
    3114             : 
    3115           0 :         if (!u->cgroup_path)
    3116           0 :                 return -ENODATA;
    3117             : 
    3118             :         /* The root cgroup doesn't expose this information, let's get it from /proc instead */
    3119           0 :         if (unit_has_host_root_cgroup(u))
    3120           0 :                 return procfs_tasks_get_current(ret);
    3121             : 
    3122           0 :         if ((u->cgroup_realized_mask & CGROUP_MASK_PIDS) == 0)
    3123           0 :                 return -ENODATA;
    3124             : 
    3125           0 :         r = cg_get_attribute("pids", u->cgroup_path, "pids.current", &v);
    3126           0 :         if (r == -ENOENT)
    3127           0 :                 return -ENODATA;
    3128           0 :         if (r < 0)
    3129           0 :                 return r;
    3130             : 
    3131           0 :         return safe_atou64(v, ret);
    3132             : }
    3133             : 
    3134           6 : static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) {
    3135           6 :         _cleanup_free_ char *v = NULL;
    3136             :         uint64_t ns;
    3137             :         int r;
    3138             : 
    3139           6 :         assert(u);
    3140           6 :         assert(ret);
    3141             : 
    3142           6 :         if (!u->cgroup_path)
    3143           0 :                 return -ENODATA;
    3144             : 
    3145             :         /* The root cgroup doesn't expose this information, let's get it from /proc instead */
    3146           6 :         if (unit_has_host_root_cgroup(u))
    3147           0 :                 return procfs_cpu_get_usage(ret);
    3148             : 
    3149             :         /* Requisite controllers for CPU accounting are not enabled */
    3150           6 :         if ((get_cpu_accounting_mask() & ~u->cgroup_realized_mask) != 0)
    3151           6 :                 return -ENODATA;
    3152             : 
    3153           0 :         r = cg_all_unified();
    3154           0 :         if (r < 0)
    3155           0 :                 return r;
    3156           0 :         if (r > 0) {
    3157           0 :                 _cleanup_free_ char *val = NULL;
    3158             :                 uint64_t us;
    3159             : 
    3160           0 :                 r = cg_get_keyed_attribute("cpu", u->cgroup_path, "cpu.stat", STRV_MAKE("usage_usec"), &val);
    3161           0 :                 if (IN_SET(r, -ENOENT, -ENXIO))
    3162           0 :                         return -ENODATA;
    3163           0 :                 if (r < 0)
    3164           0 :                         return r;
    3165             : 
    3166           0 :                 r = safe_atou64(val, &us);
    3167           0 :                 if (r < 0)
    3168           0 :                         return r;
    3169             : 
    3170           0 :                 ns = us * NSEC_PER_USEC;
    3171             :         } else {
    3172           0 :                 r = cg_get_attribute("cpuacct", u->cgroup_path, "cpuacct.usage", &v);
    3173           0 :                 if (r == -ENOENT)
    3174           0 :                         return -ENODATA;
    3175           0 :                 if (r < 0)
    3176           0 :                         return r;
    3177             : 
    3178           0 :                 r = safe_atou64(v, &ns);
    3179           0 :                 if (r < 0)
    3180           0 :                         return r;
    3181             :         }
    3182             : 
    3183           0 :         *ret = ns;
    3184           0 :         return 0;
    3185             : }
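
/* Editor's sketch (not part of cgroup.c): on the unified hierarchy cpu.stat is a flat
 * "key value" file, so pulling out usage_usec — the key cg_get_keyed_attribute() is asked
 * for above — can look like this; multiply by 1000 (NSEC_PER_USEC) to get nanoseconds: */
#include <inttypes.h>
#include <stdio.h>
#include <string.h>

static int read_usage_usec(FILE *f, uint64_t *ret) {
        char key[64];
        uint64_t value;

        while (fscanf(f, "%63s %" SCNu64, key, &value) == 2)
                if (strcmp(key, "usage_usec") == 0) {
                        *ret = value;
                        return 0;
                }

        return -1;      /* key not present */
}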
    3186             : 
    3187           7 : int unit_get_cpu_usage(Unit *u, nsec_t *ret) {
    3188             :         nsec_t ns;
    3189             :         int r;
    3190             : 
    3191           7 :         assert(u);
    3192             : 
    3193             :         /* Retrieve the current CPU usage counter. This will subtract the CPU counter taken when the unit was
    3194             :          * started. If the cgroup has been removed already, returns the last cached value. To cache the value, simply
     3195             :          * call this function with a NULL return parameter. */
    3196             : 
    3197           7 :         if (!UNIT_CGROUP_BOOL(u, cpu_accounting))
    3198           7 :                 return -ENODATA;
    3199             : 
    3200           0 :         r = unit_get_cpu_usage_raw(u, &ns);
    3201           0 :         if (r == -ENODATA && u->cpu_usage_last != NSEC_INFINITY) {
    3202             :                 /* If we can't get the CPU usage anymore (because the cgroup was already removed, for example), use our
    3203             :                  * cached value. */
    3204             : 
    3205           0 :                 if (ret)
    3206           0 :                         *ret = u->cpu_usage_last;
    3207           0 :                 return 0;
    3208             :         }
    3209           0 :         if (r < 0)
    3210           0 :                 return r;
    3211             : 
    3212           0 :         if (ns > u->cpu_usage_base)
    3213           0 :                 ns -= u->cpu_usage_base;
    3214             :         else
    3215           0 :                 ns = 0;
    3216             : 
    3217           0 :         u->cpu_usage_last = ns;
    3218           0 :         if (ret)
    3219           0 :                 *ret = ns;
    3220             : 
    3221           0 :         return 0;
    3222             : }
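
/* Editor's sketch (not part of cgroup.c): the bookkeeping above in miniature — subtract
 * the base counter sampled at unit start with saturation, and cache the last good delta
 * so it survives the cgroup's removal. All names are hypothetical. */
#include <stdint.h>

typedef struct {
        uint64_t base;  /* raw counter sampled at start */
        uint64_t last;  /* cached delta; UINT64_MAX means "not cached yet" */
} usage_counter;

static uint64_t usage_counter_update(usage_counter *c, uint64_t raw) {
        c->last = raw > c->base ? raw - c->base : 0;    /* saturated subtraction */
        return c->last;
}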
    3223             : 
    3224          28 : int unit_get_ip_accounting(
    3225             :                 Unit *u,
    3226             :                 CGroupIPAccountingMetric metric,
    3227             :                 uint64_t *ret) {
    3228             : 
    3229             :         uint64_t value;
    3230             :         int fd, r;
    3231             : 
    3232          28 :         assert(u);
    3233          28 :         assert(metric >= 0);
    3234          28 :         assert(metric < _CGROUP_IP_ACCOUNTING_METRIC_MAX);
    3235          28 :         assert(ret);
    3236             : 
    3237          28 :         if (!UNIT_CGROUP_BOOL(u, ip_accounting))
    3238          28 :                 return -ENODATA;
    3239             : 
    3240           0 :         fd = IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_INGRESS_PACKETS) ?
    3241           0 :                 u->ip_accounting_ingress_map_fd :
    3242             :                 u->ip_accounting_egress_map_fd;
    3243           0 :         if (fd < 0)
    3244           0 :                 return -ENODATA;
    3245             : 
    3246           0 :         if (IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_EGRESS_BYTES))
    3247           0 :                 r = bpf_firewall_read_accounting(fd, &value, NULL);
    3248             :         else
    3249           0 :                 r = bpf_firewall_read_accounting(fd, NULL, &value);
    3250           0 :         if (r < 0)
    3251           0 :                 return r;
    3252             : 
    3253             :         /* Add in additional metrics from a previous runtime. Note that when reexecing/reloading the daemon we compile
    3254             :          * all BPF programs and maps anew, but serialize the old counters. When deserializing we store them in the
    3255             :          * ip_accounting_extra[] field, and add them in here transparently. */
    3256             : 
    3257           0 :         *ret = value + u->ip_accounting_extra[metric];
    3258             : 
    3259           0 :         return r;
    3260             : }
    3261             : 
    3262           6 : static int unit_get_io_accounting_raw(Unit *u, uint64_t ret[static _CGROUP_IO_ACCOUNTING_METRIC_MAX]) {
    3263             :         static const char *const field_names[_CGROUP_IO_ACCOUNTING_METRIC_MAX] = {
    3264             :                 [CGROUP_IO_READ_BYTES]       = "rbytes=",
    3265             :                 [CGROUP_IO_WRITE_BYTES]      = "wbytes=",
    3266             :                 [CGROUP_IO_READ_OPERATIONS]  = "rios=",
    3267             :                 [CGROUP_IO_WRITE_OPERATIONS] = "wios=",
    3268             :         };
    3269           6 :         uint64_t acc[_CGROUP_IO_ACCOUNTING_METRIC_MAX] = {};
    3270           6 :         _cleanup_free_ char *path = NULL;
    3271           6 :         _cleanup_fclose_ FILE *f = NULL;
    3272             :         int r;
    3273             : 
    3274           6 :         assert(u);
    3275             : 
    3276           6 :         if (!u->cgroup_path)
    3277           0 :                 return -ENODATA;
    3278             : 
    3279           6 :         if (unit_has_host_root_cgroup(u))
    3280           0 :                 return -ENODATA; /* TODO: return useful data for the top-level cgroup */
    3281             : 
    3282           6 :         r = cg_all_unified();
    3283           6 :         if (r < 0)
    3284           0 :                 return r;
    3285           6 :         if (r == 0) /* TODO: support cgroupv1 */
    3286           6 :                 return -ENODATA;
    3287             : 
    3288           0 :         if (!FLAGS_SET(u->cgroup_realized_mask, CGROUP_MASK_IO))
    3289           0 :                 return -ENODATA;
    3290             : 
    3291           0 :         r = cg_get_path("io", u->cgroup_path, "io.stat", &path);
    3292           0 :         if (r < 0)
    3293           0 :                 return r;
    3294             : 
    3295           0 :         f = fopen(path, "re");
    3296           0 :         if (!f)
    3297           0 :                 return -errno;
    3298             : 
    3299           0 :         for (;;) {
    3300           0 :                 _cleanup_free_ char *line = NULL;
    3301             :                 const char *p;
    3302             : 
    3303           0 :                 r = read_line(f, LONG_LINE_MAX, &line);
    3304           0 :                 if (r < 0)
    3305           0 :                         return r;
    3306           0 :                 if (r == 0)
    3307           0 :                         break;
    3308             : 
    3309           0 :                 p = line;
    3310           0 :                 p += strcspn(p, WHITESPACE); /* Skip over device major/minor */
    3311           0 :                 p += strspn(p, WHITESPACE);  /* Skip over following whitespace */
    3312             : 
    3313           0 :                 for (;;) {
    3314           0 :                         _cleanup_free_ char *word = NULL;
    3315             : 
    3316           0 :                         r = extract_first_word(&p, &word, NULL, EXTRACT_RETAIN_ESCAPE);
    3317           0 :                         if (r < 0)
    3318           0 :                                 return r;
    3319           0 :                         if (r == 0)
    3320           0 :                                 break;
    3321             : 
    3322           0 :                         for (CGroupIOAccountingMetric i = 0; i < _CGROUP_IO_ACCOUNTING_METRIC_MAX; i++) {
    3323             :                                 const char *x;
    3324             : 
    3325           0 :                                 x = startswith(word, field_names[i]);
    3326           0 :                                 if (x) {
    3327             :                                         uint64_t w;
    3328             : 
    3329           0 :                                         r = safe_atou64(x, &w);
    3330           0 :                                         if (r < 0)
    3331           0 :                                                 return r;
    3332             : 
    3333             :                                         /* Sum up the stats of all devices */
    3334           0 :                                         acc[i] += w;
    3335           0 :                                         break;
    3336             :                                 }
    3337             :                         }
    3338             :                 }
    3339             :         }
    3340             : 
    3341           0 :         memcpy(ret, acc, sizeof(acc));
    3342           0 :         return 0;
    3343             : }
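
/* Editor's sketch (not part of cgroup.c): io.stat carries one line per device, e.g.
 * "8:0 rbytes=1459200 wbytes=314773504 rios=192 wios=353 ...", and the loop above sums
 * each field across all devices. Summing a single field might look like: */
#include <inttypes.h>
#include <stdio.h>
#include <string.h>

static uint64_t sum_io_field(FILE *f, const char *field /* e.g. "rbytes=" */) {
        char line[1024];
        uint64_t acc = 0;

        while (fgets(line, sizeof line, f)) {
                char *p = strstr(line, field);
                uint64_t v;

                if (p && sscanf(p + strlen(field), "%" SCNu64, &v) == 1)
                        acc += v;       /* sum the stats of all devices */
        }

        return acc;
}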
    3344             : 
    3345          28 : int unit_get_io_accounting(
    3346             :                 Unit *u,
    3347             :                 CGroupIOAccountingMetric metric,
    3348             :                 bool allow_cache,
    3349             :                 uint64_t *ret) {
    3350             : 
    3351             :         uint64_t raw[_CGROUP_IO_ACCOUNTING_METRIC_MAX];
    3352             :         int r;
    3353             : 
     3354             :         /* Retrieve an IO accounting parameter. This will subtract the counter taken when the unit was started. */
    3355             : 
    3356          28 :         if (!UNIT_CGROUP_BOOL(u, io_accounting))
    3357          28 :                 return -ENODATA;
    3358             : 
    3359           0 :         if (allow_cache && u->io_accounting_last[metric] != UINT64_MAX)
    3360           0 :                 goto done;
    3361             : 
    3362           0 :         r = unit_get_io_accounting_raw(u, raw);
    3363           0 :         if (r == -ENODATA && u->io_accounting_last[metric] != UINT64_MAX)
    3364           0 :                 goto done;
    3365           0 :         if (r < 0)
    3366           0 :                 return r;
    3367             : 
    3368           0 :         for (CGroupIOAccountingMetric i = 0; i < _CGROUP_IO_ACCOUNTING_METRIC_MAX; i++) {
    3369             :                 /* Saturated subtraction */
    3370           0 :                 if (raw[i] > u->io_accounting_base[i])
    3371           0 :                         u->io_accounting_last[i] = raw[i] - u->io_accounting_base[i];
    3372             :                 else
    3373           0 :                         u->io_accounting_last[i] = 0;
    3374             :         }
    3375             : 
    3376           0 : done:
    3377           0 :         if (ret)
    3378           0 :                 *ret = u->io_accounting_last[metric];
    3379             : 
    3380           0 :         return 0;
    3381             : }
    3382             : 
    3383           6 : int unit_reset_cpu_accounting(Unit *u) {
    3384             :         int r;
    3385             : 
    3386           6 :         assert(u);
    3387             : 
    3388           6 :         u->cpu_usage_last = NSEC_INFINITY;
    3389             : 
    3390           6 :         r = unit_get_cpu_usage_raw(u, &u->cpu_usage_base);
    3391           6 :         if (r < 0) {
    3392           6 :                 u->cpu_usage_base = 0;
    3393           6 :                 return r;
    3394             :         }
    3395             : 
    3396           0 :         return 0;
    3397             : }
    3398             : 
    3399           6 : int unit_reset_ip_accounting(Unit *u) {
    3400           6 :         int r = 0, q = 0;
    3401             : 
    3402           6 :         assert(u);
    3403             : 
    3404           6 :         if (u->ip_accounting_ingress_map_fd >= 0)
    3405           0 :                 r = bpf_firewall_reset_accounting(u->ip_accounting_ingress_map_fd);
    3406             : 
    3407           6 :         if (u->ip_accounting_egress_map_fd >= 0)
    3408           0 :                 q = bpf_firewall_reset_accounting(u->ip_accounting_egress_map_fd);
    3409             : 
    3410           6 :         zero(u->ip_accounting_extra);
    3411             : 
    3412           6 :         return r < 0 ? r : q;
    3413             : }
    3414             : 
    3415           6 : int unit_reset_io_accounting(Unit *u) {
    3416             :         int r;
    3417             : 
    3418           6 :         assert(u);
    3419             : 
    3420          30 :         for (CGroupIOAccountingMetric i = 0; i < _CGROUP_IO_ACCOUNTING_METRIC_MAX; i++)
    3421          24 :                 u->io_accounting_last[i] = UINT64_MAX;
    3422             : 
    3423           6 :         r = unit_get_io_accounting_raw(u, u->io_accounting_base);
    3424           6 :         if (r < 0) {
    3425           6 :                 zero(u->io_accounting_base);
    3426           6 :                 return r;
    3427             :         }
    3428             : 
    3429           0 :         return 0;
    3430             : }
    3431             : 
    3432           6 : int unit_reset_accounting(Unit *u) {
    3433             :         int r, q, v;
    3434             : 
    3435           6 :         assert(u);
    3436             : 
    3437           6 :         r = unit_reset_cpu_accounting(u);
    3438           6 :         q = unit_reset_io_accounting(u);
    3439           6 :         v = unit_reset_ip_accounting(u);
    3440             : 
    3441           6 :         return r < 0 ? r : q < 0 ? q : v;
    3442             : }
    3443             : 
    3444           0 : void unit_invalidate_cgroup(Unit *u, CGroupMask m) {
    3445           0 :         assert(u);
    3446             : 
    3447           0 :         if (!UNIT_HAS_CGROUP_CONTEXT(u))
    3448           0 :                 return;
    3449             : 
    3450           0 :         if (m == 0)
    3451           0 :                 return;
    3452             : 
    3453             :         /* always invalidate compat pairs together */
    3454           0 :         if (m & (CGROUP_MASK_IO | CGROUP_MASK_BLKIO))
    3455           0 :                 m |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO;
    3456             : 
    3457           0 :         if (m & (CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT))
    3458           0 :                 m |= CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT;
    3459             : 
    3460           0 :         if (FLAGS_SET(u->cgroup_invalidated_mask, m)) /* NOP? */
    3461           0 :                 return;
    3462             : 
    3463           0 :         u->cgroup_invalidated_mask |= m;
    3464           0 :         unit_add_to_cgroup_realize_queue(u);
    3465             : }
    3466             : 
    3467           0 : void unit_invalidate_cgroup_bpf(Unit *u) {
    3468           0 :         assert(u);
    3469             : 
    3470           0 :         if (!UNIT_HAS_CGROUP_CONTEXT(u))
    3471           0 :                 return;
    3472             : 
    3473           0 :         if (u->cgroup_invalidated_mask & CGROUP_MASK_BPF_FIREWALL) /* NOP? */
    3474           0 :                 return;
    3475             : 
    3476           0 :         u->cgroup_invalidated_mask |= CGROUP_MASK_BPF_FIREWALL;
    3477           0 :         unit_add_to_cgroup_realize_queue(u);
    3478             : 
     3479             :         /* If we are a slice unit, we also need to compile a new BPF program for all our children, as the IP access
     3480             :          * list of our children includes our own. */
    3481           0 :         if (u->type == UNIT_SLICE) {
    3482             :                 Unit *member;
    3483             :                 Iterator i;
    3484             :                 void *v;
    3485             : 
    3486           0 :                 HASHMAP_FOREACH_KEY(v, member, u->dependencies[UNIT_BEFORE], i) {
    3487           0 :                         if (UNIT_DEREF(member->slice) == u)
    3488           0 :                                 unit_invalidate_cgroup_bpf(member);
    3489             :                 }
    3490             :         }
    3491             : }
    3492             : 
    3493        1775 : bool unit_cgroup_delegate(Unit *u) {
    3494             :         CGroupContext *c;
    3495             : 
    3496        1775 :         assert(u);
    3497             : 
    3498        1775 :         if (!UNIT_VTABLE(u)->can_delegate)
    3499        1574 :                 return false;
    3500             : 
    3501         201 :         c = unit_get_cgroup_context(u);
    3502         201 :         if (!c)
    3503           0 :                 return false;
    3504             : 
    3505         201 :         return c->delegate;
    3506             : }
    3507             : 
    3508           1 : void manager_invalidate_startup_units(Manager *m) {
    3509             :         Iterator i;
    3510             :         Unit *u;
    3511             : 
    3512           1 :         assert(m);
    3513             : 
    3514           1 :         SET_FOREACH(u, m->startup_units, i)
    3515           0 :                 unit_invalidate_cgroup(u, CGROUP_MASK_CPU|CGROUP_MASK_IO|CGROUP_MASK_BLKIO);
    3516           1 : }
    3517             : 
    3518         102 : static int unit_get_nice(Unit *u) {
    3519             :         ExecContext *ec;
    3520             : 
    3521         102 :         ec = unit_get_exec_context(u);
    3522         102 :         return ec ? ec->nice : 0;
    3523             : }
    3524             : 
    3525         102 : static uint64_t unit_get_cpu_weight(Unit *u) {
    3526         102 :         ManagerState state = manager_state(u->manager);
    3527             :         CGroupContext *cc;
    3528             : 
    3529         102 :         cc = unit_get_cgroup_context(u);
    3530         102 :         return cc ? cgroup_context_cpu_weight(cc, state) : CGROUP_WEIGHT_DEFAULT;
    3531             : }
    3532             : 
    3533         122 : int compare_job_priority(const void *a, const void *b) {
    3534         122 :         const Job *x = a, *y = b;
    3535             :         int nice_x, nice_y;
    3536             :         uint64_t weight_x, weight_y;
    3537             :         int ret;
    3538             : 
    3539         122 :         if ((ret = CMP(x->unit->type, y->unit->type)) != 0)
    3540          71 :                 return -ret;
    3541             : 
    3542          51 :         weight_x = unit_get_cpu_weight(x->unit);
    3543          51 :         weight_y = unit_get_cpu_weight(y->unit);
    3544             : 
    3545          51 :         if ((ret = CMP(weight_x, weight_y)) != 0)
    3546           0 :                 return -ret;
    3547             : 
    3548          51 :         nice_x = unit_get_nice(x->unit);
    3549          51 :         nice_y = unit_get_nice(y->unit);
    3550             : 
    3551          51 :         if ((ret = CMP(nice_x, nice_y)) != 0)
    3552           0 :                 return ret;
    3553             : 
    3554          51 :         return strcmp(x->unit->id, y->unit->id);
    3555             : }
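
/* Editor's sketch (not part of cgroup.c): the same multi-key comparator shape on a
 * hypothetical struct, usable with qsort(). Note the sign handling, matching the -CMP()
 * returns above: type and CPU weight sort descending, nice ascending, id as tie-breaker. */
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

struct job {
        int type;
        uint64_t weight;
        int nice;
        const char *id;
};

static int job_cmp(const void *a, const void *b) {
        const struct job *x = a, *y = b;

        if (x->type != y->type)
                return x->type > y->type ? -1 : 1;      /* descending */
        if (x->weight != y->weight)
                return x->weight > y->weight ? -1 : 1;  /* descending */
        if (x->nice != y->nice)
                return x->nice < y->nice ? -1 : 1;      /* ascending: lower nice first */

        return strcmp(x->id, y->id);
}

/* usage: qsort(jobs, n, sizeof(struct job), job_cmp); */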
    3556             : 
    3557             : static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
    3558             :         [CGROUP_AUTO] = "auto",
    3559             :         [CGROUP_CLOSED] = "closed",
    3560             :         [CGROUP_STRICT] = "strict",
    3561             : };
    3562             : 
    3563         192 : DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);
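
/* Editor's note: DEFINE_STRING_TABLE_LOOKUP() generates the usual converter pair for the
 * table above; a sketch of its use (a negative result from _from_string() signals an
 * unknown string):
 *
 *     const char *s = cgroup_device_policy_to_string(CGROUP_CLOSED);      // "closed"
 *     CGroupDevicePolicy p = cgroup_device_policy_from_string("strict");  // CGROUP_STRICT
 */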

Generated by: LCOV version 1.14