LCOV - code coverage report
Current view: top level - core - namespace.c (source / functions) Hit Total Coverage
Test: main_coverage.info Lines: 13 849 1.5 %
Date: 2019-08-22 15:41:25 Functions: 8 48 16.7 %

          Line data    Source code
       1             : /* SPDX-License-Identifier: LGPL-2.1+ */
       2             : 
       3             : #include <errno.h>
       4             : #include <sched.h>
       5             : #include <stdio.h>
       6             : #include <string.h>
       7             : #include <sys/mount.h>
       8             : #include <sys/stat.h>
       9             : #include <unistd.h>
      10             : #include <linux/fs.h>
      11             : 
      12             : #include "alloc-util.h"
      13             : #include "base-filesystem.h"
      14             : #include "dev-setup.h"
      15             : #include "fd-util.h"
      16             : #include "fs-util.h"
      17             : #include "label.h"
      18             : #include "loop-util.h"
      19             : #include "loopback-setup.h"
      20             : #include "missing.h"
      21             : #include "mkdir.h"
      22             : #include "mount-util.h"
      23             : #include "mountpoint-util.h"
      24             : #include "namespace-util.h"
      25             : #include "namespace.h"
      26             : #include "nulstr-util.h"
      27             : #include "path-util.h"
      28             : #include "selinux-util.h"
      29             : #include "socket-util.h"
      30             : #include "sort-util.h"
      31             : #include "stat-util.h"
      32             : #include "string-table.h"
      33             : #include "string-util.h"
      34             : #include "strv.h"
      35             : #include "umask-util.h"
      36             : #include "user-util.h"
      37             : 
      38             : #define DEV_MOUNT_OPTIONS (MS_NOSUID|MS_STRICTATIME|MS_NOEXEC)
      39             : 
      40             : typedef enum MountMode {
      41             :         /* This is ordered by priority! */
      42             :         INACCESSIBLE,
      43             :         BIND_MOUNT,
      44             :         BIND_MOUNT_RECURSIVE,
      45             :         PRIVATE_TMP,
      46             :         PRIVATE_DEV,
      47             :         BIND_DEV,
      48             :         EMPTY_DIR,
      49             :         SYSFS,
      50             :         PROCFS,
      51             :         READONLY,
      52             :         READWRITE,
      53             :         TMPFS,
      54             :         READWRITE_IMPLICIT, /* Should have the lowest priority. */
      55             :         _MOUNT_MODE_MAX,
      56             : } MountMode;
      57             : 
      58             : typedef struct MountEntry {
      59             :         const char *path_const;   /* Memory allocated on stack or static */
      60             :         MountMode mode:5;
      61             :         bool ignore:1;            /* Ignore if path does not exist? */
      62             :         bool has_prefix:1;        /* Already is prefixed by the root dir? */
      63             :         bool read_only:1;         /* Shall this mount point be read-only? */
      64             :         bool nosuid:1;            /* Shall set MS_NOSUID on the mount itself */
      65             :         bool applied:1;           /* Already applied */
      66             :         char *path_malloc;        /* Use this instead of 'path_const' if we had to allocate memory */
      67             :         const char *source_const; /* The source path, for bind mounts */
      68             :         char *source_malloc;
      69             :         const char *options_const;/* Mount options for tmpfs */
      70             :         char *options_malloc;
      71             :         unsigned long flags;      /* Mount flags used by EMPTY_DIR and TMPFS. Do not include MS_RDONLY here, but please use read_only. */
      72             :         unsigned n_followed;
      73             : } MountEntry;
      74             : 
      75             : /* If MountAPIVFS= is used, let's mount /proc, /dev and /sys into it, but only as a fallback if the user hasn't mounted
      76             :  * something there already. These mounts are hence overridden by any other explicitly configured mounts. */
      77             : static const MountEntry apivfs_table[] = {
      78             :         { "/proc",               PROCFS,       false },
      79             :         { "/dev",                BIND_DEV,     false },
      80             :         { "/sys",                SYSFS,        false },
      81             : };
      82             : 
      83             : /* ProtectKernelTunables= option and the related filesystem APIs */
      84             : static const MountEntry protect_kernel_tunables_table[] = {
      85             :         { "/proc/acpi",          READONLY,           true  },
      86             :         { "/proc/apm",           READONLY,           true  }, /* Obsolete API, there's no point in permitting access to this, ever */
      87             :         { "/proc/asound",        READONLY,           true  },
      88             :         { "/proc/bus",           READONLY,           true  },
      89             :         { "/proc/fs",            READONLY,           true  },
      90             :         { "/proc/irq",           READONLY,           true  },
      91             :         { "/proc/kallsyms",      INACCESSIBLE,       true  },
      92             :         { "/proc/kcore",         INACCESSIBLE,       true  },
      93             :         { "/proc/latency_stats", READONLY,           true  },
      94             :         { "/proc/mtrr",          READONLY,           true  },
      95             :         { "/proc/scsi",          READONLY,           true  },
      96             :         { "/proc/sys",           READONLY,           false },
      97             :         { "/proc/sysrq-trigger", READONLY,           true  },
      98             :         { "/proc/timer_stats",   READONLY,           true  },
      99             :         { "/sys",                READONLY,           false },
     100             :         { "/sys/fs/bpf",         READONLY,           true  },
     101             :         { "/sys/fs/cgroup",      READWRITE_IMPLICIT, false }, /* READONLY is set by ProtectControlGroups= option */
     102             :         { "/sys/fs/selinux",     READWRITE_IMPLICIT, true  },
     103             :         { "/sys/kernel/debug",   READONLY,           true  },
     104             :         { "/sys/kernel/tracing", READONLY,           true  },
     105             : };
     106             : 
     107             : /* ProtectKernelModules= option */
     108             : static const MountEntry protect_kernel_modules_table[] = {
     109             : #if HAVE_SPLIT_USR
     110             :         { "/lib/modules",        INACCESSIBLE, true  },
     111             : #endif
     112             :         { "/usr/lib/modules",    INACCESSIBLE, true  },
     113             : };
     114             : 
     115             : /*
     116             :  * ProtectHome=read-only table: protects $HOME and $XDG_RUNTIME_DIR. The rest of
     117             :  * the system should be protected by ProtectSystem=.
     118             :  */
     119             : static const MountEntry protect_home_read_only_table[] = {
     120             :         { "/home",               READONLY,     true  },
     121             :         { "/run/user",           READONLY,     true  },
     122             :         { "/root",               READONLY,     true  },
     123             : };
     124             : 
     125             : /* ProtectHome=tmpfs table */
     126             : static const MountEntry protect_home_tmpfs_table[] = {
     127             :         { "/home",               TMPFS,        true, .read_only = true, .options_const = "mode=0755", .flags = MS_NODEV|MS_STRICTATIME },
     128             :         { "/run/user",           TMPFS,        true, .read_only = true, .options_const = "mode=0755", .flags = MS_NODEV|MS_STRICTATIME },
     129             :         { "/root",               TMPFS,        true, .read_only = true, .options_const = "mode=0700", .flags = MS_NODEV|MS_STRICTATIME },
     130             : };
     131             : 
     132             : /* ProtectHome=yes table */
     133             : static const MountEntry protect_home_yes_table[] = {
     134             :         { "/home",               INACCESSIBLE, true  },
     135             :         { "/run/user",           INACCESSIBLE, true  },
     136             :         { "/root",               INACCESSIBLE, true  },
     137             : };
     138             : 
     139             : /* ProtectSystem=yes table */
     140             : static const MountEntry protect_system_yes_table[] = {
     141             :         { "/usr",                READONLY,     false },
     142             :         { "/boot",               READONLY,     true  },
     143             :         { "/efi",                READONLY,     true  },
     144             : #if HAVE_SPLIT_USR
     145             :         { "/lib",                READONLY,     true  },
     146             :         { "/lib64",              READONLY,     true  },
     147             :         { "/bin",                READONLY,     true  },
     148             : #  if HAVE_SPLIT_BIN
     149             :         { "/sbin",               READONLY,     true  },
     150             : #  endif
     151             : #endif
     152             : };
     153             : 
     154             : /* ProtectSystem=full includes ProtectSystem=yes */
     155             : static const MountEntry protect_system_full_table[] = {
     156             :         { "/usr",                READONLY,     false },
     157             :         { "/boot",               READONLY,     true  },
     158             :         { "/efi",                READONLY,     true  },
     159             :         { "/etc",                READONLY,     false },
     160             : #if HAVE_SPLIT_USR
     161             :         { "/lib",                READONLY,     true  },
     162             :         { "/lib64",              READONLY,     true  },
     163             :         { "/bin",                READONLY,     true  },
     164             : #  if HAVE_SPLIT_BIN
     165             :         { "/sbin",               READONLY,     true  },
     166             : #  endif
     167             : #endif
     168             : };
     169             : 
     170             : /*
     171             :  * ProtectSystem=strict table. In this strict mode, we mount everything
     172             :  * read-only, except for /proc, /dev, /sys which are the kernel API VFS,
     173             :  * which are left writable, but PrivateDevices= + ProtectKernelTunables=
     174             :  * protect those, and these options should be fully orthogonal.
     175             :  * (And of course /home and friends are also left writable, as ProtectHome=
     176             :  * shall manage those, orthogonally).
     177             :  */
     178             : static const MountEntry protect_system_strict_table[] = {
     179             :         { "/",                   READONLY,           false },
     180             :         { "/proc",               READWRITE_IMPLICIT, false },      /* ProtectKernelTunables= */
     181             :         { "/sys",                READWRITE_IMPLICIT, false },      /* ProtectKernelTunables= */
     182             :         { "/dev",                READWRITE_IMPLICIT, false },      /* PrivateDevices= */
     183             :         { "/home",               READWRITE_IMPLICIT, true  },      /* ProtectHome= */
     184             :         { "/run/user",           READWRITE_IMPLICIT, true  },      /* ProtectHome= */
     185             :         { "/root",               READWRITE_IMPLICIT, true  },      /* ProtectHome= */
     186             : };
     187             : 
     188             : static const char * const mount_mode_table[_MOUNT_MODE_MAX] = {
     189             :         [INACCESSIBLE]         = "inaccessible",
     190             :         [BIND_MOUNT]           = "bind",
     191             :         [BIND_MOUNT_RECURSIVE] = "rbind",
     192             :         [PRIVATE_TMP]          = "private-tmp",
     193             :         [PRIVATE_DEV]          = "private-dev",
     194             :         [BIND_DEV]             = "bind-dev",
     195             :         [EMPTY_DIR]            = "empty",
     196             :         [SYSFS]                = "sysfs",
     197             :         [PROCFS]               = "procfs",
     198             :         [READONLY]             = "read-only",
     199             :         [READWRITE]            = "read-write",
     200             :         [TMPFS]                = "tmpfs",
     201             :         [READWRITE_IMPLICIT]   = "rw-implicit",
     202             : };
     203             : 
     204           0 : DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(mount_mode, MountMode);
     205             : 
     206           0 : static const char *mount_entry_path(const MountEntry *p) {
     207           0 :         assert(p);
     208             : 
     209             :         /* Returns the path of this bind mount. If the malloc()-allocated ->path_malloc field is set we return that,
     210             :          * otherwise the stack/static ->path_const field is returned. */
     211             : 
     212           0 :         return p->path_malloc ?: p->path_const;
     213             : }
     214             : 
     215           0 : static bool mount_entry_read_only(const MountEntry *p) {
     216           0 :         assert(p);
     217             : 
     218           0 :         return p->read_only || IN_SET(p->mode, READONLY, INACCESSIBLE);
     219             : }
     220             : 
     221           0 : static const char *mount_entry_source(const MountEntry *p) {
     222           0 :         assert(p);
     223             : 
     224           0 :         return p->source_malloc ?: p->source_const;
     225             : }
     226             : 
     227           0 : static const char *mount_entry_options(const MountEntry *p) {
     228           0 :         assert(p);
     229             : 
     230           0 :         return p->options_malloc ?: p->options_const;
     231             : }
     232             : 
     233           0 : static void mount_entry_done(MountEntry *p) {
     234           0 :         assert(p);
     235             : 
     236           0 :         p->path_malloc = mfree(p->path_malloc);
     237           0 :         p->source_malloc = mfree(p->source_malloc);
     238           0 :         p->options_malloc = mfree(p->options_malloc);
     239           0 : }
     240             : 
     241           0 : static int append_access_mounts(MountEntry **p, char **strv, MountMode mode, bool forcibly_require_prefix) {
     242             :         char **i;
     243             : 
     244           0 :         assert(p);
     245             : 
     246             :         /* Adds a list of user-supplied READWRITE/READWRITE_IMPLICIT/READONLY/INACCESSIBLE entries */
     247             : 
     248           0 :         STRV_FOREACH(i, strv) {
     249           0 :                 bool ignore = false, needs_prefix = false;
     250           0 :                 const char *e = *i;
     251             : 
     252             :                 /* Look for any prefixes */
     253           0 :                 if (startswith(e, "-")) {
     254           0 :                         e++;
     255           0 :                         ignore = true;
     256             :                 }
     257           0 :                 if (startswith(e, "+")) {
     258           0 :                         e++;
     259           0 :                         needs_prefix = true;
     260             :                 }
     261             : 
     262           0 :                 if (!path_is_absolute(e))
     263           0 :                         return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
     264             :                                                "Path is not absolute: %s", e);
     265             : 
     266           0 :                 *((*p)++) = (MountEntry) {
     267             :                         .path_const = e,
     268             :                         .mode = mode,
     269             :                         .ignore = ignore,
     270           0 :                         .has_prefix = !needs_prefix && !forcibly_require_prefix,
     271             :                 };
     272             :         }
     273             : 
     274           0 :         return 0;
     275             : }
     276             : 
     277           0 : static int append_empty_dir_mounts(MountEntry **p, char **strv) {
     278             :         char **i;
     279             : 
     280           0 :         assert(p);
     281             : 
     282             :         /* Adds tmpfs mounts to provide readable but empty directories. This is primarily used to implement the
     283             :          * "/private/" boundary directories for DynamicUser=1. */
     284             : 
     285           0 :         STRV_FOREACH(i, strv) {
     286             : 
     287           0 :                 *((*p)++) = (MountEntry) {
     288           0 :                         .path_const = *i,
     289             :                         .mode = EMPTY_DIR,
     290             :                         .ignore = false,
     291             :                         .read_only = true,
     292             :                         .options_const = "mode=755",
     293             :                         .flags = MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME,
     294             :                 };
     295             :         }
     296             : 
     297           0 :         return 0;
     298             : }
     299             : 
     300           0 : static int append_bind_mounts(MountEntry **p, const BindMount *binds, size_t n) {
     301             :         size_t i;
     302             : 
     303           0 :         assert(p);
     304             : 
     305           0 :         for (i = 0; i < n; i++) {
     306           0 :                 const BindMount *b = binds + i;
     307             : 
     308           0 :                 *((*p)++) = (MountEntry) {
     309           0 :                         .path_const = b->destination,
     310           0 :                         .mode = b->recursive ? BIND_MOUNT_RECURSIVE : BIND_MOUNT,
     311           0 :                         .read_only = b->read_only,
     312           0 :                         .nosuid = b->nosuid,
     313           0 :                         .source_const = b->source,
     314           0 :                         .ignore = b->ignore_enoent,
     315             :                 };
     316             :         }
     317             : 
     318           0 :         return 0;
     319             : }
     320             : 
     321           0 : static int append_tmpfs_mounts(MountEntry **p, const TemporaryFileSystem *tmpfs, size_t n) {
     322             :         size_t i;
     323             :         int r;
     324             : 
     325           0 :         assert(p);
     326             : 
     327           0 :         for (i = 0; i < n; i++) {
     328           0 :                 const TemporaryFileSystem *t = tmpfs + i;
     329           0 :                 _cleanup_free_ char *o = NULL, *str = NULL;
     330             :                 unsigned long flags;
     331           0 :                 bool ro = false;
     332             : 
     333           0 :                 if (!path_is_absolute(t->path))
     334           0 :                         return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
     335             :                                                "Path is not absolute: %s",
     336             :                                                t->path);
     337             : 
     338           0 :                 str = strjoin("mode=0755,", t->options);
     339           0 :                 if (!str)
     340           0 :                         return -ENOMEM;
     341             : 
     342           0 :                 r = mount_option_mangle(str, MS_NODEV|MS_STRICTATIME, &flags, &o);
     343           0 :                 if (r < 0)
     344           0 :                         return log_debug_errno(r, "Failed to parse mount option '%s': %m", str);
     345             : 
     346           0 :                 ro = flags & MS_RDONLY;
     347           0 :                 if (ro)
     348           0 :                         flags ^= MS_RDONLY;
     349             : 
     350           0 :                 *((*p)++) = (MountEntry) {
     351           0 :                         .path_const = t->path,
     352             :                         .mode = TMPFS,
     353             :                         .read_only = ro,
     354           0 :                         .options_malloc = TAKE_PTR(o),
     355             :                         .flags = flags,
     356             :                 };
     357             :         }
     358             : 
     359           0 :         return 0;
     360             : }
     361             : 
     362           0 : static int append_static_mounts(MountEntry **p, const MountEntry *mounts, size_t n, bool ignore_protect) {
     363             :         size_t i;
     364             : 
     365           0 :         assert(p);
     366           0 :         assert(mounts);
     367             : 
     368             :         /* Adds a list of static pre-defined entries */
     369             : 
     370           0 :         for (i = 0; i < n; i++)
     371           0 :                 *((*p)++) = (MountEntry) {
     372           0 :                         .path_const = mount_entry_path(mounts+i),
     373           0 :                         .mode = mounts[i].mode,
     374           0 :                         .ignore = mounts[i].ignore || ignore_protect,
     375             :                 };
     376             : 
     377           0 :         return 0;
     378             : }
     379             : 
     380           0 : static int append_protect_home(MountEntry **p, ProtectHome protect_home, bool ignore_protect) {
     381           0 :         assert(p);
     382             : 
     383           0 :         switch (protect_home) {
     384             : 
     385           0 :         case PROTECT_HOME_NO:
     386           0 :                 return 0;
     387             : 
     388           0 :         case PROTECT_HOME_READ_ONLY:
     389           0 :                 return append_static_mounts(p, protect_home_read_only_table, ELEMENTSOF(protect_home_read_only_table), ignore_protect);
     390             : 
     391           0 :         case PROTECT_HOME_TMPFS:
     392           0 :                 return append_static_mounts(p, protect_home_tmpfs_table, ELEMENTSOF(protect_home_tmpfs_table), ignore_protect);
     393             : 
     394           0 :         case PROTECT_HOME_YES:
     395           0 :                 return append_static_mounts(p, protect_home_yes_table, ELEMENTSOF(protect_home_yes_table), ignore_protect);
     396             : 
     397           0 :         default:
     398           0 :                 assert_not_reached("Unexpected ProtectHome= value");
     399             :         }
     400             : }
     401             : 
     402           0 : static int append_protect_system(MountEntry **p, ProtectSystem protect_system, bool ignore_protect) {
     403           0 :         assert(p);
     404             : 
     405           0 :         switch (protect_system) {
     406             : 
     407           0 :         case PROTECT_SYSTEM_NO:
     408           0 :                 return 0;
     409             : 
     410           0 :         case PROTECT_SYSTEM_STRICT:
     411           0 :                 return append_static_mounts(p, protect_system_strict_table, ELEMENTSOF(protect_system_strict_table), ignore_protect);
     412             : 
     413           0 :         case PROTECT_SYSTEM_YES:
     414           0 :                 return append_static_mounts(p, protect_system_yes_table, ELEMENTSOF(protect_system_yes_table), ignore_protect);
     415             : 
     416           0 :         case PROTECT_SYSTEM_FULL:
     417           0 :                 return append_static_mounts(p, protect_system_full_table, ELEMENTSOF(protect_system_full_table), ignore_protect);
     418             : 
     419           0 :         default:
     420           0 :                 assert_not_reached("Unexpected ProtectSystem= value");
     421             :         }
     422             : }
     423             : 
     424           0 : static int mount_path_compare(const MountEntry *a, const MountEntry *b) {
     425             :         int d;
     426             : 
     427             :         /* If the paths are not equal, then order prefixes first */
     428           0 :         d = path_compare(mount_entry_path(a), mount_entry_path(b));
     429           0 :         if (d != 0)
     430           0 :                 return d;
     431             : 
     432             :         /* If the paths are equal, check the mode */
     433           0 :         return CMP((int) a->mode, (int) b->mode);
     434             : }
     435             : 
     436           0 : static int prefix_where_needed(MountEntry *m, size_t n, const char *root_directory) {
     437             :         size_t i;
     438             : 
     439             :         /* Prefixes all paths in the bind mount table with the root directory if the entry needs that. */
     440             : 
     441           0 :         for (i = 0; i < n; i++) {
     442             :                 char *s;
     443             : 
     444           0 :                 if (m[i].has_prefix)
     445           0 :                         continue;
     446             : 
     447           0 :                 s = path_join(root_directory, mount_entry_path(m+i));
     448           0 :                 if (!s)
     449           0 :                         return -ENOMEM;
     450             : 
     451           0 :                 free_and_replace(m[i].path_malloc, s);
     452           0 :                 m[i].has_prefix = true;
     453             :         }
     454             : 
     455           0 :         return 0;
     456             : }
     457             : 
     458           0 : static void drop_duplicates(MountEntry *m, size_t *n) {
     459             :         MountEntry *f, *t, *previous;
     460             : 
     461           0 :         assert(m);
     462           0 :         assert(n);
     463             : 
     464             :         /* Drops duplicate entries. Expects that the array is properly ordered already. */
     465             : 
     466           0 :         for (f = m, t = m, previous = NULL; f < m + *n; f++) {
     467             : 
     468             :                 /* The first one wins (which is the one with the more restrictive mode), see mount_path_compare()
     469             :                  * above. Note that we only drop duplicates that haven't been mounted yet. */
     470           0 :                 if (previous &&
     471           0 :                     path_equal(mount_entry_path(f), mount_entry_path(previous)) &&
     472           0 :                     !f->applied && !previous->applied) {
     473           0 :                         log_debug("%s (%s) is duplicate.", mount_entry_path(f), mount_mode_to_string(f->mode));
     474           0 :                         previous->read_only = previous->read_only || mount_entry_read_only(f); /* Propagate the read-only flag to the remaining entry */
     475           0 :                         mount_entry_done(f);
     476           0 :                         continue;
     477             :                 }
     478             : 
     479           0 :                 *t = *f;
     480           0 :                 previous = t;
     481           0 :                 t++;
     482             :         }
     483             : 
     484           0 :         *n = t - m;
     485           0 : }
     486             : 
     487           0 : static void drop_inaccessible(MountEntry *m, size_t *n) {
     488             :         MountEntry *f, *t;
     489           0 :         const char *clear = NULL;
     490             : 
     491           0 :         assert(m);
     492           0 :         assert(n);
     493             : 
     494             :         /* Drops all entries obstructed by another entry further up the tree. Expects that the array is properly
     495             :          * ordered already. */
     496             : 
     497           0 :         for (f = m, t = m; f < m + *n; f++) {
     498             : 
     499             :                 /* If we found a path set for INACCESSIBLE earlier, and this entry has it as prefix we should drop
     500             :                  * it, as inaccessible paths really should drop the entire subtree. */
     501           0 :                 if (clear && path_startswith(mount_entry_path(f), clear)) {
     502           0 :                         log_debug("%s is masked by %s.", mount_entry_path(f), clear);
     503           0 :                         mount_entry_done(f);
     504           0 :                         continue;
     505             :                 }
     506             : 
     507           0 :                 clear = f->mode == INACCESSIBLE ? mount_entry_path(f) : NULL;
     508             : 
     509           0 :                 *t = *f;
     510           0 :                 t++;
     511             :         }
     512             : 
     513           0 :         *n = t - m;
     514           0 : }
     515             : 
     516           0 : static void drop_nop(MountEntry *m, size_t *n) {
     517             :         MountEntry *f, *t;
     518             : 
     519           0 :         assert(m);
     520           0 :         assert(n);
     521             : 
     522             :         /* Drops all entries which have an immediate parent that has the same type, as they are redundant. Assumes the
     523             :          * list is ordered by prefixes. Compacts the array in place ('f' reads, 't' writes) and updates *n. */
     524             : 
     525           0 :         for (f = m, t = m; f < m + *n; f++) {
     526             : 
     527             :                 /* Only suppress such subtrees for READONLY, READWRITE and READWRITE_IMPLICIT entries */
     528           0 :                 if (IN_SET(f->mode, READONLY, READWRITE, READWRITE_IMPLICIT)) {
     529           0 :                         MountEntry *p = NULL, *c;
     530             : 
     531             :                         /* Find the closest already kept entry that is a prefix of this one. 'c' stays within
     532             :                          * [m, t], so we never form a pointer before the start of the array (which would be UB). */
     533           0 :                         for (c = t; c > m; c--) {
     534           0 :                                 if (path_startswith(mount_entry_path(f), mount_entry_path(c - 1))) {
     535           0 :                                         p = c - 1;
     536           0 :                                         break;
     537             :                                 }
     538             :                         }
     539             : 
     540             :                         /* We found it, let's see if it's the same mode, if so, we can drop this entry */
     541           0 :                         if (p && p->mode == f->mode) {
     542           0 :                                 log_debug("%s (%s) is made redundant by %s (%s)",
     543             :                                           mount_entry_path(f), mount_mode_to_string(f->mode),
     544             :                                           mount_entry_path(p), mount_mode_to_string(p->mode));
     545           0 :                                 mount_entry_done(f);
     546           0 :                                 continue;
     547             :                         }
     548             :                 }
     549             : 
     550           0 :                 *t = *f;
     551           0 :                 t++;
     552             :         }
     553             : 
     554           0 :         *n = t - m;
     555           0 : }
     556             : 
     557           0 : static void drop_outside_root(const char *root_directory, MountEntry *m, size_t *n) {
     558           0 :         size_t kept = 0, i;
     559             : 
     560           0 :         assert(m);
     561           0 :         assert(n);
     562             : 
     563             :         /* Without a root directory there is nothing to filter against. */
     564           0 :         if (!root_directory)
     565           0 :                 return;
     566             : 
     567             :         /* Keep only the entries located below the root directory, compacting the array in place. */
     568             : 
     569           0 :         for (i = 0; i < *n; i++) {
     570           0 :                 MountEntry *entry = m + i;
     571             : 
     572           0 :                 if (path_startswith(mount_entry_path(entry), root_directory)) {
     573           0 :                         m[kept++] = *entry;
     574           0 :                         continue;
     575             :                 }
     576             : 
     577           0 :                 log_debug("%s is outside of root directory.", mount_entry_path(entry));
     578           0 :                 mount_entry_done(entry);
     579             :         }
     580             : 
     581           0 :         *n = kept;
     582             : }
     583             : 
     584           0 : static int clone_device_node(
     585             :                 const char *d,
     586             :                 const char *temporary_mount,
     587             :                 bool *make_devnode) {
     588             : 
     589           0 :         _cleanup_free_ char *sl = NULL;
     590             :         const char *dn, *bn, *t;
     591             :         struct stat st;
     592             :         int r;
     593             :         /* Makes the device node 'd' available under the same path below 'temporary_mount'. If *make_devnode
                     :          * is true the node is recreated with mknod(); if that fails with EPERM, *make_devnode is set to
                     :          * false (so that later calls skip the attempt) and we fall back to bind mounting 'd' onto a
                     :          * placeholder file. For nodes below /dev/ a dev/char/MAJ:MIN or dev/block/MAJ:MIN symlink is
                     :          * created too (failure to create it is only logged). Returns 0 on success, -ENXIO if 'd' does
                     :          * not exist, and otherwise whatever log_debug_errno()/log_oom() return for the failure. */
     594           0 :         if (stat(d, &st) < 0) {
     595           0 :                 if (errno == ENOENT) {
     596           0 :                         log_debug_errno(errno, "Device node '%s' to clone does not exist, ignoring.", d);
     597           0 :                         return -ENXIO;
     598             :                 }
     599             : 
     600           0 :                 return log_debug_errno(errno, "Failed to stat() device node '%s' to clone, ignoring: %m", d);
     601             :         }
     602             : 
     603           0 :         if (!S_ISBLK(st.st_mode) &&
     604           0 :             !S_ISCHR(st.st_mode))
     605           0 :                 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
     606             :                                        "Device node '%s' to clone is not a device node, ignoring.",
     607             :                                        d);
     608             : 
     609           0 :         dn = strjoina(temporary_mount, d);
     610             : 
     611             :         /* First, try to create device node properly */
     612           0 :         if (*make_devnode) {
     613           0 :                 mac_selinux_create_file_prepare(d, st.st_mode);
     614           0 :                 r = mknod(dn, st.st_mode, st.st_rdev);
     615           0 :                 mac_selinux_create_file_clear();
     616           0 :                 if (r >= 0)
     617           0 :                         goto add_symlink;
     618           0 :                 if (errno != EPERM)
     619           0 :                         return log_debug_errno(errno, "mknod failed for %s: %m", d);
     620             : 
     621             :                 /* This didn't work, let's not try this again for the next iterations. */
     622           0 :                 *make_devnode = false;
     623             :         }
     624             : 
     625             :         /* We're about to fallback to bind-mounting the device
     626             :          * node. So create a dummy bind-mount target. */
     627           0 :         mac_selinux_create_file_prepare(d, 0);
     628           0 :         r = mknod(dn, S_IFREG, 0);
     629           0 :         mac_selinux_create_file_clear();
     630           0 :         if (r < 0 && errno != EEXIST)
     631           0 :                 return log_debug_errno(errno, "mknod() fallback failed for '%s': %m", d);
     632             : 
     633             :         /* Fallback to bind-mounting:
     634             :          * The assumption here is that all used device nodes carry standard
     635             :          * properties. Specifically, the device nodes we bind-mount should
     636             :          * either be owned by root:root or root:tty (e.g. /dev/tty, /dev/ptmx)
     637             :          * and should not carry ACLs. */
     638           0 :         if (mount(d, dn, NULL, MS_BIND, NULL) < 0)
     639           0 :                 return log_debug_errno(errno, "Bind mounting failed for '%s': %m", d);
     640             : 
     641           0 : add_symlink:
     642           0 :         bn = path_startswith(d, "/dev/");
     643           0 :         if (!bn)
     644           0 :                 return 0;
     645             : 
     646             :         /* Create symlinks like /dev/char/1:9 → ../urandom */
     647           0 :         if (asprintf(&sl, "%s/dev/%s/%u:%u", temporary_mount, S_ISCHR(st.st_mode) ? "char" : "block", major(st.st_rdev), minor(st.st_rdev)) < 0)
     648           0 :                 return log_oom();
     649             : 
     650           0 :         (void) mkdir_parents(sl, 0755);
     651             : 
     652           0 :         t = strjoina("../", bn);
     653             : 
     654           0 :         if (symlink(t, sl) < 0)
     655           0 :                 log_debug_errno(errno, "Failed to symlink '%s' to '%s', ignoring: %m", t, sl);
     656             : 
     657           0 :         return 0;
     658             : }
     659             : 
     660           0 : static int mount_private_dev(MountEntry *m) {
     661             :         static const char devnodes[] =
     662             :                 "/dev/null\0"
     663             :                 "/dev/zero\0"
     664             :                 "/dev/full\0"
     665             :                 "/dev/random\0"
     666             :                 "/dev/urandom\0"
     667             :                 "/dev/tty\0";
     668             : 
     669           0 :         char temporary_mount[] = "/tmp/namespace-dev-XXXXXX";
     670           0 :         const char *d, *dev = NULL, *devpts = NULL, *devshm = NULL, *devhugepages = NULL, *devmqueue = NULL, *devlog = NULL, *devptmx = NULL;
     671           0 :         bool can_mknod = true;
     672           0 :         _cleanup_umask_ mode_t u;
     673             :         int r;
     674             : 
     675           0 :         assert(m);
     676             :         /* Assembles a private /dev in a temporary directory and then moves it onto the path of 'm': a
                     :          * tmpfs with bind mounts of the host's /dev/pts and /dev/shm (failure is fatal), of
                     :          * /dev/mqueue and /dev/hugepages (failure is only logged), a /dev/ptmx symlink or clone, a
                     :          * /dev/log symlink and clones of the nodes listed in devnodes[]. Returns 0 on success; on
                     :          * failure the partially set up tree is torn down (best effort) and the error stored in 'r'
                     :          * is returned. */
     677           0 :         u = umask(0000);
     678             : 
     679           0 :         if (!mkdtemp(temporary_mount))
     680           0 :                 return log_debug_errno(errno, "Failed to create temporary directory '%s': %m", temporary_mount);
     681             : 
     682           0 :         dev = strjoina(temporary_mount, "/dev");
     683           0 :         (void) mkdir(dev, 0755);
     684           0 :         if (mount("tmpfs", dev, "tmpfs", DEV_MOUNT_OPTIONS, "mode=755") < 0) {
     685           0 :                 r = log_debug_errno(errno, "Failed to mount tmpfs on '%s': %m", dev);
     686           0 :                 goto fail;
     687             :         }
     688             : 
     689           0 :         devpts = strjoina(temporary_mount, "/dev/pts");
     690           0 :         (void) mkdir(devpts, 0755);
     691           0 :         if (mount("/dev/pts", devpts, NULL, MS_BIND, NULL) < 0) {
     692           0 :                 r = log_debug_errno(errno, "Failed to bind mount /dev/pts on '%s': %m", devpts);
     693           0 :                 goto fail;
     694             :         }
     695             : 
     696             :         /* /dev/ptmx can either be a device node or a symlink to /dev/pts/ptmx.
     697             :          * When /dev/ptmx is a device node, /dev/pts/ptmx has 000 permissions making it inaccessible.
     698             :          * Thus, in that case make a clone.
     699             :          * In nspawn and other containers it will be a symlink, in that case make it a symlink. */
     700           0 :         r = is_symlink("/dev/ptmx");
     701           0 :         if (r < 0) {
     702           0 :                 log_debug_errno(r, "Failed to detect whether /dev/ptmx is a symlink or not: %m");
     703           0 :                 goto fail;
     704           0 :         } else if (r > 0) {
     705           0 :                 devptmx = strjoina(temporary_mount, "/dev/ptmx");
     706           0 :                 if (symlink("pts/ptmx", devptmx) < 0) {
     707           0 :                         r = log_debug_errno(errno, "Failed to create a symlink '%s' to pts/ptmx: %m", devptmx);
     708           0 :                         goto fail;
     709             :                 }
     710             :         } else {
     711           0 :                 r = clone_device_node("/dev/ptmx", temporary_mount, &can_mknod);
     712           0 :                 if (r < 0)
     713           0 :                         goto fail;
     714             :         }
     715             : 
     716           0 :         devshm = strjoina(temporary_mount, "/dev/shm");
     717           0 :         (void) mkdir(devshm, 0755);
     718           0 :         r = mount("/dev/shm", devshm, NULL, MS_BIND, NULL);
     719           0 :         if (r < 0) {
     720           0 :                 r = log_debug_errno(errno, "Failed to bind mount /dev/shm on '%s': %m", devshm);
     721           0 :                 goto fail;
     722             :         }
     723             : 
     724           0 :         devmqueue = strjoina(temporary_mount, "/dev/mqueue");
     725           0 :         (void) mkdir(devmqueue, 0755);
     726           0 :         if (mount("/dev/mqueue", devmqueue, NULL, MS_BIND, NULL) < 0)
     727           0 :                 log_debug_errno(errno, "Failed to bind mount /dev/mqueue on '%s', ignoring: %m", devmqueue);
     728             : 
     729           0 :         devhugepages = strjoina(temporary_mount, "/dev/hugepages");
     730           0 :         (void) mkdir(devhugepages, 0755);
     731           0 :         if (mount("/dev/hugepages", devhugepages, NULL, MS_BIND, NULL) < 0)
     732           0 :                 log_debug_errno(errno, "Failed to bind mount /dev/hugepages on '%s', ignoring: %m", devhugepages);
     733             : 
     734           0 :         devlog = strjoina(temporary_mount, "/dev/log");
     735           0 :         if (symlink("/run/systemd/journal/dev-log", devlog) < 0)
     736           0 :                 log_debug_errno(errno, "Failed to create a symlink '%s' to /run/systemd/journal/dev-log, ignoring: %m", devlog);
     737             : 
     738           0 :         NULSTR_FOREACH(d, devnodes) {
     739           0 :                 r = clone_device_node(d, temporary_mount, &can_mknod);
     740             :                 /* -ENXIO means the *source* device node does not exist, skip creation in that case */
     741           0 :                 if (r < 0 && r != -ENXIO)
     742           0 :                         goto fail;
     743             :         }
     744             : 
     745           0 :         r = dev_setup(temporary_mount, UID_INVALID, GID_INVALID);
     746           0 :         if (r < 0)
     747           0 :                 log_debug_errno(r, "Failed to setup basic device tree at '%s', ignoring: %m", temporary_mount);
     748             : 
     749             :         /* Create the /dev directory if missing. It is more likely to be
     750             :          * missing when the service is started with RootDirectory. This is
     751             :          * consistent with mount units creating the mount points when missing.
     752             :          */
     753           0 :         (void) mkdir_p_label(mount_entry_path(m), 0755);
     754             : 
     755             :         /* Unmount everything in old /dev */
     756           0 :         r = umount_recursive(mount_entry_path(m), 0);
     757           0 :         if (r < 0)
     758           0 :                 log_debug_errno(r, "Failed to unmount directories below '%s', ignoring: %m", mount_entry_path(m));
     759             : 
     760           0 :         if (mount(dev, mount_entry_path(m), NULL, MS_MOVE, NULL) < 0) {
     761           0 :                 r = log_debug_errno(errno, "Failed to move mount point '%s' to '%s': %m", dev, mount_entry_path(m));
     762           0 :                 goto fail;
     763             :         }
     764             : 
     765           0 :         (void) rmdir(dev);
     766           0 :         (void) rmdir(temporary_mount);
     767             : 
     768           0 :         return 0;
     769             : 
     770           0 : fail: /* best-effort teardown; every jump here has already stored the error to return in 'r' */
     771           0 :         if (devpts)
     772           0 :                 (void) umount(devpts);
     773             : 
     774           0 :         if (devshm)
     775           0 :                 (void) umount(devshm);
     776             : 
     777           0 :         if (devhugepages)
     778           0 :                 (void) umount(devhugepages);
     779             : 
     780           0 :         if (devmqueue)
     781           0 :                 (void) umount(devmqueue);
     782             :         /* NOTE(review): bind mounts that clone_device_node() may have created below 'dev' are not undone
                     :          * here; confirm that the umount() of 'dev' below can still succeed in that case. */
     783           0 :         (void) umount(dev);
     784           0 :         (void) rmdir(dev);
     785           0 :         (void) rmdir(temporary_mount);
     786             : 
     787           0 :         return r;
     788             : }
     789             : 
     790           0 : static int mount_bind_dev(const MountEntry *m) {
     791             :         const char *where;
     792             :         int r;
     793             : 
     794           0 :         assert(m);
     795           0 :         where = mount_entry_path(m);
     796             : 
     797             :         /* Little brother of mount_private_dev(): bind mounts the host's /dev recursively onto the entry's path.
     798             :          * Only used when RootDirectory= is set. Returns 1 if we mounted, 0 if the path already was a mount point. */
     799             : 
     800           0 :         (void) mkdir_p_label(where, 0755);
     801           0 :         r = path_is_mount_point(where, NULL, 0);
     802           0 :         if (r < 0)
     803           0 :                 return log_debug_errno(r, "Unable to determine whether /dev is already mounted: %m");
     804           0 :         if (r > 0)
     805           0 :                 return 0; /* /dev is already a mount point: nothing to do */
     806             : 
     807           0 :         if (mount("/dev", where, NULL, MS_BIND|MS_REC, NULL) >= 0)
     808           0 :                 return 1;
     809           0 :         return log_debug_errno(errno, "Failed to bind mount %s: %m", where);
     810             : }
     811             : 
     812           0 : static int mount_sysfs(const MountEntry *m) {
     813             :         const char *where;
     814             :         int r;
     815             : 
     816           0 :         assert(m);
     817           0 :         where = mount_entry_path(m);
     818           0 :         (void) mkdir_p_label(where, 0755);
     819             : 
     820           0 :         r = path_is_mount_point(where, NULL, 0);
     821           0 :         if (r < 0)
     822           0 :                 return log_debug_errno(r, "Unable to determine whether /sys is already mounted: %m");
     823           0 :         if (r > 0)
     824           0 :                 return 0; /* /sys is already a mount point: nothing to do */
     825             : 
     826             :         /* Otherwise bind mount the host's /sys recursively, so that its child mounts come along, too. */
     827           0 :         if (mount("/sys", where, NULL, MS_BIND|MS_REC, NULL) >= 0)
     828           0 :                 return 1;
     829           0 :         return log_debug_errno(errno, "Failed to mount %s: %m", where);
     830             : }
     831             : 
     832           0 : static int mount_procfs(const MountEntry *m) {
     833             :         const char *where;
     834             :         int r;
     835             : 
     836           0 :         assert(m);
     837           0 :         where = mount_entry_path(m);
     838           0 :         (void) mkdir_p_label(where, 0755);
     839             : 
     840           0 :         r = path_is_mount_point(where, NULL, 0);
     841           0 :         if (r < 0)
     842           0 :                 return log_debug_errno(r, "Unable to determine whether /proc is already mounted: %m");
     843           0 :         if (r > 0)
     844           0 :                 return 0; /* /proc is already a mount point: nothing to do */
     845             : 
     846             :         /* Otherwise mount a fresh instance, so that it matches our user namespace, if we are running in one. */
     847           0 :         if (mount("proc", where, "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL) >= 0)
     848           0 :                 return 1;
     849           0 :         return log_debug_errno(errno, "Failed to mount %s: %m", where);
     850             : }
     851             : 
     852           0 : static int mount_tmpfs(const MountEntry *m) {
     853             :         const char *where;
     854             : 
     855           0 :         assert(m);
     856           0 :         where = mount_entry_path(m);
     857             : 
     858             :         /* Make sure the directory exists, drop whatever is mounted below it, then overmount a fresh tmpfs. */
     859           0 :         (void) mkdir_p_label(where, 0755);
     860           0 :         (void) umount_recursive(where, 0);
     861           0 :         if (mount("tmpfs", where, "tmpfs", m->flags, mount_entry_options(m)) >= 0)
     862           0 :                 return 1;
     863           0 :         return log_debug_errno(errno, "Failed to mount %s: %m", where);
     864             : }
     865             : 
     866           0 : static int follow_symlink(
     867             :                 const char *root_directory,
     868             :                 MountEntry *m) {
     869             : 
     870           0 :         _cleanup_free_ char *resolved = NULL;
     871             :         int r;
     872             : 
     873             :         /* Resolves at most one symlink of the entry's path per call (CHASE_STEP), as the place a symlink points to
     874             :          * may change the order in which things have to be mounted; the caller is expected to call us again until
     875             :          * we are done. Returns 0 if one step was resolved and the entry's path was replaced, > 0 if the path
     876             :          * already is fully normalized, and a negative error otherwise. */
     877             : 
     878           0 :         r = chase_symlinks(mount_entry_path(m), root_directory, CHASE_STEP|CHASE_NONEXISTENT, &resolved);
     879           0 :         if (r < 0)
     880           0 :                 return log_debug_errno(r, "Failed to chase symlinks '%s': %m", mount_entry_path(m));
     881           0 :         if (r > 0)
     882           0 :                 return 1; /* end of the chain reached, nothing left to resolve */
     883             : 
     884             :         /* One more step was resolved: refuse to go on once the per-entry limit has been reached. */
     885           0 :         if (m->n_followed >= CHASE_SYMLINKS_MAX)
     886           0 :                 return log_debug_errno(SYNTHETIC_ERRNO(ELOOP),
     887             :                                        "Symlink loop on '%s'.", mount_entry_path(m));
     888             : 
     889           0 :         log_debug("Followed mount entry path symlink %s → %s.", mount_entry_path(m), resolved);
     890             : 
     891             :         /* Adopt the resolved path as the entry's new path and account for the step taken. */
     892           0 :         free_and_replace(m->path_malloc, resolved);
     893           0 :         m->n_followed++;
     894           0 :         m->has_prefix = true;
     895             : 
     896           0 :         return 0;
     897             : }
     898             : 
     899           0 : static int apply_mount(
     900             :                 const char *root_directory,
     901             :                 MountEntry *m) {
     902             : 
     903           0 :         bool rbind = true, make = false;
     904             :         const char *what;
     905             :         int r;
     906             : 
     907           0 :         assert(m);
     908             :         /* Sets up the mount for a single entry, depending on m->mode. The tmpfs, /dev, /sys and /proc modes
                     :          * are delegated to their helpers, whose return value is passed through. All other modes end in
                     :          * a bind mount of 'what' onto the entry's path: 'rbind' selects MS_REC, and 'make' allows
                     :          * creating a missing destination node and retrying the mount once. Returns 0 if there was
                     :          * nothing to do or the bind mount succeeded, and a negative error otherwise. */
     909           0 :         log_debug("Applying namespace mount on %s", mount_entry_path(m));
     910             : 
     911           0 :         switch (m->mode) {
     912             : 
     913           0 :         case INACCESSIBLE: {
     914             :                 struct stat target;
     915             : 
     916             :                 /* First, get rid of everything that is below if there
     917             :                  * is anything... Then, overmount it with an
     918             :                  * inaccessible path. */
     919           0 :                 (void) umount_recursive(mount_entry_path(m), 0);
     920             : 
     921           0 :                 if (lstat(mount_entry_path(m), &target) < 0) {
     922           0 :                         if (errno == ENOENT && m->ignore)
     923           0 :                                 return 0;
     924             : 
     925           0 :                         return log_debug_errno(errno, "Failed to lstat() %s to determine what to mount over it: %m", mount_entry_path(m));
     926             :                 }
     927             : 
     928           0 :                 what = mode_to_inaccessible_node(target.st_mode);
     929           0 :                 if (!what)
     930           0 :                         return log_debug_errno(SYNTHETIC_ERRNO(ELOOP),
     931             :                                                "File type not supported for inaccessible mounts. Note that symlinks are not allowed");
     932           0 :                 break;
     933             :         }
     934             : 
     935           0 :         case READONLY:
     936             :         case READWRITE:
     937             :         case READWRITE_IMPLICIT:
     938           0 :                 r = path_is_mount_point(mount_entry_path(m), root_directory, 0);
     939           0 :                 if (r == -ENOENT && m->ignore)
     940           0 :                         return 0;
     941           0 :                 if (r < 0)
     942           0 :                         return log_debug_errno(r, "Failed to determine whether %s is already a mount point: %m", mount_entry_path(m));
     943           0 :                 if (r > 0) /* Nothing to do here, it is already a mount. We just later toggle the MS_RDONLY bit for the mount point if needed. */
     944           0 :                         return 0;
     945             :                 /* This isn't a mount point yet, let's make it one. */
     946           0 :                 what = mount_entry_path(m);
     947           0 :                 break;
     948             : 
     949           0 :         case BIND_MOUNT:
     950           0 :                 rbind = false;
     951             : 
     952             :                 _fallthrough_;
     953           0 :         case BIND_MOUNT_RECURSIVE: {
     954           0 :                 _cleanup_free_ char *chased = NULL;
     955             : 
     956             :                 /* Since mount() will always follow symlinks we chase the symlinks on our own first. Note that bind
     957             :                  * mount source paths are always relative to the host root, hence we pass NULL as root directory to
     958             :                  * chase_symlinks() here. */
     959             : 
     960           0 :                 r = chase_symlinks(mount_entry_source(m), NULL, CHASE_TRAIL_SLASH, &chased);
     961           0 :                 if (r == -ENOENT && m->ignore) {
     962           0 :                         log_debug_errno(r, "Path %s does not exist, ignoring.", mount_entry_source(m));
     963           0 :                         return 0;
     964             :                 }
     965           0 :                 if (r < 0)
     966           0 :                         return log_debug_errno(r, "Failed to follow symlinks on %s: %m", mount_entry_source(m));
     967             : 
     968           0 :                 log_debug("Followed source symlinks %s → %s.", mount_entry_source(m), chased);
     969             : 
     970           0 :                 free_and_replace(m->source_malloc, chased);
     971             : 
     972           0 :                 what = mount_entry_source(m);
     973           0 :                 make = true;
     974           0 :                 break;
     975             :         }
     976             : 
     977           0 :         case EMPTY_DIR:
     978             :         case TMPFS:
     979           0 :                 return mount_tmpfs(m);
     980             : 
     981           0 :         case PRIVATE_TMP:
     982           0 :                 what = mount_entry_source(m);
     983           0 :                 make = true;
     984           0 :                 break;
     985             : 
     986           0 :         case PRIVATE_DEV:
     987           0 :                 return mount_private_dev(m);
     988             : 
     989           0 :         case BIND_DEV:
     990           0 :                 return mount_bind_dev(m);
     991             : 
     992           0 :         case SYSFS:
     993           0 :                 return mount_sysfs(m);
     994             : 
     995           0 :         case PROCFS:
     996           0 :                 return mount_procfs(m);
     997             : 
     998           0 :         default:
     999           0 :                 assert_not_reached("Unknown mode");
    1000             :         }
    1001             : 
    1002           0 :         assert(what);
    1003             : 
    1004           0 :         if (mount(what, mount_entry_path(m), NULL, MS_BIND|(rbind ? MS_REC : 0), NULL) < 0) {
    1005           0 :                 bool try_again = false;
    1006           0 :                 r = -errno; /* remember the mount() error, the calls below may overwrite errno */
    1007             : 
    1008           0 :                 if (r == -ENOENT && make) {
    1009             :                         struct stat st;
    1010             : 
    1011             :                         /* Hmm, either the source or the destination are missing. Let's see if we can create the destination, then try again */
    1012             : 
    1013           0 :                         if (stat(what, &st) < 0)
    1014           0 :                                 log_debug_errno(errno, "Mount point source '%s' is not accessible: %m", what);
    1015             :                         else {
    1016             :                                 int q;
    1017             : 
    1018           0 :                                 (void) mkdir_parents(mount_entry_path(m), 0755);
    1019             : 
    1020           0 :                                 if (S_ISDIR(st.st_mode))
    1021           0 :                                         q = mkdir(mount_entry_path(m), 0755) < 0 ? -errno : 0;
    1022             :                                 else
    1023           0 :                                         q = touch(mount_entry_path(m));
    1024             : 
    1025           0 :                                 if (q < 0)
    1026           0 :                                         log_debug_errno(q, "Failed to create destination mount point node '%s': %m", mount_entry_path(m));
    1027             :                                 else
    1028           0 :                                         try_again = true;
    1029             :                         }
    1030             :                 }
    1031             : 
    1032           0 :                 if (try_again) {
    1033           0 :                         if (mount(what, mount_entry_path(m), NULL, MS_BIND|(rbind ? MS_REC : 0), NULL) < 0)
    1034           0 :                                 r = -errno;
    1035             :                         else
    1036           0 :                                 r = 0;
    1037             :                 }
    1038             : 
    1039           0 :                 if (r < 0)
    1040           0 :                         return log_debug_errno(r, "Failed to mount %s to %s: %m", what, mount_entry_path(m));
    1041             :         }
    1042             : 
    1043           0 :         log_debug("Successfully mounted %s to %s", what, mount_entry_path(m));
    1044           0 :         return 0;
    1045             : }
    1046             : 
    1047             : /* Changes the per-mount flags of an existing mount: flags_mask bits are dropped from orig_flags, new_flags are added */
    1048           0 : static int bind_remount_one(const char *path, unsigned long orig_flags, unsigned long new_flags, unsigned long flags_mask) {
    1049           0 :         unsigned long remount_flags = (orig_flags & ~flags_mask) | MS_REMOUNT | MS_BIND | new_flags;
    1050             : 
    1051             :         /* 0 on success, negative errno otherwise */
    1052           0 :         return mount(NULL, path, NULL, remount_flags, NULL) < 0 ? -errno : 0;
    1053             : }
    1054             : /* Turns on MS_RDONLY and/or MS_NOSUID for the mount entry 'm' by bind-remounting it, recursively where needed. Returns 0 if there is nothing to change, on success, or if the remount fails with ENOENT while m->ignore is set; otherwise a negative errno-style value. */
    1055           0 : static int make_read_only(const MountEntry *m, char **blacklist, FILE *proc_self_mountinfo) {
    1056           0 :         unsigned long new_flags = 0, flags_mask = 0;
    1057           0 :         bool submounts = false;
    1058           0 :         int r = 0;
    1059             : 
    1060           0 :         assert(m);
    1061           0 :         assert(proc_self_mountinfo);
    1062             :         /* Collect the flags to turn on; flags_mask always mirrors new_flags, so only these bits are touched. */
    1063           0 :         if (mount_entry_read_only(m) || m->mode == PRIVATE_DEV) {
    1064           0 :                 new_flags |= MS_RDONLY;
    1065           0 :                 flags_mask |= MS_RDONLY;
    1066             :         }
    1067             : 
    1068           0 :         if (m->nosuid) {
    1069           0 :                 new_flags |= MS_NOSUID;
    1070           0 :                 flags_mask |= MS_NOSUID;
    1071             :         }
    1072             : 
    1073           0 :         if (flags_mask == 0) /* No Change? */
    1074           0 :                 return 0;
    1075             : 
    1076             :         /* We generally apply these changes recursively, except for /dev, and the cases we know there's
    1077             :          * nothing further down.  Set /dev readonly, but not submounts like /dev/shm. Also, we only set the
    1078             :          * per-mount read-only flag.  We can't set it on the superblock, if we are inside a user namespace
    1079             :          * and running Linux <= 4.17. */
    1080           0 :         submounts =
    1081           0 :                 mount_entry_read_only(m) &&
    1082           0 :                 !IN_SET(m->mode, EMPTY_DIR, TMPFS);
    1083           0 :         if (submounts)
    1084           0 :                 r = bind_remount_recursive_with_mountinfo(mount_entry_path(m), new_flags, flags_mask, blacklist, proc_self_mountinfo);
    1085             :         else
    1086           0 :                 r = bind_remount_one(mount_entry_path(m), m->flags, new_flags, flags_mask);
    1087             : 
    1088             :         /* Note that we only turn on the MS_RDONLY flag here, we never turn it off. Something that was marked
    1089             :          * read-only already stays this way. This improves compatibility with container managers, where we
    1090             :          * won't attempt to undo read-only mounts already applied. */
    1091             :         /* A missing path is not an error for entries flagged as ignorable. */
    1092           0 :         if (r == -ENOENT && m->ignore)
    1093           0 :                 return 0;
    1094           0 :         if (r < 0)
    1095           0 :                 return log_debug_errno(r, "Failed to re-mount '%s'%s: %m", mount_entry_path(m),
    1096             :                                        submounts ? " and its submounts" : "");
    1097           0 :         return 0;
    1098             : }
    1099             : /* Returns true if the API VFS mounts are wanted: either requested directly via mount_apivfs, or implied by protect_control_groups or protect_kernel_tunables. */
    1100           0 : static bool namespace_info_mount_apivfs(const NamespaceInfo *ns_info) {
    1101           0 :         assert(ns_info);
    1102             : 
    1103             :         /*
    1104             :          * ProtectControlGroups= and ProtectKernelTunables= imply MountAPIVFS=,
    1105             :          * since to protect the API VFS mounts, they need to be around in the
    1106             :          * first place...
    1107             :          */
    1108             : 
    1109           0 :         return ns_info->mount_apivfs ||
    1110           0 :                 ns_info->protect_control_groups ||
    1111             :                 ns_info->protect_kernel_tunables;
    1112             : }
    1113             : /* Counts the mount entries that the given settings expand to. setup_namespace() uses the result to size its MountEntry array and later asserts that exactly this many entries were written, so every term here has to match what is appended there. */
    1114           0 : static size_t namespace_calculate_mounts(
    1115             :                 const NamespaceInfo *ns_info,
    1116             :                 char** read_write_paths,
    1117             :                 char** read_only_paths,
    1118             :                 char** inaccessible_paths,
    1119             :                 char** empty_directories,
    1120             :                 size_t n_bind_mounts,
    1121             :                 size_t n_temporary_filesystems,
    1122             :                 const char* tmp_dir,
    1123             :                 const char* var_tmp_dir,
    1124             :                 ProtectHome protect_home,
    1125             :                 ProtectSystem protect_system) {
    1126             :         /* Size of the table selected by protect_system, or 0 if none of the three modes below applies. */
    1127             :         size_t protect_home_cnt;
    1128           0 :         size_t protect_system_cnt =
    1129             :                 (protect_system == PROTECT_SYSTEM_STRICT ?
    1130           0 :                  ELEMENTSOF(protect_system_strict_table) :
    1131             :                  ((protect_system == PROTECT_SYSTEM_FULL) ?
    1132           0 :                   ELEMENTSOF(protect_system_full_table) :
    1133             :                   ((protect_system == PROTECT_SYSTEM_YES) ?
    1134           0 :                    ELEMENTSOF(protect_system_yes_table) : 0)));
    1135             :         /* Likewise for protect_home. */
    1136           0 :         protect_home_cnt =
    1137             :                 (protect_home == PROTECT_HOME_YES ?
    1138           0 :                  ELEMENTSOF(protect_home_yes_table) :
    1139             :                  ((protect_home == PROTECT_HOME_READ_ONLY) ?
    1140           0 :                   ELEMENTSOF(protect_home_read_only_table) :
    1141             :                   ((protect_home == PROTECT_HOME_TMPFS) ?
    1142           0 :                    ELEMENTSOF(protect_home_tmpfs_table) : 0)));
    1143             :         /* One entry per tmp dir, per list element and per bind/tmpfs mount, plus the table sizes of the enabled protections; private_dev is added as-is (presumably a bool, i.e. 0 or 1 - confirm), protect_hostname accounts for two entries. */
    1144           0 :         return !!tmp_dir + !!var_tmp_dir +
    1145           0 :                 strv_length(read_write_paths) +
    1146           0 :                 strv_length(read_only_paths) +
    1147           0 :                 strv_length(inaccessible_paths) +
    1148           0 :                 strv_length(empty_directories) +
    1149           0 :                 n_bind_mounts +
    1150           0 :                 n_temporary_filesystems +
    1151           0 :                 ns_info->private_dev +
    1152           0 :                 (ns_info->protect_kernel_tunables ? ELEMENTSOF(protect_kernel_tunables_table) : 0) +
    1153           0 :                 (ns_info->protect_control_groups ? 1 : 0) +
    1154           0 :                 (ns_info->protect_kernel_modules ? ELEMENTSOF(protect_kernel_modules_table) : 0) +
    1155           0 :                 protect_home_cnt + protect_system_cnt +
    1156           0 :                 (ns_info->protect_hostname ? 2 : 0) +
    1157           0 :                 (namespace_info_mount_apivfs(ns_info) ? ELEMENTSOF(apivfs_table) : 0);
    1158             : }
    1159             : /* Sorts the mount list with mount_path_compare and then runs the drop_*() filters over it. The filters receive n_mounts by pointer (presumably so they can shrink the count as they remove entries - confirm against their definitions). */
    1160           0 : static void normalize_mounts(const char *root_directory, MountEntry *mounts, size_t *n_mounts) {
    1161           0 :         assert(root_directory);
    1162           0 :         assert(n_mounts);
    1163           0 :         assert(mounts || *n_mounts == 0);
    1164             : 
    1165           0 :         typesafe_qsort(mounts, *n_mounts, mount_path_compare);
    1166             :         /* NOTE(review): the filters run on the sorted list; whether they depend on that order is not visible here. */
    1167           0 :         drop_duplicates(mounts, n_mounts);
    1168           0 :         drop_outside_root(root_directory, mounts, n_mounts);
    1169           0 :         drop_inaccessible(mounts, n_mounts);
    1170           0 :         drop_nop(mounts, n_mounts);
    1171           0 : }
    1172             : /* Builds a private mount namespace for the calling process: collects all requested mounts into one array, unshares CLONE_NEWNS, prepares the new root (root image, root directory, or a bind mount of '/' onto /run/systemd/unit-root), applies the mounts, flips the read-only/nosuid bits, moves the root and finally applies mount_flags to '/'. Returns 0 on success and a negative errno-style value on failure (-ENOANO when unshare() fails with EACCES, EPERM, EOPNOTSUPP or ENOSYS). If error_path is non-NULL it may receive a strdup()ed copy of the path that failed. */
    1173           0 : int setup_namespace(
    1174             :                 const char* root_directory,
    1175             :                 const char* root_image,
    1176             :                 const NamespaceInfo *ns_info,
    1177             :                 char** read_write_paths,
    1178             :                 char** read_only_paths,
    1179             :                 char** inaccessible_paths,
    1180             :                 char** empty_directories,
    1181             :                 const BindMount *bind_mounts,
    1182             :                 size_t n_bind_mounts,
    1183             :                 const TemporaryFileSystem *temporary_filesystems,
    1184             :                 size_t n_temporary_filesystems,
    1185             :                 const char* tmp_dir,
    1186             :                 const char* var_tmp_dir,
    1187             :                 ProtectHome protect_home,
    1188             :                 ProtectSystem protect_system,
    1189             :                 unsigned long mount_flags,
    1190             :                 DissectImageFlags dissect_image_flags,
    1191             :                 char **error_path) {
    1192             : 
    1193           0 :         _cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL;
    1194           0 :         _cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL;
    1195           0 :         _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
    1196           0 :         _cleanup_free_ void *root_hash = NULL;
    1197           0 :         MountEntry *m = NULL, *mounts = NULL;
    1198           0 :         size_t n_mounts, root_hash_size = 0;
    1199           0 :         bool require_prefix = false;
    1200             :         const char *root;
    1201           0 :         int r = 0;
    1202             : 
    1203           0 :         assert(ns_info);
    1204             :         /* Fall back to MS_SHARED if the caller did not specify any mount flags. */
    1205           0 :         if (mount_flags == 0)
    1206           0 :                 mount_flags = MS_SHARED;
    1207             : 
    1208           0 :         if (root_image) {
    1209           0 :                 dissect_image_flags |= DISSECT_IMAGE_REQUIRE_ROOT;
    1210             :                 /* Request a read-only image only for PROTECT_SYSTEM_STRICT combined with some home protection and no read-write paths. */
    1211           0 :                 if (protect_system == PROTECT_SYSTEM_STRICT &&
    1212           0 :                     protect_home != PROTECT_HOME_NO &&
    1213           0 :                     strv_isempty(read_write_paths))
    1214           0 :                         dissect_image_flags |= DISSECT_IMAGE_READ_ONLY;
    1215             : 
    1216           0 :                 r = loop_device_make_by_path(root_image,
    1217           0 :                                              dissect_image_flags & DISSECT_IMAGE_READ_ONLY ? O_RDONLY : O_RDWR,
    1218             :                                              &loop_device);
    1219           0 :                 if (r < 0)
    1220           0 :                         return log_debug_errno(r, "Failed to create loop device for root image: %m");
    1221             : 
    1222           0 :                 r = root_hash_load(root_image, &root_hash, &root_hash_size);
    1223           0 :                 if (r < 0)
    1224           0 :                         return log_debug_errno(r, "Failed to load root hash: %m");
    1225             : 
    1226           0 :                 r = dissect_image(loop_device->fd, root_hash, root_hash_size, dissect_image_flags, &dissected_image);
    1227           0 :                 if (r < 0)
    1228           0 :                         return log_debug_errno(r, "Failed to dissect image: %m");
    1229             : 
    1230           0 :                 r = dissected_image_decrypt(dissected_image, NULL, root_hash, root_hash_size, dissect_image_flags, &decrypted_image);
    1231           0 :                 if (r < 0)
    1232           0 :                         return log_debug_errno(r, "Failed to decrypt dissected image: %m");
    1233             :         }
    1234             : 
    1235           0 :         if (root_directory)
    1236           0 :                 root = root_directory;
    1237             :         else {
    1238             :                 /* Always create the mount namespace in a temporary directory, instead of operating
    1239             :                  * directly in the root. The temporary directory prevents any mounts from being
    1240             :                  * potentially obscured by other mounts we already applied.
    1241             :                  * We use the same mount point for all images, which is safe, since they all live
    1242             :                  * in their own namespaces after all, and hence won't see each other. */
    1243             : 
    1244           0 :                 root = "/run/systemd/unit-root";
    1245           0 :                 (void) mkdir_label(root, 0700);
    1246           0 :                 require_prefix = true;
    1247             :         }
    1248             : 
    1249           0 :         n_mounts = namespace_calculate_mounts(
    1250             :                         ns_info,
    1251             :                         read_write_paths,
    1252             :                         read_only_paths,
    1253             :                         inaccessible_paths,
    1254             :                         empty_directories,
    1255             :                         n_bind_mounts,
    1256             :                         n_temporary_filesystems,
    1257             :                         tmp_dir, var_tmp_dir,
    1258             :                         protect_home, protect_system);
    1259             : 
    1260           0 :         if (n_mounts > 0) {
    1261           0 :                 m = mounts = new0(MountEntry, n_mounts);
    1262           0 :                 if (!mounts)
    1263           0 :                         return -ENOMEM;
    1264             :                 /* Fill the array: 'm' is the write cursor, handed by pointer to the append_*() helpers and advanced directly for the inline entries below. */
    1265           0 :                 r = append_access_mounts(&m, read_write_paths, READWRITE, require_prefix);
    1266           0 :                 if (r < 0)
    1267           0 :                         goto finish;
    1268             : 
    1269           0 :                 r = append_access_mounts(&m, read_only_paths, READONLY, require_prefix);
    1270           0 :                 if (r < 0)
    1271           0 :                         goto finish;
    1272             : 
    1273           0 :                 r = append_access_mounts(&m, inaccessible_paths, INACCESSIBLE, require_prefix);
    1274           0 :                 if (r < 0)
    1275           0 :                         goto finish;
    1276             : 
    1277           0 :                 r = append_empty_dir_mounts(&m, empty_directories);
    1278           0 :                 if (r < 0)
    1279           0 :                         goto finish;
    1280             : 
    1281           0 :                 r = append_bind_mounts(&m, bind_mounts, n_bind_mounts);
    1282           0 :                 if (r < 0)
    1283           0 :                         goto finish;
    1284             : 
    1285           0 :                 r = append_tmpfs_mounts(&m, temporary_filesystems, n_temporary_filesystems);
    1286           0 :                 if (r < 0)
    1287           0 :                         goto finish;
    1288             : 
    1289           0 :                 if (tmp_dir) {
    1290           0 :                         *(m++) = (MountEntry) {
    1291             :                                 .path_const = "/tmp",
    1292             :                                 .mode = PRIVATE_TMP,
    1293             :                                 .source_const = tmp_dir,
    1294             :                         };
    1295             :                 }
    1296             : 
    1297           0 :                 if (var_tmp_dir) {
    1298           0 :                         *(m++) = (MountEntry) {
    1299             :                                 .path_const = "/var/tmp",
    1300             :                                 .mode = PRIVATE_TMP,
    1301             :                                 .source_const = var_tmp_dir,
    1302             :                         };
    1303             :                 }
    1304             : 
    1305           0 :                 if (ns_info->private_dev) {
    1306           0 :                         *(m++) = (MountEntry) {
    1307             :                                 .path_const = "/dev",
    1308             :                                 .mode = PRIVATE_DEV,
    1309             :                                 .flags = DEV_MOUNT_OPTIONS,
    1310             :                         };
    1311             :                 }
    1312             : 
    1313           0 :                 if (ns_info->protect_kernel_tunables) {
    1314           0 :                         r = append_static_mounts(&m, protect_kernel_tunables_table, ELEMENTSOF(protect_kernel_tunables_table), ns_info->ignore_protect_paths);
    1315           0 :                         if (r < 0)
    1316           0 :                                 goto finish;
    1317             :                 }
    1318             : 
    1319           0 :                 if (ns_info->protect_kernel_modules) {
    1320           0 :                         r = append_static_mounts(&m, protect_kernel_modules_table, ELEMENTSOF(protect_kernel_modules_table), ns_info->ignore_protect_paths);
    1321           0 :                         if (r < 0)
    1322           0 :                                 goto finish;
    1323             :                 }
    1324             : 
    1325           0 :                 if (ns_info->protect_control_groups) {
    1326           0 :                         *(m++) = (MountEntry) {
    1327             :                                 .path_const = "/sys/fs/cgroup",
    1328             :                                 .mode = READONLY,
    1329             :                         };
    1330             :                 }
    1331             : 
    1332           0 :                 r = append_protect_home(&m, protect_home, ns_info->ignore_protect_paths);
    1333           0 :                 if (r < 0)
    1334           0 :                         goto finish;
    1335             : 
    1336           0 :                 r = append_protect_system(&m, protect_system, false);
    1337           0 :                 if (r < 0)
    1338           0 :                         goto finish;
    1339             : 
    1340           0 :                 if (namespace_info_mount_apivfs(ns_info)) {
    1341           0 :                         r = append_static_mounts(&m, apivfs_table, ELEMENTSOF(apivfs_table), ns_info->ignore_protect_paths);
    1342           0 :                         if (r < 0)
    1343           0 :                                 goto finish;
    1344             :                 }
    1345             : 
    1346           0 :                 if (ns_info->protect_hostname) {
    1347           0 :                         *(m++) = (MountEntry) {
    1348             :                                 .path_const = "/proc/sys/kernel/hostname",
    1349             :                                 .mode = READONLY,
    1350             :                         };
    1351           0 :                         *(m++) = (MountEntry) {
    1352             :                                 .path_const = "/proc/sys/kernel/domainname",
    1353             :                                 .mode = READONLY,
    1354             :                         };
    1355             :                 }
    1356             :                 /* The number of entries written has to match what namespace_calculate_mounts() predicted. */
    1357           0 :                 assert(mounts + n_mounts == m);
    1358             : 
    1359             :                 /* Prepend the root directory where that's necessary */
    1360           0 :                 r = prefix_where_needed(mounts, n_mounts, root);
    1361           0 :                 if (r < 0)
    1362           0 :                         goto finish;
    1363             : 
    1364           0 :                 normalize_mounts(root, mounts, &n_mounts);
    1365             :         }
    1366             : 
    1367             :         /* All above is just preparation, figuring out what to do. Let's now actually start doing something. */
    1368             : 
    1369           0 :         if (unshare(CLONE_NEWNS) < 0) {
    1370           0 :                 r = log_debug_errno(errno, "Failed to unshare the mount namespace: %m");
    1371           0 :                 if (IN_SET(r, -EACCES, -EPERM, -EOPNOTSUPP, -ENOSYS))
    1372             :                         /* If the kernel doesn't support namespaces, or when there's a MAC or seccomp filter in place
    1373             :                          * that doesn't allow us to create namespaces (or a missing cap), then propagate a recognizable
    1374             :                          * error back, which the caller can use to detect this case (and only this) and optionally
    1375             :                          * continue without namespacing applied. */
    1376           0 :                         r = -ENOANO;
    1377             : 
    1378           0 :                 goto finish;
    1379             :         }
    1380             : 
    1381             :         /* Remount / as SLAVE so that nothing now mounted in the namespace
    1382             :          * shows up in the parent */
    1383           0 :         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
    1384           0 :                 r = log_debug_errno(errno, "Failed to remount '/' as SLAVE: %m");
    1385           0 :                 goto finish;
    1386             :         }
    1387             : 
    1388           0 :         if (root_image) {
    1389             :                 /* A root image is specified, mount it to the right place */
    1390           0 :                 r = dissected_image_mount(dissected_image, root, UID_INVALID, dissect_image_flags);
    1391           0 :                 if (r < 0) {
    1392           0 :                         log_debug_errno(r, "Failed to mount root image: %m");
    1393           0 :                         goto finish;
    1394             :                 }
    1395             : 
    1396           0 :                 if (decrypted_image) {
    1397           0 :                         r = decrypted_image_relinquish(decrypted_image);
    1398           0 :                         if (r < 0) {
    1399           0 :                                 log_debug_errno(r, "Failed to relinquish decrypted image: %m");
    1400           0 :                                 goto finish;
    1401             :                         }
    1402             :                 }
    1403             : 
    1404           0 :                 loop_device_relinquish(loop_device);
    1405             : 
    1406           0 :         } else if (root_directory) {
    1407             : 
    1408             :                 /* A root directory is specified. Turn its directory into bind mount, if it isn't one yet. */
    1409           0 :                 r = path_is_mount_point(root, NULL, AT_SYMLINK_FOLLOW);
    1410           0 :                 if (r < 0) {
    1411           0 :                         log_debug_errno(r, "Failed to detect that %s is a mount point or not: %m", root);
    1412           0 :                         goto finish;
    1413             :                 }
    1414           0 :                 if (r == 0) {
    1415           0 :                         if (mount(root, root, NULL, MS_BIND|MS_REC, NULL) < 0) {
    1416           0 :                                 r = log_debug_errno(errno, "Failed to bind mount '%s': %m", root);
    1417           0 :                                 goto finish;
    1418             :                         }
    1419             :                 }
    1420             : 
    1421             :         } else {
    1422             : 
    1423             :                 /* Let's mount the main root directory to the root directory to use */
    1424           0 :                 if (mount("/", root, NULL, MS_BIND|MS_REC, NULL) < 0) {
    1425           0 :                         r = log_debug_errno(errno, "Failed to bind mount '/' on '%s': %m", root);
    1426           0 :                         goto finish;
    1427             :                 }
    1428             :         }
    1429             : 
    1430             :         /* Try to set up the new root directory before mounting anything else there. */
    1431           0 :         if (root_image || root_directory)
    1432           0 :                 (void) base_filesystem_create(root, UID_INVALID, GID_INVALID);
    1433             : 
    1434           0 :         if (n_mounts > 0) {
    1435           0 :                 _cleanup_fclose_ FILE *proc_self_mountinfo = NULL;
    1436           0 :                 _cleanup_free_ char **blacklist = NULL;
    1437             :                 size_t j;
    1438             : 
    1439             :                 /* Open /proc/self/mountinfo now as it may become unavailable if we mount anything on top of /proc.
    1440             :                  * For example, this is the case with the option: 'InaccessiblePaths=/proc' */
    1441           0 :                 proc_self_mountinfo = fopen("/proc/self/mountinfo", "re");
    1442           0 :                 if (!proc_self_mountinfo) {
    1443           0 :                         r = log_debug_errno(errno, "Failed to open /proc/self/mountinfo: %m");
    1444           0 :                         if (error_path)
    1445           0 :                                 *error_path = strdup("/proc/self/mountinfo");
    1446           0 :                         goto finish;
    1447             :                 }
    1448             : 
    1449             :                 /* First round, establish all mounts we need */
    1450           0 :                 for (;;) {
    1451           0 :                         bool again = false;
    1452             : 
    1453           0 :                         for (m = mounts; m < mounts + n_mounts; ++m) {
    1454             : 
    1455           0 :                                 if (m->applied)
    1456           0 :                                         continue;
    1457             : 
    1458           0 :                                 r = follow_symlink(root, m);
    1459           0 :                                 if (r < 0) {
    1460           0 :                                         if (error_path && mount_entry_path(m))
    1461           0 :                                                 *error_path = strdup(mount_entry_path(m));
    1462           0 :                                         goto finish;
    1463             :                                 }
    1464           0 :                                 if (r == 0) {
    1465             :                                         /* We hit a symlinked mount point. The entry got rewritten and might point to a
    1466             :                                          * very different place now. Let's normalize the changed list, and start from
    1467             :                                          * the beginning. After all to mount the entry at the new location we might
    1468             :                                          * need some other mounts first */
    1469           0 :                                         again = true;
    1470           0 :                                         break;
    1471             :                                 }
    1472             : 
    1473           0 :                                 r = apply_mount(root, m);
    1474           0 :                                 if (r < 0) {
    1475           0 :                                         if (error_path && mount_entry_path(m))
    1476           0 :                                                 *error_path = strdup(mount_entry_path(m));
    1477           0 :                                         goto finish;
    1478             :                                 }
    1479             : 
    1480           0 :                                 m->applied = true;
    1481             :                         }
    1482             : 
    1483           0 :                         if (!again)
    1484           0 :                                 break;
    1485             : 
    1486           0 :                         normalize_mounts(root, mounts, &n_mounts);
    1487             :                 }
    1488             : 
    1489             :                 /* Create a NULL-terminated blacklist of all mount paths; make_read_only() passes it on to bind_remount_recursive_with_mountinfo(). The strings are borrowed from the mount entries, only the array itself is allocated here. */
    1490           0 :                 blacklist = new(char*, n_mounts+1);
    1491           0 :                 if (!blacklist) {
    1492           0 :                         r = -ENOMEM;
    1493           0 :                         goto finish;
    1494             :                 }
    1495           0 :                 for (j = 0; j < n_mounts; j++)
    1496           0 :                         blacklist[j] = (char*) mount_entry_path(mounts+j);
    1497           0 :                 blacklist[j] = NULL;
    1498             : 
    1499             :                 /* Second round, flip the ro bits if necessary. */
    1500           0 :                 for (m = mounts; m < mounts + n_mounts; ++m) {
    1501           0 :                         r = make_read_only(m, blacklist, proc_self_mountinfo);
    1502           0 :                         if (r < 0) {
    1503           0 :                                 if (error_path && mount_entry_path(m))
    1504           0 :                                         *error_path = strdup(mount_entry_path(m));
    1505           0 :                                 goto finish;
    1506             :                         }
    1507             :                 }
    1508             :         }
    1509             : 
    1510             :         /* MS_MOVE does not work on MS_SHARED so the remount MS_SHARED will be done later */
    1511           0 :         r = mount_move_root(root);
    1512           0 :         if (r < 0) {
    1513           0 :                 log_debug_errno(r, "Failed to mount root with MS_MOVE: %m");
    1514           0 :                 goto finish;
    1515             :         }
    1516             : 
    1517             :         /* Remount / as the desired mode. Note that this will not
    1518             :          * reestablish propagation from our side to the host, since
    1519             :          * what's disconnected is disconnected. */
    1520           0 :         if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0) {
    1521           0 :                 r = log_debug_errno(errno, "Failed to remount '/' with desired mount flags: %m");
    1522           0 :                 goto finish;
    1523             :         }
    1524             : 
    1525           0 :         r = 0;
    1526             :         /* Common exit path for success and failure: release the remaining mount entries and the array itself. */
    1527           0 : finish:
    1528           0 :         for (m = mounts; m < mounts + n_mounts; m++)
    1529           0 :                 mount_entry_done(m);
    1530             : 
    1531           0 :         free(mounts);
    1532             : 
    1533           0 :         return r;
    1534             : }
    1535             : /* Frees the source and destination strings of each of the n elements of b, then the array b itself. b may only be NULL if n is 0. */
    1536         559 : void bind_mount_free_many(BindMount *b, size_t n) {
    1537             :         size_t i;
    1538             : 
    1539         559 :         assert(b || n == 0);
    1540             : 
    1541         559 :         for (i = 0; i < n; i++) {
    1542           0 :                 free(b[i].source);
    1543           0 :                 free(b[i].destination);
    1544             :         }
    1545             : 
    1546         559 :         free(b);
    1547         559 : }
    1548             : /* Appends a copy of *item to the array *b: source and destination are strdup()ed, the flag fields are copied, the array is grown by one element and *n is incremented. Returns 0 on success and -ENOMEM if an allocation fails, in which case *b and *n have not been modified. */
    1549           0 : int bind_mount_add(BindMount **b, size_t *n, const BindMount *item) {
    1550           0 :         _cleanup_free_ char *s = NULL, *d = NULL;
    1551             :         BindMount *c;
    1552             : 
    1553           0 :         assert(b);
    1554           0 :         assert(n);
    1555           0 :         assert(item);
    1556             :         /* Duplicate both strings before touching the array, so nothing has changed yet if an allocation fails. */
    1557           0 :         s = strdup(item->source);
    1558           0 :         if (!s)
    1559           0 :                 return -ENOMEM;
    1560             : 
    1561           0 :         d = strdup(item->destination);
    1562           0 :         if (!d)
    1563           0 :                 return -ENOMEM;
    1564             : 
    1565           0 :         c = reallocarray(*b, *n + 1, sizeof(BindMount));
    1566           0 :         if (!c)
    1567           0 :                 return -ENOMEM;
    1568             : 
    1569           0 :         *b = c;
    1570             :         /* TAKE_PTR() hands the copies over to the new element (presumably resetting s and d so the cleanup handlers leave them alone - confirm against its definition). */
    1571           0 :         c[(*n) ++] = (BindMount) {
    1572           0 :                 .source = TAKE_PTR(s),
    1573           0 :                 .destination = TAKE_PTR(d),
    1574           0 :                 .read_only = item->read_only,
    1575           0 :                 .nosuid = item->nosuid,
    1576           0 :                 .recursive = item->recursive,
    1577           0 :                 .ignore_enoent = item->ignore_enoent,
    1578             :         };
    1579             : 
    1580           0 :         return 0;
    1581             : }
    1582             : 
    1583         559 : void temporary_filesystem_free_many(TemporaryFileSystem *t, size_t n) {
    1584             :         size_t i;
    1585             : 
    1586         559 :         assert(t || n == 0);
    1587             : 
    1588         559 :         for (i = 0; i < n; i++) {
    1589           0 :                 free(t[i].path);
    1590           0 :                 free(t[i].options);
    1591             :         }
    1592             : 
    1593         559 :         free(t);
    1594         559 : }
    1595             : 
    1596           0 : int temporary_filesystem_add(
    1597             :                 TemporaryFileSystem **t,
    1598             :                 size_t *n,
    1599             :                 const char *path,
    1600             :                 const char *options) {
    1601             : 
    1602           0 :         _cleanup_free_ char *p = NULL, *o = NULL;
    1603             :         TemporaryFileSystem *c;
    1604             : 
    1605           0 :         assert(t);
    1606           0 :         assert(n);
    1607           0 :         assert(path);
    1608             : 
    1609           0 :         p = strdup(path);
    1610           0 :         if (!p)
    1611           0 :                 return -ENOMEM;
    1612             : 
    1613           0 :         if (!isempty(options)) {
    1614           0 :                 o = strdup(options);
    1615           0 :                 if (!o)
    1616           0 :                         return -ENOMEM;
    1617             :         }
    1618             : 
    1619           0 :         c = reallocarray(*t, *n + 1, sizeof(TemporaryFileSystem));
    1620           0 :         if (!c)
    1621           0 :                 return -ENOMEM;
    1622             : 
    1623           0 :         *t = c;
    1624             : 
    1625           0 :         c[(*n) ++] = (TemporaryFileSystem) {
    1626           0 :                 .path = TAKE_PTR(p),
    1627           0 :                 .options = TAKE_PTR(o),
    1628             :         };
    1629             : 
    1630           0 :         return 0;
    1631             : }
    1632             : 
    1633           0 : static int setup_one_tmp_dir(const char *id, const char *prefix, char **path) {
    1634           0 :         _cleanup_free_ char *x = NULL;
    1635             :         char bid[SD_ID128_STRING_MAX];
    1636             :         sd_id128_t boot_id;
    1637             :         int r;
    1638             : 
    1639           0 :         assert(id);
    1640           0 :         assert(prefix);
    1641           0 :         assert(path);
    1642             : 
    1643             :         /* We include the boot id in the directory so that after a
    1644             :          * reboot we can easily identify obsolete directories. */
    1645             : 
    1646           0 :         r = sd_id128_get_boot(&boot_id);
    1647           0 :         if (r < 0)
    1648           0 :                 return r;
    1649             : 
    1650           0 :         x = strjoin(prefix, "/systemd-private-", sd_id128_to_string(boot_id, bid), "-", id, "-XXXXXX");
    1651           0 :         if (!x)
    1652           0 :                 return -ENOMEM;
    1653             : 
    1654           0 :         RUN_WITH_UMASK(0077)
    1655           0 :                 if (!mkdtemp(x))
    1656           0 :                         return -errno;
    1657             : 
    1658           0 :         RUN_WITH_UMASK(0000) {
    1659             :                 char *y;
    1660             : 
    1661           0 :                 y = strjoina(x, "/tmp");
    1662             : 
    1663           0 :                 if (mkdir(y, 0777 | S_ISVTX) < 0)
    1664           0 :                         return -errno;
    1665             :         }
    1666             : 
    1667           0 :         *path = TAKE_PTR(x);
    1668             : 
    1669           0 :         return 0;
    1670             : }
    1671             : 
    1672           0 : int setup_tmp_dirs(const char *id, char **tmp_dir, char **var_tmp_dir) {
    1673             :         char *a, *b;
    1674             :         int r;
    1675             : 
    1676           0 :         assert(id);
    1677           0 :         assert(tmp_dir);
    1678           0 :         assert(var_tmp_dir);
    1679             : 
    1680           0 :         r = setup_one_tmp_dir(id, "/tmp", &a);
    1681           0 :         if (r < 0)
    1682           0 :                 return r;
    1683             : 
    1684           0 :         r = setup_one_tmp_dir(id, "/var/tmp", &b);
    1685           0 :         if (r < 0) {
    1686             :                 char *t;
    1687             : 
    1688           0 :                 t = strjoina(a, "/tmp");
    1689           0 :                 (void) rmdir(t);
    1690           0 :                 (void) rmdir(a);
    1691             : 
    1692           0 :                 free(a);
    1693           0 :                 return r;
    1694             :         }
    1695             : 
    1696           0 :         *tmp_dir = a;
    1697           0 :         *var_tmp_dir = b;
    1698             : 
    1699           0 :         return 0;
    1700             : }
    1701             : 
    1702           0 : int setup_netns(const int netns_storage_socket[static 2]) {
    1703           0 :         _cleanup_close_ int netns = -1;
    1704             :         int r, q;
    1705             : 
    1706           0 :         assert(netns_storage_socket);
    1707           0 :         assert(netns_storage_socket[0] >= 0);
    1708           0 :         assert(netns_storage_socket[1] >= 0);
    1709             : 
    1710             :         /* We use the passed socketpair as a storage buffer for our
    1711             :          * namespace reference fd. Whatever process runs this first
    1712             :          * shall create a new namespace, all others should just join
    1713             :          * it. To serialize that we use a file lock on the socket
    1714             :          * pair.
    1715             :          *
    1716             :          * It's a bit crazy, but hey, works great! */
    1717             : 
    1718           0 :         if (lockf(netns_storage_socket[0], F_LOCK, 0) < 0)
    1719           0 :                 return -errno;
    1720             : 
    1721           0 :         netns = receive_one_fd(netns_storage_socket[0], MSG_DONTWAIT);
    1722           0 :         if (netns == -EAGAIN) {
    1723             :                 /* Nothing stored yet, so let's create a new namespace. */
    1724             : 
    1725           0 :                 if (unshare(CLONE_NEWNET) < 0) {
    1726           0 :                         r = -errno;
    1727           0 :                         goto fail;
    1728             :                 }
    1729             : 
    1730           0 :                 (void) loopback_setup();
    1731             : 
    1732           0 :                 netns = open("/proc/self/ns/net", O_RDONLY|O_CLOEXEC|O_NOCTTY);
    1733           0 :                 if (netns < 0) {
    1734           0 :                         r = -errno;
    1735           0 :                         goto fail;
    1736             :                 }
    1737             : 
    1738           0 :                 r = 1;
    1739             : 
    1740           0 :         } else if (netns < 0) {
    1741           0 :                 r = netns;
    1742           0 :                 goto fail;
    1743             : 
    1744             :         } else {
    1745             :                 /* Yay, found something, so let's join the namespace */
    1746           0 :                 if (setns(netns, CLONE_NEWNET) < 0) {
    1747           0 :                         r = -errno;
    1748           0 :                         goto fail;
    1749             :                 }
    1750             : 
    1751           0 :                 r = 0;
    1752             :         }
    1753             : 
    1754           0 :         q = send_one_fd(netns_storage_socket[1], netns, MSG_DONTWAIT);
    1755           0 :         if (q < 0) {
    1756           0 :                 r = q;
    1757           0 :                 goto fail;
    1758             :         }
    1759             : 
    1760           0 : fail:
    1761           0 :         (void) lockf(netns_storage_socket[0], F_ULOCK, 0);
    1762           0 :         return r;
    1763             : }
    1764             : 
    1765           0 : int open_netns_path(const int netns_storage_socket[static 2], const char *path) {
    1766           0 :         _cleanup_close_ int netns = -1;
    1767             :         int q, r;
    1768             : 
    1769           0 :         assert(netns_storage_socket);
    1770           0 :         assert(netns_storage_socket[0] >= 0);
    1771           0 :         assert(netns_storage_socket[1] >= 0);
    1772           0 :         assert(path);
    1773             : 
    1774             :         /* If the storage socket doesn't contain a netns fd yet, open one via the file system and store it in
    1775             :          * it. This is supposed to be called ahead of time, i.e. before setup_netns() which will allocate a
    1776             :          * new anonymous netns if needed. */
    1777             : 
    1778           0 :         if (lockf(netns_storage_socket[0], F_LOCK, 0) < 0)
    1779           0 :                 return -errno;
    1780             : 
    1781           0 :         netns = receive_one_fd(netns_storage_socket[0], MSG_DONTWAIT);
    1782           0 :         if (netns == -EAGAIN) {
    1783             :                 /* Nothing stored yet. Open the file from the file system. */
    1784             : 
    1785           0 :                 netns = open(path, O_RDONLY|O_NOCTTY|O_CLOEXEC);
    1786           0 :                 if (netns < 0) {
    1787           0 :                         r = -errno;
    1788           0 :                         goto fail;
    1789             :                 }
    1790             : 
    1791           0 :                 r = fd_is_network_ns(netns);
    1792           0 :                 if (r == 0) { /* Not a netns? Refuse early. */
    1793           0 :                         r = -EINVAL;
    1794           0 :                         goto fail;
    1795             :                 }
    1796           0 :                 if (r < 0 && r != -EUCLEAN) /* EUCLEAN: we don't know */
    1797           0 :                         goto fail;
    1798             : 
    1799           0 :                 r = 1;
    1800             : 
    1801           0 :         } else if (netns < 0) {
    1802           0 :                 r = netns;
    1803           0 :                 goto fail;
    1804             :         } else
    1805           0 :                 r = 0; /* Already allocated */
    1806             : 
    1807           0 :         q = send_one_fd(netns_storage_socket[1], netns, MSG_DONTWAIT);
    1808           0 :         if (q < 0) {
    1809           0 :                 r = q;
    1810           0 :                 goto fail;
    1811             :         }
    1812             : 
    1813           0 : fail:
    1814           0 :         (void) lockf(netns_storage_socket[0], F_ULOCK, 0);
    1815           0 :         return r;
    1816             : }
    1817             : 
    1818           0 : bool ns_type_supported(NamespaceType type) {
    1819             :         const char *t, *ns_proc;
    1820             : 
    1821           0 :         t = namespace_type_to_string(type);
    1822           0 :         if (!t) /* Don't know how to translate this? Then it's not supported */
    1823           0 :                 return false;
    1824             : 
    1825           0 :         ns_proc = strjoina("/proc/self/ns/", t);
    1826           0 :         return access(ns_proc, F_OK) == 0;
    1827             : }
    1828             : 
                       /* String names of the ProtectHome values, indexed by the enum value. */
    1829             : static const char *const protect_home_table[_PROTECT_HOME_MAX] = {
    1830             :         [PROTECT_HOME_NO] = "no",
    1831             :         [PROTECT_HOME_YES] = "yes",
    1832             :         [PROTECT_HOME_READ_ONLY] = "read-only",
    1833             :         [PROTECT_HOME_TMPFS] = "tmpfs",
    1834             : };
    1835             : 
                       /* Instantiates the string lookup helpers for ProtectHome from the table above.
                        * NOTE(review): presumably the _WITH_BOOLEAN variant additionally accepts boolean
                        * strings and maps a true value to PROTECT_HOME_YES; the macro is defined outside
                        * of this file (string-table.h is included), confirm there. */
    1836         183 : DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(protect_home, ProtectHome, PROTECT_HOME_YES);
    1837             : 
                       /* String names of the ProtectSystem values, indexed by the enum value. */
    1838             : static const char *const protect_system_table[_PROTECT_SYSTEM_MAX] = {
    1839             :         [PROTECT_SYSTEM_NO] = "no",
    1840             :         [PROTECT_SYSTEM_YES] = "yes",
    1841             :         [PROTECT_SYSTEM_FULL] = "full",
    1842             :         [PROTECT_SYSTEM_STRICT] = "strict",
    1843             : };
    1844             : 
                       /* Instantiates the string lookup helpers for ProtectSystem from the table above.
                        * NOTE(review): presumably the _WITH_BOOLEAN variant additionally accepts boolean
                        * strings and maps a true value to PROTECT_SYSTEM_YES; the macro is defined outside
                        * of this file (string-table.h is included), confirm there. */
    1845         183 : DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(protect_system, ProtectSystem, PROTECT_SYSTEM_YES);
    1846             : 
                       /* String names of the NamespaceType values, indexed by the enum value. The array has
                        * no explicit size, so its length follows from the highest index initialized here.
                        * ns_type_supported() appends the name it gets from namespace_type_to_string() to
                        * "/proc/self/ns/". NOTE(review): that function is presumably generated from this
                        * table by the macro below, in which case these strings have to match the entry
                        * names in that directory; confirm against string-table.h. */
    1847             : static const char* const namespace_type_table[] = {
    1848             :         [NAMESPACE_MOUNT] = "mnt",
    1849             :         [NAMESPACE_CGROUP] = "cgroup",
    1850             :         [NAMESPACE_UTS] = "uts",
    1851             :         [NAMESPACE_IPC] = "ipc",
    1852             :         [NAMESPACE_USER] = "user",
    1853             :         [NAMESPACE_PID] = "pid",
    1854             :         [NAMESPACE_NET] = "net",
    1855             : };
    1856             : 
                       /* Instantiates the string lookup helpers for NamespaceType from the table above. */
    1857          18 : DEFINE_STRING_TABLE_LOOKUP(namespace_type, NamespaceType);

Generated by: LCOV version 1.14