Branch data Line data Source code
1 : : /* SPDX-License-Identifier: LGPL-2.1+ */
2 : :
3 : : #include <errno.h>
4 : : #include <sched.h>
5 : : #include <stdio.h>
6 : : #include <string.h>
7 : : #include <sys/mount.h>
8 : : #include <sys/stat.h>
9 : : #include <unistd.h>
10 : : #include <linux/fs.h>
11 : :
12 : : #include "alloc-util.h"
13 : : #include "base-filesystem.h"
14 : : #include "dev-setup.h"
15 : : #include "fd-util.h"
16 : : #include "fs-util.h"
17 : : #include "label.h"
18 : : #include "loop-util.h"
19 : : #include "loopback-setup.h"
20 : : #include "missing.h"
21 : : #include "mkdir.h"
22 : : #include "mount-util.h"
23 : : #include "mountpoint-util.h"
24 : : #include "namespace-util.h"
25 : : #include "namespace.h"
26 : : #include "nulstr-util.h"
27 : : #include "path-util.h"
28 : : #include "selinux-util.h"
29 : : #include "socket-util.h"
30 : : #include "sort-util.h"
31 : : #include "stat-util.h"
32 : : #include "string-table.h"
33 : : #include "string-util.h"
34 : : #include "strv.h"
35 : : #include "umask-util.h"
36 : : #include "user-util.h"
37 : :
/* Mount flags used for the tmpfs instance that backs a private /dev (see mount_private_dev()). */
#define DEV_MOUNT_OPTIONS (MS_NOSUID|MS_NOEXEC|MS_STRICTATIME)
39 : :
/* The kind of mount a MountEntry describes.
 *
 * The numeric order doubles as a priority: mount_path_compare() sorts entries for the same path by this value,
 * and drop_duplicates() keeps the first (i.e. lowest-valued) one. */
typedef enum MountMode {
        INACCESSIBLE = 0,
        BIND_MOUNT,
        BIND_MOUNT_RECURSIVE,
        PRIVATE_TMP,
        PRIVATE_DEV,
        BIND_DEV,
        EMPTY_DIR,
        SYSFS,
        PROCFS,
        READONLY,
        READWRITE,
        TMPFS,
        READWRITE_IMPLICIT,     /* Must sort last, i.e. have the lowest priority. */
        _MOUNT_MODE_MAX,
} MountMode;
57 : :
58 : : typedef struct MountEntry {
59 : : const char *path_const; /* Memory allocated on stack or static */
60 : : MountMode mode:5;
61 : : bool ignore:1; /* Ignore if path does not exist? */
62 : : bool has_prefix:1; /* Already is prefixed by the root dir? */
63 : : bool read_only:1; /* Shall this mount point be read-only? */
64 : : bool nosuid:1; /* Shall set MS_NOSUID on the mount itself */
65 : : bool applied:1; /* Already applied */
66 : : char *path_malloc; /* Use this instead of 'path_const' if we had to allocate memory */
67 : : const char *source_const; /* The source path, for bind mounts */
68 : : char *source_malloc;
69 : : const char *options_const;/* Mount options for tmpfs */
70 : : char *options_malloc;
71 : : unsigned long flags; /* Mount flags used by EMPTY_DIR and TMPFS. Do not include MS_RDONLY here, but please use read_only. */
72 : : unsigned n_followed;
73 : : } MountEntry;
74 : :
75 : : /* If MountAPIVFS= is used, let's mount /sys and /proc into the it, but only as a fallback if the user hasn't mounted
76 : : * something there already. These mounts are hence overridden by any other explicitly configured mounts. */
77 : : static const MountEntry apivfs_table[] = {
78 : : { "/proc", PROCFS, false },
79 : : { "/dev", BIND_DEV, false },
80 : : { "/sys", SYSFS, false },
81 : : };
82 : :
83 : : /* ProtectKernelTunables= option and the related filesystem APIs */
84 : : static const MountEntry protect_kernel_tunables_table[] = {
85 : : { "/proc/acpi", READONLY, true },
86 : : { "/proc/apm", READONLY, true }, /* Obsolete API, there's no point in permitting access to this, ever */
87 : : { "/proc/asound", READONLY, true },
88 : : { "/proc/bus", READONLY, true },
89 : : { "/proc/fs", READONLY, true },
90 : : { "/proc/irq", READONLY, true },
91 : : { "/proc/kallsyms", INACCESSIBLE, true },
92 : : { "/proc/kcore", INACCESSIBLE, true },
93 : : { "/proc/latency_stats", READONLY, true },
94 : : { "/proc/mtrr", READONLY, true },
95 : : { "/proc/scsi", READONLY, true },
96 : : { "/proc/sys", READONLY, false },
97 : : { "/proc/sysrq-trigger", READONLY, true },
98 : : { "/proc/timer_stats", READONLY, true },
99 : : { "/sys", READONLY, false },
100 : : { "/sys/fs/bpf", READONLY, true },
101 : : { "/sys/fs/cgroup", READWRITE_IMPLICIT, false }, /* READONLY is set by ProtectControlGroups= option */
102 : : { "/sys/fs/selinux", READWRITE_IMPLICIT, true },
103 : : { "/sys/kernel/debug", READONLY, true },
104 : : { "/sys/kernel/tracing", READONLY, true },
105 : : };
106 : :
107 : : /* ProtectKernelModules= option */
108 : : static const MountEntry protect_kernel_modules_table[] = {
109 : : #if HAVE_SPLIT_USR
110 : : { "/lib/modules", INACCESSIBLE, true },
111 : : #endif
112 : : { "/usr/lib/modules", INACCESSIBLE, true },
113 : : };
114 : :
115 : : /*
116 : : * ProtectHome=read-only table, protect $HOME and $XDG_RUNTIME_DIR and rest of
117 : : * system should be protected by ProtectSystem=
118 : : */
119 : : static const MountEntry protect_home_read_only_table[] = {
120 : : { "/home", READONLY, true },
121 : : { "/run/user", READONLY, true },
122 : : { "/root", READONLY, true },
123 : : };
124 : :
125 : : /* ProtectHome=tmpfs table */
126 : : static const MountEntry protect_home_tmpfs_table[] = {
127 : : { "/home", TMPFS, true, .read_only = true, .options_const = "mode=0755", .flags = MS_NODEV|MS_STRICTATIME },
128 : : { "/run/user", TMPFS, true, .read_only = true, .options_const = "mode=0755", .flags = MS_NODEV|MS_STRICTATIME },
129 : : { "/root", TMPFS, true, .read_only = true, .options_const = "mode=0700", .flags = MS_NODEV|MS_STRICTATIME },
130 : : };
131 : :
132 : : /* ProtectHome=yes table */
133 : : static const MountEntry protect_home_yes_table[] = {
134 : : { "/home", INACCESSIBLE, true },
135 : : { "/run/user", INACCESSIBLE, true },
136 : : { "/root", INACCESSIBLE, true },
137 : : };
138 : :
139 : : /* ProtectSystem=yes table */
140 : : static const MountEntry protect_system_yes_table[] = {
141 : : { "/usr", READONLY, false },
142 : : { "/boot", READONLY, true },
143 : : { "/efi", READONLY, true },
144 : : #if HAVE_SPLIT_USR
145 : : { "/lib", READONLY, true },
146 : : { "/lib64", READONLY, true },
147 : : { "/bin", READONLY, true },
148 : : # if HAVE_SPLIT_BIN
149 : : { "/sbin", READONLY, true },
150 : : # endif
151 : : #endif
152 : : };
153 : :
154 : : /* ProtectSystem=full includes ProtectSystem=yes */
155 : : static const MountEntry protect_system_full_table[] = {
156 : : { "/usr", READONLY, false },
157 : : { "/boot", READONLY, true },
158 : : { "/efi", READONLY, true },
159 : : { "/etc", READONLY, false },
160 : : #if HAVE_SPLIT_USR
161 : : { "/lib", READONLY, true },
162 : : { "/lib64", READONLY, true },
163 : : { "/bin", READONLY, true },
164 : : # if HAVE_SPLIT_BIN
165 : : { "/sbin", READONLY, true },
166 : : # endif
167 : : #endif
168 : : };
169 : :
170 : : /*
171 : : * ProtectSystem=strict table. In this strict mode, we mount everything
172 : : * read-only, except for /proc, /dev, /sys which are the kernel API VFS,
173 : : * which are left writable, but PrivateDevices= + ProtectKernelTunables=
174 : : * protect those, and these options should be fully orthogonal.
175 : : * (And of course /home and friends are also left writable, as ProtectHome=
176 : : * shall manage those, orthogonally).
177 : : */
178 : : static const MountEntry protect_system_strict_table[] = {
179 : : { "/", READONLY, false },
180 : : { "/proc", READWRITE_IMPLICIT, false }, /* ProtectKernelTunables= */
181 : : { "/sys", READWRITE_IMPLICIT, false }, /* ProtectKernelTunables= */
182 : : { "/dev", READWRITE_IMPLICIT, false }, /* PrivateDevices= */
183 : : { "/home", READWRITE_IMPLICIT, true }, /* ProtectHome= */
184 : : { "/run/user", READWRITE_IMPLICIT, true }, /* ProtectHome= */
185 : : { "/root", READWRITE_IMPLICIT, true }, /* ProtectHome= */
186 : : };
187 : :
188 : : static const char * const mount_mode_table[_MOUNT_MODE_MAX] = {
189 : : [INACCESSIBLE] = "inaccessible",
190 : : [BIND_MOUNT] = "bind",
191 : : [BIND_MOUNT_RECURSIVE] = "rbind",
192 : : [PRIVATE_TMP] = "private-tmp",
193 : : [PRIVATE_DEV] = "private-dev",
194 : : [BIND_DEV] = "bind-dev",
195 : : [EMPTY_DIR] = "empty",
196 : : [SYSFS] = "sysfs",
197 : : [PROCFS] = "procfs",
198 : : [READONLY] = "read-only",
199 : : [READWRITE] = "read-write",
200 : : [TMPFS] = "tmpfs",
201 : : [READWRITE_IMPLICIT] = "rw-implicit",
202 : : };
203 : :
204 [ # # ]: 0 : DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(mount_mode, MountMode);
205 : :
206 : 0 : static const char *mount_entry_path(const MountEntry *p) {
207 [ # # ]: 0 : assert(p);
208 : :
209 : : /* Returns the path of this bind mount. If the malloc()-allocated ->path_buffer field is set we return that,
210 : : * otherwise the stack/static ->path field is returned. */
211 : :
212 [ # # ]: 0 : return p->path_malloc ?: p->path_const;
213 : : }
214 : :
215 : 0 : static bool mount_entry_read_only(const MountEntry *p) {
216 [ # # ]: 0 : assert(p);
217 : :
218 [ # # # # : 0 : return p->read_only || IN_SET(p->mode, READONLY, INACCESSIBLE);
# # ]
219 : : }
220 : :
221 : 0 : static const char *mount_entry_source(const MountEntry *p) {
222 [ # # ]: 0 : assert(p);
223 : :
224 [ # # ]: 0 : return p->source_malloc ?: p->source_const;
225 : : }
226 : :
227 : 0 : static const char *mount_entry_options(const MountEntry *p) {
228 [ # # ]: 0 : assert(p);
229 : :
230 [ # # ]: 0 : return p->options_malloc ?: p->options_const;
231 : : }
232 : :
233 : 0 : static void mount_entry_done(MountEntry *p) {
234 [ # # ]: 0 : assert(p);
235 : :
236 : 0 : p->path_malloc = mfree(p->path_malloc);
237 : 0 : p->source_malloc = mfree(p->source_malloc);
238 : 0 : p->options_malloc = mfree(p->options_malloc);
239 : 0 : }
240 : :
241 : 0 : static int append_access_mounts(MountEntry **p, char **strv, MountMode mode, bool forcibly_require_prefix) {
242 : : char **i;
243 : :
244 [ # # ]: 0 : assert(p);
245 : :
246 : : /* Adds a list of user-supplied READWRITE/READWRITE_IMPLICIT/READONLY/INACCESSIBLE entries */
247 : :
248 [ # # # # ]: 0 : STRV_FOREACH(i, strv) {
249 : 0 : bool ignore = false, needs_prefix = false;
250 : 0 : const char *e = *i;
251 : :
252 : : /* Look for any prefixes */
253 [ # # ]: 0 : if (startswith(e, "-")) {
254 : 0 : e++;
255 : 0 : ignore = true;
256 : : }
257 [ # # ]: 0 : if (startswith(e, "+")) {
258 : 0 : e++;
259 : 0 : needs_prefix = true;
260 : : }
261 : :
262 [ # # ]: 0 : if (!path_is_absolute(e))
263 [ # # ]: 0 : return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
264 : : "Path is not absolute: %s", e);
265 : :
266 : 0 : *((*p)++) = (MountEntry) {
267 : : .path_const = e,
268 : : .mode = mode,
269 : : .ignore = ignore,
270 [ # # # # ]: 0 : .has_prefix = !needs_prefix && !forcibly_require_prefix,
271 : : };
272 : : }
273 : :
274 : 0 : return 0;
275 : : }
276 : :
277 : 0 : static int append_empty_dir_mounts(MountEntry **p, char **strv) {
278 : : char **i;
279 : :
280 [ # # ]: 0 : assert(p);
281 : :
282 : : /* Adds tmpfs mounts to provide readable but empty directories. This is primarily used to implement the
283 : : * "/private/" boundary directories for DynamicUser=1. */
284 : :
285 [ # # # # ]: 0 : STRV_FOREACH(i, strv) {
286 : :
287 : 0 : *((*p)++) = (MountEntry) {
288 : 0 : .path_const = *i,
289 : : .mode = EMPTY_DIR,
290 : : .ignore = false,
291 : : .read_only = true,
292 : : .options_const = "mode=755",
293 : : .flags = MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME,
294 : : };
295 : : }
296 : :
297 : 0 : return 0;
298 : : }
299 : :
300 : 0 : static int append_bind_mounts(MountEntry **p, const BindMount *binds, size_t n) {
301 : : size_t i;
302 : :
303 [ # # ]: 0 : assert(p);
304 : :
305 [ # # ]: 0 : for (i = 0; i < n; i++) {
306 : 0 : const BindMount *b = binds + i;
307 : :
308 : 0 : *((*p)++) = (MountEntry) {
309 : 0 : .path_const = b->destination,
310 [ # # ]: 0 : .mode = b->recursive ? BIND_MOUNT_RECURSIVE : BIND_MOUNT,
311 : 0 : .read_only = b->read_only,
312 : 0 : .nosuid = b->nosuid,
313 : 0 : .source_const = b->source,
314 : 0 : .ignore = b->ignore_enoent,
315 : : };
316 : : }
317 : :
318 : 0 : return 0;
319 : : }
320 : :
321 : 0 : static int append_tmpfs_mounts(MountEntry **p, const TemporaryFileSystem *tmpfs, size_t n) {
322 : : size_t i;
323 : : int r;
324 : :
325 [ # # ]: 0 : assert(p);
326 : :
327 [ # # ]: 0 : for (i = 0; i < n; i++) {
328 : 0 : const TemporaryFileSystem *t = tmpfs + i;
329 [ # # # # ]: 0 : _cleanup_free_ char *o = NULL, *str = NULL;
330 : : unsigned long flags;
331 : 0 : bool ro = false;
332 : :
333 [ # # ]: 0 : if (!path_is_absolute(t->path))
334 [ # # ]: 0 : return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
335 : : "Path is not absolute: %s",
336 : : t->path);
337 : :
338 : 0 : str = strjoin("mode=0755,", t->options);
339 [ # # ]: 0 : if (!str)
340 : 0 : return -ENOMEM;
341 : :
342 : 0 : r = mount_option_mangle(str, MS_NODEV|MS_STRICTATIME, &flags, &o);
343 [ # # ]: 0 : if (r < 0)
344 [ # # ]: 0 : return log_debug_errno(r, "Failed to parse mount option '%s': %m", str);
345 : :
346 : 0 : ro = flags & MS_RDONLY;
347 [ # # ]: 0 : if (ro)
348 : 0 : flags ^= MS_RDONLY;
349 : :
350 : 0 : *((*p)++) = (MountEntry) {
351 : 0 : .path_const = t->path,
352 : : .mode = TMPFS,
353 : : .read_only = ro,
354 : 0 : .options_malloc = TAKE_PTR(o),
355 : : .flags = flags,
356 : : };
357 : : }
358 : :
359 : 0 : return 0;
360 : : }
361 : :
362 : 0 : static int append_static_mounts(MountEntry **p, const MountEntry *mounts, size_t n, bool ignore_protect) {
363 : : size_t i;
364 : :
365 [ # # ]: 0 : assert(p);
366 [ # # ]: 0 : assert(mounts);
367 : :
368 : : /* Adds a list of static pre-defined entries */
369 : :
370 [ # # ]: 0 : for (i = 0; i < n; i++)
371 : 0 : *((*p)++) = (MountEntry) {
372 : 0 : .path_const = mount_entry_path(mounts+i),
373 : 0 : .mode = mounts[i].mode,
374 [ # # # # ]: 0 : .ignore = mounts[i].ignore || ignore_protect,
375 : : };
376 : :
377 : 0 : return 0;
378 : : }
379 : :
380 : 0 : static int append_protect_home(MountEntry **p, ProtectHome protect_home, bool ignore_protect) {
381 [ # # ]: 0 : assert(p);
382 : :
383 [ # # # # : 0 : switch (protect_home) {
# ]
384 : :
385 : 0 : case PROTECT_HOME_NO:
386 : 0 : return 0;
387 : :
388 : 0 : case PROTECT_HOME_READ_ONLY:
389 : 0 : return append_static_mounts(p, protect_home_read_only_table, ELEMENTSOF(protect_home_read_only_table), ignore_protect);
390 : :
391 : 0 : case PROTECT_HOME_TMPFS:
392 : 0 : return append_static_mounts(p, protect_home_tmpfs_table, ELEMENTSOF(protect_home_tmpfs_table), ignore_protect);
393 : :
394 : 0 : case PROTECT_HOME_YES:
395 : 0 : return append_static_mounts(p, protect_home_yes_table, ELEMENTSOF(protect_home_yes_table), ignore_protect);
396 : :
397 : 0 : default:
398 : 0 : assert_not_reached("Unexpected ProtectHome= value");
399 : : }
400 : : }
401 : :
402 : 0 : static int append_protect_system(MountEntry **p, ProtectSystem protect_system, bool ignore_protect) {
403 [ # # ]: 0 : assert(p);
404 : :
405 [ # # # # : 0 : switch (protect_system) {
# ]
406 : :
407 : 0 : case PROTECT_SYSTEM_NO:
408 : 0 : return 0;
409 : :
410 : 0 : case PROTECT_SYSTEM_STRICT:
411 : 0 : return append_static_mounts(p, protect_system_strict_table, ELEMENTSOF(protect_system_strict_table), ignore_protect);
412 : :
413 : 0 : case PROTECT_SYSTEM_YES:
414 : 0 : return append_static_mounts(p, protect_system_yes_table, ELEMENTSOF(protect_system_yes_table), ignore_protect);
415 : :
416 : 0 : case PROTECT_SYSTEM_FULL:
417 : 0 : return append_static_mounts(p, protect_system_full_table, ELEMENTSOF(protect_system_full_table), ignore_protect);
418 : :
419 : 0 : default:
420 : 0 : assert_not_reached("Unexpected ProtectSystem= value");
421 : : }
422 : : }
423 : :
424 : 0 : static int mount_path_compare(const MountEntry *a, const MountEntry *b) {
425 : : int d;
426 : :
427 : : /* If the paths are not equal, then order prefixes first */
428 : 0 : d = path_compare(mount_entry_path(a), mount_entry_path(b));
429 [ # # ]: 0 : if (d != 0)
430 : 0 : return d;
431 : :
432 : : /* If the paths are equal, check the mode */
433 [ # # ]: 0 : return CMP((int) a->mode, (int) b->mode);
434 : : }
435 : :
436 : 0 : static int prefix_where_needed(MountEntry *m, size_t n, const char *root_directory) {
437 : : size_t i;
438 : :
439 : : /* Prefixes all paths in the bind mount table with the root directory if the entry needs that. */
440 : :
441 [ # # ]: 0 : for (i = 0; i < n; i++) {
442 : : char *s;
443 : :
444 [ # # ]: 0 : if (m[i].has_prefix)
445 : 0 : continue;
446 : :
447 : 0 : s = path_join(root_directory, mount_entry_path(m+i));
448 [ # # ]: 0 : if (!s)
449 : 0 : return -ENOMEM;
450 : :
451 : 0 : free_and_replace(m[i].path_malloc, s);
452 : 0 : m[i].has_prefix = true;
453 : : }
454 : :
455 : 0 : return 0;
456 : : }
457 : :
458 : 0 : static void drop_duplicates(MountEntry *m, size_t *n) {
459 : : MountEntry *f, *t, *previous;
460 : :
461 [ # # ]: 0 : assert(m);
462 [ # # ]: 0 : assert(n);
463 : :
464 : : /* Drops duplicate entries. Expects that the array is properly ordered already. */
465 : :
466 [ # # ]: 0 : for (f = m, t = m, previous = NULL; f < m + *n; f++) {
467 : :
468 : : /* The first one wins (which is the one with the more restrictive mode), see mount_path_compare()
469 : : * above. Note that we only drop duplicates that haven't been mounted yet. */
470 [ # # ]: 0 : if (previous &&
471 [ # # ]: 0 : path_equal(mount_entry_path(f), mount_entry_path(previous)) &&
472 [ # # # # ]: 0 : !f->applied && !previous->applied) {
473 [ # # ]: 0 : log_debug("%s (%s) is duplicate.", mount_entry_path(f), mount_mode_to_string(f->mode));
474 [ # # # # ]: 0 : previous->read_only = previous->read_only || mount_entry_read_only(f); /* Propagate the read-only flag to the remaining entry */
475 : 0 : mount_entry_done(f);
476 : 0 : continue;
477 : : }
478 : :
479 : 0 : *t = *f;
480 : 0 : previous = t;
481 : 0 : t++;
482 : : }
483 : :
484 : 0 : *n = t - m;
485 : 0 : }
486 : :
487 : 0 : static void drop_inaccessible(MountEntry *m, size_t *n) {
488 : : MountEntry *f, *t;
489 : 0 : const char *clear = NULL;
490 : :
491 [ # # ]: 0 : assert(m);
492 [ # # ]: 0 : assert(n);
493 : :
494 : : /* Drops all entries obstructed by another entry further up the tree. Expects that the array is properly
495 : : * ordered already. */
496 : :
497 [ # # ]: 0 : for (f = m, t = m; f < m + *n; f++) {
498 : :
499 : : /* If we found a path set for INACCESSIBLE earlier, and this entry has it as prefix we should drop
500 : : * it, as inaccessible paths really should drop the entire subtree. */
501 [ # # # # ]: 0 : if (clear && path_startswith(mount_entry_path(f), clear)) {
502 [ # # ]: 0 : log_debug("%s is masked by %s.", mount_entry_path(f), clear);
503 : 0 : mount_entry_done(f);
504 : 0 : continue;
505 : : }
506 : :
507 [ # # ]: 0 : clear = f->mode == INACCESSIBLE ? mount_entry_path(f) : NULL;
508 : :
509 : 0 : *t = *f;
510 : 0 : t++;
511 : : }
512 : :
513 : 0 : *n = t - m;
514 : 0 : }
515 : :
516 : 0 : static void drop_nop(MountEntry *m, size_t *n) {
517 : : MountEntry *f, *t;
518 : :
519 [ # # ]: 0 : assert(m);
520 [ # # ]: 0 : assert(n);
521 : :
522 : : /* Drops all entries which have an immediate parent that has the same type, as they are redundant. Assumes the
523 : : * list is ordered by prefixes. */
524 : :
525 [ # # ]: 0 : for (f = m, t = m; f < m + *n; f++) {
526 : :
527 : : /* Only suppress such subtrees for READONLY, READWRITE and READWRITE_IMPLICIT entries */
528 [ # # # # ]: 0 : if (IN_SET(f->mode, READONLY, READWRITE, READWRITE_IMPLICIT)) {
529 : : MountEntry *p;
530 : 0 : bool found = false;
531 : :
532 : : /* Now let's find the first parent of the entry we are looking at. */
533 [ # # ]: 0 : for (p = t-1; p >= m; p--) {
534 [ # # ]: 0 : if (path_startswith(mount_entry_path(f), mount_entry_path(p))) {
535 : 0 : found = true;
536 : 0 : break;
537 : : }
538 : : }
539 : :
540 : : /* We found it, let's see if it's the same mode, if so, we can drop this entry */
541 [ # # # # ]: 0 : if (found && p->mode == f->mode) {
542 [ # # ]: 0 : log_debug("%s (%s) is made redundant by %s (%s)",
543 : : mount_entry_path(f), mount_mode_to_string(f->mode),
544 : : mount_entry_path(p), mount_mode_to_string(p->mode));
545 : 0 : mount_entry_done(f);
546 : 0 : continue;
547 : : }
548 : : }
549 : :
550 : 0 : *t = *f;
551 : 0 : t++;
552 : : }
553 : :
554 : 0 : *n = t - m;
555 : 0 : }
556 : :
557 : 0 : static void drop_outside_root(const char *root_directory, MountEntry *m, size_t *n) {
558 : : MountEntry *f, *t;
559 : :
560 [ # # ]: 0 : assert(m);
561 [ # # ]: 0 : assert(n);
562 : :
563 : : /* Nothing to do */
564 [ # # ]: 0 : if (!root_directory)
565 : 0 : return;
566 : :
567 : : /* Drops all mounts that are outside of the root directory. */
568 : :
569 [ # # ]: 0 : for (f = m, t = m; f < m + *n; f++) {
570 : :
571 [ # # ]: 0 : if (!path_startswith(mount_entry_path(f), root_directory)) {
572 [ # # ]: 0 : log_debug("%s is outside of root directory.", mount_entry_path(f));
573 : 0 : mount_entry_done(f);
574 : 0 : continue;
575 : : }
576 : :
577 : 0 : *t = *f;
578 : 0 : t++;
579 : : }
580 : :
581 : 0 : *n = t - m;
582 : : }
583 : :
584 : 0 : static int clone_device_node(
585 : : const char *d,
586 : : const char *temporary_mount,
587 : : bool *make_devnode) {
588 : :
589 : 0 : _cleanup_free_ char *sl = NULL;
590 : : const char *dn, *bn, *t;
591 : : struct stat st;
592 : : int r;
593 : :
594 [ # # ]: 0 : if (stat(d, &st) < 0) {
595 [ # # ]: 0 : if (errno == ENOENT) {
596 [ # # ]: 0 : log_debug_errno(errno, "Device node '%s' to clone does not exist, ignoring.", d);
597 : 0 : return -ENXIO;
598 : : }
599 : :
600 [ # # ]: 0 : return log_debug_errno(errno, "Failed to stat() device node '%s' to clone, ignoring: %m", d);
601 : : }
602 : :
603 [ # # ]: 0 : if (!S_ISBLK(st.st_mode) &&
604 [ # # ]: 0 : !S_ISCHR(st.st_mode))
605 [ # # ]: 0 : return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
606 : : "Device node '%s' to clone is not a device node, ignoring.",
607 : : d);
608 : :
609 [ # # # # : 0 : dn = strjoina(temporary_mount, d);
# # # # #
# # # ]
610 : :
611 : : /* First, try to create device node properly */
612 [ # # ]: 0 : if (*make_devnode) {
613 : 0 : mac_selinux_create_file_prepare(d, st.st_mode);
614 : 0 : r = mknod(dn, st.st_mode, st.st_rdev);
615 : 0 : mac_selinux_create_file_clear();
616 [ # # ]: 0 : if (r >= 0)
617 : 0 : goto add_symlink;
618 [ # # ]: 0 : if (errno != EPERM)
619 [ # # ]: 0 : return log_debug_errno(errno, "mknod failed for %s: %m", d);
620 : :
621 : : /* This didn't work, let's not try this again for the next iterations. */
622 : 0 : *make_devnode = false;
623 : : }
624 : :
625 : : /* We're about to fallback to bind-mounting the device
626 : : * node. So create a dummy bind-mount target. */
627 : 0 : mac_selinux_create_file_prepare(d, 0);
628 : 0 : r = mknod(dn, S_IFREG, 0);
629 : 0 : mac_selinux_create_file_clear();
630 [ # # # # ]: 0 : if (r < 0 && errno != EEXIST)
631 [ # # ]: 0 : return log_debug_errno(errno, "mknod() fallback failed for '%s': %m", d);
632 : :
633 : : /* Fallback to bind-mounting:
634 : : * The assumption here is that all used device nodes carry standard
635 : : * properties. Specifically, the devices nodes we bind-mount should
636 : : * either be owned by root:root or root:tty (e.g. /dev/tty, /dev/ptmx)
637 : : * and should not carry ACLs. */
638 [ # # ]: 0 : if (mount(d, dn, NULL, MS_BIND, NULL) < 0)
639 [ # # ]: 0 : return log_debug_errno(errno, "Bind mounting failed for '%s': %m", d);
640 : :
641 : 0 : add_symlink:
642 : 0 : bn = path_startswith(d, "/dev/");
643 [ # # ]: 0 : if (!bn)
644 : 0 : return 0;
645 : :
646 : : /* Create symlinks like /dev/char/1:9 → ../urandom */
647 [ # # # # ]: 0 : if (asprintf(&sl, "%s/dev/%s/%u:%u", temporary_mount, S_ISCHR(st.st_mode) ? "char" : "block", major(st.st_rdev), minor(st.st_rdev)) < 0)
648 : 0 : return log_oom();
649 : :
650 : 0 : (void) mkdir_parents(sl, 0755);
651 : :
652 [ # # # # : 0 : t = strjoina("../", bn);
# # # # #
# # # ]
653 : :
654 [ # # ]: 0 : if (symlink(t, sl) < 0)
655 [ # # ]: 0 : log_debug_errno(errno, "Failed to symlink '%s' to '%s', ignoring: %m", t, sl);
656 : :
657 : 0 : return 0;
658 : : }
659 : :
660 : 0 : static int mount_private_dev(MountEntry *m) {
661 : : static const char devnodes[] =
662 : : "/dev/null\0"
663 : : "/dev/zero\0"
664 : : "/dev/full\0"
665 : : "/dev/random\0"
666 : : "/dev/urandom\0"
667 : : "/dev/tty\0";
668 : :
669 : 0 : char temporary_mount[] = "/tmp/namespace-dev-XXXXXX";
670 : 0 : const char *d, *dev = NULL, *devpts = NULL, *devshm = NULL, *devhugepages = NULL, *devmqueue = NULL, *devlog = NULL, *devptmx = NULL;
671 : 0 : bool can_mknod = true;
672 : 0 : _cleanup_umask_ mode_t u;
673 : : int r;
674 : :
675 [ # # ]: 0 : assert(m);
676 : :
677 : 0 : u = umask(0000);
678 : :
679 [ # # ]: 0 : if (!mkdtemp(temporary_mount))
680 [ # # ]: 0 : return log_debug_errno(errno, "Failed to create temporary directory '%s': %m", temporary_mount);
681 : :
682 [ # # # # : 0 : dev = strjoina(temporary_mount, "/dev");
# # # # #
# # # ]
683 : 0 : (void) mkdir(dev, 0755);
684 [ # # ]: 0 : if (mount("tmpfs", dev, "tmpfs", DEV_MOUNT_OPTIONS, "mode=755") < 0) {
685 [ # # ]: 0 : r = log_debug_errno(errno, "Failed to mount tmpfs on '%s': %m", dev);
686 : 0 : goto fail;
687 : : }
688 : :
689 [ # # # # : 0 : devpts = strjoina(temporary_mount, "/dev/pts");
# # # # #
# # # ]
690 : 0 : (void) mkdir(devpts, 0755);
691 [ # # ]: 0 : if (mount("/dev/pts", devpts, NULL, MS_BIND, NULL) < 0) {
692 [ # # ]: 0 : r = log_debug_errno(errno, "Failed to bind mount /dev/pts on '%s': %m", devpts);
693 : 0 : goto fail;
694 : : }
695 : :
696 : : /* /dev/ptmx can either be a device node or a symlink to /dev/pts/ptmx.
697 : : * When /dev/ptmx a device node, /dev/pts/ptmx has 000 permissions making it inaccessible.
698 : : * Thus, in that case make a clone.
699 : : * In nspawn and other containers it will be a symlink, in that case make it a symlink. */
700 : 0 : r = is_symlink("/dev/ptmx");
701 [ # # ]: 0 : if (r < 0) {
702 [ # # ]: 0 : log_debug_errno(r, "Failed to detect whether /dev/ptmx is a symlink or not: %m");
703 : 0 : goto fail;
704 [ # # ]: 0 : } else if (r > 0) {
705 [ # # # # : 0 : devptmx = strjoina(temporary_mount, "/dev/ptmx");
# # # # #
# # # ]
706 [ # # ]: 0 : if (symlink("pts/ptmx", devptmx) < 0) {
707 [ # # ]: 0 : r = log_debug_errno(errno, "Failed to create a symlink '%s' to pts/ptmx: %m", devptmx);
708 : 0 : goto fail;
709 : : }
710 : : } else {
711 : 0 : r = clone_device_node("/dev/ptmx", temporary_mount, &can_mknod);
712 [ # # ]: 0 : if (r < 0)
713 : 0 : goto fail;
714 : : }
715 : :
716 [ # # # # : 0 : devshm = strjoina(temporary_mount, "/dev/shm");
# # # # #
# # # ]
717 : 0 : (void) mkdir(devshm, 0755);
718 : 0 : r = mount("/dev/shm", devshm, NULL, MS_BIND, NULL);
719 [ # # ]: 0 : if (r < 0) {
720 [ # # ]: 0 : r = log_debug_errno(errno, "Failed to bind mount /dev/shm on '%s': %m", devshm);
721 : 0 : goto fail;
722 : : }
723 : :
724 [ # # # # : 0 : devmqueue = strjoina(temporary_mount, "/dev/mqueue");
# # # # #
# # # ]
725 : 0 : (void) mkdir(devmqueue, 0755);
726 [ # # ]: 0 : if (mount("/dev/mqueue", devmqueue, NULL, MS_BIND, NULL) < 0)
727 [ # # ]: 0 : log_debug_errno(errno, "Failed to bind mount /dev/mqueue on '%s', ignoring: %m", devmqueue);
728 : :
729 [ # # # # : 0 : devhugepages = strjoina(temporary_mount, "/dev/hugepages");
# # # # #
# # # ]
730 : 0 : (void) mkdir(devhugepages, 0755);
731 [ # # ]: 0 : if (mount("/dev/hugepages", devhugepages, NULL, MS_BIND, NULL) < 0)
732 [ # # ]: 0 : log_debug_errno(errno, "Failed to bind mount /dev/hugepages on '%s', ignoring: %m", devhugepages);
733 : :
734 [ # # # # : 0 : devlog = strjoina(temporary_mount, "/dev/log");
# # # # #
# # # ]
735 [ # # ]: 0 : if (symlink("/run/systemd/journal/dev-log", devlog) < 0)
736 [ # # ]: 0 : log_debug_errno(errno, "Failed to create a symlink '%s' to /run/systemd/journal/dev-log, ignoring: %m", devlog);
737 : :
738 [ # # # # ]: 0 : NULSTR_FOREACH(d, devnodes) {
739 : 0 : r = clone_device_node(d, temporary_mount, &can_mknod);
740 : : /* ENXIO means the the *source* is not a device file, skip creation in that case */
741 [ # # # # ]: 0 : if (r < 0 && r != -ENXIO)
742 : 0 : goto fail;
743 : : }
744 : :
745 : 0 : r = dev_setup(temporary_mount, UID_INVALID, GID_INVALID);
746 [ # # ]: 0 : if (r < 0)
747 [ # # ]: 0 : log_debug_errno(r, "Failed to setup basic device tree at '%s', ignoring: %m", temporary_mount);
748 : :
749 : : /* Create the /dev directory if missing. It is more likely to be
750 : : * missing when the service is started with RootDirectory. This is
751 : : * consistent with mount units creating the mount points when missing.
752 : : */
753 : 0 : (void) mkdir_p_label(mount_entry_path(m), 0755);
754 : :
755 : : /* Unmount everything in old /dev */
756 : 0 : r = umount_recursive(mount_entry_path(m), 0);
757 [ # # ]: 0 : if (r < 0)
758 [ # # ]: 0 : log_debug_errno(r, "Failed to unmount directories below '%s', ignoring: %m", mount_entry_path(m));
759 : :
760 [ # # ]: 0 : if (mount(dev, mount_entry_path(m), NULL, MS_MOVE, NULL) < 0) {
761 [ # # ]: 0 : r = log_debug_errno(errno, "Failed to move mount point '%s' to '%s': %m", dev, mount_entry_path(m));
762 : 0 : goto fail;
763 : : }
764 : :
765 : 0 : (void) rmdir(dev);
766 : 0 : (void) rmdir(temporary_mount);
767 : :
768 : 0 : return 0;
769 : :
770 : 0 : fail:
771 [ # # ]: 0 : if (devpts)
772 : 0 : (void) umount(devpts);
773 : :
774 [ # # ]: 0 : if (devshm)
775 : 0 : (void) umount(devshm);
776 : :
777 [ # # ]: 0 : if (devhugepages)
778 : 0 : (void) umount(devhugepages);
779 : :
780 [ # # ]: 0 : if (devmqueue)
781 : 0 : (void) umount(devmqueue);
782 : :
783 : 0 : (void) umount(dev);
784 : 0 : (void) rmdir(dev);
785 : 0 : (void) rmdir(temporary_mount);
786 : :
787 : 0 : return r;
788 : : }
789 : :
790 : 0 : static int mount_bind_dev(const MountEntry *m) {
791 : : int r;
792 : :
793 [ # # ]: 0 : assert(m);
794 : :
795 : : /* Implements the little brother of mount_private_dev(): simply bind mounts the host's /dev into the service's
796 : : * /dev. This is only used when RootDirectory= is set. */
797 : :
798 : 0 : (void) mkdir_p_label(mount_entry_path(m), 0755);
799 : :
800 : 0 : r = path_is_mount_point(mount_entry_path(m), NULL, 0);
801 [ # # ]: 0 : if (r < 0)
802 [ # # ]: 0 : return log_debug_errno(r, "Unable to determine whether /dev is already mounted: %m");
803 [ # # ]: 0 : if (r > 0) /* make this a NOP if /dev is already a mount point */
804 : 0 : return 0;
805 : :
806 [ # # ]: 0 : if (mount("/dev", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL) < 0)
807 [ # # ]: 0 : return log_debug_errno(errno, "Failed to bind mount %s: %m", mount_entry_path(m));
808 : :
809 : 0 : return 1;
810 : : }
811 : :
812 : 0 : static int mount_sysfs(const MountEntry *m) {
813 : : int r;
814 : :
815 [ # # ]: 0 : assert(m);
816 : :
817 : 0 : (void) mkdir_p_label(mount_entry_path(m), 0755);
818 : :
819 : 0 : r = path_is_mount_point(mount_entry_path(m), NULL, 0);
820 [ # # ]: 0 : if (r < 0)
821 [ # # ]: 0 : return log_debug_errno(r, "Unable to determine whether /sys is already mounted: %m");
822 [ # # ]: 0 : if (r > 0) /* make this a NOP if /sys is already a mount point */
823 : 0 : return 0;
824 : :
825 : : /* Bind mount the host's version so that we get all child mounts of it, too. */
826 [ # # ]: 0 : if (mount("/sys", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL) < 0)
827 [ # # ]: 0 : return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m));
828 : :
829 : 0 : return 1;
830 : : }
831 : :
832 : 0 : static int mount_procfs(const MountEntry *m) {
833 : : int r;
834 : :
835 [ # # ]: 0 : assert(m);
836 : :
837 : 0 : (void) mkdir_p_label(mount_entry_path(m), 0755);
838 : :
839 : 0 : r = path_is_mount_point(mount_entry_path(m), NULL, 0);
840 [ # # ]: 0 : if (r < 0)
841 [ # # ]: 0 : return log_debug_errno(r, "Unable to determine whether /proc is already mounted: %m");
842 [ # # ]: 0 : if (r > 0) /* make this a NOP if /proc is already a mount point */
843 : 0 : return 0;
844 : :
845 : : /* Mount a new instance, so that we get the one that matches our user namespace, if we are running in one */
846 [ # # ]: 0 : if (mount("proc", mount_entry_path(m), "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL) < 0)
847 [ # # ]: 0 : return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m));
848 : :
849 : 0 : return 1;
850 : : }
851 : :
852 : 0 : static int mount_tmpfs(const MountEntry *m) {
853 [ # # ]: 0 : assert(m);
854 : :
855 : : /* First, get rid of everything that is below if there is anything. Then, overmount with our new tmpfs */
856 : :
857 : 0 : (void) mkdir_p_label(mount_entry_path(m), 0755);
858 : 0 : (void) umount_recursive(mount_entry_path(m), 0);
859 : :
860 [ # # ]: 0 : if (mount("tmpfs", mount_entry_path(m), "tmpfs", m->flags, mount_entry_options(m)) < 0)
861 [ # # ]: 0 : return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m));
862 : :
863 : 0 : return 1;
864 : : }
865 : :
866 : 0 : static int follow_symlink(
867 : : const char *root_directory,
868 : : MountEntry *m) {
869 : :
870 : 0 : _cleanup_free_ char *target = NULL;
871 : : int r;
872 : :
873 : : /* Let's chase symlinks, but only one step at a time. That's because depending where the symlink points we
874 : : * might need to change the order in which we mount stuff. Hence: let's normalize piecemeal, and do one step at
875 : : * a time by specifying CHASE_STEP. This function returns 0 if we resolved one step, and > 0 if we reached the
876 : : * end and already have a fully normalized name. */
877 : :
878 : 0 : r = chase_symlinks(mount_entry_path(m), root_directory, CHASE_STEP|CHASE_NONEXISTENT, &target);
879 [ # # ]: 0 : if (r < 0)
880 [ # # ]: 0 : return log_debug_errno(r, "Failed to chase symlinks '%s': %m", mount_entry_path(m));
881 [ # # ]: 0 : if (r > 0) /* Reached the end, nothing more to resolve */
882 : 0 : return 1;
883 : :
884 [ # # ]: 0 : if (m->n_followed >= CHASE_SYMLINKS_MAX) /* put a boundary on things */
885 [ # # ]: 0 : return log_debug_errno(SYNTHETIC_ERRNO(ELOOP),
886 : : "Symlink loop on '%s'.",
887 : : mount_entry_path(m));
888 : :
889 [ # # ]: 0 : log_debug("Followed mount entry path symlink %s → %s.", mount_entry_path(m), target);
890 : :
891 : 0 : free_and_replace(m->path_malloc, target);
892 : 0 : m->has_prefix = true;
893 : :
894 : 0 : m->n_followed ++;
895 : :
896 : 0 : return 0;
897 : : }
898 : :
899 : 0 : static int apply_mount(
900 : : const char *root_directory,
901 : : MountEntry *m) {
902 : :
903 : 0 : bool rbind = true, make = false;
904 : : const char *what;
905 : : int r;
906 : :
907 [ # # ]: 0 : assert(m);
908 : :
909 [ # # ]: 0 : log_debug("Applying namespace mount on %s", mount_entry_path(m));
910 : :
911 [ # # # # : 0 : switch (m->mode) {
# # # # #
# # ]
912 : :
913 : 0 : case INACCESSIBLE: {
914 : : struct stat target;
915 : :
916 : : /* First, get rid of everything that is below if there
917 : : * is anything... Then, overmount it with an
918 : : * inaccessible path. */
919 : 0 : (void) umount_recursive(mount_entry_path(m), 0);
920 : :
921 [ # # ]: 0 : if (lstat(mount_entry_path(m), &target) < 0) {
922 [ # # # # ]: 0 : if (errno == ENOENT && m->ignore)
923 : 0 : return 0;
924 : :
925 [ # # ]: 0 : return log_debug_errno(errno, "Failed to lstat() %s to determine what to mount over it: %m", mount_entry_path(m));
926 : : }
927 : :
928 : 0 : what = mode_to_inaccessible_node(target.st_mode);
929 [ # # ]: 0 : if (!what)
930 [ # # ]: 0 : return log_debug_errno(SYNTHETIC_ERRNO(ELOOP),
931 : : "File type not supported for inaccessible mounts. Note that symlinks are not allowed");
932 : 0 : break;
933 : : }
934 : :
935 : 0 : case READONLY:
936 : : case READWRITE:
937 : : case READWRITE_IMPLICIT:
938 : 0 : r = path_is_mount_point(mount_entry_path(m), root_directory, 0);
939 [ # # # # ]: 0 : if (r == -ENOENT && m->ignore)
940 : 0 : return 0;
941 [ # # ]: 0 : if (r < 0)
942 [ # # ]: 0 : return log_debug_errno(r, "Failed to determine whether %s is already a mount point: %m", mount_entry_path(m));
943 [ # # ]: 0 : if (r > 0) /* Nothing to do here, it is already a mount. We just later toggle the MS_RDONLY bit for the mount point if needed. */
944 : 0 : return 0;
945 : : /* This isn't a mount point yet, let's make it one. */
946 : 0 : what = mount_entry_path(m);
947 : 0 : break;
948 : :
949 : 0 : case BIND_MOUNT:
950 : 0 : rbind = false;
951 : :
952 : : _fallthrough_;
953 : 0 : case BIND_MOUNT_RECURSIVE: {
954 [ # # ]: 0 : _cleanup_free_ char *chased = NULL;
955 : :
956 : : /* Since mount() will always follow symlinks we chase the symlinks on our own first. Note that bind
957 : : * mount source paths are always relative to the host root, hence we pass NULL as root directory to
958 : : * chase_symlinks() here. */
959 : :
960 : 0 : r = chase_symlinks(mount_entry_source(m), NULL, CHASE_TRAIL_SLASH, &chased);
961 [ # # # # ]: 0 : if (r == -ENOENT && m->ignore) {
962 [ # # ]: 0 : log_debug_errno(r, "Path %s does not exist, ignoring.", mount_entry_source(m));
963 : 0 : return 0;
964 : : }
965 [ # # ]: 0 : if (r < 0)
966 [ # # ]: 0 : return log_debug_errno(r, "Failed to follow symlinks on %s: %m", mount_entry_source(m));
967 : :
968 [ # # ]: 0 : log_debug("Followed source symlinks %s → %s.", mount_entry_source(m), chased);
969 : :
970 : 0 : free_and_replace(m->source_malloc, chased);
971 : :
972 : 0 : what = mount_entry_source(m);
973 : 0 : make = true;
974 : 0 : break;
975 : : }
976 : :
977 : 0 : case EMPTY_DIR:
978 : : case TMPFS:
979 : 0 : return mount_tmpfs(m);
980 : :
981 : 0 : case PRIVATE_TMP:
982 : 0 : what = mount_entry_source(m);
983 : 0 : make = true;
984 : 0 : break;
985 : :
986 : 0 : case PRIVATE_DEV:
987 : 0 : return mount_private_dev(m);
988 : :
989 : 0 : case BIND_DEV:
990 : 0 : return mount_bind_dev(m);
991 : :
992 : 0 : case SYSFS:
993 : 0 : return mount_sysfs(m);
994 : :
995 : 0 : case PROCFS:
996 : 0 : return mount_procfs(m);
997 : :
998 : 0 : default:
999 : 0 : assert_not_reached("Unknown mode");
1000 : : }
1001 : :
1002 [ # # ]: 0 : assert(what);
1003 : :
1004 [ # # # # ]: 0 : if (mount(what, mount_entry_path(m), NULL, MS_BIND|(rbind ? MS_REC : 0), NULL) < 0) {
1005 : 0 : bool try_again = false;
1006 : 0 : r = -errno;
1007 : :
1008 [ # # # # ]: 0 : if (r == -ENOENT && make) {
1009 : : struct stat st;
1010 : :
1011 : : /* Hmm, either the source or the destination are missing. Let's see if we can create the destination, then try again */
1012 : :
1013 [ # # ]: 0 : if (stat(what, &st) < 0)
1014 [ # # ]: 0 : log_debug_errno(errno, "Mount point source '%s' is not accessible: %m", what);
1015 : : else {
1016 : : int q;
1017 : :
1018 : 0 : (void) mkdir_parents(mount_entry_path(m), 0755);
1019 : :
1020 [ # # ]: 0 : if (S_ISDIR(st.st_mode))
1021 [ # # ]: 0 : q = mkdir(mount_entry_path(m), 0755) < 0 ? -errno : 0;
1022 : : else
1023 : 0 : q = touch(mount_entry_path(m));
1024 : :
1025 [ # # ]: 0 : if (q < 0)
1026 [ # # ]: 0 : log_debug_errno(q, "Failed to create destination mount point node '%s': %m", mount_entry_path(m));
1027 : : else
1028 : 0 : try_again = true;
1029 : : }
1030 : : }
1031 : :
1032 [ # # ]: 0 : if (try_again) {
1033 [ # # # # ]: 0 : if (mount(what, mount_entry_path(m), NULL, MS_BIND|(rbind ? MS_REC : 0), NULL) < 0)
1034 : 0 : r = -errno;
1035 : : else
1036 : 0 : r = 0;
1037 : : }
1038 : :
1039 [ # # ]: 0 : if (r < 0)
1040 [ # # ]: 0 : return log_debug_errno(r, "Failed to mount %s to %s: %m", what, mount_entry_path(m));
1041 : : }
1042 : :
1043 [ # # ]: 0 : log_debug("Successfully mounted %s to %s", what, mount_entry_path(m));
1044 : 0 : return 0;
1045 : : }
1046 : :
1047 : : /* Change per-mount flags on an existing mount */
1048 : 0 : static int bind_remount_one(const char *path, unsigned long orig_flags, unsigned long new_flags, unsigned long flags_mask) {
1049 [ # # ]: 0 : if (mount(NULL, path, NULL, (orig_flags & ~flags_mask) | MS_REMOUNT | MS_BIND | new_flags, NULL) < 0)
1050 : 0 : return -errno;
1051 : :
1052 : 0 : return 0;
1053 : : }
1054 : :
1055 : 0 : static int make_read_only(const MountEntry *m, char **blacklist, FILE *proc_self_mountinfo) {
1056 : 0 : unsigned long new_flags = 0, flags_mask = 0;
1057 : 0 : bool submounts = false;
1058 : 0 : int r = 0;
1059 : :
1060 [ # # ]: 0 : assert(m);
1061 [ # # ]: 0 : assert(proc_self_mountinfo);
1062 : :
1063 [ # # # # ]: 0 : if (mount_entry_read_only(m) || m->mode == PRIVATE_DEV) {
1064 : 0 : new_flags |= MS_RDONLY;
1065 : 0 : flags_mask |= MS_RDONLY;
1066 : : }
1067 : :
1068 [ # # ]: 0 : if (m->nosuid) {
1069 : 0 : new_flags |= MS_NOSUID;
1070 : 0 : flags_mask |= MS_NOSUID;
1071 : : }
1072 : :
1073 [ # # ]: 0 : if (flags_mask == 0) /* No Change? */
1074 : 0 : return 0;
1075 : :
1076 : : /* We generally apply these changes recursively, except for /dev, and the cases we know there's
1077 : : * nothing further down. Set /dev readonly, but not submounts like /dev/shm. Also, we only set the
1078 : : * per-mount read-only flag. We can't set it on the superblock, if we are inside a user namespace
1079 : : * and running Linux <= 4.17. */
1080 : 0 : submounts =
1081 [ # # ]: 0 : mount_entry_read_only(m) &&
1082 [ # # # # ]: 0 : !IN_SET(m->mode, EMPTY_DIR, TMPFS);
1083 [ # # ]: 0 : if (submounts)
1084 : 0 : r = bind_remount_recursive_with_mountinfo(mount_entry_path(m), new_flags, flags_mask, blacklist, proc_self_mountinfo);
1085 : : else
1086 : 0 : r = bind_remount_one(mount_entry_path(m), m->flags, new_flags, flags_mask);
1087 : :
1088 : : /* Not that we only turn on the MS_RDONLY flag here, we never turn it off. Something that was marked
1089 : : * read-only already stays this way. This improves compatibility with container managers, where we
1090 : : * won't attempt to undo read-only mounts already applied. */
1091 : :
1092 [ # # # # ]: 0 : if (r == -ENOENT && m->ignore)
1093 : 0 : return 0;
1094 [ # # ]: 0 : if (r < 0)
1095 [ # # # # ]: 0 : return log_debug_errno(r, "Failed to re-mount '%s'%s: %m", mount_entry_path(m),
1096 : : submounts ? " and its submounts" : "");
1097 : 0 : return 0;
1098 : : }
1099 : :
1100 : 0 : static bool namespace_info_mount_apivfs(const NamespaceInfo *ns_info) {
1101 [ # # ]: 0 : assert(ns_info);
1102 : :
1103 : : /*
1104 : : * ProtectControlGroups= and ProtectKernelTunables= imply MountAPIVFS=,
1105 : : * since to protect the API VFS mounts, they need to be around in the
1106 : : * first place...
1107 : : */
1108 : :
1109 [ # # ]: 0 : return ns_info->mount_apivfs ||
1110 [ # # # # ]: 0 : ns_info->protect_control_groups ||
1111 : : ns_info->protect_kernel_tunables;
1112 : : }
1113 : :
1114 : 0 : static size_t namespace_calculate_mounts(
1115 : : const NamespaceInfo *ns_info,
1116 : : char** read_write_paths,
1117 : : char** read_only_paths,
1118 : : char** inaccessible_paths,
1119 : : char** empty_directories,
1120 : : size_t n_bind_mounts,
1121 : : size_t n_temporary_filesystems,
1122 : : const char* tmp_dir,
1123 : : const char* var_tmp_dir,
1124 : : ProtectHome protect_home,
1125 : : ProtectSystem protect_system) {
1126 : :
1127 : : size_t protect_home_cnt;
1128 : 0 : size_t protect_system_cnt =
1129 : : (protect_system == PROTECT_SYSTEM_STRICT ?
1130 [ # # ]: 0 : ELEMENTSOF(protect_system_strict_table) :
1131 : : ((protect_system == PROTECT_SYSTEM_FULL) ?
1132 [ # # ]: 0 : ELEMENTSOF(protect_system_full_table) :
1133 : : ((protect_system == PROTECT_SYSTEM_YES) ?
1134 [ # # ]: 0 : ELEMENTSOF(protect_system_yes_table) : 0)));
1135 : :
1136 : 0 : protect_home_cnt =
1137 : : (protect_home == PROTECT_HOME_YES ?
1138 [ # # ]: 0 : ELEMENTSOF(protect_home_yes_table) :
1139 : : ((protect_home == PROTECT_HOME_READ_ONLY) ?
1140 [ # # ]: 0 : ELEMENTSOF(protect_home_read_only_table) :
1141 : : ((protect_home == PROTECT_HOME_TMPFS) ?
1142 [ # # ]: 0 : ELEMENTSOF(protect_home_tmpfs_table) : 0)));
1143 : :
1144 : 0 : return !!tmp_dir + !!var_tmp_dir +
1145 : 0 : strv_length(read_write_paths) +
1146 : 0 : strv_length(read_only_paths) +
1147 : 0 : strv_length(inaccessible_paths) +
1148 : 0 : strv_length(empty_directories) +
1149 : 0 : n_bind_mounts +
1150 : 0 : n_temporary_filesystems +
1151 : 0 : ns_info->private_dev +
1152 [ # # ]: 0 : (ns_info->protect_kernel_tunables ? ELEMENTSOF(protect_kernel_tunables_table) : 0) +
1153 : 0 : (ns_info->protect_control_groups ? 1 : 0) +
1154 : 0 : (ns_info->protect_kernel_modules ? ELEMENTSOF(protect_kernel_modules_table) : 0) +
1155 : 0 : protect_home_cnt + protect_system_cnt +
1156 : 0 : (ns_info->protect_hostname ? 2 : 0) +
1157 [ # # ]: 0 : (namespace_info_mount_apivfs(ns_info) ? ELEMENTSOF(apivfs_table) : 0);
1158 : : }
1159 : :
1160 : 0 : static void normalize_mounts(const char *root_directory, MountEntry *mounts, size_t *n_mounts) {
1161 [ # # ]: 0 : assert(root_directory);
1162 [ # # ]: 0 : assert(n_mounts);
1163 [ # # # # ]: 0 : assert(mounts || *n_mounts == 0);
1164 : :
1165 : 0 : typesafe_qsort(mounts, *n_mounts, mount_path_compare);
1166 : :
1167 : 0 : drop_duplicates(mounts, n_mounts);
1168 : 0 : drop_outside_root(root_directory, mounts, n_mounts);
1169 : 0 : drop_inaccessible(mounts, n_mounts);
1170 : 0 : drop_nop(mounts, n_mounts);
1171 : 0 : }
1172 : :
1173 : 0 : int setup_namespace(
1174 : : const char* root_directory,
1175 : : const char* root_image,
1176 : : const NamespaceInfo *ns_info,
1177 : : char** read_write_paths,
1178 : : char** read_only_paths,
1179 : : char** inaccessible_paths,
1180 : : char** empty_directories,
1181 : : const BindMount *bind_mounts,
1182 : : size_t n_bind_mounts,
1183 : : const TemporaryFileSystem *temporary_filesystems,
1184 : : size_t n_temporary_filesystems,
1185 : : const char* tmp_dir,
1186 : : const char* var_tmp_dir,
1187 : : ProtectHome protect_home,
1188 : : ProtectSystem protect_system,
1189 : : unsigned long mount_flags,
1190 : : DissectImageFlags dissect_image_flags,
1191 : : char **error_path) {
1192 : :
1193 : 0 : _cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL;
1194 : 0 : _cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL;
1195 : 0 : _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
1196 : 0 : _cleanup_free_ void *root_hash = NULL;
1197 : 0 : MountEntry *m = NULL, *mounts = NULL;
1198 : 0 : size_t n_mounts, root_hash_size = 0;
1199 : 0 : bool require_prefix = false;
1200 : : const char *root;
1201 : 0 : int r = 0;
1202 : :
1203 [ # # ]: 0 : assert(ns_info);
1204 : :
1205 [ # # ]: 0 : if (mount_flags == 0)
1206 : 0 : mount_flags = MS_SHARED;
1207 : :
1208 [ # # ]: 0 : if (root_image) {
1209 : 0 : dissect_image_flags |= DISSECT_IMAGE_REQUIRE_ROOT;
1210 : :
1211 [ # # # # ]: 0 : if (protect_system == PROTECT_SYSTEM_STRICT &&
1212 [ # # ]: 0 : protect_home != PROTECT_HOME_NO &&
1213 : 0 : strv_isempty(read_write_paths))
1214 : 0 : dissect_image_flags |= DISSECT_IMAGE_READ_ONLY;
1215 : :
1216 : 0 : r = loop_device_make_by_path(root_image,
1217 [ # # ]: 0 : dissect_image_flags & DISSECT_IMAGE_READ_ONLY ? O_RDONLY : O_RDWR,
1218 : : &loop_device);
1219 [ # # ]: 0 : if (r < 0)
1220 [ # # ]: 0 : return log_debug_errno(r, "Failed to create loop device for root image: %m");
1221 : :
1222 : 0 : r = root_hash_load(root_image, &root_hash, &root_hash_size);
1223 [ # # ]: 0 : if (r < 0)
1224 [ # # ]: 0 : return log_debug_errno(r, "Failed to load root hash: %m");
1225 : :
1226 : 0 : r = dissect_image(loop_device->fd, root_hash, root_hash_size, dissect_image_flags, &dissected_image);
1227 [ # # ]: 0 : if (r < 0)
1228 [ # # ]: 0 : return log_debug_errno(r, "Failed to dissect image: %m");
1229 : :
1230 : 0 : r = dissected_image_decrypt(dissected_image, NULL, root_hash, root_hash_size, dissect_image_flags, &decrypted_image);
1231 [ # # ]: 0 : if (r < 0)
1232 [ # # ]: 0 : return log_debug_errno(r, "Failed to decrypt dissected image: %m");
1233 : : }
1234 : :
1235 [ # # ]: 0 : if (root_directory)
1236 : 0 : root = root_directory;
1237 : : else {
1238 : : /* Always create the mount namespace in a temporary directory, instead of operating
1239 : : * directly in the root. The temporary directory prevents any mounts from being
1240 : : * potentially obscured my other mounts we already applied.
1241 : : * We use the same mount point for all images, which is safe, since they all live
1242 : : * in their own namespaces after all, and hence won't see each other. */
1243 : :
1244 : 0 : root = "/run/systemd/unit-root";
1245 : 0 : (void) mkdir_label(root, 0700);
1246 : 0 : require_prefix = true;
1247 : : }
1248 : :
1249 : 0 : n_mounts = namespace_calculate_mounts(
1250 : : ns_info,
1251 : : read_write_paths,
1252 : : read_only_paths,
1253 : : inaccessible_paths,
1254 : : empty_directories,
1255 : : n_bind_mounts,
1256 : : n_temporary_filesystems,
1257 : : tmp_dir, var_tmp_dir,
1258 : : protect_home, protect_system);
1259 : :
1260 [ # # ]: 0 : if (n_mounts > 0) {
1261 [ # # ]: 0 : m = mounts = new0(MountEntry, n_mounts);
1262 [ # # ]: 0 : if (!mounts)
1263 : 0 : return -ENOMEM;
1264 : :
1265 : 0 : r = append_access_mounts(&m, read_write_paths, READWRITE, require_prefix);
1266 [ # # ]: 0 : if (r < 0)
1267 : 0 : goto finish;
1268 : :
1269 : 0 : r = append_access_mounts(&m, read_only_paths, READONLY, require_prefix);
1270 [ # # ]: 0 : if (r < 0)
1271 : 0 : goto finish;
1272 : :
1273 : 0 : r = append_access_mounts(&m, inaccessible_paths, INACCESSIBLE, require_prefix);
1274 [ # # ]: 0 : if (r < 0)
1275 : 0 : goto finish;
1276 : :
1277 : 0 : r = append_empty_dir_mounts(&m, empty_directories);
1278 [ # # ]: 0 : if (r < 0)
1279 : 0 : goto finish;
1280 : :
1281 : 0 : r = append_bind_mounts(&m, bind_mounts, n_bind_mounts);
1282 [ # # ]: 0 : if (r < 0)
1283 : 0 : goto finish;
1284 : :
1285 : 0 : r = append_tmpfs_mounts(&m, temporary_filesystems, n_temporary_filesystems);
1286 [ # # ]: 0 : if (r < 0)
1287 : 0 : goto finish;
1288 : :
1289 [ # # ]: 0 : if (tmp_dir) {
1290 : 0 : *(m++) = (MountEntry) {
1291 : : .path_const = "/tmp",
1292 : : .mode = PRIVATE_TMP,
1293 : : .source_const = tmp_dir,
1294 : : };
1295 : : }
1296 : :
1297 [ # # ]: 0 : if (var_tmp_dir) {
1298 : 0 : *(m++) = (MountEntry) {
1299 : : .path_const = "/var/tmp",
1300 : : .mode = PRIVATE_TMP,
1301 : : .source_const = var_tmp_dir,
1302 : : };
1303 : : }
1304 : :
1305 [ # # ]: 0 : if (ns_info->private_dev) {
1306 : 0 : *(m++) = (MountEntry) {
1307 : : .path_const = "/dev",
1308 : : .mode = PRIVATE_DEV,
1309 : : .flags = DEV_MOUNT_OPTIONS,
1310 : : };
1311 : : }
1312 : :
1313 [ # # ]: 0 : if (ns_info->protect_kernel_tunables) {
1314 : 0 : r = append_static_mounts(&m, protect_kernel_tunables_table, ELEMENTSOF(protect_kernel_tunables_table), ns_info->ignore_protect_paths);
1315 [ # # ]: 0 : if (r < 0)
1316 : 0 : goto finish;
1317 : : }
1318 : :
1319 [ # # ]: 0 : if (ns_info->protect_kernel_modules) {
1320 : 0 : r = append_static_mounts(&m, protect_kernel_modules_table, ELEMENTSOF(protect_kernel_modules_table), ns_info->ignore_protect_paths);
1321 [ # # ]: 0 : if (r < 0)
1322 : 0 : goto finish;
1323 : : }
1324 : :
1325 [ # # ]: 0 : if (ns_info->protect_control_groups) {
1326 : 0 : *(m++) = (MountEntry) {
1327 : : .path_const = "/sys/fs/cgroup",
1328 : : .mode = READONLY,
1329 : : };
1330 : : }
1331 : :
1332 : 0 : r = append_protect_home(&m, protect_home, ns_info->ignore_protect_paths);
1333 [ # # ]: 0 : if (r < 0)
1334 : 0 : goto finish;
1335 : :
1336 : 0 : r = append_protect_system(&m, protect_system, false);
1337 [ # # ]: 0 : if (r < 0)
1338 : 0 : goto finish;
1339 : :
1340 [ # # ]: 0 : if (namespace_info_mount_apivfs(ns_info)) {
1341 : 0 : r = append_static_mounts(&m, apivfs_table, ELEMENTSOF(apivfs_table), ns_info->ignore_protect_paths);
1342 [ # # ]: 0 : if (r < 0)
1343 : 0 : goto finish;
1344 : : }
1345 : :
1346 [ # # ]: 0 : if (ns_info->protect_hostname) {
1347 : 0 : *(m++) = (MountEntry) {
1348 : : .path_const = "/proc/sys/kernel/hostname",
1349 : : .mode = READONLY,
1350 : : };
1351 : 0 : *(m++) = (MountEntry) {
1352 : : .path_const = "/proc/sys/kernel/domainname",
1353 : : .mode = READONLY,
1354 : : };
1355 : : }
1356 : :
1357 [ # # ]: 0 : assert(mounts + n_mounts == m);
1358 : :
1359 : : /* Prepend the root directory where that's necessary */
1360 : 0 : r = prefix_where_needed(mounts, n_mounts, root);
1361 [ # # ]: 0 : if (r < 0)
1362 : 0 : goto finish;
1363 : :
1364 : 0 : normalize_mounts(root, mounts, &n_mounts);
1365 : : }
1366 : :
1367 : : /* All above is just preparation, figuring out what to do. Let's now actually start doing something. */
1368 : :
1369 [ # # ]: 0 : if (unshare(CLONE_NEWNS) < 0) {
1370 [ # # ]: 0 : r = log_debug_errno(errno, "Failed to unshare the mount namespace: %m");
1371 [ # # # # ]: 0 : if (IN_SET(r, -EACCES, -EPERM, -EOPNOTSUPP, -ENOSYS))
1372 : : /* If the kernel doesn't support namespaces, or when there's a MAC or seccomp filter in place
1373 : : * that doesn't allow us to create namespaces (or a missing cap), then propagate a recognizable
1374 : : * error back, which the caller can use to detect this case (and only this) and optionally
1375 : : * continue without namespacing applied. */
1376 : 0 : r = -ENOANO;
1377 : :
1378 : 0 : goto finish;
1379 : : }
1380 : :
1381 : : /* Remount / as SLAVE so that nothing now mounted in the namespace
1382 : : * shows up in the parent */
1383 [ # # ]: 0 : if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1384 [ # # ]: 0 : r = log_debug_errno(errno, "Failed to remount '/' as SLAVE: %m");
1385 : 0 : goto finish;
1386 : : }
1387 : :
1388 [ # # ]: 0 : if (root_image) {
1389 : : /* A root image is specified, mount it to the right place */
1390 : 0 : r = dissected_image_mount(dissected_image, root, UID_INVALID, dissect_image_flags);
1391 [ # # ]: 0 : if (r < 0) {
1392 [ # # ]: 0 : log_debug_errno(r, "Failed to mount root image: %m");
1393 : 0 : goto finish;
1394 : : }
1395 : :
1396 [ # # ]: 0 : if (decrypted_image) {
1397 : 0 : r = decrypted_image_relinquish(decrypted_image);
1398 [ # # ]: 0 : if (r < 0) {
1399 [ # # ]: 0 : log_debug_errno(r, "Failed to relinquish decrypted image: %m");
1400 : 0 : goto finish;
1401 : : }
1402 : : }
1403 : :
1404 : 0 : loop_device_relinquish(loop_device);
1405 : :
1406 [ # # ]: 0 : } else if (root_directory) {
1407 : :
1408 : : /* A root directory is specified. Turn its directory into bind mount, if it isn't one yet. */
1409 : 0 : r = path_is_mount_point(root, NULL, AT_SYMLINK_FOLLOW);
1410 [ # # ]: 0 : if (r < 0) {
1411 [ # # ]: 0 : log_debug_errno(r, "Failed to detect that %s is a mount point or not: %m", root);
1412 : 0 : goto finish;
1413 : : }
1414 [ # # ]: 0 : if (r == 0) {
1415 [ # # ]: 0 : if (mount(root, root, NULL, MS_BIND|MS_REC, NULL) < 0) {
1416 [ # # ]: 0 : r = log_debug_errno(errno, "Failed to bind mount '%s': %m", root);
1417 : 0 : goto finish;
1418 : : }
1419 : : }
1420 : :
1421 : : } else {
1422 : :
1423 : : /* Let's mount the main root directory to the root directory to use */
1424 [ # # ]: 0 : if (mount("/", root, NULL, MS_BIND|MS_REC, NULL) < 0) {
1425 [ # # ]: 0 : r = log_debug_errno(errno, "Failed to bind mount '/' on '%s': %m", root);
1426 : 0 : goto finish;
1427 : : }
1428 : : }
1429 : :
1430 : : /* Try to set up the new root directory before mounting anything else there. */
1431 [ # # # # ]: 0 : if (root_image || root_directory)
1432 : 0 : (void) base_filesystem_create(root, UID_INVALID, GID_INVALID);
1433 : :
1434 [ # # ]: 0 : if (n_mounts > 0) {
1435 [ # # ]: 0 : _cleanup_fclose_ FILE *proc_self_mountinfo = NULL;
1436 [ # # ]: 0 : _cleanup_free_ char **blacklist = NULL;
1437 : : size_t j;
1438 : :
1439 : : /* Open /proc/self/mountinfo now as it may become unavailable if we mount anything on top of /proc.
1440 : : * For example, this is the case with the option: 'InaccessiblePaths=/proc' */
1441 : 0 : proc_self_mountinfo = fopen("/proc/self/mountinfo", "re");
1442 [ # # ]: 0 : if (!proc_self_mountinfo) {
1443 [ # # ]: 0 : r = log_debug_errno(errno, "Failed to open /proc/self/mountinfo: %m");
1444 [ # # ]: 0 : if (error_path)
1445 : 0 : *error_path = strdup("/proc/self/mountinfo");
1446 : 0 : goto finish;
1447 : : }
1448 : :
1449 : : /* First round, establish all mounts we need */
1450 : 0 : for (;;) {
1451 : 0 : bool again = false;
1452 : :
1453 [ # # ]: 0 : for (m = mounts; m < mounts + n_mounts; ++m) {
1454 : :
1455 [ # # ]: 0 : if (m->applied)
1456 : 0 : continue;
1457 : :
1458 : 0 : r = follow_symlink(root, m);
1459 [ # # ]: 0 : if (r < 0) {
1460 [ # # # # ]: 0 : if (error_path && mount_entry_path(m))
1461 : 0 : *error_path = strdup(mount_entry_path(m));
1462 : 0 : goto finish;
1463 : : }
1464 [ # # ]: 0 : if (r == 0) {
1465 : : /* We hit a symlinked mount point. The entry got rewritten and might point to a
1466 : : * very different place now. Let's normalize the changed list, and start from
1467 : : * the beginning. After all to mount the entry at the new location we might
1468 : : * need some other mounts first */
1469 : 0 : again = true;
1470 : 0 : break;
1471 : : }
1472 : :
1473 : 0 : r = apply_mount(root, m);
1474 [ # # ]: 0 : if (r < 0) {
1475 [ # # # # ]: 0 : if (error_path && mount_entry_path(m))
1476 : 0 : *error_path = strdup(mount_entry_path(m));
1477 : 0 : goto finish;
1478 : : }
1479 : :
1480 : 0 : m->applied = true;
1481 : : }
1482 : :
1483 [ # # ]: 0 : if (!again)
1484 : 0 : break;
1485 : :
1486 : 0 : normalize_mounts(root, mounts, &n_mounts);
1487 : : }
1488 : :
1489 : : /* Create a blacklist we can pass to bind_mount_recursive() */
1490 : 0 : blacklist = new(char*, n_mounts+1);
1491 [ # # ]: 0 : if (!blacklist) {
1492 : 0 : r = -ENOMEM;
1493 : 0 : goto finish;
1494 : : }
1495 [ # # ]: 0 : for (j = 0; j < n_mounts; j++)
1496 : 0 : blacklist[j] = (char*) mount_entry_path(mounts+j);
1497 : 0 : blacklist[j] = NULL;
1498 : :
1499 : : /* Second round, flip the ro bits if necessary. */
1500 [ # # ]: 0 : for (m = mounts; m < mounts + n_mounts; ++m) {
1501 : 0 : r = make_read_only(m, blacklist, proc_self_mountinfo);
1502 [ # # ]: 0 : if (r < 0) {
1503 [ # # # # ]: 0 : if (error_path && mount_entry_path(m))
1504 : 0 : *error_path = strdup(mount_entry_path(m));
1505 : 0 : goto finish;
1506 : : }
1507 : : }
1508 : : }
1509 : :
1510 : : /* MS_MOVE does not work on MS_SHARED so the remount MS_SHARED will be done later */
1511 : 0 : r = mount_move_root(root);
1512 [ # # ]: 0 : if (r < 0) {
1513 [ # # ]: 0 : log_debug_errno(r, "Failed to mount root with MS_MOVE: %m");
1514 : 0 : goto finish;
1515 : : }
1516 : :
1517 : : /* Remount / as the desired mode. Note that this will not
1518 : : * reestablish propagation from our side to the host, since
1519 : : * what's disconnected is disconnected. */
1520 [ # # ]: 0 : if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0) {
1521 [ # # ]: 0 : r = log_debug_errno(errno, "Failed to remount '/' with desired mount flags: %m");
1522 : 0 : goto finish;
1523 : : }
1524 : :
1525 : 0 : r = 0;
1526 : :
1527 : 0 : finish:
1528 [ # # ]: 0 : for (m = mounts; m < mounts + n_mounts; m++)
1529 : 0 : mount_entry_done(m);
1530 : :
1531 : 0 : free(mounts);
1532 : :
1533 : 0 : return r;
1534 : : }
1535 : :
1536 : 2236 : void bind_mount_free_many(BindMount *b, size_t n) {
1537 : : size_t i;
1538 : :
1539 [ + - - + ]: 2236 : assert(b || n == 0);
1540 : :
1541 [ - + ]: 2236 : for (i = 0; i < n; i++) {
1542 : 0 : free(b[i].source);
1543 : 0 : free(b[i].destination);
1544 : : }
1545 : :
1546 : 2236 : free(b);
1547 : 2236 : }
1548 : :
1549 : 0 : int bind_mount_add(BindMount **b, size_t *n, const BindMount *item) {
1550 : 0 : _cleanup_free_ char *s = NULL, *d = NULL;
1551 : : BindMount *c;
1552 : :
1553 [ # # ]: 0 : assert(b);
1554 [ # # ]: 0 : assert(n);
1555 [ # # ]: 0 : assert(item);
1556 : :
1557 : 0 : s = strdup(item->source);
1558 [ # # ]: 0 : if (!s)
1559 : 0 : return -ENOMEM;
1560 : :
1561 : 0 : d = strdup(item->destination);
1562 [ # # ]: 0 : if (!d)
1563 : 0 : return -ENOMEM;
1564 : :
1565 : 0 : c = reallocarray(*b, *n + 1, sizeof(BindMount));
1566 [ # # ]: 0 : if (!c)
1567 : 0 : return -ENOMEM;
1568 : :
1569 : 0 : *b = c;
1570 : :
1571 : 0 : c[(*n) ++] = (BindMount) {
1572 : 0 : .source = TAKE_PTR(s),
1573 : 0 : .destination = TAKE_PTR(d),
1574 : 0 : .read_only = item->read_only,
1575 : 0 : .nosuid = item->nosuid,
1576 : 0 : .recursive = item->recursive,
1577 : 0 : .ignore_enoent = item->ignore_enoent,
1578 : : };
1579 : :
1580 : 0 : return 0;
1581 : : }
1582 : :
1583 : 2236 : void temporary_filesystem_free_many(TemporaryFileSystem *t, size_t n) {
1584 : : size_t i;
1585 : :
1586 [ + - - + ]: 2236 : assert(t || n == 0);
1587 : :
1588 [ - + ]: 2236 : for (i = 0; i < n; i++) {
1589 : 0 : free(t[i].path);
1590 : 0 : free(t[i].options);
1591 : : }
1592 : :
1593 : 2236 : free(t);
1594 : 2236 : }
1595 : :
1596 : 0 : int temporary_filesystem_add(
1597 : : TemporaryFileSystem **t,
1598 : : size_t *n,
1599 : : const char *path,
1600 : : const char *options) {
1601 : :
1602 : 0 : _cleanup_free_ char *p = NULL, *o = NULL;
1603 : : TemporaryFileSystem *c;
1604 : :
1605 [ # # ]: 0 : assert(t);
1606 [ # # ]: 0 : assert(n);
1607 [ # # ]: 0 : assert(path);
1608 : :
1609 : 0 : p = strdup(path);
1610 [ # # ]: 0 : if (!p)
1611 : 0 : return -ENOMEM;
1612 : :
1613 [ # # ]: 0 : if (!isempty(options)) {
1614 : 0 : o = strdup(options);
1615 [ # # ]: 0 : if (!o)
1616 : 0 : return -ENOMEM;
1617 : : }
1618 : :
1619 : 0 : c = reallocarray(*t, *n + 1, sizeof(TemporaryFileSystem));
1620 [ # # ]: 0 : if (!c)
1621 : 0 : return -ENOMEM;
1622 : :
1623 : 0 : *t = c;
1624 : :
1625 : 0 : c[(*n) ++] = (TemporaryFileSystem) {
1626 : 0 : .path = TAKE_PTR(p),
1627 : 0 : .options = TAKE_PTR(o),
1628 : : };
1629 : :
1630 : 0 : return 0;
1631 : : }
1632 : :
1633 : 0 : static int setup_one_tmp_dir(const char *id, const char *prefix, char **path) {
1634 : 0 : _cleanup_free_ char *x = NULL;
1635 : : char bid[SD_ID128_STRING_MAX];
1636 : : sd_id128_t boot_id;
1637 : : int r;
1638 : :
1639 [ # # ]: 0 : assert(id);
1640 [ # # ]: 0 : assert(prefix);
1641 [ # # ]: 0 : assert(path);
1642 : :
1643 : : /* We include the boot id in the directory so that after a
1644 : : * reboot we can easily identify obsolete directories. */
1645 : :
1646 : 0 : r = sd_id128_get_boot(&boot_id);
1647 [ # # ]: 0 : if (r < 0)
1648 : 0 : return r;
1649 : :
1650 : 0 : x = strjoin(prefix, "/systemd-private-", sd_id128_to_string(boot_id, bid), "-", id, "-XXXXXX");
1651 [ # # ]: 0 : if (!x)
1652 : 0 : return -ENOMEM;
1653 : :
1654 [ # # # # ]: 0 : RUN_WITH_UMASK(0077)
1655 [ # # ]: 0 : if (!mkdtemp(x))
1656 : 0 : return -errno;
1657 : :
1658 [ # # # # ]: 0 : RUN_WITH_UMASK(0000) {
1659 : : char *y;
1660 : :
1661 [ # # # # : 0 : y = strjoina(x, "/tmp");
# # # # #
# # # ]
1662 : :
1663 [ # # ]: 0 : if (mkdir(y, 0777 | S_ISVTX) < 0)
1664 : 0 : return -errno;
1665 : : }
1666 : :
1667 : 0 : *path = TAKE_PTR(x);
1668 : :
1669 : 0 : return 0;
1670 : : }
1671 : :
/* Creates a pair of private temporary directories, one below /tmp and one below /var/tmp.
 *
 * On success returns 0 and stores the two newly allocated paths in *tmp_dir and *var_tmp_dir.
 * On failure returns a negative errno-style value and leaves both outputs untouched; if only the
 * second directory could not be created, the first one is removed again. */
int setup_tmp_dirs(const char *id, char **tmp_dir, char **var_tmp_dir) {
        char *private_tmp, *private_var_tmp;
        int r;

        assert(id);
        assert(tmp_dir);
        assert(var_tmp_dir);

        r = setup_one_tmp_dir(id, "/tmp", &private_tmp);
        if (r < 0)
                return r;

        r = setup_one_tmp_dir(id, "/var/tmp", &private_var_tmp);
        if (r >= 0) {
                *tmp_dir = private_tmp;
                *var_tmp_dir = private_var_tmp;
                return 0;
        }

        /* Roll back the first directory: remove the inner "tmp" directory first, then the outer one.
         * Both removals are best effort. */
        {
                char *inner;

                inner = strjoina(private_tmp, "/tmp");
                (void) rmdir(inner);
                (void) rmdir(private_tmp);
        }

        free(private_tmp);
        return r;
}
1701 : :
1702 : 0 : int setup_netns(const int netns_storage_socket[static 2]) {
1703 : 0 : _cleanup_close_ int netns = -1;
1704 : : int r, q;
1705 : :
1706 [ # # ]: 0 : assert(netns_storage_socket);
1707 [ # # ]: 0 : assert(netns_storage_socket[0] >= 0);
1708 [ # # ]: 0 : assert(netns_storage_socket[1] >= 0);
1709 : :
1710 : : /* We use the passed socketpair as a storage buffer for our
1711 : : * namespace reference fd. Whatever process runs this first
1712 : : * shall create a new namespace, all others should just join
1713 : : * it. To serialize that we use a file lock on the socket
1714 : : * pair.
1715 : : *
1716 : : * It's a bit crazy, but hey, works great! */
1717 : :
1718 [ # # ]: 0 : if (lockf(netns_storage_socket[0], F_LOCK, 0) < 0)
1719 : 0 : return -errno;
1720 : :
1721 : 0 : netns = receive_one_fd(netns_storage_socket[0], MSG_DONTWAIT);
1722 [ # # ]: 0 : if (netns == -EAGAIN) {
1723 : : /* Nothing stored yet, so let's create a new namespace. */
1724 : :
1725 [ # # ]: 0 : if (unshare(CLONE_NEWNET) < 0) {
1726 : 0 : r = -errno;
1727 : 0 : goto fail;
1728 : : }
1729 : :
1730 : 0 : (void) loopback_setup();
1731 : :
1732 : 0 : netns = open("/proc/self/ns/net", O_RDONLY|O_CLOEXEC|O_NOCTTY);
1733 [ # # ]: 0 : if (netns < 0) {
1734 : 0 : r = -errno;
1735 : 0 : goto fail;
1736 : : }
1737 : :
1738 : 0 : r = 1;
1739 : :
1740 [ # # ]: 0 : } else if (netns < 0) {
1741 : 0 : r = netns;
1742 : 0 : goto fail;
1743 : :
1744 : : } else {
1745 : : /* Yay, found something, so let's join the namespace */
1746 [ # # ]: 0 : if (setns(netns, CLONE_NEWNET) < 0) {
1747 : 0 : r = -errno;
1748 : 0 : goto fail;
1749 : : }
1750 : :
1751 : 0 : r = 0;
1752 : : }
1753 : :
1754 : 0 : q = send_one_fd(netns_storage_socket[1], netns, MSG_DONTWAIT);
1755 [ # # ]: 0 : if (q < 0) {
1756 : 0 : r = q;
1757 : 0 : goto fail;
1758 : : }
1759 : :
1760 : 0 : fail:
1761 : 0 : (void) lockf(netns_storage_socket[0], F_ULOCK, 0);
1762 : 0 : return r;
1763 : : }
1764 : :
1765 : 0 : int open_netns_path(const int netns_storage_socket[static 2], const char *path) {
1766 : 0 : _cleanup_close_ int netns = -1;
1767 : : int q, r;
1768 : :
1769 [ # # ]: 0 : assert(netns_storage_socket);
1770 [ # # ]: 0 : assert(netns_storage_socket[0] >= 0);
1771 [ # # ]: 0 : assert(netns_storage_socket[1] >= 0);
1772 [ # # ]: 0 : assert(path);
1773 : :
1774 : : /* If the storage socket doesn't contain a netns fd yet, open one via the file system and store it in
1775 : : * it. This is supposed to be called ahead of time, i.e. before setup_netns() which will allocate a
1776 : : * new anonymous netns if needed. */
1777 : :
1778 [ # # ]: 0 : if (lockf(netns_storage_socket[0], F_LOCK, 0) < 0)
1779 : 0 : return -errno;
1780 : :
1781 : 0 : netns = receive_one_fd(netns_storage_socket[0], MSG_DONTWAIT);
1782 [ # # ]: 0 : if (netns == -EAGAIN) {
1783 : : /* Nothing stored yet. Open the file from the file system. */
1784 : :
1785 : 0 : netns = open(path, O_RDONLY|O_NOCTTY|O_CLOEXEC);
1786 [ # # ]: 0 : if (netns < 0) {
1787 : 0 : r = -errno;
1788 : 0 : goto fail;
1789 : : }
1790 : :
1791 : 0 : r = fd_is_network_ns(netns);
1792 [ # # ]: 0 : if (r == 0) { /* Not a netns? Refuse early. */
1793 : 0 : r = -EINVAL;
1794 : 0 : goto fail;
1795 : : }
1796 [ # # # # ]: 0 : if (r < 0 && r != -EUCLEAN) /* EUCLEAN: we don't know */
1797 : 0 : goto fail;
1798 : :
1799 : 0 : r = 1;
1800 : :
1801 [ # # ]: 0 : } else if (netns < 0) {
1802 : 0 : r = netns;
1803 : 0 : goto fail;
1804 : : } else
1805 : 0 : r = 0; /* Already allocated */
1806 : :
1807 : 0 : q = send_one_fd(netns_storage_socket[1], netns, MSG_DONTWAIT);
1808 [ # # ]: 0 : if (q < 0) {
1809 : 0 : r = q;
1810 : 0 : goto fail;
1811 : : }
1812 : :
1813 : 0 : fail:
1814 : 0 : (void) lockf(netns_storage_socket[0], F_ULOCK, 0);
1815 : 0 : return r;
1816 : : }
1817 : :
1818 : 0 : bool ns_type_supported(NamespaceType type) {
1819 : : const char *t, *ns_proc;
1820 : :
1821 : 0 : t = namespace_type_to_string(type);
1822 [ # # ]: 0 : if (!t) /* Don't know how to translate this? Then it's not supported */
1823 : 0 : return false;
1824 : :
1825 [ # # # # : 0 : ns_proc = strjoina("/proc/self/ns/", t);
# # # # #
# # # ]
1826 : 0 : return access(ns_proc, F_OK) == 0;
1827 : : }
1828 : :
1829 : : static const char *const protect_home_table[_PROTECT_HOME_MAX] = {
1830 : : [PROTECT_HOME_NO] = "no",
1831 : : [PROTECT_HOME_YES] = "yes",
1832 : : [PROTECT_HOME_READ_ONLY] = "read-only",
1833 : : [PROTECT_HOME_TMPFS] = "tmpfs",
1834 : : };
1835 : :
1836 [ + + + + : 732 : DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(protect_home, ProtectHome, PROTECT_HOME_YES);
+ + ]
1837 : :
1838 : : static const char *const protect_system_table[_PROTECT_SYSTEM_MAX] = {
1839 : : [PROTECT_SYSTEM_NO] = "no",
1840 : : [PROTECT_SYSTEM_YES] = "yes",
1841 : : [PROTECT_SYSTEM_FULL] = "full",
1842 : : [PROTECT_SYSTEM_STRICT] = "strict",
1843 : : };
1844 : :
1845 [ + + + + : 732 : DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(protect_system, ProtectSystem, PROTECT_SYSTEM_YES);
+ + ]
1846 : :
1847 : : static const char* const namespace_type_table[] = {
1848 : : [NAMESPACE_MOUNT] = "mnt",
1849 : : [NAMESPACE_CGROUP] = "cgroup",
1850 : : [NAMESPACE_UTS] = "uts",
1851 : : [NAMESPACE_IPC] = "ipc",
1852 : : [NAMESPACE_USER] = "user",
1853 : : [NAMESPACE_PID] = "pid",
1854 : : [NAMESPACE_NET] = "net",
1855 : : };
1856 : :
1857 [ + + + + ]: 72 : DEFINE_STRING_TABLE_LOOKUP(namespace_type, NamespaceType);
|