Line data Source code
1 : /* SPDX-License-Identifier: LGPL-2.1+ */
2 :
3 : #include <errno.h>
4 : #include <sched.h>
5 : #include <stdio.h>
6 : #include <string.h>
7 : #include <sys/mount.h>
8 : #include <sys/stat.h>
9 : #include <unistd.h>
10 : #include <linux/fs.h>
11 :
12 : #include "alloc-util.h"
13 : #include "base-filesystem.h"
14 : #include "dev-setup.h"
15 : #include "fd-util.h"
16 : #include "fs-util.h"
17 : #include "label.h"
18 : #include "loop-util.h"
19 : #include "loopback-setup.h"
20 : #include "missing.h"
21 : #include "mkdir.h"
22 : #include "mount-util.h"
23 : #include "mountpoint-util.h"
24 : #include "namespace-util.h"
25 : #include "namespace.h"
26 : #include "nulstr-util.h"
27 : #include "path-util.h"
28 : #include "selinux-util.h"
29 : #include "socket-util.h"
30 : #include "sort-util.h"
31 : #include "stat-util.h"
32 : #include "string-table.h"
33 : #include "string-util.h"
34 : #include "strv.h"
35 : #include "umask-util.h"
36 : #include "user-util.h"
37 :
/* Mount flags passed to mount() for the tmpfs instance that backs a private /dev (see mount_private_dev()). */
#define DEV_MOUNT_OPTIONS (MS_NOSUID|MS_NOEXEC|MS_STRICTATIME)
39 :
/* The kind of operation a MountEntry requests.
 *
 * The enumerators are listed by priority, highest first: mount_path_compare() orders entries that refer to
 * the same path by this numeric value. Do not reorder without considering that. */
typedef enum MountMode {
        INACCESSIBLE = 0,
        BIND_MOUNT,
        BIND_MOUNT_RECURSIVE,
        PRIVATE_TMP,
        PRIVATE_DEV,
        BIND_DEV,
        EMPTY_DIR,
        SYSFS,
        PROCFS,
        READONLY,
        READWRITE,
        TMPFS,
        READWRITE_IMPLICIT,     /* Should have the lowest priority. */
        _MOUNT_MODE_MAX,        /* Number of modes; sizes mount_mode_table[]. */
} MountMode;
57 :
58 : typedef struct MountEntry {
59 : const char *path_const; /* Memory allocated on stack or static */
60 : MountMode mode:5;
61 : bool ignore:1; /* Ignore if path does not exist? */
62 : bool has_prefix:1; /* Already is prefixed by the root dir? */
63 : bool read_only:1; /* Shall this mount point be read-only? */
64 : bool nosuid:1; /* Shall set MS_NOSUID on the mount itself */
65 : bool applied:1; /* Already applied */
66 : char *path_malloc; /* Use this instead of 'path_const' if we had to allocate memory */
67 : const char *source_const; /* The source path, for bind mounts */
68 : char *source_malloc;
69 : const char *options_const;/* Mount options for tmpfs */
70 : char *options_malloc;
71 : unsigned long flags; /* Mount flags used by EMPTY_DIR and TMPFS. Do not include MS_RDONLY here, but please use read_only. */
72 : unsigned n_followed;
73 : } MountEntry;
74 :
75 : /* If MountAPIVFS= is used, let's mount /sys and /proc into the it, but only as a fallback if the user hasn't mounted
76 : * something there already. These mounts are hence overridden by any other explicitly configured mounts. */
77 : static const MountEntry apivfs_table[] = {
78 : { "/proc", PROCFS, false },
79 : { "/dev", BIND_DEV, false },
80 : { "/sys", SYSFS, false },
81 : };
82 :
83 : /* ProtectKernelTunables= option and the related filesystem APIs */
84 : static const MountEntry protect_kernel_tunables_table[] = {
85 : { "/proc/acpi", READONLY, true },
86 : { "/proc/apm", READONLY, true }, /* Obsolete API, there's no point in permitting access to this, ever */
87 : { "/proc/asound", READONLY, true },
88 : { "/proc/bus", READONLY, true },
89 : { "/proc/fs", READONLY, true },
90 : { "/proc/irq", READONLY, true },
91 : { "/proc/kallsyms", INACCESSIBLE, true },
92 : { "/proc/kcore", INACCESSIBLE, true },
93 : { "/proc/latency_stats", READONLY, true },
94 : { "/proc/mtrr", READONLY, true },
95 : { "/proc/scsi", READONLY, true },
96 : { "/proc/sys", READONLY, false },
97 : { "/proc/sysrq-trigger", READONLY, true },
98 : { "/proc/timer_stats", READONLY, true },
99 : { "/sys", READONLY, false },
100 : { "/sys/fs/bpf", READONLY, true },
101 : { "/sys/fs/cgroup", READWRITE_IMPLICIT, false }, /* READONLY is set by ProtectControlGroups= option */
102 : { "/sys/fs/selinux", READWRITE_IMPLICIT, true },
103 : { "/sys/kernel/debug", READONLY, true },
104 : { "/sys/kernel/tracing", READONLY, true },
105 : };
106 :
107 : /* ProtectKernelModules= option */
108 : static const MountEntry protect_kernel_modules_table[] = {
109 : #if HAVE_SPLIT_USR
110 : { "/lib/modules", INACCESSIBLE, true },
111 : #endif
112 : { "/usr/lib/modules", INACCESSIBLE, true },
113 : };
114 :
115 : /*
116 : * ProtectHome=read-only table, protect $HOME and $XDG_RUNTIME_DIR and rest of
117 : * system should be protected by ProtectSystem=
118 : */
119 : static const MountEntry protect_home_read_only_table[] = {
120 : { "/home", READONLY, true },
121 : { "/run/user", READONLY, true },
122 : { "/root", READONLY, true },
123 : };
124 :
125 : /* ProtectHome=tmpfs table */
126 : static const MountEntry protect_home_tmpfs_table[] = {
127 : { "/home", TMPFS, true, .read_only = true, .options_const = "mode=0755", .flags = MS_NODEV|MS_STRICTATIME },
128 : { "/run/user", TMPFS, true, .read_only = true, .options_const = "mode=0755", .flags = MS_NODEV|MS_STRICTATIME },
129 : { "/root", TMPFS, true, .read_only = true, .options_const = "mode=0700", .flags = MS_NODEV|MS_STRICTATIME },
130 : };
131 :
132 : /* ProtectHome=yes table */
133 : static const MountEntry protect_home_yes_table[] = {
134 : { "/home", INACCESSIBLE, true },
135 : { "/run/user", INACCESSIBLE, true },
136 : { "/root", INACCESSIBLE, true },
137 : };
138 :
139 : /* ProtectSystem=yes table */
140 : static const MountEntry protect_system_yes_table[] = {
141 : { "/usr", READONLY, false },
142 : { "/boot", READONLY, true },
143 : { "/efi", READONLY, true },
144 : #if HAVE_SPLIT_USR
145 : { "/lib", READONLY, true },
146 : { "/lib64", READONLY, true },
147 : { "/bin", READONLY, true },
148 : # if HAVE_SPLIT_BIN
149 : { "/sbin", READONLY, true },
150 : # endif
151 : #endif
152 : };
153 :
154 : /* ProtectSystem=full includes ProtectSystem=yes */
155 : static const MountEntry protect_system_full_table[] = {
156 : { "/usr", READONLY, false },
157 : { "/boot", READONLY, true },
158 : { "/efi", READONLY, true },
159 : { "/etc", READONLY, false },
160 : #if HAVE_SPLIT_USR
161 : { "/lib", READONLY, true },
162 : { "/lib64", READONLY, true },
163 : { "/bin", READONLY, true },
164 : # if HAVE_SPLIT_BIN
165 : { "/sbin", READONLY, true },
166 : # endif
167 : #endif
168 : };
169 :
170 : /*
171 : * ProtectSystem=strict table. In this strict mode, we mount everything
172 : * read-only, except for /proc, /dev, /sys which are the kernel API VFS,
173 : * which are left writable, but PrivateDevices= + ProtectKernelTunables=
174 : * protect those, and these options should be fully orthogonal.
175 : * (And of course /home and friends are also left writable, as ProtectHome=
176 : * shall manage those, orthogonally).
177 : */
178 : static const MountEntry protect_system_strict_table[] = {
179 : { "/", READONLY, false },
180 : { "/proc", READWRITE_IMPLICIT, false }, /* ProtectKernelTunables= */
181 : { "/sys", READWRITE_IMPLICIT, false }, /* ProtectKernelTunables= */
182 : { "/dev", READWRITE_IMPLICIT, false }, /* PrivateDevices= */
183 : { "/home", READWRITE_IMPLICIT, true }, /* ProtectHome= */
184 : { "/run/user", READWRITE_IMPLICIT, true }, /* ProtectHome= */
185 : { "/root", READWRITE_IMPLICIT, true }, /* ProtectHome= */
186 : };
187 :
188 : static const char * const mount_mode_table[_MOUNT_MODE_MAX] = {
189 : [INACCESSIBLE] = "inaccessible",
190 : [BIND_MOUNT] = "bind",
191 : [BIND_MOUNT_RECURSIVE] = "rbind",
192 : [PRIVATE_TMP] = "private-tmp",
193 : [PRIVATE_DEV] = "private-dev",
194 : [BIND_DEV] = "bind-dev",
195 : [EMPTY_DIR] = "empty",
196 : [SYSFS] = "sysfs",
197 : [PROCFS] = "procfs",
198 : [READONLY] = "read-only",
199 : [READWRITE] = "read-write",
200 : [TMPFS] = "tmpfs",
201 : [READWRITE_IMPLICIT] = "rw-implicit",
202 : };
203 :
204 0 : DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(mount_mode, MountMode);
205 :
206 0 : static const char *mount_entry_path(const MountEntry *p) {
207 0 : assert(p);
208 :
209 : /* Returns the path of this bind mount. If the malloc()-allocated ->path_buffer field is set we return that,
210 : * otherwise the stack/static ->path field is returned. */
211 :
212 0 : return p->path_malloc ?: p->path_const;
213 : }
214 :
215 0 : static bool mount_entry_read_only(const MountEntry *p) {
216 0 : assert(p);
217 :
218 0 : return p->read_only || IN_SET(p->mode, READONLY, INACCESSIBLE);
219 : }
220 :
221 0 : static const char *mount_entry_source(const MountEntry *p) {
222 0 : assert(p);
223 :
224 0 : return p->source_malloc ?: p->source_const;
225 : }
226 :
227 0 : static const char *mount_entry_options(const MountEntry *p) {
228 0 : assert(p);
229 :
230 0 : return p->options_malloc ?: p->options_const;
231 : }
232 :
233 0 : static void mount_entry_done(MountEntry *p) {
234 0 : assert(p);
235 :
236 0 : p->path_malloc = mfree(p->path_malloc);
237 0 : p->source_malloc = mfree(p->source_malloc);
238 0 : p->options_malloc = mfree(p->options_malloc);
239 0 : }
240 :
241 0 : static int append_access_mounts(MountEntry **p, char **strv, MountMode mode, bool forcibly_require_prefix) {
242 : char **i;
243 :
244 0 : assert(p);
245 :
246 : /* Adds a list of user-supplied READWRITE/READWRITE_IMPLICIT/READONLY/INACCESSIBLE entries */
247 :
248 0 : STRV_FOREACH(i, strv) {
249 0 : bool ignore = false, needs_prefix = false;
250 0 : const char *e = *i;
251 :
252 : /* Look for any prefixes */
253 0 : if (startswith(e, "-")) {
254 0 : e++;
255 0 : ignore = true;
256 : }
257 0 : if (startswith(e, "+")) {
258 0 : e++;
259 0 : needs_prefix = true;
260 : }
261 :
262 0 : if (!path_is_absolute(e))
263 0 : return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
264 : "Path is not absolute: %s", e);
265 :
266 0 : *((*p)++) = (MountEntry) {
267 : .path_const = e,
268 : .mode = mode,
269 : .ignore = ignore,
270 0 : .has_prefix = !needs_prefix && !forcibly_require_prefix,
271 : };
272 : }
273 :
274 0 : return 0;
275 : }
276 :
277 0 : static int append_empty_dir_mounts(MountEntry **p, char **strv) {
278 : char **i;
279 :
280 0 : assert(p);
281 :
282 : /* Adds tmpfs mounts to provide readable but empty directories. This is primarily used to implement the
283 : * "/private/" boundary directories for DynamicUser=1. */
284 :
285 0 : STRV_FOREACH(i, strv) {
286 :
287 0 : *((*p)++) = (MountEntry) {
288 0 : .path_const = *i,
289 : .mode = EMPTY_DIR,
290 : .ignore = false,
291 : .read_only = true,
292 : .options_const = "mode=755",
293 : .flags = MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME,
294 : };
295 : }
296 :
297 0 : return 0;
298 : }
299 :
300 0 : static int append_bind_mounts(MountEntry **p, const BindMount *binds, size_t n) {
301 : size_t i;
302 :
303 0 : assert(p);
304 :
305 0 : for (i = 0; i < n; i++) {
306 0 : const BindMount *b = binds + i;
307 :
308 0 : *((*p)++) = (MountEntry) {
309 0 : .path_const = b->destination,
310 0 : .mode = b->recursive ? BIND_MOUNT_RECURSIVE : BIND_MOUNT,
311 0 : .read_only = b->read_only,
312 0 : .nosuid = b->nosuid,
313 0 : .source_const = b->source,
314 0 : .ignore = b->ignore_enoent,
315 : };
316 : }
317 :
318 0 : return 0;
319 : }
320 :
321 0 : static int append_tmpfs_mounts(MountEntry **p, const TemporaryFileSystem *tmpfs, size_t n) {
322 : size_t i;
323 : int r;
324 :
325 0 : assert(p);
326 :
327 0 : for (i = 0; i < n; i++) {
328 0 : const TemporaryFileSystem *t = tmpfs + i;
329 0 : _cleanup_free_ char *o = NULL, *str = NULL;
330 : unsigned long flags;
331 0 : bool ro = false;
332 :
333 0 : if (!path_is_absolute(t->path))
334 0 : return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
335 : "Path is not absolute: %s",
336 : t->path);
337 :
338 0 : str = strjoin("mode=0755,", t->options);
339 0 : if (!str)
340 0 : return -ENOMEM;
341 :
342 0 : r = mount_option_mangle(str, MS_NODEV|MS_STRICTATIME, &flags, &o);
343 0 : if (r < 0)
344 0 : return log_debug_errno(r, "Failed to parse mount option '%s': %m", str);
345 :
346 0 : ro = flags & MS_RDONLY;
347 0 : if (ro)
348 0 : flags ^= MS_RDONLY;
349 :
350 0 : *((*p)++) = (MountEntry) {
351 0 : .path_const = t->path,
352 : .mode = TMPFS,
353 : .read_only = ro,
354 0 : .options_malloc = TAKE_PTR(o),
355 : .flags = flags,
356 : };
357 : }
358 :
359 0 : return 0;
360 : }
361 :
362 0 : static int append_static_mounts(MountEntry **p, const MountEntry *mounts, size_t n, bool ignore_protect) {
363 : size_t i;
364 :
365 0 : assert(p);
366 0 : assert(mounts);
367 :
368 : /* Adds a list of static pre-defined entries */
369 :
370 0 : for (i = 0; i < n; i++)
371 0 : *((*p)++) = (MountEntry) {
372 0 : .path_const = mount_entry_path(mounts+i),
373 0 : .mode = mounts[i].mode,
374 0 : .ignore = mounts[i].ignore || ignore_protect,
375 : };
376 :
377 0 : return 0;
378 : }
379 :
380 0 : static int append_protect_home(MountEntry **p, ProtectHome protect_home, bool ignore_protect) {
381 0 : assert(p);
382 :
383 0 : switch (protect_home) {
384 :
385 0 : case PROTECT_HOME_NO:
386 0 : return 0;
387 :
388 0 : case PROTECT_HOME_READ_ONLY:
389 0 : return append_static_mounts(p, protect_home_read_only_table, ELEMENTSOF(protect_home_read_only_table), ignore_protect);
390 :
391 0 : case PROTECT_HOME_TMPFS:
392 0 : return append_static_mounts(p, protect_home_tmpfs_table, ELEMENTSOF(protect_home_tmpfs_table), ignore_protect);
393 :
394 0 : case PROTECT_HOME_YES:
395 0 : return append_static_mounts(p, protect_home_yes_table, ELEMENTSOF(protect_home_yes_table), ignore_protect);
396 :
397 0 : default:
398 0 : assert_not_reached("Unexpected ProtectHome= value");
399 : }
400 : }
401 :
402 0 : static int append_protect_system(MountEntry **p, ProtectSystem protect_system, bool ignore_protect) {
403 0 : assert(p);
404 :
405 0 : switch (protect_system) {
406 :
407 0 : case PROTECT_SYSTEM_NO:
408 0 : return 0;
409 :
410 0 : case PROTECT_SYSTEM_STRICT:
411 0 : return append_static_mounts(p, protect_system_strict_table, ELEMENTSOF(protect_system_strict_table), ignore_protect);
412 :
413 0 : case PROTECT_SYSTEM_YES:
414 0 : return append_static_mounts(p, protect_system_yes_table, ELEMENTSOF(protect_system_yes_table), ignore_protect);
415 :
416 0 : case PROTECT_SYSTEM_FULL:
417 0 : return append_static_mounts(p, protect_system_full_table, ELEMENTSOF(protect_system_full_table), ignore_protect);
418 :
419 0 : default:
420 0 : assert_not_reached("Unexpected ProtectSystem= value");
421 : }
422 : }
423 :
424 0 : static int mount_path_compare(const MountEntry *a, const MountEntry *b) {
425 : int d;
426 :
427 : /* If the paths are not equal, then order prefixes first */
428 0 : d = path_compare(mount_entry_path(a), mount_entry_path(b));
429 0 : if (d != 0)
430 0 : return d;
431 :
432 : /* If the paths are equal, check the mode */
433 0 : return CMP((int) a->mode, (int) b->mode);
434 : }
435 :
436 0 : static int prefix_where_needed(MountEntry *m, size_t n, const char *root_directory) {
437 : size_t i;
438 :
439 : /* Prefixes all paths in the bind mount table with the root directory if the entry needs that. */
440 :
441 0 : for (i = 0; i < n; i++) {
442 : char *s;
443 :
444 0 : if (m[i].has_prefix)
445 0 : continue;
446 :
447 0 : s = path_join(root_directory, mount_entry_path(m+i));
448 0 : if (!s)
449 0 : return -ENOMEM;
450 :
451 0 : free_and_replace(m[i].path_malloc, s);
452 0 : m[i].has_prefix = true;
453 : }
454 :
455 0 : return 0;
456 : }
457 :
458 0 : static void drop_duplicates(MountEntry *m, size_t *n) {
459 : MountEntry *f, *t, *previous;
460 :
461 0 : assert(m);
462 0 : assert(n);
463 :
464 : /* Drops duplicate entries. Expects that the array is properly ordered already. */
465 :
466 0 : for (f = m, t = m, previous = NULL; f < m + *n; f++) {
467 :
468 : /* The first one wins (which is the one with the more restrictive mode), see mount_path_compare()
469 : * above. Note that we only drop duplicates that haven't been mounted yet. */
470 0 : if (previous &&
471 0 : path_equal(mount_entry_path(f), mount_entry_path(previous)) &&
472 0 : !f->applied && !previous->applied) {
473 0 : log_debug("%s (%s) is duplicate.", mount_entry_path(f), mount_mode_to_string(f->mode));
474 0 : previous->read_only = previous->read_only || mount_entry_read_only(f); /* Propagate the read-only flag to the remaining entry */
475 0 : mount_entry_done(f);
476 0 : continue;
477 : }
478 :
479 0 : *t = *f;
480 0 : previous = t;
481 0 : t++;
482 : }
483 :
484 0 : *n = t - m;
485 0 : }
486 :
487 0 : static void drop_inaccessible(MountEntry *m, size_t *n) {
488 : MountEntry *f, *t;
489 0 : const char *clear = NULL;
490 :
491 0 : assert(m);
492 0 : assert(n);
493 :
494 : /* Drops all entries obstructed by another entry further up the tree. Expects that the array is properly
495 : * ordered already. */
496 :
497 0 : for (f = m, t = m; f < m + *n; f++) {
498 :
499 : /* If we found a path set for INACCESSIBLE earlier, and this entry has it as prefix we should drop
500 : * it, as inaccessible paths really should drop the entire subtree. */
501 0 : if (clear && path_startswith(mount_entry_path(f), clear)) {
502 0 : log_debug("%s is masked by %s.", mount_entry_path(f), clear);
503 0 : mount_entry_done(f);
504 0 : continue;
505 : }
506 :
507 0 : clear = f->mode == INACCESSIBLE ? mount_entry_path(f) : NULL;
508 :
509 0 : *t = *f;
510 0 : t++;
511 : }
512 :
513 0 : *n = t - m;
514 0 : }
515 :
516 0 : static void drop_nop(MountEntry *m, size_t *n) {
517 : MountEntry *f, *t;
518 :
519 0 : assert(m);
520 0 : assert(n);
521 :
522 : /* Drops all entries which have an immediate parent that has the same type, as they are redundant. Assumes the
523 : * list is ordered by prefixes. */
524 :
525 0 : for (f = m, t = m; f < m + *n; f++) {
526 :
527 : /* Only suppress such subtrees for READONLY, READWRITE and READWRITE_IMPLICIT entries */
528 0 : if (IN_SET(f->mode, READONLY, READWRITE, READWRITE_IMPLICIT)) {
529 : MountEntry *p;
530 0 : bool found = false;
531 :
532 : /* Now let's find the first parent of the entry we are looking at. */
533 0 : for (p = t-1; p >= m; p--) {
534 0 : if (path_startswith(mount_entry_path(f), mount_entry_path(p))) {
535 0 : found = true;
536 0 : break;
537 : }
538 : }
539 :
540 : /* We found it, let's see if it's the same mode, if so, we can drop this entry */
541 0 : if (found && p->mode == f->mode) {
542 0 : log_debug("%s (%s) is made redundant by %s (%s)",
543 : mount_entry_path(f), mount_mode_to_string(f->mode),
544 : mount_entry_path(p), mount_mode_to_string(p->mode));
545 0 : mount_entry_done(f);
546 0 : continue;
547 : }
548 : }
549 :
550 0 : *t = *f;
551 0 : t++;
552 : }
553 :
554 0 : *n = t - m;
555 0 : }
556 :
557 0 : static void drop_outside_root(const char *root_directory, MountEntry *m, size_t *n) {
558 : MountEntry *f, *t;
559 :
560 0 : assert(m);
561 0 : assert(n);
562 :
563 : /* Nothing to do */
564 0 : if (!root_directory)
565 0 : return;
566 :
567 : /* Drops all mounts that are outside of the root directory. */
568 :
569 0 : for (f = m, t = m; f < m + *n; f++) {
570 :
571 0 : if (!path_startswith(mount_entry_path(f), root_directory)) {
572 0 : log_debug("%s is outside of root directory.", mount_entry_path(f));
573 0 : mount_entry_done(f);
574 0 : continue;
575 : }
576 :
577 0 : *t = *f;
578 0 : t++;
579 : }
580 :
581 0 : *n = t - m;
582 : }
583 :
584 0 : static int clone_device_node(
585 : const char *d,
586 : const char *temporary_mount,
587 : bool *make_devnode) {
588 :
589 0 : _cleanup_free_ char *sl = NULL;
590 : const char *dn, *bn, *t;
591 : struct stat st;
592 : int r;
593 :
594 0 : if (stat(d, &st) < 0) {
595 0 : if (errno == ENOENT) {
596 0 : log_debug_errno(errno, "Device node '%s' to clone does not exist, ignoring.", d);
597 0 : return -ENXIO;
598 : }
599 :
600 0 : return log_debug_errno(errno, "Failed to stat() device node '%s' to clone, ignoring: %m", d);
601 : }
602 :
603 0 : if (!S_ISBLK(st.st_mode) &&
604 0 : !S_ISCHR(st.st_mode))
605 0 : return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
606 : "Device node '%s' to clone is not a device node, ignoring.",
607 : d);
608 :
609 0 : dn = strjoina(temporary_mount, d);
610 :
611 : /* First, try to create device node properly */
612 0 : if (*make_devnode) {
613 0 : mac_selinux_create_file_prepare(d, st.st_mode);
614 0 : r = mknod(dn, st.st_mode, st.st_rdev);
615 0 : mac_selinux_create_file_clear();
616 0 : if (r >= 0)
617 0 : goto add_symlink;
618 0 : if (errno != EPERM)
619 0 : return log_debug_errno(errno, "mknod failed for %s: %m", d);
620 :
621 : /* This didn't work, let's not try this again for the next iterations. */
622 0 : *make_devnode = false;
623 : }
624 :
625 : /* We're about to fallback to bind-mounting the device
626 : * node. So create a dummy bind-mount target. */
627 0 : mac_selinux_create_file_prepare(d, 0);
628 0 : r = mknod(dn, S_IFREG, 0);
629 0 : mac_selinux_create_file_clear();
630 0 : if (r < 0 && errno != EEXIST)
631 0 : return log_debug_errno(errno, "mknod() fallback failed for '%s': %m", d);
632 :
633 : /* Fallback to bind-mounting:
634 : * The assumption here is that all used device nodes carry standard
635 : * properties. Specifically, the devices nodes we bind-mount should
636 : * either be owned by root:root or root:tty (e.g. /dev/tty, /dev/ptmx)
637 : * and should not carry ACLs. */
638 0 : if (mount(d, dn, NULL, MS_BIND, NULL) < 0)
639 0 : return log_debug_errno(errno, "Bind mounting failed for '%s': %m", d);
640 :
641 0 : add_symlink:
642 0 : bn = path_startswith(d, "/dev/");
643 0 : if (!bn)
644 0 : return 0;
645 :
646 : /* Create symlinks like /dev/char/1:9 → ../urandom */
647 0 : if (asprintf(&sl, "%s/dev/%s/%u:%u", temporary_mount, S_ISCHR(st.st_mode) ? "char" : "block", major(st.st_rdev), minor(st.st_rdev)) < 0)
648 0 : return log_oom();
649 :
650 0 : (void) mkdir_parents(sl, 0755);
651 :
652 0 : t = strjoina("../", bn);
653 :
654 0 : if (symlink(t, sl) < 0)
655 0 : log_debug_errno(errno, "Failed to symlink '%s' to '%s', ignoring: %m", t, sl);
656 :
657 0 : return 0;
658 : }
659 :
660 0 : static int mount_private_dev(MountEntry *m) {
661 : static const char devnodes[] =
662 : "/dev/null\0"
663 : "/dev/zero\0"
664 : "/dev/full\0"
665 : "/dev/random\0"
666 : "/dev/urandom\0"
667 : "/dev/tty\0";
668 :
669 0 : char temporary_mount[] = "/tmp/namespace-dev-XXXXXX";
670 0 : const char *d, *dev = NULL, *devpts = NULL, *devshm = NULL, *devhugepages = NULL, *devmqueue = NULL, *devlog = NULL, *devptmx = NULL;
671 0 : bool can_mknod = true;
672 0 : _cleanup_umask_ mode_t u;
673 : int r;
674 :
675 0 : assert(m);
676 :
677 0 : u = umask(0000);
678 :
679 0 : if (!mkdtemp(temporary_mount))
680 0 : return log_debug_errno(errno, "Failed to create temporary directory '%s': %m", temporary_mount);
681 :
682 0 : dev = strjoina(temporary_mount, "/dev");
683 0 : (void) mkdir(dev, 0755);
684 0 : if (mount("tmpfs", dev, "tmpfs", DEV_MOUNT_OPTIONS, "mode=755") < 0) {
685 0 : r = log_debug_errno(errno, "Failed to mount tmpfs on '%s': %m", dev);
686 0 : goto fail;
687 : }
688 :
689 0 : devpts = strjoina(temporary_mount, "/dev/pts");
690 0 : (void) mkdir(devpts, 0755);
691 0 : if (mount("/dev/pts", devpts, NULL, MS_BIND, NULL) < 0) {
692 0 : r = log_debug_errno(errno, "Failed to bind mount /dev/pts on '%s': %m", devpts);
693 0 : goto fail;
694 : }
695 :
696 : /* /dev/ptmx can either be a device node or a symlink to /dev/pts/ptmx.
697 : * When /dev/ptmx a device node, /dev/pts/ptmx has 000 permissions making it inaccessible.
698 : * Thus, in that case make a clone.
699 : * In nspawn and other containers it will be a symlink, in that case make it a symlink. */
700 0 : r = is_symlink("/dev/ptmx");
701 0 : if (r < 0) {
702 0 : log_debug_errno(r, "Failed to detect whether /dev/ptmx is a symlink or not: %m");
703 0 : goto fail;
704 0 : } else if (r > 0) {
705 0 : devptmx = strjoina(temporary_mount, "/dev/ptmx");
706 0 : if (symlink("pts/ptmx", devptmx) < 0) {
707 0 : r = log_debug_errno(errno, "Failed to create a symlink '%s' to pts/ptmx: %m", devptmx);
708 0 : goto fail;
709 : }
710 : } else {
711 0 : r = clone_device_node("/dev/ptmx", temporary_mount, &can_mknod);
712 0 : if (r < 0)
713 0 : goto fail;
714 : }
715 :
716 0 : devshm = strjoina(temporary_mount, "/dev/shm");
717 0 : (void) mkdir(devshm, 0755);
718 0 : r = mount("/dev/shm", devshm, NULL, MS_BIND, NULL);
719 0 : if (r < 0) {
720 0 : r = log_debug_errno(errno, "Failed to bind mount /dev/shm on '%s': %m", devshm);
721 0 : goto fail;
722 : }
723 :
724 0 : devmqueue = strjoina(temporary_mount, "/dev/mqueue");
725 0 : (void) mkdir(devmqueue, 0755);
726 0 : if (mount("/dev/mqueue", devmqueue, NULL, MS_BIND, NULL) < 0)
727 0 : log_debug_errno(errno, "Failed to bind mount /dev/mqueue on '%s', ignoring: %m", devmqueue);
728 :
729 0 : devhugepages = strjoina(temporary_mount, "/dev/hugepages");
730 0 : (void) mkdir(devhugepages, 0755);
731 0 : if (mount("/dev/hugepages", devhugepages, NULL, MS_BIND, NULL) < 0)
732 0 : log_debug_errno(errno, "Failed to bind mount /dev/hugepages on '%s', ignoring: %m", devhugepages);
733 :
734 0 : devlog = strjoina(temporary_mount, "/dev/log");
735 0 : if (symlink("/run/systemd/journal/dev-log", devlog) < 0)
736 0 : log_debug_errno(errno, "Failed to create a symlink '%s' to /run/systemd/journal/dev-log, ignoring: %m", devlog);
737 :
738 0 : NULSTR_FOREACH(d, devnodes) {
739 0 : r = clone_device_node(d, temporary_mount, &can_mknod);
740 : /* ENXIO means the the *source* is not a device file, skip creation in that case */
741 0 : if (r < 0 && r != -ENXIO)
742 0 : goto fail;
743 : }
744 :
745 0 : r = dev_setup(temporary_mount, UID_INVALID, GID_INVALID);
746 0 : if (r < 0)
747 0 : log_debug_errno(r, "Failed to setup basic device tree at '%s', ignoring: %m", temporary_mount);
748 :
749 : /* Create the /dev directory if missing. It is more likely to be
750 : * missing when the service is started with RootDirectory. This is
751 : * consistent with mount units creating the mount points when missing.
752 : */
753 0 : (void) mkdir_p_label(mount_entry_path(m), 0755);
754 :
755 : /* Unmount everything in old /dev */
756 0 : r = umount_recursive(mount_entry_path(m), 0);
757 0 : if (r < 0)
758 0 : log_debug_errno(r, "Failed to unmount directories below '%s', ignoring: %m", mount_entry_path(m));
759 :
760 0 : if (mount(dev, mount_entry_path(m), NULL, MS_MOVE, NULL) < 0) {
761 0 : r = log_debug_errno(errno, "Failed to move mount point '%s' to '%s': %m", dev, mount_entry_path(m));
762 0 : goto fail;
763 : }
764 :
765 0 : (void) rmdir(dev);
766 0 : (void) rmdir(temporary_mount);
767 :
768 0 : return 0;
769 :
770 0 : fail:
771 0 : if (devpts)
772 0 : (void) umount(devpts);
773 :
774 0 : if (devshm)
775 0 : (void) umount(devshm);
776 :
777 0 : if (devhugepages)
778 0 : (void) umount(devhugepages);
779 :
780 0 : if (devmqueue)
781 0 : (void) umount(devmqueue);
782 :
783 0 : (void) umount(dev);
784 0 : (void) rmdir(dev);
785 0 : (void) rmdir(temporary_mount);
786 :
787 0 : return r;
788 : }
789 :
790 0 : static int mount_bind_dev(const MountEntry *m) {
791 : int r;
792 :
793 0 : assert(m);
794 :
795 : /* Implements the little brother of mount_private_dev(): simply bind mounts the host's /dev into the service's
796 : * /dev. This is only used when RootDirectory= is set. */
797 :
798 0 : (void) mkdir_p_label(mount_entry_path(m), 0755);
799 :
800 0 : r = path_is_mount_point(mount_entry_path(m), NULL, 0);
801 0 : if (r < 0)
802 0 : return log_debug_errno(r, "Unable to determine whether /dev is already mounted: %m");
803 0 : if (r > 0) /* make this a NOP if /dev is already a mount point */
804 0 : return 0;
805 :
806 0 : if (mount("/dev", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL) < 0)
807 0 : return log_debug_errno(errno, "Failed to bind mount %s: %m", mount_entry_path(m));
808 :
809 0 : return 1;
810 : }
811 :
812 0 : static int mount_sysfs(const MountEntry *m) {
813 : int r;
814 :
815 0 : assert(m);
816 :
817 0 : (void) mkdir_p_label(mount_entry_path(m), 0755);
818 :
819 0 : r = path_is_mount_point(mount_entry_path(m), NULL, 0);
820 0 : if (r < 0)
821 0 : return log_debug_errno(r, "Unable to determine whether /sys is already mounted: %m");
822 0 : if (r > 0) /* make this a NOP if /sys is already a mount point */
823 0 : return 0;
824 :
825 : /* Bind mount the host's version so that we get all child mounts of it, too. */
826 0 : if (mount("/sys", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL) < 0)
827 0 : return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m));
828 :
829 0 : return 1;
830 : }
831 :
832 0 : static int mount_procfs(const MountEntry *m) {
833 : int r;
834 :
835 0 : assert(m);
836 :
837 0 : (void) mkdir_p_label(mount_entry_path(m), 0755);
838 :
839 0 : r = path_is_mount_point(mount_entry_path(m), NULL, 0);
840 0 : if (r < 0)
841 0 : return log_debug_errno(r, "Unable to determine whether /proc is already mounted: %m");
842 0 : if (r > 0) /* make this a NOP if /proc is already a mount point */
843 0 : return 0;
844 :
845 : /* Mount a new instance, so that we get the one that matches our user namespace, if we are running in one */
846 0 : if (mount("proc", mount_entry_path(m), "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL) < 0)
847 0 : return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m));
848 :
849 0 : return 1;
850 : }
851 :
852 0 : static int mount_tmpfs(const MountEntry *m) {
853 0 : assert(m);
854 :
855 : /* First, get rid of everything that is below if there is anything. Then, overmount with our new tmpfs */
856 :
857 0 : (void) mkdir_p_label(mount_entry_path(m), 0755);
858 0 : (void) umount_recursive(mount_entry_path(m), 0);
859 :
860 0 : if (mount("tmpfs", mount_entry_path(m), "tmpfs", m->flags, mount_entry_options(m)) < 0)
861 0 : return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m));
862 :
863 0 : return 1;
864 : }
865 :
866 0 : static int follow_symlink(
867 : const char *root_directory,
868 : MountEntry *m) {
869 :
870 0 : _cleanup_free_ char *target = NULL;
871 : int r;
872 :
873 : /* Let's chase symlinks, but only one step at a time. That's because depending where the symlink points we
874 : * might need to change the order in which we mount stuff. Hence: let's normalize piecemeal, and do one step at
875 : * a time by specifying CHASE_STEP. This function returns 0 if we resolved one step, and > 0 if we reached the
876 : * end and already have a fully normalized name. */
877 :
878 0 : r = chase_symlinks(mount_entry_path(m), root_directory, CHASE_STEP|CHASE_NONEXISTENT, &target);
879 0 : if (r < 0)
880 0 : return log_debug_errno(r, "Failed to chase symlinks '%s': %m", mount_entry_path(m));
881 0 : if (r > 0) /* Reached the end, nothing more to resolve */
882 0 : return 1;
883 :
884 0 : if (m->n_followed >= CHASE_SYMLINKS_MAX) /* put a boundary on things */
885 0 : return log_debug_errno(SYNTHETIC_ERRNO(ELOOP),
886 : "Symlink loop on '%s'.",
887 : mount_entry_path(m));
888 :
889 0 : log_debug("Followed mount entry path symlink %s → %s.", mount_entry_path(m), target);
890 :
891 0 : free_and_replace(m->path_malloc, target);
892 0 : m->has_prefix = true;
893 :
894 0 : m->n_followed ++;
895 :
896 0 : return 0;
897 : }
898 :
899 0 : static int apply_mount(
900 : const char *root_directory,
901 : MountEntry *m) {
902 :
903 0 : bool rbind = true, make = false;
904 : const char *what;
905 : int r;
906 :
907 0 : assert(m);
908 :
909 0 : log_debug("Applying namespace mount on %s", mount_entry_path(m));
910 :
911 0 : switch (m->mode) {
912 :
913 0 : case INACCESSIBLE: {
914 : struct stat target;
915 :
916 : /* First, get rid of everything that is below if there
917 : * is anything... Then, overmount it with an
918 : * inaccessible path. */
919 0 : (void) umount_recursive(mount_entry_path(m), 0);
920 :
921 0 : if (lstat(mount_entry_path(m), &target) < 0) {
922 0 : if (errno == ENOENT && m->ignore)
923 0 : return 0;
924 :
925 0 : return log_debug_errno(errno, "Failed to lstat() %s to determine what to mount over it: %m", mount_entry_path(m));
926 : }
927 :
928 0 : what = mode_to_inaccessible_node(target.st_mode);
929 0 : if (!what)
930 0 : return log_debug_errno(SYNTHETIC_ERRNO(ELOOP),
931 : "File type not supported for inaccessible mounts. Note that symlinks are not allowed");
932 0 : break;
933 : }
934 :
935 0 : case READONLY:
936 : case READWRITE:
937 : case READWRITE_IMPLICIT:
938 0 : r = path_is_mount_point(mount_entry_path(m), root_directory, 0);
939 0 : if (r == -ENOENT && m->ignore)
940 0 : return 0;
941 0 : if (r < 0)
942 0 : return log_debug_errno(r, "Failed to determine whether %s is already a mount point: %m", mount_entry_path(m));
943 0 : if (r > 0) /* Nothing to do here, it is already a mount. We just later toggle the MS_RDONLY bit for the mount point if needed. */
944 0 : return 0;
945 : /* This isn't a mount point yet, let's make it one. */
946 0 : what = mount_entry_path(m);
947 0 : break;
948 :
949 0 : case BIND_MOUNT:
950 0 : rbind = false;
951 :
952 : _fallthrough_;
953 0 : case BIND_MOUNT_RECURSIVE: {
954 0 : _cleanup_free_ char *chased = NULL;
955 :
956 : /* Since mount() will always follow symlinks we chase the symlinks on our own first. Note that bind
957 : * mount source paths are always relative to the host root, hence we pass NULL as root directory to
958 : * chase_symlinks() here. */
959 :
960 0 : r = chase_symlinks(mount_entry_source(m), NULL, CHASE_TRAIL_SLASH, &chased);
961 0 : if (r == -ENOENT && m->ignore) {
962 0 : log_debug_errno(r, "Path %s does not exist, ignoring.", mount_entry_source(m));
963 0 : return 0;
964 : }
965 0 : if (r < 0)
966 0 : return log_debug_errno(r, "Failed to follow symlinks on %s: %m", mount_entry_source(m));
967 :
968 0 : log_debug("Followed source symlinks %s → %s.", mount_entry_source(m), chased);
969 :
970 0 : free_and_replace(m->source_malloc, chased);
971 :
972 0 : what = mount_entry_source(m);
973 0 : make = true;
974 0 : break;
975 : }
976 :
977 0 : case EMPTY_DIR:
978 : case TMPFS:
979 0 : return mount_tmpfs(m);
980 :
981 0 : case PRIVATE_TMP:
982 0 : what = mount_entry_source(m);
983 0 : make = true;
984 0 : break;
985 :
986 0 : case PRIVATE_DEV:
987 0 : return mount_private_dev(m);
988 :
989 0 : case BIND_DEV:
990 0 : return mount_bind_dev(m);
991 :
992 0 : case SYSFS:
993 0 : return mount_sysfs(m);
994 :
995 0 : case PROCFS:
996 0 : return mount_procfs(m);
997 :
998 0 : default:
999 0 : assert_not_reached("Unknown mode");
1000 : }
1001 :
1002 0 : assert(what);
1003 :
1004 0 : if (mount(what, mount_entry_path(m), NULL, MS_BIND|(rbind ? MS_REC : 0), NULL) < 0) {
1005 0 : bool try_again = false;
1006 0 : r = -errno;
1007 :
1008 0 : if (r == -ENOENT && make) {
1009 : struct stat st;
1010 :
1011 : /* Hmm, either the source or the destination are missing. Let's see if we can create the destination, then try again */
1012 :
1013 0 : if (stat(what, &st) < 0)
1014 0 : log_debug_errno(errno, "Mount point source '%s' is not accessible: %m", what);
1015 : else {
1016 : int q;
1017 :
1018 0 : (void) mkdir_parents(mount_entry_path(m), 0755);
1019 :
1020 0 : if (S_ISDIR(st.st_mode))
1021 0 : q = mkdir(mount_entry_path(m), 0755) < 0 ? -errno : 0;
1022 : else
1023 0 : q = touch(mount_entry_path(m));
1024 :
1025 0 : if (q < 0)
1026 0 : log_debug_errno(q, "Failed to create destination mount point node '%s': %m", mount_entry_path(m));
1027 : else
1028 0 : try_again = true;
1029 : }
1030 : }
1031 :
1032 0 : if (try_again) {
1033 0 : if (mount(what, mount_entry_path(m), NULL, MS_BIND|(rbind ? MS_REC : 0), NULL) < 0)
1034 0 : r = -errno;
1035 : else
1036 0 : r = 0;
1037 : }
1038 :
1039 0 : if (r < 0)
1040 0 : return log_debug_errno(r, "Failed to mount %s to %s: %m", what, mount_entry_path(m));
1041 : }
1042 :
1043 0 : log_debug("Successfully mounted %s to %s", what, mount_entry_path(m));
1044 0 : return 0;
1045 : }
1046 :
1047 : /* Change per-mount flags on an existing mount */
1048 0 : static int bind_remount_one(const char *path, unsigned long orig_flags, unsigned long new_flags, unsigned long flags_mask) {
1049 0 : if (mount(NULL, path, NULL, (orig_flags & ~flags_mask) | MS_REMOUNT | MS_BIND | new_flags, NULL) < 0)
1050 0 : return -errno;
1051 :
1052 0 : return 0;
1053 : }
1054 :
1055 0 : static int make_read_only(const MountEntry *m, char **blacklist, FILE *proc_self_mountinfo) {
1056 0 : unsigned long new_flags = 0, flags_mask = 0;
1057 0 : bool submounts = false;
1058 0 : int r = 0;
1059 :
1060 0 : assert(m);
1061 0 : assert(proc_self_mountinfo);
1062 :
1063 0 : if (mount_entry_read_only(m) || m->mode == PRIVATE_DEV) {
1064 0 : new_flags |= MS_RDONLY;
1065 0 : flags_mask |= MS_RDONLY;
1066 : }
1067 :
1068 0 : if (m->nosuid) {
1069 0 : new_flags |= MS_NOSUID;
1070 0 : flags_mask |= MS_NOSUID;
1071 : }
1072 :
1073 0 : if (flags_mask == 0) /* No Change? */
1074 0 : return 0;
1075 :
1076 : /* We generally apply these changes recursively, except for /dev, and the cases we know there's
1077 : * nothing further down. Set /dev readonly, but not submounts like /dev/shm. Also, we only set the
1078 : * per-mount read-only flag. We can't set it on the superblock, if we are inside a user namespace
1079 : * and running Linux <= 4.17. */
1080 0 : submounts =
1081 0 : mount_entry_read_only(m) &&
1082 0 : !IN_SET(m->mode, EMPTY_DIR, TMPFS);
1083 0 : if (submounts)
1084 0 : r = bind_remount_recursive_with_mountinfo(mount_entry_path(m), new_flags, flags_mask, blacklist, proc_self_mountinfo);
1085 : else
1086 0 : r = bind_remount_one(mount_entry_path(m), m->flags, new_flags, flags_mask);
1087 :
1088 : /* Not that we only turn on the MS_RDONLY flag here, we never turn it off. Something that was marked
1089 : * read-only already stays this way. This improves compatibility with container managers, where we
1090 : * won't attempt to undo read-only mounts already applied. */
1091 :
1092 0 : if (r == -ENOENT && m->ignore)
1093 0 : return 0;
1094 0 : if (r < 0)
1095 0 : return log_debug_errno(r, "Failed to re-mount '%s'%s: %m", mount_entry_path(m),
1096 : submounts ? " and its submounts" : "");
1097 0 : return 0;
1098 : }
1099 :
1100 0 : static bool namespace_info_mount_apivfs(const NamespaceInfo *ns_info) {
1101 0 : assert(ns_info);
1102 :
1103 : /*
1104 : * ProtectControlGroups= and ProtectKernelTunables= imply MountAPIVFS=,
1105 : * since to protect the API VFS mounts, they need to be around in the
1106 : * first place...
1107 : */
1108 :
1109 0 : return ns_info->mount_apivfs ||
1110 0 : ns_info->protect_control_groups ||
1111 : ns_info->protect_kernel_tunables;
1112 : }
1113 :
1114 0 : static size_t namespace_calculate_mounts(
1115 : const NamespaceInfo *ns_info,
1116 : char** read_write_paths,
1117 : char** read_only_paths,
1118 : char** inaccessible_paths,
1119 : char** empty_directories,
1120 : size_t n_bind_mounts,
1121 : size_t n_temporary_filesystems,
1122 : const char* tmp_dir,
1123 : const char* var_tmp_dir,
1124 : ProtectHome protect_home,
1125 : ProtectSystem protect_system) {
1126 :
1127 : size_t protect_home_cnt;
1128 0 : size_t protect_system_cnt =
1129 : (protect_system == PROTECT_SYSTEM_STRICT ?
1130 0 : ELEMENTSOF(protect_system_strict_table) :
1131 : ((protect_system == PROTECT_SYSTEM_FULL) ?
1132 0 : ELEMENTSOF(protect_system_full_table) :
1133 : ((protect_system == PROTECT_SYSTEM_YES) ?
1134 0 : ELEMENTSOF(protect_system_yes_table) : 0)));
1135 :
1136 0 : protect_home_cnt =
1137 : (protect_home == PROTECT_HOME_YES ?
1138 0 : ELEMENTSOF(protect_home_yes_table) :
1139 : ((protect_home == PROTECT_HOME_READ_ONLY) ?
1140 0 : ELEMENTSOF(protect_home_read_only_table) :
1141 : ((protect_home == PROTECT_HOME_TMPFS) ?
1142 0 : ELEMENTSOF(protect_home_tmpfs_table) : 0)));
1143 :
1144 0 : return !!tmp_dir + !!var_tmp_dir +
1145 0 : strv_length(read_write_paths) +
1146 0 : strv_length(read_only_paths) +
1147 0 : strv_length(inaccessible_paths) +
1148 0 : strv_length(empty_directories) +
1149 0 : n_bind_mounts +
1150 0 : n_temporary_filesystems +
1151 0 : ns_info->private_dev +
1152 0 : (ns_info->protect_kernel_tunables ? ELEMENTSOF(protect_kernel_tunables_table) : 0) +
1153 0 : (ns_info->protect_control_groups ? 1 : 0) +
1154 0 : (ns_info->protect_kernel_modules ? ELEMENTSOF(protect_kernel_modules_table) : 0) +
1155 0 : protect_home_cnt + protect_system_cnt +
1156 0 : (ns_info->protect_hostname ? 2 : 0) +
1157 0 : (namespace_info_mount_apivfs(ns_info) ? ELEMENTSOF(apivfs_table) : 0);
1158 : }
1159 :
1160 0 : static void normalize_mounts(const char *root_directory, MountEntry *mounts, size_t *n_mounts) {
1161 0 : assert(root_directory);
1162 0 : assert(n_mounts);
1163 0 : assert(mounts || *n_mounts == 0);
1164 :
1165 0 : typesafe_qsort(mounts, *n_mounts, mount_path_compare);
1166 :
1167 0 : drop_duplicates(mounts, n_mounts);
1168 0 : drop_outside_root(root_directory, mounts, n_mounts);
1169 0 : drop_inaccessible(mounts, n_mounts);
1170 0 : drop_nop(mounts, n_mounts);
1171 0 : }
1172 :
/* setup_namespace(): builds and enters a private mount namespace for the calling process.
 *
 * Steps, in the order they appear below:
 *   1. If root_image is set: attach it to a loop device, load its root hash, dissect and decrypt it.
 *   2. Pick the directory the new root is assembled in: root_directory if given, otherwise
 *      "/run/systemd/unit-root".
 *   3. Build the MountEntry array from all parameters, prefix the entries with the root, normalize.
 *   4. unshare(CLONE_NEWNS), mark "/" MS_SLAVE|MS_REC, then mount the image, or turn the root
 *      directory into a bind mount, or bind mount "/" onto the temporary root.
 *   5. Apply all mount entries, then run make_read_only() over them.
 *   6. mount_move_root(), then remount "/" with mount_flags|MS_REC (MS_SHARED if mount_flags is 0).
 *
 * Returns 0 on success and a negative errno-style value on failure. -ENOANO is returned when
 * unshare() fails with EACCES, EPERM, EOPNOTSUPP or ENOSYS, so that callers can detect this case.
 *
 * error_path is optional. When a failure can be attributed to a path, a strdup()ed copy of that path
 * is stored in it; the strdup() result is not checked, so it may end up NULL.
 * NOTE(review): presumably the caller owns and frees the string, confirm against callers. */
1173 0 : int setup_namespace(
1174 : const char* root_directory,
1175 : const char* root_image,
1176 : const NamespaceInfo *ns_info,
1177 : char** read_write_paths,
1178 : char** read_only_paths,
1179 : char** inaccessible_paths,
1180 : char** empty_directories,
1181 : const BindMount *bind_mounts,
1182 : size_t n_bind_mounts,
1183 : const TemporaryFileSystem *temporary_filesystems,
1184 : size_t n_temporary_filesystems,
1185 : const char* tmp_dir,
1186 : const char* var_tmp_dir,
1187 : ProtectHome protect_home,
1188 : ProtectSystem protect_system,
1189 : unsigned long mount_flags,
1190 : DissectImageFlags dissect_image_flags,
1191 : char **error_path) {
1192 :
1193 0 : _cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL;
1194 0 : _cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL;
1195 0 : _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
1196 0 : _cleanup_free_ void *root_hash = NULL;
1197 0 : MountEntry *m = NULL, *mounts = NULL;
1198 0 : size_t n_mounts, root_hash_size = 0;
1199 0 : bool require_prefix = false;
1200 : const char *root;
1201 0 : int r = 0;
1202 :
1203 0 : assert(ns_info);
1204 :
/* No propagation mode requested? Then default to MS_SHARED for the final remount of "/". */
1205 0 : if (mount_flags == 0)
1206 0 : mount_flags = MS_SHARED;
1207 :
/* Everything in this block happens before the namespace is unshared, hence failures may simply
 * return; the cleanup attributes above release what was acquired. */
1208 0 : if (root_image) {
1209 0 : dissect_image_flags |= DISSECT_IMAGE_REQUIRE_ROOT;
1210 :
/* Open the image read-only only if strict system protection is on, home protection is on in
 * some form, and no read-write paths are configured. */
1211 0 : if (protect_system == PROTECT_SYSTEM_STRICT &&
1212 0 : protect_home != PROTECT_HOME_NO &&
1213 0 : strv_isempty(read_write_paths))
1214 0 : dissect_image_flags |= DISSECT_IMAGE_READ_ONLY;
1215 :
1216 0 : r = loop_device_make_by_path(root_image,
1217 0 : dissect_image_flags & DISSECT_IMAGE_READ_ONLY ? O_RDONLY : O_RDWR,
1218 : &loop_device);
1219 0 : if (r < 0)
1220 0 : return log_debug_errno(r, "Failed to create loop device for root image: %m");
1221 :
1222 0 : r = root_hash_load(root_image, &root_hash, &root_hash_size);
1223 0 : if (r < 0)
1224 0 : return log_debug_errno(r, "Failed to load root hash: %m");
1225 :
1226 0 : r = dissect_image(loop_device->fd, root_hash, root_hash_size, dissect_image_flags, &dissected_image);
1227 0 : if (r < 0)
1228 0 : return log_debug_errno(r, "Failed to dissect image: %m");
1229 :
1230 0 : r = dissected_image_decrypt(dissected_image, NULL, root_hash, root_hash_size, dissect_image_flags, &decrypted_image);
1231 0 : if (r < 0)
1232 0 : return log_debug_errno(r, "Failed to decrypt dissected image: %m");
1233 : }
1234 :
1235 0 : if (root_directory)
1236 0 : root = root_directory;
1237 : else {
1238 : /* Always create the mount namespace in a temporary directory, instead of operating
1239 : * directly in the root. The temporary directory prevents any mounts from being
1240 : * potentially obscured by other mounts we already applied.
1241 : * We use the same mount point for all images, which is safe, since they all live
1242 : * in their own namespaces after all, and hence won't see each other. */
1243 :
1244 0 : root = "/run/systemd/unit-root";
1245 0 : (void) mkdir_label(root, 0700);
1246 0 : require_prefix = true;
1247 : }
1248 :
/* The array below is sized by this count; the assert() after the append calls verifies that
 * exactly this many entries were filled in. */
1249 0 : n_mounts = namespace_calculate_mounts(
1250 : ns_info,
1251 : read_write_paths,
1252 : read_only_paths,
1253 : inaccessible_paths,
1254 : empty_directories,
1255 : n_bind_mounts,
1256 : n_temporary_filesystems,
1257 : tmp_dir, var_tmp_dir,
1258 : protect_home, protect_system);
1259 :
1260 0 : if (n_mounts > 0) {
1261 0 : m = mounts = new0(MountEntry, n_mounts);
1262 0 : if (!mounts)
1263 0 : return -ENOMEM;
1264 :
/* From here on 'mounts' is owned by this function: all failures must go through 'finish'. */
1265 0 : r = append_access_mounts(&m, read_write_paths, READWRITE, require_prefix);
1266 0 : if (r < 0)
1267 0 : goto finish;
1268 :
1269 0 : r = append_access_mounts(&m, read_only_paths, READONLY, require_prefix);
1270 0 : if (r < 0)
1271 0 : goto finish;
1272 :
1273 0 : r = append_access_mounts(&m, inaccessible_paths, INACCESSIBLE, require_prefix);
1274 0 : if (r < 0)
1275 0 : goto finish;
1276 :
1277 0 : r = append_empty_dir_mounts(&m, empty_directories);
1278 0 : if (r < 0)
1279 0 : goto finish;
1280 :
1281 0 : r = append_bind_mounts(&m, bind_mounts, n_bind_mounts);
1282 0 : if (r < 0)
1283 0 : goto finish;
1284 :
1285 0 : r = append_tmpfs_mounts(&m, temporary_filesystems, n_temporary_filesystems);
1286 0 : if (r < 0)
1287 0 : goto finish;
1288 :
1289 0 : if (tmp_dir) {
1290 0 : *(m++) = (MountEntry) {
1291 : .path_const = "/tmp",
1292 : .mode = PRIVATE_TMP,
1293 : .source_const = tmp_dir,
1294 : };
1295 : }
1296 :
1297 0 : if (var_tmp_dir) {
1298 0 : *(m++) = (MountEntry) {
1299 : .path_const = "/var/tmp",
1300 : .mode = PRIVATE_TMP,
1301 : .source_const = var_tmp_dir,
1302 : };
1303 : }
1304 :
1305 0 : if (ns_info->private_dev) {
1306 0 : *(m++) = (MountEntry) {
1307 : .path_const = "/dev",
1308 : .mode = PRIVATE_DEV,
1309 : .flags = DEV_MOUNT_OPTIONS,
1310 : };
1311 : }
1312 :
1313 0 : if (ns_info->protect_kernel_tunables) {
1314 0 : r = append_static_mounts(&m, protect_kernel_tunables_table, ELEMENTSOF(protect_kernel_tunables_table), ns_info->ignore_protect_paths);
1315 0 : if (r < 0)
1316 0 : goto finish;
1317 : }
1318 :
1319 0 : if (ns_info->protect_kernel_modules) {
1320 0 : r = append_static_mounts(&m, protect_kernel_modules_table, ELEMENTSOF(protect_kernel_modules_table), ns_info->ignore_protect_paths);
1321 0 : if (r < 0)
1322 0 : goto finish;
1323 : }
1324 :
1325 0 : if (ns_info->protect_control_groups) {
1326 0 : *(m++) = (MountEntry) {
1327 : .path_const = "/sys/fs/cgroup",
1328 : .mode = READONLY,
1329 : };
1330 : }
1331 :
1332 0 : r = append_protect_home(&m, protect_home, ns_info->ignore_protect_paths);
1333 0 : if (r < 0)
1334 0 : goto finish;
1335 :
1336 0 : r = append_protect_system(&m, protect_system, false);
1337 0 : if (r < 0)
1338 0 : goto finish;
1339 :
1340 0 : if (namespace_info_mount_apivfs(ns_info)) {
1341 0 : r = append_static_mounts(&m, apivfs_table, ELEMENTSOF(apivfs_table), ns_info->ignore_protect_paths);
1342 0 : if (r < 0)
1343 0 : goto finish;
1344 : }
1345 :
1346 0 : if (ns_info->protect_hostname) {
1347 0 : *(m++) = (MountEntry) {
1348 : .path_const = "/proc/sys/kernel/hostname",
1349 : .mode = READONLY,
1350 : };
1351 0 : *(m++) = (MountEntry) {
1352 : .path_const = "/proc/sys/kernel/domainname",
1353 : .mode = READONLY,
1354 : };
1355 : }
1356 :
/* Cross-check against namespace_calculate_mounts(): every slot must have been filled. */
1357 0 : assert(mounts + n_mounts == m);
1358 :
1359 : /* Prepend the root directory where that's necessary */
1360 0 : r = prefix_where_needed(mounts, n_mounts, root);
1361 0 : if (r < 0)
1362 0 : goto finish;
1363 :
1364 0 : normalize_mounts(root, mounts, &n_mounts);
1365 : }
1366 :
1367 : /* All above is just preparation, figuring out what to do. Let's now actually start doing something. */
1368 :
1369 0 : if (unshare(CLONE_NEWNS) < 0) {
1370 0 : r = log_debug_errno(errno, "Failed to unshare the mount namespace: %m");
1371 0 : if (IN_SET(r, -EACCES, -EPERM, -EOPNOTSUPP, -ENOSYS))
1372 : /* If the kernel doesn't support namespaces, or when there's a MAC or seccomp filter in place
1373 : * that doesn't allow us to create namespaces (or a missing cap), then propagate a recognizable
1374 : * error back, which the caller can use to detect this case (and only this) and optionally
1375 : * continue without namespacing applied. */
1376 0 : r = -ENOANO;
1377 :
1378 0 : goto finish;
1379 : }
1380 :
1381 : /* Remount / as SLAVE so that nothing now mounted in the namespace
1382 : * shows up in the parent */
1383 0 : if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1384 0 : r = log_debug_errno(errno, "Failed to remount '/' as SLAVE: %m");
1385 0 : goto finish;
1386 : }
1387 :
1388 0 : if (root_image) {
1389 : /* A root image is specified, mount it to the right place */
1390 0 : r = dissected_image_mount(dissected_image, root, UID_INVALID, dissect_image_flags);
1391 0 : if (r < 0) {
1392 0 : log_debug_errno(r, "Failed to mount root image: %m");
1393 0 : goto finish;
1394 : }
1395 :
1396 0 : if (decrypted_image) {
1397 0 : r = decrypted_image_relinquish(decrypted_image);
1398 0 : if (r < 0) {
1399 0 : log_debug_errno(r, "Failed to relinquish decrypted image: %m");
1400 0 : goto finish;
1401 : }
1402 : }
1403 :
1404 0 : loop_device_relinquish(loop_device);
1405 :
1406 0 : } else if (root_directory) {
1407 :
1408 : /* A root directory is specified. Turn its directory into bind mount, if it isn't one yet. */
1409 0 : r = path_is_mount_point(root, NULL, AT_SYMLINK_FOLLOW);
1410 0 : if (r < 0) {
1411 0 : log_debug_errno(r, "Failed to detect that %s is a mount point or not: %m", root);
1412 0 : goto finish;
1413 : }
1414 0 : if (r == 0) {
1415 0 : if (mount(root, root, NULL, MS_BIND|MS_REC, NULL) < 0) {
1416 0 : r = log_debug_errno(errno, "Failed to bind mount '%s': %m", root);
1417 0 : goto finish;
1418 : }
1419 : }
1420 :
1421 : } else {
1422 :
1423 : /* Let's mount the main root directory to the root directory to use */
1424 0 : if (mount("/", root, NULL, MS_BIND|MS_REC, NULL) < 0) {
1425 0 : r = log_debug_errno(errno, "Failed to bind mount '/' on '%s': %m", root);
1426 0 : goto finish;
1427 : }
1428 : }
1429 :
1430 : /* Try to set up the new root directory before mounting anything else there. */
1431 0 : if (root_image || root_directory)
1432 0 : (void) base_filesystem_create(root, UID_INVALID, GID_INVALID);
1433 :
1434 0 : if (n_mounts > 0) {
1435 0 : _cleanup_fclose_ FILE *proc_self_mountinfo = NULL;
1436 0 : _cleanup_free_ char **blacklist = NULL;
1437 : size_t j;
1438 :
1439 : /* Open /proc/self/mountinfo now as it may become unavailable if we mount anything on top of /proc.
1440 : * For example, this is the case with the option: 'InaccessiblePaths=/proc' */
1441 0 : proc_self_mountinfo = fopen("/proc/self/mountinfo", "re");
1442 0 : if (!proc_self_mountinfo) {
1443 0 : r = log_debug_errno(errno, "Failed to open /proc/self/mountinfo: %m");
1444 0 : if (error_path)
1445 0 : *error_path = strdup("/proc/self/mountinfo");
1446 0 : goto finish;
1447 : }
1448 :
1449 : /* First round, establish all mounts we need */
1450 0 : for (;;) {
1451 0 : bool again = false;
1452 :
1453 0 : for (m = mounts; m < mounts + n_mounts; ++m) {
1454 :
/* Entries handled in an earlier iteration of the outer loop are not applied twice. */
1455 0 : if (m->applied)
1456 0 : continue;
1457 :
1458 0 : r = follow_symlink(root, m);
1459 0 : if (r < 0) {
1460 0 : if (error_path && mount_entry_path(m))
1461 0 : *error_path = strdup(mount_entry_path(m));
1462 0 : goto finish;
1463 : }
1464 0 : if (r == 0) {
1465 : /* We hit a symlinked mount point. The entry got rewritten and might point to a
1466 : * very different place now. Let's normalize the changed list, and start from
1467 : * the beginning. After all to mount the entry at the new location we might
1468 : * need some other mounts first */
1469 0 : again = true;
1470 0 : break;
1471 : }
1472 :
1473 0 : r = apply_mount(root, m);
1474 0 : if (r < 0) {
1475 0 : if (error_path && mount_entry_path(m))
1476 0 : *error_path = strdup(mount_entry_path(m));
1477 0 : goto finish;
1478 : }
1479 :
1480 0 : m->applied = true;
1481 : }
1482 :
1483 0 : if (!again)
1484 0 : break;
1485 :
1486 0 : normalize_mounts(root, mounts, &n_mounts);
1487 : }
1488 :
1489 : /* Create a blacklist we can pass to make_read_only(), which hands it on to
 * bind_remount_recursive_with_mountinfo(). The array borrows the entries' path strings, hence
 * only the array itself is freed. */
1490 0 : blacklist = new(char*, n_mounts+1);
1491 0 : if (!blacklist) {
1492 0 : r = -ENOMEM;
1493 0 : goto finish;
1494 : }
1495 0 : for (j = 0; j < n_mounts; j++)
1496 0 : blacklist[j] = (char*) mount_entry_path(mounts+j);
1497 0 : blacklist[j] = NULL;
1498 :
1499 : /* Second round, flip the ro bits if necessary. */
1500 0 : for (m = mounts; m < mounts + n_mounts; ++m) {
1501 0 : r = make_read_only(m, blacklist, proc_self_mountinfo);
1502 0 : if (r < 0) {
1503 0 : if (error_path && mount_entry_path(m))
1504 0 : *error_path = strdup(mount_entry_path(m));
1505 0 : goto finish;
1506 : }
1507 : }
1508 : }
1509 :
1510 : /* MS_MOVE does not work on MS_SHARED so the remount MS_SHARED will be done later */
1511 0 : r = mount_move_root(root);
1512 0 : if (r < 0) {
1513 0 : log_debug_errno(r, "Failed to mount root with MS_MOVE: %m");
1514 0 : goto finish;
1515 : }
1516 :
1517 : /* Remount / as the desired mode. Note that this will not
1518 : * reestablish propagation from our side to the host, since
1519 : * what's disconnected is disconnected. */
1520 0 : if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0) {
1521 0 : r = log_debug_errno(errno, "Failed to remount '/' with desired mount flags: %m");
1522 0 : goto finish;
1523 : }
1524 :
1525 0 : r = 0;
1526 :
/* Common exit for success and failure once 'mounts' may have been allocated: release the
 * entries that remain in the array, then the array itself. */
1527 0 : finish:
1528 0 : for (m = mounts; m < mounts + n_mounts; m++)
1529 0 : mount_entry_done(m);
1530 :
1531 0 : free(mounts);
1532 :
1533 0 : return r;
1534 : }
1535 :
1536 559 : void bind_mount_free_many(BindMount *b, size_t n) {
1537 : size_t i;
1538 :
1539 559 : assert(b || n == 0);
1540 :
1541 559 : for (i = 0; i < n; i++) {
1542 0 : free(b[i].source);
1543 0 : free(b[i].destination);
1544 : }
1545 :
1546 559 : free(b);
1547 559 : }
1548 :
1549 0 : int bind_mount_add(BindMount **b, size_t *n, const BindMount *item) {
1550 0 : _cleanup_free_ char *s = NULL, *d = NULL;
1551 : BindMount *c;
1552 :
1553 0 : assert(b);
1554 0 : assert(n);
1555 0 : assert(item);
1556 :
1557 0 : s = strdup(item->source);
1558 0 : if (!s)
1559 0 : return -ENOMEM;
1560 :
1561 0 : d = strdup(item->destination);
1562 0 : if (!d)
1563 0 : return -ENOMEM;
1564 :
1565 0 : c = reallocarray(*b, *n + 1, sizeof(BindMount));
1566 0 : if (!c)
1567 0 : return -ENOMEM;
1568 :
1569 0 : *b = c;
1570 :
1571 0 : c[(*n) ++] = (BindMount) {
1572 0 : .source = TAKE_PTR(s),
1573 0 : .destination = TAKE_PTR(d),
1574 0 : .read_only = item->read_only,
1575 0 : .nosuid = item->nosuid,
1576 0 : .recursive = item->recursive,
1577 0 : .ignore_enoent = item->ignore_enoent,
1578 : };
1579 :
1580 0 : return 0;
1581 : }
1582 :
1583 559 : void temporary_filesystem_free_many(TemporaryFileSystem *t, size_t n) {
1584 : size_t i;
1585 :
1586 559 : assert(t || n == 0);
1587 :
1588 559 : for (i = 0; i < n; i++) {
1589 0 : free(t[i].path);
1590 0 : free(t[i].options);
1591 : }
1592 :
1593 559 : free(t);
1594 559 : }
1595 :
1596 0 : int temporary_filesystem_add(
1597 : TemporaryFileSystem **t,
1598 : size_t *n,
1599 : const char *path,
1600 : const char *options) {
1601 :
1602 0 : _cleanup_free_ char *p = NULL, *o = NULL;
1603 : TemporaryFileSystem *c;
1604 :
1605 0 : assert(t);
1606 0 : assert(n);
1607 0 : assert(path);
1608 :
1609 0 : p = strdup(path);
1610 0 : if (!p)
1611 0 : return -ENOMEM;
1612 :
1613 0 : if (!isempty(options)) {
1614 0 : o = strdup(options);
1615 0 : if (!o)
1616 0 : return -ENOMEM;
1617 : }
1618 :
1619 0 : c = reallocarray(*t, *n + 1, sizeof(TemporaryFileSystem));
1620 0 : if (!c)
1621 0 : return -ENOMEM;
1622 :
1623 0 : *t = c;
1624 :
1625 0 : c[(*n) ++] = (TemporaryFileSystem) {
1626 0 : .path = TAKE_PTR(p),
1627 0 : .options = TAKE_PTR(o),
1628 : };
1629 :
1630 0 : return 0;
1631 : }
1632 :
1633 0 : static int setup_one_tmp_dir(const char *id, const char *prefix, char **path) {
1634 0 : _cleanup_free_ char *x = NULL;
1635 : char bid[SD_ID128_STRING_MAX];
1636 : sd_id128_t boot_id;
1637 : int r;
1638 :
1639 0 : assert(id);
1640 0 : assert(prefix);
1641 0 : assert(path);
1642 :
1643 : /* We include the boot id in the directory so that after a
1644 : * reboot we can easily identify obsolete directories. */
1645 :
1646 0 : r = sd_id128_get_boot(&boot_id);
1647 0 : if (r < 0)
1648 0 : return r;
1649 :
1650 0 : x = strjoin(prefix, "/systemd-private-", sd_id128_to_string(boot_id, bid), "-", id, "-XXXXXX");
1651 0 : if (!x)
1652 0 : return -ENOMEM;
1653 :
1654 0 : RUN_WITH_UMASK(0077)
1655 0 : if (!mkdtemp(x))
1656 0 : return -errno;
1657 :
1658 0 : RUN_WITH_UMASK(0000) {
1659 : char *y;
1660 :
1661 0 : y = strjoina(x, "/tmp");
1662 :
1663 0 : if (mkdir(y, 0777 | S_ISVTX) < 0)
1664 0 : return -errno;
1665 : }
1666 :
1667 0 : *path = TAKE_PTR(x);
1668 :
1669 0 : return 0;
1670 : }
1671 :
/* Creates the private temporary directories for 'id', one below /tmp and one below /var/tmp. On
 * success 0 is returned and the two newly allocated paths are stored in '*tmp_dir' and
 * '*var_tmp_dir'. If the second directory cannot be created, the first one is removed again and
 * the error is returned; the output parameters are left untouched in that case. */
int setup_tmp_dirs(const char *id, char **tmp_dir, char **var_tmp_dir) {
        char *private_tmp, *private_var_tmp, *inner;
        int r;

        assert(id);
        assert(tmp_dir);
        assert(var_tmp_dir);

        r = setup_one_tmp_dir(id, "/tmp", &private_tmp);
        if (r < 0)
                return r;

        r = setup_one_tmp_dir(id, "/var/tmp", &private_var_tmp);
        if (r >= 0) {
                *tmp_dir = private_tmp;
                *var_tmp_dir = private_var_tmp;
                return 0;
        }

        /* Roll back the first directory: the inner "tmp" has to go before the outer one. */
        inner = strjoina(private_tmp, "/tmp");
        (void) rmdir(inner);
        (void) rmdir(private_tmp);

        free(private_tmp);
        return r;
}
1701 :
1702 0 : int setup_netns(const int netns_storage_socket[static 2]) {
1703 0 : _cleanup_close_ int netns = -1;
1704 : int r, q;
1705 :
1706 0 : assert(netns_storage_socket);
1707 0 : assert(netns_storage_socket[0] >= 0);
1708 0 : assert(netns_storage_socket[1] >= 0);
1709 :
1710 : /* We use the passed socketpair as a storage buffer for our
1711 : * namespace reference fd. Whatever process runs this first
1712 : * shall create a new namespace, all others should just join
1713 : * it. To serialize that we use a file lock on the socket
1714 : * pair.
1715 : *
1716 : * It's a bit crazy, but hey, works great! */
1717 :
1718 0 : if (lockf(netns_storage_socket[0], F_LOCK, 0) < 0)
1719 0 : return -errno;
1720 :
1721 0 : netns = receive_one_fd(netns_storage_socket[0], MSG_DONTWAIT);
1722 0 : if (netns == -EAGAIN) {
1723 : /* Nothing stored yet, so let's create a new namespace. */
1724 :
1725 0 : if (unshare(CLONE_NEWNET) < 0) {
1726 0 : r = -errno;
1727 0 : goto fail;
1728 : }
1729 :
1730 0 : (void) loopback_setup();
1731 :
1732 0 : netns = open("/proc/self/ns/net", O_RDONLY|O_CLOEXEC|O_NOCTTY);
1733 0 : if (netns < 0) {
1734 0 : r = -errno;
1735 0 : goto fail;
1736 : }
1737 :
1738 0 : r = 1;
1739 :
1740 0 : } else if (netns < 0) {
1741 0 : r = netns;
1742 0 : goto fail;
1743 :
1744 : } else {
1745 : /* Yay, found something, so let's join the namespace */
1746 0 : if (setns(netns, CLONE_NEWNET) < 0) {
1747 0 : r = -errno;
1748 0 : goto fail;
1749 : }
1750 :
1751 0 : r = 0;
1752 : }
1753 :
1754 0 : q = send_one_fd(netns_storage_socket[1], netns, MSG_DONTWAIT);
1755 0 : if (q < 0) {
1756 0 : r = q;
1757 0 : goto fail;
1758 : }
1759 :
1760 0 : fail:
1761 0 : (void) lockf(netns_storage_socket[0], F_ULOCK, 0);
1762 0 : return r;
1763 : }
1764 :
1765 0 : int open_netns_path(const int netns_storage_socket[static 2], const char *path) {
1766 0 : _cleanup_close_ int netns = -1;
1767 : int q, r;
1768 :
1769 0 : assert(netns_storage_socket);
1770 0 : assert(netns_storage_socket[0] >= 0);
1771 0 : assert(netns_storage_socket[1] >= 0);
1772 0 : assert(path);
1773 :
1774 : /* If the storage socket doesn't contain a netns fd yet, open one via the file system and store it in
1775 : * it. This is supposed to be called ahead of time, i.e. before setup_netns() which will allocate a
1776 : * new anonymous netns if needed. */
1777 :
1778 0 : if (lockf(netns_storage_socket[0], F_LOCK, 0) < 0)
1779 0 : return -errno;
1780 :
1781 0 : netns = receive_one_fd(netns_storage_socket[0], MSG_DONTWAIT);
1782 0 : if (netns == -EAGAIN) {
1783 : /* Nothing stored yet. Open the file from the file system. */
1784 :
1785 0 : netns = open(path, O_RDONLY|O_NOCTTY|O_CLOEXEC);
1786 0 : if (netns < 0) {
1787 0 : r = -errno;
1788 0 : goto fail;
1789 : }
1790 :
1791 0 : r = fd_is_network_ns(netns);
1792 0 : if (r == 0) { /* Not a netns? Refuse early. */
1793 0 : r = -EINVAL;
1794 0 : goto fail;
1795 : }
1796 0 : if (r < 0 && r != -EUCLEAN) /* EUCLEAN: we don't know */
1797 0 : goto fail;
1798 :
1799 0 : r = 1;
1800 :
1801 0 : } else if (netns < 0) {
1802 0 : r = netns;
1803 0 : goto fail;
1804 : } else
1805 0 : r = 0; /* Already allocated */
1806 :
1807 0 : q = send_one_fd(netns_storage_socket[1], netns, MSG_DONTWAIT);
1808 0 : if (q < 0) {
1809 0 : r = q;
1810 0 : goto fail;
1811 : }
1812 :
1813 0 : fail:
1814 0 : (void) lockf(netns_storage_socket[0], F_ULOCK, 0);
1815 0 : return r;
1816 : }
1817 :
1818 0 : bool ns_type_supported(NamespaceType type) {
1819 : const char *t, *ns_proc;
1820 :
1821 0 : t = namespace_type_to_string(type);
1822 0 : if (!t) /* Don't know how to translate this? Then it's not supported */
1823 0 : return false;
1824 :
1825 0 : ns_proc = strjoina("/proc/self/ns/", t);
1826 0 : return access(ns_proc, F_OK) == 0;
1827 : }
1828 :
/* String names of the ProtectHome enum values, indexed by enum value. */
1829 : static const char *const protect_home_table[_PROTECT_HOME_MAX] = {
1830 : [PROTECT_HOME_NO] = "no",
1831 : [PROTECT_HOME_YES] = "yes",
1832 : [PROTECT_HOME_READ_ONLY] = "read-only",
1833 : [PROTECT_HOME_TMPFS] = "tmpfs",
1834 : };
1835 :
/* NOTE(review): presumably generates the protect_home string conversion functions, with boolean
 * "true" input mapped to PROTECT_HOME_YES. Confirm against the macro definition. */
1836 183 : DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(protect_home, ProtectHome, PROTECT_HOME_YES);
1837 :
/* String names of the ProtectSystem enum values, indexed by enum value. */
1838 : static const char *const protect_system_table[_PROTECT_SYSTEM_MAX] = {
1839 : [PROTECT_SYSTEM_NO] = "no",
1840 : [PROTECT_SYSTEM_YES] = "yes",
1841 : [PROTECT_SYSTEM_FULL] = "full",
1842 : [PROTECT_SYSTEM_STRICT] = "strict",
1843 : };
1844 :
/* NOTE(review): presumably generates the protect_system string conversion functions, with boolean
 * "true" input mapped to PROTECT_SYSTEM_YES. Confirm against the macro definition. */
1845 183 : DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(protect_system, ProtectSystem, PROTECT_SYSTEM_YES);
1846 :
/* String names of the NamespaceType enum values, indexed by enum value. ns_type_supported()
 * appends these names to "/proc/self/ns/", so they have to match the entry names found there. */
1847 : static const char* const namespace_type_table[] = {
1848 : [NAMESPACE_MOUNT] = "mnt",
1849 : [NAMESPACE_CGROUP] = "cgroup",
1850 : [NAMESPACE_UTS] = "uts",
1851 : [NAMESPACE_IPC] = "ipc",
1852 : [NAMESPACE_USER] = "user",
1853 : [NAMESPACE_PID] = "pid",
1854 : [NAMESPACE_NET] = "net",
1855 : };
1856 :
/* Provides namespace_type_to_string(), which ns_type_supported() calls.
 * NOTE(review): presumably also generates the reverse from-string lookup, confirm. */
1857 18 : DEFINE_STRING_TABLE_LOOKUP(namespace_type, NamespaceType);
|