Line data Source code
1 : /* SPDX-License-Identifier: LGPL-2.1+ */
2 :
3 : #include <sys/mount.h>
4 : #include <linux/magic.h>
5 :
6 : #include "alloc-util.h"
7 : #include "escape.h"
8 : #include "fd-util.h"
9 : #include "format-util.h"
10 : #include "fs-util.h"
11 : #include "label.h"
12 : #include "mkdir.h"
13 : #include "mount-util.h"
14 : #include "mountpoint-util.h"
15 : #include "nspawn-mount.h"
16 : #include "parse-util.h"
17 : #include "path-util.h"
18 : #include "rm-rf.h"
19 : #include "set.h"
20 : #include "sort-util.h"
21 : #include "stat-util.h"
22 : #include "string-util.h"
23 : #include "strv.h"
24 : #include "tmpfile-util.h"
25 : #include "user-util.h"
26 :
27 0 : CustomMount* custom_mount_add(CustomMount **l, size_t *n, CustomMountType t) {
28 : CustomMount *c, *ret;
29 :
30 0 : assert(l);
31 0 : assert(n);
32 0 : assert(t >= 0);
33 0 : assert(t < _CUSTOM_MOUNT_TYPE_MAX);
34 :
35 0 : c = reallocarray(*l, *n + 1, sizeof(CustomMount));
36 0 : if (!c)
37 0 : return NULL;
38 :
39 0 : *l = c;
40 0 : ret = *l + *n;
41 0 : (*n)++;
42 :
43 0 : *ret = (CustomMount) { .type = t };
44 :
45 0 : return ret;
46 : }
47 :
48 4 : void custom_mount_free_all(CustomMount *l, size_t n) {
49 : size_t i;
50 :
51 4 : for (i = 0; i < n; i++) {
52 0 : CustomMount *m = l + i;
53 :
54 0 : free(m->source);
55 0 : free(m->destination);
56 0 : free(m->options);
57 :
58 0 : if (m->work_dir) {
59 0 : (void) rm_rf(m->work_dir, REMOVE_ROOT|REMOVE_PHYSICAL);
60 0 : free(m->work_dir);
61 : }
62 :
63 0 : if (m->rm_rf_tmpdir) {
64 0 : (void) rm_rf(m->rm_rf_tmpdir, REMOVE_ROOT|REMOVE_PHYSICAL);
65 0 : free(m->rm_rf_tmpdir);
66 : }
67 :
68 0 : strv_free(m->lower);
69 0 : free(m->type_argument);
70 : }
71 :
72 4 : free(l);
73 4 : }
74 :
75 0 : static int custom_mount_compare(const CustomMount *a, const CustomMount *b) {
76 : int r;
77 :
78 0 : r = path_compare(a->destination, b->destination);
79 0 : if (r != 0)
80 0 : return r;
81 :
82 0 : return CMP(a->type, b->type);
83 : }
84 :
/* Checks whether 'p' is acceptable as a source path: it has to be absolute, optionally preceded by a
 * single "+" character. */
static bool source_path_is_valid(const char *p) {
        assert(p);

        /* Ignore one leading "+" when testing for absoluteness */
        return path_is_absolute(p[0] == '+' ? p + 1 : p);
}
93 :
/* Returns a newly allocated copy of 'source'. If 'source' starts with "+", the remainder is joined
 * onto 'dest' instead. Returns NULL if 'source' is NULL or if the allocation fails. */
static char *resolve_source_path(const char *dest, const char *source) {

        if (!source)
                return NULL;

        return source[0] == '+' ? path_join(dest, source + 1) : strdup(source);
}
104 :
105 0 : int custom_mount_prepare_all(const char *dest, CustomMount *l, size_t n) {
106 : size_t i;
107 : int r;
108 :
109 : /* Prepare all custom mounts. This will make source we know all temporary directories. This is called in the
110 : * parent process, so that we know the temporary directories to remove on exit before we fork off the
111 : * children. */
112 :
113 0 : assert(l || n == 0);
114 :
115 : /* Order the custom mounts, and make sure we have a working directory */
116 0 : typesafe_qsort(l, n, custom_mount_compare);
117 :
118 0 : for (i = 0; i < n; i++) {
119 0 : CustomMount *m = l + i;
120 :
121 : /* /proc we mount in the inner child, i.e. when we acquired CLONE_NEWPID. All other mounts we mount
122 : * already in the outer child, so that the mounts are already established before CLONE_NEWPID and in
123 : * particular CLONE_NEWUSER. This also means any custom mounts below /proc also need to be mounted in
124 : * the inner child, not the outer one. Determine this here. */
125 0 : m->in_userns = path_startswith(m->destination, "/proc");
126 :
127 0 : if (m->type == CUSTOM_MOUNT_BIND) {
128 0 : if (m->source) {
129 : char *s;
130 :
131 0 : s = resolve_source_path(dest, m->source);
132 0 : if (!s)
133 0 : return log_oom();
134 :
135 0 : free_and_replace(m->source, s);
136 : } else {
137 : /* No source specified? In that case, use a throw-away temporary directory in /var/tmp */
138 :
139 0 : m->rm_rf_tmpdir = strdup("/var/tmp/nspawn-temp-XXXXXX");
140 0 : if (!m->rm_rf_tmpdir)
141 0 : return log_oom();
142 :
143 0 : if (!mkdtemp(m->rm_rf_tmpdir)) {
144 0 : m->rm_rf_tmpdir = mfree(m->rm_rf_tmpdir);
145 0 : return log_error_errno(errno, "Failed to acquire temporary directory: %m");
146 : }
147 :
148 0 : m->source = path_join(m->rm_rf_tmpdir, "src");
149 0 : if (!m->source)
150 0 : return log_oom();
151 :
152 0 : if (mkdir(m->source, 0755) < 0)
153 0 : return log_error_errno(errno, "Failed to create %s: %m", m->source);
154 : }
155 : }
156 :
157 0 : if (m->type == CUSTOM_MOUNT_OVERLAY) {
158 : char **j;
159 :
160 0 : STRV_FOREACH(j, m->lower) {
161 : char *s;
162 :
163 0 : s = resolve_source_path(dest, *j);
164 0 : if (!s)
165 0 : return log_oom();
166 :
167 0 : free_and_replace(*j, s);
168 : }
169 :
170 0 : if (m->work_dir) {
171 : char *s;
172 :
173 0 : s = resolve_source_path(dest, m->work_dir);
174 0 : if (!s)
175 0 : return log_oom();
176 :
177 0 : free_and_replace(m->work_dir, s);
178 : } else {
179 0 : assert(m->source);
180 :
181 0 : r = tempfn_random(m->source, NULL, &m->work_dir);
182 0 : if (r < 0)
183 0 : return log_error_errno(r, "Failed to acquire working directory: %m");
184 : }
185 :
186 0 : (void) mkdir_label(m->work_dir, 0700);
187 : }
188 : }
189 :
190 0 : return 0;
191 : }
192 :
193 0 : int bind_mount_parse(CustomMount **l, size_t *n, const char *s, bool read_only) {
194 0 : _cleanup_free_ char *source = NULL, *destination = NULL, *opts = NULL;
195 0 : const char *p = s;
196 : CustomMount *m;
197 : int r;
198 :
199 0 : assert(l);
200 0 : assert(n);
201 :
202 0 : r = extract_many_words(&p, ":", EXTRACT_DONT_COALESCE_SEPARATORS, &source, &destination, NULL);
203 0 : if (r < 0)
204 0 : return r;
205 0 : if (r == 0)
206 0 : return -EINVAL;
207 0 : if (r == 1) {
208 0 : destination = strdup(source[0] == '+' ? source+1 : source);
209 0 : if (!destination)
210 0 : return -ENOMEM;
211 : }
212 0 : if (r == 2 && !isempty(p)) {
213 0 : opts = strdup(p);
214 0 : if (!opts)
215 0 : return -ENOMEM;
216 : }
217 :
218 0 : if (isempty(source))
219 0 : source = mfree(source);
220 0 : else if (!source_path_is_valid(source))
221 0 : return -EINVAL;
222 :
223 0 : if (!path_is_absolute(destination))
224 0 : return -EINVAL;
225 0 : if (empty_or_root(destination))
226 0 : return -EINVAL;
227 :
228 0 : m = custom_mount_add(l, n, CUSTOM_MOUNT_BIND);
229 0 : if (!m)
230 0 : return -ENOMEM;
231 :
232 0 : m->source = TAKE_PTR(source);
233 0 : m->destination = TAKE_PTR(destination);
234 0 : m->read_only = read_only;
235 0 : m->options = TAKE_PTR(opts);
236 :
237 0 : return 0;
238 : }
239 :
240 0 : int tmpfs_mount_parse(CustomMount **l, size_t *n, const char *s) {
241 0 : _cleanup_free_ char *path = NULL, *opts = NULL;
242 0 : const char *p = s;
243 : CustomMount *m;
244 : int r;
245 :
246 0 : assert(l);
247 0 : assert(n);
248 0 : assert(s);
249 :
250 0 : r = extract_first_word(&p, &path, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
251 0 : if (r < 0)
252 0 : return r;
253 0 : if (r == 0)
254 0 : return -EINVAL;
255 :
256 0 : if (isempty(p))
257 0 : opts = strdup("mode=0755");
258 : else
259 0 : opts = strdup(p);
260 0 : if (!opts)
261 0 : return -ENOMEM;
262 :
263 0 : if (!path_is_absolute(path))
264 0 : return -EINVAL;
265 0 : if (empty_or_root(path))
266 0 : return -EINVAL;
267 :
268 0 : m = custom_mount_add(l, n, CUSTOM_MOUNT_TMPFS);
269 0 : if (!m)
270 0 : return -ENOMEM;
271 :
272 0 : m->destination = TAKE_PTR(path);
273 0 : m->options = TAKE_PTR(opts);
274 :
275 0 : return 0;
276 : }
277 :
278 0 : int overlay_mount_parse(CustomMount **l, size_t *n, const char *s, bool read_only) {
279 0 : _cleanup_free_ char *upper = NULL, *destination = NULL;
280 0 : _cleanup_strv_free_ char **lower = NULL;
281 : CustomMount *m;
282 : int k;
283 :
284 0 : k = strv_split_extract(&lower, s, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
285 0 : if (k < 0)
286 0 : return k;
287 0 : if (k < 2)
288 0 : return -EADDRNOTAVAIL;
289 0 : if (k == 2) {
290 : /* If two parameters are specified, the first one is the lower, the second one the upper directory. And
291 : * we'll also define the destination mount point the same as the upper. */
292 :
293 0 : if (!source_path_is_valid(lower[0]) ||
294 0 : !source_path_is_valid(lower[1]))
295 0 : return -EINVAL;
296 :
297 0 : upper = TAKE_PTR(lower[1]);
298 :
299 0 : destination = strdup(upper[0] == '+' ? upper+1 : upper); /* take the destination without "+" prefix */
300 0 : if (!destination)
301 0 : return -ENOMEM;
302 : } else {
303 : char **i;
304 :
305 : /* If more than two parameters are specified, the last one is the destination, the second to last one
306 : * the "upper", and all before that the "lower" directories. */
307 :
308 0 : destination = lower[k - 1];
309 0 : upper = TAKE_PTR(lower[k - 2]);
310 :
311 0 : STRV_FOREACH(i, lower)
312 0 : if (!source_path_is_valid(*i))
313 0 : return -EINVAL;
314 :
315 : /* If the upper directory is unspecified, then let's create it automatically as a throw-away directory
316 : * in /var/tmp */
317 0 : if (isempty(upper))
318 0 : upper = mfree(upper);
319 0 : else if (!source_path_is_valid(upper))
320 0 : return -EINVAL;
321 :
322 0 : if (!path_is_absolute(destination))
323 0 : return -EINVAL;
324 : }
325 :
326 0 : if (empty_or_root(destination))
327 0 : return -EINVAL;
328 :
329 0 : m = custom_mount_add(l, n, CUSTOM_MOUNT_OVERLAY);
330 0 : if (!m)
331 0 : return -ENOMEM;
332 :
333 0 : m->destination = TAKE_PTR(destination);
334 0 : m->source = TAKE_PTR(upper);
335 0 : m->lower = TAKE_PTR(lower);
336 0 : m->read_only = read_only;
337 :
338 0 : return 0;
339 : }
340 :
341 0 : int inaccessible_mount_parse(CustomMount **l, size_t *n, const char *s) {
342 0 : _cleanup_free_ char *path = NULL;
343 : CustomMount *m;
344 :
345 0 : assert(l);
346 0 : assert(n);
347 0 : assert(s);
348 :
349 0 : if (!path_is_absolute(s))
350 0 : return -EINVAL;
351 :
352 0 : path = strdup(s);
353 0 : if (!path)
354 0 : return -ENOMEM;
355 :
356 0 : m = custom_mount_add(l, n, CUSTOM_MOUNT_INACCESSIBLE);
357 0 : if (!m)
358 0 : return -ENOMEM;
359 :
360 0 : m->destination = TAKE_PTR(path);
361 0 : return 0;
362 : }
363 :
364 0 : int tmpfs_patch_options(
365 : const char *options,
366 : uid_t uid_shift,
367 : const char *selinux_apifs_context,
368 : char **ret) {
369 :
370 0 : char *buf = NULL;
371 :
372 0 : if (uid_shift != UID_INVALID) {
373 0 : if (asprintf(&buf, "%s%suid=" UID_FMT ",gid=" UID_FMT,
374 : strempty(options), options ? "," : "",
375 : uid_shift, uid_shift) < 0)
376 0 : return -ENOMEM;
377 :
378 0 : options = buf;
379 : }
380 :
381 : #if HAVE_SELINUX
382 0 : if (selinux_apifs_context) {
383 : char *t;
384 :
385 0 : t = strjoin(strempty(options), options ? "," : "",
386 : "context=\"", selinux_apifs_context, "\"");
387 0 : free(buf);
388 0 : if (!t)
389 0 : return -ENOMEM;
390 :
391 0 : buf = t;
392 : }
393 : #endif
394 :
395 0 : if (!buf && options) {
396 0 : buf = strdup(options);
397 0 : if (!buf)
398 0 : return -ENOMEM;
399 : }
400 0 : *ret = buf;
401 :
402 0 : return !!buf;
403 : }
404 :
405 0 : int mount_sysfs(const char *dest, MountSettingsMask mount_settings) {
406 : const char *full, *top, *x;
407 : int r;
408 0 : unsigned long extra_flags = 0;
409 :
410 0 : top = prefix_roota(dest, "/sys");
411 0 : r = path_is_fs_type(top, SYSFS_MAGIC);
412 0 : if (r < 0)
413 0 : return log_error_errno(r, "Failed to determine filesystem type of %s: %m", top);
414 : /* /sys might already be mounted as sysfs by the outer child in the
415 : * !netns case. In this case, it's all good. Don't touch it because we
416 : * don't have the right to do so, see https://github.com/systemd/systemd/issues/1555.
417 : */
418 0 : if (r > 0)
419 0 : return 0;
420 :
421 0 : full = prefix_roota(top, "/full");
422 :
423 0 : (void) mkdir(full, 0755);
424 :
425 0 : if (mount_settings & MOUNT_APPLY_APIVFS_RO)
426 0 : extra_flags |= MS_RDONLY;
427 :
428 0 : r = mount_verbose(LOG_ERR, "sysfs", full, "sysfs",
429 : MS_NOSUID|MS_NOEXEC|MS_NODEV|extra_flags, NULL);
430 0 : if (r < 0)
431 0 : return r;
432 :
433 0 : FOREACH_STRING(x, "block", "bus", "class", "dev", "devices", "kernel") {
434 0 : _cleanup_free_ char *from = NULL, *to = NULL;
435 :
436 0 : from = path_join(full, x);
437 0 : if (!from)
438 0 : return log_oom();
439 :
440 0 : to = path_join(top, x);
441 0 : if (!to)
442 0 : return log_oom();
443 :
444 0 : (void) mkdir(to, 0755);
445 :
446 0 : r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
447 0 : if (r < 0)
448 0 : return r;
449 :
450 0 : r = mount_verbose(LOG_ERR, NULL, to, NULL,
451 : MS_BIND|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT|extra_flags, NULL);
452 0 : if (r < 0)
453 0 : return r;
454 : }
455 :
456 0 : r = umount_verbose(full);
457 0 : if (r < 0)
458 0 : return r;
459 :
460 0 : if (rmdir(full) < 0)
461 0 : return log_error_errno(errno, "Failed to remove %s: %m", full);
462 :
463 : /* Create mountpoint for cgroups. Otherwise we are not allowed since we
464 : * remount /sys read-only.
465 : */
466 0 : x = prefix_roota(top, "/fs/cgroup");
467 0 : (void) mkdir_p(x, 0755);
468 :
469 0 : return mount_verbose(LOG_ERR, NULL, top, NULL,
470 : MS_BIND|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT|extra_flags, NULL);
471 : }
472 :
473 0 : static int mkdir_userns(const char *path, mode_t mode, uid_t uid_shift) {
474 : int r;
475 :
476 0 : assert(path);
477 :
478 0 : r = mkdir_errno_wrapper(path, mode);
479 0 : if (r < 0 && r != -EEXIST)
480 0 : return r;
481 :
482 0 : if (uid_shift == UID_INVALID)
483 0 : return 0;
484 :
485 0 : if (lchown(path, uid_shift, uid_shift) < 0)
486 0 : return -errno;
487 :
488 0 : return 0;
489 : }
490 :
/* Like mkdir_userns(), but also creates all missing parent directories of 'path'. If 'prefix' is
 * given, 'path' must be located below it (otherwise -ENOTDIR is returned), and parent directories
 * that 'prefix' itself is located in are skipped.
 *
 * Returns 0 on success, a negative errno-style value on failure. */
static int mkdir_userns_p(const char *prefix, const char *path, mode_t mode, uid_t uid_shift) {
        const char *cursor, *component_end;
        int r;

        assert(path);

        if (prefix && !path_startswith(path, prefix))
                return -ENOTDIR;

        /* Walk over the components and create every parent directory, except the last component */
        for (cursor = path + strspn(path, "/");;) {
                char parent[strlen(path) + 1];
                size_t parent_len;

                component_end = cursor + strcspn(cursor, "/");
                cursor = component_end + strspn(component_end, "/");

                /* Nothing follows this component? Then it is the last one and we're done */
                if (*cursor == 0)
                        break;

                /* Cut out the path up to and including the current component */
                parent_len = component_end - path;
                memcpy(parent, path, parent_len);
                parent[parent_len] = 0;

                if (!prefix || !path_startswith(prefix, parent)) {
                        r = mkdir_userns(parent, mode, uid_shift);
                        if (r < 0)
                                return r;
                }
        }

        return mkdir_userns(path, mode, uid_shift);
}
525 :
526 0 : int mount_all(const char *dest,
527 : MountSettingsMask mount_settings,
528 : uid_t uid_shift,
529 : const char *selinux_apifs_context) {
530 :
531 : #define PROC_INACCESSIBLE_REG(path) \
532 : { "/run/systemd/inaccessible/reg", (path), NULL, NULL, MS_BIND, \
533 : MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* Bind mount first ... */ \
534 : { NULL, (path), NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, \
535 : MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO } /* Then, make it r/o */
536 :
537 : #define PROC_READ_ONLY(path) \
538 : { (path), (path), NULL, NULL, MS_BIND, \
539 : MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* Bind mount first ... */ \
540 : { NULL, (path), NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, \
541 : MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO } /* Then, make it r/o */
542 :
543 : typedef struct MountPoint {
544 : const char *what;
545 : const char *where;
546 : const char *type;
547 : const char *options;
548 : unsigned long flags;
549 : MountSettingsMask mount_settings;
550 : } MountPoint;
551 :
552 : static const MountPoint mount_table[] = {
553 : /* First we list inner child mounts (i.e. mounts applied *after* entering user namespacing) */
554 : { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
555 : MOUNT_FATAL|MOUNT_IN_USERNS },
556 :
557 : { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND,
558 : MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* Bind mount first ... */
559 :
560 : { "/proc/sys/net", "/proc/sys/net", NULL, NULL, MS_BIND,
561 : MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS }, /* (except for this) */
562 :
563 : { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT,
564 : MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* ... then, make it r/o */
565 :
566 : /* Make these files inaccessible to container payloads: they potentially leak information about kernel
567 : * internals or the host's execution environment to the container */
568 : PROC_INACCESSIBLE_REG("/proc/kallsyms"),
569 : PROC_INACCESSIBLE_REG("/proc/kcore"),
570 : PROC_INACCESSIBLE_REG("/proc/keys"),
571 : PROC_INACCESSIBLE_REG("/proc/sysrq-trigger"),
572 : PROC_INACCESSIBLE_REG("/proc/timer_list"),
573 :
574 : /* Make these directories read-only to container payloads: they show hardware information, and in some
575 : * cases contain tunables the container really shouldn't have access to. */
576 : PROC_READ_ONLY("/proc/acpi"),
577 : PROC_READ_ONLY("/proc/apm"),
578 : PROC_READ_ONLY("/proc/asound"),
579 : PROC_READ_ONLY("/proc/bus"),
580 : PROC_READ_ONLY("/proc/fs"),
581 : PROC_READ_ONLY("/proc/irq"),
582 : PROC_READ_ONLY("/proc/scsi"),
583 :
584 : { "mqueue", "/dev/mqueue", "mqueue", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
585 : MOUNT_IN_USERNS },
586 :
587 : /* Then we list outer child mounts (i.e. mounts applied *before* entering user namespacing) */
588 : { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,
589 : MOUNT_FATAL|MOUNT_APPLY_TMPFS_TMP },
590 : { "tmpfs", "/sys", "tmpfs", "mode=555", MS_NOSUID|MS_NOEXEC|MS_NODEV,
591 : MOUNT_FATAL|MOUNT_APPLY_APIVFS_NETNS },
592 : { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV,
593 : MOUNT_FATAL|MOUNT_APPLY_APIVFS_RO }, /* skipped if above was mounted */
594 : { "sysfs", "/sys", "sysfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
595 : MOUNT_FATAL }, /* skipped if above was mounted */
596 : { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,
597 : MOUNT_FATAL },
598 : { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,
599 : MOUNT_FATAL },
600 : { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME,
601 : MOUNT_FATAL },
602 :
603 : #if HAVE_SELINUX
604 : { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,
605 : 0 }, /* Bind mount first */
606 : { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT,
607 : 0 }, /* Then, make it r/o */
608 : #endif
609 : };
610 :
611 0 : bool use_userns = (mount_settings & MOUNT_USE_USERNS);
612 0 : bool netns = (mount_settings & MOUNT_APPLY_APIVFS_NETNS);
613 0 : bool ro = (mount_settings & MOUNT_APPLY_APIVFS_RO);
614 0 : bool in_userns = (mount_settings & MOUNT_IN_USERNS);
615 0 : bool tmpfs_tmp = (mount_settings & MOUNT_APPLY_TMPFS_TMP);
616 : size_t k;
617 : int r;
618 :
619 0 : for (k = 0; k < ELEMENTSOF(mount_table); k++) {
620 0 : _cleanup_free_ char *where = NULL, *options = NULL;
621 : const char *o;
622 0 : bool fatal = (mount_table[k].mount_settings & MOUNT_FATAL);
623 :
624 0 : if (in_userns != (bool)(mount_table[k].mount_settings & MOUNT_IN_USERNS))
625 0 : continue;
626 :
627 0 : if (!netns && (bool)(mount_table[k].mount_settings & MOUNT_APPLY_APIVFS_NETNS))
628 0 : continue;
629 :
630 0 : if (!ro && (bool)(mount_table[k].mount_settings & MOUNT_APPLY_APIVFS_RO))
631 0 : continue;
632 :
633 0 : if (!tmpfs_tmp && (bool)(mount_table[k].mount_settings & MOUNT_APPLY_TMPFS_TMP))
634 0 : continue;
635 :
636 0 : r = chase_symlinks(mount_table[k].where, dest, CHASE_NONEXISTENT|CHASE_PREFIX_ROOT, &where);
637 0 : if (r < 0)
638 0 : return log_error_errno(r, "Failed to resolve %s/%s: %m", dest, mount_table[k].where);
639 :
640 : /* Skip this entry if it is not a remount. */
641 0 : if (mount_table[k].what) {
642 0 : r = path_is_mount_point(where, NULL, 0);
643 0 : if (r < 0 && r != -ENOENT)
644 0 : return log_error_errno(r, "Failed to detect whether %s is a mount point: %m", where);
645 0 : if (r > 0)
646 0 : continue;
647 : }
648 :
649 0 : r = mkdir_userns_p(dest, where, 0755, (use_userns && !in_userns) ? uid_shift : UID_INVALID);
650 0 : if (r < 0 && r != -EEXIST) {
651 0 : if (fatal && r != -EROFS)
652 0 : return log_error_errno(r, "Failed to create directory %s: %m", where);
653 :
654 0 : log_debug_errno(r, "Failed to create directory %s: %m", where);
655 : /* If we failed mkdir() or chown() due to the root
656 : * directory being read only, attempt to mount this fs
657 : * anyway and let mount_verbose log any errors */
658 0 : if (r != -EROFS)
659 0 : continue;
660 : }
661 :
662 0 : o = mount_table[k].options;
663 0 : if (streq_ptr(mount_table[k].type, "tmpfs")) {
664 0 : r = tmpfs_patch_options(o, in_userns ? 0 : uid_shift, selinux_apifs_context, &options);
665 0 : if (r < 0)
666 0 : return log_oom();
667 0 : if (r > 0)
668 0 : o = options;
669 : }
670 :
671 0 : r = mount_verbose(fatal ? LOG_ERR : LOG_DEBUG,
672 : mount_table[k].what,
673 : where,
674 : mount_table[k].type,
675 : mount_table[k].flags,
676 : o);
677 0 : if (r < 0 && fatal)
678 0 : return r;
679 : }
680 :
681 0 : return 0;
682 : }
683 :
684 0 : static int parse_mount_bind_options(const char *options, unsigned long *mount_flags, char **mount_opts) {
685 0 : const char *p = options;
686 0 : unsigned long flags = *mount_flags;
687 0 : char *opts = NULL;
688 : int r;
689 :
690 0 : assert(options);
691 :
692 0 : for (;;) {
693 0 : _cleanup_free_ char *word = NULL;
694 :
695 0 : r = extract_first_word(&p, &word, ",", 0);
696 0 : if (r < 0)
697 0 : return log_error_errno(r, "Failed to extract mount option: %m");
698 0 : if (r == 0)
699 0 : break;
700 :
701 0 : if (streq(word, "rbind"))
702 0 : flags |= MS_REC;
703 0 : else if (streq(word, "norbind"))
704 0 : flags &= ~MS_REC;
705 : else {
706 0 : log_error("Invalid bind mount option: %s", word);
707 0 : return -EINVAL;
708 : }
709 : }
710 :
711 0 : *mount_flags = flags;
712 : /* in the future mount_opts will hold string options for mount(2) */
713 0 : *mount_opts = opts;
714 :
715 0 : return 0;
716 : }
717 :
718 0 : static int mount_bind(const char *dest, CustomMount *m) {
719 0 : _cleanup_free_ char *mount_opts = NULL, *where = NULL;
720 0 : unsigned long mount_flags = MS_BIND | MS_REC;
721 : struct stat source_st, dest_st;
722 : int r;
723 :
724 0 : assert(dest);
725 0 : assert(m);
726 :
727 0 : if (m->options) {
728 0 : r = parse_mount_bind_options(m->options, &mount_flags, &mount_opts);
729 0 : if (r < 0)
730 0 : return r;
731 : }
732 :
733 0 : if (stat(m->source, &source_st) < 0)
734 0 : return log_error_errno(errno, "Failed to stat %s: %m", m->source);
735 :
736 0 : r = chase_symlinks(m->destination, dest, CHASE_PREFIX_ROOT|CHASE_NONEXISTENT, &where);
737 0 : if (r < 0)
738 0 : return log_error_errno(r, "Failed to resolve %s/%s: %m", dest, m->destination);
739 0 : if (r > 0) { /* Path exists already? */
740 :
741 0 : if (stat(where, &dest_st) < 0)
742 0 : return log_error_errno(errno, "Failed to stat %s: %m", where);
743 :
744 0 : if (S_ISDIR(source_st.st_mode) && !S_ISDIR(dest_st.st_mode))
745 0 : return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
746 : "Cannot bind mount directory %s on file %s.",
747 : m->source, where);
748 :
749 0 : if (!S_ISDIR(source_st.st_mode) && S_ISDIR(dest_st.st_mode))
750 0 : return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
751 : "Cannot bind mount file %s on directory %s.",
752 : m->source, where);
753 :
754 : } else { /* Path doesn't exist yet? */
755 0 : r = mkdir_parents_label(where, 0755);
756 0 : if (r < 0)
757 0 : return log_error_errno(r, "Failed to make parents of %s: %m", where);
758 :
759 : /* Create the mount point. Any non-directory file can be
760 : * mounted on any non-directory file (regular, fifo, socket,
761 : * char, block).
762 : */
763 0 : if (S_ISDIR(source_st.st_mode))
764 0 : r = mkdir_label(where, 0755);
765 : else
766 0 : r = touch(where);
767 0 : if (r < 0)
768 0 : return log_error_errno(r, "Failed to create mount point %s: %m", where);
769 : }
770 :
771 0 : r = mount_verbose(LOG_ERR, m->source, where, NULL, mount_flags, mount_opts);
772 0 : if (r < 0)
773 0 : return r;
774 :
775 0 : if (m->read_only) {
776 0 : r = bind_remount_recursive(where, MS_RDONLY, MS_RDONLY, NULL);
777 0 : if (r < 0)
778 0 : return log_error_errno(r, "Read-only bind mount failed: %m");
779 : }
780 :
781 0 : return 0;
782 : }
783 :
784 0 : static int mount_tmpfs(
785 : const char *dest,
786 : CustomMount *m,
787 : bool userns, uid_t uid_shift, uid_t uid_range,
788 : const char *selinux_apifs_context) {
789 :
790 : const char *options;
791 0 : _cleanup_free_ char *buf = NULL, *where = NULL;
792 : int r;
793 :
794 0 : assert(dest);
795 0 : assert(m);
796 :
797 0 : r = chase_symlinks(m->destination, dest, CHASE_PREFIX_ROOT|CHASE_NONEXISTENT, &where);
798 0 : if (r < 0)
799 0 : return log_error_errno(r, "Failed to resolve %s/%s: %m", dest, m->destination);
800 0 : if (r == 0) { /* Doesn't exist yet? */
801 0 : r = mkdir_p_label(where, 0755);
802 0 : if (r < 0)
803 0 : return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
804 : }
805 :
806 0 : r = tmpfs_patch_options(m->options, uid_shift == 0 ? UID_INVALID : uid_shift, selinux_apifs_context, &buf);
807 0 : if (r < 0)
808 0 : return log_oom();
809 0 : options = r > 0 ? buf : m->options;
810 :
811 0 : return mount_verbose(LOG_ERR, "tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, options);
812 : }
813 :
814 0 : static char *joined_and_escaped_lower_dirs(char **lower) {
815 0 : _cleanup_strv_free_ char **sv = NULL;
816 :
817 0 : sv = strv_copy(lower);
818 0 : if (!sv)
819 0 : return NULL;
820 :
821 0 : strv_reverse(sv);
822 :
823 0 : if (!strv_shell_escape(sv, ",:"))
824 0 : return NULL;
825 :
826 0 : return strv_join(sv, ":");
827 : }
828 :
829 0 : static int mount_overlay(const char *dest, CustomMount *m) {
830 0 : _cleanup_free_ char *lower = NULL, *where = NULL, *escaped_source = NULL;
831 : const char *options;
832 : int r;
833 :
834 0 : assert(dest);
835 0 : assert(m);
836 :
837 0 : r = chase_symlinks(m->destination, dest, CHASE_PREFIX_ROOT|CHASE_NONEXISTENT, &where);
838 0 : if (r < 0)
839 0 : return log_error_errno(r, "Failed to resolve %s/%s: %m", dest, m->destination);
840 0 : if (r == 0) { /* Doesn't exist yet? */
841 0 : r = mkdir_label(where, 0755);
842 0 : if (r < 0)
843 0 : return log_error_errno(r, "Creating mount point for overlay %s failed: %m", where);
844 : }
845 :
846 0 : (void) mkdir_p_label(m->source, 0755);
847 :
848 0 : lower = joined_and_escaped_lower_dirs(m->lower);
849 0 : if (!lower)
850 0 : return log_oom();
851 :
852 0 : escaped_source = shell_escape(m->source, ",:");
853 0 : if (!escaped_source)
854 0 : return log_oom();
855 :
856 0 : if (m->read_only)
857 0 : options = strjoina("lowerdir=", escaped_source, ":", lower);
858 : else {
859 0 : _cleanup_free_ char *escaped_work_dir = NULL;
860 :
861 0 : escaped_work_dir = shell_escape(m->work_dir, ",:");
862 0 : if (!escaped_work_dir)
863 0 : return log_oom();
864 :
865 0 : options = strjoina("lowerdir=", lower, ",upperdir=", escaped_source, ",workdir=", escaped_work_dir);
866 : }
867 :
868 0 : return mount_verbose(LOG_ERR, "overlay", where, "overlay", m->read_only ? MS_RDONLY : 0, options);
869 : }
870 :
871 0 : static int mount_inaccessible(const char *dest, CustomMount *m) {
872 0 : _cleanup_free_ char *where = NULL;
873 : const char *source;
874 : struct stat st;
875 : int r;
876 :
877 0 : assert(dest);
878 0 : assert(m);
879 :
880 0 : r = chase_symlinks_and_stat(m->destination, dest, CHASE_PREFIX_ROOT, &where, &st);
881 0 : if (r < 0) {
882 0 : log_full_errno(m->graceful ? LOG_DEBUG : LOG_ERR, r, "Failed to resolve %s/%s: %m", dest, m->destination);
883 0 : return m->graceful ? 0 : r;
884 : }
885 :
886 0 : assert_se(source = mode_to_inaccessible_node(st.st_mode));
887 :
888 0 : r = mount_verbose(m->graceful ? LOG_DEBUG : LOG_ERR, source, where, NULL, MS_BIND, NULL);
889 0 : if (r < 0)
890 0 : return m->graceful ? 0 : r;
891 :
892 0 : r = mount_verbose(m->graceful ? LOG_DEBUG : LOG_ERR, NULL, where, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, NULL);
893 0 : if (r < 0) {
894 0 : umount_verbose(where);
895 0 : return m->graceful ? 0 : r;
896 : }
897 :
898 0 : return 0;
899 : }
900 :
901 0 : static int mount_arbitrary(const char *dest, CustomMount *m) {
902 0 : _cleanup_free_ char *where = NULL;
903 : int r;
904 :
905 0 : assert(dest);
906 0 : assert(m);
907 :
908 0 : r = chase_symlinks(m->destination, dest, CHASE_PREFIX_ROOT|CHASE_NONEXISTENT, &where);
909 0 : if (r < 0)
910 0 : return log_error_errno(r, "Failed to resolve %s/%s: %m", dest, m->destination);
911 0 : if (r == 0) { /* Doesn't exist yet? */
912 0 : r = mkdir_p_label(where, 0755);
913 0 : if (r < 0)
914 0 : return log_error_errno(r, "Creating mount point for mount %s failed: %m", where);
915 : }
916 :
917 0 : return mount_verbose(LOG_ERR, m->source, where, m->type_argument, 0, m->options);
918 : }
919 :
920 0 : int mount_custom(
921 : const char *dest,
922 : CustomMount *mounts, size_t n,
923 : bool userns, uid_t uid_shift, uid_t uid_range,
924 : const char *selinux_apifs_context,
925 : bool in_userns) {
926 :
927 : size_t i;
928 : int r;
929 :
930 0 : assert(dest);
931 :
932 0 : for (i = 0; i < n; i++) {
933 0 : CustomMount *m = mounts + i;
934 :
935 0 : if (m->in_userns != in_userns)
936 0 : continue;
937 :
938 0 : switch (m->type) {
939 :
940 0 : case CUSTOM_MOUNT_BIND:
941 0 : r = mount_bind(dest, m);
942 0 : break;
943 :
944 0 : case CUSTOM_MOUNT_TMPFS:
945 0 : r = mount_tmpfs(dest, m, userns, uid_shift, uid_range, selinux_apifs_context);
946 0 : break;
947 :
948 0 : case CUSTOM_MOUNT_OVERLAY:
949 0 : r = mount_overlay(dest, m);
950 0 : break;
951 :
952 0 : case CUSTOM_MOUNT_INACCESSIBLE:
953 0 : r = mount_inaccessible(dest, m);
954 0 : break;
955 :
956 0 : case CUSTOM_MOUNT_ARBITRARY:
957 0 : r = mount_arbitrary(dest, m);
958 0 : break;
959 :
960 0 : default:
961 0 : assert_not_reached("Unknown custom mount type");
962 : }
963 :
964 0 : if (r < 0)
965 0 : return r;
966 : }
967 :
968 0 : return 0;
969 : }
970 :
971 0 : static int setup_volatile_state(
972 : const char *directory,
973 : bool userns, uid_t uid_shift, uid_t uid_range,
974 : const char *selinux_apifs_context) {
975 :
976 0 : _cleanup_free_ char *buf = NULL;
977 : const char *p, *options;
978 : int r;
979 :
980 0 : assert(directory);
981 :
982 : /* --volatile=state means we simply overmount /var with a tmpfs, and the rest read-only. */
983 :
984 0 : r = bind_remount_recursive(directory, MS_RDONLY, MS_RDONLY, NULL);
985 0 : if (r < 0)
986 0 : return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
987 :
988 0 : p = prefix_roota(directory, "/var");
989 0 : r = mkdir(p, 0755);
990 0 : if (r < 0 && errno != EEXIST)
991 0 : return log_error_errno(errno, "Failed to create %s: %m", directory);
992 :
993 0 : options = "mode=755";
994 0 : r = tmpfs_patch_options(options, uid_shift == 0 ? UID_INVALID : uid_shift, selinux_apifs_context, &buf);
995 0 : if (r < 0)
996 0 : return log_oom();
997 0 : if (r > 0)
998 0 : options = buf;
999 :
1000 0 : return mount_verbose(LOG_ERR, "tmpfs", p, "tmpfs", MS_STRICTATIME, options);
1001 : }
1002 :
1003 0 : static int setup_volatile_yes(
1004 : const char *directory,
1005 : bool userns, uid_t uid_shift, uid_t uid_range,
1006 : const char *selinux_apifs_context) {
1007 :
1008 0 : bool tmpfs_mounted = false, bind_mounted = false;
1009 0 : char template[] = "/tmp/nspawn-volatile-XXXXXX";
1010 0 : _cleanup_free_ char *buf = NULL, *bindir = NULL;
1011 : const char *f, *t, *options;
1012 : struct stat st;
1013 : int r;
1014 :
1015 0 : assert(directory);
1016 :
1017 : /* --volatile=yes means we mount a tmpfs to the root dir, and the original /usr to use inside it, and
1018 : * that read-only. Before we start setting this up let's validate if the image has the /usr merge
1019 : * implemented, and let's output a friendly log message if it hasn't. */
1020 :
1021 0 : bindir = path_join(directory, "/bin");
1022 0 : if (!bindir)
1023 0 : return log_oom();
1024 0 : if (lstat(bindir, &st) < 0) {
1025 0 : if (errno != ENOENT)
1026 0 : return log_error_errno(errno, "Failed to stat /bin directory below image: %m");
1027 :
1028 : /* ENOENT is fine, just means the image is probably just a naked /usr and we can create the
1029 : * rest. */
1030 0 : } else if (S_ISDIR(st.st_mode))
1031 0 : return log_error_errno(SYNTHETIC_ERRNO(EISDIR),
1032 : "Sorry, --volatile=yes mode is not supported with OS images that have not merged /bin/, /sbin/, /lib/, /lib64/ into /usr/. "
1033 : "Please work with your distribution and help them adopt the merged /usr scheme.");
1034 0 : else if (!S_ISLNK(st.st_mode))
1035 0 : return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1036 : "Error starting image: if --volatile=yes is used /bin must be a symlink (for merged /usr support) or non-existent (in which case a symlink is created automatically).");
1037 :
1038 0 : if (!mkdtemp(template))
1039 0 : return log_error_errno(errno, "Failed to create temporary directory: %m");
1040 :
1041 0 : options = "mode=755";
1042 0 : r = tmpfs_patch_options(options, uid_shift == 0 ? UID_INVALID : uid_shift, selinux_apifs_context, &buf);
1043 0 : if (r < 0)
1044 0 : goto fail;
1045 0 : if (r > 0)
1046 0 : options = buf;
1047 :
1048 0 : r = mount_verbose(LOG_ERR, "tmpfs", template, "tmpfs", MS_STRICTATIME, options);
1049 0 : if (r < 0)
1050 0 : goto fail;
1051 :
1052 0 : tmpfs_mounted = true;
1053 :
1054 0 : f = prefix_roota(directory, "/usr");
1055 0 : t = prefix_roota(template, "/usr");
1056 :
1057 0 : r = mkdir(t, 0755);
1058 0 : if (r < 0 && errno != EEXIST) {
1059 0 : r = log_error_errno(errno, "Failed to create %s: %m", t);
1060 0 : goto fail;
1061 : }
1062 :
1063 0 : r = mount_verbose(LOG_ERR, f, t, NULL, MS_BIND|MS_REC, NULL);
1064 0 : if (r < 0)
1065 0 : goto fail;
1066 :
1067 0 : bind_mounted = true;
1068 :
1069 0 : r = bind_remount_recursive(t, MS_RDONLY, MS_RDONLY, NULL);
1070 0 : if (r < 0) {
1071 0 : log_error_errno(r, "Failed to remount %s read-only: %m", t);
1072 0 : goto fail;
1073 : }
1074 :
1075 0 : r = mount_verbose(LOG_ERR, template, directory, NULL, MS_MOVE, NULL);
1076 0 : if (r < 0)
1077 0 : goto fail;
1078 :
1079 0 : (void) rmdir(template);
1080 :
1081 0 : return 0;
1082 :
1083 0 : fail:
1084 0 : if (bind_mounted)
1085 0 : (void) umount_verbose(t);
1086 :
1087 0 : if (tmpfs_mounted)
1088 0 : (void) umount_verbose(template);
1089 0 : (void) rmdir(template);
1090 0 : return r;
1091 : }
1092 :
1093 0 : static int setup_volatile_overlay(
1094 : const char *directory,
1095 : bool userns, uid_t uid_shift, uid_t uid_range,
1096 : const char *selinux_apifs_context) {
1097 :
1098 0 : _cleanup_free_ char *buf = NULL, *escaped_directory = NULL, *escaped_upper = NULL, *escaped_work = NULL;
1099 0 : char template[] = "/tmp/nspawn-volatile-XXXXXX";
1100 : const char *upper, *work, *options;
1101 0 : bool tmpfs_mounted = false;
1102 : int r;
1103 :
1104 0 : assert(directory);
1105 :
1106 : /* --volatile=overlay means we mount an overlayfs to the root dir. */
1107 :
1108 0 : if (!mkdtemp(template))
1109 0 : return log_error_errno(errno, "Failed to create temporary directory: %m");
1110 :
1111 0 : options = "mode=755";
1112 0 : r = tmpfs_patch_options(options, uid_shift == 0 ? UID_INVALID : uid_shift, selinux_apifs_context, &buf);
1113 0 : if (r < 0)
1114 0 : goto finish;
1115 0 : if (r > 0)
1116 0 : options = buf;
1117 :
1118 0 : r = mount_verbose(LOG_ERR, "tmpfs", template, "tmpfs", MS_STRICTATIME, options);
1119 0 : if (r < 0)
1120 0 : goto finish;
1121 :
1122 0 : tmpfs_mounted = true;
1123 :
1124 0 : upper = strjoina(template, "/upper");
1125 0 : work = strjoina(template, "/work");
1126 :
1127 0 : if (mkdir(upper, 0755) < 0) {
1128 0 : r = log_error_errno(errno, "Failed to create %s: %m", upper);
1129 0 : goto finish;
1130 : }
1131 0 : if (mkdir(work, 0755) < 0) {
1132 0 : r = log_error_errno(errno, "Failed to create %s: %m", work);
1133 0 : goto finish;
1134 : }
1135 :
1136 : /* And now, let's overmount the root dir with an overlayfs that uses the root dir as lower dir. It's kinda nice
1137 : * that the kernel allows us to do that without going through some mount point rearrangements. */
1138 :
1139 0 : escaped_directory = shell_escape(directory, ",:");
1140 0 : escaped_upper = shell_escape(upper, ",:");
1141 0 : escaped_work = shell_escape(work, ",:");
1142 0 : if (!escaped_directory || !escaped_upper || !escaped_work) {
1143 0 : r = -ENOMEM;
1144 0 : goto finish;
1145 : }
1146 :
1147 0 : options = strjoina("lowerdir=", escaped_directory, ",upperdir=", escaped_upper, ",workdir=", escaped_work);
1148 0 : r = mount_verbose(LOG_ERR, "overlay", directory, "overlay", 0, options);
1149 :
1150 0 : finish:
1151 0 : if (tmpfs_mounted)
1152 0 : (void) umount_verbose(template);
1153 :
1154 0 : (void) rmdir(template);
1155 0 : return r;
1156 : }
1157 :
1158 0 : int setup_volatile_mode(
1159 : const char *directory,
1160 : VolatileMode mode,
1161 : bool userns, uid_t uid_shift, uid_t uid_range,
1162 : const char *selinux_apifs_context) {
1163 :
1164 0 : switch (mode) {
1165 :
1166 0 : case VOLATILE_YES:
1167 0 : return setup_volatile_yes(directory, userns, uid_shift, uid_range, selinux_apifs_context);
1168 :
1169 0 : case VOLATILE_STATE:
1170 0 : return setup_volatile_state(directory, userns, uid_shift, uid_range, selinux_apifs_context);
1171 :
1172 0 : case VOLATILE_OVERLAY:
1173 0 : return setup_volatile_overlay(directory, userns, uid_shift, uid_range, selinux_apifs_context);
1174 :
1175 0 : default:
1176 0 : return 0;
1177 : }
1178 : }
1179 :
1180 : /* Expects *pivot_root_new and *pivot_root_old to be initialised to allocated memory or NULL. */
1181 0 : int pivot_root_parse(char **pivot_root_new, char **pivot_root_old, const char *s) {
1182 0 : _cleanup_free_ char *root_new = NULL, *root_old = NULL;
1183 0 : const char *p = s;
1184 : int r;
1185 :
1186 0 : assert(pivot_root_new);
1187 0 : assert(pivot_root_old);
1188 :
1189 0 : r = extract_first_word(&p, &root_new, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
1190 0 : if (r < 0)
1191 0 : return r;
1192 0 : if (r == 0)
1193 0 : return -EINVAL;
1194 :
1195 0 : if (isempty(p))
1196 0 : root_old = NULL;
1197 : else {
1198 0 : root_old = strdup(p);
1199 0 : if (!root_old)
1200 0 : return -ENOMEM;
1201 : }
1202 :
1203 0 : if (!path_is_absolute(root_new))
1204 0 : return -EINVAL;
1205 0 : if (root_old && !path_is_absolute(root_old))
1206 0 : return -EINVAL;
1207 :
1208 0 : free_and_replace(*pivot_root_new, root_new);
1209 0 : free_and_replace(*pivot_root_old, root_old);
1210 :
1211 0 : return 0;
1212 : }
1213 :
1214 0 : int setup_pivot_root(const char *directory, const char *pivot_root_new, const char *pivot_root_old) {
1215 0 : _cleanup_free_ char *directory_pivot_root_new = NULL;
1216 0 : _cleanup_free_ char *pivot_tmp_pivot_root_old = NULL;
1217 0 : char pivot_tmp[] = "/tmp/nspawn-pivot-XXXXXX";
1218 0 : bool remove_pivot_tmp = false;
1219 : int r;
1220 :
1221 0 : assert(directory);
1222 :
1223 0 : if (!pivot_root_new)
1224 0 : return 0;
1225 :
1226 : /* Pivot pivot_root_new to / and the existing / to pivot_root_old.
1227 : * If pivot_root_old is NULL, the existing / disappears.
1228 : * This requires a temporary directory, pivot_tmp, which is
1229 : * not a child of either.
1230 : *
1231 : * This is typically used for OSTree-style containers, where
1232 : * the root partition contains several sysroots which could be
1233 : * run. Normally, one would be chosen by the bootloader and
1234 : * pivoted to / by initramfs.
1235 : *
1236 : * For example, for an OSTree deployment, pivot_root_new
1237 : * would be: /ostree/deploy/$os/deploy/$checksum. Note that this
1238 : * code doesn’t do the /var mount which OSTree expects: use
1239 : * --bind +/sysroot/ostree/deploy/$os/var:/var for that.
1240 : *
1241 : * So in the OSTree case, we’ll end up with something like:
1242 : * - directory = /tmp/nspawn-root-123456
1243 : * - pivot_root_new = /ostree/deploy/os/deploy/123abc
1244 : * - pivot_root_old = /sysroot
1245 : * - directory_pivot_root_new =
1246 : * /tmp/nspawn-root-123456/ostree/deploy/os/deploy/123abc
1247 : * - pivot_tmp = /tmp/nspawn-pivot-123456
1248 : * - pivot_tmp_pivot_root_old = /tmp/nspawn-pivot-123456/sysroot
1249 : *
1250 : * Requires all file systems at directory and below to be mounted
1251 : * MS_PRIVATE or MS_SLAVE so they can be moved.
1252 : */
1253 0 : directory_pivot_root_new = path_join(directory, pivot_root_new);
1254 0 : if (!directory_pivot_root_new)
1255 0 : return log_oom();
1256 :
1257 : /* Remount directory_pivot_root_new to make it movable. */
1258 0 : r = mount_verbose(LOG_ERR, directory_pivot_root_new, directory_pivot_root_new, NULL, MS_BIND, NULL);
1259 0 : if (r < 0)
1260 0 : goto done;
1261 :
1262 0 : if (pivot_root_old) {
1263 0 : if (!mkdtemp(pivot_tmp)) {
1264 0 : r = log_error_errno(errno, "Failed to create temporary directory: %m");
1265 0 : goto done;
1266 : }
1267 :
1268 0 : remove_pivot_tmp = true;
1269 0 : pivot_tmp_pivot_root_old = path_join(pivot_tmp, pivot_root_old);
1270 0 : if (!pivot_tmp_pivot_root_old) {
1271 0 : r = log_oom();
1272 0 : goto done;
1273 : }
1274 :
1275 0 : r = mount_verbose(LOG_ERR, directory_pivot_root_new, pivot_tmp, NULL, MS_MOVE, NULL);
1276 0 : if (r < 0)
1277 0 : goto done;
1278 :
1279 0 : r = mount_verbose(LOG_ERR, directory, pivot_tmp_pivot_root_old, NULL, MS_MOVE, NULL);
1280 0 : if (r < 0)
1281 0 : goto done;
1282 :
1283 0 : r = mount_verbose(LOG_ERR, pivot_tmp, directory, NULL, MS_MOVE, NULL);
1284 0 : if (r < 0)
1285 0 : goto done;
1286 : } else {
1287 0 : r = mount_verbose(LOG_ERR, directory_pivot_root_new, directory, NULL, MS_MOVE, NULL);
1288 0 : if (r < 0)
1289 0 : goto done;
1290 : }
1291 :
1292 0 : done:
1293 0 : if (remove_pivot_tmp)
1294 0 : (void) rmdir(pivot_tmp);
1295 :
1296 0 : return r;
1297 : }
|