Branch data Line data Source code
1 : : /* SPDX-License-Identifier: LGPL-2.1+ */
2 : :
3 : : #if HAVE_BLKID
4 : : #include <blkid.h>
5 : : #endif
6 : : #include <errno.h>
7 : : #include <getopt.h>
8 : : #include <grp.h>
9 : : #include <linux/fs.h>
10 : : #include <linux/loop.h>
11 : : #include <pwd.h>
12 : : #include <sched.h>
13 : : #if HAVE_SELINUX
14 : : #include <selinux/selinux.h>
15 : : #endif
16 : : #include <signal.h>
17 : : #include <stdio.h>
18 : : #include <stdlib.h>
19 : : #include <string.h>
20 : : #include <sys/file.h>
21 : : #include <sys/personality.h>
22 : : #include <sys/prctl.h>
23 : : #include <sys/types.h>
24 : : #include <sys/wait.h>
25 : : #include <unistd.h>
26 : :
27 : : #include "sd-bus.h"
28 : : #include "sd-daemon.h"
29 : : #include "sd-id128.h"
30 : :
31 : : #include "alloc-util.h"
32 : : #include "barrier.h"
33 : : #include "base-filesystem.h"
34 : : #include "blkid-util.h"
35 : : #include "btrfs-util.h"
36 : : #include "bus-error.h"
37 : : #include "bus-util.h"
38 : : #include "cap-list.h"
39 : : #include "capability-util.h"
40 : : #include "cgroup-util.h"
41 : : #include "copy.h"
42 : : #include "cpu-set-util.h"
43 : : #include "dev-setup.h"
44 : : #include "dissect-image.h"
45 : : #include "env-util.h"
46 : : #include "fd-util.h"
47 : : #include "fdset.h"
48 : : #include "fileio.h"
49 : : #include "format-util.h"
50 : : #include "fs-util.h"
51 : : #include "gpt.h"
52 : : #include "hexdecoct.h"
53 : : #include "hostname-util.h"
54 : : #include "id128-util.h"
55 : : #include "log.h"
56 : : #include "loop-util.h"
57 : : #include "loopback-setup.h"
58 : : #include "machine-image.h"
59 : : #include "macro.h"
60 : : #include "main-func.h"
61 : : #include "missing.h"
62 : : #include "mkdir.h"
63 : : #include "mount-util.h"
64 : : #include "mountpoint-util.h"
65 : : #include "namespace-util.h"
66 : : #include "netlink-util.h"
67 : : #include "nspawn-cgroup.h"
68 : : #include "nspawn-def.h"
69 : : #include "nspawn-expose-ports.h"
70 : : #include "nspawn-mount.h"
71 : : #include "nspawn-network.h"
72 : : #include "nspawn-oci.h"
73 : : #include "nspawn-patch-uid.h"
74 : : #include "nspawn-register.h"
75 : : #include "nspawn-seccomp.h"
76 : : #include "nspawn-settings.h"
77 : : #include "nspawn-setuid.h"
78 : : #include "nspawn-stub-pid1.h"
79 : : #include "nulstr-util.h"
80 : : #include "os-util.h"
81 : : #include "pager.h"
82 : : #include "parse-util.h"
83 : : #include "path-util.h"
84 : : #include "pretty-print.h"
85 : : #include "process-util.h"
86 : : #include "ptyfwd.h"
87 : : #include "random-util.h"
88 : : #include "raw-clone.h"
89 : : #include "rlimit-util.h"
90 : : #include "rm-rf.h"
91 : : #if HAVE_SECCOMP
92 : : #include "seccomp-util.h"
93 : : #endif
94 : : #include "selinux-util.h"
95 : : #include "signal-util.h"
96 : : #include "socket-util.h"
97 : : #include "stat-util.h"
98 : : #include "stdio-util.h"
99 : : #include "string-table.h"
100 : : #include "string-util.h"
101 : : #include "strv.h"
102 : : #include "sysctl-util.h"
103 : : #include "terminal-util.h"
104 : : #include "tmpfile-util.h"
105 : : #include "umask-util.h"
106 : : #include "user-util.h"
107 : : #include "util.h"
108 : :
109 : : #if HAVE_SPLIT_USR
110 : : #define STATIC_RESOLV_CONF "/lib/systemd/resolv.conf"
111 : : #else
112 : : #define STATIC_RESOLV_CONF "/usr/lib/systemd/resolv.conf"
113 : : #endif
114 : :
115 : : /* nspawn listens on the socket whose path is given by the constant NSPAWN_NOTIFY_SOCKET_PATH.
116 : : * NSPAWN_NOTIFY_SOCKET_PATH is relative to the container.
117 : : * The init process in the container can send messages to nspawn following the sd_notify(3) protocol. */
118 : : #define NSPAWN_NOTIFY_SOCKET_PATH "/run/systemd/nspawn/notify"
119 : :
120 : : #define EXIT_FORCE_RESTART 133
121 : : /* Status values for a container. Judging by the names they distinguish a final exit from a reboot.
 : : * NOTE(review): the code that produces and consumes these values is outside this chunk — confirm. */
122 : : typedef enum ContainerStatus {
123 : : CONTAINER_TERMINATED,
124 : : CONTAINER_REBOOTED,
125 : : } ContainerStatus;
126 : : /* Global configuration. The values are written by parse_argv() (command line) and by parse_environment()
 : : * and its helpers (environment variables); the initializers below are what applies when nothing
 : : * overrides them. */
127 : : static char *arg_directory = NULL;
128 : : static char *arg_template = NULL;
129 : : static char *arg_chdir = NULL;
130 : : static char *arg_pivot_root_new = NULL;
131 : : static char *arg_pivot_root_old = NULL;
132 : : static char *arg_user = NULL;
133 : : static uid_t arg_uid = UID_INVALID;
134 : : static gid_t arg_gid = GID_INVALID;
135 : : static gid_t* arg_supplementary_gids = NULL;
136 : : static size_t arg_n_supplementary_gids = 0;
137 : : static sd_id128_t arg_uuid = {};
138 : : static char *arg_machine = NULL; /* The name used by the host to refer to this */
139 : : static char *arg_hostname = NULL; /* The name the payload sees by default */
140 : : static const char *arg_selinux_context = NULL;
141 : : static const char *arg_selinux_apifs_context = NULL;
142 : : static char *arg_slice = NULL;
143 : : static bool arg_private_network = false;
144 : : static bool arg_read_only = false;
145 : : static StartMode arg_start_mode = START_PID1;
146 : : static bool arg_ephemeral = false;
147 : : static LinkJournal arg_link_journal = LINK_AUTO;
148 : : static bool arg_link_journal_try = false;
149 : : static uint64_t arg_caps_retain = /* bit mask with one bit per CAP_* number */
150 : : (1ULL << CAP_AUDIT_CONTROL) |
151 : : (1ULL << CAP_AUDIT_WRITE) |
152 : : (1ULL << CAP_CHOWN) |
153 : : (1ULL << CAP_DAC_OVERRIDE) |
154 : : (1ULL << CAP_DAC_READ_SEARCH) |
155 : : (1ULL << CAP_FOWNER) |
156 : : (1ULL << CAP_FSETID) |
157 : : (1ULL << CAP_IPC_OWNER) |
158 : : (1ULL << CAP_KILL) |
159 : : (1ULL << CAP_LEASE) |
160 : : (1ULL << CAP_LINUX_IMMUTABLE) |
161 : : (1ULL << CAP_MKNOD) |
162 : : (1ULL << CAP_NET_BIND_SERVICE) |
163 : : (1ULL << CAP_NET_BROADCAST) |
164 : : (1ULL << CAP_NET_RAW) |
165 : : (1ULL << CAP_SETFCAP) |
166 : : (1ULL << CAP_SETGID) |
167 : : (1ULL << CAP_SETPCAP) |
168 : : (1ULL << CAP_SETUID) |
169 : : (1ULL << CAP_SYS_ADMIN) |
170 : : (1ULL << CAP_SYS_BOOT) |
171 : : (1ULL << CAP_SYS_CHROOT) |
172 : : (1ULL << CAP_SYS_NICE) |
173 : : (1ULL << CAP_SYS_PTRACE) |
174 : : (1ULL << CAP_SYS_RESOURCE) |
175 : : (1ULL << CAP_SYS_TTY_CONFIG);
176 : : static CapabilityQuintet arg_full_capabilities = CAPABILITY_QUINTET_NULL;
177 : : static CustomMount *arg_custom_mounts = NULL;
178 : : static size_t arg_n_custom_mounts = 0;
179 : : static char **arg_setenv = NULL;
180 : : static bool arg_quiet = false;
181 : : static bool arg_register = true;
182 : : static bool arg_keep_unit = false;
183 : : static char **arg_network_interfaces = NULL;
184 : : static char **arg_network_macvlan = NULL;
185 : : static char **arg_network_ipvlan = NULL;
186 : : static bool arg_network_veth = false;
187 : : static char **arg_network_veth_extra = NULL;
188 : : static char *arg_network_bridge = NULL;
189 : : static char *arg_network_zone = NULL;
190 : : static char *arg_network_namespace_path = NULL;
191 : : static PagerFlags arg_pager_flags = 0;
192 : : static unsigned long arg_personality = PERSONALITY_INVALID;
193 : : static char *arg_image = NULL;
194 : : static char *arg_oci_bundle = NULL;
195 : : static VolatileMode arg_volatile_mode = VOLATILE_NO;
196 : : static ExposePort *arg_expose_ports = NULL;
197 : : static char **arg_property = NULL;
198 : : static sd_bus_message *arg_property_message = NULL;
199 : : static UserNamespaceMode arg_userns_mode = USER_NAMESPACE_NO;
200 : : static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U; /* 0x10000 = 65536 */
201 : : static bool arg_userns_chown = false;
202 : : static int arg_kill_signal = 0;
203 : : static CGroupUnified arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_UNKNOWN; /* set by the detect_unified_cgroup_hierarchy_*() helpers */
204 : : static SettingsMask arg_settings_mask = 0; /* SETTING_* bits are OR-ed in when an option or environment variable sets the matching value */
205 : : static int arg_settings_trusted = -1; /* NOTE(review): -1 presumably means "not decided" — confirm */
206 : : static char **arg_parameters = NULL;
207 : : static const char *arg_container_service_name = "systemd-nspawn"; /* replaced by $SYSTEMD_NSPAWN_CONTAINER_SERVICE in parse_environment() */
208 : : static bool arg_notify_ready = false;
209 : : static bool arg_use_cgns = true; /* re-evaluated in parse_environment() */
210 : : static unsigned long arg_clone_ns_flags = CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS; /* adjusted by parse_share_ns_env() */
211 : : static MountSettingsMask arg_mount_settings = MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_TMPFS_TMP; /* adjusted by parse_mount_settings_env() */
212 : : static void *arg_root_hash = NULL;
213 : : static size_t arg_root_hash_size = 0;
214 : : static char **arg_syscall_whitelist = NULL;
215 : : static char **arg_syscall_blacklist = NULL;
216 : : #if HAVE_SECCOMP
217 : : static scmp_filter_ctx arg_seccomp = NULL;
218 : : #endif
219 : : static struct rlimit *arg_rlimit[_RLIMIT_MAX] = {};
220 : : static bool arg_no_new_privileges = false;
221 : : static int arg_oom_score_adjust = 0;
222 : : static bool arg_oom_score_adjust_set = false;
223 : : static CPUSet arg_cpu_set = {};
224 : : static ResolvConfMode arg_resolv_conf = RESOLV_CONF_AUTO;
225 : : static TimezoneMode arg_timezone = TIMEZONE_AUTO;
226 : : static unsigned arg_console_width = (unsigned) -1, arg_console_height = (unsigned) -1; /* NOTE(review): (unsigned) -1 presumably marks "not set" — confirm */
227 : : static DeviceNode* arg_extra_nodes = NULL;
228 : : static size_t arg_n_extra_nodes = 0;
229 : : static char **arg_sysctl = NULL;
230 : : static ConsoleMode arg_console_mode = _CONSOLE_MODE_INVALID;
231 : : /* Pairs each global that owns a resource with the function used to release it.
 : : * NOTE(review): STATIC_DESTRUCTOR_REGISTER() is defined outside this file; presumably the registered
 : : * functions run when the program shuts down — confirm against the macro's definition.
 : : * No registration appears in this list for arg_custom_mounts, arg_expose_ports, arg_rlimit or
 : : * arg_extra_nodes; whether they are released elsewhere is not visible in this chunk. */
232 : 16 : STATIC_DESTRUCTOR_REGISTER(arg_directory, freep);
233 : 16 : STATIC_DESTRUCTOR_REGISTER(arg_template, freep);
234 : 16 : STATIC_DESTRUCTOR_REGISTER(arg_chdir, freep);
235 : 16 : STATIC_DESTRUCTOR_REGISTER(arg_pivot_root_new, freep);
236 : 16 : STATIC_DESTRUCTOR_REGISTER(arg_pivot_root_old, freep);
237 : 16 : STATIC_DESTRUCTOR_REGISTER(arg_user, freep);
238 : 16 : STATIC_DESTRUCTOR_REGISTER(arg_supplementary_gids, freep);
239 : 16 : STATIC_DESTRUCTOR_REGISTER(arg_machine, freep);
240 : 16 : STATIC_DESTRUCTOR_REGISTER(arg_hostname, freep);
241 : 16 : STATIC_DESTRUCTOR_REGISTER(arg_slice, freep);
242 : 16 : STATIC_DESTRUCTOR_REGISTER(arg_setenv, strv_freep);
243 : 16 : STATIC_DESTRUCTOR_REGISTER(arg_network_interfaces, strv_freep);
244 : 16 : STATIC_DESTRUCTOR_REGISTER(arg_network_macvlan, strv_freep);
245 : 16 : STATIC_DESTRUCTOR_REGISTER(arg_network_ipvlan, strv_freep);
246 : 16 : STATIC_DESTRUCTOR_REGISTER(arg_network_veth_extra, strv_freep);
247 : 16 : STATIC_DESTRUCTOR_REGISTER(arg_network_bridge, freep);
248 : 16 : STATIC_DESTRUCTOR_REGISTER(arg_network_zone, freep);
249 : 16 : STATIC_DESTRUCTOR_REGISTER(arg_network_namespace_path, freep);
250 : 16 : STATIC_DESTRUCTOR_REGISTER(arg_image, freep);
251 : 16 : STATIC_DESTRUCTOR_REGISTER(arg_oci_bundle, freep);
252 : 16 : STATIC_DESTRUCTOR_REGISTER(arg_property, strv_freep);
253 : 16 : STATIC_DESTRUCTOR_REGISTER(arg_property_message, sd_bus_message_unrefp);
254 : 16 : STATIC_DESTRUCTOR_REGISTER(arg_parameters, strv_freep);
255 : 16 : STATIC_DESTRUCTOR_REGISTER(arg_root_hash, freep);
256 : 16 : STATIC_DESTRUCTOR_REGISTER(arg_syscall_whitelist, strv_freep);
257 : 16 : STATIC_DESTRUCTOR_REGISTER(arg_syscall_blacklist, strv_freep);
258 : : #if HAVE_SECCOMP
259 : 16 : STATIC_DESTRUCTOR_REGISTER(arg_seccomp, seccomp_releasep);
260 : : #endif
261 : 16 : STATIC_DESTRUCTOR_REGISTER(arg_cpu_set, cpu_set_reset);
262 : 16 : STATIC_DESTRUCTOR_REGISTER(arg_sysctl, strv_freep);
263 : : /* Prints the command line usage summary with printf(), after trying to open a pager with
 : : * arg_pager_flags (the result of pager_open() is deliberately ignored).
 : : * Positional format arguments: %1$s program name, %2$s link built by terminal_urlify_man() for
 : : * systemd-nspawn(1), %3$s/%4$s the ansi_underline()/ansi_normal() sequences around section headings.
 : : * Returns 0 on success, or the result of log_oom() if terminal_urlify_man() fails. */
264 : 12 : static int help(void) {
265 : 12 : _cleanup_free_ char *link = NULL;
266 : : int r;
267 : :
268 : 12 : (void) pager_open(arg_pager_flags);
269 : :
270 : 12 : r = terminal_urlify_man("systemd-nspawn", "1", &link);
271 [ - + ]: 12 : if (r < 0)
272 : 0 : return log_oom();
273 : :
274 : 12 : printf("%1$s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
275 : : "Spawn a command or OS in a light-weight container.\n\n"
276 : : " -h --help Show this help\n"
277 : : " --version Print version string\n"
278 : : " -q --quiet Do not show status information\n"
279 : : " --no-pager Do not pipe output into a pager\n"
280 : : " --settings=BOOLEAN Load additional settings from .nspawn file\n\n"
281 : : "%3$sImage:%4$s\n"
282 : : " -D --directory=PATH Root directory for the container\n"
283 : : " --template=PATH Initialize root directory from template directory,\n"
284 : : " if missing\n"
285 : : " -x --ephemeral Run container with snapshot of root directory, and\n"
286 : : " remove it after exit\n"
287 : : " -i --image=PATH Root file system disk image (or device node) for\n"
288 : : " the container\n"
289 : : " --oci-bundle=PATH OCI bundle directory\n"
290 : : " --read-only Mount the root directory read-only\n"
291 : : " --volatile[=MODE] Run the system in volatile mode\n"
292 : : " --root-hash=HASH Specify verity root hash for root disk image\n"
293 : : " --pivot-root=PATH[:PATH]\n"
294 : : " Pivot root to given directory in the container\n\n"
295 : : "%3$sExecution:%4$s\n"
296 : : " -a --as-pid2 Maintain a stub init as PID1, invoke binary as PID2\n"
297 : : " -b --boot Boot up full system (i.e. invoke init)\n"
298 : : " --chdir=PATH Set working directory in the container\n"
299 : : " -E --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
300 : : " -u --user=USER Run the command under specified user or UID\n"
301 : : " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
302 : : " --notify-ready=BOOLEAN Receive notifications from the child init process\n\n"
303 : : "%3$sSystem Identity:%4$s\n"
304 : : " -M --machine=NAME Set the machine name for the container\n"
305 : : " --hostname=NAME Override the hostname for the container\n"
306 : : " --uuid=UUID Set a specific machine UUID for the container\n\n"
307 : : "%3$sProperties:%4$s\n"
308 : : " -S --slice=SLICE Place the container in the specified slice\n"
309 : : " --property=NAME=VALUE Set scope unit property\n"
310 : : " --register=BOOLEAN Register container as machine\n"
311 : : " --keep-unit Do not register a scope for the machine, reuse\n"
312 : : " the service unit nspawn is running in\n\n"
313 : : "%3$sUser Namespacing:%4$s\n"
314 : : " -U --private-users=pick Run within user namespace, autoselect UID/GID range\n"
315 : : " --private-users[=UIDBASE[:NUIDS]]\n"
316 : : " Similar, but with user configured UID/GID range\n"
317 : : " --private-users-chown Adjust OS tree ownership to private UID/GID range\n\n"
318 : : "%3$sNetworking:%4$s\n"
319 : : " --private-network Disable network in container\n"
320 : : " --network-interface=INTERFACE\n"
321 : : " Assign an existing network interface to the\n"
322 : : " container\n"
323 : : " --network-macvlan=INTERFACE\n"
324 : : " Create a macvlan network interface based on an\n"
325 : : " existing network interface to the container\n"
326 : : " --network-ipvlan=INTERFACE\n"
327 : : " Create an ipvlan network interface based on an\n"
328 : : " existing network interface to the container\n"
329 : : " -n --network-veth Add a virtual Ethernet connection between host\n"
330 : : " and container\n"
331 : : " --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
332 : : " Add an additional virtual Ethernet link between\n"
333 : : " host and container\n"
334 : : " --network-bridge=INTERFACE\n"
335 : : " Add a virtual Ethernet connection to the container\n"
336 : : " and attach it to an existing bridge on the host\n"
337 : : " --network-zone=NAME Similar, but attach the new interface to an\n"
338 : : " automatically managed bridge interface\n"
339 : : " --network-namespace-path=PATH\n"
340 : : " Set network namespace to the one represented by\n"
341 : : " the specified kernel namespace file node\n"
342 : : " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
343 : : " Expose a container IP port on the host\n\n"
344 : : "%3$sSecurity:%4$s\n"
345 : : " --capability=CAP In addition to the default, retain specified\n"
346 : : " capability\n"
347 : : " --drop-capability=CAP Drop the specified capability from the default set\n"
348 : : " --no-new-privileges=BOOLEAN Set PR_SET_NO_NEW_PRIVS flag for container payload\n" /* the option table declares this switch with required_argument */
349 : : " --system-call-filter=LIST|~LIST\n"
350 : : " Permit/prohibit specific system calls\n"
351 : : " -Z --selinux-context=SECLABEL\n"
352 : : " Set the SELinux security context to be used by\n"
353 : : " processes in the container\n"
354 : : " -L --selinux-apifs-context=SECLABEL\n"
355 : : " Set the SELinux security context to be used by\n"
356 : : " API/tmpfs file systems in the container\n\n"
357 : : "%3$sResources:%4$s\n"
358 : : " --rlimit=NAME=LIMIT Set a resource limit for the payload\n"
359 : : " --oom-score-adjust=VALUE\n"
360 : : " Adjust the OOM score value for the payload\n"
361 : : " --cpu-affinity=CPUS Adjust the CPU affinity of the container\n"
362 : : " --personality=ARCH Pick personality for this container\n\n"
363 : : "%3$sIntegration:%4$s\n"
364 : : " --resolv-conf=MODE Select mode of /etc/resolv.conf initialization\n"
365 : : " --timezone=MODE Select mode of /etc/localtime initialization\n"
366 : : " --link-journal=MODE Link up guest journal, one of no, auto, guest, \n"
367 : : " host, try-guest, try-host\n"
368 : : " -j Equivalent to --link-journal=try-guest\n\n"
369 : : "%3$sMounts:%4$s\n"
370 : : " --bind=PATH[:PATH[:OPTIONS]]\n"
371 : : " Bind mount a file or directory from the host into\n"
372 : : " the container\n"
373 : : " --bind-ro=PATH[:PATH[:OPTIONS]]\n"
374 : : " Similar, but creates a read-only bind mount\n"
375 : : " --inaccessible=PATH Over-mount file node with inaccessible node to mask\n"
376 : : " it\n"
377 : : " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
378 : : " --overlay=PATH[:PATH...]:PATH\n"
379 : : " Create an overlay mount from the host to \n"
380 : : " the container\n"
381 : : " --overlay-ro=PATH[:PATH...]:PATH\n"
382 : : " Similar, but creates a read-only overlay mount\n\n"
383 : : "%3$sInput/Output:%4$s\n"
384 : : " --console=MODE Select how stdin/stdout/stderr and /dev/console are\n"
385 : : " set up for the container.\n"
386 : : " -P --pipe Equivalent to --console=pipe\n"
387 : : "\nSee the %2$s for details.\n"
388 : : , program_invocation_short_name
389 : : , link
390 : : , ansi_underline(), ansi_normal());
391 : :
392 : 12 : return 0;
393 : : }
394 : : /* Walks all arg_n_custom_mounts entries of arg_custom_mounts and rejects combinations that clash with
 : : * user namespacing: a custom mount whose destination is "/" while arg_userns_mode is not
 : : * USER_NAMESPACE_NO is refused if arg_userns_chown is set, or else if arg_uid_shift is still UID_INVALID.
 : : * Returns 0 when no entry clashes, otherwise the value of log_error_errno(SYNTHETIC_ERRNO(EINVAL), ...). */
395 : 0 : static int custom_mount_check_all(void) {
396 : : size_t i;
397 : :
398 [ # # ]: 0 : for (i = 0; i < arg_n_custom_mounts; i++) {
399 : 0 : CustomMount *m = &arg_custom_mounts[i];
400 : :
401 [ # # # # ]: 0 : if (path_equal(m->destination, "/") && arg_userns_mode != USER_NAMESPACE_NO) {
402 [ # # ]: 0 : if (arg_userns_chown)
403 [ # # ]: 0 : return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
404 : : "--private-users-chown may not be combined with custom root mounts.");
405 [ # # ]: 0 : else if (arg_uid_shift == UID_INVALID) /* no explicit UID shift was configured */
406 [ # # ]: 0 : return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
407 : : "--private-users with automatic UID shift may not be combined with custom root mounts.");
408 : : }
409 : : }
410 : :
411 : 0 : return 0;
412 : : }
413 : : /* Lets $UNIFIED_CGROUP_HIERARCHY choose the cgroup hierarchy mode. If the variable is set it is parsed
 : : * with parse_boolean(): a true value stores CGROUP_UNIFIED_ALL in arg_unified_cgroup_hierarchy, a false
 : : * value stores CGROUP_UNIFIED_NONE. If the variable is not set the global is left untouched.
 : : * Returns 0 on success or when unset, otherwise the value of log_error_errno() for the parse failure. */
414 : 0 : static int detect_unified_cgroup_hierarchy_from_environment(void) {
415 : : const char *e;
416 : : int r;
417 : :
418 : : /* Allow the user to control whether the unified hierarchy is used */
419 : 0 : e = getenv("UNIFIED_CGROUP_HIERARCHY");
420 [ # # ]: 0 : if (e) {
421 : 0 : r = parse_boolean(e);
422 [ # # ]: 0 : if (r < 0)
423 [ # # ]: 0 : return log_error_errno(r, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
424 [ # # ]: 0 : if (r > 0)
425 : 0 : arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
426 : : else
427 : 0 : arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
428 : : }
429 : :
430 : 0 : return 0;
431 : : }
432 : : /* Chooses arg_unified_cgroup_hierarchy from the host's cgroup setup and the systemd version found
 : : * under `directory`:
 : : * - cg_all_unified() > 0: CGROUP_UNIFIED_ALL if systemd_installation_has_version(directory, 230) > 0,
 : : *   else CGROUP_UNIFIED_NONE;
 : : * - otherwise, cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0: CGROUP_UNIFIED_SYSTEMD if the
 : : *   version check against 233 is > 0, else CGROUP_UNIFIED_NONE;
 : : * - otherwise CGROUP_UNIFIED_NONE.
 : : * The choice is reported with log_debug() as "legacy", "hybrid" or "unified".
 : : * Returns 0 on success, or the value of log_error_errno() when cg_all_unified() or a version check
 : : * returns a negative value. */
433 : 0 : static int detect_unified_cgroup_hierarchy_from_image(const char *directory) {
434 : : int r;
435 : :
436 : : /* Let's inherit the mode to use from the host system, but let's take into consideration what systemd in the
437 : : * image actually supports. */
438 : 0 : r = cg_all_unified();
439 [ # # ]: 0 : if (r < 0)
440 [ # # ]: 0 : return log_error_errno(r, "Failed to determine whether we are in all unified mode.");
441 [ # # ]: 0 : if (r > 0) {
442 : : /* Unified cgroup hierarchy support was added in 230. Unfortunately the detection
443 : : * routine only detects 231, so we'll have a false negative here for 230. */
444 : 0 : r = systemd_installation_has_version(directory, 230);
445 [ # # ]: 0 : if (r < 0)
446 [ # # ]: 0 : return log_error_errno(r, "Failed to determine systemd version in container: %m");
447 [ # # ]: 0 : if (r > 0)
448 : 0 : arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
449 : : else
450 : 0 : arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
451 [ # # ]: 0 : } else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) { /* any result <= 0, including a possible error code, ends up in the final else branch */
452 : : /* Mixed cgroup hierarchy support was added in 233 */
453 : 0 : r = systemd_installation_has_version(directory, 233);
454 [ # # ]: 0 : if (r < 0)
455 [ # # ]: 0 : return log_error_errno(r, "Failed to determine systemd version in container: %m");
456 [ # # ]: 0 : if (r > 0)
457 : 0 : arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
458 : : else
459 : 0 : arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
460 : : } else
461 : 0 : arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
462 : :
463 [ # # # # : 0 : log_debug("Using %s hierarchy for container.",
# # ]
464 : : arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_NONE ? "legacy" :
465 : : arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_SYSTEMD ? "hybrid" : "unified");
466 : :
467 : 0 : return 0;
468 : : }
469 : : /* Applies one boolean environment variable `name` to the namespace bits `ns_flag` in
 : : * arg_clone_ns_flags. If getenv_bool() returns -ENXIO nothing is changed (presumably the variable is
 : : * unset — confirm against getenv_bool()). A true value clears ns_flag from arg_clone_ns_flags; a
 : : * false value, or any other negative result (which is warned about), sets it. In every case except
 : : * -ENXIO, SETTING_CLONE_NS_FLAGS is added to arg_settings_mask. */
470 : 0 : static void parse_share_ns_env(const char *name, unsigned long ns_flag) {
471 : : int r;
472 : :
473 : 0 : r = getenv_bool(name);
474 [ # # ]: 0 : if (r == -ENXIO)
475 : 0 : return;
476 [ # # ]: 0 : if (r < 0)
477 [ # # ]: 0 : log_warning_errno(r, "Failed to parse %s from environment, defaulting to false.", name);
478 : :
479 [ # # ]: 0 : arg_clone_ns_flags = (arg_clone_ns_flags & ~ns_flag) | (r > 0 ? 0 : ns_flag); /* r > 0: drop the bits; r <= 0: (re)set them */
480 : 0 : arg_settings_mask |= SETTING_CLONE_NS_FLAGS;
481 : : }
482 : : /* Adjusts arg_mount_settings from two environment variables (assuming SET_FLAG() sets the given bits
 : : * when its last argument is true and clears them otherwise — the macro is defined outside this file):
 : : * - $SYSTEMD_NSPAWN_TMPFS_TMP, read with getenv_bool(): a valid value switches MOUNT_APPLY_TMPFS_TMP on
 : : *   (true) or off (false); -ENXIO is skipped silently, any other error is warned about and ignored.
 : : * - $SYSTEMD_NSPAWN_API_VFS_WRITABLE: the literal "network" sets both MOUNT_APPLY_APIVFS_RO and
 : : *   MOUNT_APPLY_APIVFS_NETNS; otherwise the value is parsed as a boolean, where false sets
 : : *   MOUNT_APPLY_APIVFS_RO, true clears it, and MOUNT_APPLY_APIVFS_NETNS is cleared either way.
 : : *   An unset variable changes nothing; an unparsable value is warned about and ignored. */
483 : 0 : static void parse_mount_settings_env(void) {
484 : : const char *e;
485 : : int r;
486 : :
487 : 0 : r = getenv_bool("SYSTEMD_NSPAWN_TMPFS_TMP");
488 [ # # ]: 0 : if (r >= 0)
489 [ # # ]: 0 : SET_FLAG(arg_mount_settings, MOUNT_APPLY_TMPFS_TMP, r > 0);
490 [ # # ]: 0 : else if (r != -ENXIO)
491 [ # # ]: 0 : log_warning_errno(r, "Failed to parse $SYSTEMD_NSPAWN_TMPFS_TMP, ignoring: %m");
492 : :
493 : 0 : e = getenv("SYSTEMD_NSPAWN_API_VFS_WRITABLE");
494 [ # # ]: 0 : if (!e)
495 : 0 : return;
496 : :
497 [ # # ]: 0 : if (streq(e, "network")) {
498 : 0 : arg_mount_settings |= MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS;
499 : 0 : return;
500 : : }
501 : :
502 : 0 : r = parse_boolean(e);
503 [ # # ]: 0 : if (r < 0) {
504 [ # # ]: 0 : log_warning_errno(r, "Failed to parse SYSTEMD_NSPAWN_API_VFS_WRITABLE from environment, ignoring.");
505 : 0 : return;
506 : : }
507 : :
508 [ # # ]: 0 : SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_RO, r == 0); /* "writable" false means read-only stays on */
509 : 0 : SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_NETNS, false);
510 : : }
511 : : /* Applies the environment variables handled in this file to the global settings, in this order:
 : : * the four $SYSTEMD_NSPAWN_SHARE_* namespace switches, the mount settings, $SYSTEMD_NSPAWN_USE_CGNS,
 : : * $SYSTEMD_NSPAWN_CONTAINER_SERVICE and finally $UNIFIED_CGROUP_HIERARCHY. Nothing is returned; parse
 : : * problems are only logged by the helpers and by the code below. */
512 : 0 : static void parse_environment(void) {
513 : : const char *e;
514 : : int r;
515 : :
516 : 0 : parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_IPC", CLONE_NEWIPC);
517 : 0 : parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_PID", CLONE_NEWPID);
518 : 0 : parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_UTS", CLONE_NEWUTS);
519 : 0 : parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_SYSTEM", CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS); /* handled last, so it overrides the three single-namespace switches above */
520 : :
521 : 0 : parse_mount_settings_env();
522 : :
523 : : /* SYSTEMD_NSPAWN_USE_CGNS=0 can be used to disable CLONE_NEWCGROUP use,
524 : : * even if it is supported. If not supported, it has no effect. */
525 [ # # ]: 0 : if (!cg_ns_supported())
526 : 0 : arg_use_cgns = false;
527 : : else {
528 : 0 : r = getenv_bool("SYSTEMD_NSPAWN_USE_CGNS");
529 [ # # ]: 0 : if (r < 0) {
530 [ # # ]: 0 : if (r != -ENXIO)
531 [ # # ]: 0 : log_warning_errno(r, "Failed to parse $SYSTEMD_NSPAWN_USE_CGNS, ignoring: %m");
532 : :
533 : 0 : arg_use_cgns = true; /* any negative result, -ENXIO included: use cgroup namespaces, SETTING_USE_CGNS stays unset */
534 : : } else {
535 : 0 : arg_use_cgns = r > 0;
536 : 0 : arg_settings_mask |= SETTING_USE_CGNS;
537 : : }
538 : : }
539 : :
540 : 0 : e = getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
541 [ # # ]: 0 : if (e)
542 : 0 : arg_container_service_name = e; /* stores the pointer returned by getenv(), no copy is made */
543 : :
544 : 0 : detect_unified_cgroup_hierarchy_from_environment(); /* NOTE(review): the int result (parse failure) is discarded here */
545 : 0 : }
546 : :
547 : 16 : static int parse_argv(int argc, char *argv[]) {
548 : : enum {
549 : : ARG_VERSION = 0x100,
550 : : ARG_PRIVATE_NETWORK,
551 : : ARG_UUID,
552 : : ARG_READ_ONLY,
553 : : ARG_CAPABILITY,
554 : : ARG_DROP_CAPABILITY,
555 : : ARG_LINK_JOURNAL,
556 : : ARG_BIND,
557 : : ARG_BIND_RO,
558 : : ARG_TMPFS,
559 : : ARG_OVERLAY,
560 : : ARG_OVERLAY_RO,
561 : : ARG_INACCESSIBLE,
562 : : ARG_SHARE_SYSTEM,
563 : : ARG_REGISTER,
564 : : ARG_KEEP_UNIT,
565 : : ARG_NETWORK_INTERFACE,
566 : : ARG_NETWORK_MACVLAN,
567 : : ARG_NETWORK_IPVLAN,
568 : : ARG_NETWORK_BRIDGE,
569 : : ARG_NETWORK_ZONE,
570 : : ARG_NETWORK_VETH_EXTRA,
571 : : ARG_NETWORK_NAMESPACE_PATH,
572 : : ARG_PERSONALITY,
573 : : ARG_VOLATILE,
574 : : ARG_TEMPLATE,
575 : : ARG_PROPERTY,
576 : : ARG_PRIVATE_USERS,
577 : : ARG_KILL_SIGNAL,
578 : : ARG_SETTINGS,
579 : : ARG_CHDIR,
580 : : ARG_PIVOT_ROOT,
581 : : ARG_PRIVATE_USERS_CHOWN,
582 : : ARG_NOTIFY_READY,
583 : : ARG_ROOT_HASH,
584 : : ARG_SYSTEM_CALL_FILTER,
585 : : ARG_RLIMIT,
586 : : ARG_HOSTNAME,
587 : : ARG_NO_NEW_PRIVILEGES,
588 : : ARG_OOM_SCORE_ADJUST,
589 : : ARG_CPU_AFFINITY,
590 : : ARG_RESOLV_CONF,
591 : : ARG_TIMEZONE,
592 : : ARG_CONSOLE,
593 : : ARG_PIPE,
594 : : ARG_OCI_BUNDLE,
595 : : ARG_NO_PAGER,
596 : : };
597 : :
598 : : static const struct option options[] = {
599 : : { "help", no_argument, NULL, 'h' },
600 : : { "version", no_argument, NULL, ARG_VERSION },
601 : : { "directory", required_argument, NULL, 'D' },
602 : : { "template", required_argument, NULL, ARG_TEMPLATE },
603 : : { "ephemeral", no_argument, NULL, 'x' },
604 : : { "user", required_argument, NULL, 'u' },
605 : : { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
606 : : { "as-pid2", no_argument, NULL, 'a' },
607 : : { "boot", no_argument, NULL, 'b' },
608 : : { "uuid", required_argument, NULL, ARG_UUID },
609 : : { "read-only", no_argument, NULL, ARG_READ_ONLY },
610 : : { "capability", required_argument, NULL, ARG_CAPABILITY },
611 : : { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
612 : : { "no-new-privileges", required_argument, NULL, ARG_NO_NEW_PRIVILEGES },
613 : : { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
614 : : { "bind", required_argument, NULL, ARG_BIND },
615 : : { "bind-ro", required_argument, NULL, ARG_BIND_RO },
616 : : { "tmpfs", required_argument, NULL, ARG_TMPFS },
617 : : { "overlay", required_argument, NULL, ARG_OVERLAY },
618 : : { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
619 : : { "inaccessible", required_argument, NULL, ARG_INACCESSIBLE },
620 : : { "machine", required_argument, NULL, 'M' },
621 : : { "hostname", required_argument, NULL, ARG_HOSTNAME },
622 : : { "slice", required_argument, NULL, 'S' },
623 : : { "setenv", required_argument, NULL, 'E' },
624 : : { "selinux-context", required_argument, NULL, 'Z' },
625 : : { "selinux-apifs-context", required_argument, NULL, 'L' },
626 : : { "quiet", no_argument, NULL, 'q' },
627 : : { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM }, /* not documented */
628 : : { "register", required_argument, NULL, ARG_REGISTER },
629 : : { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
630 : : { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
631 : : { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
632 : : { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
633 : : { "network-veth", no_argument, NULL, 'n' },
634 : : { "network-veth-extra", required_argument, NULL, ARG_NETWORK_VETH_EXTRA },
635 : : { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
636 : : { "network-zone", required_argument, NULL, ARG_NETWORK_ZONE },
637 : : { "network-namespace-path", required_argument, NULL, ARG_NETWORK_NAMESPACE_PATH },
638 : : { "personality", required_argument, NULL, ARG_PERSONALITY },
639 : : { "image", required_argument, NULL, 'i' },
640 : : { "volatile", optional_argument, NULL, ARG_VOLATILE },
641 : : { "port", required_argument, NULL, 'p' },
642 : : { "property", required_argument, NULL, ARG_PROPERTY },
643 : : { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
644 : : { "private-users-chown", optional_argument, NULL, ARG_PRIVATE_USERS_CHOWN },
645 : : { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
646 : : { "settings", required_argument, NULL, ARG_SETTINGS },
647 : : { "chdir", required_argument, NULL, ARG_CHDIR },
648 : : { "pivot-root", required_argument, NULL, ARG_PIVOT_ROOT },
649 : : { "notify-ready", required_argument, NULL, ARG_NOTIFY_READY },
650 : : { "root-hash", required_argument, NULL, ARG_ROOT_HASH },
651 : : { "system-call-filter", required_argument, NULL, ARG_SYSTEM_CALL_FILTER },
652 : : { "rlimit", required_argument, NULL, ARG_RLIMIT },
653 : : { "oom-score-adjust", required_argument, NULL, ARG_OOM_SCORE_ADJUST },
654 : : { "cpu-affinity", required_argument, NULL, ARG_CPU_AFFINITY },
655 : : { "resolv-conf", required_argument, NULL, ARG_RESOLV_CONF },
656 : : { "timezone", required_argument, NULL, ARG_TIMEZONE },
657 : : { "console", required_argument, NULL, ARG_CONSOLE },
658 : : { "pipe", no_argument, NULL, ARG_PIPE },
659 : : { "oci-bundle", required_argument, NULL, ARG_OCI_BUNDLE },
660 : : { "no-pager", no_argument, NULL, ARG_NO_PAGER },
661 : : {}
662 : : };
663 : :
664 : : int c, r;
665 : : const char *p;
666 : 16 : uint64_t plus = 0, minus = 0;
667 : 16 : bool mask_all_settings = false, mask_no_settings = false;
668 : :
669 [ - + ]: 16 : assert(argc >= 0);
670 [ - + ]: 16 : assert(argv);
671 : :
672 [ + - ]: 16 : while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:nUE:P", options, NULL)) >= 0)
673 [ + - - - : 16 : switch (c) {
- - - - -
- - - - -
- - - - -
- - - - -
- - - - -
- - - - -
- - - - -
- - - - -
- - - - -
- - - - -
- - - - -
- - + - ]
674 : :
675 : 12 : case 'h':
676 : 12 : return help();
677 : :
678 : 0 : case ARG_VERSION:
679 : 0 : return version();
680 : :
681 : 0 : case 'D':
682 : 0 : r = parse_path_argument_and_warn(optarg, false, &arg_directory);
683 [ # # ]: 0 : if (r < 0)
684 : 0 : return r;
685 : :
686 : 0 : arg_settings_mask |= SETTING_DIRECTORY;
687 : 0 : break;
688 : :
689 : 0 : case ARG_TEMPLATE:
690 : 0 : r = parse_path_argument_and_warn(optarg, false, &arg_template);
691 [ # # ]: 0 : if (r < 0)
692 : 0 : return r;
693 : :
694 : 0 : arg_settings_mask |= SETTING_DIRECTORY;
695 : 0 : break;
696 : :
697 : 0 : case 'i':
698 : 0 : r = parse_path_argument_and_warn(optarg, false, &arg_image);
699 [ # # ]: 0 : if (r < 0)
700 : 0 : return r;
701 : :
702 : 0 : arg_settings_mask |= SETTING_DIRECTORY;
703 : 0 : break;
704 : :
705 : 0 : case ARG_OCI_BUNDLE:
706 : 0 : r = parse_path_argument_and_warn(optarg, false, &arg_oci_bundle);
707 [ # # ]: 0 : if (r < 0)
708 : 0 : return r;
709 : :
710 : 0 : break;
711 : :
712 : 0 : case 'x':
713 : 0 : arg_ephemeral = true;
714 : 0 : arg_settings_mask |= SETTING_EPHEMERAL;
715 : 0 : break;
716 : :
717 : 0 : case 'u':
718 : 0 : r = free_and_strdup(&arg_user, optarg);
719 [ # # ]: 0 : if (r < 0)
720 : 0 : return log_oom();
721 : :
722 : 0 : arg_settings_mask |= SETTING_USER;
723 : 0 : break;
724 : :
725 : 0 : case ARG_NETWORK_ZONE: {
726 : : char *j;
727 : :
728 : 0 : j = strjoin("vz-", optarg);
729 [ # # ]: 0 : if (!j)
730 : 0 : return log_oom();
731 : :
732 [ # # ]: 0 : if (!ifname_valid(j)) {
733 [ # # ]: 0 : log_error("Network zone name not valid: %s", j);
734 : 0 : free(j);
735 : 0 : return -EINVAL;
736 : : }
737 : :
738 : 0 : free_and_replace(arg_network_zone, j);
739 : :
740 : 0 : arg_network_veth = true;
741 : 0 : arg_private_network = true;
742 : 0 : arg_settings_mask |= SETTING_NETWORK;
743 : 0 : break;
744 : : }
745 : :
746 : 0 : case ARG_NETWORK_BRIDGE:
747 : :
748 [ # # ]: 0 : if (!ifname_valid(optarg))
749 [ # # ]: 0 : return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
750 : : "Bridge interface name not valid: %s", optarg);
751 : :
752 : 0 : r = free_and_strdup(&arg_network_bridge, optarg);
753 [ # # ]: 0 : if (r < 0)
754 : 0 : return log_oom();
755 : :
756 : : _fallthrough_;
757 : : case 'n':
758 : 0 : arg_network_veth = true;
759 : 0 : arg_private_network = true;
760 : 0 : arg_settings_mask |= SETTING_NETWORK;
761 : 0 : break;
762 : :
763 : 0 : case ARG_NETWORK_VETH_EXTRA:
764 : 0 : r = veth_extra_parse(&arg_network_veth_extra, optarg);
765 [ # # ]: 0 : if (r < 0)
766 [ # # ]: 0 : return log_error_errno(r, "Failed to parse --network-veth-extra= parameter: %s", optarg);
767 : :
768 : 0 : arg_private_network = true;
769 : 0 : arg_settings_mask |= SETTING_NETWORK;
770 : 0 : break;
771 : :
772 : 0 : case ARG_NETWORK_INTERFACE:
773 [ # # ]: 0 : if (!ifname_valid(optarg))
774 [ # # ]: 0 : return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
775 : : "Network interface name not valid: %s", optarg);
776 : :
777 [ # # ]: 0 : if (strv_extend(&arg_network_interfaces, optarg) < 0)
778 : 0 : return log_oom();
779 : :
780 : 0 : arg_private_network = true;
781 : 0 : arg_settings_mask |= SETTING_NETWORK;
782 : 0 : break;
783 : :
784 : 0 : case ARG_NETWORK_MACVLAN:
785 : :
786 [ # # ]: 0 : if (!ifname_valid(optarg))
787 [ # # ]: 0 : return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
788 : : "MACVLAN network interface name not valid: %s", optarg);
789 : :
790 [ # # ]: 0 : if (strv_extend(&arg_network_macvlan, optarg) < 0)
791 : 0 : return log_oom();
792 : :
793 : 0 : arg_private_network = true;
794 : 0 : arg_settings_mask |= SETTING_NETWORK;
795 : 0 : break;
796 : :
797 : 0 : case ARG_NETWORK_IPVLAN:
798 : :
799 [ # # ]: 0 : if (!ifname_valid(optarg))
800 [ # # ]: 0 : return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
801 : : "IPVLAN network interface name not valid: %s", optarg);
802 : :
803 [ # # ]: 0 : if (strv_extend(&arg_network_ipvlan, optarg) < 0)
804 : 0 : return log_oom();
805 : :
806 : : _fallthrough_;
807 : : case ARG_PRIVATE_NETWORK:
808 : 0 : arg_private_network = true;
809 : 0 : arg_settings_mask |= SETTING_NETWORK;
810 : 0 : break;
811 : :
812 : 0 : case ARG_NETWORK_NAMESPACE_PATH:
813 : 0 : r = parse_path_argument_and_warn(optarg, false, &arg_network_namespace_path);
814 [ # # ]: 0 : if (r < 0)
815 : 0 : return r;
816 : :
817 : 0 : arg_settings_mask |= SETTING_NETWORK;
818 : 0 : break;
819 : :
820 : 0 : case 'b':
821 [ # # ]: 0 : if (arg_start_mode == START_PID2)
822 [ # # ]: 0 : return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
823 : : "--boot and --as-pid2 may not be combined.");
824 : :
825 : 0 : arg_start_mode = START_BOOT;
826 : 0 : arg_settings_mask |= SETTING_START_MODE;
827 : 0 : break;
828 : :
829 : 0 : case 'a':
830 [ # # ]: 0 : if (arg_start_mode == START_BOOT)
831 [ # # ]: 0 : return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
832 : : "--boot and --as-pid2 may not be combined.");
833 : :
834 : 0 : arg_start_mode = START_PID2;
835 : 0 : arg_settings_mask |= SETTING_START_MODE;
836 : 0 : break;
837 : :
838 : 0 : case ARG_UUID:
839 : 0 : r = sd_id128_from_string(optarg, &arg_uuid);
840 [ # # ]: 0 : if (r < 0)
841 [ # # ]: 0 : return log_error_errno(r, "Invalid UUID: %s", optarg);
842 : :
843 [ # # ]: 0 : if (sd_id128_is_null(arg_uuid))
844 [ # # ]: 0 : return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
845 : : "Machine UUID may not be all zeroes.");
846 : :
847 : 0 : arg_settings_mask |= SETTING_MACHINE_ID;
848 : 0 : break;
849 : :
850 : 0 : case 'S':
851 : 0 : r = free_and_strdup(&arg_slice, optarg);
852 [ # # ]: 0 : if (r < 0)
853 : 0 : return log_oom();
854 : :
855 : 0 : arg_settings_mask |= SETTING_SLICE;
856 : 0 : break;
857 : :
858 : 0 : case 'M':
859 [ # # ]: 0 : if (isempty(optarg))
860 : 0 : arg_machine = mfree(arg_machine);
861 : : else {
862 [ # # ]: 0 : if (!machine_name_is_valid(optarg))
863 [ # # ]: 0 : return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
864 : : "Invalid machine name: %s", optarg);
865 : :
866 : 0 : r = free_and_strdup(&arg_machine, optarg);
867 [ # # ]: 0 : if (r < 0)
868 : 0 : return log_oom();
869 : : }
870 : 0 : break;
871 : :
872 : 0 : case ARG_HOSTNAME:
873 [ # # ]: 0 : if (isempty(optarg))
874 : 0 : arg_hostname = mfree(arg_hostname);
875 : : else {
876 [ # # ]: 0 : if (!hostname_is_valid(optarg, false))
877 [ # # ]: 0 : return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
878 : : "Invalid hostname: %s", optarg);
879 : :
880 : 0 : r = free_and_strdup(&arg_hostname, optarg);
881 [ # # ]: 0 : if (r < 0)
882 : 0 : return log_oom();
883 : : }
884 : :
885 : 0 : arg_settings_mask |= SETTING_HOSTNAME;
886 : 0 : break;
887 : :
888 : 0 : case 'Z':
889 : 0 : arg_selinux_context = optarg;
890 : 0 : break;
891 : :
892 : 0 : case 'L':
893 : 0 : arg_selinux_apifs_context = optarg;
894 : 0 : break;
895 : :
896 : 0 : case ARG_READ_ONLY:
897 : 0 : arg_read_only = true;
898 : 0 : arg_settings_mask |= SETTING_READ_ONLY;
899 : 0 : break;
900 : :
901 : 0 : case ARG_CAPABILITY:
902 : : case ARG_DROP_CAPABILITY: {
903 : 0 : p = optarg;
904 : 0 : for (;;) {
905 [ # # # ]: 0 : _cleanup_free_ char *t = NULL;
906 : :
907 : 0 : r = extract_first_word(&p, &t, ",", 0);
908 [ # # ]: 0 : if (r < 0)
909 [ # # ]: 0 : return log_error_errno(r, "Failed to parse capability %s.", t);
910 [ # # ]: 0 : if (r == 0)
911 : 0 : break;
912 : :
913 [ # # ]: 0 : if (streq(t, "all")) {
914 [ # # ]: 0 : if (c == ARG_CAPABILITY)
915 : 0 : plus = (uint64_t) -1;
916 : : else
917 : 0 : minus = (uint64_t) -1;
918 : : } else {
919 : 0 : r = capability_from_name(t);
920 [ # # ]: 0 : if (r < 0)
921 [ # # ]: 0 : return log_error_errno(r, "Failed to parse capability %s.", t);
922 : :
923 [ # # ]: 0 : if (c == ARG_CAPABILITY)
924 : 0 : plus |= 1ULL << r;
925 : : else
926 : 0 : minus |= 1ULL << r;
927 : : }
928 : : }
929 : :
930 : 0 : arg_settings_mask |= SETTING_CAPABILITY;
931 : 0 : break;
932 : : }
933 : :
934 : 0 : case ARG_NO_NEW_PRIVILEGES:
935 : 0 : r = parse_boolean(optarg);
936 [ # # ]: 0 : if (r < 0)
937 [ # # ]: 0 : return log_error_errno(r, "Failed to parse --no-new-privileges= argument: %s", optarg);
938 : :
939 : 0 : arg_no_new_privileges = r;
940 : 0 : arg_settings_mask |= SETTING_NO_NEW_PRIVILEGES;
941 : 0 : break;
942 : :
943 : 0 : case 'j':
944 : 0 : arg_link_journal = LINK_GUEST;
945 : 0 : arg_link_journal_try = true;
946 : 0 : arg_settings_mask |= SETTING_LINK_JOURNAL;
947 : 0 : break;
948 : :
949 : 0 : case ARG_LINK_JOURNAL:
950 : 0 : r = parse_link_journal(optarg, &arg_link_journal, &arg_link_journal_try);
951 [ # # ]: 0 : if (r < 0)
952 [ # # ]: 0 : return log_error_errno(r, "Failed to parse link journal mode %s", optarg);
953 : :
954 : 0 : arg_settings_mask |= SETTING_LINK_JOURNAL;
955 : 0 : break;
956 : :
957 : 0 : case ARG_BIND:
958 : : case ARG_BIND_RO:
959 : 0 : r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
960 [ # # ]: 0 : if (r < 0)
961 [ # # ]: 0 : return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
962 : :
963 : 0 : arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
964 : 0 : break;
965 : :
966 : 0 : case ARG_TMPFS:
967 : 0 : r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
968 [ # # ]: 0 : if (r < 0)
969 [ # # ]: 0 : return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
970 : :
971 : 0 : arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
972 : 0 : break;
973 : :
974 : 0 : case ARG_OVERLAY:
975 : : case ARG_OVERLAY_RO:
976 : 0 : r = overlay_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_OVERLAY_RO);
977 [ # # ]: 0 : if (r == -EADDRNOTAVAIL)
978 [ # # ]: 0 : return log_error_errno(r, "--overlay(-ro)= needs at least two colon-separated directories specified.");
979 [ # # ]: 0 : if (r < 0)
980 [ # # ]: 0 : return log_error_errno(r, "Failed to parse --overlay(-ro)= argument %s: %m", optarg);
981 : :
982 : 0 : arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
983 : 0 : break;
984 : :
985 : 0 : case ARG_INACCESSIBLE:
986 : 0 : r = inaccessible_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
987 [ # # ]: 0 : if (r < 0)
988 [ # # ]: 0 : return log_error_errno(r, "Failed to parse --inaccessible= argument %s: %m", optarg);
989 : :
990 : 0 : arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
991 : 0 : break;
992 : :
993 : 0 : case 'E': {
994 : : char **n;
995 : :
996 [ # # ]: 0 : if (!env_assignment_is_valid(optarg))
997 [ # # ]: 0 : return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
998 : : "Environment variable assignment '%s' is not valid.", optarg);
999 : :
1000 : 0 : n = strv_env_set(arg_setenv, optarg);
1001 [ # # ]: 0 : if (!n)
1002 : 0 : return log_oom();
1003 : :
1004 : 0 : strv_free_and_replace(arg_setenv, n);
1005 : 0 : arg_settings_mask |= SETTING_ENVIRONMENT;
1006 : 0 : break;
1007 : : }
1008 : :
1009 : 0 : case 'q':
1010 : 0 : arg_quiet = true;
1011 : 0 : break;
1012 : :
1013 : 0 : case ARG_SHARE_SYSTEM:
1014 : : /* We don't officially support this anymore, except for compat reasons. People should use the
1015 : : * $SYSTEMD_NSPAWN_SHARE_* environment variables instead. */
1016 [ # # ]: 0 : log_warning("Please do not use --share-system anymore, use $SYSTEMD_NSPAWN_SHARE_* instead.");
1017 : 0 : arg_clone_ns_flags = 0;
1018 : 0 : break;
1019 : :
1020 : 0 : case ARG_REGISTER:
1021 : 0 : r = parse_boolean(optarg);
1022 [ # # ]: 0 : if (r < 0) {
1023 [ # # ]: 0 : log_error("Failed to parse --register= argument: %s", optarg);
1024 : 0 : return r;
1025 : : }
1026 : :
1027 : 0 : arg_register = r;
1028 : 0 : break;
1029 : :
1030 : 0 : case ARG_KEEP_UNIT:
1031 : 0 : arg_keep_unit = true;
1032 : 0 : break;
1033 : :
1034 : 0 : case ARG_PERSONALITY:
1035 : :
1036 : 0 : arg_personality = personality_from_string(optarg);
1037 [ # # ]: 0 : if (arg_personality == PERSONALITY_INVALID)
1038 [ # # ]: 0 : return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1039 : : "Unknown or unsupported personality '%s'.", optarg);
1040 : :
1041 : 0 : arg_settings_mask |= SETTING_PERSONALITY;
1042 : 0 : break;
1043 : :
1044 : 0 : case ARG_VOLATILE:
1045 : :
1046 [ # # ]: 0 : if (!optarg)
1047 : 0 : arg_volatile_mode = VOLATILE_YES;
1048 [ # # ]: 0 : else if (streq(optarg, "help")) {
1049 [ # # # # ]: 0 : DUMP_STRING_TABLE(volatile_mode, VolatileMode, _VOLATILE_MODE_MAX);
1050 : 0 : return 0;
1051 : : } else {
1052 : : VolatileMode m;
1053 : :
1054 : 0 : m = volatile_mode_from_string(optarg);
1055 [ # # ]: 0 : if (m < 0)
1056 [ # # ]: 0 : return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1057 : : "Failed to parse --volatile= argument: %s", optarg);
1058 : : else
1059 : 0 : arg_volatile_mode = m;
1060 : : }
1061 : :
1062 : 0 : arg_settings_mask |= SETTING_VOLATILE_MODE;
1063 : 0 : break;
1064 : :
1065 : 0 : case 'p':
1066 : 0 : r = expose_port_parse(&arg_expose_ports, optarg);
1067 [ # # ]: 0 : if (r == -EEXIST)
1068 [ # # ]: 0 : return log_error_errno(r, "Duplicate port specification: %s", optarg);
1069 [ # # ]: 0 : if (r < 0)
1070 [ # # ]: 0 : return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
1071 : :
1072 : 0 : arg_settings_mask |= SETTING_EXPOSE_PORTS;
1073 : 0 : break;
1074 : :
1075 : 0 : case ARG_PROPERTY:
1076 [ # # ]: 0 : if (strv_extend(&arg_property, optarg) < 0)
1077 : 0 : return log_oom();
1078 : :
1079 : 0 : break;
1080 : :
1081 : 0 : case ARG_PRIVATE_USERS: {
1082 : 0 : int boolean = -1;
1083 : :
1084 [ # # ]: 0 : if (!optarg)
1085 : 0 : boolean = true;
1086 [ # # ]: 0 : else if (!in_charset(optarg, DIGITS))
1087 : : /* do *not* parse numbers as booleans */
1088 : 0 : boolean = parse_boolean(optarg);
1089 : :
1090 [ # # ]: 0 : if (boolean == false) {
1091 : : /* no: User namespacing off */
1092 : 0 : arg_userns_mode = USER_NAMESPACE_NO;
1093 : 0 : arg_uid_shift = UID_INVALID;
1094 : 0 : arg_uid_range = UINT32_C(0x10000);
1095 [ # # ]: 0 : } else if (boolean == true) {
1096 : : /* yes: User namespacing on, UID range is read from root dir */
1097 : 0 : arg_userns_mode = USER_NAMESPACE_FIXED;
1098 : 0 : arg_uid_shift = UID_INVALID;
1099 : 0 : arg_uid_range = UINT32_C(0x10000);
1100 [ # # ]: 0 : } else if (streq(optarg, "pick")) {
1101 : : /* pick: User namespacing on, UID range is picked randomly */
1102 : 0 : arg_userns_mode = USER_NAMESPACE_PICK;
1103 : 0 : arg_uid_shift = UID_INVALID;
1104 : 0 : arg_uid_range = UINT32_C(0x10000);
1105 : : } else {
1106 [ # # ]: 0 : _cleanup_free_ char *buffer = NULL;
1107 : : const char *range, *shift;
1108 : :
1109 : : /* anything else: User namespacing on, UID range is explicitly configured */
1110 : :
1111 : 0 : range = strchr(optarg, ':');
1112 [ # # ]: 0 : if (range) {
1113 : 0 : buffer = strndup(optarg, range - optarg);
1114 [ # # ]: 0 : if (!buffer)
1115 : 0 : return log_oom();
1116 : 0 : shift = buffer;
1117 : :
1118 : 0 : range++;
1119 : 0 : r = safe_atou32(range, &arg_uid_range);
1120 [ # # ]: 0 : if (r < 0)
1121 [ # # ]: 0 : return log_error_errno(r, "Failed to parse UID range \"%s\": %m", range);
1122 : : } else
1123 : 0 : shift = optarg;
1124 : :
1125 : 0 : r = parse_uid(shift, &arg_uid_shift);
1126 [ # # ]: 0 : if (r < 0)
1127 [ # # ]: 0 : return log_error_errno(r, "Failed to parse UID \"%s\": %m", optarg);
1128 : :
1129 : 0 : arg_userns_mode = USER_NAMESPACE_FIXED;
1130 : : }
1131 : :
1132 [ # # ]: 0 : if (arg_uid_range <= 0)
1133 [ # # ]: 0 : return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1134 : : "UID range cannot be 0.");
1135 : :
1136 : 0 : arg_settings_mask |= SETTING_USERNS;
1137 : 0 : break;
1138 : : }
1139 : :
1140 : 0 : case 'U':
1141 [ # # ]: 0 : if (userns_supported()) {
1142 : 0 : arg_userns_mode = USER_NAMESPACE_PICK;
1143 : 0 : arg_uid_shift = UID_INVALID;
1144 : 0 : arg_uid_range = UINT32_C(0x10000);
1145 : :
1146 : 0 : arg_settings_mask |= SETTING_USERNS;
1147 : : }
1148 : :
1149 : 0 : break;
1150 : :
1151 : 0 : case ARG_PRIVATE_USERS_CHOWN:
1152 : 0 : arg_userns_chown = true;
1153 : :
1154 : 0 : arg_settings_mask |= SETTING_USERNS;
1155 : 0 : break;
1156 : :
1157 : 0 : case ARG_KILL_SIGNAL:
1158 [ # # ]: 0 : if (streq(optarg, "help")) {
1159 [ # # # # ]: 0 : DUMP_STRING_TABLE(signal, int, _NSIG);
1160 : 0 : return 0;
1161 : : }
1162 : :
1163 : 0 : arg_kill_signal = signal_from_string(optarg);
1164 [ # # ]: 0 : if (arg_kill_signal < 0)
1165 [ # # ]: 0 : return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1166 : : "Cannot parse signal: %s", optarg);
1167 : :
1168 : 0 : arg_settings_mask |= SETTING_KILL_SIGNAL;
1169 : 0 : break;
1170 : :
1171 : 0 : case ARG_SETTINGS:
1172 : :
1173 : : /* no → do not read files
1174 : : * yes → read files, do not override cmdline, trust only subset
1175 : : * override → read files, override cmdline, trust only subset
1176 : : * trusted → read files, do not override cmdline, trust all
1177 : : */
1178 : :
1179 : 0 : r = parse_boolean(optarg);
1180 [ # # ]: 0 : if (r < 0) {
1181 [ # # ]: 0 : if (streq(optarg, "trusted")) {
1182 : 0 : mask_all_settings = false;
1183 : 0 : mask_no_settings = false;
1184 : 0 : arg_settings_trusted = true;
1185 : :
1186 [ # # ]: 0 : } else if (streq(optarg, "override")) {
1187 : 0 : mask_all_settings = false;
1188 : 0 : mask_no_settings = true;
1189 : 0 : arg_settings_trusted = -1;
1190 : : } else
1191 [ # # ]: 0 : return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
1192 [ # # ]: 0 : } else if (r > 0) {
1193 : : /* yes */
1194 : 0 : mask_all_settings = false;
1195 : 0 : mask_no_settings = false;
1196 : 0 : arg_settings_trusted = -1;
1197 : : } else {
1198 : : /* no */
1199 : 0 : mask_all_settings = true;
1200 : 0 : mask_no_settings = false;
1201 : 0 : arg_settings_trusted = false;
1202 : : }
1203 : :
1204 : 0 : break;
1205 : :
1206 : 0 : case ARG_CHDIR:
1207 [ # # ]: 0 : if (!path_is_absolute(optarg))
1208 [ # # ]: 0 : return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1209 : : "Working directory %s is not an absolute path.", optarg);
1210 : :
1211 : 0 : r = free_and_strdup(&arg_chdir, optarg);
1212 [ # # ]: 0 : if (r < 0)
1213 : 0 : return log_oom();
1214 : :
1215 : 0 : arg_settings_mask |= SETTING_WORKING_DIRECTORY;
1216 : 0 : break;
1217 : :
1218 : 0 : case ARG_PIVOT_ROOT:
1219 : 0 : r = pivot_root_parse(&arg_pivot_root_new, &arg_pivot_root_old, optarg);
1220 [ # # ]: 0 : if (r < 0)
1221 [ # # ]: 0 : return log_error_errno(r, "Failed to parse --pivot-root= argument %s: %m", optarg);
1222 : :
1223 : 0 : arg_settings_mask |= SETTING_PIVOT_ROOT;
1224 : 0 : break;
1225 : :
1226 : 0 : case ARG_NOTIFY_READY:
1227 : 0 : r = parse_boolean(optarg);
1228 [ # # ]: 0 : if (r < 0)
1229 [ # # ]: 0 : return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1230 : : "%s is not a valid notify mode. Valid modes are: yes, no, and ready.", optarg);
1231 : 0 : arg_notify_ready = r;
1232 : 0 : arg_settings_mask |= SETTING_NOTIFY_READY;
1233 : 0 : break;
1234 : :
1235 : 0 : case ARG_ROOT_HASH: {
1236 : : void *k;
1237 : : size_t l;
1238 : :
1239 : 0 : r = unhexmem(optarg, strlen(optarg), &k, &l);
1240 [ # # ]: 0 : if (r < 0)
1241 [ # # ]: 0 : return log_error_errno(r, "Failed to parse root hash: %s", optarg);
1242 [ # # ]: 0 : if (l < sizeof(sd_id128_t)) {
1243 : 0 : free(k);
1244 [ # # ]: 0 : return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Root hash must be at least 128bit long: %s", optarg);
1245 : : }
1246 : :
1247 : 0 : free(arg_root_hash);
1248 : 0 : arg_root_hash = k;
1249 : 0 : arg_root_hash_size = l;
1250 : 0 : break;
1251 : : }
1252 : :
1253 : 0 : case ARG_SYSTEM_CALL_FILTER: {
1254 : : bool negative;
1255 : : const char *items;
1256 : :
1257 : 0 : negative = optarg[0] == '~';
1258 [ # # ]: 0 : items = negative ? optarg + 1 : optarg;
1259 : :
1260 : 0 : for (;;) {
1261 [ # # # ]: 0 : _cleanup_free_ char *word = NULL;
1262 : :
1263 : 0 : r = extract_first_word(&items, &word, NULL, 0);
1264 [ # # ]: 0 : if (r == 0)
1265 : 0 : break;
1266 [ # # ]: 0 : if (r == -ENOMEM)
1267 : 0 : return log_oom();
1268 [ # # ]: 0 : if (r < 0)
1269 [ # # ]: 0 : return log_error_errno(r, "Failed to parse system call filter: %m");
1270 : :
1271 [ # # ]: 0 : if (negative)
1272 : 0 : r = strv_extend(&arg_syscall_blacklist, word);
1273 : : else
1274 : 0 : r = strv_extend(&arg_syscall_whitelist, word);
1275 [ # # ]: 0 : if (r < 0)
1276 : 0 : return log_oom();
1277 : : }
1278 : :
1279 : 0 : arg_settings_mask |= SETTING_SYSCALL_FILTER;
1280 : 0 : break;
1281 : : }
1282 : :
1283 : 0 : case ARG_RLIMIT: {
1284 : : const char *eq;
1285 [ # # ]: 0 : _cleanup_free_ char *name = NULL;
1286 : : int rl;
1287 : :
1288 [ # # ]: 0 : if (streq(optarg, "help")) {
1289 [ # # # # ]: 0 : DUMP_STRING_TABLE(rlimit, int, _RLIMIT_MAX);
1290 : 0 : return 0;
1291 : : }
1292 : :
1293 : 0 : eq = strchr(optarg, '=');
1294 [ # # ]: 0 : if (!eq)
1295 [ # # ]: 0 : return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1296 : : "--rlimit= expects an '=' assignment.");
1297 : :
1298 : 0 : name = strndup(optarg, eq - optarg);
1299 [ # # ]: 0 : if (!name)
1300 : 0 : return log_oom();
1301 : :
1302 : 0 : rl = rlimit_from_string_harder(name);
1303 [ # # ]: 0 : if (rl < 0)
1304 [ # # ]: 0 : return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1305 : : "Unknown resource limit: %s", name);
1306 : :
1307 [ # # ]: 0 : if (!arg_rlimit[rl]) {
1308 : 0 : arg_rlimit[rl] = new0(struct rlimit, 1);
1309 [ # # ]: 0 : if (!arg_rlimit[rl])
1310 : 0 : return log_oom();
1311 : : }
1312 : :
1313 : 0 : r = rlimit_parse(rl, eq + 1, arg_rlimit[rl]);
1314 [ # # ]: 0 : if (r < 0)
1315 [ # # ]: 0 : return log_error_errno(r, "Failed to parse resource limit: %s", eq + 1);
1316 : :
1317 : 0 : arg_settings_mask |= SETTING_RLIMIT_FIRST << rl;
1318 : 0 : break;
1319 : : }
1320 : :
1321 : 0 : case ARG_OOM_SCORE_ADJUST:
1322 : 0 : r = parse_oom_score_adjust(optarg, &arg_oom_score_adjust);
1323 [ # # ]: 0 : if (r < 0)
1324 [ # # ]: 0 : return log_error_errno(r, "Failed to parse --oom-score-adjust= parameter: %s", optarg);
1325 : :
1326 : 0 : arg_oom_score_adjust_set = true;
1327 : 0 : arg_settings_mask |= SETTING_OOM_SCORE_ADJUST;
1328 : 0 : break;
1329 : :
1330 : 0 : case ARG_CPU_AFFINITY: {
1331 : : CPUSet cpuset;
1332 : :
1333 : 0 : r = parse_cpu_set(optarg, &cpuset);
1334 [ # # ]: 0 : if (r < 0)
1335 [ # # ]: 0 : return log_error_errno(r, "Failed to parse CPU affinity mask %s: %m", optarg);
1336 : :
1337 : 0 : cpu_set_reset(&arg_cpu_set);
1338 : 0 : arg_cpu_set = cpuset;
1339 : 0 : arg_settings_mask |= SETTING_CPU_AFFINITY;
1340 : 0 : break;
1341 : : }
1342 : :
1343 : 0 : case ARG_RESOLV_CONF:
1344 [ # # ]: 0 : if (streq(optarg, "help")) {
1345 [ # # # # ]: 0 : DUMP_STRING_TABLE(resolv_conf_mode, ResolvConfMode, _RESOLV_CONF_MODE_MAX);
1346 : 0 : return 0;
1347 : : }
1348 : :
1349 : 0 : arg_resolv_conf = resolv_conf_mode_from_string(optarg);
1350 [ # # ]: 0 : if (arg_resolv_conf < 0)
1351 [ # # ]: 0 : return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1352 : : "Failed to parse /etc/resolv.conf mode: %s", optarg);
1353 : :
1354 : 0 : arg_settings_mask |= SETTING_RESOLV_CONF;
1355 : 0 : break;
1356 : :
1357 : 0 : case ARG_TIMEZONE:
1358 [ # # ]: 0 : if (streq(optarg, "help")) {
1359 [ # # # # ]: 0 : DUMP_STRING_TABLE(timezone_mode, TimezoneMode, _TIMEZONE_MODE_MAX);
1360 : 0 : return 0;
1361 : : }
1362 : :
1363 : 0 : arg_timezone = timezone_mode_from_string(optarg);
1364 [ # # ]: 0 : if (arg_timezone < 0)
1365 [ # # ]: 0 : return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1366 : : "Failed to parse /etc/localtime mode: %s", optarg);
1367 : :
1368 : 0 : arg_settings_mask |= SETTING_TIMEZONE;
1369 : 0 : break;
1370 : :
1371 : 0 : case ARG_CONSOLE:
1372 [ # # ]: 0 : if (streq(optarg, "interactive"))
1373 : 0 : arg_console_mode = CONSOLE_INTERACTIVE;
1374 [ # # ]: 0 : else if (streq(optarg, "read-only"))
1375 : 0 : arg_console_mode = CONSOLE_READ_ONLY;
1376 [ # # ]: 0 : else if (streq(optarg, "passive"))
1377 : 0 : arg_console_mode = CONSOLE_PASSIVE;
1378 [ # # ]: 0 : else if (streq(optarg, "pipe"))
1379 : 0 : arg_console_mode = CONSOLE_PIPE;
1380 [ # # ]: 0 : else if (streq(optarg, "help"))
1381 : 0 : puts("interactive\n"
1382 : : "read-only\n"
1383 : : "passive\n"
1384 : : "pipe");
1385 : : else
1386 [ # # ]: 0 : return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Unknown console mode: %s", optarg);
1387 : :
1388 : 0 : arg_settings_mask |= SETTING_CONSOLE_MODE;
1389 : 0 : break;
1390 : :
1391 : 0 : case 'P':
1392 : : case ARG_PIPE:
1393 : 0 : arg_console_mode = CONSOLE_PIPE;
1394 : 0 : arg_settings_mask |= SETTING_CONSOLE_MODE;
1395 : 0 : break;
1396 : :
1397 : 0 : case ARG_NO_PAGER:
1398 : 0 : arg_pager_flags |= PAGER_DISABLE;
1399 : 0 : break;
1400 : :
1401 : 4 : case '?':
1402 : 4 : return -EINVAL;
1403 : :
1404 : 0 : default:
1405 : 0 : assert_not_reached("Unhandled option");
1406 : : }
1407 : :
1408 [ # # ]: 0 : if (argc > optind) {
1409 : 0 : strv_free(arg_parameters);
1410 : 0 : arg_parameters = strv_copy(argv + optind);
1411 [ # # ]: 0 : if (!arg_parameters)
1412 : 0 : return log_oom();
1413 : :
1414 : 0 : arg_settings_mask |= SETTING_START_MODE;
1415 : : }
1416 : :
1417 [ # # # # : 0 : if (arg_ephemeral && arg_template && !arg_directory)
# # ]
1418 : : /* User asked for ephemeral execution but specified --template= instead of --directory=. Semantically
1419 : : * such an invocation makes some sense, see https://github.com/systemd/systemd/issues/3667. Let's
1420 : : * accept this here, and silently make "--ephemeral --template=" equivalent to "--ephemeral
1421 : : * --directory=". */
1422 : 0 : arg_directory = TAKE_PTR(arg_template);
1423 : :
1424 [ # # ]: 0 : arg_caps_retain = (arg_caps_retain | plus | (arg_private_network ? UINT64_C(1) << CAP_NET_ADMIN : 0)) & ~minus;
1425 : :
1426 : : /* Make sure to parse environment before we reset the settings mask below */
1427 : 0 : parse_environment();
1428 : :
1429 : : /* Load all settings from .nspawn files */
1430 [ # # ]: 0 : if (mask_no_settings)
1431 : 0 : arg_settings_mask = 0;
1432 : :
1433 : : /* Don't load any settings from .nspawn files */
1434 [ # # ]: 0 : if (mask_all_settings)
1435 : 0 : arg_settings_mask = _SETTINGS_MASK_ALL;
1436 : :
1437 : 0 : return 1;
1438 : : }
1439 : :
1440 : 0 : static int verify_arguments(void) {
1441 : : int r;
1442 : :
1443 [ # # ]: 0 : if (arg_userns_mode != USER_NAMESPACE_NO)
1444 : 0 : arg_mount_settings |= MOUNT_USE_USERNS;
1445 : :
1446 [ # # ]: 0 : if (arg_private_network)
1447 : 0 : arg_mount_settings |= MOUNT_APPLY_APIVFS_NETNS;
1448 : :
1449 [ # # ]: 0 : if (!(arg_clone_ns_flags & CLONE_NEWPID) ||
1450 [ # # ]: 0 : !(arg_clone_ns_flags & CLONE_NEWUTS)) {
1451 : 0 : arg_register = false;
1452 [ # # ]: 0 : if (arg_start_mode != START_PID1)
1453 [ # # ]: 0 : return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--boot cannot be used without namespacing.");
1454 : : }
1455 : :
1456 [ # # ]: 0 : if (arg_userns_mode == USER_NAMESPACE_PICK)
1457 : 0 : arg_userns_chown = true;
1458 : :
1459 [ # # # # ]: 0 : if (arg_start_mode == START_BOOT && arg_kill_signal <= 0)
1460 : 0 : arg_kill_signal = SIGRTMIN+3;
1461 : :
1462 [ # # ]: 0 : if (arg_volatile_mode != VOLATILE_NO) /* Make sure all file systems contained in the image are mounted read-only if we are in volatile mode */
1463 : 0 : arg_read_only = true;
1464 : :
1465 [ # # # # : 0 : if (arg_keep_unit && arg_register && cg_pid_get_owner_uid(0, NULL) >= 0)
# # ]
1466 : : /* Save the user from accidentally registering either user-$SESSION.scope or user@.service.
1467 : : * The latter is not technically a user session, but we don't need to labour the point. */
1468 [ # # ]: 0 : return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--keep-unit --register=yes may not be used when invoked from a user session.");
1469 : :
1470 [ # # # # ]: 0 : if (arg_directory && arg_image)
1471 [ # # ]: 0 : return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--directory= and --image= may not be combined.");
1472 : :
1473 [ # # # # ]: 0 : if (arg_template && arg_image)
1474 [ # # ]: 0 : return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--template= and --image= may not be combined.");
1475 : :
1476 [ # # # # : 0 : if (arg_template && !(arg_directory || arg_machine))
# # ]
1477 [ # # ]: 0 : return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--template= needs --directory= or --machine=.");
1478 : :
1479 [ # # # # ]: 0 : if (arg_ephemeral && arg_template)
1480 [ # # ]: 0 : return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--ephemeral and --template= may not be combined.");
1481 : :
1482 [ # # # # : 0 : if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO))
# # ]
1483 [ # # ]: 0 : return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--ephemeral and --link-journal= may not be combined.");
1484 : :
1485 [ # # # # ]: 0 : if (arg_userns_mode != USER_NAMESPACE_NO && !userns_supported())
1486 [ # # ]: 0 : return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "--private-users= is not supported, kernel compiled without user namespace support.");
1487 : :
1488 [ # # # # ]: 0 : if (arg_userns_chown && arg_read_only)
1489 [ # # ]: 0 : return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1490 : : "--read-only and --private-users-chown may not be combined.");
1491 : :
1492 : : /* We don't support --private-users-chown together with any of the volatile modes since we couldn't
1493 : : * change the read-only part of the tree (i.e. /usr) anyway, or because it would trigger a massive
1494 : : * copy-up (in case of overlay) making the entire exercise pointless. */
1495 [ # # # # ]: 0 : if (arg_userns_chown && arg_volatile_mode != VOLATILE_NO)
1496 [ # # ]: 0 : return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--volatile= and --private-users-chown may not be combined.");
1497 : :
1498 : : /* If --network-namespace-path is given with any other network-related option, we need to error out,
1499 : : * to avoid conflicts between different network options. */
1500 [ # # ]: 0 : if (arg_network_namespace_path &&
1501 [ # # # # : 0 : (arg_network_interfaces || arg_network_macvlan ||
# # ]
1502 [ # # # # ]: 0 : arg_network_ipvlan || arg_network_veth_extra ||
1503 [ # # # # ]: 0 : arg_network_bridge || arg_network_zone ||
1504 [ # # ]: 0 : arg_network_veth || arg_private_network))
1505 [ # # ]: 0 : return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--network-namespace-path= cannot be combined with other network options.");
1506 : :
1507 [ # # # # ]: 0 : if (arg_network_bridge && arg_network_zone)
1508 [ # # ]: 0 : return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1509 : : "--network-bridge= and --network-zone= may not be combined.");
1510 : :
1511 [ # # # # : 0 : if (arg_userns_mode != USER_NAMESPACE_NO && (arg_mount_settings & MOUNT_APPLY_APIVFS_NETNS) && !arg_private_network)
# # ]
1512 [ # # ]: 0 : return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid namespacing settings. Mounting sysfs with --private-users requires --private-network.");
1513 : :
1514 [ # # # # ]: 0 : if (arg_userns_mode != USER_NAMESPACE_NO && !(arg_mount_settings & MOUNT_APPLY_APIVFS_RO))
1515 [ # # ]: 0 : return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot combine --private-users with read-write mounts.");
1516 : :
1517 [ # # # # ]: 0 : if (arg_expose_ports && !arg_private_network)
1518 [ # # ]: 0 : return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot use --port= without private networking.");
1519 : :
1520 : : #if ! HAVE_LIBIPTC
1521 : : if (arg_expose_ports)
1522 : : return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "--port= is not supported, compiled without libiptc support.");
1523 : : #endif
1524 : :
1525 : 0 : r = custom_mount_check_all();
1526 [ # # ]: 0 : if (r < 0)
1527 : 0 : return r;
1528 : :
1529 : 0 : return 0;
1530 : : }
1531 : :
1532 : 0 : static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
1533 [ # # ]: 0 : assert(p);
1534 : :
1535 [ # # ]: 0 : if (arg_userns_mode == USER_NAMESPACE_NO)
1536 : 0 : return 0;
1537 : :
1538 [ # # # # ]: 0 : if (uid == UID_INVALID && gid == GID_INVALID)
1539 : 0 : return 0;
1540 : :
1541 [ # # ]: 0 : if (uid != UID_INVALID) {
1542 : 0 : uid += arg_uid_shift;
1543 : :
1544 [ # # # # ]: 0 : if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
1545 : 0 : return -EOVERFLOW;
1546 : : }
1547 : :
1548 [ # # ]: 0 : if (gid != GID_INVALID) {
1549 : 0 : gid += (gid_t) arg_uid_shift;
1550 : :
1551 [ # # # # ]: 0 : if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1552 : 0 : return -EOVERFLOW;
1553 : : }
1554 : :
1555 [ # # ]: 0 : if (lchown(p, uid, gid) < 0)
1556 : 0 : return -errno;
1557 : :
1558 : 0 : return 0;
1559 : : }
1560 : :
/* Creates the directory 'path' below 'root' with the given mode and, if it was newly created, hands
 * it to userns_lchown() for the specified UID/GID.
 *
 * A directory that already exists is left untouched (no ownership change) and reported as success.
 * Returns 0 on success or a negative errno-style value on failure. */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *target;
        int k;

        /* Kept as a plain assignment statement, exactly like the original use of prefix_roota() */
        target = prefix_roota(root, path);

        k = mkdir_errno_wrapper(target, mode);
        if (k >= 0) {
                return userns_lchown(target, uid, gid);
        }

        /* Pre-existing directory: fine, but don't touch its ownership */
        return k == -EEXIST ? 0 : k;
}
1574 : :
/* Matches 'path' against the two zoneinfo directory prefixes ("../usr/share/zoneinfo/" and
 * "/usr/share/zoneinfo/") and returns whatever PATH_STARTSWITH_SET() yields for them. The caller in
 * setup_timezone() treats a NULL result as "does not point into zoneinfo" and otherwise uses the
 * result as the timezone name. */
static const char *timezone_from_path(const char *path) {
        const char *zone;

        zone = PATH_STARTSWITH_SET(path, "../usr/share/zoneinfo/", "/usr/share/zoneinfo/");

        return zone;
}
1581 : :
1582 : 0 : static bool etc_writable(void) {
1583 [ # # # # : 0 : return !arg_read_only || IN_SET(arg_volatile_mode, VOLATILE_YES, VOLATILE_OVERLAY);
# # ]
1584 : : }
1585 : :
/* Configures /etc/localtime below the container tree 'dest', according to arg_timezone.
 *
 * For TIMEZONE_AUTO and TIMEZONE_SYMLINK the host's /etc/localtime symlink is read first; in auto mode the
 * effective mode 'm' is then derived from the readlink result and from etc_writable(). The switch below
 * then deletes, symlinks, bind mounts or copies the file. The symlink case falls through to the bind mount
 * case if the zone cannot be resolved inside the container, and the bind mount case falls through to the
 * copy case if the bind mount fails.
 *
 * Almost every failure is only logged and 0 is returned. The single result that is handed through to the
 * caller is that of the read-only remount following a successful bind mount. */
static int setup_timezone(const char *dest) {
        _cleanup_free_ char *p = NULL, *etc = NULL;
        const char *where, *check;
        TimezoneMode m;
        int r;

        assert(dest);

        /* Pick the effective mode. 'p' receives the target of the host's /etc/localtime symlink, if any. */
        if (IN_SET(arg_timezone, TIMEZONE_AUTO, TIMEZONE_SYMLINK)) {
                r = readlink_malloc("/etc/localtime", &p);
                if (r == -ENOENT && arg_timezone == TIMEZONE_AUTO)
                        m = etc_writable() ? TIMEZONE_DELETE : TIMEZONE_OFF;
                else if (r == -EINVAL && arg_timezone == TIMEZONE_AUTO) /* regular file? */
                        m = etc_writable() ? TIMEZONE_COPY : TIMEZONE_BIND;
                else if (r < 0) {
                        log_warning_errno(r, "Failed to read host's /etc/localtime symlink, not updating container timezone: %m");
                        /* To handle warning, delete /etc/localtime and replace it with a symbolic link to a time zone data
                         * file.
                         *
                         * Example:
                         * ln -s /usr/share/zoneinfo/UTC /etc/localtime
                         */
                        return 0;
                } else if (arg_timezone == TIMEZONE_AUTO)
                        m = etc_writable() ? TIMEZONE_SYMLINK : TIMEZONE_BIND;
                else
                        m = arg_timezone;
        } else
                m = arg_timezone;

        if (m == TIMEZONE_OFF)
                return 0;

        /* Resolve the container's /etc relative to 'dest'; 'where' is the localtime file inside it. */
        r = chase_symlinks("/etc", dest, CHASE_PREFIX_ROOT, &etc);
        if (r < 0) {
                log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
                return 0;
        }

        where = strjoina(etc, "/localtime");

        switch (m) {

        case TIMEZONE_DELETE:
                if (unlink(where) < 0)
                        log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, "Failed to remove '%s', ignoring: %m", where);

                return 0;

        case TIMEZONE_SYMLINK: {
                _cleanup_free_ char *q = NULL;
                const char *z, *what;

                /* 'z' is the zone name taken from the host's symlink target */
                z = timezone_from_path(p);
                if (!z) {
                        log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
                        return 0;
                }

                r = readlink_malloc(where, &q);
                if (r >= 0 && streq_ptr(timezone_from_path(q), z))
                        return 0; /* Already pointing to the right place? Then do nothing .. */

                /* Only create the symlink if the zone file can be resolved inside the container tree */
                check = strjoina(dest, "/usr/share/zoneinfo/", z);
                r = chase_symlinks(check, dest, 0, NULL);
                if (r < 0)
                        log_debug_errno(r, "Timezone %s does not exist (or is not accessible) in container, not creating symlink: %m", z);
                else {
                        if (unlink(where) < 0 && errno != ENOENT) {
                                log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING, /* Don't complain on read-only images */
                                               errno, "Failed to remove existing timezone info %s in container, ignoring: %m", where);
                                return 0;
                        }

                        what = strjoina("../usr/share/zoneinfo/", z);
                        if (symlink(what, where) < 0) {
                                log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING,
                                               errno, "Failed to correct timezone of container, ignoring: %m");
                                return 0;
                        }

                        /* Symlink created, continue with the ownership fix-up after the switch */
                        break;
                }

                /* Zone not available in the container: try a bind mount instead */
                _fallthrough_;
        }

        case TIMEZONE_BIND: {
                _cleanup_free_ char *resolved = NULL;
                int found;

                found = chase_symlinks(where, dest, CHASE_NONEXISTENT, &resolved);
                if (found < 0) {
                        log_warning_errno(found, "Failed to resolve /etc/localtime path in container, ignoring: %m");
                        return 0;
                }

                if (found == 0) /* missing? */
                        (void) touch(resolved);

                /* Bind mount the host file, then remount it read-only. The remount result is returned as is. */
                r = mount_verbose(LOG_WARNING, "/etc/localtime", resolved, NULL, MS_BIND, NULL);
                if (r >= 0)
                        return mount_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);

                _fallthrough_;
        }

        case TIMEZONE_COPY:
                /* If mounting failed, try to copy */
                r = copy_file_atomic("/etc/localtime", where, 0644, 0, 0, COPY_REFLINK|COPY_REPLACE);
                if (r < 0) {
                        log_full_errno(IN_SET(r, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to copy /etc/localtime to %s, ignoring: %m", where);
                        return 0;
                }

                break;

        default:
                assert_not_reached("unexpected mode");
        }

        /* Fix permissions of the symlink or file copy we just created */
        r = userns_lchown(where, 0, 0);
        if (r < 0)
                log_warning_errno(r, "Failed to chown /etc/localtime, ignoring: %m");

        return 0;
}
1715 : :
/* Checks via access(F_OK) whether 'path' exists. Returns 1 if it does, 0 if access() fails with ENOENT, and
 * otherwise the result of log_debug_errno() for the access() error. */
static int have_resolv_conf(const char *path) {
        assert(path);

        if (access(path, F_OK) >= 0)
                return 1;

        if (errno != ENOENT)
                return log_debug_errno(errno, "Failed to determine whether '%s' is available: %m", path);

        return 0;
}
1728 : :
1729 : 0 : static int resolved_listening(void) {
1730 : 0 : _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
1731 : 0 : _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
1732 : 0 : _cleanup_free_ char *dns_stub_listener_mode = NULL;
1733 : : int r;
1734 : :
1735 : : /* Check if resolved is listening */
1736 : :
1737 : 0 : r = sd_bus_open_system(&bus);
1738 [ # # ]: 0 : if (r < 0)
1739 [ # # ]: 0 : return log_debug_errno(r, "Failed to open system bus: %m");
1740 : :
1741 : 0 : r = bus_name_has_owner(bus, "org.freedesktop.resolve1", NULL);
1742 [ # # ]: 0 : if (r < 0)
1743 [ # # ]: 0 : return log_debug_errno(r, "Failed to check whether the 'org.freedesktop.resolve1' bus name is taken: %m");
1744 [ # # ]: 0 : if (r == 0)
1745 : 0 : return 0;
1746 : :
1747 : 0 : r = sd_bus_get_property_string(bus,
1748 : : "org.freedesktop.resolve1",
1749 : : "/org/freedesktop/resolve1",
1750 : : "org.freedesktop.resolve1.Manager",
1751 : : "DNSStubListener",
1752 : : &error,
1753 : : &dns_stub_listener_mode);
1754 [ # # ]: 0 : if (r < 0)
1755 [ # # ]: 0 : return log_debug_errno(r, "Failed to query DNSStubListener property: %s", bus_error_message(&error, r));
1756 : :
1757 : 0 : return STR_IN_SET(dns_stub_listener_mode, "udp", "yes");
1758 : : }
1759 : :
1760 : 0 : static int setup_resolv_conf(const char *dest) {
1761 : 0 : _cleanup_free_ char *etc = NULL;
1762 : : const char *where, *what;
1763 : : ResolvConfMode m;
1764 : : int r;
1765 : :
1766 [ # # ]: 0 : assert(dest);
1767 : :
1768 [ # # ]: 0 : if (arg_resolv_conf == RESOLV_CONF_AUTO) {
1769 [ # # ]: 0 : if (arg_private_network)
1770 : 0 : m = RESOLV_CONF_OFF;
1771 [ # # # # ]: 0 : else if (have_resolv_conf(STATIC_RESOLV_CONF) > 0 && resolved_listening() > 0)
1772 [ # # ]: 0 : m = etc_writable() ? RESOLV_CONF_COPY_STATIC : RESOLV_CONF_BIND_STATIC;
1773 [ # # ]: 0 : else if (have_resolv_conf("/etc/resolv.conf") > 0)
1774 [ # # ]: 0 : m = etc_writable() ? RESOLV_CONF_COPY_HOST : RESOLV_CONF_BIND_HOST;
1775 : : else
1776 [ # # ]: 0 : m = etc_writable() ? RESOLV_CONF_DELETE : RESOLV_CONF_OFF;
1777 : : } else
1778 : 0 : m = arg_resolv_conf;
1779 : :
1780 [ # # ]: 0 : if (m == RESOLV_CONF_OFF)
1781 : 0 : return 0;
1782 : :
1783 : 0 : r = chase_symlinks("/etc", dest, CHASE_PREFIX_ROOT, &etc);
1784 [ # # ]: 0 : if (r < 0) {
1785 [ # # ]: 0 : log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
1786 : 0 : return 0;
1787 : : }
1788 : :
1789 [ # # # # : 0 : where = strjoina(etc, "/resolv.conf");
# # # # #
# # # ]
1790 : :
1791 [ # # ]: 0 : if (m == RESOLV_CONF_DELETE) {
1792 [ # # ]: 0 : if (unlink(where) < 0)
1793 [ # # # # ]: 0 : log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, "Failed to remove '%s', ignoring: %m", where);
1794 : :
1795 : 0 : return 0;
1796 : : }
1797 : :
1798 [ # # # # ]: 0 : if (IN_SET(m, RESOLV_CONF_BIND_STATIC, RESOLV_CONF_COPY_STATIC))
1799 : 0 : what = STATIC_RESOLV_CONF;
1800 : : else
1801 : 0 : what = "/etc/resolv.conf";
1802 : :
1803 [ # # # # ]: 0 : if (IN_SET(m, RESOLV_CONF_BIND_HOST, RESOLV_CONF_BIND_STATIC)) {
1804 [ # # ]: 0 : _cleanup_free_ char *resolved = NULL;
1805 : : int found;
1806 : :
1807 : 0 : found = chase_symlinks(where, dest, CHASE_NONEXISTENT, &resolved);
1808 [ # # ]: 0 : if (found < 0) {
1809 [ # # ]: 0 : log_warning_errno(found, "Failed to resolve /etc/resolv.conf path in container, ignoring: %m");
1810 : 0 : return 0;
1811 : : }
1812 : :
1813 [ # # ]: 0 : if (found == 0) /* missing? */
1814 : 0 : (void) touch(resolved);
1815 : :
1816 : 0 : r = mount_verbose(LOG_WARNING, what, resolved, NULL, MS_BIND, NULL);
1817 [ # # ]: 0 : if (r >= 0)
1818 : 0 : return mount_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
1819 : : }
1820 : :
1821 : : /* If that didn't work, let's copy the file */
1822 : 0 : r = copy_file(what, where, O_TRUNC|O_NOFOLLOW, 0644, 0, 0, COPY_REFLINK);
1823 [ # # ]: 0 : if (r < 0) {
1824 : : /* If the file already exists as symlink, let's suppress the warning, under the assumption that
1825 : : * resolved or something similar runs inside and the symlink points there.
1826 : : *
1827 : : * If the disk image is read-only, there's also no point in complaining.
1828 : : */
1829 [ # # # # : 0 : log_full_errno(!IN_SET(RESOLV_CONF_COPY_HOST, RESOLV_CONF_COPY_STATIC) && IN_SET(r, -ELOOP, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
# # # # ]
1830 : : "Failed to copy /etc/resolv.conf to %s, ignoring: %m", where);
1831 : 0 : return 0;
1832 : : }
1833 : :
1834 : 0 : r = userns_lchown(where, 0, 0);
1835 [ # # ]: 0 : if (r < 0)
1836 [ # # ]: 0 : log_warning_errno(r, "Failed to chown /etc/resolv.conf, ignoring: %m");
1837 : :
1838 : 0 : return 0;
1839 : : }
1840 : :
1841 : 0 : static int setup_boot_id(void) {
1842 : 0 : _cleanup_(unlink_and_freep) char *from = NULL;
1843 : 0 : _cleanup_free_ char *path = NULL;
1844 : 0 : sd_id128_t rnd = SD_ID128_NULL;
1845 : : const char *to;
1846 : : int r;
1847 : :
1848 : : /* Generate a new randomized boot ID, so that each boot-up of the container gets a new one */
1849 : :
1850 : 0 : r = tempfn_random_child("/run", "proc-sys-kernel-random-boot-id", &path);
1851 [ # # ]: 0 : if (r < 0)
1852 [ # # ]: 0 : return log_error_errno(r, "Failed to generate random boot ID path: %m");
1853 : :
1854 : 0 : r = sd_id128_randomize(&rnd);
1855 [ # # ]: 0 : if (r < 0)
1856 [ # # ]: 0 : return log_error_errno(r, "Failed to generate random boot id: %m");
1857 : :
1858 : 0 : r = id128_write(path, ID128_UUID, rnd, false);
1859 [ # # ]: 0 : if (r < 0)
1860 [ # # ]: 0 : return log_error_errno(r, "Failed to write boot id: %m");
1861 : :
1862 : 0 : from = TAKE_PTR(path);
1863 : 0 : to = "/proc/sys/kernel/random/boot_id";
1864 : :
1865 : 0 : r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
1866 [ # # ]: 0 : if (r < 0)
1867 : 0 : return r;
1868 : :
1869 : 0 : return mount_verbose(LOG_ERR, NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
1870 : : }
1871 : :
1872 : 0 : static int copy_devnodes(const char *dest) {
1873 : : static const char devnodes[] =
1874 : : "null\0"
1875 : : "zero\0"
1876 : : "full\0"
1877 : : "random\0"
1878 : : "urandom\0"
1879 : : "tty\0"
1880 : : "net/tun\0";
1881 : :
1882 : 0 : _cleanup_umask_ mode_t u;
1883 : : const char *d;
1884 : 0 : int r = 0;
1885 : :
1886 [ # # ]: 0 : assert(dest);
1887 : :
1888 : 0 : u = umask(0000);
1889 : :
1890 : : /* Create /dev/net, so that we can create /dev/net/tun in it */
1891 [ # # ]: 0 : if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
1892 [ # # ]: 0 : return log_error_errno(r, "Failed to create /dev/net directory: %m");
1893 : :
1894 [ # # # # ]: 0 : NULSTR_FOREACH(d, devnodes) {
1895 [ # # # # ]: 0 : _cleanup_free_ char *from = NULL, *to = NULL;
1896 : : struct stat st;
1897 : :
1898 : 0 : from = path_join("/dev/", d);
1899 [ # # ]: 0 : if (!from)
1900 : 0 : return log_oom();
1901 : :
1902 : 0 : to = path_join(dest, from);
1903 [ # # ]: 0 : if (!to)
1904 : 0 : return log_oom();
1905 : :
1906 [ # # ]: 0 : if (stat(from, &st) < 0) {
1907 : :
1908 [ # # ]: 0 : if (errno != ENOENT)
1909 [ # # ]: 0 : return log_error_errno(errno, "Failed to stat %s: %m", from);
1910 : :
1911 [ # # # # ]: 0 : } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode))
1912 [ # # ]: 0 : return log_error_errno(SYNTHETIC_ERRNO(EIO),
1913 : : "%s is not a char or block device, cannot copy.", from);
1914 : : else {
1915 [ # # # # : 0 : _cleanup_free_ char *sl = NULL, *prefixed = NULL, *dn = NULL, *t = NULL;
# # # # ]
1916 : :
1917 [ # # ]: 0 : if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1918 : : /* Explicitly warn the user when /dev is already populated. */
1919 [ # # ]: 0 : if (errno == EEXIST)
1920 [ # # ]: 0 : log_notice("%s/dev is pre-mounted and pre-populated. If a pre-mounted /dev is provided it needs to be an unpopulated file system.", dest);
1921 [ # # ]: 0 : if (errno != EPERM)
1922 [ # # ]: 0 : return log_error_errno(errno, "mknod(%s) failed: %m", to);
1923 : :
1924 : : /* Some systems abusively restrict mknod but allow bind mounts. */
1925 : 0 : r = touch(to);
1926 [ # # ]: 0 : if (r < 0)
1927 [ # # ]: 0 : return log_error_errno(r, "touch (%s) failed: %m", to);
1928 : 0 : r = mount_verbose(LOG_DEBUG, from, to, NULL, MS_BIND, NULL);
1929 [ # # ]: 0 : if (r < 0)
1930 [ # # ]: 0 : return log_error_errno(r, "Both mknod and bind mount (%s) failed: %m", to);
1931 : : }
1932 : :
1933 : 0 : r = userns_lchown(to, 0, 0);
1934 [ # # ]: 0 : if (r < 0)
1935 [ # # ]: 0 : return log_error_errno(r, "chown() of device node %s failed: %m", to);
1936 : :
1937 [ # # ]: 0 : dn = path_join("/dev", S_ISCHR(st.st_mode) ? "char" : "block");
1938 [ # # ]: 0 : if (!dn)
1939 : 0 : return log_oom();
1940 : :
1941 : 0 : r = userns_mkdir(dest, dn, 0755, 0, 0);
1942 [ # # ]: 0 : if (r < 0)
1943 [ # # ]: 0 : return log_error_errno(r, "Failed to create '%s': %m", dn);
1944 : :
1945 [ # # ]: 0 : if (asprintf(&sl, "%s/%u:%u", dn, major(st.st_rdev), minor(st.st_rdev)) < 0)
1946 : 0 : return log_oom();
1947 : :
1948 : 0 : prefixed = path_join(dest, sl);
1949 [ # # ]: 0 : if (!prefixed)
1950 : 0 : return log_oom();
1951 : :
1952 : 0 : t = path_join("..", d);
1953 [ # # ]: 0 : if (!t)
1954 : 0 : return log_oom();
1955 : :
1956 [ # # ]: 0 : if (symlink(t, prefixed) < 0)
1957 [ # # ]: 0 : log_debug_errno(errno, "Failed to symlink '%s' to '%s': %m", t, prefixed);
1958 : : }
1959 : : }
1960 : :
1961 : 0 : return r;
1962 : : }
1963 : :
1964 : 0 : static int make_extra_nodes(const char *dest) {
1965 : 0 : _cleanup_umask_ mode_t u;
1966 : : size_t i;
1967 : : int r;
1968 : :
1969 : 0 : u = umask(0000);
1970 : :
1971 [ # # ]: 0 : for (i = 0; i < arg_n_extra_nodes; i++) {
1972 [ # # ]: 0 : _cleanup_free_ char *path = NULL;
1973 : 0 : DeviceNode *n = arg_extra_nodes + i;
1974 : :
1975 : 0 : path = path_join(dest, n->path);
1976 [ # # ]: 0 : if (!path)
1977 : 0 : return log_oom();
1978 : :
1979 [ # # # # : 0 : if (mknod(path, n->mode, S_ISCHR(n->mode) || S_ISBLK(n->mode) ? makedev(n->major, n->minor) : 0) < 0)
# # ]
1980 [ # # ]: 0 : return log_error_errno(errno, "Failed to create device node '%s': %m", path);
1981 : :
1982 : 0 : r = chmod_and_chown(path, n->mode, n->uid, n->gid);
1983 [ # # ]: 0 : if (r < 0)
1984 [ # # ]: 0 : return log_error_errno(r, "Failed to adjust device node ownership of '%s': %m", path);
1985 : : }
1986 : :
1987 : 0 : return 0;
1988 : : }
1989 : :
1990 : 0 : static int setup_pts(const char *dest) {
1991 : 0 : _cleanup_free_ char *options = NULL;
1992 : : const char *p;
1993 : : int r;
1994 : :
1995 : : #if HAVE_SELINUX
1996 [ # # ]: 0 : if (arg_selinux_apifs_context)
1997 : 0 : (void) asprintf(&options,
1998 : : "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
1999 : : arg_uid_shift + TTY_GID,
2000 : : arg_selinux_apifs_context);
2001 : : else
2002 : : #endif
2003 : 0 : (void) asprintf(&options,
2004 : : "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
2005 : : arg_uid_shift + TTY_GID);
2006 : :
2007 [ # # ]: 0 : if (!options)
2008 : 0 : return log_oom();
2009 : :
2010 : : /* Mount /dev/pts itself */
2011 [ # # # # : 0 : p = prefix_roota(dest, "/dev/pts");
# # # # #
# # # # #
# # ]
2012 : 0 : r = mkdir_errno_wrapper(p, 0755);
2013 [ # # ]: 0 : if (r < 0)
2014 [ # # ]: 0 : return log_error_errno(r, "Failed to create /dev/pts: %m");
2015 : :
2016 : 0 : r = mount_verbose(LOG_ERR, "devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options);
2017 [ # # ]: 0 : if (r < 0)
2018 : 0 : return r;
2019 : 0 : r = userns_lchown(p, 0, 0);
2020 [ # # ]: 0 : if (r < 0)
2021 [ # # ]: 0 : return log_error_errno(r, "Failed to chown /dev/pts: %m");
2022 : :
2023 : : /* Create /dev/ptmx symlink */
2024 [ # # # # : 0 : p = prefix_roota(dest, "/dev/ptmx");
# # # # #
# # # # #
# # ]
2025 [ # # ]: 0 : if (symlink("pts/ptmx", p) < 0)
2026 [ # # ]: 0 : return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
2027 : 0 : r = userns_lchown(p, 0, 0);
2028 [ # # ]: 0 : if (r < 0)
2029 [ # # ]: 0 : return log_error_errno(r, "Failed to chown /dev/ptmx: %m");
2030 : :
2031 : : /* And fix /dev/pts/ptmx ownership */
2032 [ # # # # : 0 : p = prefix_roota(dest, "/dev/pts/ptmx");
# # # # #
# # # # #
# # ]
2033 : 0 : r = userns_lchown(p, 0, 0);
2034 [ # # ]: 0 : if (r < 0)
2035 [ # # ]: 0 : return log_error_errno(r, "Failed to chown /dev/pts/ptmx: %m");
2036 : :
2037 : 0 : return 0;
2038 : : }
2039 : :
/* Opens /dev/console and installs it as stdin, stdout and stderr, after duplicating the current console
 * for logging via log_dup_console(). Returns 0 on success, a negative error otherwise. */
static int setup_stdio_as_dev_console(void) {
        int console_fd, r;

        console_fd = open_terminal("/dev/console", O_RDWR);
        if (console_fd < 0)
                return log_error_errno(console_fd, "Failed to open console: %m");

        /* Keep logging to the original stderr possible, even after stderr is redirected below */
        r = log_dup_console();
        if (r < 0)
                return log_error_errno(r, "Failed to duplicate stderr: %m");

        /* Note: the fd is consumed by this call, regardless of whether it succeeds */
        r = rearrange_stdio(console_fd, console_fd, console_fd);
        if (r < 0)
                return log_error_errno(r, "Failed to move console to stdin/stdout/stderr: %m");

        return 0;
}
2061 : :
2062 : 0 : static int setup_dev_console(const char *console) {
2063 : 0 : _cleanup_free_ char *p = NULL;
2064 : : int r;
2065 : :
2066 : : /* Create /dev/console symlink */
2067 : 0 : r = path_make_relative("/dev", console, &p);
2068 [ # # ]: 0 : if (r < 0)
2069 [ # # ]: 0 : return log_error_errno(r, "Failed to create relative path: %m");
2070 : :
2071 [ # # ]: 0 : if (symlink(p, "/dev/console") < 0)
2072 [ # # ]: 0 : return log_error_errno(errno, "Failed to create /dev/console symlink: %m");
2073 : :
2074 : 0 : return 0;
2075 : : }
2076 : :
2077 : 0 : static int setup_keyring(void) {
2078 : : key_serial_t keyring;
2079 : :
2080 : : /* Allocate a new session keyring for the container. This makes sure the keyring of the session systemd-nspawn
2081 : : * was invoked from doesn't leak into the container. Note that by default we block keyctl() and request_key()
2082 : : * anyway via seccomp so doing this operation isn't strictly necessary, but in case people explicitly whitelist
2083 : : * these system calls let's make sure we don't leak anything into the container. */
2084 : :
2085 : 0 : keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
2086 [ # # ]: 0 : if (keyring == -1) {
2087 [ # # ]: 0 : if (errno == ENOSYS)
2088 [ # # ]: 0 : log_debug_errno(errno, "Kernel keyring not supported, ignoring.");
2089 [ # # # # ]: 0 : else if (IN_SET(errno, EACCES, EPERM))
2090 [ # # ]: 0 : log_debug_errno(errno, "Kernel keyring access prohibited, ignoring.");
2091 : : else
2092 [ # # ]: 0 : return log_error_errno(errno, "Setting up kernel keyring failed: %m");
2093 : : }
2094 : :
2095 : 0 : return 0;
2096 : : }
2097 : :
2098 : 0 : static int setup_kmsg(int kmsg_socket) {
2099 : 0 : _cleanup_(unlink_and_freep) char *from = NULL;
2100 : 0 : _cleanup_free_ char *fifo = NULL;
2101 : 0 : _cleanup_close_ int fd = -1;
2102 : 0 : _cleanup_umask_ mode_t u;
2103 : : int r;
2104 : :
2105 [ # # ]: 0 : assert(kmsg_socket >= 0);
2106 : :
2107 : 0 : u = umask(0000);
2108 : :
2109 : : /* We create the kmsg FIFO as as temporary file in /run, but immediately delete it after bind mounting it to
2110 : : * /proc/kmsg. While FIFOs on the reading side behave very similar to /proc/kmsg, their writing side behaves
2111 : : * differently from /dev/kmsg in that writing blocks when nothing is reading. In order to avoid any problems
2112 : : * with containers deadlocking due to this we simply make /dev/kmsg unavailable to the container. */
2113 : :
2114 : 0 : r = tempfn_random_child("/run", "proc-kmsg", &fifo);
2115 [ # # ]: 0 : if (r < 0)
2116 [ # # ]: 0 : return log_error_errno(r, "Failed to generate kmsg path: %m");
2117 : :
2118 [ # # ]: 0 : if (mkfifo(fifo, 0600) < 0)
2119 [ # # ]: 0 : return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
2120 : :
2121 : 0 : from = TAKE_PTR(fifo);
2122 : :
2123 : 0 : r = mount_verbose(LOG_ERR, from, "/proc/kmsg", NULL, MS_BIND, NULL);
2124 [ # # ]: 0 : if (r < 0)
2125 : 0 : return r;
2126 : :
2127 : 0 : fd = open(from, O_RDWR|O_NONBLOCK|O_CLOEXEC);
2128 [ # # ]: 0 : if (fd < 0)
2129 [ # # ]: 0 : return log_error_errno(errno, "Failed to open fifo: %m");
2130 : :
2131 : : /* Store away the fd in the socket, so that it stays open as long as we run the child */
2132 : 0 : r = send_one_fd(kmsg_socket, fd, 0);
2133 [ # # ]: 0 : if (r < 0)
2134 [ # # ]: 0 : return log_error_errno(r, "Failed to send FIFO fd: %m");
2135 : :
2136 : 0 : return 0;
2137 : : }
2138 : :
/* Netlink message callback: re-runs expose_port_execute() for arg_expose_ports. 'userdata' is used as a
 * pointer to the union in_addr_union that is handed to expose_port_execute() as third argument. The result
 * of that call is not checked; the callback always returns 0. */
static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
        union in_addr_union *exposed = userdata;

        assert(rtnl);
        assert(m);
        assert(exposed);

        expose_port_execute(rtnl, arg_expose_ports, exposed);
        return 0;
}
2149 : :
2150 : 0 : static int setup_hostname(void) {
2151 : : int r;
2152 : :
2153 [ # # ]: 0 : if ((arg_clone_ns_flags & CLONE_NEWUTS) == 0)
2154 : 0 : return 0;
2155 : :
2156 [ # # ]: 0 : r = sethostname_idempotent(arg_hostname ?: arg_machine);
2157 [ # # ]: 0 : if (r < 0)
2158 [ # # ]: 0 : return log_error_errno(r, "Failed to set hostname: %m");
2159 : :
2160 : 0 : return 0;
2161 : : }
2162 : :
2163 : 0 : static int setup_journal(const char *directory) {
2164 : 0 : _cleanup_free_ char *d = NULL;
2165 : : const char *dirname, *p, *q;
2166 : : sd_id128_t this_id;
2167 : : char id[33];
2168 : : bool try;
2169 : : int r;
2170 : :
2171 : : /* Don't link journals in ephemeral mode */
2172 [ # # ]: 0 : if (arg_ephemeral)
2173 : 0 : return 0;
2174 : :
2175 [ # # ]: 0 : if (arg_link_journal == LINK_NO)
2176 : 0 : return 0;
2177 : :
2178 [ # # # # ]: 0 : try = arg_link_journal_try || arg_link_journal == LINK_AUTO;
2179 : :
2180 : 0 : r = sd_id128_get_machine(&this_id);
2181 [ # # ]: 0 : if (r < 0)
2182 [ # # ]: 0 : return log_error_errno(r, "Failed to retrieve machine ID: %m");
2183 : :
2184 [ # # ]: 0 : if (sd_id128_equal(arg_uuid, this_id)) {
2185 [ # # # # ]: 0 : log_full(try ? LOG_WARNING : LOG_ERR,
2186 : : "Host and machine ids are equal (%s): refusing to link journals", sd_id128_to_string(arg_uuid, id));
2187 [ # # ]: 0 : if (try)
2188 : 0 : return 0;
2189 : 0 : return -EEXIST;
2190 : : }
2191 : :
2192 [ # # ]: 0 : FOREACH_STRING(dirname, "/var", "/var/log", "/var/log/journal") {
2193 : 0 : r = userns_mkdir(directory, dirname, 0755, 0, 0);
2194 [ # # ]: 0 : if (r < 0) {
2195 [ # # # # ]: 0 : bool ignore = r == -EROFS && try;
2196 [ # # # # : 0 : log_full_errno(ignore ? LOG_DEBUG : LOG_ERR, r,
# # ]
2197 : : "Failed to create %s%s: %m", dirname, ignore ? ", ignoring" : "");
2198 [ # # ]: 0 : return ignore ? 0 : r;
2199 : : }
2200 : : }
2201 : :
2202 : 0 : (void) sd_id128_to_string(arg_uuid, id);
2203 : :
2204 [ # # # # : 0 : p = strjoina("/var/log/journal/", id);
# # # # #
# # # ]
2205 [ # # # # : 0 : q = prefix_roota(directory, p);
# # # # #
# # # # #
# # ]
2206 : :
2207 [ # # ]: 0 : if (path_is_mount_point(p, NULL, 0) > 0) {
2208 [ # # ]: 0 : if (try)
2209 : 0 : return 0;
2210 : :
2211 [ # # ]: 0 : return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
2212 : : "%s: already a mount point, refusing to use for journal", p);
2213 : : }
2214 : :
2215 [ # # ]: 0 : if (path_is_mount_point(q, NULL, 0) > 0) {
2216 [ # # ]: 0 : if (try)
2217 : 0 : return 0;
2218 : :
2219 [ # # ]: 0 : return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
2220 : : "%s: already a mount point, refusing to use for journal", q);
2221 : : }
2222 : :
2223 : 0 : r = readlink_and_make_absolute(p, &d);
2224 [ # # ]: 0 : if (r >= 0) {
2225 [ # # # # ]: 0 : if (IN_SET(arg_link_journal, LINK_GUEST, LINK_AUTO) &&
2226 [ # # ]: 0 : path_equal(d, q)) {
2227 : :
2228 : 0 : r = userns_mkdir(directory, p, 0755, 0, 0);
2229 [ # # ]: 0 : if (r < 0)
2230 [ # # ]: 0 : log_warning_errno(r, "Failed to create directory %s: %m", q);
2231 : 0 : return 0;
2232 : : }
2233 : :
2234 [ # # ]: 0 : if (unlink(p) < 0)
2235 [ # # ]: 0 : return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
2236 [ # # ]: 0 : } else if (r == -EINVAL) {
2237 : :
2238 [ # # # # ]: 0 : if (arg_link_journal == LINK_GUEST &&
2239 : 0 : rmdir(p) < 0) {
2240 : :
2241 [ # # ]: 0 : if (errno == ENOTDIR) {
2242 [ # # ]: 0 : log_error("%s already exists and is neither a symlink nor a directory", p);
2243 : 0 : return r;
2244 : : } else
2245 [ # # ]: 0 : return log_error_errno(errno, "Failed to remove %s: %m", p);
2246 : : }
2247 [ # # ]: 0 : } else if (r != -ENOENT)
2248 [ # # ]: 0 : return log_error_errno(r, "readlink(%s) failed: %m", p);
2249 : :
2250 [ # # ]: 0 : if (arg_link_journal == LINK_GUEST) {
2251 : :
2252 [ # # ]: 0 : if (symlink(q, p) < 0) {
2253 [ # # ]: 0 : if (try) {
2254 [ # # ]: 0 : log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
2255 : 0 : return 0;
2256 : : } else
2257 [ # # ]: 0 : return log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
2258 : : }
2259 : :
2260 : 0 : r = userns_mkdir(directory, p, 0755, 0, 0);
2261 [ # # ]: 0 : if (r < 0)
2262 [ # # ]: 0 : log_warning_errno(r, "Failed to create directory %s: %m", q);
2263 : 0 : return 0;
2264 : : }
2265 : :
2266 [ # # ]: 0 : if (arg_link_journal == LINK_HOST) {
2267 : : /* don't create parents here — if the host doesn't have
2268 : : * permanent journal set up, don't force it here */
2269 : :
2270 : 0 : r = mkdir_errno_wrapper(p, 0755);
2271 [ # # # # ]: 0 : if (r < 0 && r != -EEXIST) {
2272 [ # # ]: 0 : if (try) {
2273 [ # # ]: 0 : log_debug_errno(r, "Failed to create %s, skipping journal setup: %m", p);
2274 : 0 : return 0;
2275 : : } else
2276 [ # # ]: 0 : return log_error_errno(r, "Failed to create %s: %m", p);
2277 : : }
2278 : :
2279 [ # # ]: 0 : } else if (access(p, F_OK) < 0)
2280 : 0 : return 0;
2281 : :
2282 [ # # ]: 0 : if (dir_is_empty(q) == 0)
2283 [ # # ]: 0 : log_warning("%s is not empty, proceeding anyway.", q);
2284 : :
2285 : 0 : r = userns_mkdir(directory, p, 0755, 0, 0);
2286 [ # # ]: 0 : if (r < 0)
2287 [ # # ]: 0 : return log_error_errno(r, "Failed to create %s: %m", q);
2288 : :
2289 : 0 : r = mount_verbose(LOG_DEBUG, p, q, NULL, MS_BIND, NULL);
2290 [ # # ]: 0 : if (r < 0)
2291 [ # # ]: 0 : return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
2292 : :
2293 : 0 : return 0;
2294 : : }
2295 : :
2296 : 0 : static int drop_capabilities(uid_t uid) {
2297 : : CapabilityQuintet q;
2298 : :
2299 : : /* Let's initialize all five capability sets to something valid. If the quintet was configured via
2300 : : * OCI use that, but fill in missing bits. If it wasn't then derive the quintet in full from
2301 : : * arg_caps_retain. */
2302 : :
2303 [ # # ]: 0 : if (capability_quintet_is_set(&arg_full_capabilities)) {
2304 : 0 : q = arg_full_capabilities;
2305 : :
2306 [ # # ]: 0 : if (q.bounding == (uint64_t) -1)
2307 [ # # ]: 0 : q.bounding = uid == 0 ? arg_caps_retain : 0;
2308 : :
2309 [ # # ]: 0 : if (q.effective == (uint64_t) -1)
2310 [ # # ]: 0 : q.effective = uid == 0 ? q.bounding : 0;
2311 : :
2312 [ # # ]: 0 : if (q.inheritable == (uint64_t) -1)
2313 [ # # ]: 0 : q.inheritable = uid == 0 ? q.bounding : 0;
2314 : :
2315 [ # # ]: 0 : if (q.permitted == (uint64_t) -1)
2316 [ # # ]: 0 : q.permitted = uid == 0 ? q.bounding : 0;
2317 : :
2318 [ # # # # ]: 0 : if (q.ambient == (uint64_t) -1 && ambient_capabilities_supported())
2319 : 0 : q.ambient = 0;
2320 : :
2321 [ # # ]: 0 : if (capability_quintet_mangle(&q))
2322 [ # # ]: 0 : return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Cannot set capabilities that are not in the current bounding set.");
2323 : :
2324 : : } else {
2325 : 0 : q = (CapabilityQuintet) {
2326 : : .bounding = arg_caps_retain,
2327 [ # # ]: 0 : .effective = uid == 0 ? arg_caps_retain : 0,
2328 [ # # ]: 0 : .inheritable = uid == 0 ? arg_caps_retain : 0,
2329 [ # # ]: 0 : .permitted = uid == 0 ? arg_caps_retain : 0,
2330 [ # # ]: 0 : .ambient = ambient_capabilities_supported() ? 0 : (uint64_t) -1,
2331 : : };
2332 : :
2333 : : /* If we're not using OCI, proceed with mangled capabilities (so we don't error out)
2334 : : * in order to maintain the same behavior as systemd < 242. */
2335 [ # # ]: 0 : if (capability_quintet_mangle(&q))
2336 [ # # ]: 0 : log_warning("Some capabilities will not be set because they are not in the current bounding set.");
2337 : :
2338 : : }
2339 : :
2340 : 0 : return capability_quintet_enforce(&q);
2341 : : }
2342 : :
2343 : 0 : static int reset_audit_loginuid(void) {
2344 : 0 : _cleanup_free_ char *p = NULL;
2345 : : int r;
2346 : :
2347 [ # # ]: 0 : if ((arg_clone_ns_flags & CLONE_NEWPID) == 0)
2348 : 0 : return 0;
2349 : :
2350 : 0 : r = read_one_line_file("/proc/self/loginuid", &p);
2351 [ # # ]: 0 : if (r == -ENOENT)
2352 : 0 : return 0;
2353 [ # # ]: 0 : if (r < 0)
2354 [ # # ]: 0 : return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
2355 : :
2356 : : /* Already reset? */
2357 [ # # ]: 0 : if (streq(p, "4294967295"))
2358 : 0 : return 0;
2359 : :
2360 : 0 : r = write_string_file("/proc/self/loginuid", "4294967295", WRITE_STRING_FILE_DISABLE_BUFFER);
2361 [ # # ]: 0 : if (r < 0) {
2362 [ # # ]: 0 : log_error_errno(r,
2363 : : "Failed to reset audit login UID. This probably means that your kernel is too\n"
2364 : : "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2365 : : "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2366 : : "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2367 : : "using systemd-nspawn. Sleeping for 5s... (%m)");
2368 : :
2369 : 0 : sleep(5);
2370 : : }
2371 : :
2372 : 0 : return 0;
2373 : : }
2374 : :
2375 : 0 : static int setup_propagate(const char *root) {
2376 : : const char *p, *q;
2377 : : int r;
2378 : :
2379 : 0 : (void) mkdir_p("/run/systemd/nspawn/", 0755);
2380 : 0 : (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
2381 [ # # # # : 0 : p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
# # # # #
# # # ]
2382 : 0 : (void) mkdir_p(p, 0600);
2383 : :
2384 : 0 : r = userns_mkdir(root, "/run/systemd", 0755, 0, 0);
2385 [ # # ]: 0 : if (r < 0)
2386 [ # # ]: 0 : return log_error_errno(r, "Failed to create /run/systemd: %m");
2387 : :
2388 : 0 : r = userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0);
2389 [ # # ]: 0 : if (r < 0)
2390 [ # # ]: 0 : return log_error_errno(r, "Failed to create /run/systemd/nspawn: %m");
2391 : :
2392 : 0 : r = userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0);
2393 [ # # ]: 0 : if (r < 0)
2394 [ # # ]: 0 : return log_error_errno(r, "Failed to create /run/systemd/nspawn/incoming: %m");
2395 : :
2396 [ # # # # : 0 : q = prefix_roota(root, "/run/systemd/nspawn/incoming");
# # # # #
# # # # #
# # ]
2397 : 0 : r = mount_verbose(LOG_ERR, p, q, NULL, MS_BIND, NULL);
2398 [ # # ]: 0 : if (r < 0)
2399 : 0 : return r;
2400 : :
2401 : 0 : r = mount_verbose(LOG_ERR, NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
2402 [ # # ]: 0 : if (r < 0)
2403 : 0 : return r;
2404 : :
2405 : : /* machined will MS_MOVE into that directory, and that's only
2406 : : * supported for non-shared mounts. */
2407 : 0 : return mount_verbose(LOG_ERR, NULL, q, NULL, MS_SLAVE, NULL);
2408 : : }
2409 : :
2410 : 0 : static int setup_machine_id(const char *directory) {
2411 : : const char *etc_machine_id;
2412 : : sd_id128_t id;
2413 : : int r;
2414 : :
2415 : : /* If the UUID in the container is already set, then that's what counts, and we use. If it isn't set, and the
2416 : : * caller passed --uuid=, then we'll pass it in the $container_uuid env var to PID 1 of the container. The
2417 : : * assumption is that PID 1 will then write it to /etc/machine-id to make it persistent. If --uuid= is not
2418 : : * passed we generate a random UUID, and pass it via $container_uuid. In effect this means that /etc/machine-id
2419 : : * in the container and our idea of the container UUID will always be in sync (at least if PID 1 in the
2420 : : * container behaves nicely). */
2421 : :
2422 [ # # # # : 0 : etc_machine_id = prefix_roota(directory, "/etc/machine-id");
# # # # #
# # # # #
# # ]
2423 : :
2424 : 0 : r = id128_read(etc_machine_id, ID128_PLAIN, &id);
2425 [ # # ]: 0 : if (r < 0) {
2426 [ # # # # ]: 0 : if (!IN_SET(r, -ENOENT, -ENOMEDIUM)) /* If the file is missing or empty, we don't mind */
2427 [ # # ]: 0 : return log_error_errno(r, "Failed to read machine ID from container image: %m");
2428 : :
2429 [ # # ]: 0 : if (sd_id128_is_null(arg_uuid)) {
2430 : 0 : r = sd_id128_randomize(&arg_uuid);
2431 [ # # ]: 0 : if (r < 0)
2432 [ # # ]: 0 : return log_error_errno(r, "Failed to acquire randomized machine UUID: %m");
2433 : : }
2434 : : } else {
2435 [ # # ]: 0 : if (sd_id128_is_null(id))
2436 [ # # ]: 0 : return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
2437 : : "Machine ID in container image is zero, refusing.");
2438 : :
2439 : 0 : arg_uuid = id;
2440 : : }
2441 : :
2442 : 0 : return 0;
2443 : : }
2444 : :
2445 : 0 : static int recursive_chown(const char *directory, uid_t shift, uid_t range) {
2446 : : int r;
2447 : :
2448 [ # # ]: 0 : assert(directory);
2449 : :
2450 [ # # # # ]: 0 : if (arg_userns_mode == USER_NAMESPACE_NO || !arg_userns_chown)
2451 : 0 : return 0;
2452 : :
2453 : 0 : r = path_patch_uid(directory, arg_uid_shift, arg_uid_range);
2454 [ # # ]: 0 : if (r == -EOPNOTSUPP)
2455 [ # # ]: 0 : return log_error_errno(r, "Automatic UID/GID adjusting is only supported for UID/GID ranges starting at multiples of 2^16 with a range of 2^16.");
2456 [ # # ]: 0 : if (r == -EBADE)
2457 [ # # ]: 0 : return log_error_errno(r, "Upper 16 bits of root directory UID and GID do not match.");
2458 [ # # ]: 0 : if (r < 0)
2459 [ # # ]: 0 : return log_error_errno(r, "Failed to adjust UID/GID shift of OS tree: %m");
2460 [ # # ]: 0 : if (r == 0)
2461 [ # # ]: 0 : log_debug("Root directory of image is already owned by the right UID/GID range, skipping recursive chown operation.");
2462 : : else
2463 [ # # ]: 0 : log_debug("Patched directory tree to match UID/GID range.");
2464 : :
2465 : 0 : return r;
2466 : : }
2467 : :
2468 : : /*
2469 : : * Return values:
2470 : : * < 0 : wait_for_terminate() failed to get the state of the
2471 : : * container, the container was terminated by a signal, or
2472 : : * failed for an unknown reason. No change is made to the
2473 : : * container argument.
2474 : : * > 0 : The program executed in the container terminated with an
2475 : : * error. The exit code of the program executed in the
2476 : : * container is returned. The container argument has been set
2477 : : * to CONTAINER_TERMINATED.
2478 : : * 0 : The container is being rebooted, has been shut down or exited
2479 : : * successfully. The container argument has been set to either
2480 : : * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2481 : : *
2482 : : * That is, success is indicated by a return value of zero, and an
2483 : : * error is indicated by a non-zero value.
2484 : : */
2485 : 0 : static int wait_for_container(pid_t pid, ContainerStatus *container) {
2486 : : siginfo_t status;
2487 : : int r;
2488 : :
2489 : 0 : r = wait_for_terminate(pid, &status);
2490 [ # # ]: 0 : if (r < 0)
2491 [ # # ]: 0 : return log_warning_errno(r, "Failed to wait for container: %m");
2492 : :
2493 [ # # # # ]: 0 : switch (status.si_code) {
2494 : :
2495 : 0 : case CLD_EXITED:
2496 [ # # ]: 0 : if (status.si_status == 0)
2497 [ # # # # ]: 0 : log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
2498 : : else
2499 [ # # # # ]: 0 : log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
2500 : :
2501 : 0 : *container = CONTAINER_TERMINATED;
2502 : 0 : return status.si_status;
2503 : :
2504 : 0 : case CLD_KILLED:
2505 [ # # ]: 0 : if (status.si_status == SIGINT) {
2506 [ # # # # ]: 0 : log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
2507 : 0 : *container = CONTAINER_TERMINATED;
2508 : 0 : return 0;
2509 : :
2510 [ # # ]: 0 : } else if (status.si_status == SIGHUP) {
2511 [ # # # # ]: 0 : log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
2512 : 0 : *container = CONTAINER_REBOOTED;
2513 : 0 : return 0;
2514 : : }
2515 : :
2516 : : _fallthrough_;
2517 : : case CLD_DUMPED:
2518 [ # # ]: 0 : return log_error_errno(SYNTHETIC_ERRNO(EIO),
2519 : : "Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
2520 : :
2521 : 0 : default:
2522 [ # # ]: 0 : return log_error_errno(SYNTHETIC_ERRNO(EIO),
2523 : : "Container %s failed due to unknown reason.", arg_machine);
2524 : : }
2525 : : }
2526 : :
2527 : 0 : static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2528 : : pid_t pid;
2529 : :
2530 : 0 : pid = PTR_TO_PID(userdata);
2531 [ # # ]: 0 : if (pid > 0) {
2532 [ # # ]: 0 : if (kill(pid, arg_kill_signal) >= 0) {
2533 [ # # ]: 0 : log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2534 : 0 : sd_event_source_set_userdata(s, NULL);
2535 : 0 : return 0;
2536 : : }
2537 : : }
2538 : :
2539 : 0 : sd_event_exit(sd_event_source_get_event(s), 0);
2540 : 0 : return 0;
2541 : : }
2542 : :
2543 : 0 : static int on_sigchld(sd_event_source *s, const struct signalfd_siginfo *ssi, void *userdata) {
2544 : : pid_t pid;
2545 : :
2546 [ # # ]: 0 : assert(s);
2547 [ # # ]: 0 : assert(ssi);
2548 : :
2549 : 0 : pid = PTR_TO_PID(userdata);
2550 : :
2551 : 0 : for (;;) {
2552 : 0 : siginfo_t si = {};
2553 : :
2554 [ # # ]: 0 : if (waitid(P_ALL, 0, &si, WNOHANG|WNOWAIT|WEXITED) < 0)
2555 [ # # ]: 0 : return log_error_errno(errno, "Failed to waitid(): %m");
2556 [ # # ]: 0 : if (si.si_pid == 0) /* No pending children. */
2557 : 0 : break;
2558 [ # # ]: 0 : if (si.si_pid == pid) {
2559 : : /* The main process we care for has exited. Return from
2560 : : * signal handler but leave the zombie. */
2561 : 0 : sd_event_exit(sd_event_source_get_event(s), 0);
2562 : 0 : break;
2563 : : }
2564 : :
2565 : : /* Reap all other children. */
2566 : 0 : (void) waitid(P_PID, si.si_pid, &si, WNOHANG|WEXITED);
2567 : : }
2568 : :
2569 : 0 : return 0;
2570 : : }
2571 : :
2572 : 0 : static int on_request_stop(sd_bus_message *m, void *userdata, sd_bus_error *error) {
2573 : : pid_t pid;
2574 : :
2575 [ # # ]: 0 : assert(m);
2576 : :
2577 : 0 : pid = PTR_TO_PID(userdata);
2578 : :
2579 [ # # ]: 0 : if (arg_kill_signal > 0) {
2580 [ # # ]: 0 : log_info("Container termination requested. Attempting to halt container.");
2581 : 0 : (void) kill(pid, arg_kill_signal);
2582 : : } else {
2583 [ # # ]: 0 : log_info("Container termination requested. Exiting.");
2584 : 0 : sd_event_exit(sd_bus_get_event(sd_bus_message_get_bus(m)), 0);
2585 : : }
2586 : :
2587 : 0 : return 0;
2588 : : }
2589 : :
2590 : 0 : static int determine_names(void) {
2591 : : int r;
2592 : :
2593 [ # # # # : 0 : if (arg_template && !arg_directory && arg_machine) {
# # ]
2594 : :
2595 : : /* If --template= was specified then we should not
2596 : : * search for a machine, but instead create a new one
2597 : : * in /var/lib/machine. */
2598 : :
2599 : 0 : arg_directory = path_join("/var/lib/machines", arg_machine);
2600 [ # # ]: 0 : if (!arg_directory)
2601 : 0 : return log_oom();
2602 : : }
2603 : :
2604 [ # # # # ]: 0 : if (!arg_image && !arg_directory) {
2605 [ # # ]: 0 : if (arg_machine) {
2606 [ # # ]: 0 : _cleanup_(image_unrefp) Image *i = NULL;
2607 : :
2608 : 0 : r = image_find(IMAGE_MACHINE, arg_machine, &i);
2609 [ # # ]: 0 : if (r == -ENOENT)
2610 [ # # ]: 0 : return log_error_errno(r, "No image for machine '%s'.", arg_machine);
2611 [ # # ]: 0 : if (r < 0)
2612 [ # # ]: 0 : return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
2613 : :
2614 [ # # # # ]: 0 : if (IN_SET(i->type, IMAGE_RAW, IMAGE_BLOCK))
2615 : 0 : r = free_and_strdup(&arg_image, i->path);
2616 : : else
2617 : 0 : r = free_and_strdup(&arg_directory, i->path);
2618 [ # # ]: 0 : if (r < 0)
2619 : 0 : return log_oom();
2620 : :
2621 [ # # ]: 0 : if (!arg_ephemeral)
2622 [ # # # # ]: 0 : arg_read_only = arg_read_only || i->read_only;
2623 : : } else {
2624 : 0 : r = safe_getcwd(&arg_directory);
2625 [ # # ]: 0 : if (r < 0)
2626 [ # # ]: 0 : return log_error_errno(r, "Failed to determine current directory: %m");
2627 : : }
2628 : :
2629 [ # # # # ]: 0 : if (!arg_directory && !arg_image)
2630 [ # # ]: 0 : return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine path, please use -D or -i.");
2631 : : }
2632 : :
2633 [ # # ]: 0 : if (!arg_machine) {
2634 [ # # # # ]: 0 : if (arg_directory && path_equal(arg_directory, "/"))
2635 : 0 : arg_machine = gethostname_malloc();
2636 : : else {
2637 [ # # ]: 0 : if (arg_image) {
2638 : : char *e;
2639 : :
2640 : 0 : arg_machine = strdup(basename(arg_image));
2641 : :
2642 : : /* Truncate suffix if there is one */
2643 : 0 : e = endswith(arg_machine, ".raw");
2644 [ # # ]: 0 : if (e)
2645 : 0 : *e = 0;
2646 : : } else
2647 : 0 : arg_machine = strdup(basename(arg_directory));
2648 : : }
2649 [ # # ]: 0 : if (!arg_machine)
2650 : 0 : return log_oom();
2651 : :
2652 : 0 : hostname_cleanup(arg_machine);
2653 [ # # ]: 0 : if (!machine_name_is_valid(arg_machine))
2654 [ # # ]: 0 : return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine machine name automatically, please use -M.");
2655 : :
2656 [ # # ]: 0 : if (arg_ephemeral) {
2657 : : char *b;
2658 : :
2659 : : /* Add a random suffix when this is an
2660 : : * ephemeral machine, so that we can run many
2661 : : * instances at once without manually having
2662 : : * to specify -M each time. */
2663 : :
2664 [ # # ]: 0 : if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
2665 : 0 : return log_oom();
2666 : :
2667 : 0 : free(arg_machine);
2668 : 0 : arg_machine = b;
2669 : : }
2670 : : }
2671 : :
2672 : 0 : return 0;
2673 : : }
2674 : :
/* Resolves the path stored in '*p' with chase_symlinks() (using 'flags') and replaces '*p' by the resolved
 * path. A NULL '*p' is left alone.
 *
 * Returns 0 if '*p' was NULL, a negative errno-style value on failure, and otherwise whatever
 * chase_symlinks() returned. */
static int chase_symlinks_and_update(char **p, unsigned flags) {
        char *resolved;
        int r;

        assert(p);

        if (!*p)
                return 0;

        r = chase_symlinks(*p, NULL, flags, &resolved);
        if (r >= 0) {
                free_and_replace(*p, resolved);
                return r; /* might be an fd, should CHASE_OPEN ever be part of flags */
        }

        return log_error_errno(r, "Failed to resolve path %s: %m", *p);
}
2691 : :
2692 : 0 : static int determine_uid_shift(const char *directory) {
2693 : : int r;
2694 : :
2695 [ # # ]: 0 : if (arg_userns_mode == USER_NAMESPACE_NO) {
2696 : 0 : arg_uid_shift = 0;
2697 : 0 : return 0;
2698 : : }
2699 : :
2700 [ # # ]: 0 : if (arg_uid_shift == UID_INVALID) {
2701 : : struct stat st;
2702 : :
2703 : 0 : r = stat(directory, &st);
2704 [ # # ]: 0 : if (r < 0)
2705 [ # # ]: 0 : return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
2706 : :
2707 : 0 : arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
2708 : :
2709 [ # # ]: 0 : if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000)))
2710 [ # # ]: 0 : return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
2711 : : "UID and GID base of %s don't match.", directory);
2712 : :
2713 : 0 : arg_uid_range = UINT32_C(0x10000);
2714 : : }
2715 : :
2716 [ # # ]: 0 : if (arg_uid_shift > (uid_t) -1 - arg_uid_range)
2717 [ # # ]: 0 : return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
2718 : : "UID base too high for UID range.");
2719 : :
2720 : 0 : return 0;
2721 : : }
2722 : :
2723 : 0 : static unsigned long effective_clone_ns_flags(void) {
2724 : 0 : unsigned long flags = arg_clone_ns_flags;
2725 : :
2726 [ # # ]: 0 : if (arg_private_network)
2727 : 0 : flags |= CLONE_NEWNET;
2728 [ # # ]: 0 : if (arg_use_cgns)
2729 : 0 : flags |= CLONE_NEWCGROUP;
2730 [ # # ]: 0 : if (arg_userns_mode != USER_NAMESPACE_NO)
2731 : 0 : flags |= CLONE_NEWUSER;
2732 : :
2733 : 0 : return flags;
2734 : : }
2735 : :
2736 : 0 : static int patch_sysctl(void) {
2737 : :
2738 : : /* This table is inspired by runc's sysctl() function */
2739 : : static const struct {
2740 : : const char *key;
2741 : : bool prefix;
2742 : : unsigned long clone_flags;
2743 : : } safe_sysctl[] = {
2744 : : { "kernel.hostname", false, CLONE_NEWUTS },
2745 : : { "kernel.domainname", false, CLONE_NEWUTS },
2746 : : { "kernel.msgmax", false, CLONE_NEWIPC },
2747 : : { "kernel.msgmnb", false, CLONE_NEWIPC },
2748 : : { "kernel.msgmni", false, CLONE_NEWIPC },
2749 : : { "kernel.sem", false, CLONE_NEWIPC },
2750 : : { "kernel.shmall", false, CLONE_NEWIPC },
2751 : : { "kernel.shmmax", false, CLONE_NEWIPC },
2752 : : { "kernel.shmmni", false, CLONE_NEWIPC },
2753 : : { "fs.mqueue.", true, CLONE_NEWIPC },
2754 : : { "net.", true, CLONE_NEWNET },
2755 : : };
2756 : :
2757 : : unsigned long flags;
2758 : : char **k, **v;
2759 : : int r;
2760 : :
2761 : 0 : flags = effective_clone_ns_flags();
2762 : :
2763 [ # # # # : 0 : STRV_FOREACH_PAIR(k, v, arg_sysctl) {
# # ]
2764 : 0 : bool good = false;
2765 : : size_t i;
2766 : :
2767 [ # # ]: 0 : for (i = 0; i < ELEMENTSOF(safe_sysctl); i++) {
2768 : :
2769 [ # # ]: 0 : if (!FLAGS_SET(flags, safe_sysctl[i].clone_flags))
2770 : 0 : continue;
2771 : :
2772 [ # # ]: 0 : if (safe_sysctl[i].prefix)
2773 : 0 : good = startswith(*k, safe_sysctl[i].key);
2774 : : else
2775 : 0 : good = streq(*k, safe_sysctl[i].key);
2776 : :
2777 [ # # ]: 0 : if (good)
2778 : 0 : break;
2779 : : }
2780 : :
2781 [ # # ]: 0 : if (!good)
2782 [ # # ]: 0 : return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Refusing to write to sysctl '%s', as it is not safe in the selected namespaces.", *k);
2783 : :
2784 : 0 : r = sysctl_write(*k, *v);
2785 [ # # ]: 0 : if (r < 0)
2786 [ # # ]: 0 : return log_error_errno(r, "Failed to write sysctl '%s': %m", *k);
2787 : : }
2788 : :
2789 : 0 : return 0;
2790 : : }
2791 : :
2792 : 0 : static int inner_child(
2793 : : Barrier *barrier,
2794 : : const char *directory,
2795 : : bool secondary,
2796 : : int kmsg_socket,
2797 : : int rtnl_socket,
2798 : : int master_pty_socket,
2799 : : FDSet *fds) {
2800 : :
2801 : 0 : _cleanup_free_ char *home = NULL;
2802 : : char as_uuid[37];
2803 : 0 : size_t n_env = 1;
2804 : 0 : const char *envp[] = {
2805 : : "PATH=" DEFAULT_PATH_COMPAT,
2806 : : NULL, /* container */
2807 : : NULL, /* TERM */
2808 : : NULL, /* HOME */
2809 : : NULL, /* USER */
2810 : : NULL, /* LOGNAME */
2811 : : NULL, /* container_uuid */
2812 : : NULL, /* LISTEN_FDS */
2813 : : NULL, /* LISTEN_PID */
2814 : : NULL, /* NOTIFY_SOCKET */
2815 : : NULL
2816 : : };
2817 : : const char *exec_target;
2818 : 0 : _cleanup_strv_free_ char **env_use = NULL;
2819 : : int r, which_failed;
2820 : :
2821 : : /* This is the "inner" child process, i.e. the one forked off by the "outer" child process, which is the one
2822 : : * the container manager itself forked off. At the time of clone() it gained its own CLONE_NEWNS, CLONE_NEWPID,
2823 : : * CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER namespaces. Note that it has its own CLONE_NEWNS namespace,
2824 : : * separate from the CLONE_NEWNS created for the "outer" child, and also separate from the host's CLONE_NEWNS
2825 : : * namespace. The reason for having two levels of CLONE_NEWNS namespaces is that the "inner" one is owned by
2826 : : * the CLONE_NEWUSER namespace of the container, while the "outer" one is owned by the host's CLONE_NEWUSER
2827 : : * namespace.
2828 : : *
2829 : : * Note at this point we have no CLONE_NEWNET namespace yet. We'll acquire that one later through
2830 : : * unshare(). See below. */
2831 : :
2832 [ # # ]: 0 : assert(barrier);
2833 [ # # ]: 0 : assert(directory);
2834 [ # # ]: 0 : assert(kmsg_socket >= 0);
2835 : :
2836 [ # # ]: 0 : log_debug("Inner child is initializing.");
2837 : :
2838 [ # # ]: 0 : if (arg_userns_mode != USER_NAMESPACE_NO) {
2839 : : /* Tell the parent, that it now can write the UID map. */
2840 : 0 : (void) barrier_place(barrier); /* #1 */
2841 : :
2842 : : /* Wait until the parent wrote the UID map */
2843 [ # # ]: 0 : if (!barrier_place_and_sync(barrier)) /* #2 */
2844 [ # # ]: 0 : return log_error_errno(SYNTHETIC_ERRNO(ESRCH),
2845 : : "Parent died too early");
2846 : : }
2847 : :
2848 : 0 : r = reset_uid_gid();
2849 [ # # ]: 0 : if (r < 0)
2850 [ # # ]: 0 : return log_error_errno(r, "Couldn't become new root: %m");
2851 : :
2852 : 0 : r = mount_all(NULL,
2853 : 0 : arg_mount_settings | MOUNT_IN_USERNS,
2854 : : arg_uid_shift,
2855 : : arg_selinux_apifs_context);
2856 [ # # ]: 0 : if (r < 0)
2857 : 0 : return r;
2858 : :
2859 [ # # # # ]: 0 : if (!arg_network_namespace_path && arg_private_network) {
2860 : 0 : r = unshare(CLONE_NEWNET);
2861 [ # # ]: 0 : if (r < 0)
2862 [ # # ]: 0 : return log_error_errno(errno, "Failed to unshare network namespace: %m");
2863 : :
2864 : : /* Tell the parent that it can setup network interfaces. */
2865 : 0 : (void) barrier_place(barrier); /* #3 */
2866 : : }
2867 : :
2868 : 0 : r = mount_sysfs(NULL, arg_mount_settings);
2869 [ # # ]: 0 : if (r < 0)
2870 : 0 : return r;
2871 : :
2872 : : /* Wait until we are cgroup-ified, so that we
2873 : : * can mount the right cgroup path writable */
2874 [ # # ]: 0 : if (!barrier_place_and_sync(barrier)) /* #4 */
2875 [ # # ]: 0 : return log_error_errno(SYNTHETIC_ERRNO(ESRCH),
2876 : : "Parent died too early");
2877 : :
2878 [ # # ]: 0 : if (arg_use_cgns) {
2879 : 0 : r = unshare(CLONE_NEWCGROUP);
2880 [ # # ]: 0 : if (r < 0)
2881 [ # # ]: 0 : return log_error_errno(errno, "Failed to unshare cgroup namespace: %m");
2882 : 0 : r = mount_cgroups(
2883 : : "",
2884 : : arg_unified_cgroup_hierarchy,
2885 : : arg_userns_mode != USER_NAMESPACE_NO,
2886 : : arg_uid_shift,
2887 : : arg_uid_range,
2888 : : arg_selinux_apifs_context,
2889 : : true);
2890 [ # # ]: 0 : if (r < 0)
2891 : 0 : return r;
2892 : : } else {
2893 : 0 : r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
2894 [ # # ]: 0 : if (r < 0)
2895 : 0 : return r;
2896 : : }
2897 : :
2898 : 0 : r = setup_boot_id();
2899 [ # # ]: 0 : if (r < 0)
2900 : 0 : return r;
2901 : :
2902 : 0 : r = setup_kmsg(kmsg_socket);
2903 [ # # ]: 0 : if (r < 0)
2904 : 0 : return r;
2905 : 0 : kmsg_socket = safe_close(kmsg_socket);
2906 : :
2907 : 0 : r = mount_custom(
2908 : : "/",
2909 : : arg_custom_mounts,
2910 : : arg_n_custom_mounts,
2911 : : false,
2912 : : 0,
2913 : : 0,
2914 : : arg_selinux_apifs_context,
2915 : : true);
2916 [ # # ]: 0 : if (r < 0)
2917 : 0 : return r;
2918 : :
2919 [ # # ]: 0 : if (setsid() < 0)
2920 [ # # ]: 0 : return log_error_errno(errno, "setsid() failed: %m");
2921 : :
2922 [ # # ]: 0 : if (arg_private_network)
2923 : 0 : loopback_setup();
2924 : :
2925 [ # # ]: 0 : if (arg_expose_ports) {
2926 : 0 : r = expose_port_send_rtnl(rtnl_socket);
2927 [ # # ]: 0 : if (r < 0)
2928 : 0 : return r;
2929 : 0 : rtnl_socket = safe_close(rtnl_socket);
2930 : : }
2931 : :
2932 [ # # ]: 0 : if (arg_console_mode != CONSOLE_PIPE) {
2933 [ # # ]: 0 : _cleanup_close_ int master = -1;
2934 [ # # ]: 0 : _cleanup_free_ char *console = NULL;
2935 : :
2936 : : /* Allocate a pty and make it available as /dev/console. */
2937 : 0 : master = openpt_allocate(O_RDWR|O_NONBLOCK, &console);
2938 [ # # ]: 0 : if (master < 0)
2939 [ # # ]: 0 : return log_error_errno(master, "Failed to allocate a pty: %m");
2940 : :
2941 : 0 : r = setup_dev_console(console);
2942 [ # # ]: 0 : if (r < 0)
2943 [ # # ]: 0 : return log_error_errno(r, "Failed to setup /dev/console: %m");
2944 : :
2945 : 0 : r = send_one_fd(master_pty_socket, master, 0);
2946 [ # # ]: 0 : if (r < 0)
2947 [ # # ]: 0 : return log_error_errno(r, "Failed to send master fd: %m");
2948 : 0 : master_pty_socket = safe_close(master_pty_socket);
2949 : :
2950 : 0 : r = setup_stdio_as_dev_console();
2951 [ # # ]: 0 : if (r < 0)
2952 : 0 : return r;
2953 : : }
2954 : :
2955 : 0 : r = patch_sysctl();
2956 [ # # ]: 0 : if (r < 0)
2957 : 0 : return r;
2958 : :
2959 [ # # ]: 0 : if (arg_oom_score_adjust_set) {
2960 : 0 : r = set_oom_score_adjust(arg_oom_score_adjust);
2961 [ # # ]: 0 : if (r < 0)
2962 [ # # ]: 0 : return log_error_errno(r, "Failed to adjust OOM score: %m");
2963 : : }
2964 : :
2965 [ # # ]: 0 : if (arg_cpu_set.set)
2966 [ # # ]: 0 : if (sched_setaffinity(0, arg_cpu_set.allocated, arg_cpu_set.set) < 0)
2967 [ # # ]: 0 : return log_error_errno(errno, "Failed to set CPU affinity: %m");
2968 : :
2969 : 0 : (void) setup_hostname();
2970 : :
2971 [ # # ]: 0 : if (arg_personality != PERSONALITY_INVALID) {
2972 : 0 : r = safe_personality(arg_personality);
2973 [ # # ]: 0 : if (r < 0)
2974 [ # # ]: 0 : return log_error_errno(r, "personality() failed: %m");
2975 [ # # ]: 0 : } else if (secondary) {
2976 : 0 : r = safe_personality(PER_LINUX32);
2977 [ # # ]: 0 : if (r < 0)
2978 [ # # ]: 0 : return log_error_errno(r, "personality() failed: %m");
2979 : : }
2980 : :
2981 : 0 : r = setrlimit_closest_all((const struct rlimit *const*) arg_rlimit, &which_failed);
2982 [ # # ]: 0 : if (r < 0)
2983 [ # # ]: 0 : return log_error_errno(r, "Failed to apply resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
2984 : :
2985 : : #if HAVE_SECCOMP
2986 [ # # ]: 0 : if (arg_seccomp) {
2987 : :
2988 [ # # ]: 0 : if (is_seccomp_available()) {
2989 : :
2990 : 0 : r = seccomp_load(arg_seccomp);
2991 [ # # # # ]: 0 : if (ERRNO_IS_SECCOMP_FATAL(r))
2992 [ # # ]: 0 : return log_error_errno(r, "Failed to install seccomp filter: %m");
2993 [ # # ]: 0 : if (r < 0)
2994 [ # # ]: 0 : log_debug_errno(r, "Failed to install seccomp filter: %m");
2995 : : }
2996 : : } else
2997 : : #endif
2998 : : {
2999 : 0 : r = setup_seccomp(arg_caps_retain, arg_syscall_whitelist, arg_syscall_blacklist);
3000 [ # # ]: 0 : if (r < 0)
3001 : 0 : return r;
3002 : : }
3003 : :
3004 : : #if HAVE_SELINUX
3005 [ # # ]: 0 : if (arg_selinux_context)
3006 [ # # ]: 0 : if (setexeccon(arg_selinux_context) < 0)
3007 [ # # ]: 0 : return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
3008 : : #endif
3009 : :
3010 : : /* Make sure we keep the caps across the uid/gid dropping, so that we can retain some selected caps
3011 : : * if we need to later on. */
3012 [ # # ]: 0 : if (prctl(PR_SET_KEEPCAPS, 1) < 0)
3013 [ # # ]: 0 : return log_error_errno(errno, "Failed to set PR_SET_KEEPCAPS: %m");
3014 : :
3015 [ # # # # ]: 0 : if (uid_is_valid(arg_uid) || gid_is_valid(arg_gid))
3016 : 0 : r = change_uid_gid_raw(arg_uid, arg_gid, arg_supplementary_gids, arg_n_supplementary_gids);
3017 : : else
3018 : 0 : r = change_uid_gid(arg_user, &home);
3019 [ # # ]: 0 : if (r < 0)
3020 : 0 : return r;
3021 : :
3022 : 0 : r = drop_capabilities(getuid());
3023 [ # # ]: 0 : if (r < 0)
3024 [ # # ]: 0 : return log_error_errno(r, "Dropping capabilities failed: %m");
3025 : :
3026 [ # # ]: 0 : if (arg_no_new_privileges)
3027 [ # # ]: 0 : if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0)
3028 [ # # ]: 0 : return log_error_errno(errno, "Failed to disable new privileges: %m");
3029 : :
3030 : : /* LXC sets container=lxc, so follow the scheme here */
3031 [ # # # # : 0 : envp[n_env++] = strjoina("container=", arg_container_service_name);
# # # # #
# # # ]
3032 : :
3033 : 0 : envp[n_env] = strv_find_prefix(environ, "TERM=");
3034 [ # # ]: 0 : if (envp[n_env])
3035 : 0 : n_env++;
3036 : :
3037 [ # # # # : 0 : if (home || !uid_is_valid(arg_uid) || arg_uid == 0)
# # ]
3038 [ # # # # ]: 0 : if (asprintf((char**)(envp + n_env++), "HOME=%s", home ?: "/root") < 0)
3039 : 0 : return log_oom();
3040 : :
3041 [ # # # # : 0 : if (arg_user || !uid_is_valid(arg_uid) || arg_uid == 0)
# # ]
3042 [ # # # # : 0 : if (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ?: "root") < 0 ||
# # ]
3043 [ # # ]: 0 : asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)
3044 : 0 : return log_oom();
3045 : :
3046 [ # # ]: 0 : assert(!sd_id128_is_null(arg_uuid));
3047 : :
3048 [ # # ]: 0 : if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_to_uuid_string(arg_uuid, as_uuid)) < 0)
3049 : 0 : return log_oom();
3050 : :
3051 [ # # ]: 0 : if (fdset_size(fds) > 0) {
3052 : 0 : r = fdset_cloexec(fds, false);
3053 [ # # ]: 0 : if (r < 0)
3054 [ # # ]: 0 : return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
3055 : :
3056 [ # # # # ]: 0 : if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
3057 : 0 : (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
3058 : 0 : return log_oom();
3059 : : }
3060 [ # # ]: 0 : if (asprintf((char **)(envp + n_env++), "NOTIFY_SOCKET=%s", NSPAWN_NOTIFY_SOCKET_PATH) < 0)
3061 : 0 : return log_oom();
3062 : :
3063 : 0 : env_use = strv_env_merge(2, envp, arg_setenv);
3064 [ # # ]: 0 : if (!env_use)
3065 : 0 : return log_oom();
3066 : :
3067 : : /* Let the parent know that we are ready and
3068 : : * wait until the parent is ready with the
3069 : : * setup, too... */
3070 [ # # ]: 0 : if (!barrier_place_and_sync(barrier)) /* #5 */
3071 [ # # ]: 0 : return log_error_errno(SYNTHETIC_ERRNO(ESRCH),
3072 : : "Parent died too early");
3073 : :
3074 [ # # ]: 0 : if (arg_chdir)
3075 [ # # ]: 0 : if (chdir(arg_chdir) < 0)
3076 [ # # ]: 0 : return log_error_errno(errno, "Failed to change to specified working directory %s: %m", arg_chdir);
3077 : :
3078 [ # # ]: 0 : if (arg_start_mode == START_PID2) {
3079 : 0 : r = stub_pid1(arg_uuid);
3080 [ # # ]: 0 : if (r < 0)
3081 : 0 : return r;
3082 : : }
3083 : :
3084 [ # # ]: 0 : log_debug("Inner child completed, invoking payload.");
3085 : :
3086 : : /* Now, explicitly close the log, so that we then can close all remaining fds. Closing the log explicitly first
3087 : : * has the benefit that the logging subsystem knows about it, and is thus ready to be reopened should we need
3088 : : * it again. Note that the other fds closed here are at least the locking and barrier fds. */
3089 : 0 : log_close();
3090 : 0 : log_set_open_when_needed(true);
3091 : :
3092 : 0 : (void) fdset_close_others(fds);
3093 : :
3094 [ # # ]: 0 : if (arg_start_mode == START_BOOT) {
3095 : : char **a;
3096 : : size_t m;
3097 : :
3098 : : /* Automatically search for the init system */
3099 : :
3100 : 0 : m = strv_length(arg_parameters);
3101 [ # # # # ]: 0 : a = newa(char*, m + 2);
3102 : 0 : memcpy_safe(a + 1, arg_parameters, m * sizeof(char*));
3103 : 0 : a[1 + m] = NULL;
3104 : :
3105 : 0 : a[0] = (char*) "/usr/lib/systemd/systemd";
3106 : 0 : execve(a[0], a, env_use);
3107 : :
3108 : 0 : a[0] = (char*) "/lib/systemd/systemd";
3109 : 0 : execve(a[0], a, env_use);
3110 : :
3111 : 0 : a[0] = (char*) "/sbin/init";
3112 : 0 : execve(a[0], a, env_use);
3113 : :
3114 : 0 : exec_target = "/usr/lib/systemd/systemd, /lib/systemd/systemd, /sbin/init";
3115 [ # # ]: 0 : } else if (!strv_isempty(arg_parameters)) {
3116 : : const char *dollar_path;
3117 : :
3118 : 0 : exec_target = arg_parameters[0];
3119 : :
3120 : : /* Use the user supplied search $PATH if there is one, or DEFAULT_PATH_COMPAT if not to search the
3121 : : * binary. */
3122 : 0 : dollar_path = strv_env_get(env_use, "PATH");
3123 [ # # ]: 0 : if (dollar_path) {
3124 [ # # ]: 0 : if (putenv((char*) dollar_path) != 0)
3125 [ # # ]: 0 : return log_error_errno(errno, "Failed to update $PATH: %m");
3126 : : }
3127 : :
3128 : 0 : execvpe(arg_parameters[0], arg_parameters, env_use);
3129 : : } else {
3130 [ # # ]: 0 : if (!arg_chdir)
3131 : : /* If we cannot change the directory, we'll end up in /, that is expected. */
3132 [ # # ]: 0 : (void) chdir(home ?: "/root");
3133 : :
3134 : 0 : execle("/bin/bash", "-bash", NULL, env_use);
3135 : 0 : execle("/bin/sh", "-sh", NULL, env_use);
3136 : :
3137 : 0 : exec_target = "/bin/bash, /bin/sh";
3138 : : }
3139 : :
3140 [ # # ]: 0 : return log_error_errno(errno, "execv(%s) failed: %m", exec_target);
3141 : : }
3142 : :
3143 : 0 : static int setup_sd_notify_child(void) {
3144 : 0 : _cleanup_close_ int fd = -1;
3145 : 0 : union sockaddr_union sa = {
3146 : : .un.sun_family = AF_UNIX,
3147 : : .un.sun_path = NSPAWN_NOTIFY_SOCKET_PATH,
3148 : : };
3149 : : int r;
3150 : :
3151 : 0 : fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
3152 [ # # ]: 0 : if (fd < 0)
3153 [ # # ]: 0 : return log_error_errno(errno, "Failed to allocate notification socket: %m");
3154 : :
3155 : 0 : (void) mkdir_parents(NSPAWN_NOTIFY_SOCKET_PATH, 0755);
3156 : 0 : (void) sockaddr_un_unlink(&sa.un);
3157 : :
3158 [ # # # # ]: 0 : r = bind(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un));
3159 [ # # ]: 0 : if (r < 0)
3160 [ # # ]: 0 : return log_error_errno(errno, "bind(" NSPAWN_NOTIFY_SOCKET_PATH ") failed: %m");
3161 : :
3162 : 0 : r = userns_lchown(NSPAWN_NOTIFY_SOCKET_PATH, 0, 0);
3163 [ # # ]: 0 : if (r < 0)
3164 [ # # ]: 0 : return log_error_errno(r, "Failed to chown " NSPAWN_NOTIFY_SOCKET_PATH ": %m");
3165 : :
3166 : 0 : r = setsockopt_int(fd, SOL_SOCKET, SO_PASSCRED, true);
3167 [ # # ]: 0 : if (r < 0)
3168 [ # # ]: 0 : return log_error_errno(r, "SO_PASSCRED failed: %m");
3169 : :
3170 : 0 : return TAKE_FD(fd);
3171 : : }
3172 : :
3173 : 0 : static int outer_child(
3174 : : Barrier *barrier,
3175 : : const char *directory,
3176 : : DissectedImage *dissected_image,
3177 : : bool secondary,
3178 : : int pid_socket,
3179 : : int uuid_socket,
3180 : : int notify_socket,
3181 : : int kmsg_socket,
3182 : : int rtnl_socket,
3183 : : int uid_shift_socket,
3184 : : int master_pty_socket,
3185 : : int unified_cgroup_hierarchy_socket,
3186 : : FDSet *fds,
3187 : : int netns_fd) {
3188 : :
3189 : 0 : _cleanup_close_ int fd = -1;
3190 : : pid_t pid;
3191 : : ssize_t l;
3192 : : int r;
3193 : :
3194 : : /* This is the "outer" child process, i.e the one forked off by the container manager itself. It already has
3195 : : * its own CLONE_NEWNS namespace (which was created by the clone()). It still lives in the host's CLONE_NEWPID,
3196 : : * CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER and CLONE_NEWNET namespaces. After it completed a number of
3197 : : * initializations a second child (the "inner" one) is forked off it, and it exits. */
3198 : :
3199 [ # # ]: 0 : assert(barrier);
3200 [ # # ]: 0 : assert(directory);
3201 [ # # ]: 0 : assert(pid_socket >= 0);
3202 [ # # ]: 0 : assert(uuid_socket >= 0);
3203 [ # # ]: 0 : assert(notify_socket >= 0);
3204 [ # # ]: 0 : assert(master_pty_socket >= 0);
3205 [ # # ]: 0 : assert(kmsg_socket >= 0);
3206 : :
3207 [ # # ]: 0 : log_debug("Outer child is initializing.");
3208 : :
3209 [ # # ]: 0 : if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
3210 [ # # ]: 0 : return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
3211 : :
3212 : 0 : r = reset_audit_loginuid();
3213 [ # # ]: 0 : if (r < 0)
3214 : 0 : return r;
3215 : :
3216 : : /* Mark everything as slave, so that we still
3217 : : * receive mounts from the real root, but don't
3218 : : * propagate mounts to the real root. */
3219 : 0 : r = mount_verbose(LOG_ERR, NULL, "/", NULL, MS_SLAVE|MS_REC, NULL);
3220 [ # # ]: 0 : if (r < 0)
3221 : 0 : return r;
3222 : :
3223 [ # # ]: 0 : if (dissected_image) {
3224 : : /* If we are operating on a disk image, then mount its root directory now, but leave out the rest. We
3225 : : * can read the UID shift from it if we need to. Further down we'll mount the rest, but then with the
3226 : : * uid shift known. That way we can mount VFAT file systems shifted to the right place right away. This
3227 : : * makes sure ESP partitions and userns are compatible. */
3228 : :
3229 : 0 : r = dissected_image_mount(dissected_image, directory, arg_uid_shift,
3230 : 0 : DISSECT_IMAGE_MOUNT_ROOT_ONLY|DISSECT_IMAGE_DISCARD_ON_LOOP|
3231 [ # # ]: 0 : (arg_read_only ? DISSECT_IMAGE_READ_ONLY : 0)|
3232 [ # # ]: 0 : (arg_start_mode == START_BOOT ? DISSECT_IMAGE_VALIDATE_OS : 0));
3233 [ # # ]: 0 : if (r < 0)
3234 : 0 : return r;
3235 : : }
3236 : :
3237 : 0 : r = determine_uid_shift(directory);
3238 [ # # ]: 0 : if (r < 0)
3239 : 0 : return r;
3240 : :
3241 [ # # ]: 0 : if (arg_userns_mode != USER_NAMESPACE_NO) {
3242 : : /* Let the parent know which UID shift we read from the image */
3243 : 0 : l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
3244 [ # # ]: 0 : if (l < 0)
3245 [ # # ]: 0 : return log_error_errno(errno, "Failed to send UID shift: %m");
3246 [ # # ]: 0 : if (l != sizeof(arg_uid_shift))
3247 [ # # ]: 0 : return log_error_errno(SYNTHETIC_ERRNO(EIO),
3248 : : "Short write while sending UID shift.");
3249 : :
3250 [ # # ]: 0 : if (arg_userns_mode == USER_NAMESPACE_PICK) {
3251 : : /* When we are supposed to pick the UID shift, the parent will check now whether the UID shift
3252 : : * we just read from the image is available. If yes, it will send the UID shift back to us, if
3253 : : * not it will pick a different one, and send it back to us. */
3254 : :
3255 : 0 : l = recv(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), 0);
3256 [ # # ]: 0 : if (l < 0)
3257 [ # # ]: 0 : return log_error_errno(errno, "Failed to recv UID shift: %m");
3258 [ # # ]: 0 : if (l != sizeof(arg_uid_shift))
3259 [ # # ]: 0 : return log_error_errno(SYNTHETIC_ERRNO(EIO),
3260 : : "Short read while receiving UID shift.");
3261 : : }
3262 : :
3263 [ # # # # ]: 0 : log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
3264 : : "Selected user namespace base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
3265 : : }
3266 : :
3267 [ # # ]: 0 : if (path_equal(directory, "/")) {
3268 : : /* If the directory we shall boot is the host, let's operate on a bind mount at a different
3269 : : * place, so that we can make changes to its mount structure (for example, to implement
3270 : : * --volatile=) without this interfering with our ability to access files such as
3271 : : * /etc/localtime to copy into the container. Note that we use a fixed place for this
3272 : : * (instead of a temporary directory, since we are living in our own mount namespace here
3273 : : * already, and thus don't need to be afraid of colliding with anyone else's mounts).*/
3274 : 0 : (void) mkdir_p("/run/systemd/nspawn-root", 0755);
3275 : :
3276 : 0 : r = mount_verbose(LOG_ERR, "/", "/run/systemd/nspawn-root", NULL, MS_BIND|MS_REC, NULL);
3277 [ # # ]: 0 : if (r < 0)
3278 : 0 : return r;
3279 : :
3280 : 0 : directory = "/run/systemd/nspawn-root";
3281 : :
3282 [ # # ]: 0 : } else if (!dissected_image) {
3283 : : /* Turn directory into bind mount (we need that so that we can move the bind mount to root
3284 : : * later on). */
3285 : 0 : r = mount_verbose(LOG_ERR, directory, directory, NULL, MS_BIND|MS_REC, NULL);
3286 [ # # ]: 0 : if (r < 0)
3287 : 0 : return r;
3288 : : }
3289 : :
3290 : 0 : r = setup_pivot_root(
3291 : : directory,
3292 : : arg_pivot_root_new,
3293 : : arg_pivot_root_old);
3294 [ # # ]: 0 : if (r < 0)
3295 : 0 : return r;
3296 : :
3297 : 0 : r = setup_volatile_mode(
3298 : : directory,
3299 : : arg_volatile_mode,
3300 : : arg_userns_mode != USER_NAMESPACE_NO,
3301 : : arg_uid_shift,
3302 : : arg_uid_range,
3303 : : arg_selinux_apifs_context);
3304 [ # # ]: 0 : if (r < 0)
3305 : 0 : return r;
3306 : :
3307 [ # # ]: 0 : if (dissected_image) {
3308 : : /* Now we know the uid shift, let's now mount everything else that might be in the image. */
3309 : 0 : r = dissected_image_mount(dissected_image, directory, arg_uid_shift,
3310 [ # # ]: 0 : DISSECT_IMAGE_MOUNT_NON_ROOT_ONLY|DISSECT_IMAGE_DISCARD_ON_LOOP|(arg_read_only ? DISSECT_IMAGE_READ_ONLY : 0));
3311 [ # # ]: 0 : if (r < 0)
3312 : 0 : return r;
3313 : : }
3314 : :
3315 [ # # ]: 0 : if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
3316 : : /* OK, we don't know yet which cgroup mode to use yet. Let's figure it out, and tell the parent. */
3317 : :
3318 : 0 : r = detect_unified_cgroup_hierarchy_from_image(directory);
3319 [ # # ]: 0 : if (r < 0)
3320 : 0 : return r;
3321 : :
3322 : 0 : l = send(unified_cgroup_hierarchy_socket, &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), MSG_NOSIGNAL);
3323 [ # # ]: 0 : if (l < 0)
3324 [ # # ]: 0 : return log_error_errno(errno, "Failed to send cgroup mode: %m");
3325 [ # # ]: 0 : if (l != sizeof(arg_unified_cgroup_hierarchy))
3326 [ # # ]: 0 : return log_error_errno(SYNTHETIC_ERRNO(EIO),
3327 : : "Short write while sending cgroup mode.");
3328 : :
3329 : 0 : unified_cgroup_hierarchy_socket = safe_close(unified_cgroup_hierarchy_socket);
3330 : : }
3331 : :
3332 : : /* Mark everything as shared so our mounts get propagated down. This is
3333 : : * required to make new bind mounts available in systemd services
3334 : : * inside the container that create a new mount namespace.
3335 : : * See https://github.com/systemd/systemd/issues/3860
3336 : : * Further submounts (such as /dev) done after this will inherit the
3337 : : * shared propagation mode. */
3338 : 0 : r = mount_verbose(LOG_ERR, NULL, directory, NULL, MS_SHARED|MS_REC, NULL);
3339 [ # # ]: 0 : if (r < 0)
3340 : 0 : return r;
3341 : :
3342 : 0 : r = recursive_chown(directory, arg_uid_shift, arg_uid_range);
3343 [ # # ]: 0 : if (r < 0)
3344 : 0 : return r;
3345 : :
3346 : 0 : r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
3347 [ # # ]: 0 : if (r < 0)
3348 : 0 : return r;
3349 : :
3350 [ # # # # ]: 0 : if (arg_read_only && arg_volatile_mode == VOLATILE_NO) {
3351 : 0 : r = bind_remount_recursive(directory, MS_RDONLY, MS_RDONLY, NULL);
3352 [ # # ]: 0 : if (r < 0)
3353 [ # # ]: 0 : return log_error_errno(r, "Failed to make tree read-only: %m");
3354 : : }
3355 : :
3356 : 0 : r = mount_all(directory,
3357 : : arg_mount_settings,
3358 : : arg_uid_shift,
3359 : : arg_selinux_apifs_context);
3360 [ # # ]: 0 : if (r < 0)
3361 : 0 : return r;
3362 : :
3363 : 0 : r = copy_devnodes(directory);
3364 [ # # ]: 0 : if (r < 0)
3365 : 0 : return r;
3366 : :
3367 : 0 : r = make_extra_nodes(directory);
3368 [ # # ]: 0 : if (r < 0)
3369 : 0 : return r;
3370 : :
3371 : 0 : (void) dev_setup(directory, arg_uid_shift, arg_uid_shift);
3372 : 0 : (void) make_inaccessible_nodes(directory, arg_uid_shift, arg_uid_shift);
3373 : :
3374 : 0 : r = setup_pts(directory);
3375 [ # # ]: 0 : if (r < 0)
3376 : 0 : return r;
3377 : :
3378 : 0 : r = setup_propagate(directory);
3379 [ # # ]: 0 : if (r < 0)
3380 : 0 : return r;
3381 : :
3382 : 0 : r = setup_keyring();
3383 [ # # ]: 0 : if (r < 0)
3384 : 0 : return r;
3385 : :
3386 : 0 : r = setup_timezone(directory);
3387 [ # # ]: 0 : if (r < 0)
3388 : 0 : return r;
3389 : :
3390 : 0 : r = setup_resolv_conf(directory);
3391 [ # # ]: 0 : if (r < 0)
3392 : 0 : return r;
3393 : :
3394 : 0 : r = setup_machine_id(directory);
3395 [ # # ]: 0 : if (r < 0)
3396 : 0 : return r;
3397 : :
3398 : 0 : r = setup_journal(directory);
3399 [ # # ]: 0 : if (r < 0)
3400 : 0 : return r;
3401 : :
3402 : 0 : r = mount_custom(
3403 : : directory,
3404 : : arg_custom_mounts,
3405 : : arg_n_custom_mounts,
3406 : : arg_userns_mode != USER_NAMESPACE_NO,
3407 : : arg_uid_shift,
3408 : : arg_uid_range,
3409 : : arg_selinux_apifs_context,
3410 : : false);
3411 [ # # ]: 0 : if (r < 0)
3412 : 0 : return r;
3413 : :
3414 [ # # ]: 0 : if (!arg_use_cgns) {
3415 : 0 : r = mount_cgroups(
3416 : : directory,
3417 : : arg_unified_cgroup_hierarchy,
3418 : : arg_userns_mode != USER_NAMESPACE_NO,
3419 : : arg_uid_shift,
3420 : : arg_uid_range,
3421 : : arg_selinux_apifs_context,
3422 : : false);
3423 [ # # ]: 0 : if (r < 0)
3424 : 0 : return r;
3425 : : }
3426 : :
3427 : 0 : r = mount_move_root(directory);
3428 [ # # ]: 0 : if (r < 0)
3429 [ # # ]: 0 : return log_error_errno(r, "Failed to move root directory: %m");
3430 : :
3431 : 0 : fd = setup_sd_notify_child();
3432 [ # # ]: 0 : if (fd < 0)
3433 : 0 : return fd;
3434 : :
3435 : 0 : pid = raw_clone(SIGCHLD|CLONE_NEWNS|
3436 : 0 : arg_clone_ns_flags |
3437 [ # # ]: 0 : (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0));
3438 [ # # ]: 0 : if (pid < 0)
3439 [ # # ]: 0 : return log_error_errno(errno, "Failed to fork inner child: %m");
3440 [ # # ]: 0 : if (pid == 0) {
3441 : 0 : pid_socket = safe_close(pid_socket);
3442 : 0 : uuid_socket = safe_close(uuid_socket);
3443 : 0 : notify_socket = safe_close(notify_socket);
3444 : 0 : uid_shift_socket = safe_close(uid_shift_socket);
3445 : :
3446 : : /* The inner child has all namespaces that are
3447 : : * requested, so that we all are owned by the user if
3448 : : * user namespaces are turned on. */
3449 : :
3450 [ # # ]: 0 : if (arg_network_namespace_path) {
3451 : 0 : r = namespace_enter(-1, -1, netns_fd, -1, -1);
3452 [ # # ]: 0 : if (r < 0)
3453 [ # # ]: 0 : return log_error_errno(r, "Failed to join network namespace: %m");
3454 : : }
3455 : :
3456 : 0 : r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, master_pty_socket, fds);
3457 [ # # ]: 0 : if (r < 0)
3458 : 0 : _exit(EXIT_FAILURE);
3459 : :
3460 : 0 : _exit(EXIT_SUCCESS);
3461 : : }
3462 : :
3463 : 0 : l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
3464 [ # # ]: 0 : if (l < 0)
3465 [ # # ]: 0 : return log_error_errno(errno, "Failed to send PID: %m");
3466 [ # # ]: 0 : if (l != sizeof(pid))
3467 [ # # ]: 0 : return log_error_errno(SYNTHETIC_ERRNO(EIO),
3468 : : "Short write while sending PID.");
3469 : :
3470 : 0 : l = send(uuid_socket, &arg_uuid, sizeof(arg_uuid), MSG_NOSIGNAL);
3471 [ # # ]: 0 : if (l < 0)
3472 [ # # ]: 0 : return log_error_errno(errno, "Failed to send machine ID: %m");
3473 [ # # ]: 0 : if (l != sizeof(arg_uuid))
3474 [ # # ]: 0 : return log_error_errno(SYNTHETIC_ERRNO(EIO),
3475 : : "Short write while sending machine ID.");
3476 : :
3477 : 0 : l = send_one_fd(notify_socket, fd, 0);
3478 [ # # ]: 0 : if (l < 0)
3479 [ # # ]: 0 : return log_error_errno(l, "Failed to send notify fd: %m");
3480 : :
3481 : 0 : pid_socket = safe_close(pid_socket);
3482 : 0 : uuid_socket = safe_close(uuid_socket);
3483 : 0 : notify_socket = safe_close(notify_socket);
3484 : 0 : master_pty_socket = safe_close(master_pty_socket);
3485 : 0 : kmsg_socket = safe_close(kmsg_socket);
3486 : 0 : rtnl_socket = safe_close(rtnl_socket);
3487 : 0 : netns_fd = safe_close(netns_fd);
3488 : :
3489 : 0 : return 0;
3490 : : }
3491 : :
3492 : 0 : static int uid_shift_pick(uid_t *shift, LockFile *ret_lock_file) {
3493 : 0 : bool tried_hashed = false;
3494 : 0 : unsigned n_tries = 100;
3495 : : uid_t candidate;
3496 : : int r;
3497 : :
3498 [ # # ]: 0 : assert(shift);
3499 [ # # ]: 0 : assert(ret_lock_file);
3500 [ # # ]: 0 : assert(arg_userns_mode == USER_NAMESPACE_PICK);
3501 [ # # ]: 0 : assert(arg_uid_range == 0x10000U);
3502 : :
3503 : 0 : candidate = *shift;
3504 : :
3505 : 0 : (void) mkdir("/run/systemd/nspawn-uid", 0755);
3506 : :
3507 : 0 : for (;;) {
3508 : : char lock_path[STRLEN("/run/systemd/nspawn-uid/") + DECIMAL_STR_MAX(uid_t) + 1];
3509 [ # # ]: 0 : _cleanup_(release_lock_file) LockFile lf = LOCK_FILE_INIT;
3510 : :
3511 [ # # ]: 0 : if (--n_tries <= 0)
3512 : 0 : return -EBUSY;
3513 : :
3514 [ # # # # ]: 0 : if (candidate < CONTAINER_UID_BASE_MIN || candidate > CONTAINER_UID_BASE_MAX)
3515 : 0 : goto next;
3516 [ # # ]: 0 : if ((candidate & UINT32_C(0xFFFF)) != 0)
3517 : 0 : goto next;
3518 : :
3519 [ # # ]: 0 : xsprintf(lock_path, "/run/systemd/nspawn-uid/" UID_FMT, candidate);
3520 : 0 : r = make_lock_file(lock_path, LOCK_EX|LOCK_NB, &lf);
3521 [ # # ]: 0 : if (r == -EBUSY) /* Range already taken by another nspawn instance */
3522 : 0 : goto next;
3523 [ # # ]: 0 : if (r < 0)
3524 : 0 : return r;
3525 : :
3526 : : /* Make some superficial checks whether the range is currently known in the user database */
3527 [ # # ]: 0 : if (getpwuid(candidate))
3528 : 0 : goto next;
3529 [ # # ]: 0 : if (getpwuid(candidate + UINT32_C(0xFFFE)))
3530 : 0 : goto next;
3531 [ # # ]: 0 : if (getgrgid(candidate))
3532 : 0 : goto next;
3533 [ # # ]: 0 : if (getgrgid(candidate + UINT32_C(0xFFFE)))
3534 : 0 : goto next;
3535 : :
3536 : 0 : *ret_lock_file = lf;
3537 : 0 : lf = (struct LockFile) LOCK_FILE_INIT;
3538 : 0 : *shift = candidate;
3539 : 0 : return 0;
3540 : :
3541 : 0 : next:
3542 [ # # # # ]: 0 : if (arg_machine && !tried_hashed) {
3543 : : /* Try to hash the base from the container name */
3544 : :
3545 : : static const uint8_t hash_key[] = {
3546 : : 0xe1, 0x56, 0xe0, 0xf0, 0x4a, 0xf0, 0x41, 0xaf,
3547 : : 0x96, 0x41, 0xcf, 0x41, 0x33, 0x94, 0xff, 0x72
3548 : : };
3549 : :
3550 : 0 : candidate = (uid_t) siphash24(arg_machine, strlen(arg_machine), hash_key);
3551 : :
3552 : 0 : tried_hashed = true;
3553 : : } else
3554 : 0 : random_bytes(&candidate, sizeof(candidate));
3555 : :
3556 : 0 : candidate = (candidate % (CONTAINER_UID_BASE_MAX - CONTAINER_UID_BASE_MIN)) + CONTAINER_UID_BASE_MIN;
3557 : 0 : candidate &= (uid_t) UINT32_C(0xFFFF0000);
3558 : : }
3559 : : }
3560 : :
3561 : 0 : static int setup_uid_map(pid_t pid) {
3562 : : char uid_map[STRLEN("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
3563 : : int r;
3564 : :
3565 [ # # ]: 0 : assert(pid > 1);
3566 : :
3567 [ # # ]: 0 : xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
3568 [ # # ]: 0 : xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
3569 : 0 : r = write_string_file(uid_map, line, WRITE_STRING_FILE_DISABLE_BUFFER);
3570 [ # # ]: 0 : if (r < 0)
3571 [ # # ]: 0 : return log_error_errno(r, "Failed to write UID map: %m");
3572 : :
3573 : : /* We always assign the same UID and GID ranges */
3574 [ # # ]: 0 : xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
3575 : 0 : r = write_string_file(uid_map, line, WRITE_STRING_FILE_DISABLE_BUFFER);
3576 [ # # ]: 0 : if (r < 0)
3577 [ # # ]: 0 : return log_error_errno(r, "Failed to write GID map: %m");
3578 : :
3579 : 0 : return 0;
3580 : : }
3581 : :
3582 : 0 : static int nspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
3583 : : char buf[NOTIFY_BUFFER_MAX+1];
3584 : 0 : char *p = NULL;
3585 : 0 : struct iovec iovec = {
3586 : : .iov_base = buf,
3587 : : .iov_len = sizeof(buf)-1,
3588 : : };
3589 : : union {
3590 : : struct cmsghdr cmsghdr;
3591 : : uint8_t buf[CMSG_SPACE(sizeof(struct ucred)) +
3592 : : CMSG_SPACE(sizeof(int) * NOTIFY_FD_MAX)];
3593 : 0 : } control = {};
3594 : 0 : struct msghdr msghdr = {
3595 : : .msg_iov = &iovec,
3596 : : .msg_iovlen = 1,
3597 : : .msg_control = &control,
3598 : : .msg_controllen = sizeof(control),
3599 : : };
3600 : : struct cmsghdr *cmsg;
3601 : 0 : struct ucred *ucred = NULL;
3602 : : ssize_t n;
3603 : : pid_t inner_child_pid;
3604 : 0 : _cleanup_strv_free_ char **tags = NULL;
3605 : :
3606 [ # # ]: 0 : assert(userdata);
3607 : :
3608 : 0 : inner_child_pid = PTR_TO_PID(userdata);
3609 : :
3610 [ # # ]: 0 : if (revents != EPOLLIN) {
3611 [ # # ]: 0 : log_warning("Got unexpected poll event for notify fd.");
3612 : 0 : return 0;
3613 : : }
3614 : :
3615 : 0 : n = recvmsg(fd, &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC);
3616 [ # # ]: 0 : if (n < 0) {
3617 [ # # # # ]: 0 : if (IN_SET(errno, EAGAIN, EINTR))
3618 : 0 : return 0;
3619 : :
3620 [ # # ]: 0 : return log_warning_errno(errno, "Couldn't read notification socket: %m");
3621 : : }
3622 : 0 : cmsg_close_all(&msghdr);
3623 : :
3624 [ # # # # ]: 0 : CMSG_FOREACH(cmsg, &msghdr) {
3625 [ # # ]: 0 : if (cmsg->cmsg_level == SOL_SOCKET &&
3626 [ # # ]: 0 : cmsg->cmsg_type == SCM_CREDENTIALS &&
3627 [ # # ]: 0 : cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred))) {
3628 : :
3629 : 0 : ucred = (struct ucred*) CMSG_DATA(cmsg);
3630 : : }
3631 : : }
3632 : :
3633 [ # # # # ]: 0 : if (!ucred || ucred->pid != inner_child_pid) {
3634 [ # # ]: 0 : log_debug("Received notify message without valid credentials. Ignoring.");
3635 : 0 : return 0;
3636 : : }
3637 : :
3638 [ # # ]: 0 : if ((size_t) n >= sizeof(buf)) {
3639 [ # # ]: 0 : log_warning("Received notify message exceeded maximum size. Ignoring.");
3640 : 0 : return 0;
3641 : : }
3642 : :
3643 : 0 : buf[n] = 0;
3644 : 0 : tags = strv_split(buf, "\n\r");
3645 [ # # ]: 0 : if (!tags)
3646 : 0 : return log_oom();
3647 : :
3648 [ # # ]: 0 : if (strv_find(tags, "READY=1"))
3649 : 0 : (void) sd_notifyf(false, "READY=1\n");
3650 : :
3651 : 0 : p = strv_find_startswith(tags, "STATUS=");
3652 [ # # ]: 0 : if (p)
3653 : 0 : (void) sd_notifyf(false, "STATUS=Container running: %s", p);
3654 : :
3655 : 0 : return 0;
3656 : : }
3657 : :
3658 : 0 : static int setup_sd_notify_parent(sd_event *event, int fd, pid_t *inner_child_pid, sd_event_source **notify_event_source) {
3659 : : int r;
3660 : :
3661 : 0 : r = sd_event_add_io(event, notify_event_source, fd, EPOLLIN, nspawn_dispatch_notify_fd, inner_child_pid);
3662 [ # # ]: 0 : if (r < 0)
3663 [ # # ]: 0 : return log_error_errno(r, "Failed to allocate notify event source: %m");
3664 : :
3665 : 0 : (void) sd_event_source_set_description(*notify_event_source, "nspawn-notify");
3666 : :
3667 : 0 : return 0;
3668 : : }
3669 : :
3670 : 0 : static int merge_settings(Settings *settings, const char *path) {
3671 : : int rl;
3672 : :
3673 [ # # ]: 0 : assert(settings);
3674 [ # # ]: 0 : assert(path);
3675 : :
3676 : : /* Copy over bits from the settings, unless they have been explicitly masked by command line switches. Note
3677 : : * that this steals the fields of the Settings* structure, and hence modifies it. */
3678 : :
3679 [ # # ]: 0 : if ((arg_settings_mask & SETTING_START_MODE) == 0 &&
3680 [ # # ]: 0 : settings->start_mode >= 0) {
3681 : 0 : arg_start_mode = settings->start_mode;
3682 : 0 : strv_free_and_replace(arg_parameters, settings->parameters);
3683 : : }
3684 : :
3685 [ # # ]: 0 : if ((arg_settings_mask & SETTING_EPHEMERAL) == 0)
3686 : 0 : arg_ephemeral = settings->ephemeral;
3687 : :
3688 [ # # ]: 0 : if ((arg_settings_mask & SETTING_DIRECTORY) == 0 &&
3689 [ # # ]: 0 : settings->root) {
3690 : :
3691 [ # # ]: 0 : if (!arg_settings_trusted)
3692 [ # # ]: 0 : log_warning("Ignoring root directory setting, file %s is not trusted.", path);
3693 : : else
3694 : 0 : free_and_replace(arg_directory, settings->root);
3695 : : }
3696 : :
3697 [ # # ]: 0 : if ((arg_settings_mask & SETTING_PIVOT_ROOT) == 0 &&
3698 [ # # ]: 0 : settings->pivot_root_new) {
3699 : 0 : free_and_replace(arg_pivot_root_new, settings->pivot_root_new);
3700 : 0 : free_and_replace(arg_pivot_root_old, settings->pivot_root_old);
3701 : : }
3702 : :
3703 [ # # ]: 0 : if ((arg_settings_mask & SETTING_WORKING_DIRECTORY) == 0 &&
3704 [ # # ]: 0 : settings->working_directory)
3705 : 0 : free_and_replace(arg_chdir, settings->working_directory);
3706 : :
3707 [ # # ]: 0 : if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
3708 [ # # ]: 0 : settings->environment)
3709 : 0 : strv_free_and_replace(arg_setenv, settings->environment);
3710 : :
3711 [ # # ]: 0 : if ((arg_settings_mask & SETTING_USER) == 0) {
3712 : :
3713 [ # # ]: 0 : if (settings->user)
3714 : 0 : free_and_replace(arg_user, settings->user);
3715 : :
3716 [ # # ]: 0 : if (uid_is_valid(settings->uid))
3717 : 0 : arg_uid = settings->uid;
3718 [ # # ]: 0 : if (gid_is_valid(settings->gid))
3719 : 0 : arg_gid = settings->gid;
3720 [ # # ]: 0 : if (settings->n_supplementary_gids > 0) {
3721 : 0 : free_and_replace(arg_supplementary_gids, settings->supplementary_gids);
3722 : 0 : arg_n_supplementary_gids = settings->n_supplementary_gids;
3723 : : }
3724 : : }
3725 : :
3726 [ # # ]: 0 : if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
3727 : : uint64_t plus, minus;
3728 : :
3729 : : /* Note that we copy both the simple plus/minus caps here, and the full quintet from the
3730 : : * Settings structure */
3731 : :
3732 : 0 : plus = settings->capability;
3733 : 0 : minus = settings->drop_capability;
3734 : :
3735 [ # # ]: 0 : if ((arg_settings_mask & SETTING_NETWORK) == 0) {
3736 [ # # ]: 0 : if (settings_private_network(settings))
3737 : 0 : plus |= UINT64_C(1) << CAP_NET_ADMIN;
3738 : : else
3739 : 0 : minus |= UINT64_C(1) << CAP_NET_ADMIN;
3740 : : }
3741 : :
3742 [ # # # # ]: 0 : if (!arg_settings_trusted && plus != 0) {
3743 [ # # ]: 0 : if (settings->capability != 0)
3744 [ # # ]: 0 : log_warning("Ignoring Capability= setting, file %s is not trusted.", path);
3745 : : } else
3746 : 0 : arg_caps_retain |= plus;
3747 : :
3748 : 0 : arg_caps_retain &= ~minus;
3749 : :
3750 : : /* Copy the full capabilities over too */
3751 [ # # ]: 0 : if (capability_quintet_is_set(&settings->full_capabilities)) {
3752 [ # # ]: 0 : if (!arg_settings_trusted)
3753 [ # # ]: 0 : log_warning("Ignoring capability settings, file %s is not trusted.", path);
3754 : : else
3755 : 0 : arg_full_capabilities = settings->full_capabilities;
3756 : : }
3757 : : }
3758 : :
3759 [ # # ]: 0 : if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
3760 [ # # ]: 0 : settings->kill_signal > 0)
3761 : 0 : arg_kill_signal = settings->kill_signal;
3762 : :
3763 [ # # ]: 0 : if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
3764 [ # # ]: 0 : settings->personality != PERSONALITY_INVALID)
3765 : 0 : arg_personality = settings->personality;
3766 : :
3767 [ # # ]: 0 : if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
3768 [ # # ]: 0 : !sd_id128_is_null(settings->machine_id)) {
3769 : :
3770 [ # # ]: 0 : if (!arg_settings_trusted)
3771 [ # # ]: 0 : log_warning("Ignoring MachineID= setting, file %s is not trusted.", path);
3772 : : else
3773 : 0 : arg_uuid = settings->machine_id;
3774 : : }
3775 : :
3776 [ # # ]: 0 : if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
3777 [ # # ]: 0 : settings->read_only >= 0)
3778 : 0 : arg_read_only = settings->read_only;
3779 : :
3780 [ # # ]: 0 : if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
3781 [ # # ]: 0 : settings->volatile_mode != _VOLATILE_MODE_INVALID)
3782 : 0 : arg_volatile_mode = settings->volatile_mode;
3783 : :
3784 [ # # ]: 0 : if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
3785 [ # # ]: 0 : settings->n_custom_mounts > 0) {
3786 : :
3787 [ # # ]: 0 : if (!arg_settings_trusted)
3788 [ # # ]: 0 : log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", path);
3789 : : else {
3790 : 0 : custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
3791 : 0 : arg_custom_mounts = TAKE_PTR(settings->custom_mounts);
3792 : 0 : arg_n_custom_mounts = settings->n_custom_mounts;
3793 : 0 : settings->n_custom_mounts = 0;
3794 : : }
3795 : : }
3796 : :
3797 [ # # ]: 0 : if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
3798 [ # # ]: 0 : (settings->private_network >= 0 ||
3799 [ # # ]: 0 : settings->network_veth >= 0 ||
3800 [ # # ]: 0 : settings->network_bridge ||
3801 [ # # ]: 0 : settings->network_zone ||
3802 [ # # ]: 0 : settings->network_interfaces ||
3803 [ # # ]: 0 : settings->network_macvlan ||
3804 [ # # ]: 0 : settings->network_ipvlan ||
3805 [ # # ]: 0 : settings->network_veth_extra ||
3806 [ # # ]: 0 : settings->network_namespace_path)) {
3807 : :
3808 [ # # ]: 0 : if (!arg_settings_trusted)
3809 [ # # ]: 0 : log_warning("Ignoring network settings, file %s is not trusted.", path);
3810 : : else {
3811 : 0 : arg_network_veth = settings_network_veth(settings);
3812 : 0 : arg_private_network = settings_private_network(settings);
3813 : :
3814 : 0 : strv_free_and_replace(arg_network_interfaces, settings->network_interfaces);
3815 : 0 : strv_free_and_replace(arg_network_macvlan, settings->network_macvlan);
3816 : 0 : strv_free_and_replace(arg_network_ipvlan, settings->network_ipvlan);
3817 : 0 : strv_free_and_replace(arg_network_veth_extra, settings->network_veth_extra);
3818 : :
3819 : 0 : free_and_replace(arg_network_bridge, settings->network_bridge);
3820 : 0 : free_and_replace(arg_network_zone, settings->network_zone);
3821 : :
3822 : 0 : free_and_replace(arg_network_namespace_path, settings->network_namespace_path);
3823 : : }
3824 : : }
3825 : :
3826 [ # # ]: 0 : if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
3827 [ # # ]: 0 : settings->expose_ports) {
3828 : :
3829 [ # # ]: 0 : if (!arg_settings_trusted)
3830 [ # # ]: 0 : log_warning("Ignoring Port= setting, file %s is not trusted.", path);
3831 : : else {
3832 : 0 : expose_port_free_all(arg_expose_ports);
3833 : 0 : arg_expose_ports = TAKE_PTR(settings->expose_ports);
3834 : : }
3835 : : }
3836 : :
3837 [ # # ]: 0 : if ((arg_settings_mask & SETTING_USERNS) == 0 &&
3838 [ # # ]: 0 : settings->userns_mode != _USER_NAMESPACE_MODE_INVALID) {
3839 : :
3840 [ # # ]: 0 : if (!arg_settings_trusted)
3841 [ # # ]: 0 : log_warning("Ignoring PrivateUsers= and PrivateUsersChown= settings, file %s is not trusted.", path);
3842 : : else {
3843 : 0 : arg_userns_mode = settings->userns_mode;
3844 : 0 : arg_uid_shift = settings->uid_shift;
3845 : 0 : arg_uid_range = settings->uid_range;
3846 : 0 : arg_userns_chown = settings->userns_chown;
3847 : : }
3848 : : }
3849 : :
3850 [ # # ]: 0 : if ((arg_settings_mask & SETTING_NOTIFY_READY) == 0)
3851 : 0 : arg_notify_ready = settings->notify_ready;
3852 : :
3853 [ # # ]: 0 : if ((arg_settings_mask & SETTING_SYSCALL_FILTER) == 0) {
3854 : :
3855 [ # # # # ]: 0 : if (!arg_settings_trusted && !strv_isempty(settings->syscall_whitelist))
3856 [ # # ]: 0 : log_warning("Ignoring SystemCallFilter= settings, file %s is not trusted.", path);
3857 : : else {
3858 : 0 : strv_free_and_replace(arg_syscall_whitelist, settings->syscall_whitelist);
3859 : 0 : strv_free_and_replace(arg_syscall_blacklist, settings->syscall_blacklist);
3860 : : }
3861 : :
3862 : : #if HAVE_SECCOMP
3863 [ # # # # ]: 0 : if (!arg_settings_trusted && settings->seccomp)
3864 [ # # ]: 0 : log_warning("Ignoring SECCOMP filter, file %s is not trusted.", path);
3865 : : else {
3866 : 0 : seccomp_release(arg_seccomp);
3867 : 0 : arg_seccomp = TAKE_PTR(settings->seccomp);
3868 : : }
3869 : : #endif
3870 : : }
3871 : :
3872 [ # # ]: 0 : for (rl = 0; rl < _RLIMIT_MAX; rl ++) {
3873 [ # # ]: 0 : if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)))
3874 : 0 : continue;
3875 : :
3876 [ # # ]: 0 : if (!settings->rlimit[rl])
3877 : 0 : continue;
3878 : :
3879 [ # # ]: 0 : if (!arg_settings_trusted) {
3880 [ # # ]: 0 : log_warning("Ignoring Limit%s= setting, file '%s' is not trusted.", rlimit_to_string(rl), path);
3881 : 0 : continue;
3882 : : }
3883 : :
3884 : 0 : free_and_replace(arg_rlimit[rl], settings->rlimit[rl]);
3885 : : }
3886 : :
3887 [ # # ]: 0 : if ((arg_settings_mask & SETTING_HOSTNAME) == 0 &&
3888 [ # # ]: 0 : settings->hostname)
3889 : 0 : free_and_replace(arg_hostname, settings->hostname);
3890 : :
3891 [ # # ]: 0 : if ((arg_settings_mask & SETTING_NO_NEW_PRIVILEGES) == 0 &&
3892 [ # # ]: 0 : settings->no_new_privileges >= 0)
3893 : 0 : arg_no_new_privileges = settings->no_new_privileges;
3894 : :
3895 [ # # ]: 0 : if ((arg_settings_mask & SETTING_OOM_SCORE_ADJUST) == 0 &&
3896 [ # # ]: 0 : settings->oom_score_adjust_set) {
3897 : :
3898 [ # # ]: 0 : if (!arg_settings_trusted)
3899 [ # # ]: 0 : log_warning("Ignoring OOMScoreAdjust= setting, file '%s' is not trusted.", path);
3900 : : else {
3901 : 0 : arg_oom_score_adjust = settings->oom_score_adjust;
3902 : 0 : arg_oom_score_adjust_set = true;
3903 : : }
3904 : : }
3905 : :
3906 [ # # ]: 0 : if ((arg_settings_mask & SETTING_CPU_AFFINITY) == 0 &&
3907 [ # # ]: 0 : settings->cpu_set.set) {
3908 : :
3909 [ # # ]: 0 : if (!arg_settings_trusted)
3910 [ # # ]: 0 : log_warning("Ignoring CPUAffinity= setting, file '%s' is not trusted.", path);
3911 : : else {
3912 : 0 : cpu_set_reset(&arg_cpu_set);
3913 : 0 : arg_cpu_set = settings->cpu_set;
3914 : 0 : settings->cpu_set = (CPUSet) {};
3915 : : }
3916 : : }
3917 : :
3918 [ # # ]: 0 : if ((arg_settings_mask & SETTING_RESOLV_CONF) == 0 &&
3919 [ # # ]: 0 : settings->resolv_conf != _RESOLV_CONF_MODE_INVALID)
3920 : 0 : arg_resolv_conf = settings->resolv_conf;
3921 : :
3922 [ # # ]: 0 : if ((arg_settings_mask & SETTING_LINK_JOURNAL) == 0 &&
3923 [ # # ]: 0 : settings->link_journal != _LINK_JOURNAL_INVALID) {
3924 : :
3925 [ # # ]: 0 : if (!arg_settings_trusted)
3926 [ # # ]: 0 : log_warning("Ignoring journal link setting, file '%s' is not trusted.", path);
3927 : : else {
3928 : 0 : arg_link_journal = settings->link_journal;
3929 : 0 : arg_link_journal_try = settings->link_journal_try;
3930 : : }
3931 : : }
3932 : :
3933 [ # # ]: 0 : if ((arg_settings_mask & SETTING_TIMEZONE) == 0 &&
3934 [ # # ]: 0 : settings->timezone != _TIMEZONE_MODE_INVALID)
3935 : 0 : arg_timezone = settings->timezone;
3936 : :
3937 [ # # ]: 0 : if ((arg_settings_mask & SETTING_SLICE) == 0 &&
3938 [ # # ]: 0 : settings->slice) {
3939 : :
3940 [ # # ]: 0 : if (!arg_settings_trusted)
3941 [ # # ]: 0 : log_warning("Ignoring slice setting, file '%s' is not trusted.", path);
3942 : : else
3943 : 0 : free_and_replace(arg_slice, settings->slice);
3944 : : }
3945 : :
3946 [ # # ]: 0 : if ((arg_settings_mask & SETTING_USE_CGNS) == 0 &&
3947 [ # # ]: 0 : settings->use_cgns >= 0) {
3948 : :
3949 [ # # ]: 0 : if (!arg_settings_trusted)
3950 [ # # ]: 0 : log_warning("Ignoring cgroup namespace setting, file '%s' is not trusted.", path);
3951 : : else
3952 : 0 : arg_use_cgns = settings->use_cgns;
3953 : : }
3954 : :
3955 [ # # ]: 0 : if ((arg_settings_mask & SETTING_CLONE_NS_FLAGS) == 0 &&
3956 [ # # ]: 0 : settings->clone_ns_flags != (unsigned long) -1) {
3957 : :
3958 [ # # ]: 0 : if (!arg_settings_trusted)
3959 [ # # ]: 0 : log_warning("Ignoring namespace setting, file '%s' is not trusted.", path);
3960 : : else
3961 : 0 : arg_clone_ns_flags = settings->clone_ns_flags;
3962 : : }
3963 : :
3964 [ # # ]: 0 : if ((arg_settings_mask & SETTING_CONSOLE_MODE) == 0 &&
3965 [ # # ]: 0 : settings->console_mode >= 0) {
3966 : :
3967 [ # # ]: 0 : if (!arg_settings_trusted)
3968 [ # # ]: 0 : log_warning("Ignoring console mode setting, file '%s' is not trusted.", path);
3969 : : else
3970 : 0 : arg_console_mode = settings->console_mode;
3971 : : }
3972 : :
3973 : : /* The following properties can only be set through the OCI settings logic, not from the command line, hence we
3974 : : * don't consult arg_settings_mask for them. */
3975 : :
3976 : 0 : sd_bus_message_unref(arg_property_message);
3977 : 0 : arg_property_message = TAKE_PTR(settings->properties);
3978 : :
3979 : 0 : arg_console_width = settings->console_width;
3980 : 0 : arg_console_height = settings->console_height;
3981 : :
3982 : 0 : device_node_array_free(arg_extra_nodes, arg_n_extra_nodes);
3983 : 0 : arg_extra_nodes = TAKE_PTR(settings->extra_nodes);
3984 : 0 : arg_n_extra_nodes = settings->n_extra_nodes;
3985 : :
3986 : 0 : return 0;
3987 : : }
3988 : :
3989 : 0 : static int load_settings(void) {
3990 : 0 : _cleanup_(settings_freep) Settings *settings = NULL;
3991 : 0 : _cleanup_fclose_ FILE *f = NULL;
3992 : 0 : _cleanup_free_ char *p = NULL;
3993 : : const char *fn, *i;
3994 : : int r;
3995 : :
3996 [ # # ]: 0 : if (arg_oci_bundle)
3997 : 0 : return 0;
3998 : :
3999 : : /* If all settings are masked, there's no point in looking for
4000 : : * the settings file */
4001 [ # # ]: 0 : if ((arg_settings_mask & _SETTINGS_MASK_ALL) == _SETTINGS_MASK_ALL)
4002 : 0 : return 0;
4003 : :
4004 [ # # # # : 0 : fn = strjoina(arg_machine, ".nspawn");
# # # # #
# # # ]
4005 : :
4006 : : /* We first look in the admin's directories in /etc and /run */
4007 [ # # ]: 0 : FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
4008 [ # # # ]: 0 : _cleanup_free_ char *j = NULL;
4009 : :
4010 : 0 : j = path_join(i, fn);
4011 [ # # ]: 0 : if (!j)
4012 : 0 : return log_oom();
4013 : :
4014 : 0 : f = fopen(j, "re");
4015 [ # # ]: 0 : if (f) {
4016 : 0 : p = TAKE_PTR(j);
4017 : :
4018 : : /* By default, we trust configuration from /etc and /run */
4019 [ # # ]: 0 : if (arg_settings_trusted < 0)
4020 : 0 : arg_settings_trusted = true;
4021 : :
4022 : 0 : break;
4023 : : }
4024 : :
4025 [ # # ]: 0 : if (errno != ENOENT)
4026 [ # # ]: 0 : return log_error_errno(errno, "Failed to open %s: %m", j);
4027 : : }
4028 : :
4029 [ # # ]: 0 : if (!f) {
4030 : : /* After that, let's look for a file next to the
4031 : : * actual image we shall boot. */
4032 : :
4033 [ # # ]: 0 : if (arg_image) {
4034 : 0 : p = file_in_same_dir(arg_image, fn);
4035 [ # # ]: 0 : if (!p)
4036 : 0 : return log_oom();
4037 [ # # # # ]: 0 : } else if (arg_directory && !path_equal(arg_directory, "/")) {
4038 : 0 : p = file_in_same_dir(arg_directory, fn);
4039 [ # # ]: 0 : if (!p)
4040 : 0 : return log_oom();
4041 : : }
4042 : :
4043 [ # # ]: 0 : if (p) {
4044 : 0 : f = fopen(p, "re");
4045 [ # # # # ]: 0 : if (!f && errno != ENOENT)
4046 [ # # ]: 0 : return log_error_errno(errno, "Failed to open %s: %m", p);
4047 : :
4048 : : /* By default, we do not trust configuration from /var/lib/machines */
4049 [ # # ]: 0 : if (arg_settings_trusted < 0)
4050 : 0 : arg_settings_trusted = false;
4051 : : }
4052 : : }
4053 : :
4054 [ # # ]: 0 : if (!f)
4055 : 0 : return 0;
4056 : :
4057 [ # # ]: 0 : log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
4058 : :
4059 : 0 : r = settings_load(f, p, &settings);
4060 [ # # ]: 0 : if (r < 0)
4061 : 0 : return r;
4062 : :
4063 : 0 : return merge_settings(settings, p);
4064 : : }
4065 : :
4066 : 0 : static int load_oci_bundle(void) {
4067 : 0 : _cleanup_(settings_freep) Settings *settings = NULL;
4068 : : int r;
4069 : :
4070 [ # # ]: 0 : if (!arg_oci_bundle)
4071 : 0 : return 0;
4072 : :
4073 : : /* By default let's trust OCI bundles */
4074 [ # # ]: 0 : if (arg_settings_trusted < 0)
4075 : 0 : arg_settings_trusted = true;
4076 : :
4077 : 0 : r = oci_load(NULL, arg_oci_bundle, &settings);
4078 [ # # ]: 0 : if (r < 0)
4079 : 0 : return r;
4080 : :
4081 : 0 : return merge_settings(settings, arg_oci_bundle);
4082 : : }
4083 : :
4084 : 0 : static int run_container(
4085 : : DissectedImage *dissected_image,
4086 : : bool secondary,
4087 : : FDSet *fds,
4088 : : char veth_name[IFNAMSIZ], bool *veth_created,
4089 : : union in_addr_union *exposed,
4090 : : int *master, pid_t *pid, int *ret) {
4091 : :
4092 : : static const struct sigaction sa = {
4093 : : .sa_handler = nop_signal_handler,
4094 : : .sa_flags = SA_NOCLDSTOP|SA_RESTART,
4095 : : };
4096 : :
4097 : 0 : _cleanup_(release_lock_file) LockFile uid_shift_lock = LOCK_FILE_INIT;
4098 : 0 : _cleanup_close_ int etc_passwd_lock = -1;
4099 : : _cleanup_close_pair_ int
4100 : 0 : kmsg_socket_pair[2] = { -1, -1 },
4101 : 0 : rtnl_socket_pair[2] = { -1, -1 },
4102 : 0 : pid_socket_pair[2] = { -1, -1 },
4103 : 0 : uuid_socket_pair[2] = { -1, -1 },
4104 : 0 : notify_socket_pair[2] = { -1, -1 },
4105 : 0 : uid_shift_socket_pair[2] = { -1, -1 },
4106 : 0 : master_pty_socket_pair[2] = { -1, -1 },
4107 : 0 : unified_cgroup_hierarchy_socket_pair[2] = { -1, -1};
4108 : :
4109 : 0 : _cleanup_close_ int notify_socket = -1;
4110 : 0 : _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
4111 : 0 : _cleanup_(sd_event_source_unrefp) sd_event_source *notify_event_source = NULL;
4112 : 0 : _cleanup_(sd_event_unrefp) sd_event *event = NULL;
4113 : 0 : _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4114 : 0 : _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
4115 : 0 : _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
4116 : 0 : ContainerStatus container_status = 0;
4117 : 0 : int ifi = 0, r;
4118 : : ssize_t l;
4119 : : sigset_t mask_chld;
4120 : 0 : _cleanup_close_ int netns_fd = -1;
4121 : :
4122 [ # # ]: 0 : assert_se(sigemptyset(&mask_chld) == 0);
4123 [ # # ]: 0 : assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
4124 : :
4125 [ # # ]: 0 : if (arg_userns_mode == USER_NAMESPACE_PICK) {
4126 : : /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely
4127 : : * check with getpwuid() if the specific user already exists. Note that /etc might be
4128 : : * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we
4129 : : * can be reasonably sure that no users are going to be added. Note that getpwuid() checks are
4130 : : * really just an extra safety net. We kinda assume that the UID range we allocate from is
4131 : : * really ours. */
4132 : :
4133 : 0 : etc_passwd_lock = take_etc_passwd_lock(NULL);
4134 [ # # # # ]: 0 : if (etc_passwd_lock < 0 && etc_passwd_lock != -EROFS)
4135 [ # # ]: 0 : return log_error_errno(etc_passwd_lock, "Failed to take /etc/passwd lock: %m");
4136 : : }
4137 : :
4138 : 0 : r = barrier_create(&barrier);
4139 [ # # ]: 0 : if (r < 0)
4140 [ # # ]: 0 : return log_error_errno(r, "Cannot initialize IPC barrier: %m");
4141 : :
4142 [ # # ]: 0 : if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0)
4143 [ # # ]: 0 : return log_error_errno(errno, "Failed to create kmsg socket pair: %m");
4144 : :
4145 [ # # ]: 0 : if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0)
4146 [ # # ]: 0 : return log_error_errno(errno, "Failed to create rtnl socket pair: %m");
4147 : :
4148 [ # # ]: 0 : if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0)
4149 [ # # ]: 0 : return log_error_errno(errno, "Failed to create pid socket pair: %m");
4150 : :
4151 [ # # ]: 0 : if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uuid_socket_pair) < 0)
4152 [ # # ]: 0 : return log_error_errno(errno, "Failed to create id socket pair: %m");
4153 : :
4154 [ # # ]: 0 : if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, notify_socket_pair) < 0)
4155 [ # # ]: 0 : return log_error_errno(errno, "Failed to create notify socket pair: %m");
4156 : :
4157 [ # # ]: 0 : if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, master_pty_socket_pair) < 0)
4158 [ # # ]: 0 : return log_error_errno(errno, "Failed to create console socket pair: %m");
4159 : :
4160 [ # # ]: 0 : if (arg_userns_mode != USER_NAMESPACE_NO)
4161 [ # # ]: 0 : if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0)
4162 [ # # ]: 0 : return log_error_errno(errno, "Failed to create uid shift socket pair: %m");
4163 : :
4164 [ # # ]: 0 : if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN)
4165 [ # # ]: 0 : if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, unified_cgroup_hierarchy_socket_pair) < 0)
4166 [ # # ]: 0 : return log_error_errno(errno, "Failed to create unified cgroup socket pair: %m");
4167 : :
4168 : : /* Child can be killed before execv(), so handle SIGCHLD in order to interrupt
4169 : : * parent's blocking calls and give it a chance to call wait() and terminate. */
4170 : 0 : r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
4171 [ # # ]: 0 : if (r < 0)
4172 [ # # ]: 0 : return log_error_errno(errno, "Failed to change the signal mask: %m");
4173 : :
4174 : 0 : r = sigaction(SIGCHLD, &sa, NULL);
4175 [ # # ]: 0 : if (r < 0)
4176 [ # # ]: 0 : return log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
4177 : :
4178 [ # # ]: 0 : if (arg_network_namespace_path) {
4179 : 0 : netns_fd = open(arg_network_namespace_path, O_RDONLY|O_NOCTTY|O_CLOEXEC);
4180 [ # # ]: 0 : if (netns_fd < 0)
4181 [ # # ]: 0 : return log_error_errno(errno, "Cannot open file %s: %m", arg_network_namespace_path);
4182 : :
4183 : 0 : r = fd_is_network_ns(netns_fd);
4184 [ # # ]: 0 : if (r == -EUCLEAN)
4185 [ # # ]: 0 : log_debug_errno(r, "Cannot determine if passed network namespace path '%s' really refers to a network namespace, assuming it does.", arg_network_namespace_path);
4186 [ # # ]: 0 : else if (r < 0)
4187 [ # # ]: 0 : return log_error_errno(r, "Failed to check %s fs type: %m", arg_network_namespace_path);
4188 [ # # ]: 0 : else if (r == 0)
4189 [ # # ]: 0 : return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
4190 : : "Path %s doesn't refer to a network namespace, refusing.", arg_network_namespace_path);
4191 : : }
4192 : :
4193 : 0 : *pid = raw_clone(SIGCHLD|CLONE_NEWNS);
4194 [ # # ]: 0 : if (*pid < 0)
4195 [ # # # # ]: 0 : return log_error_errno(errno, "clone() failed%s: %m",
4196 : : errno == EINVAL ?
4197 : : ", do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in)" : "");
4198 : :
4199 [ # # ]: 0 : if (*pid == 0) {
4200 : : /* The outer child only has a file system namespace. */
4201 : 0 : barrier_set_role(&barrier, BARRIER_CHILD);
4202 : :
4203 : 0 : kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
4204 : 0 : rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4205 : 0 : pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
4206 : 0 : uuid_socket_pair[0] = safe_close(uuid_socket_pair[0]);
4207 : 0 : notify_socket_pair[0] = safe_close(notify_socket_pair[0]);
4208 : 0 : master_pty_socket_pair[0] = safe_close(master_pty_socket_pair[0]);
4209 : 0 : uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
4210 : 0 : unified_cgroup_hierarchy_socket_pair[0] = safe_close(unified_cgroup_hierarchy_socket_pair[0]);
4211 : :
4212 : 0 : (void) reset_all_signal_handlers();
4213 : 0 : (void) reset_signal_mask();
4214 : :
4215 : 0 : r = outer_child(&barrier,
4216 : : arg_directory,
4217 : : dissected_image,
4218 : : secondary,
4219 : : pid_socket_pair[1],
4220 : : uuid_socket_pair[1],
4221 : : notify_socket_pair[1],
4222 : : kmsg_socket_pair[1],
4223 : : rtnl_socket_pair[1],
4224 : : uid_shift_socket_pair[1],
4225 : : master_pty_socket_pair[1],
4226 : : unified_cgroup_hierarchy_socket_pair[1],
4227 : : fds,
4228 : : netns_fd);
4229 [ # # ]: 0 : if (r < 0)
4230 : 0 : _exit(EXIT_FAILURE);
4231 : :
4232 : 0 : _exit(EXIT_SUCCESS);
4233 : : }
4234 : :
4235 : 0 : barrier_set_role(&barrier, BARRIER_PARENT);
4236 : :
4237 : 0 : fdset_close(fds);
4238 : :
4239 : 0 : kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
4240 : 0 : rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4241 : 0 : pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
4242 : 0 : uuid_socket_pair[1] = safe_close(uuid_socket_pair[1]);
4243 : 0 : notify_socket_pair[1] = safe_close(notify_socket_pair[1]);
4244 : 0 : master_pty_socket_pair[1] = safe_close(master_pty_socket_pair[1]);
4245 : 0 : uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
4246 : 0 : unified_cgroup_hierarchy_socket_pair[1] = safe_close(unified_cgroup_hierarchy_socket_pair[1]);
4247 : :
4248 [ # # ]: 0 : if (arg_userns_mode != USER_NAMESPACE_NO) {
4249 : : /* The child just let us know the UID shift it might have read from the image. */
4250 : 0 : l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, 0);
4251 [ # # ]: 0 : if (l < 0)
4252 [ # # ]: 0 : return log_error_errno(errno, "Failed to read UID shift: %m");
4253 [ # # ]: 0 : if (l != sizeof arg_uid_shift)
4254 [ # # ]: 0 : return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading UID shift.");
4255 : :
4256 [ # # ]: 0 : if (arg_userns_mode == USER_NAMESPACE_PICK) {
4257 : : /* If we are supposed to pick the UID shift, let's try to use the shift read from the
4258 : : * image, but if that's already in use, pick a new one, and report back to the child,
4259 : : * which one we now picked. */
4260 : :
4261 : 0 : r = uid_shift_pick(&arg_uid_shift, &uid_shift_lock);
4262 [ # # ]: 0 : if (r < 0)
4263 [ # # ]: 0 : return log_error_errno(r, "Failed to pick suitable UID/GID range: %m");
4264 : :
4265 : 0 : l = send(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, MSG_NOSIGNAL);
4266 [ # # ]: 0 : if (l < 0)
4267 [ # # ]: 0 : return log_error_errno(errno, "Failed to send UID shift: %m");
4268 [ # # ]: 0 : if (l != sizeof arg_uid_shift)
4269 [ # # ]: 0 : return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short write while writing UID shift.");
4270 : : }
4271 : : }
4272 : :
4273 [ # # ]: 0 : if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
4274 : : /* The child let us know the support cgroup mode it might have read from the image. */
4275 : 0 : l = recv(unified_cgroup_hierarchy_socket_pair[0], &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), 0);
4276 [ # # ]: 0 : if (l < 0)
4277 [ # # ]: 0 : return log_error_errno(errno, "Failed to read cgroup mode: %m");
4278 [ # # ]: 0 : if (l != sizeof(arg_unified_cgroup_hierarchy))
4279 [ # # # # ]: 0 : return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading cgroup mode (%zu bytes).%s",
4280 : : l, l == 0 ? " The child is most likely dead." : "");
4281 : : }
4282 : :
4283 : : /* Wait for the outer child. */
4284 : 0 : r = wait_for_terminate_and_check("(sd-namespace)", *pid, WAIT_LOG_ABNORMAL);
4285 [ # # ]: 0 : if (r < 0)
4286 : 0 : return r;
4287 [ # # ]: 0 : if (r != EXIT_SUCCESS)
4288 : 0 : return -EIO;
4289 : :
4290 : : /* And now retrieve the PID of the inner child. */
4291 : 0 : l = recv(pid_socket_pair[0], pid, sizeof *pid, 0);
4292 [ # # ]: 0 : if (l < 0)
4293 [ # # ]: 0 : return log_error_errno(errno, "Failed to read inner child PID: %m");
4294 [ # # ]: 0 : if (l != sizeof *pid)
4295 [ # # ]: 0 : return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading inner child PID.");
4296 : :
4297 : : /* We also retrieve container UUID in case it was generated by outer child */
4298 : 0 : l = recv(uuid_socket_pair[0], &arg_uuid, sizeof arg_uuid, 0);
4299 [ # # ]: 0 : if (l < 0)
4300 [ # # ]: 0 : return log_error_errno(errno, "Failed to read container machine ID: %m");
4301 [ # # ]: 0 : if (l != sizeof(arg_uuid))
4302 [ # # ]: 0 : return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading container machined ID.");
4303 : :
4304 : : /* We also retrieve the socket used for notifications generated by outer child */
4305 : 0 : notify_socket = receive_one_fd(notify_socket_pair[0], 0);
4306 [ # # ]: 0 : if (notify_socket < 0)
4307 [ # # ]: 0 : return log_error_errno(notify_socket,
4308 : : "Failed to receive notification socket from the outer child: %m");
4309 : :
4310 [ # # ]: 0 : log_debug("Init process invoked as PID "PID_FMT, *pid);
4311 : :
4312 [ # # ]: 0 : if (arg_userns_mode != USER_NAMESPACE_NO) {
4313 [ # # ]: 0 : if (!barrier_place_and_sync(&barrier)) /* #1 */
4314 [ # # ]: 0 : return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
4315 : :
4316 : 0 : r = setup_uid_map(*pid);
4317 [ # # ]: 0 : if (r < 0)
4318 : 0 : return r;
4319 : :
4320 : 0 : (void) barrier_place(&barrier); /* #2 */
4321 : : }
4322 : :
4323 [ # # ]: 0 : if (arg_private_network) {
4324 [ # # ]: 0 : if (!arg_network_namespace_path) {
4325 : : /* Wait until the child has unshared its network namespace. */
4326 [ # # ]: 0 : if (!barrier_place_and_sync(&barrier)) /* #3 */
4327 [ # # ]: 0 : return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early");
4328 : : }
4329 : :
4330 : 0 : r = move_network_interfaces(*pid, arg_network_interfaces);
4331 [ # # ]: 0 : if (r < 0)
4332 : 0 : return r;
4333 : :
4334 [ # # ]: 0 : if (arg_network_veth) {
4335 : 0 : r = setup_veth(arg_machine, *pid, veth_name,
4336 [ # # # # ]: 0 : arg_network_bridge || arg_network_zone);
4337 [ # # ]: 0 : if (r < 0)
4338 : 0 : return r;
4339 [ # # ]: 0 : else if (r > 0)
4340 : 0 : ifi = r;
4341 : :
4342 [ # # ]: 0 : if (arg_network_bridge) {
4343 : : /* Add the interface to a bridge */
4344 : 0 : r = setup_bridge(veth_name, arg_network_bridge, false);
4345 [ # # ]: 0 : if (r < 0)
4346 : 0 : return r;
4347 [ # # ]: 0 : if (r > 0)
4348 : 0 : ifi = r;
4349 [ # # ]: 0 : } else if (arg_network_zone) {
4350 : : /* Add the interface to a bridge, possibly creating it */
4351 : 0 : r = setup_bridge(veth_name, arg_network_zone, true);
4352 [ # # ]: 0 : if (r < 0)
4353 : 0 : return r;
4354 [ # # ]: 0 : if (r > 0)
4355 : 0 : ifi = r;
4356 : : }
4357 : : }
4358 : :
4359 : 0 : r = setup_veth_extra(arg_machine, *pid, arg_network_veth_extra);
4360 [ # # ]: 0 : if (r < 0)
4361 : 0 : return r;
4362 : :
4363 : : /* We created the primary and extra veth links now; let's remember this, so that we know to
4364 : : remove them later on. Note that we don't bother with removing veth links that were created
4365 : : here when their setup failed half-way, because in that case the kernel should be able to
4366 : : remove them on its own, since they cannot be referenced by anything yet. */
4367 : 0 : *veth_created = true;
4368 : :
4369 : 0 : r = setup_macvlan(arg_machine, *pid, arg_network_macvlan);
4370 [ # # ]: 0 : if (r < 0)
4371 : 0 : return r;
4372 : :
4373 : 0 : r = setup_ipvlan(arg_machine, *pid, arg_network_ipvlan);
4374 [ # # ]: 0 : if (r < 0)
4375 : 0 : return r;
4376 : : }
4377 : :
4378 [ # # # # ]: 0 : if (arg_register || !arg_keep_unit) {
4379 : 0 : r = sd_bus_default_system(&bus);
4380 [ # # ]: 0 : if (r < 0)
4381 [ # # ]: 0 : return log_error_errno(r, "Failed to open system bus: %m");
4382 : :
4383 : 0 : r = sd_bus_set_close_on_exit(bus, false);
4384 [ # # ]: 0 : if (r < 0)
4385 [ # # ]: 0 : return log_error_errno(r, "Failed to disable close-on-exit behaviour: %m");
4386 : : }
4387 : :
4388 [ # # ]: 0 : if (!arg_keep_unit) {
4389 : : /* When a new scope is created for this container, then we'll be registered as its controller, in which
4390 : : * case PID 1 will send us a friendly RequestStop signal, when it is asked to terminate the
4391 : : * scope. Let's hook into that, and cleanly shut down the container, and print a friendly message. */
4392 : :
4393 : 0 : r = sd_bus_match_signal_async(
4394 : : bus,
4395 : : NULL,
4396 : : "org.freedesktop.systemd1",
4397 : : NULL,
4398 : : "org.freedesktop.systemd1.Scope",
4399 : : "RequestStop",
4400 : : on_request_stop, NULL, PID_TO_PTR(*pid));
4401 [ # # ]: 0 : if (r < 0)
4402 [ # # ]: 0 : return log_error_errno(r, "Failed to request RequestStop match: %m");
4403 : : }
4404 : :
4405 [ # # ]: 0 : if (arg_register) {
4406 : 0 : r = register_machine(
4407 : : bus,
4408 : : arg_machine,
4409 : : *pid,
4410 : : arg_directory,
4411 : : arg_uuid,
4412 : : ifi,
4413 : : arg_slice,
4414 : : arg_custom_mounts, arg_n_custom_mounts,
4415 : : arg_kill_signal,
4416 : : arg_property,
4417 : : arg_property_message,
4418 : : arg_keep_unit,
4419 : : arg_container_service_name);
4420 [ # # ]: 0 : if (r < 0)
4421 : 0 : return r;
4422 : :
4423 [ # # ]: 0 : } else if (!arg_keep_unit) {
4424 : 0 : r = allocate_scope(
4425 : : bus,
4426 : : arg_machine,
4427 : : *pid,
4428 : : arg_slice,
4429 : : arg_custom_mounts, arg_n_custom_mounts,
4430 : : arg_kill_signal,
4431 : : arg_property,
4432 : : arg_property_message);
4433 [ # # ]: 0 : if (r < 0)
4434 : 0 : return r;
4435 : :
4436 [ # # # # ]: 0 : } else if (arg_slice || arg_property)
4437 [ # # ]: 0 : log_notice("Machine and scope registration turned off, --slice= and --property= settings will have no effect.");
4438 : :
4439 : 0 : r = create_subcgroup(*pid, arg_keep_unit, arg_unified_cgroup_hierarchy);
4440 [ # # ]: 0 : if (r < 0)
4441 : 0 : return r;
4442 : :
4443 : 0 : r = sync_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
4444 [ # # ]: 0 : if (r < 0)
4445 : 0 : return r;
4446 : :
4447 : 0 : r = chown_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
4448 [ # # ]: 0 : if (r < 0)
4449 : 0 : return r;
4450 : :
4451 : : /* Notify the child that the parent is ready with all
4452 : : * its setup (including cgroup-ification), and that
4453 : : * the child can now hand over control to the code to
4454 : : * run inside the container. */
4455 : 0 : (void) barrier_place(&barrier); /* #4 */
4456 : :
4457 : : /* Block SIGCHLD here, before notifying child.
4458 : : * process_pty() will handle it with the other signals. */
4459 [ # # ]: 0 : assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
4460 : :
4461 : : /* Reset signal to default */
4462 : 0 : r = default_signals(SIGCHLD, -1);
4463 [ # # ]: 0 : if (r < 0)
4464 [ # # ]: 0 : return log_error_errno(r, "Failed to reset SIGCHLD: %m");
4465 : :
4466 : 0 : r = sd_event_new(&event);
4467 [ # # ]: 0 : if (r < 0)
4468 [ # # ]: 0 : return log_error_errno(r, "Failed to get default event source: %m");
4469 : :
4470 : 0 : (void) sd_event_set_watchdog(event, true);
4471 : :
4472 [ # # ]: 0 : if (bus) {
4473 : 0 : r = sd_bus_attach_event(bus, event, 0);
4474 [ # # ]: 0 : if (r < 0)
4475 [ # # ]: 0 : return log_error_errno(r, "Failed to attach bus to event loop: %m");
4476 : : }
4477 : :
4478 : 0 : r = setup_sd_notify_parent(event, notify_socket, PID_TO_PTR(*pid), ¬ify_event_source);
4479 [ # # ]: 0 : if (r < 0)
4480 : 0 : return r;
4481 : :
4482 : : /* Let the child know that we are ready and wait that the child is completely ready now. */
4483 [ # # ]: 0 : if (!barrier_place_and_sync(&barrier)) /* #5 */
4484 [ # # ]: 0 : return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
4485 : :
4486 : : /* At this point we have made use of the UID we picked, and thus nss-mymachines
4487 : : * will make them appear in getpwuid(), thus we can release the /etc/passwd lock. */
4488 : 0 : etc_passwd_lock = safe_close(etc_passwd_lock);
4489 : :
4490 : 0 : (void) sd_notifyf(false,
4491 : : "STATUS=Container running.\n"
4492 : : "X_NSPAWN_LEADER_PID=" PID_FMT, *pid);
4493 [ # # ]: 0 : if (!arg_notify_ready)
4494 : 0 : (void) sd_notify(false, "READY=1\n");
4495 : :
4496 [ # # ]: 0 : if (arg_kill_signal > 0) {
4497 : : /* Try to kill the init system on SIGINT or SIGTERM */
4498 : 0 : (void) sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(*pid));
4499 : 0 : (void) sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(*pid));
4500 : : } else {
4501 : : /* Immediately exit */
4502 : 0 : (void) sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
4503 : 0 : (void) sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
4504 : : }
4505 : :
4506 : : /* Exit when the child exits */
4507 : 0 : (void) sd_event_add_signal(event, NULL, SIGCHLD, on_sigchld, PID_TO_PTR(*pid));
4508 : :
4509 [ # # ]: 0 : if (arg_expose_ports) {
4510 : 0 : r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, exposed, &rtnl);
4511 [ # # ]: 0 : if (r < 0)
4512 : 0 : return r;
4513 : :
4514 : 0 : (void) expose_port_execute(rtnl, arg_expose_ports, exposed);
4515 : : }
4516 : :
4517 : 0 : rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4518 : :
4519 [ # # ]: 0 : if (arg_console_mode != CONSOLE_PIPE) {
4520 [ # # ]: 0 : _cleanup_close_ int fd = -1;
4521 : 0 : PTYForwardFlags flags = 0;
4522 : :
4523 : : /* Retrieve the master pty allocated by inner child */
4524 : 0 : fd = receive_one_fd(master_pty_socket_pair[0], 0);
4525 [ # # ]: 0 : if (fd < 0)
4526 [ # # ]: 0 : return log_error_errno(fd, "Failed to receive master pty from the inner child: %m");
4527 : :
4528 [ # # # ]: 0 : switch (arg_console_mode) {
4529 : :
4530 : 0 : case CONSOLE_READ_ONLY:
4531 : 0 : flags |= PTY_FORWARD_READ_ONLY;
4532 : :
4533 : : _fallthrough_;
4534 : :
4535 : 0 : case CONSOLE_INTERACTIVE:
4536 : 0 : flags |= PTY_FORWARD_IGNORE_VHANGUP;
4537 : :
4538 : 0 : r = pty_forward_new(event, fd, flags, &forward);
4539 [ # # ]: 0 : if (r < 0)
4540 [ # # ]: 0 : return log_error_errno(r, "Failed to create PTY forwarder: %m");
4541 : :
4542 [ # # # # ]: 0 : if (arg_console_width != (unsigned) -1 || arg_console_height != (unsigned) -1)
4543 : 0 : (void) pty_forward_set_width_height(forward,
4544 : : arg_console_width,
4545 : : arg_console_height);
4546 : 0 : break;
4547 : :
4548 : 0 : default:
4549 [ # # ]: 0 : assert(arg_console_mode == CONSOLE_PASSIVE);
4550 : : }
4551 : :
4552 : 0 : *master = TAKE_FD(fd);
4553 : : }
4554 : :
4555 : 0 : r = sd_event_loop(event);
4556 [ # # ]: 0 : if (r < 0)
4557 [ # # ]: 0 : return log_error_errno(r, "Failed to run event loop: %m");
4558 : :
4559 [ # # ]: 0 : if (forward) {
4560 : 0 : char last_char = 0;
4561 : :
4562 : 0 : (void) pty_forward_get_last_char(forward, &last_char);
4563 : 0 : forward = pty_forward_free(forward);
4564 : :
4565 [ # # # # ]: 0 : if (!arg_quiet && last_char != '\n')
4566 : 0 : putc('\n', stdout);
4567 : : }
4568 : :
4569 : : /* Kill if it is not dead yet anyway */
4570 [ # # ]: 0 : if (bus) {
4571 [ # # ]: 0 : if (arg_register)
4572 : 0 : terminate_machine(bus, arg_machine);
4573 [ # # ]: 0 : else if (!arg_keep_unit)
4574 : 0 : terminate_scope(bus, arg_machine);
4575 : : }
4576 : :
4577 : : /* Normally redundant, but better safe than sorry */
4578 : 0 : (void) kill(*pid, SIGKILL);
4579 : :
4580 : 0 : r = wait_for_container(*pid, &container_status);
4581 : 0 : *pid = 0;
4582 : :
4583 [ # # ]: 0 : if (r < 0)
4584 : : /* We failed to wait for the container, or the container exited abnormally. */
4585 : 0 : return r;
4586 [ # # # # ]: 0 : if (r > 0 || container_status == CONTAINER_TERMINATED) {
4587 : : /* r > 0 → The container exited with a non-zero status.
4588 : : * As a special case, we need to replace 133 with a different value,
4589 : : * because 133 is special-cased in the service file to reboot the container.
4590 : : * otherwise → The container exited with zero status and a reboot was not requested.
4591 : : */
4592 [ # # ]: 0 : if (r == EXIT_FORCE_RESTART)
4593 : 0 : r = EXIT_FAILURE; /* replace 133 with the general failure code */
4594 : 0 : *ret = r;
4595 : 0 : return 0; /* finito */
4596 : : }
4597 : :
4598 : : /* CONTAINER_REBOOTED, loop again */
4599 : :
4600 [ # # ]: 0 : if (arg_keep_unit) {
4601 : : /* Special handling if we are running as a service: instead of simply
4602 : : * restarting the machine we want to restart the entire service, so let's
4603 : : * inform systemd about this with the special exit code 133. The service
4604 : : * file uses RestartForceExitStatus=133 so that this results in a full
4605 : : * nspawn restart. This is necessary since we might have cgroup parameters
4606 : : * set we want to have flushed out. */
4607 : 0 : *ret = EXIT_FORCE_RESTART;
4608 : 0 : return 0; /* finito */
4609 : : }
4610 : :
4611 : 0 : expose_port_flush(arg_expose_ports, exposed);
4612 : :
4613 : 0 : (void) remove_veth_links(veth_name, arg_network_veth_extra);
4614 : 0 : *veth_created = false;
4615 : 0 : return 1; /* loop again */
4616 : : }
4617 : :
4618 : 0 : static int initialize_rlimits(void) {
4619 : : /* The default resource limits the kernel passes to PID 1, as per kernel 4.16. Let's pass our container payload
4620 : : * the same values as the kernel originally passed to PID 1, in order to minimize differences between host and
4621 : : * container execution environments. */
4622 : :
4623 : : static const struct rlimit kernel_defaults[_RLIMIT_MAX] = {
4624 : : [RLIMIT_AS] = { RLIM_INFINITY, RLIM_INFINITY },
4625 : : [RLIMIT_CORE] = { 0, RLIM_INFINITY },
4626 : : [RLIMIT_CPU] = { RLIM_INFINITY, RLIM_INFINITY },
4627 : : [RLIMIT_DATA] = { RLIM_INFINITY, RLIM_INFINITY },
4628 : : [RLIMIT_FSIZE] = { RLIM_INFINITY, RLIM_INFINITY },
4629 : : [RLIMIT_LOCKS] = { RLIM_INFINITY, RLIM_INFINITY },
4630 : : [RLIMIT_MEMLOCK] = { 65536, 65536 },
4631 : : [RLIMIT_MSGQUEUE] = { 819200, 819200 },
4632 : : [RLIMIT_NICE] = { 0, 0 },
4633 : : [RLIMIT_NOFILE] = { 1024, 4096 },
4634 : : [RLIMIT_RSS] = { RLIM_INFINITY, RLIM_INFINITY },
4635 : : [RLIMIT_RTPRIO] = { 0, 0 },
4636 : : [RLIMIT_RTTIME] = { RLIM_INFINITY, RLIM_INFINITY },
4637 : : [RLIMIT_STACK] = { 8388608, RLIM_INFINITY },
4638 : :
4639 : : /* The kernel scales the default for RLIMIT_NPROC and RLIMIT_SIGPENDING based on the system's amount of
4640 : : * RAM. To provide best compatibility we'll read these limits off PID 1 instead of hardcoding them
4641 : : * here. This is safe as we know that PID 1 doesn't change these two limits and thus the original
4642 : : * kernel's initialization should still be valid during runtime — at least if PID 1 is systemd. Note
4643 : : * that PID 1 changes a number of other resource limits during early initialization which is why we
4644 : : * don't read the other limits from PID 1 but prefer the static table above. */
4645 : : };
4646 : :
4647 : : int rl;
4648 : :
4649 [ # # ]: 0 : for (rl = 0; rl < _RLIMIT_MAX; rl++) {
4650 : : /* Let's only fill in what the user hasn't explicitly configured anyway */
4651 [ # # ]: 0 : if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)) == 0) {
4652 : : const struct rlimit *v;
4653 : : struct rlimit buffer;
4654 : :
4655 [ # # # # ]: 0 : if (IN_SET(rl, RLIMIT_NPROC, RLIMIT_SIGPENDING)) {
4656 : : /* For these two let's read the limits off PID 1. See above for an explanation. */
4657 : :
4658 [ # # ]: 0 : if (prlimit(1, rl, NULL, &buffer) < 0)
4659 [ # # ]: 0 : return log_error_errno(errno, "Failed to read resource limit RLIMIT_%s of PID 1: %m", rlimit_to_string(rl));
4660 : :
4661 : 0 : v = &buffer;
4662 : : } else
4663 : 0 : v = kernel_defaults + rl;
4664 : :
4665 : 0 : arg_rlimit[rl] = newdup(struct rlimit, v, 1);
4666 [ # # ]: 0 : if (!arg_rlimit[rl])
4667 : 0 : return log_oom();
4668 : : }
4669 : :
4670 [ # # ]: 0 : if (DEBUG_LOGGING) {
4671 : 0 : _cleanup_free_ char *k = NULL;
4672 : :
4673 : 0 : (void) rlimit_format(arg_rlimit[rl], &k);
4674 [ # # ]: 0 : log_debug("Setting RLIMIT_%s to %s.", rlimit_to_string(rl), k);
4675 : : }
4676 : : }
4677 : :
4678 : 0 : return 0;
4679 : : }
4680 : :
4681 : 16 : static int run(int argc, char *argv[]) {
4682 : 16 : bool secondary = false, remove_directory = false, remove_image = false,
4683 : 16 : veth_created = false, remove_tmprootdir = false;
4684 : 16 : _cleanup_close_ int master = -1;
4685 : 16 : _cleanup_fdset_free_ FDSet *fds = NULL;
4686 : 16 : int r, n_fd_passed, ret = EXIT_SUCCESS;
4687 : 16 : char veth_name[IFNAMSIZ] = "";
4688 : 16 : union in_addr_union exposed = {};
4689 : 16 : _cleanup_(release_lock_file) LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
4690 : 16 : char tmprootdir[] = "/tmp/nspawn-root-XXXXXX";
4691 : 16 : _cleanup_(loop_device_unrefp) LoopDevice *loop = NULL;
4692 : 16 : _cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL;
4693 : 16 : _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
4694 : 16 : pid_t pid = 0;
4695 : :
4696 : 16 : log_parse_environment();
4697 : 16 : log_open();
4698 : :
4699 : 16 : r = parse_argv(argc, argv);
4700 [ + - ]: 16 : if (r <= 0)
4701 : 16 : goto finish;
4702 : :
4703 : 0 : r = must_be_root();
4704 [ # # ]: 0 : if (r < 0)
4705 : 0 : goto finish;
4706 : :
4707 : 0 : r = initialize_rlimits();
4708 [ # # ]: 0 : if (r < 0)
4709 : 0 : goto finish;
4710 : :
4711 : 0 : r = load_oci_bundle();
4712 [ # # ]: 0 : if (r < 0)
4713 : 0 : goto finish;
4714 : :
4715 : 0 : r = determine_names();
4716 [ # # ]: 0 : if (r < 0)
4717 : 0 : goto finish;
4718 : :
4719 : 0 : r = load_settings();
4720 [ # # ]: 0 : if (r < 0)
4721 : 0 : goto finish;
4722 : :
4723 : 0 : r = cg_unified_flush();
4724 [ # # ]: 0 : if (r < 0) {
4725 [ # # ]: 0 : log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
4726 : 0 : goto finish;
4727 : : }
4728 : :
4729 : 0 : r = verify_arguments();
4730 [ # # ]: 0 : if (r < 0)
4731 : 0 : goto finish;
4732 : :
4733 : 0 : r = detect_unified_cgroup_hierarchy_from_environment();
4734 [ # # ]: 0 : if (r < 0)
4735 : 0 : goto finish;
4736 : :
4737 : : /* Ignore SIGPIPE here, because we use splice() on the ptyfwd stuff and that will generate SIGPIPE if
4738 : : * the result is closed. Note that the container payload child will reset signal mask+handler anyway,
4739 : : * so just turning this off here means we only turn it off in nspawn itself, not any children. */
4740 : 0 : (void) ignore_signals(SIGPIPE, -1);
4741 : :
4742 : 0 : n_fd_passed = sd_listen_fds(false);
4743 [ # # ]: 0 : if (n_fd_passed > 0) {
4744 : 0 : r = fdset_new_listen_fds(&fds, false);
4745 [ # # ]: 0 : if (r < 0) {
4746 [ # # ]: 0 : log_error_errno(r, "Failed to collect file descriptors: %m");
4747 : 0 : goto finish;
4748 : : }
4749 : : }
4750 : :
4751 : : /* The "default" umask. This is appropriate for most file and directory
4752 : : * operations performed by nspawn, and is the umask that will be used for
4753 : : * the child. Functions like copy_devnodes() change the umask temporarily. */
4754 : 0 : umask(0022);
4755 : :
4756 [ # # ]: 0 : if (arg_directory) {
4757 [ # # ]: 0 : assert(!arg_image);
4758 : :
4759 : : /* Safety precaution: let's not allow running images from the live host OS image, as long as
4760 : : * /var from the host will propagate into container dynamically (because bad things happen if
4761 : : * two systems write to the same /var). Let's allow it for the special cases where /var is
4762 : : * either copied (i.e. --ephemeral) or replaced (i.e. --volatile=yes|state). */
4763 [ # # # # : 0 : if (path_equal(arg_directory, "/") && !(arg_ephemeral || IN_SET(arg_volatile_mode, VOLATILE_YES, VOLATILE_STATE))) {
# # # # ]
4764 [ # # ]: 0 : log_error("Spawning container on root directory is not supported. Consider using --ephemeral, --volatile=yes or --volatile=state.");
4765 : 0 : r = -EINVAL;
4766 : 0 : goto finish;
4767 : : }
4768 : :
4769 [ # # ]: 0 : if (arg_ephemeral) {
4770 [ # # ]: 0 : _cleanup_free_ char *np = NULL;
4771 : :
4772 : 0 : r = chase_symlinks_and_update(&arg_directory, 0);
4773 [ # # ]: 0 : if (r < 0)
4774 : 0 : goto finish;
4775 : :
4776 : : /* If the specified path is a mount point we generate the new snapshot immediately
4777 : : * inside it under a random name. However, if the specified path is not a mount point we
4778 : : * create the new snapshot in the parent directory, just next to it. */
4779 : 0 : r = path_is_mount_point(arg_directory, NULL, 0);
4780 [ # # ]: 0 : if (r < 0) {
4781 [ # # ]: 0 : log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
4782 : 0 : goto finish;
4783 : : }
4784 [ # # ]: 0 : if (r > 0)
4785 : 0 : r = tempfn_random_child(arg_directory, "machine.", &np);
4786 : : else
4787 : 0 : r = tempfn_random(arg_directory, "machine.", &np);
4788 [ # # ]: 0 : if (r < 0) {
4789 [ # # ]: 0 : log_error_errno(r, "Failed to generate name for directory snapshot: %m");
4790 : 0 : goto finish;
4791 : : }
4792 : :
4793 : : /* We take an exclusive lock on this image, since it's our private, ephemeral copy
4794 : : * only owned by us and no one else. */
4795 : 0 : r = image_path_lock(np, LOCK_EX|LOCK_NB, &tree_global_lock, &tree_local_lock);
4796 [ # # ]: 0 : if (r < 0) {
4797 [ # # ]: 0 : log_error_errno(r, "Failed to lock %s: %m", np);
4798 : 0 : goto finish;
4799 : : }
4800 : :
4801 : : {
4802 [ # # ]: 0 : BLOCK_SIGNALS(SIGINT);
4803 : 0 : r = btrfs_subvol_snapshot(arg_directory, np,
4804 [ # # ]: 0 : (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
4805 : : BTRFS_SNAPSHOT_FALLBACK_COPY |
4806 : : BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
4807 : : BTRFS_SNAPSHOT_RECURSIVE |
4808 : : BTRFS_SNAPSHOT_QUOTA |
4809 : : BTRFS_SNAPSHOT_SIGINT);
4810 : : }
4811 [ # # ]: 0 : if (r == -EINTR) {
4812 [ # # ]: 0 : log_error_errno(r, "Interrupted while copying file system tree to %s, removed again.", np);
4813 : 0 : goto finish;
4814 : : }
4815 [ # # ]: 0 : if (r < 0) {
4816 [ # # ]: 0 : log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
4817 : 0 : goto finish;
4818 : : }
4819 : :
4820 : 0 : free_and_replace(arg_directory, np);
4821 : 0 : remove_directory = true;
4822 : : } else {
4823 [ # # ]: 0 : r = chase_symlinks_and_update(&arg_directory, arg_template ? CHASE_NONEXISTENT : 0);
4824 [ # # ]: 0 : if (r < 0)
4825 : 0 : goto finish;
4826 : :
4827 [ # # ]: 0 : r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4828 [ # # ]: 0 : if (r == -EBUSY) {
4829 [ # # ]: 0 : log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
4830 : 0 : goto finish;
4831 : : }
4832 [ # # ]: 0 : if (r < 0) {
4833 [ # # ]: 0 : log_error_errno(r, "Failed to lock %s: %m", arg_directory);
4834 : 0 : goto finish;
4835 : : }
4836 : :
4837 [ # # ]: 0 : if (arg_template) {
4838 : 0 : r = chase_symlinks_and_update(&arg_template, 0);
4839 [ # # ]: 0 : if (r < 0)
4840 : 0 : goto finish;
4841 : :
4842 : : {
4843 [ # # ]: 0 : BLOCK_SIGNALS(SIGINT);
4844 : 0 : r = btrfs_subvol_snapshot(arg_template, arg_directory,
4845 [ # # ]: 0 : (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
4846 : : BTRFS_SNAPSHOT_FALLBACK_COPY |
4847 : : BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
4848 : : BTRFS_SNAPSHOT_FALLBACK_IMMUTABLE |
4849 : : BTRFS_SNAPSHOT_RECURSIVE |
4850 : : BTRFS_SNAPSHOT_QUOTA |
4851 : : BTRFS_SNAPSHOT_SIGINT);
4852 : : }
4853 [ # # ]: 0 : if (r == -EEXIST)
4854 [ # # # # ]: 0 : log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
4855 : : "Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
4856 [ # # ]: 0 : else if (r == -EINTR) {
4857 [ # # ]: 0 : log_error_errno(r, "Interrupted while copying file system tree to %s, removed again.", arg_directory);
4858 : 0 : goto finish;
4859 [ # # ]: 0 : } else if (r < 0) {
4860 [ # # ]: 0 : log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
4861 : 0 : goto finish;
4862 : : } else
4863 [ # # # # ]: 0 : log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
4864 : : "Populated %s from template %s.", arg_directory, arg_template);
4865 : : }
4866 : : }
4867 : :
4868 [ # # ]: 0 : if (arg_start_mode == START_BOOT) {
4869 : : const char *p;
4870 : :
4871 [ # # ]: 0 : if (arg_pivot_root_new)
4872 [ # # # # : 0 : p = prefix_roota(arg_directory, arg_pivot_root_new);
# # # # #
# # # # #
# # ]
4873 : : else
4874 : 0 : p = arg_directory;
4875 : :
4876 [ # # ]: 0 : if (path_is_os_tree(p) <= 0) {
4877 [ # # ]: 0 : log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", p);
4878 : 0 : r = -EINVAL;
4879 : 0 : goto finish;
4880 : : }
4881 : : } else {
4882 : : const char *p, *q;
4883 : :
4884 [ # # ]: 0 : if (arg_pivot_root_new)
4885 [ # # # # : 0 : p = prefix_roota(arg_directory, arg_pivot_root_new);
# # # # #
# # # # #
# # ]
4886 : : else
4887 : 0 : p = arg_directory;
4888 : :
4889 [ # # # # : 0 : q = strjoina(p, "/usr/");
# # # # #
# # # ]
4890 : :
4891 [ # # ]: 0 : if (laccess(q, F_OK) < 0) {
4892 [ # # ]: 0 : log_error("Directory %s doesn't look like it has an OS tree. Refusing.", p);
4893 : 0 : r = -EINVAL;
4894 : 0 : goto finish;
4895 : : }
4896 : : }
4897 : :
4898 : : } else {
4899 [ # # ]: 0 : assert(arg_image);
4900 [ # # ]: 0 : assert(!arg_template);
4901 : :
4902 : 0 : r = chase_symlinks_and_update(&arg_image, 0);
4903 [ # # ]: 0 : if (r < 0)
4904 : 0 : goto finish;
4905 : :
4906 [ # # ]: 0 : if (arg_ephemeral) {
4907 [ # # ]: 0 : _cleanup_free_ char *np = NULL;
4908 : :
4909 : 0 : r = tempfn_random(arg_image, "machine.", &np);
4910 [ # # ]: 0 : if (r < 0) {
4911 [ # # ]: 0 : log_error_errno(r, "Failed to generate name for image snapshot: %m");
4912 : 0 : goto finish;
4913 : : }
4914 : :
4915 : : /* Always take an exclusive lock on our own ephemeral copy. */
4916 : 0 : r = image_path_lock(np, LOCK_EX|LOCK_NB, &tree_global_lock, &tree_local_lock);
4917 [ # # ]: 0 : if (r < 0) {
4918 [ # # ]: 0 : r = log_error_errno(r, "Failed to create image lock: %m");
4919 : 0 : goto finish;
4920 : : }
4921 : :
4922 : : {
4923 [ # # ]: 0 : BLOCK_SIGNALS(SIGINT);
4924 [ # # ]: 0 : r = copy_file(arg_image, np, O_EXCL, arg_read_only ? 0400 : 0600, FS_NOCOW_FL, FS_NOCOW_FL, COPY_REFLINK|COPY_CRTIME|COPY_SIGINT);
4925 : : }
4926 [ # # ]: 0 : if (r == -EINTR) {
4927 [ # # ]: 0 : log_error_errno(r, "Interrupted while copying image file to %s, removed again.", np);
4928 : 0 : goto finish;
4929 : : }
4930 [ # # ]: 0 : if (r < 0) {
4931 [ # # ]: 0 : r = log_error_errno(r, "Failed to copy image file: %m");
4932 : 0 : goto finish;
4933 : : }
4934 : :
4935 : 0 : free_and_replace(arg_image, np);
4936 : 0 : remove_image = true;
4937 : : } else {
4938 [ # # ]: 0 : r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4939 [ # # ]: 0 : if (r == -EBUSY) {
4940 [ # # ]: 0 : r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
4941 : 0 : goto finish;
4942 : : }
4943 [ # # ]: 0 : if (r < 0) {
4944 [ # # ]: 0 : r = log_error_errno(r, "Failed to create image lock: %m");
4945 : 0 : goto finish;
4946 : : }
4947 : :
4948 [ # # ]: 0 : if (!arg_root_hash) {
4949 : 0 : r = root_hash_load(arg_image, &arg_root_hash, &arg_root_hash_size);
4950 [ # # ]: 0 : if (r < 0) {
4951 [ # # ]: 0 : log_error_errno(r, "Failed to load root hash file for %s: %m", arg_image);
4952 : 0 : goto finish;
4953 : : }
4954 : : }
4955 : : }
4956 : :
4957 [ # # ]: 0 : if (!mkdtemp(tmprootdir)) {
4958 [ # # ]: 0 : r = log_error_errno(errno, "Failed to create temporary directory: %m");
4959 : 0 : goto finish;
4960 : : }
4961 : :
4962 : 0 : remove_tmprootdir = true;
4963 : :
4964 : 0 : arg_directory = strdup(tmprootdir);
4965 [ # # ]: 0 : if (!arg_directory) {
4966 : 0 : r = log_oom();
4967 : 0 : goto finish;
4968 : : }
4969 : :
4970 [ # # ]: 0 : r = loop_device_make_by_path(arg_image, arg_read_only ? O_RDONLY : O_RDWR, &loop);
4971 [ # # ]: 0 : if (r < 0) {
4972 [ # # ]: 0 : log_error_errno(r, "Failed to set up loopback block device: %m");
4973 : 0 : goto finish;
4974 : : }
4975 : :
4976 : 0 : r = dissect_image_and_warn(
4977 : 0 : loop->fd,
4978 : : arg_image,
4979 : : arg_root_hash, arg_root_hash_size,
4980 : : DISSECT_IMAGE_REQUIRE_ROOT,
4981 : : &dissected_image);
4982 [ # # ]: 0 : if (r == -ENOPKG) {
4983 : : /* dissect_image_and_warn() already printed a brief error message. Expand on that with more details. */
4984 [ # # ]: 0 : log_notice("Note that the disk image needs to\n"
4985 : : " a) either contain only a single MBR partition of type 0x83 that is marked bootable\n"
4986 : : " b) or contain a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4\n"
4987 : : " c) or follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n"
4988 : : " d) or contain a file system without a partition table\n"
4989 : : "in order to be bootable with systemd-nspawn.");
4990 : 0 : goto finish;
4991 : : }
4992 [ # # ]: 0 : if (r < 0)
4993 : 0 : goto finish;
4994 : :
4995 [ # # # # ]: 0 : if (!arg_root_hash && dissected_image->can_verity)
4996 [ # # ]: 0 : log_notice("Note: image %s contains verity information, but no root hash specified! Proceeding without integrity checking.", arg_image);
4997 : :
4998 : 0 : r = dissected_image_decrypt_interactively(dissected_image, NULL, arg_root_hash, arg_root_hash_size, 0, &decrypted_image);
4999 [ # # ]: 0 : if (r < 0)
5000 : 0 : goto finish;
5001 : :
5002 : : /* Now that we mounted the image, let's try to remove it again, if it is ephemeral */
5003 [ # # # # ]: 0 : if (remove_image && unlink(arg_image) >= 0)
5004 : 0 : remove_image = false;
5005 : : }
5006 : :
5007 : 0 : r = custom_mount_prepare_all(arg_directory, arg_custom_mounts, arg_n_custom_mounts);
5008 [ # # ]: 0 : if (r < 0)
5009 : 0 : goto finish;
5010 : :
5011 [ # # ]: 0 : if (arg_console_mode < 0)
5012 : 0 : arg_console_mode =
5013 : 0 : isatty(STDIN_FILENO) > 0 &&
5014 [ # # # # ]: 0 : isatty(STDOUT_FILENO) > 0 ? CONSOLE_INTERACTIVE : CONSOLE_READ_ONLY;
5015 : :
5016 [ # # ]: 0 : if (arg_console_mode == CONSOLE_PIPE) /* if we pass STDERR on to the container, don't add our own logs into it too */
5017 : 0 : arg_quiet = true;
5018 : :
5019 [ # # ]: 0 : if (!arg_quiet)
5020 [ # # # # ]: 0 : log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
5021 : : arg_machine, arg_image ?: arg_directory);
5022 : :
5023 [ # # ]: 0 : assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
5024 : :
5025 [ # # ]: 0 : if (prctl(PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0) < 0) {
5026 [ # # ]: 0 : r = log_error_errno(errno, "Failed to become subreaper: %m");
5027 : 0 : goto finish;
5028 : : }
5029 : :
5030 : : for (;;) {
5031 : 0 : r = run_container(dissected_image,
5032 : : secondary,
5033 : : fds,
5034 : : veth_name, &veth_created,
5035 : : &exposed, &master,
5036 : : &pid, &ret);
5037 [ # # ]: 0 : if (r <= 0)
5038 : 0 : break;
5039 : : }
5040 : :
5041 : 16 : finish:
5042 [ + + ]: 28 : (void) sd_notify(false,
5043 [ - + ]: 12 : r == 0 && ret == EXIT_FORCE_RESTART ? "STOPPING=1\nSTATUS=Restarting..." :
5044 : : "STOPPING=1\nSTATUS=Terminating...");
5045 : :
5046 [ - + ]: 16 : if (pid > 0)
5047 : 0 : (void) kill(pid, SIGKILL);
5048 : :
5049 : : /* Try to flush whatever is still queued in the pty */
5050 [ - + ]: 16 : if (master >= 0) {
5051 : 0 : (void) copy_bytes(master, STDOUT_FILENO, (uint64_t) -1, 0);
5052 : 0 : master = safe_close(master);
5053 : : }
5054 : :
5055 [ - + ]: 16 : if (pid > 0)
5056 : 0 : (void) wait_for_terminate(pid, NULL);
5057 : :
5058 : 16 : pager_close();
5059 : :
5060 [ - + # # ]: 16 : if (remove_directory && arg_directory) {
5061 : : int k;
5062 : :
5063 : 0 : k = rm_rf(arg_directory, REMOVE_ROOT|REMOVE_PHYSICAL|REMOVE_SUBVOLUME);
5064 [ # # ]: 0 : if (k < 0)
5065 [ # # ]: 0 : log_warning_errno(k, "Cannot remove '%s', ignoring: %m", arg_directory);
5066 : : }
5067 : :
5068 [ - + # # ]: 16 : if (remove_image && arg_image) {
5069 [ # # ]: 0 : if (unlink(arg_image) < 0)
5070 [ # # ]: 0 : log_warning_errno(errno, "Can't remove image file '%s', ignoring: %m", arg_image);
5071 : : }
5072 : :
5073 [ - + ]: 16 : if (remove_tmprootdir) {
5074 [ # # ]: 0 : if (rmdir(tmprootdir) < 0)
5075 [ # # ]: 0 : log_debug_errno(errno, "Can't remove temporary root directory '%s', ignoring: %m", tmprootdir);
5076 : : }
5077 : :
5078 [ - + ]: 16 : if (arg_machine) {
5079 : : const char *p;
5080 : :
5081 [ # # # # : 0 : p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
# # # # #
# # # ]
5082 : 0 : (void) rm_rf(p, REMOVE_ROOT);
5083 : : }
5084 : :
5085 : 16 : expose_port_flush(arg_expose_ports, &exposed);
5086 : :
5087 [ - + ]: 16 : if (veth_created)
5088 : 0 : (void) remove_veth_links(veth_name, arg_network_veth_extra);
5089 : 16 : (void) remove_bridge(arg_network_zone);
5090 : :
5091 : 16 : custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
5092 : 16 : expose_port_free_all(arg_expose_ports);
5093 : 16 : rlimit_free_all(arg_rlimit);
5094 : 16 : device_node_array_free(arg_extra_nodes, arg_n_extra_nodes);
5095 : :
5096 [ + + ]: 16 : if (r < 0)
5097 : 4 : return r;
5098 : :
5099 : 12 : return ret;
5100 : : }
5101 : :
5102 [ + + ]: 16 : DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE(run); /* Presumably expands to the program's main() wrapping run(); run() returns r when r < 0 and ret otherwise. NOTE(review): confirm in the macro's definition how positive return values are turned into the process exit status. */
|