Line data Source code
1 : /* SPDX-License-Identifier: LGPL-2.1+ */
2 :
3 : #include <fcntl.h>
4 : #include <linux/magic.h>
5 : #if HAVE_ACL
6 : #include <sys/acl.h>
7 : #endif
8 : #include <sys/stat.h>
9 : #include <sys/statvfs.h>
10 : #include <sys/vfs.h>
11 : #include <unistd.h>
12 :
13 : #include "acl-util.h"
14 : #include "dirent-util.h"
15 : #include "fd-util.h"
16 : #include "fs-util.h"
17 : #include "missing.h"
18 : #include "nspawn-def.h"
19 : #include "nspawn-patch-uid.h"
20 : #include "stat-util.h"
21 : #include "stdio-util.h"
22 : #include "string-util.h"
23 : #include "strv.h"
24 : #include "user-util.h"
25 :
26 : #if HAVE_ACL
27 :
28 0 : static int get_acl(int fd, const char *name, acl_type_t type, acl_t *ret) {
29 : char procfs_path[STRLEN("/proc/self/fd/") + DECIMAL_STR_MAX(int) + 1];
30 : acl_t acl;
31 :
32 0 : assert(fd >= 0);
33 0 : assert(ret);
34 :
35 0 : if (name) {
36 0 : _cleanup_close_ int child_fd = -1;
37 :
38 0 : child_fd = openat(fd, name, O_PATH|O_CLOEXEC|O_NOFOLLOW);
39 0 : if (child_fd < 0)
40 0 : return -errno;
41 :
42 0 : xsprintf(procfs_path, "/proc/self/fd/%i", child_fd);
43 0 : acl = acl_get_file(procfs_path, type);
44 0 : } else if (type == ACL_TYPE_ACCESS)
45 0 : acl = acl_get_fd(fd);
46 : else {
47 0 : xsprintf(procfs_path, "/proc/self/fd/%i", fd);
48 0 : acl = acl_get_file(procfs_path, type);
49 : }
50 0 : if (!acl)
51 0 : return -errno;
52 :
53 0 : *ret = acl;
54 0 : return 0;
55 : }
56 :
57 0 : static int set_acl(int fd, const char *name, acl_type_t type, acl_t acl) {
58 : char procfs_path[STRLEN("/proc/self/fd/") + DECIMAL_STR_MAX(int) + 1];
59 : int r;
60 :
61 0 : assert(fd >= 0);
62 0 : assert(acl);
63 :
64 0 : if (name) {
65 0 : _cleanup_close_ int child_fd = -1;
66 :
67 0 : child_fd = openat(fd, name, O_PATH|O_CLOEXEC|O_NOFOLLOW);
68 0 : if (child_fd < 0)
69 0 : return -errno;
70 :
71 0 : xsprintf(procfs_path, "/proc/self/fd/%i", child_fd);
72 0 : r = acl_set_file(procfs_path, type, acl);
73 0 : } else if (type == ACL_TYPE_ACCESS)
74 0 : r = acl_set_fd(fd, acl);
75 : else {
76 0 : xsprintf(procfs_path, "/proc/self/fd/%i", fd);
77 0 : r = acl_set_file(procfs_path, type, acl);
78 : }
79 0 : if (r < 0)
80 0 : return -errno;
81 :
82 0 : return 0;
83 : }
84 :
85 0 : static int shift_acl(acl_t acl, uid_t shift, acl_t *ret) {
86 0 : _cleanup_(acl_freep) acl_t copy = NULL;
87 : acl_entry_t i;
88 : int r;
89 :
90 0 : assert(acl);
91 0 : assert(ret);
92 :
93 0 : r = acl_get_entry(acl, ACL_FIRST_ENTRY, &i);
94 0 : if (r < 0)
95 0 : return -errno;
96 0 : while (r > 0) {
97 : uid_t *old_uid, new_uid;
98 0 : bool modify = false;
99 : acl_tag_t tag;
100 :
101 0 : if (acl_get_tag_type(i, &tag) < 0)
102 0 : return -errno;
103 :
104 0 : if (IN_SET(tag, ACL_USER, ACL_GROUP)) {
105 :
106 : /* We don't distinguish here between uid_t and gid_t, let's make sure the compiler checks that
107 : * this is actually OK */
108 : assert_cc(sizeof(uid_t) == sizeof(gid_t));
109 :
110 0 : old_uid = acl_get_qualifier(i);
111 0 : if (!old_uid)
112 0 : return -errno;
113 :
114 0 : new_uid = shift | (*old_uid & UINT32_C(0xFFFF));
115 0 : if (!uid_is_valid(new_uid))
116 0 : return -EINVAL;
117 :
118 0 : modify = new_uid != *old_uid;
119 0 : if (modify && !copy) {
120 : int n;
121 :
122 : /* There's no copy of the ACL yet? if so, let's create one, and start the loop from the
123 : * beginning, so that we copy all entries, starting from the first, this time. */
124 :
125 0 : n = acl_entries(acl);
126 0 : if (n < 0)
127 0 : return -errno;
128 :
129 0 : copy = acl_init(n);
130 0 : if (!copy)
131 0 : return -errno;
132 :
133 : /* Seek back to the beginning */
134 0 : r = acl_get_entry(acl, ACL_FIRST_ENTRY, &i);
135 0 : if (r < 0)
136 0 : return -errno;
137 0 : continue;
138 : }
139 : }
140 :
141 0 : if (copy) {
142 : acl_entry_t new_entry;
143 :
144 0 : if (acl_create_entry(©, &new_entry) < 0)
145 0 : return -errno;
146 :
147 0 : if (acl_copy_entry(new_entry, i) < 0)
148 0 : return -errno;
149 :
150 0 : if (modify)
151 0 : if (acl_set_qualifier(new_entry, &new_uid) < 0)
152 0 : return -errno;
153 : }
154 :
155 0 : r = acl_get_entry(acl, ACL_NEXT_ENTRY, &i);
156 0 : if (r < 0)
157 0 : return -errno;
158 : }
159 :
160 0 : *ret = TAKE_PTR(copy);
161 :
162 0 : return !!*ret;
163 : }
164 :
165 0 : static int patch_acls(int fd, const char *name, const struct stat *st, uid_t shift) {
166 0 : _cleanup_(acl_freep) acl_t acl = NULL, shifted = NULL;
167 0 : bool changed = false;
168 : int r;
169 :
170 0 : assert(fd >= 0);
171 0 : assert(st);
172 :
173 : /* ACLs are not supported on symlinks, there's no point in trying */
174 0 : if (S_ISLNK(st->st_mode))
175 0 : return 0;
176 :
177 0 : r = get_acl(fd, name, ACL_TYPE_ACCESS, &acl);
178 0 : if (r == -EOPNOTSUPP)
179 0 : return 0;
180 0 : if (r < 0)
181 0 : return r;
182 :
183 0 : r = shift_acl(acl, shift, &shifted);
184 0 : if (r < 0)
185 0 : return r;
186 0 : if (r > 0) {
187 0 : r = set_acl(fd, name, ACL_TYPE_ACCESS, shifted);
188 0 : if (r < 0)
189 0 : return r;
190 :
191 0 : changed = true;
192 : }
193 :
194 0 : if (S_ISDIR(st->st_mode)) {
195 0 : acl_free(acl);
196 0 : acl_free(shifted);
197 :
198 0 : acl = shifted = NULL;
199 :
200 0 : r = get_acl(fd, name, ACL_TYPE_DEFAULT, &acl);
201 0 : if (r < 0)
202 0 : return r;
203 :
204 0 : r = shift_acl(acl, shift, &shifted);
205 0 : if (r < 0)
206 0 : return r;
207 0 : if (r > 0) {
208 0 : r = set_acl(fd, name, ACL_TYPE_DEFAULT, shifted);
209 0 : if (r < 0)
210 0 : return r;
211 :
212 0 : changed = true;
213 : }
214 : }
215 :
216 0 : return changed;
217 : }
218 :
219 : #else
220 :
/* Variant used when built without ACL support: there is nothing to patch, hence this always reports
 * "nothing changed" (0) and ignores all of its arguments. */
static int patch_acls(int fd, const char *name, const struct stat *st, uid_t shift) {
        (void) fd;
        (void) name;
        (void) st;
        (void) shift;

        return 0;
}
224 :
225 : #endif
226 :
227 0 : static int patch_fd(int fd, const char *name, const struct stat *st, uid_t shift) {
228 : uid_t new_uid;
229 : gid_t new_gid;
230 0 : bool changed = false;
231 : int r;
232 :
233 0 : assert(fd >= 0);
234 0 : assert(st);
235 :
236 0 : new_uid = shift | (st->st_uid & UINT32_C(0xFFFF));
237 0 : new_gid = (gid_t) shift | (st->st_gid & UINT32_C(0xFFFF));
238 :
239 0 : if (!uid_is_valid(new_uid) || !gid_is_valid(new_gid))
240 0 : return -EINVAL;
241 :
242 0 : if (st->st_uid != new_uid || st->st_gid != new_gid) {
243 0 : if (name)
244 0 : r = fchownat(fd, name, new_uid, new_gid, AT_SYMLINK_NOFOLLOW);
245 : else
246 0 : r = fchown(fd, new_uid, new_gid);
247 0 : if (r < 0)
248 0 : return -errno;
249 :
250 : /* The Linux kernel alters the mode in some cases of chown(). Let's undo this. */
251 0 : if (name) {
252 0 : if (!S_ISLNK(st->st_mode))
253 0 : r = fchmodat(fd, name, st->st_mode, 0);
254 : else /* AT_SYMLINK_NOFOLLOW is not available for fchmodat() */
255 0 : r = 0;
256 : } else
257 0 : r = fchmod(fd, st->st_mode);
258 0 : if (r < 0)
259 0 : return -errno;
260 :
261 0 : changed = true;
262 : }
263 :
264 0 : r = patch_acls(fd, name, st, shift);
265 0 : if (r < 0)
266 0 : return r;
267 :
268 0 : return r > 0 || changed;
269 : }
270 :
271 : /*
272 : * Check if the filesystem is fully compatible with user namespaces or
273 : * UID/GID patching. Some filesystems in this list can be fully mounted inside
274 : * user namespaces, however their inodes may relate to host resources or only
275 : * valid in the global user namespace, therefore no patching should be applied.
276 : */
277 0 : static int is_fs_fully_userns_compatible(const struct statfs *sfs) {
278 :
279 0 : assert(sfs);
280 :
281 0 : return F_TYPE_EQUAL(sfs->f_type, BINFMTFS_MAGIC) ||
282 0 : F_TYPE_EQUAL(sfs->f_type, CGROUP_SUPER_MAGIC) ||
283 0 : F_TYPE_EQUAL(sfs->f_type, CGROUP2_SUPER_MAGIC) ||
284 0 : F_TYPE_EQUAL(sfs->f_type, DEBUGFS_MAGIC) ||
285 0 : F_TYPE_EQUAL(sfs->f_type, DEVPTS_SUPER_MAGIC) ||
286 0 : F_TYPE_EQUAL(sfs->f_type, EFIVARFS_MAGIC) ||
287 0 : F_TYPE_EQUAL(sfs->f_type, HUGETLBFS_MAGIC) ||
288 0 : F_TYPE_EQUAL(sfs->f_type, MQUEUE_MAGIC) ||
289 0 : F_TYPE_EQUAL(sfs->f_type, PROC_SUPER_MAGIC) ||
290 0 : F_TYPE_EQUAL(sfs->f_type, PSTOREFS_MAGIC) ||
291 0 : F_TYPE_EQUAL(sfs->f_type, SELINUX_MAGIC) ||
292 0 : F_TYPE_EQUAL(sfs->f_type, SMACK_MAGIC) ||
293 0 : F_TYPE_EQUAL(sfs->f_type, SECURITYFS_MAGIC) ||
294 0 : F_TYPE_EQUAL(sfs->f_type, BPF_FS_MAGIC) ||
295 0 : F_TYPE_EQUAL(sfs->f_type, TRACEFS_MAGIC) ||
296 0 : F_TYPE_EQUAL(sfs->f_type, SYSFS_MAGIC);
297 : }
298 :
299 0 : static int recurse_fd(int fd, bool donate_fd, const struct stat *st, uid_t shift, bool is_toplevel) {
300 0 : _cleanup_closedir_ DIR *d = NULL;
301 0 : bool changed = false;
302 : struct statfs sfs;
303 : int r;
304 :
305 0 : assert(fd >= 0);
306 :
307 0 : if (fstatfs(fd, &sfs) < 0)
308 0 : return -errno;
309 :
310 : /* We generally want to permit crossing of mount boundaries when patching the UIDs/GIDs. However, we probably
311 : * shouldn't do this for /proc and /sys if that is already mounted into place. Hence, let's stop the recursion
312 : * when we hit procfs, sysfs or some other special file systems. */
313 :
314 0 : r = is_fs_fully_userns_compatible(&sfs);
315 0 : if (r < 0)
316 0 : goto finish;
317 0 : if (r > 0) {
318 0 : r = 0; /* don't recurse */
319 0 : goto finish;
320 : }
321 :
322 : /* Also, if we hit a read-only file system, then don't bother, skip the whole subtree */
323 0 : if ((sfs.f_flags & ST_RDONLY) ||
324 0 : access_fd(fd, W_OK) == -EROFS)
325 0 : goto read_only;
326 :
327 0 : if (S_ISDIR(st->st_mode)) {
328 : struct dirent *de;
329 :
330 0 : if (!donate_fd) {
331 : int copy;
332 :
333 0 : copy = fcntl(fd, F_DUPFD_CLOEXEC, 3);
334 0 : if (copy < 0) {
335 0 : r = -errno;
336 0 : goto finish;
337 : }
338 :
339 0 : fd = copy;
340 0 : donate_fd = true;
341 : }
342 :
343 0 : d = fdopendir(fd);
344 0 : if (!d) {
345 0 : r = -errno;
346 0 : goto finish;
347 : }
348 0 : fd = -1;
349 :
350 0 : FOREACH_DIRENT_ALL(de, d, r = -errno; goto finish) {
351 : struct stat fst;
352 :
353 0 : if (dot_or_dot_dot(de->d_name))
354 0 : continue;
355 :
356 0 : if (fstatat(dirfd(d), de->d_name, &fst, AT_SYMLINK_NOFOLLOW) < 0) {
357 0 : r = -errno;
358 0 : goto finish;
359 : }
360 :
361 0 : if (S_ISDIR(fst.st_mode)) {
362 : int subdir_fd;
363 :
364 0 : subdir_fd = openat(dirfd(d), de->d_name, O_RDONLY|O_NONBLOCK|O_DIRECTORY|O_CLOEXEC|O_NOFOLLOW|O_NOATIME);
365 0 : if (subdir_fd < 0) {
366 0 : r = -errno;
367 0 : goto finish;
368 :
369 : }
370 :
371 0 : r = recurse_fd(subdir_fd, true, &fst, shift, false);
372 0 : if (r < 0)
373 0 : goto finish;
374 0 : if (r > 0)
375 0 : changed = true;
376 :
377 : } else {
378 0 : r = patch_fd(dirfd(d), de->d_name, &fst, shift);
379 0 : if (r < 0)
380 0 : goto finish;
381 0 : if (r > 0)
382 0 : changed = true;
383 : }
384 : }
385 : }
386 :
387 : /* After we descended, also patch the directory itself. It's key to do this in this order so that the top-level
388 : * directory is patched as very last object in the tree, so that we can use it as quick indicator whether the
389 : * tree is properly chown()ed already. */
390 0 : r = patch_fd(d ? dirfd(d) : fd, NULL, st, shift);
391 0 : if (r == -EROFS)
392 0 : goto read_only;
393 0 : if (r > 0)
394 0 : changed = true;
395 :
396 0 : r = changed;
397 0 : goto finish;
398 :
399 0 : read_only:
400 0 : if (!is_toplevel) {
401 0 : _cleanup_free_ char *name = NULL;
402 :
403 : /* When we hit a ready-only subtree we simply skip it, but log about it. */
404 0 : (void) fd_get_path(fd, &name);
405 0 : log_debug("Skipping read-only file or directory %s.", strna(name));
406 0 : r = changed;
407 : }
408 :
409 0 : finish:
410 0 : if (donate_fd)
411 0 : safe_close(fd);
412 :
413 0 : return r;
414 : }
415 :
416 0 : static int fd_patch_uid_internal(int fd, bool donate_fd, uid_t shift, uid_t range) {
417 : struct stat st;
418 : int r;
419 :
420 0 : assert(fd >= 0);
421 :
422 : /* Recursively adjusts the UID/GIDs of all files of a directory tree. This is used to automatically fix up an
423 : * OS tree to the used user namespace UID range. Note that this automatic adjustment only works for UID ranges
424 : * following the concept that the upper 16bit of a UID identify the container, and the lower 16bit are the actual
425 : * UID within the container. */
426 :
427 0 : if ((shift & 0xFFFF) != 0) {
428 : /* We only support containers where the shift starts at a 2^16 boundary */
429 0 : r = -EOPNOTSUPP;
430 0 : goto finish;
431 : }
432 :
433 0 : if (shift == UID_BUSY_BASE) {
434 0 : r = -EINVAL;
435 0 : goto finish;
436 : }
437 :
438 0 : if (range != 0x10000) {
439 : /* We only support containers with 16bit UID ranges for the patching logic */
440 0 : r = -EOPNOTSUPP;
441 0 : goto finish;
442 : }
443 :
444 0 : if (fstat(fd, &st) < 0) {
445 0 : r = -errno;
446 0 : goto finish;
447 : }
448 :
449 0 : if ((uint32_t) st.st_uid >> 16 != (uint32_t) st.st_gid >> 16) {
450 : /* We only support containers where the uid/gid container ID match */
451 0 : r = -EBADE;
452 0 : goto finish;
453 : }
454 :
455 : /* Try to detect if the range is already right. Of course, this a pretty drastic optimization, as we assume
456 : * that if the top-level dir has the right upper 16bit assigned, then everything below will have too... */
457 0 : if (((uint32_t) (st.st_uid ^ shift) >> 16) == 0)
458 0 : return 0;
459 :
460 : /* Before we start recursively chowning, mark the top-level dir as "busy" by chowning it to the "busy"
461 : * range. Should we be interrupted in the middle of our work, we'll see it owned by this user and will start
462 : * chown()ing it again, unconditionally, as the busy UID is not a valid UID we'd everpick for ourselves. */
463 :
464 0 : if ((st.st_uid & UID_BUSY_MASK) != UID_BUSY_BASE) {
465 0 : if (fchown(fd,
466 0 : UID_BUSY_BASE | (st.st_uid & ~UID_BUSY_MASK),
467 0 : (gid_t) UID_BUSY_BASE | (st.st_gid & ~(gid_t) UID_BUSY_MASK)) < 0) {
468 0 : r = -errno;
469 0 : goto finish;
470 : }
471 : }
472 :
473 0 : return recurse_fd(fd, donate_fd, &st, shift, true);
474 :
475 0 : finish:
476 0 : if (donate_fd)
477 0 : safe_close(fd);
478 :
479 0 : return r;
480 : }
481 :
482 0 : int path_patch_uid(const char *path, uid_t shift, uid_t range) {
483 : int fd;
484 :
485 0 : fd = open(path, O_RDONLY|O_NONBLOCK|O_DIRECTORY|O_CLOEXEC|O_NOFOLLOW|O_NOATIME);
486 0 : if (fd < 0)
487 0 : return -errno;
488 :
489 0 : return fd_patch_uid_internal(fd, true, shift, range);
490 : }
|