Line data Source code
1 : /* SPDX-License-Identifier: LGPL-2.1+ */
2 :
3 : #include <errno.h>
4 : #include <fcntl.h>
5 : #include <sys/resource.h>
6 : #include <sys/socket.h>
7 : #include <sys/stat.h>
8 : #include <unistd.h>
9 :
10 : #include "alloc-util.h"
11 : #include "copy.h"
12 : #include "dirent-util.h"
13 : #include "fd-util.h"
14 : #include "fileio.h"
15 : #include "fs-util.h"
16 : #include "io-util.h"
17 : #include "macro.h"
18 : #include "memfd-util.h"
19 : #include "missing.h"
20 : #include "parse-util.h"
21 : #include "path-util.h"
22 : #include "process-util.h"
23 : #include "socket-util.h"
24 : #include "stdio-util.h"
25 : #include "util.h"
26 : #include "tmpfile-util.h"
27 :
28 : /* The maximum number of iterations in the loop to close descriptors in the fallback case
29 : * when /proc/self/fd/ is inaccessible. close_all_fds() refuses to run that loop if the highest possible fd exceeds this value. */
30 : #define MAX_FD_LOOP_LIMIT (1024*1024)
31 :
/* Closes 'fd' with a single close() call. Returns 0 on success, and also when the call was interrupted
 * by a signal; any other failure is returned as a negative errno-style value.
 *
 * EINTR is deliberately not retried: a retry loop is the wrong thing to do on Linux. See:
 *
 * http://lkml.indiana.edu/hypermail/linux/kernel/0509.1/0877.html
 * https://bugzilla.gnome.org/show_bug.cgi?id=682819
 * http://utcc.utoronto.ca/~cks/space/blog/unix/CloseEINTR
 * https://sites.google.com/site/michaelsafyan/software-engineering/checkforeintrwheninvokingclosethinkagain
 */
int close_nointr(int fd) {
        assert(fd >= 0);

        if (close(fd) < 0 && errno != EINTR)
                return -errno;

        return 0;
}
52 :
53 367664 : int safe_close(int fd) {
54 :
55 : /*
56 : * Like close_nointr() but cannot fail. Guarantees errno is
57 : * unchanged. Is a NOP with negative fds passed, and returns
58 : * -1, so that it can be used in this syntax:
59 : *
60 : * fd = safe_close(fd);
61 : */
62 :
63 367664 : if (fd >= 0) {
64 182885 : PROTECT_ERRNO;
65 :
66 : /* The kernel might return pretty much any error code
67 : * via close(), but the fd will be closed anyway. The
68 : * only condition we want to check for here is whether
69 : * the fd was invalid at all... */
70 :
71 182885 : assert_se(close_nointr(fd) != -EBADF);
72 : }
73 :
74 367664 : return -1;
75 : }
76 :
/* Closes both fds of a pair via safe_close() and resets both slots to -1. */
void safe_close_pair(int p[static 2]) {
        assert(p);

        if (p[0] != p[1]) {
                /* The usual case: two distinct fds, close each of them. */
                p[0] = safe_close(p[0]);
                p[1] = safe_close(p[1]);
                return;
        }

        /* Both directions share a single fd, hence close it only once. */
        p[0] = p[1] = safe_close(p[0]);
}
90 :
/* Invokes safe_close() on each of the first 'n_fd' entries of 'fds', in array order. 'fds' may be NULL
 * only if 'n_fd' is zero. The array itself is not modified. */
void close_many(const int fds[], size_t n_fd) {
        size_t idx = 0;

        assert(fds || n_fd <= 0);

        while (idx < n_fd)
                safe_close(fds[idx++]);
}
99 :
/* Counterpart of close_nointr() for stdio streams: calls fclose() once, returns 0 on success or when
 * interrupted (EINTR), and a negative errno-style value on any other failure. */
int fclose_nointr(FILE *f) {
        assert(f);

        if (fclose(f) != 0 && errno != EINTR)
                return -errno;

        return 0;
}
113 :
114 35792 : FILE* safe_fclose(FILE *f) {
115 :
116 : /* Same as safe_close(), but for fclose() */
117 :
118 35792 : if (f) {
119 14608 : PROTECT_ERRNO;
120 :
121 14608 : assert_se(fclose_nointr(f) != -EBADF);
122 : }
123 :
124 35792 : return NULL;
125 : }
126 :
127 0 : DIR* safe_closedir(DIR *d) {
128 :
129 0 : if (d) {
130 0 : PROTECT_ERRNO;
131 :
132 0 : assert_se(closedir(d) >= 0 || errno != EBADF);
133 : }
134 :
135 0 : return NULL;
136 : }
137 :
/* Turns O_NONBLOCK on or off for 'fd'. The flags are only written back if they actually change.
 * Returns 0 on success, a negative errno-style value if either fcntl() call fails. */
int fd_nonblock(int fd, bool nonblock) {
        int current, wanted;

        assert(fd >= 0);

        current = fcntl(fd, F_GETFL, 0);
        if (current < 0)
                return -errno;

        wanted = nonblock ? (current | O_NONBLOCK) : (current & ~O_NONBLOCK);

        /* Skip the second syscall if the flag already has the requested state. */
        if (wanted != current && fcntl(fd, F_SETFL, wanted) < 0)
                return -errno;

        return 0;
}
160 :
/* Turns FD_CLOEXEC on or off for 'fd'. The flags are only written back if they actually change.
 * Returns 0 on success, a negative errno-style value if either fcntl() call fails. */
int fd_cloexec(int fd, bool cloexec) {
        int current, wanted;

        assert(fd >= 0);

        current = fcntl(fd, F_GETFD, 0);
        if (current < 0)
                return -errno;

        wanted = cloexec ? (current | FD_CLOEXEC) : (current & ~FD_CLOEXEC);

        /* Skip the second syscall if the flag already has the requested state. */
        if (wanted != current && fcntl(fd, F_SETFD, wanted) < 0)
                return -errno;

        return 0;
}
183 :
184 2 : _pure_ static bool fd_in_set(int fd, const int fdset[], size_t n_fdset) {
185 : size_t i;
186 :
187 2 : assert(n_fdset == 0 || fdset);
188 :
189 3 : for (i = 0; i < n_fdset; i++)
190 2 : if (fdset[i] == fd)
191 1 : return true;
192 :
193 1 : return false;
194 : }
195 :
/* Determines the highest possible fd from RLIMIT_NOFILE (the larger of the soft and the hard limit).
 * The result is never below FD_SETSIZE-1 and never above INT_MAX. Returns a negative errno-style
 * value if getrlimit() fails. */
static int get_max_fd(void) {
        struct rlimit rl;
        rlim_t limit;

        if (getrlimit(RLIMIT_NOFILE, &rl) < 0)
                return -errno;

        limit = rl.rlim_cur > rl.rlim_max ? rl.rlim_cur : rl.rlim_max;

        /* Let's always cover at least 1024 fds */
        if (limit < FD_SETSIZE)
                return FD_SETSIZE-1;

        /* Saturate on overflow. After all fds are "int", hence can never be above INT_MAX */
        if (limit == RLIM_INFINITY || limit > INT_MAX)
                return INT_MAX;

        return (int) (limit - 1);
}
216 :
217 1 : int close_all_fds(const int except[], size_t n_except) {
218 1 : _cleanup_closedir_ DIR *d = NULL;
219 : struct dirent *de;
220 1 : int r = 0;
221 :
222 1 : assert(n_except == 0 || except);
223 :
224 1 : d = opendir("/proc/self/fd");
225 1 : if (!d) {
226 : int fd, max_fd;
227 :
228 : /* When /proc isn't available (for example in chroots) the fallback is brute forcing through
229 : * the fd table */
230 :
231 0 : max_fd = get_max_fd();
232 0 : if (max_fd < 0)
233 0 : return max_fd;
234 :
235 : /* Refuse to do the loop over more too many elements. It's better to fail immediately than to
236 : * spin the CPU for a long time. */
237 0 : if (max_fd > MAX_FD_LOOP_LIMIT)
238 0 : return log_debug_errno(SYNTHETIC_ERRNO(EPERM),
239 : "/proc/self/fd is inaccessible. Refusing to loop over %d potential fds.",
240 : max_fd);
241 :
242 0 : for (fd = 3; fd >= 0; fd = fd < max_fd ? fd + 1 : -1) {
243 : int q;
244 :
245 0 : if (fd_in_set(fd, except, n_except))
246 0 : continue;
247 :
248 0 : q = close_nointr(fd);
249 0 : if (q < 0 && q != -EBADF && r >= 0)
250 0 : r = q;
251 : }
252 :
253 0 : return r;
254 : }
255 :
256 9 : FOREACH_DIRENT(de, d, return -errno) {
257 6 : int fd = -1, q;
258 :
259 6 : if (safe_atoi(de->d_name, &fd) < 0)
260 : /* Let's better ignore this, just in case */
261 5 : continue;
262 :
263 6 : if (fd < 3)
264 3 : continue;
265 :
266 3 : if (fd == dirfd(d))
267 1 : continue;
268 :
269 2 : if (fd_in_set(fd, except, n_except))
270 1 : continue;
271 :
272 1 : q = close_nointr(fd);
273 1 : if (q < 0 && q != -EBADF && r >= 0) /* Valgrind has its own FD and doesn't want to have it closed */
274 0 : r = q;
275 : }
276 :
277 1 : return r;
278 : }
279 :
280 20 : int same_fd(int a, int b) {
281 : struct stat sta, stb;
282 : pid_t pid;
283 : int r, fa, fb;
284 :
285 20 : assert(a >= 0);
286 20 : assert(b >= 0);
287 :
288 : /* Compares two file descriptors. Note that semantics are
289 : * quite different depending on whether we have kcmp() or we
290 : * don't. If we have kcmp() this will only return true for
291 : * dup()ed file descriptors, but not otherwise. If we don't
292 : * have kcmp() this will also return true for two fds of the same
293 : * file, created by separate open() calls. Since we use this
294 : * call mostly for filtering out duplicates in the fd store
295 : * this difference hopefully doesn't matter too much. */
296 :
297 20 : if (a == b)
298 4 : return true;
299 :
300 : /* Try to use kcmp() if we have it. */
301 16 : pid = getpid_cached();
302 16 : r = kcmp(pid, pid, KCMP_FILE, a, b);
303 16 : if (r == 0)
304 6 : return true;
305 10 : if (r > 0)
306 10 : return false;
307 0 : if (!IN_SET(errno, ENOSYS, EACCES, EPERM))
308 0 : return -errno;
309 :
310 : /* We don't have kcmp(), use fstat() instead. */
311 0 : if (fstat(a, &sta) < 0)
312 0 : return -errno;
313 :
314 0 : if (fstat(b, &stb) < 0)
315 0 : return -errno;
316 :
317 0 : if ((sta.st_mode & S_IFMT) != (stb.st_mode & S_IFMT))
318 0 : return false;
319 :
320 : /* We consider all device fds different, since two device fds
321 : * might refer to quite different device contexts even though
322 : * they share the same inode and backing dev_t. */
323 :
324 0 : if (S_ISCHR(sta.st_mode) || S_ISBLK(sta.st_mode))
325 0 : return false;
326 :
327 0 : if (sta.st_dev != stb.st_dev || sta.st_ino != stb.st_ino)
328 0 : return false;
329 :
330 : /* The fds refer to the same inode on disk, let's also check
331 : * if they have the same fd flags. This is useful to
332 : * distinguish the read and write side of a pipe created with
333 : * pipe(). */
334 0 : fa = fcntl(a, F_GETFL);
335 0 : if (fa < 0)
336 0 : return -errno;
337 :
338 0 : fb = fcntl(b, F_GETFL);
339 0 : if (fb < 0)
340 0 : return -errno;
341 :
342 0 : return fa == fb;
343 : }
344 :
345 2 : void cmsg_close_all(struct msghdr *mh) {
346 : struct cmsghdr *cmsg;
347 :
348 2 : assert(mh);
349 :
350 2 : CMSG_FOREACH(cmsg, mh)
351 0 : if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS)
352 0 : close_many((int*) CMSG_DATA(cmsg), (cmsg->cmsg_len - CMSG_LEN(0)) / sizeof(int));
353 2 : }
354 :
/* Validates a name for $LISTEN_FDNAMES. Everything ASCII that is not a control character is allowed,
 * with the exception of ":", which serves as field separator in $LISTEN_FDNAMES.
 *
 * The empty string is explicitly allowed, NULL is not. Names are limited to 255 characters. */
bool fdname_is_valid(const char *s) {
        size_t len = 0;

        if (!s)
                return false;

        while (s[len] != '\0') {
                char c = s[len];

                if (c < ' ' || c >= 127 || c == ':')
                        return false;

                len++;
        }

        return len < 256;
}
381 :
382 29 : int fd_get_path(int fd, char **ret) {
383 : char procfs_path[STRLEN("/proc/self/fd/") + DECIMAL_STR_MAX(int)];
384 : int r;
385 :
386 29 : xsprintf(procfs_path, "/proc/self/fd/%i", fd);
387 29 : r = readlink_malloc(procfs_path, ret);
388 29 : if (r == -ENOENT) {
389 : /* ENOENT can mean two things: that the fd does not exist or that /proc is not mounted. Let's make
390 : * things debuggable and distinguish the two. */
391 :
392 0 : if (access("/proc/self/fd/", F_OK) < 0)
393 : /* /proc is not available or not set up properly, we're most likely in some chroot
394 : * environment. */
395 0 : return errno == ENOENT ? -EOPNOTSUPP : -errno;
396 :
397 0 : return -EBADF; /* The directory exists, hence it's the fd that doesn't. */
398 : }
399 :
400 29 : return r;
401 : }
402 :
403 0 : int move_fd(int from, int to, int cloexec) {
404 : int r;
405 :
406 : /* Move fd 'from' to 'to', make sure FD_CLOEXEC remains equal if requested, and release the old fd. If
407 : * 'cloexec' is passed as -1, the original FD_CLOEXEC is inherited for the new fd. If it is 0, it is turned
408 : * off, if it is > 0 it is turned on. */
409 :
410 0 : if (from < 0)
411 0 : return -EBADF;
412 0 : if (to < 0)
413 0 : return -EBADF;
414 :
415 0 : if (from == to) {
416 :
417 0 : if (cloexec >= 0) {
418 0 : r = fd_cloexec(to, cloexec);
419 0 : if (r < 0)
420 0 : return r;
421 : }
422 :
423 0 : return to;
424 : }
425 :
426 0 : if (cloexec < 0) {
427 : int fl;
428 :
429 0 : fl = fcntl(from, F_GETFD, 0);
430 0 : if (fl < 0)
431 0 : return -errno;
432 :
433 0 : cloexec = !!(fl & FD_CLOEXEC);
434 : }
435 :
436 0 : r = dup3(from, to, cloexec ? O_CLOEXEC : 0);
437 0 : if (r < 0)
438 0 : return -errno;
439 :
440 0 : assert(r == to);
441 :
442 0 : safe_close(from);
443 :
444 0 : return to;
445 : }
446 :
447 29 : int acquire_data_fd(const void *data, size_t size, unsigned flags) {
448 :
449 29 : _cleanup_close_pair_ int pipefds[2] = { -1, -1 };
450 29 : char pattern[] = "/dev/shm/data-fd-XXXXXX";
451 29 : _cleanup_close_ int fd = -1;
452 29 : int isz = 0, r;
453 : ssize_t n;
454 : off_t f;
455 :
456 29 : assert(data || size == 0);
457 :
458 : /* Acquire a read-only file descriptor that when read from returns the specified data. This is much more
459 : * complex than I wish it was. But here's why:
460 : *
461 : * a) First we try to use memfds. They are the best option, as we can seal them nicely to make them
462 : * read-only. Unfortunately they require kernel 3.17, and – at the time of writing – we still support 3.14.
463 : *
464 : * b) Then, we try classic pipes. They are the second best option, as we can close the writing side, retaining
465 : * a nicely read-only fd in the reading side. However, they are by default quite small, and unprivileged
466 : * clients can only bump their size to a system-wide limit, which might be quite low.
467 : *
468 : * c) Then, we try an O_TMPFILE file in /dev/shm (that dir is the only suitable one known to exist from
469 : * earliest boot on). To make it read-only we open the fd a second time with O_RDONLY via
470 : * /proc/self/fd/<fd>. Unfortunately O_TMPFILE is not available on older kernels on tmpfs.
471 : *
472 : * d) Finally, we try creating a regular file in /dev/shm, which we then delete.
473 : *
474 : * It sucks a bit that depending on the situation we return very different objects here, but that's Linux I
475 : * figure. Each ACQUIRE_NO_* bit set in 'flags' skips the corresponding approach. Returns the new fd on success and a negative errno-style value on failure; -EOPNOTSUPP is returned if no approach yielded an fd and the regular file fallback is skipped too. */
476 :
477 29 : if (size == 0 && ((flags & ACQUIRE_NO_DEV_NULL) == 0)) {
478 : /* As a special case, return /dev/null if we have been called for an empty data block */
479 4 : r = open("/dev/null", O_RDONLY|O_CLOEXEC|O_NOCTTY);
480 4 : if (r < 0)
481 0 : return -errno;
482 :
483 4 : return r;
484 : }
485 :
486 25 : if ((flags & ACQUIRE_NO_MEMFD) == 0) {
487 12 : fd = memfd_new("data-fd");
488 12 : if (fd < 0)
489 0 : goto try_pipe;
490 :
491 12 : n = write(fd, data, size);
492 12 : if (n < 0)
493 0 : return -errno;
494 12 : if ((size_t) n != size)
495 0 : return -EIO;
496 :
497 12 : f = lseek(fd, 0, SEEK_SET);
498 12 : if (f != 0)
499 0 : return -errno;
500 :
501 12 : r = memfd_set_sealed(fd);
502 12 : if (r < 0)
503 0 : return r;
504 :
505 12 : return TAKE_FD(fd);
506 : }
507 :
508 13 : try_pipe:
509 13 : if ((flags & ACQUIRE_NO_PIPE) == 0) {
510 5 : if (pipe2(pipefds, O_CLOEXEC|O_NONBLOCK) < 0)
511 0 : return -errno;
512 :
513 5 : isz = fcntl(pipefds[1], F_GETPIPE_SZ, 0);
514 5 : if (isz < 0)
515 0 : return -errno;
516 :
517 5 : if ((size_t) isz < size) {
518 2 : isz = (int) size;
519 2 : if (isz < 0 || (size_t) isz != size)
520 0 : return -E2BIG;
521 :
522 : /* Try to bump the pipe size */
523 2 : (void) fcntl(pipefds[1], F_SETPIPE_SZ, isz);
524 :
525 : /* See if that worked */
526 2 : isz = fcntl(pipefds[1], F_GETPIPE_SZ, 0);
527 2 : if (isz < 0)
528 0 : return -errno;
529 :
530 2 : if ((size_t) isz < size)
531 0 : goto try_dev_shm;
532 : }
533 :
534 5 : n = write(pipefds[1], data, size);
535 5 : if (n < 0)
536 0 : return -errno;
537 5 : if ((size_t) n != size)
538 0 : return -EIO;
539 :
540 5 : (void) fd_nonblock(pipefds[0], false);
541 :
542 5 : return TAKE_FD(pipefds[0]);
543 : }
544 :
545 8 : try_dev_shm:
546 8 : if ((flags & ACQUIRE_NO_TMPFILE) == 0) {
547 5 : fd = open("/dev/shm", O_RDWR|O_TMPFILE|O_CLOEXEC, 0500);
548 5 : if (fd < 0)
549 0 : goto try_dev_shm_without_o_tmpfile;
550 :
551 5 : n = write(fd, data, size);
552 5 : if (n < 0)
553 0 : return -errno;
554 5 : if ((size_t) n != size)
555 0 : return -EIO;
556 :
557 : /* Let's reopen the thing, in order to get an O_RDONLY fd for the original O_RDWR one */
558 5 : return fd_reopen(fd, O_RDONLY|O_CLOEXEC);
559 : }
560 :
561 3 : try_dev_shm_without_o_tmpfile:
562 3 : if ((flags & ACQUIRE_NO_REGULAR) == 0) {
563 3 : fd = mkostemp_safe(pattern);
564 3 : if (fd < 0)
565 0 : return fd;
566 :
567 3 : n = write(fd, data, size);
568 3 : if (n < 0) {
569 0 : r = -errno;
570 0 : goto unlink_and_return;
571 : }
572 3 : if ((size_t) n != size) {
573 0 : r = -EIO;
574 0 : goto unlink_and_return;
575 : }
576 :
577 : /* Let's reopen the thing, in order to get an O_RDONLY fd for the original O_RDWR one */
578 3 : r = open(pattern, O_RDONLY|O_CLOEXEC);
579 3 : if (r < 0)
580 0 : r = -errno;
581 :
582 3 : unlink_and_return:
583 3 : (void) unlink(pattern);
584 3 : return r;
585 : }
586 :
587 0 : return -EOPNOTSUPP;
588 : }
589 :
590 : /* When the data is smaller or equal to 64K, try to place the copy in a memfd/pipe */
591 : #define DATA_FD_MEMORY_LIMIT (64U*1024U)
592 :
593 : /* If memfd/pipe didn't work out, then let's use a file in /tmp up to a size of 1M. If it's larger than that use /var/tmp instead. */
594 : #define DATA_FD_TMP_LIMIT (1024U*1024U)
595 :
596 3 : int fd_duplicate_data_fd(int fd) {
597 :
598 3 : _cleanup_close_ int copy_fd = -1, tmp_fd = -1;
599 3 : _cleanup_free_ void *remains = NULL;
600 3 : size_t remains_size = 0;
601 : const char *td;
602 : struct stat st;
603 : int r;
604 :
605 : /* Creates a 'data' fd from the specified source fd, containing all the same data in a read-only fashion, but
606 : * independent of it (i.e. the source fd can be closed and unmounted after this call succeeded). Tries to be
607 : * somewhat smart about where to place the data. In the best case uses a memfd(). If memfd() are not supported
608 : * uses a pipe instead. For larger data will use an unlinked file in /tmp, and for even larger data one in
609 : * /var/tmp. Returns the new fd on success, or a negative errno-style value on failure (-EISDIR for directories, -ELOOP for symlinks, -EBADFD for other unsupported file types). */
610 :
611 3 : if (fstat(fd, &st) < 0)
612 0 : return -errno;
613 :
614 : /* For now, let's only accept regular files, sockets, pipes and char devices */
615 3 : if (S_ISDIR(st.st_mode))
616 0 : return -EISDIR;
617 3 : if (S_ISLNK(st.st_mode))
618 0 : return -ELOOP;
619 3 : if (!S_ISREG(st.st_mode) && !S_ISSOCK(st.st_mode) && !S_ISFIFO(st.st_mode) && !S_ISCHR(st.st_mode))
620 0 : return -EBADFD;
621 :
622 : /* If we have reason to believe the data is bounded in size, then let's use memfds or pipes as backing fd. Note
623 : * that we use the reported regular file size only as a hint, given that there are plenty special files in
624 : * /proc and /sys which report a zero file size but can be read from. */
625 :
626 3 : if (!S_ISREG(st.st_mode) || st.st_size < DATA_FD_MEMORY_LIMIT) {
627 :
628 : /* Try a memfd first */
629 3 : copy_fd = memfd_new("data-fd");
630 3 : if (copy_fd >= 0) {
631 : off_t f;
632 :
633 3 : r = copy_bytes(fd, copy_fd, DATA_FD_MEMORY_LIMIT, 0);
634 3 : if (r < 0)
635 0 : return r;
636 :
637 3 : f = lseek(copy_fd, 0, SEEK_SET);
638 3 : if (f != 0)
639 0 : return -errno;
640 :
641 3 : if (r == 0) {
642 : /* Did it fit into the limit? If so, we are done. */
643 2 : r = memfd_set_sealed(copy_fd);
644 2 : if (r < 0)
645 0 : return r;
646 :
647 2 : return TAKE_FD(copy_fd);
648 : }
649 :
650 : /* Hmm, pity, this didn't fit. Let's fall back to /tmp then, see below */
651 :
652 : } else {
653 0 : _cleanup_(close_pairp) int pipefds[2] = { -1, -1 };
654 : int isz;
655 :
656 : /* If memfds aren't available, use a pipe. Set O_NONBLOCK so that we will get EAGAIN rather
657 : * than block indefinitely when we hit the pipe size limit */
658 :
659 0 : if (pipe2(pipefds, O_CLOEXEC|O_NONBLOCK) < 0)
660 0 : return -errno;
661 :
662 0 : isz = fcntl(pipefds[1], F_GETPIPE_SZ, 0);
663 0 : if (isz < 0)
664 0 : return -errno;
665 :
666 : /* Try to enlarge the pipe size if necessary */
667 0 : if ((size_t) isz < DATA_FD_MEMORY_LIMIT) {
668 :
669 0 : (void) fcntl(pipefds[1], F_SETPIPE_SZ, DATA_FD_MEMORY_LIMIT);
670 :
671 0 : isz = fcntl(pipefds[1], F_GETPIPE_SZ, 0);
672 0 : if (isz < 0)
673 0 : return -errno;
674 : }
675 :
676 0 : if ((size_t) isz >= DATA_FD_MEMORY_LIMIT) {
677 :
678 0 : r = copy_bytes_full(fd, pipefds[1], DATA_FD_MEMORY_LIMIT, 0, &remains, &remains_size, NULL, NULL);
679 0 : if (r < 0 && r != -EAGAIN)
680 0 : return r; /* If we get EAGAIN it could be because of the source or because of
681 : * the destination fd, we can't know, as sendfile() and friends won't
682 : * tell us. Hence, treat this as reason to fall back, just to be
683 : * sure. */
684 0 : if (r == 0) {
685 : /* Everything fit in, yay! */
686 0 : (void) fd_nonblock(pipefds[0], false);
687 :
688 0 : return TAKE_FD(pipefds[0]);
689 : }
690 :
691 : /* Things didn't fit in. But we read data into the pipe, let's remember that, so that
692 : * when writing the new file we incorporate this first. */
693 0 : copy_fd = TAKE_FD(pipefds[0]);
694 : }
695 : }
696 : }
697 :
698 : /* If we have reason to believe this will fit fine in /tmp, then use that as first fallback. */
699 1 : if ((!S_ISREG(st.st_mode) || st.st_size < DATA_FD_TMP_LIMIT) &&
700 1 : (DATA_FD_MEMORY_LIMIT + remains_size) < DATA_FD_TMP_LIMIT) {
701 : off_t f;
702 :
703 1 : tmp_fd = open_tmpfile_unlinkable(NULL /* NULL as directory means /tmp */, O_RDWR|O_CLOEXEC);
704 1 : if (tmp_fd < 0)
705 0 : return tmp_fd;
706 :
707 1 : if (copy_fd >= 0) {
708 : /* If we tried a memfd/pipe first and it ended up being too large, then copy this into the
709 : * temporary file first. */
710 :
711 1 : r = copy_bytes(copy_fd, tmp_fd, UINT64_MAX, 0);
712 1 : if (r < 0)
713 0 : return r;
714 :
715 1 : assert(r == 0);
716 : }
717 :
718 1 : if (remains_size > 0) {
719 : /* If there were remaining bytes (i.e. read into memory, but not written out yet) from the
720 : * failed copy operation, let's flush them out next. */
721 :
722 0 : r = loop_write(tmp_fd, remains, remains_size, false);
723 0 : if (r < 0)
724 0 : return r;
725 : }
726 :
727 1 : r = copy_bytes(fd, tmp_fd, DATA_FD_TMP_LIMIT - DATA_FD_MEMORY_LIMIT - remains_size, COPY_REFLINK);
728 1 : if (r < 0)
729 0 : return r;
730 1 : if (r == 0)
731 0 : goto finish; /* Yay, it fit in */
732 :
733 : /* It didn't fit in. Let's not forget to use what we already used */
734 1 : f = lseek(tmp_fd, 0, SEEK_SET);
735 1 : if (f != 0)
736 0 : return -errno;
737 :
738 1 : safe_close(copy_fd);
739 1 : copy_fd = TAKE_FD(tmp_fd);
740 :
741 1 : remains = mfree(remains);
742 1 : remains_size = 0;
743 : }
744 :
745 : /* As last fallback use /var/tmp */
746 1 : r = var_tmp_dir(&td);
747 1 : if (r < 0)
748 0 : return r;
749 :
750 1 : tmp_fd = open_tmpfile_unlinkable(td, O_RDWR|O_CLOEXEC);
751 1 : if (tmp_fd < 0)
752 0 : return tmp_fd;
753 :
754 1 : if (copy_fd >= 0) {
755 : /* If we tried a memfd/pipe first, or a file in /tmp, and it ended up being too large, then copy this
756 : * into the temporary file first. */
757 1 : r = copy_bytes(copy_fd, tmp_fd, UINT64_MAX, COPY_REFLINK);
758 1 : if (r < 0)
759 0 : return r;
760 :
761 1 : assert(r == 0);
762 : }
763 :
764 1 : if (remains_size > 0) {
765 : /* Then, copy in any read but not yet written bytes. */
766 0 : r = loop_write(tmp_fd, remains, remains_size, false);
767 0 : if (r < 0)
768 0 : return r;
769 : }
770 :
771 : /* Copy in the rest */
772 1 : r = copy_bytes(fd, tmp_fd, UINT64_MAX, COPY_REFLINK);
773 1 : if (r < 0)
774 0 : return r;
775 :
776 1 : assert(r == 0);
777 :
778 1 : finish:
779 : /* Now convert the O_RDWR file descriptor into an O_RDONLY one (and as side effect seek to the beginning of the
780 : * file again). */
781 :
782 1 : return fd_reopen(tmp_fd, O_RDONLY|O_CLOEXEC);
783 : }
784 :
785 270 : int fd_move_above_stdio(int fd) {
786 : int flags, copy;
787 270 : PROTECT_ERRNO;
788 :
789 : /* Moves the specified file descriptor if possible out of the range [0…2], i.e. the range of
790 : * stdin/stdout/stderr. If it can't be moved outside of this range the original file descriptor is
791 : * returned. This call is supposed to be used for long-lasting file descriptors we allocate in our code that
792 : * might get loaded into foreign code, and where we want ensure our fds are unlikely used accidentally as
793 : * stdin/stdout/stderr of unrelated code.
794 : *
795 : * Note that this doesn't fix any real bugs, it just makes it less likely that our code will be affected by
796 : * buggy code from others that mindlessly invokes 'fprintf(stderr, …' or similar in places where stderr has
797 : * been closed before.
798 : *
799 : * This function is written in a "best-effort" and "least-impact" style. This means whenever we encounter an
800 : * error we simply return the original file descriptor, and we do not touch errno. */
801 :
802 270 : if (fd < 0 || fd > 2)
803 269 : return fd;
804 :
805 1 : flags = fcntl(fd, F_GETFD, 0);
806 1 : if (flags < 0)
807 0 : return fd;
808 :
809 1 : if (flags & FD_CLOEXEC)
810 0 : copy = fcntl(fd, F_DUPFD_CLOEXEC, 3);
811 : else
812 1 : copy = fcntl(fd, F_DUPFD, 3);
813 1 : if (copy < 0)
814 0 : return fd;
815 :
816 1 : assert(copy > 2);
817 :
818 1 : (void) close(fd);
819 1 : return copy;
820 : }
821 :
822 16 : int rearrange_stdio(int original_input_fd, int original_output_fd, int original_error_fd) {
823 :
824 16 : int fd[3] = { /* Put together an array of fds we work on */
825 : original_input_fd,
826 : original_output_fd,
827 : original_error_fd
828 : };
829 :
830 : int r, i,
831 16 : null_fd = -1, /* if we open /dev/null, we store the fd to it here */
832 16 : copy_fd[3] = { -1, -1, -1 }; /* This contains all fds we duplicate here temporarily, and hence need to close at the end */
833 : bool null_readable, null_writable;
834 :
835 : /* Sets up stdin, stdout, stderr with the three file descriptors passed in. If any of the descriptors is
836 : * specified as -1 it will be connected with /dev/null instead. If any of the file descriptors is passed as
837 : * itself (e.g. stdin as STDIN_FILENO) it is left unmodified, but the O_CLOEXEC bit is turned off should it be
838 : * on.
839 : *
840 : * Note that if any of the passed file descriptors are > 2 they will be closed — both on success and on
841 : * failure! Thus, callers should assume that when this function returns the input fds are invalidated.
842 : *
843 : * Note that when this function fails stdin/stdout/stderr might remain half set up!
844 : *
845 : * O_CLOEXEC is turned off for all three file descriptors (which is how it should be for
846 : * stdin/stdout/stderr). Returns 0 on success, or a negative errno-style value on failure. */
847 :
848 16 : null_readable = original_input_fd < 0;
849 16 : null_writable = original_output_fd < 0 || original_error_fd < 0;
850 :
851 : /* First step, open /dev/null once, if we need it */
852 16 : if (null_readable || null_writable) {
853 :
854 : /* Let's open this with O_CLOEXEC first, and convert it to non-O_CLOEXEC when we move the fd to the final position. */
855 0 : null_fd = open("/dev/null", (null_readable && null_writable ? O_RDWR :
856 0 : null_readable ? O_RDONLY : O_WRONLY) | O_CLOEXEC);
857 0 : if (null_fd < 0) {
858 0 : r = -errno;
859 0 : goto finish;
860 : }
861 :
862 : /* If this fd is in the 0…2 range, let's move it out of it */
863 0 : if (null_fd < 3) {
864 : int copy;
865 :
866 0 : copy = fcntl(null_fd, F_DUPFD_CLOEXEC, 3); /* Duplicate this with O_CLOEXEC set */
867 0 : if (copy < 0) {
868 0 : r = -errno;
869 0 : goto finish;
870 : }
871 :
872 0 : safe_close(null_fd);
873 0 : null_fd = copy;
874 : }
875 : }
876 :
877 : /* Let's assemble fd[] with the fds to install in place of stdin/stdout/stderr */
878 64 : for (i = 0; i < 3; i++) {
879 :
880 48 : if (fd[i] < 0)
881 0 : fd[i] = null_fd; /* A negative parameter means: connect this one to /dev/null */
882 48 : else if (fd[i] != i && fd[i] < 3) {
883 : /* This fd is in the 0…2 territory, but not at its intended place, move it out of there, so that we can work there. */
884 0 : copy_fd[i] = fcntl(fd[i], F_DUPFD_CLOEXEC, 3); /* Duplicate this with O_CLOEXEC set */
885 0 : if (copy_fd[i] < 0) {
886 0 : r = -errno;
887 0 : goto finish;
888 : }
889 :
890 0 : fd[i] = copy_fd[i];
891 : }
892 : }
893 :
894 : /* At this point we now have the fds to use in fd[], and each of them is either already at its final place
895 : * (fd[i] == i) or above the stdio range, so that we have freedom to move them around. Let's now move them
896 : * to the right places. This is the point of no return. */
897 64 : for (i = 0; i < 3; i++) {
898 :
899 48 : if (fd[i] == i) {
900 :
901 : /* fd is already in place, but let's make sure O_CLOEXEC is off */
902 32 : r = fd_cloexec(i, false);
903 32 : if (r < 0)
904 0 : goto finish;
905 :
906 : } else {
907 16 : assert(fd[i] > 2);
908 :
909 16 : if (dup2(fd[i], i) < 0) { /* Turns off O_CLOEXEC on the new fd. */
910 0 : r = -errno;
911 0 : goto finish;
912 : }
913 : }
914 : }
915 :
916 16 : r = 0;
917 :
918 16 : finish:
919 : /* Close the original fds, but only if they were outside of the stdio range. Also, properly check for the same
920 : * fd passed in multiple times. */
921 16 : safe_close_above_stdio(original_input_fd);
922 16 : if (original_output_fd != original_input_fd)
923 16 : safe_close_above_stdio(original_output_fd);
924 16 : if (original_error_fd != original_input_fd && original_error_fd != original_output_fd)
925 16 : safe_close_above_stdio(original_error_fd);
926 :
927 : /* Close the copies we moved > 2 */
928 64 : for (i = 0; i < 3; i++)
929 48 : safe_close(copy_fd[i]);
930 :
931 : /* Close our null fd, if it's > 2 */
932 16 : safe_close_above_stdio(null_fd);
933 :
934 16 : return r;
935 : }
936 :
937 31 : int fd_reopen(int fd, int flags) {
938 : char procfs_path[STRLEN("/proc/self/fd/") + DECIMAL_STR_MAX(int)];
939 : int new_fd;
940 :
941 : /* Reopens the specified fd with new flags. This is useful for convert an O_PATH fd into a regular one, or to
942 : * turn O_RDWR fds into O_RDONLY fds.
943 : *
944 : * This doesn't work on sockets (since they cannot be open()ed, ever).
945 : *
946 : * This implicitly resets the file read index to 0. */
947 :
948 31 : xsprintf(procfs_path, "/proc/self/fd/%i", fd);
949 31 : new_fd = open(procfs_path, flags);
950 31 : if (new_fd < 0)
951 0 : return -errno;
952 :
953 31 : return new_fd;
954 : }
955 :
956 1 : int read_nr_open(void) {
957 1 : _cleanup_free_ char *nr_open = NULL;
958 : int r;
959 :
960 : /* Returns the kernel's current fd limit, either by reading it of /proc/sys if that works, or using the
961 : * hard-coded default compiled-in value of current kernels (1M) if not. This call will never fail. */
962 :
963 1 : r = read_one_line_file("/proc/sys/fs/nr_open", &nr_open);
964 1 : if (r < 0)
965 0 : log_debug_errno(r, "Failed to read /proc/sys/fs/nr_open, ignoring: %m");
966 : else {
967 : int v;
968 :
969 1 : r = safe_atoi(nr_open, &v);
970 1 : if (r < 0)
971 0 : log_debug_errno(r, "Failed to parse /proc/sys/fs/nr_open value '%s', ignoring: %m", nr_open);
972 : else
973 1 : return v;
974 : }
975 :
976 : /* If we fail, fallback to the hard-coded kernel limit of 1024 * 1024. */
977 0 : return 1024 * 1024;
978 : }
|