Line data Source code
1 : /* SPDX-License-Identifier: LGPL-2.1+ */
2 :
3 : #include <errno.h>
4 : #include <fcntl.h>
5 : #include <linux/seccomp.h>
6 : #include <seccomp.h>
7 : #include <stddef.h>
8 : #include <sys/mman.h>
9 : #include <sys/prctl.h>
10 : #include <sys/shm.h>
11 : #include <sys/stat.h>
12 :
13 : #include "af-list.h"
14 : #include "alloc-util.h"
15 : #include "errno-list.h"
16 : #include "macro.h"
17 : #include "nsflags.h"
18 : #include "nulstr-util.h"
19 : #include "process-util.h"
20 : #include "seccomp-util.h"
21 : #include "set.h"
22 : #include "string-util.h"
23 : #include "strv.h"
24 :
25 : const uint32_t seccomp_local_archs[] = {
26 :
27 : /* Note: always list the native arch we are compiled as last, so that users can blacklist seccomp(), but our own calls to it still succeed */
28 :
29 : #if defined(__x86_64__) && defined(__ILP32__)
30 : SCMP_ARCH_X86,
31 : SCMP_ARCH_X86_64,
32 : SCMP_ARCH_X32, /* native */
33 : #elif defined(__x86_64__) && !defined(__ILP32__)
34 : SCMP_ARCH_X86,
35 : SCMP_ARCH_X32,
36 : SCMP_ARCH_X86_64, /* native */
37 : #elif defined(__i386__)
38 : SCMP_ARCH_X86,
39 : #elif defined(__aarch64__)
40 : SCMP_ARCH_ARM,
41 : SCMP_ARCH_AARCH64, /* native */
42 : #elif defined(__arm__)
43 : SCMP_ARCH_ARM,
44 : #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
45 : SCMP_ARCH_MIPSEL,
46 : SCMP_ARCH_MIPS, /* native */
47 : #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
48 : SCMP_ARCH_MIPS,
49 : SCMP_ARCH_MIPSEL, /* native */
50 : #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
51 : SCMP_ARCH_MIPSEL,
52 : SCMP_ARCH_MIPS,
53 : SCMP_ARCH_MIPSEL64N32,
54 : SCMP_ARCH_MIPS64N32,
55 : SCMP_ARCH_MIPSEL64,
56 : SCMP_ARCH_MIPS64, /* native */
57 : #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
58 : SCMP_ARCH_MIPS,
59 : SCMP_ARCH_MIPSEL,
60 : SCMP_ARCH_MIPS64N32,
61 : SCMP_ARCH_MIPSEL64N32,
62 : SCMP_ARCH_MIPS64,
63 : SCMP_ARCH_MIPSEL64, /* native */
64 : #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
65 : SCMP_ARCH_MIPSEL,
66 : SCMP_ARCH_MIPS,
67 : SCMP_ARCH_MIPSEL64,
68 : SCMP_ARCH_MIPS64,
69 : SCMP_ARCH_MIPSEL64N32,
70 : SCMP_ARCH_MIPS64N32, /* native */
71 : #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
72 : SCMP_ARCH_MIPS,
73 : SCMP_ARCH_MIPSEL,
74 : SCMP_ARCH_MIPS64,
75 : SCMP_ARCH_MIPSEL64,
76 : SCMP_ARCH_MIPS64N32,
77 : SCMP_ARCH_MIPSEL64N32, /* native */
78 : #elif defined(__powerpc64__) && __BYTE_ORDER == __BIG_ENDIAN
79 : SCMP_ARCH_PPC,
80 : SCMP_ARCH_PPC64LE,
81 : SCMP_ARCH_PPC64, /* native */
82 : #elif defined(__powerpc64__) && __BYTE_ORDER == __LITTLE_ENDIAN
83 : SCMP_ARCH_PPC,
84 : SCMP_ARCH_PPC64,
85 : SCMP_ARCH_PPC64LE, /* native */
86 : #elif defined(__powerpc__)
87 : SCMP_ARCH_PPC,
88 : #elif defined(__s390x__)
89 : SCMP_ARCH_S390,
90 : SCMP_ARCH_S390X, /* native */
91 : #elif defined(__s390__)
92 : SCMP_ARCH_S390,
93 : #endif
94 : (uint32_t) -1
95 : };
96 :
97 33 : const char* seccomp_arch_to_string(uint32_t c) {
98 : /* Maintain order used in <seccomp.h>.
99 : *
100 : * Names used here should be the same as those used for ConditionArchitecture=,
101 : * except for "subarchitectures" like x32. */
102 :
103 33 : switch(c) {
104 1 : case SCMP_ARCH_NATIVE:
105 1 : return "native";
106 6 : case SCMP_ARCH_X86:
107 6 : return "x86";
108 7 : case SCMP_ARCH_X86_64:
109 7 : return "x86-64";
110 6 : case SCMP_ARCH_X32:
111 6 : return "x32";
112 1 : case SCMP_ARCH_ARM:
113 1 : return "arm";
114 1 : case SCMP_ARCH_AARCH64:
115 1 : return "arm64";
116 1 : case SCMP_ARCH_MIPS:
117 1 : return "mips";
118 1 : case SCMP_ARCH_MIPS64:
119 1 : return "mips64";
120 1 : case SCMP_ARCH_MIPS64N32:
121 1 : return "mips64-n32";
122 1 : case SCMP_ARCH_MIPSEL:
123 1 : return "mips-le";
124 1 : case SCMP_ARCH_MIPSEL64:
125 1 : return "mips64-le";
126 1 : case SCMP_ARCH_MIPSEL64N32:
127 1 : return "mips64-le-n32";
128 1 : case SCMP_ARCH_PPC:
129 1 : return "ppc";
130 1 : case SCMP_ARCH_PPC64:
131 1 : return "ppc64";
132 1 : case SCMP_ARCH_PPC64LE:
133 1 : return "ppc64-le";
134 1 : case SCMP_ARCH_S390:
135 1 : return "s390";
136 1 : case SCMP_ARCH_S390X:
137 1 : return "s390x";
138 0 : default:
139 0 : return NULL;
140 : }
141 : }
142 :
143 18 : int seccomp_arch_from_string(const char *n, uint32_t *ret) {
144 18 : if (!n)
145 0 : return -EINVAL;
146 :
147 18 : assert(ret);
148 :
149 18 : if (streq(n, "native"))
150 1 : *ret = SCMP_ARCH_NATIVE;
151 17 : else if (streq(n, "x86"))
152 1 : *ret = SCMP_ARCH_X86;
153 16 : else if (streq(n, "x86-64"))
154 2 : *ret = SCMP_ARCH_X86_64;
155 14 : else if (streq(n, "x32"))
156 1 : *ret = SCMP_ARCH_X32;
157 13 : else if (streq(n, "arm"))
158 1 : *ret = SCMP_ARCH_ARM;
159 12 : else if (streq(n, "arm64"))
160 1 : *ret = SCMP_ARCH_AARCH64;
161 11 : else if (streq(n, "mips"))
162 1 : *ret = SCMP_ARCH_MIPS;
163 10 : else if (streq(n, "mips64"))
164 1 : *ret = SCMP_ARCH_MIPS64;
165 9 : else if (streq(n, "mips64-n32"))
166 1 : *ret = SCMP_ARCH_MIPS64N32;
167 8 : else if (streq(n, "mips-le"))
168 1 : *ret = SCMP_ARCH_MIPSEL;
169 7 : else if (streq(n, "mips64-le"))
170 1 : *ret = SCMP_ARCH_MIPSEL64;
171 6 : else if (streq(n, "mips64-le-n32"))
172 1 : *ret = SCMP_ARCH_MIPSEL64N32;
173 5 : else if (streq(n, "ppc"))
174 1 : *ret = SCMP_ARCH_PPC;
175 4 : else if (streq(n, "ppc64"))
176 1 : *ret = SCMP_ARCH_PPC64;
177 3 : else if (streq(n, "ppc64-le"))
178 1 : *ret = SCMP_ARCH_PPC64LE;
179 2 : else if (streq(n, "s390"))
180 1 : *ret = SCMP_ARCH_S390;
181 1 : else if (streq(n, "s390x"))
182 1 : *ret = SCMP_ARCH_S390X;
183 : else
184 0 : return -EINVAL;
185 :
186 18 : return 0;
187 : }
188 :
189 0 : int seccomp_init_for_arch(scmp_filter_ctx *ret, uint32_t arch, uint32_t default_action) {
190 : scmp_filter_ctx seccomp;
191 : int r;
192 :
193 : /* Much like seccomp_init(), but initializes the filter for one specific architecture only, without affecting
194 : * any others. Also, turns off the NNP fiddling. */
195 :
196 0 : seccomp = seccomp_init(default_action);
197 0 : if (!seccomp)
198 0 : return -ENOMEM;
199 :
200 0 : if (arch != SCMP_ARCH_NATIVE &&
201 0 : arch != seccomp_arch_native()) {
202 :
203 0 : r = seccomp_arch_remove(seccomp, seccomp_arch_native());
204 0 : if (r < 0)
205 0 : goto finish;
206 :
207 0 : r = seccomp_arch_add(seccomp, arch);
208 0 : if (r < 0)
209 0 : goto finish;
210 :
211 0 : assert(seccomp_arch_exist(seccomp, arch) >= 0);
212 0 : assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) == -EEXIST);
213 0 : assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) == -EEXIST);
214 : } else {
215 0 : assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) >= 0);
216 0 : assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) >= 0);
217 : }
218 :
219 0 : r = seccomp_attr_set(seccomp, SCMP_FLTATR_ACT_BADARCH, SCMP_ACT_ALLOW);
220 0 : if (r < 0)
221 0 : goto finish;
222 :
223 0 : r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
224 0 : if (r < 0)
225 0 : goto finish;
226 :
227 0 : *ret = seccomp;
228 0 : return 0;
229 :
230 0 : finish:
231 0 : seccomp_release(seccomp);
232 0 : return r;
233 : }
234 :
/* Returns true if the PR_GET_SECCOMP prctl() call succeeds. */
static bool is_basic_seccomp_available(void) {
        int mode;

        mode = prctl(PR_GET_SECCOMP, 0, 0, 0, 0);
        return mode >= 0;
}
238 :
/* Probes for seccomp filter mode by trying to install a NULL filter program: the result is true
 * only if that call fails with EFAULT. */
static bool is_seccomp_filter_available(void) {
        if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0) >= 0)
                return false;

        return errno == EFAULT;
}
243 :
/* Reports whether both basic seccomp and seccomp filter mode are usable. The probes are run on
 * the first call only; the result is kept in a function-local static and reused afterwards. */
bool is_seccomp_available(void) {
        static int cached_enabled = -1;

        if (cached_enabled >= 0)
                return cached_enabled;

        cached_enabled =
                is_basic_seccomp_available() &&
                is_seccomp_filter_available();

        return cached_enabled;
}
254 :
255 : const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
256 : [SYSCALL_FILTER_SET_DEFAULT] = {
257 : .name = "@default",
258 : .help = "System calls that are always permitted",
259 : .value =
260 : "clock_getres\0"
261 : "clock_gettime\0"
262 : "clock_nanosleep\0"
263 : "execve\0"
264 : "exit\0"
265 : "exit_group\0"
266 : "futex\0"
267 : "get_robust_list\0"
268 : "get_thread_area\0"
269 : "getegid\0"
270 : "getegid32\0"
271 : "geteuid\0"
272 : "geteuid32\0"
273 : "getgid\0"
274 : "getgid32\0"
275 : "getgroups\0"
276 : "getgroups32\0"
277 : "getpgid\0"
278 : "getpgrp\0"
279 : "getpid\0"
280 : "getppid\0"
281 : "getresgid\0"
282 : "getresgid32\0"
283 : "getresuid\0"
284 : "getresuid32\0"
285 : "getrlimit\0" /* make sure processes can query stack size and such */
286 : "getsid\0"
287 : "gettid\0"
288 : "gettimeofday\0"
289 : "getuid\0"
290 : "getuid32\0"
291 : "membarrier\0"
292 : "nanosleep\0"
293 : "pause\0"
294 : "prlimit64\0"
295 : "restart_syscall\0"
296 : "rseq\0"
297 : "rt_sigreturn\0"
298 : "sched_yield\0"
299 : "set_robust_list\0"
300 : "set_thread_area\0"
301 : "set_tid_address\0"
302 : "set_tls\0"
303 : "sigreturn\0"
304 : "time\0"
305 : "ugetrlimit\0"
306 : },
307 : [SYSCALL_FILTER_SET_AIO] = {
308 : .name = "@aio",
309 : .help = "Asynchronous IO",
310 : .value =
311 : "io_cancel\0"
312 : "io_destroy\0"
313 : "io_getevents\0"
314 : "io_pgetevents\0"
315 : "io_setup\0"
316 : "io_submit\0"
317 : },
318 : [SYSCALL_FILTER_SET_BASIC_IO] = {
319 : .name = "@basic-io",
320 : .help = "Basic IO",
321 : .value =
322 : "_llseek\0"
323 : "close\0"
324 : "dup\0"
325 : "dup2\0"
326 : "dup3\0"
327 : "lseek\0"
328 : "pread64\0"
329 : "preadv\0"
330 : "preadv2\0"
331 : "pwrite64\0"
332 : "pwritev\0"
333 : "pwritev2\0"
334 : "read\0"
335 : "readv\0"
336 : "write\0"
337 : "writev\0"
338 : },
339 : [SYSCALL_FILTER_SET_CHOWN] = {
340 : .name = "@chown",
341 : .help = "Change ownership of files and directories",
342 : .value =
343 : "chown\0"
344 : "chown32\0"
345 : "fchown\0"
346 : "fchown32\0"
347 : "fchownat\0"
348 : "lchown\0"
349 : "lchown32\0"
350 : },
351 : [SYSCALL_FILTER_SET_CLOCK] = {
352 : .name = "@clock",
353 : .help = "Change the system time",
354 : .value =
355 : "adjtimex\0"
356 : "clock_adjtime\0"
357 : "clock_settime\0"
358 : "settimeofday\0"
359 : "stime\0"
360 : },
361 : [SYSCALL_FILTER_SET_CPU_EMULATION] = {
362 : .name = "@cpu-emulation",
363 : .help = "System calls for CPU emulation functionality",
364 : .value =
365 : "modify_ldt\0"
366 : "subpage_prot\0"
367 : "switch_endian\0"
368 : "vm86\0"
369 : "vm86old\0"
370 : },
371 : [SYSCALL_FILTER_SET_DEBUG] = {
372 : .name = "@debug",
373 : .help = "Debugging, performance monitoring and tracing functionality",
374 : .value =
375 : "lookup_dcookie\0"
376 : "perf_event_open\0"
377 : "ptrace\0"
378 : "rtas\0"
379 : #ifdef __NR_s390_runtime_instr
380 : "s390_runtime_instr\0"
381 : #endif
382 : "sys_debug_setcontext\0"
383 : },
384 : [SYSCALL_FILTER_SET_FILE_SYSTEM] = {
385 : .name = "@file-system",
386 : .help = "File system operations",
387 : .value =
388 : "access\0"
389 : "chdir\0"
390 : "chmod\0"
391 : "close\0"
392 : "creat\0"
393 : "faccessat\0"
394 : "fallocate\0"
395 : "fchdir\0"
396 : "fchmod\0"
397 : "fchmodat\0"
398 : "fcntl\0"
399 : "fcntl64\0"
400 : "fgetxattr\0"
401 : "flistxattr\0"
402 : "fremovexattr\0"
403 : "fsetxattr\0"
404 : "fstat\0"
405 : "fstat64\0"
406 : "fstatat64\0"
407 : "fstatfs\0"
408 : "fstatfs64\0"
409 : "ftruncate\0"
410 : "ftruncate64\0"
411 : "futimesat\0"
412 : "getcwd\0"
413 : "getdents\0"
414 : "getdents64\0"
415 : "getxattr\0"
416 : "inotify_add_watch\0"
417 : "inotify_init\0"
418 : "inotify_init1\0"
419 : "inotify_rm_watch\0"
420 : "lgetxattr\0"
421 : "link\0"
422 : "linkat\0"
423 : "listxattr\0"
424 : "llistxattr\0"
425 : "lremovexattr\0"
426 : "lsetxattr\0"
427 : "lstat\0"
428 : "lstat64\0"
429 : "mkdir\0"
430 : "mkdirat\0"
431 : "mknod\0"
432 : "mknodat\0"
433 : "mmap\0"
434 : "mmap2\0"
435 : "munmap\0"
436 : "newfstatat\0"
437 : "oldfstat\0"
438 : "oldlstat\0"
439 : "oldstat\0"
440 : "open\0"
441 : "openat\0"
442 : "readlink\0"
443 : "readlinkat\0"
444 : "removexattr\0"
445 : "rename\0"
446 : "renameat\0"
447 : "renameat2\0"
448 : "rmdir\0"
449 : "setxattr\0"
450 : "stat\0"
451 : "stat64\0"
452 : "statfs\0"
453 : "statfs64\0"
454 : #ifdef __NR_statx
455 : "statx\0"
456 : #endif
457 : "symlink\0"
458 : "symlinkat\0"
459 : "truncate\0"
460 : "truncate64\0"
461 : "unlink\0"
462 : "unlinkat\0"
463 : "utime\0"
464 : "utimensat\0"
465 : "utimes\0"
466 : },
467 : [SYSCALL_FILTER_SET_IO_EVENT] = {
468 : .name = "@io-event",
469 : .help = "Event loop system calls",
470 : .value =
471 : "_newselect\0"
472 : "epoll_create\0"
473 : "epoll_create1\0"
474 : "epoll_ctl\0"
475 : "epoll_ctl_old\0"
476 : "epoll_pwait\0"
477 : "epoll_wait\0"
478 : "epoll_wait_old\0"
479 : "eventfd\0"
480 : "eventfd2\0"
481 : "poll\0"
482 : "ppoll\0"
483 : "pselect6\0"
484 : "select\0"
485 : },
486 : [SYSCALL_FILTER_SET_IPC] = {
487 : .name = "@ipc",
488 : .help = "SysV IPC, POSIX Message Queues or other IPC",
489 : .value =
490 : "ipc\0"
491 : "memfd_create\0"
492 : "mq_getsetattr\0"
493 : "mq_notify\0"
494 : "mq_open\0"
495 : "mq_timedreceive\0"
496 : "mq_timedsend\0"
497 : "mq_unlink\0"
498 : "msgctl\0"
499 : "msgget\0"
500 : "msgrcv\0"
501 : "msgsnd\0"
502 : "pipe\0"
503 : "pipe2\0"
504 : "process_vm_readv\0"
505 : "process_vm_writev\0"
506 : "semctl\0"
507 : "semget\0"
508 : "semop\0"
509 : "semtimedop\0"
510 : "shmat\0"
511 : "shmctl\0"
512 : "shmdt\0"
513 : "shmget\0"
514 : },
515 : [SYSCALL_FILTER_SET_KEYRING] = {
516 : .name = "@keyring",
517 : .help = "Kernel keyring access",
518 : .value =
519 : "add_key\0"
520 : "keyctl\0"
521 : "request_key\0"
522 : },
523 : [SYSCALL_FILTER_SET_MEMLOCK] = {
524 : .name = "@memlock",
525 : .help = "Memory locking control",
526 : .value =
527 : "mlock\0"
528 : "mlock2\0"
529 : "mlockall\0"
530 : "munlock\0"
531 : "munlockall\0"
532 : },
533 : [SYSCALL_FILTER_SET_MODULE] = {
534 : .name = "@module",
535 : .help = "Loading and unloading of kernel modules",
536 : .value =
537 : "delete_module\0"
538 : "finit_module\0"
539 : "init_module\0"
540 : },
541 : [SYSCALL_FILTER_SET_MOUNT] = {
542 : .name = "@mount",
543 : .help = "Mounting and unmounting of file systems",
544 : .value =
545 : "chroot\0"
546 : "mount\0"
547 : "pivot_root\0"
548 : "umount\0"
549 : "umount2\0"
550 : },
551 : [SYSCALL_FILTER_SET_NETWORK_IO] = {
552 : .name = "@network-io",
553 : .help = "Network or Unix socket IO, should not be needed if not network facing",
554 : .value =
555 : "accept\0"
556 : "accept4\0"
557 : "bind\0"
558 : "connect\0"
559 : "getpeername\0"
560 : "getsockname\0"
561 : "getsockopt\0"
562 : "listen\0"
563 : "recv\0"
564 : "recvfrom\0"
565 : "recvmmsg\0"
566 : "recvmsg\0"
567 : "send\0"
568 : "sendmmsg\0"
569 : "sendmsg\0"
570 : "sendto\0"
571 : "setsockopt\0"
572 : "shutdown\0"
573 : "socket\0"
574 : "socketcall\0"
575 : "socketpair\0"
576 : },
577 : [SYSCALL_FILTER_SET_OBSOLETE] = {
578 : /* some unknown even to libseccomp */
579 : .name = "@obsolete",
580 : .help = "Unusual, obsolete or unimplemented system calls",
581 : .value =
582 : "_sysctl\0"
583 : "afs_syscall\0"
584 : "bdflush\0"
585 : "break\0"
586 : "create_module\0"
587 : "ftime\0"
588 : "get_kernel_syms\0"
589 : "getpmsg\0"
590 : "gtty\0"
591 : "idle\0"
592 : "lock\0"
593 : "mpx\0"
594 : "prof\0"
595 : "profil\0"
596 : "putpmsg\0"
597 : "query_module\0"
598 : "security\0"
599 : "sgetmask\0"
600 : "ssetmask\0"
601 : "stty\0"
602 : "sysfs\0"
603 : "tuxcall\0"
604 : "ulimit\0"
605 : "uselib\0"
606 : "ustat\0"
607 : "vserver\0"
608 : },
609 : [SYSCALL_FILTER_SET_PRIVILEGED] = {
610 : .name = "@privileged",
611 : .help = "All system calls which need super-user capabilities",
612 : .value =
613 : "@chown\0"
614 : "@clock\0"
615 : "@module\0"
616 : "@raw-io\0"
617 : "@reboot\0"
618 : "@swap\0"
619 : "_sysctl\0"
620 : "acct\0"
621 : "bpf\0"
622 : "capset\0"
623 : "chroot\0"
624 : "fanotify_init\0"
625 : "nfsservctl\0"
626 : "open_by_handle_at\0"
627 : "pivot_root\0"
628 : "quotactl\0"
629 : "setdomainname\0"
630 : "setfsuid\0"
631 : "setfsuid32\0"
632 : "setgroups\0"
633 : "setgroups32\0"
634 : "sethostname\0"
635 : "setresuid\0"
636 : "setresuid32\0"
637 : "setreuid\0"
638 : "setreuid32\0"
639 : "setuid\0" /* We list the explicit system calls here, as @setuid also includes setgid() which is not necessarily privileged */
640 : "setuid32\0"
641 : "vhangup\0"
642 : },
643 : [SYSCALL_FILTER_SET_PROCESS] = {
644 : .name = "@process",
645 : .help = "Process control, execution, namespaceing operations",
646 : .value =
647 : "arch_prctl\0"
648 : "capget\0" /* Able to query arbitrary processes */
649 : "clone\0"
650 : "execveat\0"
651 : "fork\0"
652 : "getrusage\0"
653 : "kill\0"
654 : "pidfd_send_signal\0"
655 : "prctl\0"
656 : "rt_sigqueueinfo\0"
657 : "rt_tgsigqueueinfo\0"
658 : "setns\0"
659 : "swapcontext\0" /* Some archs e.g. powerpc32 are using it to do userspace context switches */
660 : "tgkill\0"
661 : "times\0"
662 : "tkill\0"
663 : "unshare\0"
664 : "vfork\0"
665 : "wait4\0"
666 : "waitid\0"
667 : "waitpid\0"
668 : },
669 : [SYSCALL_FILTER_SET_RAW_IO] = {
670 : .name = "@raw-io",
671 : .help = "Raw I/O port access",
672 : .value =
673 : "ioperm\0"
674 : "iopl\0"
675 : "pciconfig_iobase\0"
676 : "pciconfig_read\0"
677 : "pciconfig_write\0"
678 : #ifdef __NR_s390_pci_mmio_read
679 : "s390_pci_mmio_read\0"
680 : #endif
681 : #ifdef __NR_s390_pci_mmio_write
682 : "s390_pci_mmio_write\0"
683 : #endif
684 : },
685 : [SYSCALL_FILTER_SET_REBOOT] = {
686 : .name = "@reboot",
687 : .help = "Reboot and reboot preparation/kexec",
688 : .value =
689 : "kexec_file_load\0"
690 : "kexec_load\0"
691 : "reboot\0"
692 : },
693 : [SYSCALL_FILTER_SET_RESOURCES] = {
694 : .name = "@resources",
695 : .help = "Alter resource settings",
696 : .value =
697 : "ioprio_set\0"
698 : "mbind\0"
699 : "migrate_pages\0"
700 : "move_pages\0"
701 : "nice\0"
702 : "sched_setaffinity\0"
703 : "sched_setattr\0"
704 : "sched_setparam\0"
705 : "sched_setscheduler\0"
706 : "set_mempolicy\0"
707 : "setpriority\0"
708 : "setrlimit\0"
709 : },
710 : [SYSCALL_FILTER_SET_SETUID] = {
711 : .name = "@setuid",
712 : .help = "Operations for changing user/group credentials",
713 : .value =
714 : "setgid\0"
715 : "setgid32\0"
716 : "setgroups\0"
717 : "setgroups32\0"
718 : "setregid\0"
719 : "setregid32\0"
720 : "setresgid\0"
721 : "setresgid32\0"
722 : "setresuid\0"
723 : "setresuid32\0"
724 : "setreuid\0"
725 : "setreuid32\0"
726 : "setuid\0"
727 : "setuid32\0"
728 : },
729 : [SYSCALL_FILTER_SET_SIGNAL] = {
730 : .name = "@signal",
731 : .help = "Process signal handling",
732 : .value =
733 : "rt_sigaction\0"
734 : "rt_sigpending\0"
735 : "rt_sigprocmask\0"
736 : "rt_sigsuspend\0"
737 : "rt_sigtimedwait\0"
738 : "sigaction\0"
739 : "sigaltstack\0"
740 : "signal\0"
741 : "signalfd\0"
742 : "signalfd4\0"
743 : "sigpending\0"
744 : "sigprocmask\0"
745 : "sigsuspend\0"
746 : },
747 : [SYSCALL_FILTER_SET_SWAP] = {
748 : .name = "@swap",
749 : .help = "Enable/disable swap devices",
750 : .value =
751 : "swapoff\0"
752 : "swapon\0"
753 : },
754 : [SYSCALL_FILTER_SET_SYNC] = {
755 : .name = "@sync",
756 : .help = "Synchronize files and memory to storage",
757 : .value =
758 : "fdatasync\0"
759 : "fsync\0"
760 : "msync\0"
761 : "sync\0"
762 : "sync_file_range\0"
763 : "sync_file_range2\0"
764 : "syncfs\0"
765 : },
766 : [SYSCALL_FILTER_SET_SYSTEM_SERVICE] = {
767 : .name = "@system-service",
768 : .help = "General system service operations",
769 : .value =
770 : "@aio\0"
771 : "@basic-io\0"
772 : "@chown\0"
773 : "@default\0"
774 : "@file-system\0"
775 : "@io-event\0"
776 : "@ipc\0"
777 : "@keyring\0"
778 : "@memlock\0"
779 : "@network-io\0"
780 : "@process\0"
781 : "@resources\0"
782 : "@setuid\0"
783 : "@signal\0"
784 : "@sync\0"
785 : "@timer\0"
786 : "brk\0"
787 : "capget\0"
788 : "capset\0"
789 : "copy_file_range\0"
790 : "fadvise64\0"
791 : "fadvise64_64\0"
792 : "flock\0"
793 : "get_mempolicy\0"
794 : "getcpu\0"
795 : "getpriority\0"
796 : "getrandom\0"
797 : "ioctl\0"
798 : "ioprio_get\0"
799 : "kcmp\0"
800 : "madvise\0"
801 : "mprotect\0"
802 : "mremap\0"
803 : "name_to_handle_at\0"
804 : "oldolduname\0"
805 : "olduname\0"
806 : "personality\0"
807 : "readahead\0"
808 : "readdir\0"
809 : "remap_file_pages\0"
810 : "sched_get_priority_max\0"
811 : "sched_get_priority_min\0"
812 : "sched_getaffinity\0"
813 : "sched_getattr\0"
814 : "sched_getparam\0"
815 : "sched_getscheduler\0"
816 : "sched_rr_get_interval\0"
817 : "sched_yield\0"
818 : "sendfile\0"
819 : "sendfile64\0"
820 : "setfsgid\0"
821 : "setfsgid32\0"
822 : "setfsuid\0"
823 : "setfsuid32\0"
824 : "setpgid\0"
825 : "setsid\0"
826 : "splice\0"
827 : "sysinfo\0"
828 : "tee\0"
829 : "umask\0"
830 : "uname\0"
831 : "userfaultfd\0"
832 : "vmsplice\0"
833 : },
834 : [SYSCALL_FILTER_SET_TIMER] = {
835 : .name = "@timer",
836 : .help = "Schedule operations by time",
837 : .value =
838 : "alarm\0"
839 : "getitimer\0"
840 : "setitimer\0"
841 : "timer_create\0"
842 : "timer_delete\0"
843 : "timer_getoverrun\0"
844 : "timer_gettime\0"
845 : "timer_settime\0"
846 : "timerfd_create\0"
847 : "timerfd_gettime\0"
848 : "timerfd_settime\0"
849 : "times\0"
850 : },
851 : };
852 :
853 7 : const SyscallFilterSet *syscall_filter_set_find(const char *name) {
854 : unsigned i;
855 :
856 7 : if (isempty(name) || name[0] != '@')
857 3 : return NULL;
858 :
859 53 : for (i = 0; i < _SYSCALL_FILTER_SET_MAX; i++)
860 52 : if (streq(syscall_filter_sets[i].name, name))
861 3 : return syscall_filter_sets + i;
862 :
863 1 : return NULL;
864 : }
865 :
866 : static int seccomp_add_syscall_filter_set(scmp_filter_ctx seccomp, const SyscallFilterSet *set, uint32_t action, char **exclude, bool log_missing);
867 :
868 0 : int seccomp_add_syscall_filter_item(scmp_filter_ctx *seccomp, const char *name, uint32_t action, char **exclude, bool log_missing) {
869 0 : assert(seccomp);
870 0 : assert(name);
871 :
872 0 : if (strv_contains(exclude, name))
873 0 : return 0;
874 :
875 0 : if (name[0] == '@') {
876 : const SyscallFilterSet *other;
877 :
878 0 : other = syscall_filter_set_find(name);
879 0 : if (!other)
880 0 : return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
881 : "Filter set %s is not known!",
882 : name);
883 :
884 0 : return seccomp_add_syscall_filter_set(seccomp, other, action, exclude, log_missing);
885 :
886 : } else {
887 : int id, r;
888 :
889 0 : id = seccomp_syscall_resolve_name(name);
890 0 : if (id == __NR_SCMP_ERROR) {
891 0 : if (log_missing)
892 0 : log_debug("System call %s is not known, ignoring.", name);
893 0 : return 0;
894 : }
895 :
896 0 : r = seccomp_rule_add_exact(seccomp, action, id, 0);
897 0 : if (r < 0) {
898 : /* If the system call is not known on this architecture, then that's fine, let's ignore it */
899 0 : bool ignore = r == -EDOM;
900 :
901 0 : if (!ignore || log_missing)
902 0 : log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m",
903 : name, id, ignore ? ", ignoring" : "");
904 0 : if (!ignore)
905 0 : return r;
906 : }
907 :
908 0 : return 0;
909 : }
910 : }
911 :
912 0 : static int seccomp_add_syscall_filter_set(
913 : scmp_filter_ctx seccomp,
914 : const SyscallFilterSet *set,
915 : uint32_t action,
916 : char **exclude,
917 : bool log_missing) {
918 :
919 : const char *sys;
920 : int r;
921 :
922 0 : assert(seccomp);
923 0 : assert(set);
924 :
925 0 : NULSTR_FOREACH(sys, set->value) {
926 0 : r = seccomp_add_syscall_filter_item(seccomp, sys, action, exclude, log_missing);
927 0 : if (r < 0)
928 0 : return r;
929 : }
930 :
931 0 : return 0;
932 : }
933 :
934 0 : int seccomp_load_syscall_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action, bool log_missing) {
935 : uint32_t arch;
936 : int r;
937 :
938 0 : assert(set);
939 :
940 : /* The one-stop solution: allocate a seccomp object, add the specified filter to it, and apply it. Once for
941 : * each local arch. */
942 :
943 0 : SECCOMP_FOREACH_LOCAL_ARCH(arch) {
944 0 : _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
945 :
946 0 : log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
947 :
948 0 : r = seccomp_init_for_arch(&seccomp, arch, default_action);
949 0 : if (r < 0)
950 0 : return r;
951 :
952 0 : r = seccomp_add_syscall_filter_set(seccomp, set, action, NULL, log_missing);
953 0 : if (r < 0)
954 0 : return log_debug_errno(r, "Failed to add filter set: %m");
955 :
956 0 : r = seccomp_load(seccomp);
957 0 : if (ERRNO_IS_SECCOMP_FATAL(r))
958 0 : return r;
959 0 : if (r < 0)
960 0 : log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
961 : }
962 :
963 0 : return 0;
964 : }
965 :
966 0 : int seccomp_load_syscall_filter_set_raw(uint32_t default_action, Hashmap* set, uint32_t action, bool log_missing) {
967 : uint32_t arch;
968 : int r;
969 :
970 : /* Similar to seccomp_load_syscall_filter_set(), but takes a raw Set* of syscalls, instead of a
971 : * SyscallFilterSet* table. */
972 :
973 0 : if (hashmap_isempty(set) && default_action == SCMP_ACT_ALLOW)
974 0 : return 0;
975 :
976 0 : SECCOMP_FOREACH_LOCAL_ARCH(arch) {
977 0 : _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
978 : Iterator i;
979 : void *syscall_id, *val;
980 :
981 0 : log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
982 :
983 0 : r = seccomp_init_for_arch(&seccomp, arch, default_action);
984 0 : if (r < 0)
985 0 : return r;
986 :
987 0 : HASHMAP_FOREACH_KEY(val, syscall_id, set, i) {
988 0 : uint32_t a = action;
989 0 : int id = PTR_TO_INT(syscall_id) - 1;
990 0 : int error = PTR_TO_INT(val);
991 :
992 0 : if (action != SCMP_ACT_ALLOW && error >= 0)
993 0 : a = SCMP_ACT_ERRNO(error);
994 :
995 0 : r = seccomp_rule_add_exact(seccomp, a, id, 0);
996 0 : if (r < 0) {
997 : /* If the system call is not known on this architecture, then that's fine, let's ignore it */
998 0 : _cleanup_free_ char *n = NULL;
999 : bool ignore;
1000 :
1001 0 : n = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, id);
1002 0 : ignore = r == -EDOM;
1003 0 : if (!ignore || log_missing)
1004 0 : log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m",
1005 : strna(n), id, ignore ? ", ignoring" : "");
1006 0 : if (!ignore)
1007 0 : return r;
1008 : }
1009 : }
1010 :
1011 0 : r = seccomp_load(seccomp);
1012 0 : if (ERRNO_IS_SECCOMP_FATAL(r))
1013 0 : return r;
1014 0 : if (r < 0)
1015 0 : log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1016 : }
1017 :
1018 0 : return 0;
1019 : }
1020 :
1021 0 : int seccomp_parse_syscall_filter(
1022 : const char *name,
1023 : int errno_num,
1024 : Hashmap *filter,
1025 : SeccompParseFlags flags,
1026 : const char *unit,
1027 : const char *filename,
1028 : unsigned line) {
1029 :
1030 : int r;
1031 :
1032 0 : assert(name);
1033 0 : assert(filter);
1034 :
1035 0 : if (name[0] == '@') {
1036 : const SyscallFilterSet *set;
1037 : const char *i;
1038 :
1039 0 : set = syscall_filter_set_find(name);
1040 0 : if (!set) {
1041 0 : if (!(flags & SECCOMP_PARSE_PERMISSIVE))
1042 0 : return -EINVAL;
1043 :
1044 0 : log_syntax(unit, flags & SECCOMP_PARSE_LOG ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
1045 : "Unknown system call group, ignoring: %s", name);
1046 0 : return 0;
1047 : }
1048 :
1049 0 : NULSTR_FOREACH(i, set->value) {
1050 : /* Call ourselves again, for the group to parse. Note that we downgrade logging here (i.e. take
1051 : * away the SECCOMP_PARSE_LOG flag) since any issues in the group table are our own problem,
1052 : * not a problem in user configuration data and we shouldn't pretend otherwise by complaining
1053 : * about them. */
1054 0 : r = seccomp_parse_syscall_filter(i, errno_num, filter, flags &~ SECCOMP_PARSE_LOG, unit, filename, line);
1055 0 : if (r < 0)
1056 0 : return r;
1057 : }
1058 : } else {
1059 : int id;
1060 :
1061 0 : id = seccomp_syscall_resolve_name(name);
1062 0 : if (id == __NR_SCMP_ERROR) {
1063 0 : if (!(flags & SECCOMP_PARSE_PERMISSIVE))
1064 0 : return -EINVAL;
1065 :
1066 0 : log_syntax(unit, flags & SECCOMP_PARSE_LOG ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
1067 : "Failed to parse system call, ignoring: %s", name);
1068 0 : return 0;
1069 : }
1070 :
1071 : /* If we previously wanted to forbid a syscall and now
1072 : * we want to allow it, then remove it from the list. */
1073 0 : if (!(flags & SECCOMP_PARSE_INVERT) == !!(flags & SECCOMP_PARSE_WHITELIST)) {
1074 0 : r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num));
1075 0 : if (r < 0)
1076 0 : switch (r) {
1077 0 : case -ENOMEM:
1078 0 : return flags & SECCOMP_PARSE_LOG ? log_oom() : -ENOMEM;
1079 0 : case -EEXIST:
1080 0 : assert_se(hashmap_update(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num)) == 0);
1081 0 : break;
1082 0 : default:
1083 0 : return r;
1084 : }
1085 0 : } else
1086 0 : (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
1087 : }
1088 :
1089 0 : return 0;
1090 : }
1091 :
1092 0 : int seccomp_restrict_namespaces(unsigned long retain) {
1093 : uint32_t arch;
1094 : int r;
1095 :
1096 0 : if (DEBUG_LOGGING) {
1097 0 : _cleanup_free_ char *s = NULL;
1098 :
1099 0 : (void) namespace_flags_to_string(retain, &s);
1100 0 : log_debug("Restricting namespace to: %s.", strna(s));
1101 : }
1102 :
1103 : /* NOOP? */
1104 0 : if ((retain & NAMESPACE_FLAGS_ALL) == NAMESPACE_FLAGS_ALL)
1105 0 : return 0;
1106 :
1107 0 : SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1108 0 : _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1109 : unsigned i;
1110 :
1111 0 : log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1112 :
1113 0 : r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1114 0 : if (r < 0)
1115 0 : return r;
1116 :
1117 0 : if ((retain & NAMESPACE_FLAGS_ALL) == 0)
1118 : /* If every single kind of namespace shall be prohibited, then let's block the whole setns() syscall
1119 : * altogether. */
1120 0 : r = seccomp_rule_add_exact(
1121 : seccomp,
1122 : SCMP_ACT_ERRNO(EPERM),
1123 : SCMP_SYS(setns),
1124 : 0);
1125 : else
1126 : /* Otherwise, block only the invocations with the appropriate flags in the loop below, but also the
1127 : * special invocation with a zero flags argument, right here. */
1128 0 : r = seccomp_rule_add_exact(
1129 : seccomp,
1130 : SCMP_ACT_ERRNO(EPERM),
1131 : SCMP_SYS(setns),
1132 : 1,
1133 0 : SCMP_A1(SCMP_CMP_EQ, 0));
1134 0 : if (r < 0) {
1135 0 : log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1136 0 : continue;
1137 : }
1138 :
1139 0 : for (i = 0; namespace_flag_map[i].name; i++) {
1140 : unsigned long f;
1141 :
1142 0 : f = namespace_flag_map[i].flag;
1143 0 : if ((retain & f) == f) {
1144 0 : log_debug("Permitting %s.", namespace_flag_map[i].name);
1145 0 : continue;
1146 : }
1147 :
1148 0 : log_debug("Blocking %s.", namespace_flag_map[i].name);
1149 :
1150 0 : r = seccomp_rule_add_exact(
1151 : seccomp,
1152 : SCMP_ACT_ERRNO(EPERM),
1153 : SCMP_SYS(unshare),
1154 : 1,
1155 0 : SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1156 0 : if (r < 0) {
1157 0 : log_debug_errno(r, "Failed to add unshare() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1158 0 : break;
1159 : }
1160 :
1161 : /* On s390/s390x the first two parameters to clone are switched */
1162 0 : if (!IN_SET(arch, SCMP_ARCH_S390, SCMP_ARCH_S390X))
1163 0 : r = seccomp_rule_add_exact(
1164 : seccomp,
1165 : SCMP_ACT_ERRNO(EPERM),
1166 : SCMP_SYS(clone),
1167 : 1,
1168 0 : SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1169 : else
1170 0 : r = seccomp_rule_add_exact(
1171 : seccomp,
1172 : SCMP_ACT_ERRNO(EPERM),
1173 : SCMP_SYS(clone),
1174 : 1,
1175 0 : SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
1176 0 : if (r < 0) {
1177 0 : log_debug_errno(r, "Failed to add clone() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1178 0 : break;
1179 : }
1180 :
1181 0 : if ((retain & NAMESPACE_FLAGS_ALL) != 0) {
1182 0 : r = seccomp_rule_add_exact(
1183 : seccomp,
1184 : SCMP_ACT_ERRNO(EPERM),
1185 : SCMP_SYS(setns),
1186 : 1,
1187 0 : SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
1188 0 : if (r < 0) {
1189 0 : log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1190 0 : break;
1191 : }
1192 : }
1193 : }
1194 0 : if (r < 0)
1195 0 : continue;
1196 :
1197 0 : r = seccomp_load(seccomp);
1198 0 : if (ERRNO_IS_SECCOMP_FATAL(r))
1199 0 : return r;
1200 0 : if (r < 0)
1201 0 : log_debug_errno(r, "Failed to install namespace restriction rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1202 : }
1203 :
1204 0 : return 0;
1205 : }
1206 :
1207 0 : int seccomp_protect_sysctl(void) {
1208 : uint32_t arch;
1209 : int r;
1210 :
1211 0 : SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1212 0 : _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1213 :
1214 0 : log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1215 :
1216 0 : if (IN_SET(arch, SCMP_ARCH_X32, SCMP_ARCH_AARCH64))
1217 : /* No _sysctl syscall */
1218 0 : continue;
1219 :
1220 0 : r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1221 0 : if (r < 0)
1222 0 : return r;
1223 :
1224 0 : r = seccomp_rule_add_exact(
1225 : seccomp,
1226 : SCMP_ACT_ERRNO(EPERM),
1227 : SCMP_SYS(_sysctl),
1228 : 0);
1229 0 : if (r < 0) {
1230 0 : log_debug_errno(r, "Failed to add _sysctl() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1231 0 : continue;
1232 : }
1233 :
1234 0 : r = seccomp_load(seccomp);
1235 0 : if (ERRNO_IS_SECCOMP_FATAL(r))
1236 0 : return r;
1237 0 : if (r < 0)
1238 0 : log_debug_errno(r, "Failed to install sysctl protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1239 : }
1240 :
1241 0 : return 0;
1242 : }
1243 :
1244 0 : int seccomp_restrict_address_families(Set *address_families, bool whitelist) {
1245 : uint32_t arch;
1246 : int r;
1247 :
1248 0 : SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1249 0 : _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1250 : bool supported;
1251 : Iterator i;
1252 :
1253 0 : log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1254 :
1255 0 : switch (arch) {
1256 :
1257 0 : case SCMP_ARCH_X86_64:
1258 : case SCMP_ARCH_X32:
1259 : case SCMP_ARCH_ARM:
1260 : case SCMP_ARCH_AARCH64:
1261 : case SCMP_ARCH_PPC:
1262 : case SCMP_ARCH_PPC64:
1263 : case SCMP_ARCH_PPC64LE:
1264 : case SCMP_ARCH_MIPSEL64N32:
1265 : case SCMP_ARCH_MIPS64N32:
1266 : case SCMP_ARCH_MIPSEL64:
1267 : case SCMP_ARCH_MIPS64:
1268 : /* These we know we support (i.e. are the ones that do not use socketcall()) */
1269 0 : supported = true;
1270 0 : break;
1271 :
1272 0 : case SCMP_ARCH_S390:
1273 : case SCMP_ARCH_S390X:
1274 : case SCMP_ARCH_X86:
1275 : case SCMP_ARCH_MIPSEL:
1276 : case SCMP_ARCH_MIPS:
1277 : default:
1278 : /* These we either know we don't support (i.e. are the ones that do use socketcall()), or we
1279 : * don't know */
1280 0 : supported = false;
1281 0 : break;
1282 : }
1283 :
1284 0 : if (!supported)
1285 0 : continue;
1286 :
1287 0 : r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1288 0 : if (r < 0)
1289 0 : return r;
1290 :
1291 0 : if (whitelist) {
1292 0 : int af, first = 0, last = 0;
1293 : void *afp;
1294 :
1295 : /* If this is a whitelist, we first block the address families that are out of range and then
1296 : * everything that is not in the set. First, we find the lowest and highest address family in
1297 : * the set. */
1298 :
1299 0 : SET_FOREACH(afp, address_families, i) {
1300 0 : af = PTR_TO_INT(afp);
1301 :
1302 0 : if (af <= 0 || af >= af_max())
1303 0 : continue;
1304 :
1305 0 : if (first == 0 || af < first)
1306 0 : first = af;
1307 :
1308 0 : if (last == 0 || af > last)
1309 0 : last = af;
1310 : }
1311 :
1312 0 : assert((first == 0) == (last == 0));
1313 :
1314 0 : if (first == 0) {
1315 :
1316 : /* No entries in the valid range, block everything */
1317 0 : r = seccomp_rule_add_exact(
1318 : seccomp,
1319 : SCMP_ACT_ERRNO(EAFNOSUPPORT),
1320 : SCMP_SYS(socket),
1321 : 0);
1322 0 : if (r < 0) {
1323 0 : log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1324 0 : continue;
1325 : }
1326 :
1327 : } else {
1328 :
1329 : /* Block everything below the first entry */
1330 0 : r = seccomp_rule_add_exact(
1331 : seccomp,
1332 : SCMP_ACT_ERRNO(EAFNOSUPPORT),
1333 : SCMP_SYS(socket),
1334 : 1,
1335 0 : SCMP_A0(SCMP_CMP_LT, first));
1336 0 : if (r < 0) {
1337 0 : log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1338 0 : continue;
1339 : }
1340 :
1341 : /* Block everything above the last entry */
1342 0 : r = seccomp_rule_add_exact(
1343 : seccomp,
1344 : SCMP_ACT_ERRNO(EAFNOSUPPORT),
1345 : SCMP_SYS(socket),
1346 : 1,
1347 0 : SCMP_A0(SCMP_CMP_GT, last));
1348 0 : if (r < 0) {
1349 0 : log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1350 0 : continue;
1351 : }
1352 :
1353 : /* Block everything between the first and last entry */
1354 0 : for (af = 1; af < af_max(); af++) {
1355 :
1356 0 : if (set_contains(address_families, INT_TO_PTR(af)))
1357 0 : continue;
1358 :
1359 0 : r = seccomp_rule_add_exact(
1360 : seccomp,
1361 : SCMP_ACT_ERRNO(EAFNOSUPPORT),
1362 : SCMP_SYS(socket),
1363 : 1,
1364 0 : SCMP_A0(SCMP_CMP_EQ, af));
1365 0 : if (r < 0)
1366 0 : break;
1367 : }
1368 0 : if (r < 0) {
1369 0 : log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1370 0 : continue;
1371 : }
1372 : }
1373 :
1374 : } else {
1375 : void *af;
1376 :
1377 : /* If this is a blacklist, then generate one rule for
1378 : * each address family that are then combined in OR
1379 : * checks. */
1380 :
1381 0 : SET_FOREACH(af, address_families, i) {
1382 :
1383 0 : r = seccomp_rule_add_exact(
1384 : seccomp,
1385 : SCMP_ACT_ERRNO(EAFNOSUPPORT),
1386 : SCMP_SYS(socket),
1387 : 1,
1388 0 : SCMP_A0(SCMP_CMP_EQ, PTR_TO_INT(af)));
1389 0 : if (r < 0)
1390 0 : break;
1391 : }
1392 0 : if (r < 0) {
1393 0 : log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1394 0 : continue;
1395 : }
1396 : }
1397 :
1398 0 : r = seccomp_load(seccomp);
1399 0 : if (ERRNO_IS_SECCOMP_FATAL(r))
1400 0 : return r;
1401 0 : if (r < 0)
1402 0 : log_debug_errno(r, "Failed to install socket family rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1403 : }
1404 :
1405 0 : return 0;
1406 : }
1407 :
1408 0 : int seccomp_restrict_realtime(void) {
1409 : static const int permitted_policies[] = {
1410 : SCHED_OTHER,
1411 : SCHED_BATCH,
1412 : SCHED_IDLE,
1413 : };
1414 :
1415 0 : int r, max_policy = 0;
1416 : uint32_t arch;
1417 : unsigned i;
1418 :
1419 : /* Determine the highest policy constant we want to allow */
1420 0 : for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1421 0 : if (permitted_policies[i] > max_policy)
1422 0 : max_policy = permitted_policies[i];
1423 :
1424 0 : SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1425 0 : _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1426 : int p;
1427 :
1428 0 : log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1429 :
1430 0 : r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1431 0 : if (r < 0)
1432 0 : return r;
1433 :
1434 : /* Go through all policies with lower values than that, and block them -- unless they appear in the
1435 : * whitelist. */
1436 0 : for (p = 0; p < max_policy; p++) {
1437 0 : bool good = false;
1438 :
1439 : /* Check if this is in the whitelist. */
1440 0 : for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1441 0 : if (permitted_policies[i] == p) {
1442 0 : good = true;
1443 0 : break;
1444 : }
1445 :
1446 0 : if (good)
1447 0 : continue;
1448 :
1449 : /* Deny this policy */
1450 0 : r = seccomp_rule_add_exact(
1451 : seccomp,
1452 : SCMP_ACT_ERRNO(EPERM),
1453 : SCMP_SYS(sched_setscheduler),
1454 : 1,
1455 0 : SCMP_A1(SCMP_CMP_EQ, p));
1456 0 : if (r < 0) {
1457 0 : log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1458 0 : continue;
1459 : }
1460 : }
1461 :
1462 : /* Blacklist all other policies, i.e. the ones with higher values. Note that all comparisons are
1463 : * unsigned here, hence no need no check for < 0 values. */
1464 0 : r = seccomp_rule_add_exact(
1465 : seccomp,
1466 : SCMP_ACT_ERRNO(EPERM),
1467 : SCMP_SYS(sched_setscheduler),
1468 : 1,
1469 0 : SCMP_A1(SCMP_CMP_GT, max_policy));
1470 0 : if (r < 0) {
1471 0 : log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1472 0 : continue;
1473 : }
1474 :
1475 0 : r = seccomp_load(seccomp);
1476 0 : if (ERRNO_IS_SECCOMP_FATAL(r))
1477 0 : return r;
1478 0 : if (r < 0)
1479 0 : log_debug_errno(r, "Failed to install realtime protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1480 : }
1481 :
1482 0 : return 0;
1483 : }
1484 :
1485 0 : static int add_seccomp_syscall_filter(scmp_filter_ctx seccomp,
1486 : uint32_t arch,
1487 : int nr,
1488 : unsigned arg_cnt,
1489 : const struct scmp_arg_cmp arg) {
1490 : int r;
1491 :
1492 0 : r = seccomp_rule_add_exact(seccomp, SCMP_ACT_ERRNO(EPERM), nr, arg_cnt, arg);
1493 0 : if (r < 0) {
1494 0 : _cleanup_free_ char *n = NULL;
1495 :
1496 0 : n = seccomp_syscall_resolve_num_arch(arch, nr);
1497 0 : log_debug_errno(r, "Failed to add %s() rule for architecture %s, skipping: %m",
1498 : strna(n),
1499 : seccomp_arch_to_string(arch));
1500 : }
1501 :
1502 0 : return r;
1503 : }
1504 :
1505 : /* For known architectures, check that syscalls are indeed defined or not. */
1506 : #if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__)
1507 : assert_cc(SCMP_SYS(shmget) > 0);
1508 : assert_cc(SCMP_SYS(shmat) > 0);
1509 : assert_cc(SCMP_SYS(shmdt) > 0);
1510 : #endif
1511 :
1512 0 : int seccomp_memory_deny_write_execute(void) {
1513 : uint32_t arch;
1514 : int r;
1515 :
1516 0 : SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1517 0 : _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1518 0 : int filter_syscall = 0, block_syscall = 0, shmat_syscall = 0;
1519 :
1520 0 : log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1521 :
1522 0 : switch (arch) {
1523 :
1524 0 : case SCMP_ARCH_X86:
1525 : case SCMP_ARCH_S390:
1526 0 : filter_syscall = SCMP_SYS(mmap2);
1527 0 : block_syscall = SCMP_SYS(mmap);
1528 0 : shmat_syscall = SCMP_SYS(shmat);
1529 0 : break;
1530 :
1531 0 : case SCMP_ARCH_PPC:
1532 : case SCMP_ARCH_PPC64:
1533 : case SCMP_ARCH_PPC64LE:
1534 0 : filter_syscall = SCMP_SYS(mmap);
1535 :
1536 : /* Note that shmat() isn't available, and the call is multiplexed through ipc().
1537 : * We ignore that here, which means there's still a way to get writable/executable
1538 : * memory, if an IPC key is mapped like this. That's a pity, but no total loss. */
1539 :
1540 0 : break;
1541 :
1542 0 : case SCMP_ARCH_ARM:
1543 0 : filter_syscall = SCMP_SYS(mmap2); /* arm has only mmap2 */
1544 0 : shmat_syscall = SCMP_SYS(shmat);
1545 0 : break;
1546 :
1547 0 : case SCMP_ARCH_X86_64:
1548 : case SCMP_ARCH_X32:
1549 : case SCMP_ARCH_AARCH64:
1550 : case SCMP_ARCH_S390X:
1551 0 : filter_syscall = SCMP_SYS(mmap); /* amd64, x32, s390x, and arm64 have only mmap */
1552 0 : shmat_syscall = SCMP_SYS(shmat);
1553 0 : break;
1554 :
1555 : /* Please add more definitions here, if you port systemd to other architectures! */
1556 :
1557 : #if !defined(__i386__) && !defined(__x86_64__) && !defined(__powerpc__) && !defined(__powerpc64__) && !defined(__arm__) && !defined(__aarch64__) && !defined(__s390__) && !defined(__s390x__)
1558 : #warning "Consider adding the right mmap() syscall definitions here!"
1559 : #endif
1560 : }
1561 :
1562 : /* Can't filter mmap() on this arch, then skip it */
1563 0 : if (filter_syscall == 0)
1564 0 : continue;
1565 :
1566 0 : r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1567 0 : if (r < 0)
1568 0 : return r;
1569 :
1570 0 : r = add_seccomp_syscall_filter(seccomp, arch, filter_syscall,
1571 : 1,
1572 0 : SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC|PROT_WRITE, PROT_EXEC|PROT_WRITE));
1573 0 : if (r < 0)
1574 0 : continue;
1575 :
1576 0 : if (block_syscall != 0) {
1577 0 : r = add_seccomp_syscall_filter(seccomp, arch, block_syscall, 0, (const struct scmp_arg_cmp){} );
1578 0 : if (r < 0)
1579 0 : continue;
1580 : }
1581 :
1582 0 : r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(mprotect),
1583 : 1,
1584 0 : SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1585 0 : if (r < 0)
1586 0 : continue;
1587 :
1588 : #ifdef __NR_pkey_mprotect
1589 0 : r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(pkey_mprotect),
1590 : 1,
1591 0 : SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1592 0 : if (r < 0)
1593 0 : continue;
1594 : #endif
1595 :
1596 0 : if (shmat_syscall > 0) {
1597 0 : r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(shmat),
1598 : 1,
1599 0 : SCMP_A2(SCMP_CMP_MASKED_EQ, SHM_EXEC, SHM_EXEC));
1600 0 : if (r < 0)
1601 0 : continue;
1602 : }
1603 :
1604 0 : r = seccomp_load(seccomp);
1605 0 : if (ERRNO_IS_SECCOMP_FATAL(r))
1606 0 : return r;
1607 0 : if (r < 0)
1608 0 : log_debug_errno(r, "Failed to install MemoryDenyWriteExecute= rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1609 : }
1610 :
1611 0 : return 0;
1612 : }
1613 :
1614 0 : int seccomp_restrict_archs(Set *archs) {
1615 0 : _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1616 : Iterator i;
1617 : void *id;
1618 : int r;
1619 :
1620 : /* This installs a filter with no rules, but that restricts the system call architectures to the specified
1621 : * list.
1622 : *
1623 : * There are some qualifications. However the most important use is to stop processes from bypassing
1624 : * system call restrictions, in case they used a broader (multiplexing) syscall which is only available
1625 : * in a non-native architecture. There are no holes in this use case, at least so far. */
1626 :
1627 : /* Note libseccomp includes our "native" (current) architecture in the filter by default.
1628 : * We do not remove it. For example, our callers expect to be able to call execve() afterwards
1629 : * to run a program with the restrictions applied. */
1630 0 : seccomp = seccomp_init(SCMP_ACT_ALLOW);
1631 0 : if (!seccomp)
1632 0 : return -ENOMEM;
1633 :
1634 0 : SET_FOREACH(id, archs, i) {
1635 0 : r = seccomp_arch_add(seccomp, PTR_TO_UINT32(id) - 1);
1636 0 : if (r < 0 && r != -EEXIST)
1637 0 : return r;
1638 : }
1639 :
1640 : /* The vdso for x32 assumes that x86-64 syscalls are available. Let's allow them, since x32
1641 : * x32 syscalls should basically match x86-64 for everything except the pointer type.
1642 : * The important thing is that you can block the old 32-bit x86 syscalls.
1643 : * https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=850047 */
1644 :
1645 0 : if (seccomp_arch_native() == SCMP_ARCH_X32 ||
1646 0 : set_contains(archs, UINT32_TO_PTR(SCMP_ARCH_X32 + 1))) {
1647 :
1648 0 : r = seccomp_arch_add(seccomp, SCMP_ARCH_X86_64);
1649 0 : if (r < 0 && r != -EEXIST)
1650 0 : return r;
1651 : }
1652 :
1653 0 : r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
1654 0 : if (r < 0)
1655 0 : return r;
1656 :
1657 0 : r = seccomp_load(seccomp);
1658 0 : if (ERRNO_IS_SECCOMP_FATAL(r))
1659 0 : return r;
1660 0 : if (r < 0)
1661 0 : log_debug_errno(r, "Failed to restrict system call architectures, skipping: %m");
1662 :
1663 0 : return 0;
1664 : }
1665 :
1666 0 : int parse_syscall_archs(char **l, Set **archs) {
1667 0 : _cleanup_set_free_ Set *_archs;
1668 : char **s;
1669 : int r;
1670 :
1671 0 : assert(l);
1672 0 : assert(archs);
1673 :
1674 0 : r = set_ensure_allocated(&_archs, NULL);
1675 0 : if (r < 0)
1676 0 : return r;
1677 :
1678 0 : STRV_FOREACH(s, l) {
1679 : uint32_t a;
1680 :
1681 0 : r = seccomp_arch_from_string(*s, &a);
1682 0 : if (r < 0)
1683 0 : return -EINVAL;
1684 :
1685 0 : r = set_put(_archs, UINT32_TO_PTR(a + 1));
1686 0 : if (r < 0)
1687 0 : return -ENOMEM;
1688 : }
1689 :
1690 0 : *archs = TAKE_PTR(_archs);
1691 :
1692 0 : return 0;
1693 : }
1694 :
1695 0 : int seccomp_filter_set_add(Hashmap *filter, bool add, const SyscallFilterSet *set) {
1696 : const char *i;
1697 : int r;
1698 :
1699 0 : assert(set);
1700 :
1701 0 : NULSTR_FOREACH(i, set->value) {
1702 :
1703 0 : if (i[0] == '@') {
1704 : const SyscallFilterSet *more;
1705 :
1706 0 : more = syscall_filter_set_find(i);
1707 0 : if (!more)
1708 0 : return -ENXIO;
1709 :
1710 0 : r = seccomp_filter_set_add(filter, add, more);
1711 0 : if (r < 0)
1712 0 : return r;
1713 : } else {
1714 : int id;
1715 :
1716 0 : id = seccomp_syscall_resolve_name(i);
1717 0 : if (id == __NR_SCMP_ERROR) {
1718 0 : log_debug("Couldn't resolve system call, ignoring: %s", i);
1719 0 : continue;
1720 : }
1721 :
1722 0 : if (add) {
1723 0 : r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(-1));
1724 0 : if (r < 0)
1725 0 : return r;
1726 : } else
1727 0 : (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
1728 : }
1729 : }
1730 :
1731 0 : return 0;
1732 : }
1733 :
1734 0 : int seccomp_lock_personality(unsigned long personality) {
1735 : uint32_t arch;
1736 : int r;
1737 :
1738 0 : if (personality >= PERSONALITY_INVALID)
1739 0 : return -EINVAL;
1740 :
1741 0 : SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1742 0 : _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1743 :
1744 0 : r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1745 0 : if (r < 0)
1746 0 : return r;
1747 :
1748 0 : r = seccomp_rule_add_exact(
1749 : seccomp,
1750 : SCMP_ACT_ERRNO(EPERM),
1751 : SCMP_SYS(personality),
1752 : 1,
1753 0 : SCMP_A0(SCMP_CMP_NE, personality));
1754 0 : if (r < 0) {
1755 0 : log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1756 0 : continue;
1757 : }
1758 :
1759 0 : r = seccomp_load(seccomp);
1760 0 : if (ERRNO_IS_SECCOMP_FATAL(r))
1761 0 : return r;
1762 0 : if (r < 0)
1763 0 : log_debug_errno(r, "Failed to enable personality lock for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1764 : }
1765 :
1766 0 : return 0;
1767 : }
1768 :
1769 0 : int seccomp_protect_hostname(void) {
1770 : uint32_t arch;
1771 : int r;
1772 :
1773 0 : SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1774 0 : _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1775 :
1776 0 : r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1777 0 : if (r < 0)
1778 0 : return r;
1779 :
1780 0 : r = seccomp_rule_add_exact(
1781 : seccomp,
1782 : SCMP_ACT_ERRNO(EPERM),
1783 : SCMP_SYS(sethostname),
1784 : 0);
1785 0 : if (r < 0) {
1786 0 : log_debug_errno(r, "Failed to add sethostname() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1787 0 : continue;
1788 : }
1789 :
1790 0 : r = seccomp_rule_add_exact(
1791 : seccomp,
1792 : SCMP_ACT_ERRNO(EPERM),
1793 : SCMP_SYS(setdomainname),
1794 : 0);
1795 0 : if (r < 0) {
1796 0 : log_debug_errno(r, "Failed to add setdomainname() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1797 0 : continue;
1798 : }
1799 :
1800 0 : r = seccomp_load(seccomp);
1801 0 : if (ERRNO_IS_SECCOMP_FATAL(r))
1802 0 : return r;
1803 0 : if (r < 0)
1804 0 : log_debug_errno(r, "Failed to apply hostname restrictions for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1805 : }
1806 :
1807 0 : return 0;
1808 : }
1809 :
1810 0 : static int seccomp_restrict_sxid(scmp_filter_ctx seccomp, mode_t m) {
1811 : /* Checks the mode_t parameter of the following system calls:
1812 : *
1813 : * → chmod() + fchmod() + fchmodat()
1814 : * → open() + creat() + openat()
1815 : * → mkdir() + mkdirat()
1816 : * → mknod() + mknodat()
1817 : *
1818 : * Returns error if *everything* failed, and 0 otherwise.
1819 : */
1820 0 : int r = 0;
1821 0 : bool any = false;
1822 :
1823 0 : r = seccomp_rule_add_exact(
1824 : seccomp,
1825 : SCMP_ACT_ERRNO(EPERM),
1826 : SCMP_SYS(chmod),
1827 : 1,
1828 0 : SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
1829 0 : if (r < 0)
1830 0 : log_debug_errno(r, "Failed to add filter for chmod: %m");
1831 : else
1832 0 : any = true;
1833 :
1834 0 : r = seccomp_rule_add_exact(
1835 : seccomp,
1836 : SCMP_ACT_ERRNO(EPERM),
1837 : SCMP_SYS(fchmod),
1838 : 1,
1839 0 : SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
1840 0 : if (r < 0)
1841 0 : log_debug_errno(r, "Failed to add filter for fchmod: %m");
1842 : else
1843 0 : any = true;
1844 :
1845 0 : r = seccomp_rule_add_exact(
1846 : seccomp,
1847 : SCMP_ACT_ERRNO(EPERM),
1848 : SCMP_SYS(fchmodat),
1849 : 1,
1850 0 : SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
1851 0 : if (r < 0)
1852 0 : log_debug_errno(r, "Failed to add filter for fchmodat: %m");
1853 : else
1854 0 : any = true;
1855 :
1856 0 : r = seccomp_rule_add_exact(
1857 : seccomp,
1858 : SCMP_ACT_ERRNO(EPERM),
1859 : SCMP_SYS(mkdir),
1860 : 1,
1861 0 : SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
1862 0 : if (r < 0)
1863 0 : log_debug_errno(r, "Failed to add filter for mkdir: %m");
1864 : else
1865 0 : any = true;
1866 :
1867 0 : r = seccomp_rule_add_exact(
1868 : seccomp,
1869 : SCMP_ACT_ERRNO(EPERM),
1870 : SCMP_SYS(mkdirat),
1871 : 1,
1872 0 : SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
1873 0 : if (r < 0)
1874 0 : log_debug_errno(r, "Failed to add filter for mkdirat: %m");
1875 : else
1876 0 : any = true;
1877 :
1878 0 : r = seccomp_rule_add_exact(
1879 : seccomp,
1880 : SCMP_ACT_ERRNO(EPERM),
1881 : SCMP_SYS(mknod),
1882 : 1,
1883 0 : SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
1884 0 : if (r < 0)
1885 0 : log_debug_errno(r, "Failed to add filter for mknod: %m");
1886 : else
1887 0 : any = true;
1888 :
1889 0 : r = seccomp_rule_add_exact(
1890 : seccomp,
1891 : SCMP_ACT_ERRNO(EPERM),
1892 : SCMP_SYS(mknodat),
1893 : 1,
1894 0 : SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
1895 0 : if (r < 0)
1896 0 : log_debug_errno(r, "Failed to add filter for mknodat: %m");
1897 : else
1898 0 : any = true;
1899 :
1900 : #if SCMP_SYS(open) > 0
1901 0 : r = seccomp_rule_add_exact(
1902 : seccomp,
1903 : SCMP_ACT_ERRNO(EPERM),
1904 : SCMP_SYS(open),
1905 : 2,
1906 0 : SCMP_A1(SCMP_CMP_MASKED_EQ, O_CREAT, O_CREAT),
1907 0 : SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
1908 0 : if (r < 0)
1909 0 : log_debug_errno(r, "Failed to add filter for open: %m");
1910 : else
1911 0 : any = true;
1912 : #endif
1913 :
1914 0 : r = seccomp_rule_add_exact(
1915 : seccomp,
1916 : SCMP_ACT_ERRNO(EPERM),
1917 : SCMP_SYS(openat),
1918 : 2,
1919 0 : SCMP_A2(SCMP_CMP_MASKED_EQ, O_CREAT, O_CREAT),
1920 0 : SCMP_A3(SCMP_CMP_MASKED_EQ, m, m));
1921 0 : if (r < 0)
1922 0 : log_debug_errno(r, "Failed to add filter for openat: %m");
1923 : else
1924 0 : any = true;
1925 :
1926 0 : r = seccomp_rule_add_exact(
1927 : seccomp,
1928 : SCMP_ACT_ERRNO(EPERM),
1929 : SCMP_SYS(creat),
1930 : 1,
1931 0 : SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
1932 0 : if (r < 0)
1933 0 : log_debug_errno(r, "Failed to add filter for creat: %m");
1934 : else
1935 0 : any = true;
1936 :
1937 0 : return any ? 0 : r;
1938 : }
1939 :
1940 0 : int seccomp_restrict_suid_sgid(void) {
1941 : uint32_t arch;
1942 : int r, k;
1943 :
1944 0 : SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1945 0 : _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1946 :
1947 0 : r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1948 0 : if (r < 0)
1949 0 : return r;
1950 :
1951 0 : r = seccomp_restrict_sxid(seccomp, S_ISUID);
1952 0 : if (r < 0)
1953 0 : log_debug_errno(r, "Failed to add suid rule for architecture %s, ignoring: %m", seccomp_arch_to_string(arch));
1954 :
1955 0 : k = seccomp_restrict_sxid(seccomp, S_ISGID);
1956 0 : if (k < 0)
1957 0 : log_debug_errno(r, "Failed to add sgid rule for architecture %s, ignoring: %m", seccomp_arch_to_string(arch));
1958 :
1959 0 : if (r < 0 && k < 0)
1960 0 : continue;
1961 :
1962 0 : r = seccomp_load(seccomp);
1963 0 : if (ERRNO_IS_SECCOMP_FATAL(r))
1964 0 : return r;
1965 0 : if (r < 0)
1966 0 : log_debug_errno(r, "Failed to apply suid/sgid restrictions for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1967 : }
1968 :
1969 0 : return 0;
1970 : }
1971 :
1972 0 : uint32_t scmp_act_kill_process(void) {
1973 :
1974 : /* Returns SCMP_ACT_KILL_PROCESS if it's supported, and SCMP_ACT_KILL_THREAD otherwise. We never
1975 : * actually want to use SCMP_ACT_KILL_THREAD as its semantics are nuts (killing arbitrary threads of
1976 : * a program is just a bad idea), but on old kernels/old libseccomp it is all we have, and at least
1977 : * for single-threaded apps does the right thing. */
1978 :
1979 : #ifdef SCMP_ACT_KILL_PROCESS
1980 0 : if (seccomp_api_get() >= 3)
1981 0 : return SCMP_ACT_KILL_PROCESS;
1982 : #endif
1983 :
1984 0 : return SCMP_ACT_KILL; /* same as SCMP_ACT_KILL_THREAD */
1985 : }
|