LCOV - code coverage report
Current view: top level - shared - seccomp-util.c (source / functions) Hit Total Coverage
Test: main_coverage.info Lines: 92 648 14.2 %
Date: 2019-08-22 15:41:25 Functions: 6 26 23.1 %

          Line data    Source code
       1             : /* SPDX-License-Identifier: LGPL-2.1+ */
       2             : 
       3             : #include <errno.h>
       4             : #include <fcntl.h>
       5             : #include <linux/seccomp.h>
       6             : #include <seccomp.h>
       7             : #include <stddef.h>
       8             : #include <sys/mman.h>
       9             : #include <sys/prctl.h>
      10             : #include <sys/shm.h>
      11             : #include <sys/stat.h>
      12             : 
      13             : #include "af-list.h"
      14             : #include "alloc-util.h"
      15             : #include "errno-list.h"
      16             : #include "macro.h"
      17             : #include "nsflags.h"
      18             : #include "nulstr-util.h"
      19             : #include "process-util.h"
      20             : #include "seccomp-util.h"
      21             : #include "set.h"
      22             : #include "string-util.h"
      23             : #include "strv.h"
      24             : 
      25             : const uint32_t seccomp_local_archs[] = {
      26             :         /* libseccomp architecture constants for the compile target: the #if/#elif chain below contributes at most one list. */
      27             :         /* Note: always list the native arch we are compiled as last, so that users can blacklist seccomp(), but our own calls to it still succeed */
      28             : 
      29             : #if defined(__x86_64__) && defined(__ILP32__)
      30             :                 SCMP_ARCH_X86,
      31             :                 SCMP_ARCH_X86_64,
      32             :                 SCMP_ARCH_X32,         /* native */
      33             : #elif defined(__x86_64__) && !defined(__ILP32__)
      34             :                 SCMP_ARCH_X86,
      35             :                 SCMP_ARCH_X32,
      36             :                 SCMP_ARCH_X86_64,      /* native */
      37             : #elif defined(__i386__)
      38             :                 SCMP_ARCH_X86,
      39             : #elif defined(__aarch64__)
      40             :                 SCMP_ARCH_ARM,
      41             :                 SCMP_ARCH_AARCH64,     /* native */
      42             : #elif defined(__arm__)
      43             :                 SCMP_ARCH_ARM,
      44             : #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
      45             :                 SCMP_ARCH_MIPSEL,
      46             :                 SCMP_ARCH_MIPS,        /* native */
      47             : #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
      48             :                 SCMP_ARCH_MIPS,
      49             :                 SCMP_ARCH_MIPSEL,      /* native */
      50             : #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
      51             :                 SCMP_ARCH_MIPSEL,
      52             :                 SCMP_ARCH_MIPS,
      53             :                 SCMP_ARCH_MIPSEL64N32,
      54             :                 SCMP_ARCH_MIPS64N32,
      55             :                 SCMP_ARCH_MIPSEL64,
      56             :                 SCMP_ARCH_MIPS64,      /* native */
      57             : #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
      58             :                 SCMP_ARCH_MIPS,
      59             :                 SCMP_ARCH_MIPSEL,
      60             :                 SCMP_ARCH_MIPS64N32,
      61             :                 SCMP_ARCH_MIPSEL64N32,
      62             :                 SCMP_ARCH_MIPS64,
      63             :                 SCMP_ARCH_MIPSEL64,    /* native */
      64             : #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
      65             :                 SCMP_ARCH_MIPSEL,
      66             :                 SCMP_ARCH_MIPS,
      67             :                 SCMP_ARCH_MIPSEL64,
      68             :                 SCMP_ARCH_MIPS64,
      69             :                 SCMP_ARCH_MIPSEL64N32,
      70             :                 SCMP_ARCH_MIPS64N32,   /* native */
      71             : #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
      72             :                 SCMP_ARCH_MIPS,
      73             :                 SCMP_ARCH_MIPSEL,
      74             :                 SCMP_ARCH_MIPS64,
      75             :                 SCMP_ARCH_MIPSEL64,
      76             :                 SCMP_ARCH_MIPS64N32,
      77             :                 SCMP_ARCH_MIPSEL64N32, /* native */
      78             : #elif defined(__powerpc64__) && __BYTE_ORDER == __BIG_ENDIAN
      79             :                 SCMP_ARCH_PPC,
      80             :                 SCMP_ARCH_PPC64LE,
      81             :                 SCMP_ARCH_PPC64,       /* native */
      82             : #elif defined(__powerpc64__) && __BYTE_ORDER == __LITTLE_ENDIAN
      83             :                 SCMP_ARCH_PPC,
      84             :                 SCMP_ARCH_PPC64,
      85             :                 SCMP_ARCH_PPC64LE,     /* native */
      86             : #elif defined(__powerpc__)
      87             :                 SCMP_ARCH_PPC,
      88             : #elif defined(__s390x__)
      89             :                 SCMP_ARCH_S390,
      90             :                 SCMP_ARCH_S390X,      /* native */
      91             : #elif defined(__s390__)
      92             :                 SCMP_ARCH_S390,
      93             : #endif
      94             :                 (uint32_t) -1 /* always the last element, on every target; presumably the end-of-list marker -- confirm against the code iterating this array */
      95             :         };
      96             : 
      97          33 : const char* seccomp_arch_to_string(uint32_t c) {
      98             :         /* Returns the name of the given libseccomp architecture constant as a string literal, or
      99             :          * NULL if c matches none of the cases below. Maintain order used in <seccomp.h>.
     100             :          * Names used here should be the same as those used for ConditionArchitecture=,
     101             :          * except for "subarchitectures" like x32. */
     102             : 
     103          33 :         switch(c) {
     104           1 :         case SCMP_ARCH_NATIVE:
     105           1 :                 return "native";
     106           6 :         case SCMP_ARCH_X86:
     107           6 :                 return "x86";
     108           7 :         case SCMP_ARCH_X86_64:
     109           7 :                 return "x86-64";
     110           6 :         case SCMP_ARCH_X32:
     111           6 :                 return "x32";
     112           1 :         case SCMP_ARCH_ARM:
     113           1 :                 return "arm";
     114           1 :         case SCMP_ARCH_AARCH64:
     115           1 :                 return "arm64";
     116           1 :         case SCMP_ARCH_MIPS:
     117           1 :                 return "mips";
     118           1 :         case SCMP_ARCH_MIPS64:
     119           1 :                 return "mips64";
     120           1 :         case SCMP_ARCH_MIPS64N32:
     121           1 :                 return "mips64-n32";
     122           1 :         case SCMP_ARCH_MIPSEL:
     123           1 :                 return "mips-le";
     124           1 :         case SCMP_ARCH_MIPSEL64:
     125           1 :                 return "mips64-le";
     126           1 :         case SCMP_ARCH_MIPSEL64N32:
     127           1 :                 return "mips64-le-n32";
     128           1 :         case SCMP_ARCH_PPC:
     129           1 :                 return "ppc";
     130           1 :         case SCMP_ARCH_PPC64:
     131           1 :                 return "ppc64";
     132           1 :         case SCMP_ARCH_PPC64LE:
     133           1 :                 return "ppc64-le";
     134           1 :         case SCMP_ARCH_S390:
     135           1 :                 return "s390";
     136           1 :         case SCMP_ARCH_S390X:
     137           1 :                 return "s390x";
     138           0 :         default:
     139           0 :                 return NULL; /* value has no entry in this table; callers must be prepared for NULL */
     140             :         }
     141             : }
     142             : 
     143          18 : int seccomp_arch_from_string(const char *n, uint32_t *ret) { /* Parses an architecture name into its SCMP_ARCH_* constant: returns 0 with *ret set on a match, -EINVAL otherwise. */
     144          18 :         if (!n)
     145           0 :                 return -EINVAL;
     146             :         /* A NULL name is an ordinary error (handled above); a NULL ret trips the assert below instead. */
     147          18 :         assert(ret);
     148             :         /* The accepted names are exactly the strings seccomp_arch_to_string() returns; matching is done with streq(). */
     149          18 :         if (streq(n, "native"))
     150           1 :                 *ret = SCMP_ARCH_NATIVE;
     151          17 :         else if (streq(n, "x86"))
     152           1 :                 *ret = SCMP_ARCH_X86;
     153          16 :         else if (streq(n, "x86-64"))
     154           2 :                 *ret = SCMP_ARCH_X86_64;
     155          14 :         else if (streq(n, "x32"))
     156           1 :                 *ret = SCMP_ARCH_X32;
     157          13 :         else if (streq(n, "arm"))
     158           1 :                 *ret = SCMP_ARCH_ARM;
     159          12 :         else if (streq(n, "arm64"))
     160           1 :                 *ret = SCMP_ARCH_AARCH64;
     161          11 :         else if (streq(n, "mips"))
     162           1 :                 *ret = SCMP_ARCH_MIPS;
     163          10 :         else if (streq(n, "mips64"))
     164           1 :                 *ret = SCMP_ARCH_MIPS64;
     165           9 :         else if (streq(n, "mips64-n32"))
     166           1 :                 *ret = SCMP_ARCH_MIPS64N32;
     167           8 :         else if (streq(n, "mips-le"))
     168           1 :                 *ret = SCMP_ARCH_MIPSEL;
     169           7 :         else if (streq(n, "mips64-le"))
     170           1 :                 *ret = SCMP_ARCH_MIPSEL64;
     171           6 :         else if (streq(n, "mips64-le-n32"))
     172           1 :                 *ret = SCMP_ARCH_MIPSEL64N32;
     173           5 :         else if (streq(n, "ppc"))
     174           1 :                 *ret = SCMP_ARCH_PPC;
     175           4 :         else if (streq(n, "ppc64"))
     176           1 :                 *ret = SCMP_ARCH_PPC64;
     177           3 :         else if (streq(n, "ppc64-le"))
     178           1 :                 *ret = SCMP_ARCH_PPC64LE;
     179           2 :         else if (streq(n, "s390"))
     180           1 :                 *ret = SCMP_ARCH_S390;
     181           1 :         else if (streq(n, "s390x"))
     182           1 :                 *ret = SCMP_ARCH_S390X;
     183             :         else
     184           0 :                 return -EINVAL; /* unrecognized name; *ret is left unmodified */
     185             : 
     186          18 :         return 0;
     187             : }
     188             : 
     189           0 : int seccomp_init_for_arch(scmp_filter_ctx *ret, uint32_t arch, uint32_t default_action) {
     190             :         scmp_filter_ctx seccomp;
     191             :         int r;
     192             :         /* Creates a libseccomp filter context with the given default action and hands it back via *ret.
     193             :          * Much like seccomp_init(), but initializes the filter for one specific architecture only, without affecting
     194             :          * any others. Also, turns off the NNP fiddling. */
     195             : 
     196           0 :         seccomp = seccomp_init(default_action);
     197           0 :         if (!seccomp)
     198           0 :                 return -ENOMEM; /* seccomp_init() only reports failure as NULL, so it is mapped to -ENOMEM here */
     199             : 
     200           0 :         if (arch != SCMP_ARCH_NATIVE &&
     201           0 :             arch != seccomp_arch_native()) {
     202             :                 /* Non-native target: swap the native architecture out of the filter for the requested one. */
     203           0 :                 r = seccomp_arch_remove(seccomp, seccomp_arch_native());
     204           0 :                 if (r < 0)
     205           0 :                         goto finish;
     206             : 
     207           0 :                 r = seccomp_arch_add(seccomp, arch);
     208           0 :                 if (r < 0)
     209           0 :                         goto finish;
     210             :                 /* Sanity checks: the requested arch must now be in the filter, the native one must be gone (-EEXIST, see seccomp_arch_exist(3)). */
     211           0 :                 assert(seccomp_arch_exist(seccomp, arch) >= 0);
     212           0 :                 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) == -EEXIST);
     213           0 :                 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) == -EEXIST);
     214             :         } else {
     215           0 :                 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) >= 0); /* native request: the filter is used as seccomp_init() set it up */
     216           0 :                 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) >= 0);
     217             :         }
     218             :         /* SCMP_FLTATR_ACT_BADARCH = SCMP_ACT_ALLOW: presumably lets calls from architectures outside this filter pass -- see seccomp_attr_set(3). */
     219           0 :         r = seccomp_attr_set(seccomp, SCMP_FLTATR_ACT_BADARCH, SCMP_ACT_ALLOW);
     220           0 :         if (r < 0)
     221           0 :                 goto finish;
     222             :         /* SCMP_FLTATR_CTL_NNP = 0: this is the "NNP fiddling" being turned off; presumably keeps libseccomp from setting no_new_privs on load. */
     223           0 :         r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
     224           0 :         if (r < 0)
     225           0 :                 goto finish;
     226             :         /* Success: the context is handed over through *ret and not released here. NOTE(review): ret is dereferenced without an assert(ret). */
     227           0 :         *ret = seccomp;
     228           0 :         return 0;
     229             :         /* Failure: release the context and return the negative result of the call that failed; *ret is not written. */
     230           0 : finish:
     231           0 :         seccomp_release(seccomp);
     232           0 :         return r;
     233             : }
     234             : 
     235           1 : static bool is_basic_seccomp_available(void) { /* True unless the PR_GET_SECCOMP query below returns a negative value. */
     236           1 :         return prctl(PR_GET_SECCOMP, 0, 0, 0, 0) >= 0;
     237             : }
     238             : 
     239           1 : static bool is_seccomp_filter_available(void) { /* Probes filter mode by passing a NULL filter pointer; true only if that call fails with errno == EFAULT. */
     240           2 :         return prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0) < 0 &&
     241           1 :                 errno == EFAULT; /* presumably EFAULT shows the kernel accepted the mode and rejected only the NULL pointer -- see prctl(2) */
     242             : }
     243             : 
     244          10 : bool is_seccomp_available(void) { /* True if both the basic and the filter-mode probe succeed; the probes run only while the cache is unset. */
     245             :         static int cached_enabled = -1;
     246             :         /* -1 means "not probed yet"; once assigned the value is 0 or 1, so the branch below is not taken again. */
     247          10 :         if (cached_enabled < 0)
     248           1 :                 cached_enabled =
     249           2 :                         is_basic_seccomp_available() &&
     250           1 :                         is_seccomp_filter_available();
     251             : 
     252          10 :         return cached_enabled;
     253             : }
     254             : 
     255             : const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
     256             :         [SYSCALL_FILTER_SET_DEFAULT] = {
     257             :                 .name = "@default",
     258             :                 .help = "System calls that are always permitted",
     259             :                 .value =
     260             :                 "clock_getres\0"
     261             :                 "clock_gettime\0"
     262             :                 "clock_nanosleep\0"
     263             :                 "execve\0"
     264             :                 "exit\0"
     265             :                 "exit_group\0"
     266             :                 "futex\0"
     267             :                 "get_robust_list\0"
     268             :                 "get_thread_area\0"
     269             :                 "getegid\0"
     270             :                 "getegid32\0"
     271             :                 "geteuid\0"
     272             :                 "geteuid32\0"
     273             :                 "getgid\0"
     274             :                 "getgid32\0"
     275             :                 "getgroups\0"
     276             :                 "getgroups32\0"
     277             :                 "getpgid\0"
     278             :                 "getpgrp\0"
     279             :                 "getpid\0"
     280             :                 "getppid\0"
     281             :                 "getresgid\0"
     282             :                 "getresgid32\0"
     283             :                 "getresuid\0"
     284             :                 "getresuid32\0"
     285             :                 "getrlimit\0"      /* make sure processes can query stack size and such */
     286             :                 "getsid\0"
     287             :                 "gettid\0"
     288             :                 "gettimeofday\0"
     289             :                 "getuid\0"
     290             :                 "getuid32\0"
     291             :                 "membarrier\0"
     292             :                 "nanosleep\0"
     293             :                 "pause\0"
     294             :                 "prlimit64\0"
     295             :                 "restart_syscall\0"
     296             :                 "rseq\0"
     297             :                 "rt_sigreturn\0"
     298             :                 "sched_yield\0"
     299             :                 "set_robust_list\0"
     300             :                 "set_thread_area\0"
     301             :                 "set_tid_address\0"
     302             :                 "set_tls\0"
     303             :                 "sigreturn\0"
     304             :                 "time\0"
     305             :                 "ugetrlimit\0"
     306             :         },
     307             :         [SYSCALL_FILTER_SET_AIO] = {
     308             :                 .name = "@aio",
     309             :                 .help = "Asynchronous IO",
     310             :                 .value =
     311             :                 "io_cancel\0"
     312             :                 "io_destroy\0"
     313             :                 "io_getevents\0"
     314             :                 "io_pgetevents\0"
     315             :                 "io_setup\0"
     316             :                 "io_submit\0"
     317             :         },
     318             :         [SYSCALL_FILTER_SET_BASIC_IO] = {
     319             :                 .name = "@basic-io",
     320             :                 .help = "Basic IO",
     321             :                 .value =
     322             :                 "_llseek\0"
     323             :                 "close\0"
     324             :                 "dup\0"
     325             :                 "dup2\0"
     326             :                 "dup3\0"
     327             :                 "lseek\0"
     328             :                 "pread64\0"
     329             :                 "preadv\0"
     330             :                 "preadv2\0"
     331             :                 "pwrite64\0"
     332             :                 "pwritev\0"
     333             :                 "pwritev2\0"
     334             :                 "read\0"
     335             :                 "readv\0"
     336             :                 "write\0"
     337             :                 "writev\0"
     338             :         },
     339             :         [SYSCALL_FILTER_SET_CHOWN] = {
     340             :                 .name = "@chown",
     341             :                 .help = "Change ownership of files and directories",
     342             :                 .value =
     343             :                 "chown\0"
     344             :                 "chown32\0"
     345             :                 "fchown\0"
     346             :                 "fchown32\0"
     347             :                 "fchownat\0"
     348             :                 "lchown\0"
     349             :                 "lchown32\0"
     350             :         },
     351             :         [SYSCALL_FILTER_SET_CLOCK] = {
     352             :                 .name = "@clock",
     353             :                 .help = "Change the system time",
     354             :                 .value =
     355             :                 "adjtimex\0"
     356             :                 "clock_adjtime\0"
     357             :                 "clock_settime\0"
     358             :                 "settimeofday\0"
     359             :                 "stime\0"
     360             :         },
     361             :         [SYSCALL_FILTER_SET_CPU_EMULATION] = {
     362             :                 .name = "@cpu-emulation",
     363             :                 .help = "System calls for CPU emulation functionality",
     364             :                 .value =
     365             :                 "modify_ldt\0"
     366             :                 "subpage_prot\0"
     367             :                 "switch_endian\0"
     368             :                 "vm86\0"
     369             :                 "vm86old\0"
     370             :         },
     371             :         [SYSCALL_FILTER_SET_DEBUG] = {
     372             :                 .name = "@debug",
     373             :                 .help = "Debugging, performance monitoring and tracing functionality",
     374             :                 .value =
     375             :                 "lookup_dcookie\0"
     376             :                 "perf_event_open\0"
     377             :                 "ptrace\0"
     378             :                 "rtas\0"
     379             : #ifdef __NR_s390_runtime_instr
     380             :                 "s390_runtime_instr\0"
     381             : #endif
     382             :                 "sys_debug_setcontext\0"
     383             :         },
     384             :         [SYSCALL_FILTER_SET_FILE_SYSTEM] = {
     385             :                 .name = "@file-system",
     386             :                 .help = "File system operations",
     387             :                 .value =
     388             :                 "access\0"
     389             :                 "chdir\0"
     390             :                 "chmod\0"
     391             :                 "close\0"
     392             :                 "creat\0"
     393             :                 "faccessat\0"
     394             :                 "fallocate\0"
     395             :                 "fchdir\0"
     396             :                 "fchmod\0"
     397             :                 "fchmodat\0"
     398             :                 "fcntl\0"
     399             :                 "fcntl64\0"
     400             :                 "fgetxattr\0"
     401             :                 "flistxattr\0"
     402             :                 "fremovexattr\0"
     403             :                 "fsetxattr\0"
     404             :                 "fstat\0"
     405             :                 "fstat64\0"
     406             :                 "fstatat64\0"
     407             :                 "fstatfs\0"
     408             :                 "fstatfs64\0"
     409             :                 "ftruncate\0"
     410             :                 "ftruncate64\0"
     411             :                 "futimesat\0"
     412             :                 "getcwd\0"
     413             :                 "getdents\0"
     414             :                 "getdents64\0"
     415             :                 "getxattr\0"
     416             :                 "inotify_add_watch\0"
     417             :                 "inotify_init\0"
     418             :                 "inotify_init1\0"
     419             :                 "inotify_rm_watch\0"
     420             :                 "lgetxattr\0"
     421             :                 "link\0"
     422             :                 "linkat\0"
     423             :                 "listxattr\0"
     424             :                 "llistxattr\0"
     425             :                 "lremovexattr\0"
     426             :                 "lsetxattr\0"
     427             :                 "lstat\0"
     428             :                 "lstat64\0"
     429             :                 "mkdir\0"
     430             :                 "mkdirat\0"
     431             :                 "mknod\0"
     432             :                 "mknodat\0"
     433             :                 "mmap\0"
     434             :                 "mmap2\0"
     435             :                 "munmap\0"
     436             :                 "newfstatat\0"
     437             :                 "oldfstat\0"
     438             :                 "oldlstat\0"
     439             :                 "oldstat\0"
     440             :                 "open\0"
     441             :                 "openat\0"
     442             :                 "readlink\0"
     443             :                 "readlinkat\0"
     444             :                 "removexattr\0"
     445             :                 "rename\0"
     446             :                 "renameat\0"
     447             :                 "renameat2\0"
     448             :                 "rmdir\0"
     449             :                 "setxattr\0"
     450             :                 "stat\0"
     451             :                 "stat64\0"
     452             :                 "statfs\0"
     453             :                 "statfs64\0"
     454             : #ifdef __NR_statx
     455             :                 "statx\0"
     456             : #endif
     457             :                 "symlink\0"
     458             :                 "symlinkat\0"
     459             :                 "truncate\0"
     460             :                 "truncate64\0"
     461             :                 "unlink\0"
     462             :                 "unlinkat\0"
     463             :                 "utime\0"
     464             :                 "utimensat\0"
     465             :                 "utimes\0"
     466             :         },
     467             :         [SYSCALL_FILTER_SET_IO_EVENT] = {
     468             :                 .name = "@io-event",
     469             :                 .help = "Event loop system calls",
     470             :                 .value =
     471             :                 "_newselect\0"
     472             :                 "epoll_create\0"
     473             :                 "epoll_create1\0"
     474             :                 "epoll_ctl\0"
     475             :                 "epoll_ctl_old\0"
     476             :                 "epoll_pwait\0"
     477             :                 "epoll_wait\0"
     478             :                 "epoll_wait_old\0"
     479             :                 "eventfd\0"
     480             :                 "eventfd2\0"
     481             :                 "poll\0"
     482             :                 "ppoll\0"
     483             :                 "pselect6\0"
     484             :                 "select\0"
     485             :         },
     486             :         [SYSCALL_FILTER_SET_IPC] = {
     487             :                 .name = "@ipc",
     488             :                 .help = "SysV IPC, POSIX Message Queues or other IPC",
     489             :                 .value =
     490             :                 "ipc\0"
     491             :                 "memfd_create\0"
     492             :                 "mq_getsetattr\0"
     493             :                 "mq_notify\0"
     494             :                 "mq_open\0"
     495             :                 "mq_timedreceive\0"
     496             :                 "mq_timedsend\0"
     497             :                 "mq_unlink\0"
     498             :                 "msgctl\0"
     499             :                 "msgget\0"
     500             :                 "msgrcv\0"
     501             :                 "msgsnd\0"
     502             :                 "pipe\0"
     503             :                 "pipe2\0"
     504             :                 "process_vm_readv\0"
     505             :                 "process_vm_writev\0"
     506             :                 "semctl\0"
     507             :                 "semget\0"
     508             :                 "semop\0"
     509             :                 "semtimedop\0"
     510             :                 "shmat\0"
     511             :                 "shmctl\0"
     512             :                 "shmdt\0"
     513             :                 "shmget\0"
     514             :         },
     515             :         [SYSCALL_FILTER_SET_KEYRING] = {
     516             :                 .name = "@keyring",
     517             :                 .help = "Kernel keyring access",
     518             :                 .value =
     519             :                 "add_key\0"
     520             :                 "keyctl\0"
     521             :                 "request_key\0"
     522             :         },
     523             :         [SYSCALL_FILTER_SET_MEMLOCK] = {
     524             :                 .name = "@memlock",
     525             :                 .help = "Memory locking control",
     526             :                 .value =
     527             :                 "mlock\0"
     528             :                 "mlock2\0"
     529             :                 "mlockall\0"
     530             :                 "munlock\0"
     531             :                 "munlockall\0"
     532             :         },
     533             :         [SYSCALL_FILTER_SET_MODULE] = {
     534             :                 .name = "@module",
     535             :                 .help = "Loading and unloading of kernel modules",
     536             :                 .value =
     537             :                 "delete_module\0"
     538             :                 "finit_module\0"
     539             :                 "init_module\0"
     540             :         },
     541             :         [SYSCALL_FILTER_SET_MOUNT] = {
     542             :                 .name = "@mount",
     543             :                 .help = "Mounting and unmounting of file systems",
     544             :                 .value =
     545             :                 "chroot\0"
     546             :                 "mount\0"
     547             :                 "pivot_root\0"
     548             :                 "umount\0"
     549             :                 "umount2\0"
     550             :         },
     551             :         [SYSCALL_FILTER_SET_NETWORK_IO] = {
     552             :                 .name = "@network-io",
     553             :                 .help = "Network or Unix socket IO, should not be needed if not network facing",
     554             :                 .value =
     555             :                 "accept\0"
     556             :                 "accept4\0"
     557             :                 "bind\0"
     558             :                 "connect\0"
     559             :                 "getpeername\0"
     560             :                 "getsockname\0"
     561             :                 "getsockopt\0"
     562             :                 "listen\0"
     563             :                 "recv\0"
     564             :                 "recvfrom\0"
     565             :                 "recvmmsg\0"
     566             :                 "recvmsg\0"
     567             :                 "send\0"
     568             :                 "sendmmsg\0"
     569             :                 "sendmsg\0"
     570             :                 "sendto\0"
     571             :                 "setsockopt\0"
     572             :                 "shutdown\0"
     573             :                 "socket\0"
     574             :                 "socketcall\0"
     575             :                 "socketpair\0"
     576             :         },
     577             :         [SYSCALL_FILTER_SET_OBSOLETE] = {
     578             :                 /* some unknown even to libseccomp */
     579             :                 .name = "@obsolete",
     580             :                 .help = "Unusual, obsolete or unimplemented system calls",
     581             :                 .value =
     582             :                 "_sysctl\0"
     583             :                 "afs_syscall\0"
     584             :                 "bdflush\0"
     585             :                 "break\0"
     586             :                 "create_module\0"
     587             :                 "ftime\0"
     588             :                 "get_kernel_syms\0"
     589             :                 "getpmsg\0"
     590             :                 "gtty\0"
     591             :                 "idle\0"
     592             :                 "lock\0"
     593             :                 "mpx\0"
     594             :                 "prof\0"
     595             :                 "profil\0"
     596             :                 "putpmsg\0"
     597             :                 "query_module\0"
     598             :                 "security\0"
     599             :                 "sgetmask\0"
     600             :                 "ssetmask\0"
     601             :                 "stty\0"
     602             :                 "sysfs\0"
     603             :                 "tuxcall\0"
     604             :                 "ulimit\0"
     605             :                 "uselib\0"
     606             :                 "ustat\0"
     607             :                 "vserver\0"
     608             :         },
     609             :         [SYSCALL_FILTER_SET_PRIVILEGED] = {
     610             :                 .name = "@privileged",
     611             :                 .help = "All system calls which need super-user capabilities",
     612             :                 .value =
     613             :                 "@chown\0"
     614             :                 "@clock\0"
     615             :                 "@module\0"
     616             :                 "@raw-io\0"
     617             :                 "@reboot\0"
     618             :                 "@swap\0"
     619             :                 "_sysctl\0"
     620             :                 "acct\0"
     621             :                 "bpf\0"
     622             :                 "capset\0"
     623             :                 "chroot\0"
     624             :                 "fanotify_init\0"
     625             :                 "nfsservctl\0"
     626             :                 "open_by_handle_at\0"
     627             :                 "pivot_root\0"
     628             :                 "quotactl\0"
     629             :                 "setdomainname\0"
     630             :                 "setfsuid\0"
     631             :                 "setfsuid32\0"
     632             :                 "setgroups\0"
     633             :                 "setgroups32\0"
     634             :                 "sethostname\0"
     635             :                 "setresuid\0"
     636             :                 "setresuid32\0"
     637             :                 "setreuid\0"
     638             :                 "setreuid32\0"
     639             :                 "setuid\0"      /* We list the explicit system calls here, as @setuid also includes setgid() which is not necessarily privileged */
     640             :                 "setuid32\0"
     641             :                 "vhangup\0"
     642             :         },
     643             :         [SYSCALL_FILTER_SET_PROCESS] = {
     644             :                 .name = "@process",
     645             :                 .help = "Process control, execution, namespaceing operations",
     646             :                 .value =
     647             :                 "arch_prctl\0"
     648             :                 "capget\0"      /* Able to query arbitrary processes */
     649             :                 "clone\0"
     650             :                 "execveat\0"
     651             :                 "fork\0"
     652             :                 "getrusage\0"
     653             :                 "kill\0"
     654             :                 "pidfd_send_signal\0"
     655             :                 "prctl\0"
     656             :                 "rt_sigqueueinfo\0"
     657             :                 "rt_tgsigqueueinfo\0"
     658             :                 "setns\0"
     659             :                 "swapcontext\0" /* Some archs e.g. powerpc32 are using it to do userspace context switches */
     660             :                 "tgkill\0"
     661             :                 "times\0"
     662             :                 "tkill\0"
     663             :                 "unshare\0"
     664             :                 "vfork\0"
     665             :                 "wait4\0"
     666             :                 "waitid\0"
     667             :                 "waitpid\0"
     668             :         },
     669             :         [SYSCALL_FILTER_SET_RAW_IO] = {
     670             :                 .name = "@raw-io",
     671             :                 .help = "Raw I/O port access",
     672             :                 .value =
     673             :                 "ioperm\0"
     674             :                 "iopl\0"
     675             :                 "pciconfig_iobase\0"
     676             :                 "pciconfig_read\0"
     677             :                 "pciconfig_write\0"
     678             : #ifdef __NR_s390_pci_mmio_read
     679             :                 "s390_pci_mmio_read\0"
     680             : #endif
     681             : #ifdef __NR_s390_pci_mmio_write
     682             :                 "s390_pci_mmio_write\0"
     683             : #endif
     684             :         },
     685             :         [SYSCALL_FILTER_SET_REBOOT] = {
     686             :                 .name = "@reboot",
     687             :                 .help = "Reboot and reboot preparation/kexec",
     688             :                 .value =
     689             :                 "kexec_file_load\0"
     690             :                 "kexec_load\0"
     691             :                 "reboot\0"
     692             :         },
     693             :         [SYSCALL_FILTER_SET_RESOURCES] = {
     694             :                 .name = "@resources",
     695             :                 .help = "Alter resource settings",
     696             :                 .value =
     697             :                 "ioprio_set\0"
     698             :                 "mbind\0"
     699             :                 "migrate_pages\0"
     700             :                 "move_pages\0"
     701             :                 "nice\0"
     702             :                 "sched_setaffinity\0"
     703             :                 "sched_setattr\0"
     704             :                 "sched_setparam\0"
     705             :                 "sched_setscheduler\0"
     706             :                 "set_mempolicy\0"
     707             :                 "setpriority\0"
     708             :                 "setrlimit\0"
     709             :         },
     710             :         [SYSCALL_FILTER_SET_SETUID] = {
     711             :                 .name = "@setuid",
     712             :                 .help = "Operations for changing user/group credentials",
     713             :                 .value =
     714             :                 "setgid\0"
     715             :                 "setgid32\0"
     716             :                 "setgroups\0"
     717             :                 "setgroups32\0"
     718             :                 "setregid\0"
     719             :                 "setregid32\0"
     720             :                 "setresgid\0"
     721             :                 "setresgid32\0"
     722             :                 "setresuid\0"
     723             :                 "setresuid32\0"
     724             :                 "setreuid\0"
     725             :                 "setreuid32\0"
     726             :                 "setuid\0"
     727             :                 "setuid32\0"
     728             :         },
     729             :         [SYSCALL_FILTER_SET_SIGNAL] = {
     730             :                 .name = "@signal",
     731             :                 .help = "Process signal handling",
     732             :                 .value =
     733             :                 "rt_sigaction\0"
     734             :                 "rt_sigpending\0"
     735             :                 "rt_sigprocmask\0"
     736             :                 "rt_sigsuspend\0"
     737             :                 "rt_sigtimedwait\0"
     738             :                 "sigaction\0"
     739             :                 "sigaltstack\0"
     740             :                 "signal\0"
     741             :                 "signalfd\0"
     742             :                 "signalfd4\0"
     743             :                 "sigpending\0"
     744             :                 "sigprocmask\0"
     745             :                 "sigsuspend\0"
     746             :         },
     747             :         [SYSCALL_FILTER_SET_SWAP] = {
     748             :                 .name = "@swap",
     749             :                 .help = "Enable/disable swap devices",
     750             :                 .value =
     751             :                 "swapoff\0"
     752             :                 "swapon\0"
     753             :         },
     754             :         [SYSCALL_FILTER_SET_SYNC] = {
     755             :                 .name = "@sync",
     756             :                 .help = "Synchronize files and memory to storage",
     757             :                 .value =
     758             :                 "fdatasync\0"
     759             :                 "fsync\0"
     760             :                 "msync\0"
     761             :                 "sync\0"
     762             :                 "sync_file_range\0"
     763             :                 "sync_file_range2\0"
     764             :                 "syncfs\0"
     765             :         },
     766             :         [SYSCALL_FILTER_SET_SYSTEM_SERVICE] = {
     767             :                 .name = "@system-service",
     768             :                 .help = "General system service operations",
     769             :                 .value =
     770             :                 "@aio\0"
     771             :                 "@basic-io\0"
     772             :                 "@chown\0"
     773             :                 "@default\0"
     774             :                 "@file-system\0"
     775             :                 "@io-event\0"
     776             :                 "@ipc\0"
     777             :                 "@keyring\0"
     778             :                 "@memlock\0"
     779             :                 "@network-io\0"
     780             :                 "@process\0"
     781             :                 "@resources\0"
     782             :                 "@setuid\0"
     783             :                 "@signal\0"
     784             :                 "@sync\0"
     785             :                 "@timer\0"
     786             :                 "brk\0"
     787             :                 "capget\0"
     788             :                 "capset\0"
     789             :                 "copy_file_range\0"
     790             :                 "fadvise64\0"
     791             :                 "fadvise64_64\0"
     792             :                 "flock\0"
     793             :                 "get_mempolicy\0"
     794             :                 "getcpu\0"
     795             :                 "getpriority\0"
     796             :                 "getrandom\0"
     797             :                 "ioctl\0"
     798             :                 "ioprio_get\0"
     799             :                 "kcmp\0"
     800             :                 "madvise\0"
     801             :                 "mprotect\0"
     802             :                 "mremap\0"
     803             :                 "name_to_handle_at\0"
     804             :                 "oldolduname\0"
     805             :                 "olduname\0"
     806             :                 "personality\0"
     807             :                 "readahead\0"
     808             :                 "readdir\0"
     809             :                 "remap_file_pages\0"
     810             :                 "sched_get_priority_max\0"
     811             :                 "sched_get_priority_min\0"
     812             :                 "sched_getaffinity\0"
     813             :                 "sched_getattr\0"
     814             :                 "sched_getparam\0"
     815             :                 "sched_getscheduler\0"
     816             :                 "sched_rr_get_interval\0"
     817             :                 "sched_yield\0"
     818             :                 "sendfile\0"
     819             :                 "sendfile64\0"
     820             :                 "setfsgid\0"
     821             :                 "setfsgid32\0"
     822             :                 "setfsuid\0"
     823             :                 "setfsuid32\0"
     824             :                 "setpgid\0"
     825             :                 "setsid\0"
     826             :                 "splice\0"
     827             :                 "sysinfo\0"
     828             :                 "tee\0"
     829             :                 "umask\0"
     830             :                 "uname\0"
     831             :                 "userfaultfd\0"
     832             :                 "vmsplice\0"
     833             :         },
     834             :         [SYSCALL_FILTER_SET_TIMER] = {
     835             :                 .name = "@timer",
     836             :                 .help = "Schedule operations by time",
     837             :                 .value =
     838             :                 "alarm\0"
     839             :                 "getitimer\0"
     840             :                 "setitimer\0"
     841             :                 "timer_create\0"
     842             :                 "timer_delete\0"
     843             :                 "timer_getoverrun\0"
     844             :                 "timer_gettime\0"
     845             :                 "timer_settime\0"
     846             :                 "timerfd_create\0"
     847             :                 "timerfd_gettime\0"
     848             :                 "timerfd_settime\0"
     849             :                 "times\0"
     850             :         },
     851             : };
     852             : 
     853           7 : const SyscallFilterSet *syscall_filter_set_find(const char *name) {
     854             :         unsigned i;
     855             : 
     856           7 :         if (isempty(name) || name[0] != '@')
     857           3 :                 return NULL;
     858             : 
     859          53 :         for (i = 0; i < _SYSCALL_FILTER_SET_MAX; i++)
     860          52 :                 if (streq(syscall_filter_sets[i].name, name))
     861           3 :                         return syscall_filter_sets + i;
     862             : 
     863           1 :         return NULL;
     864             : }
     865             : 
     866             : static int seccomp_add_syscall_filter_set(scmp_filter_ctx seccomp, const SyscallFilterSet *set, uint32_t action, char **exclude, bool log_missing);
     867             : 
     868           0 : int seccomp_add_syscall_filter_item(scmp_filter_ctx *seccomp, const char *name, uint32_t action, char **exclude, bool log_missing) {
     869           0 :         assert(seccomp);
     870           0 :         assert(name);
     871             : 
     872           0 :         if (strv_contains(exclude, name))
     873           0 :                 return 0;
     874             : 
     875           0 :         if (name[0] == '@') {
     876             :                 const SyscallFilterSet *other;
     877             : 
     878           0 :                 other = syscall_filter_set_find(name);
     879           0 :                 if (!other)
     880           0 :                         return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
     881             :                                                "Filter set %s is not known!",
     882             :                                                name);
     883             : 
     884           0 :                 return seccomp_add_syscall_filter_set(seccomp, other, action, exclude, log_missing);
     885             : 
     886             :         } else {
     887             :                 int id, r;
     888             : 
     889           0 :                 id = seccomp_syscall_resolve_name(name);
     890           0 :                 if (id == __NR_SCMP_ERROR) {
     891           0 :                         if (log_missing)
     892           0 :                                 log_debug("System call %s is not known, ignoring.", name);
     893           0 :                         return 0;
     894             :                 }
     895             : 
     896           0 :                 r = seccomp_rule_add_exact(seccomp, action, id, 0);
     897           0 :                 if (r < 0) {
     898             :                         /* If the system call is not known on this architecture, then that's fine, let's ignore it */
     899           0 :                         bool ignore = r == -EDOM;
     900             : 
     901           0 :                         if (!ignore || log_missing)
     902           0 :                                 log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m",
     903             :                                                 name, id, ignore ? ", ignoring" : "");
     904           0 :                         if (!ignore)
     905           0 :                                 return r;
     906             :                 }
     907             : 
     908           0 :                 return 0;
     909             :         }
     910             : }
     911             : 
     912           0 : static int seccomp_add_syscall_filter_set(
     913             :                 scmp_filter_ctx seccomp,
     914             :                 const SyscallFilterSet *set,
     915             :                 uint32_t action,
     916             :                 char **exclude,
     917             :                 bool log_missing) {
     918             : 
     919             :         const char *sys;
     920             :         int r;
     921             : 
     922           0 :         assert(seccomp);
     923           0 :         assert(set);
     924             : 
     925           0 :         NULSTR_FOREACH(sys, set->value) {
     926           0 :                 r = seccomp_add_syscall_filter_item(seccomp, sys, action, exclude, log_missing);
     927           0 :                 if (r < 0)
     928           0 :                         return r;
     929             :         }
     930             : 
     931           0 :         return 0;
     932             : }
     933             : 
     934           0 : int seccomp_load_syscall_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action, bool log_missing) {
     935             :         uint32_t arch;
     936             :         int r;
     937             : 
     938           0 :         assert(set);
     939             : 
     940             :         /* The one-stop solution: allocate a seccomp object, add the specified filter to it, and apply it. Once for
     941             :          * each local arch. */
     942             : 
     943           0 :         SECCOMP_FOREACH_LOCAL_ARCH(arch) {
     944           0 :                 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
     945             : 
     946           0 :                 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
     947             : 
     948           0 :                 r = seccomp_init_for_arch(&seccomp, arch, default_action);
     949           0 :                 if (r < 0)
     950           0 :                         return r;
     951             : 
     952           0 :                 r = seccomp_add_syscall_filter_set(seccomp, set, action, NULL, log_missing);
     953           0 :                 if (r < 0)
     954           0 :                         return log_debug_errno(r, "Failed to add filter set: %m");
     955             : 
     956           0 :                 r = seccomp_load(seccomp);
     957           0 :                 if (ERRNO_IS_SECCOMP_FATAL(r))
     958           0 :                         return r;
     959           0 :                 if (r < 0)
     960           0 :                         log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
     961             :         }
     962             : 
     963           0 :         return 0;
     964             : }
     965             : 
     966           0 : int seccomp_load_syscall_filter_set_raw(uint32_t default_action, Hashmap* set, uint32_t action, bool log_missing) {
     967             :         uint32_t arch;
     968             :         int r;
     969             : 
     970             :         /* Similar to seccomp_load_syscall_filter_set(), but takes a raw Set* of syscalls, instead of a
     971             :          * SyscallFilterSet* table. */
     972             : 
     973           0 :         if (hashmap_isempty(set) && default_action == SCMP_ACT_ALLOW)
     974           0 :                 return 0;
     975             : 
     976           0 :         SECCOMP_FOREACH_LOCAL_ARCH(arch) {
     977           0 :                 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
     978             :                 Iterator i;
     979             :                 void *syscall_id, *val;
     980             : 
     981           0 :                 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
     982             : 
     983           0 :                 r = seccomp_init_for_arch(&seccomp, arch, default_action);
     984           0 :                 if (r < 0)
     985           0 :                         return r;
     986             : 
     987           0 :                 HASHMAP_FOREACH_KEY(val, syscall_id, set, i) {
     988           0 :                         uint32_t a = action;
     989           0 :                         int id = PTR_TO_INT(syscall_id) - 1;
     990           0 :                         int error = PTR_TO_INT(val);
     991             : 
     992           0 :                         if (action != SCMP_ACT_ALLOW && error >= 0)
     993           0 :                                 a = SCMP_ACT_ERRNO(error);
     994             : 
     995           0 :                         r = seccomp_rule_add_exact(seccomp, a, id, 0);
     996           0 :                         if (r < 0) {
     997             :                                 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
     998           0 :                                 _cleanup_free_ char *n = NULL;
     999             :                                 bool ignore;
    1000             : 
    1001           0 :                                 n = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, id);
    1002           0 :                                 ignore = r == -EDOM;
    1003           0 :                                 if (!ignore || log_missing)
    1004           0 :                                         log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m",
    1005             :                                                         strna(n), id, ignore ? ", ignoring" : "");
    1006           0 :                                 if (!ignore)
    1007           0 :                                         return r;
    1008             :                         }
    1009             :                 }
    1010             : 
    1011           0 :                 r = seccomp_load(seccomp);
    1012           0 :                 if (ERRNO_IS_SECCOMP_FATAL(r))
    1013           0 :                         return r;
    1014           0 :                 if (r < 0)
    1015           0 :                         log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
    1016             :         }
    1017             : 
    1018           0 :         return 0;
    1019             : }
    1020             : 
    1021           0 : int seccomp_parse_syscall_filter(
    1022             :                 const char *name,
    1023             :                 int errno_num,
    1024             :                 Hashmap *filter,
    1025             :                 SeccompParseFlags flags,
    1026             :                 const char *unit,
    1027             :                 const char *filename,
    1028             :                 unsigned line) {
    1029             : 
    1030             :         int r;
    1031             : 
    1032           0 :         assert(name);
    1033           0 :         assert(filter);
    1034             : 
    1035           0 :         if (name[0] == '@') {
    1036             :                 const SyscallFilterSet *set;
    1037             :                 const char *i;
    1038             : 
    1039           0 :                 set = syscall_filter_set_find(name);
    1040           0 :                 if (!set) {
    1041           0 :                         if (!(flags & SECCOMP_PARSE_PERMISSIVE))
    1042           0 :                                 return -EINVAL;
    1043             : 
    1044           0 :                         log_syntax(unit, flags & SECCOMP_PARSE_LOG ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
    1045             :                                    "Unknown system call group, ignoring: %s", name);
    1046           0 :                         return 0;
    1047             :                 }
    1048             : 
    1049           0 :                 NULSTR_FOREACH(i, set->value) {
    1050             :                         /* Call ourselves again, for the group to parse. Note that we downgrade logging here (i.e. take
    1051             :                          * away the SECCOMP_PARSE_LOG flag) since any issues in the group table are our own problem,
    1052             :                          * not a problem in user configuration data and we shouldn't pretend otherwise by complaining
    1053             :                          * about them. */
    1054           0 :                         r = seccomp_parse_syscall_filter(i, errno_num, filter, flags &~ SECCOMP_PARSE_LOG, unit, filename, line);
    1055           0 :                         if (r < 0)
    1056           0 :                                 return r;
    1057             :                 }
    1058             :         } else {
    1059             :                 int id;
    1060             : 
    1061           0 :                 id = seccomp_syscall_resolve_name(name);
    1062           0 :                 if (id == __NR_SCMP_ERROR) {
    1063           0 :                         if (!(flags & SECCOMP_PARSE_PERMISSIVE))
    1064           0 :                                 return -EINVAL;
    1065             : 
    1066           0 :                         log_syntax(unit, flags & SECCOMP_PARSE_LOG ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
    1067             :                                    "Failed to parse system call, ignoring: %s", name);
    1068           0 :                         return 0;
    1069             :                 }
    1070             : 
    1071             :                 /* If we previously wanted to forbid a syscall and now
    1072             :                  * we want to allow it, then remove it from the list. */
    1073           0 :                 if (!(flags & SECCOMP_PARSE_INVERT) == !!(flags & SECCOMP_PARSE_WHITELIST)) {
    1074           0 :                         r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num));
    1075           0 :                         if (r < 0)
    1076           0 :                                 switch (r) {
    1077           0 :                                 case -ENOMEM:
    1078           0 :                                         return flags & SECCOMP_PARSE_LOG ? log_oom() : -ENOMEM;
    1079           0 :                                 case -EEXIST:
    1080           0 :                                         assert_se(hashmap_update(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num)) == 0);
    1081           0 :                                         break;
    1082           0 :                                 default:
    1083           0 :                                         return r;
    1084             :                                 }
    1085           0 :                 } else
    1086           0 :                         (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
    1087             :         }
    1088             : 
    1089           0 :         return 0;
    1090             : }
    1091             : 
    1092           0 : int seccomp_restrict_namespaces(unsigned long retain) {
    1093             :         uint32_t arch;
    1094             :         int r;
    1095             : 
    1096           0 :         if (DEBUG_LOGGING) {
    1097           0 :                 _cleanup_free_ char *s = NULL;
    1098             : 
    1099           0 :                 (void) namespace_flags_to_string(retain, &s);
    1100           0 :                 log_debug("Restricting namespace to: %s.", strna(s));
    1101             :         }
    1102             : 
    1103             :         /* NOOP? */
    1104           0 :         if ((retain & NAMESPACE_FLAGS_ALL) == NAMESPACE_FLAGS_ALL)
    1105           0 :                 return 0;
    1106             : 
    1107           0 :         SECCOMP_FOREACH_LOCAL_ARCH(arch) {
    1108           0 :                 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
    1109             :                 unsigned i;
    1110             : 
    1111           0 :                 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
    1112             : 
    1113           0 :                 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
    1114           0 :                 if (r < 0)
    1115           0 :                         return r;
    1116             : 
    1117           0 :                 if ((retain & NAMESPACE_FLAGS_ALL) == 0)
    1118             :                         /* If every single kind of namespace shall be prohibited, then let's block the whole setns() syscall
    1119             :                          * altogether. */
    1120           0 :                         r = seccomp_rule_add_exact(
    1121             :                                         seccomp,
    1122             :                                         SCMP_ACT_ERRNO(EPERM),
    1123             :                                         SCMP_SYS(setns),
    1124             :                                         0);
    1125             :                 else
    1126             :                         /* Otherwise, block only the invocations with the appropriate flags in the loop below, but also the
    1127             :                          * special invocation with a zero flags argument, right here. */
    1128           0 :                         r = seccomp_rule_add_exact(
    1129             :                                         seccomp,
    1130             :                                         SCMP_ACT_ERRNO(EPERM),
    1131             :                                         SCMP_SYS(setns),
    1132             :                                         1,
    1133           0 :                                         SCMP_A1(SCMP_CMP_EQ, 0));
    1134           0 :                 if (r < 0) {
    1135           0 :                         log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
    1136           0 :                         continue;
    1137             :                 }
    1138             : 
    1139           0 :                 for (i = 0; namespace_flag_map[i].name; i++) {
    1140             :                         unsigned long f;
    1141             : 
    1142           0 :                         f = namespace_flag_map[i].flag;
    1143           0 :                         if ((retain & f) == f) {
    1144           0 :                                 log_debug("Permitting %s.", namespace_flag_map[i].name);
    1145           0 :                                 continue;
    1146             :                         }
    1147             : 
    1148           0 :                         log_debug("Blocking %s.", namespace_flag_map[i].name);
    1149             : 
    1150           0 :                         r = seccomp_rule_add_exact(
    1151             :                                         seccomp,
    1152             :                                         SCMP_ACT_ERRNO(EPERM),
    1153             :                                         SCMP_SYS(unshare),
    1154             :                                         1,
    1155           0 :                                         SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
    1156           0 :                         if (r < 0) {
    1157           0 :                                 log_debug_errno(r, "Failed to add unshare() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
    1158           0 :                                 break;
    1159             :                         }
    1160             : 
    1161             :                         /* On s390/s390x the first two parameters to clone are switched */
    1162           0 :                         if (!IN_SET(arch, SCMP_ARCH_S390, SCMP_ARCH_S390X))
    1163           0 :                                 r = seccomp_rule_add_exact(
    1164             :                                                 seccomp,
    1165             :                                                 SCMP_ACT_ERRNO(EPERM),
    1166             :                                                 SCMP_SYS(clone),
    1167             :                                                 1,
    1168           0 :                                                 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
    1169             :                         else
    1170           0 :                                 r = seccomp_rule_add_exact(
    1171             :                                                 seccomp,
    1172             :                                                 SCMP_ACT_ERRNO(EPERM),
    1173             :                                                 SCMP_SYS(clone),
    1174             :                                                 1,
    1175           0 :                                                 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
    1176           0 :                         if (r < 0) {
    1177           0 :                                 log_debug_errno(r, "Failed to add clone() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
    1178           0 :                                 break;
    1179             :                         }
    1180             : 
    1181           0 :                         if ((retain & NAMESPACE_FLAGS_ALL) != 0) {
    1182           0 :                                 r = seccomp_rule_add_exact(
    1183             :                                                 seccomp,
    1184             :                                                 SCMP_ACT_ERRNO(EPERM),
    1185             :                                                 SCMP_SYS(setns),
    1186             :                                                 1,
    1187           0 :                                                 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
    1188           0 :                                 if (r < 0) {
    1189           0 :                                         log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
    1190           0 :                                         break;
    1191             :                                 }
    1192             :                         }
    1193             :                 }
    1194           0 :                 if (r < 0)
    1195           0 :                         continue;
    1196             : 
    1197           0 :                 r = seccomp_load(seccomp);
    1198           0 :                 if (ERRNO_IS_SECCOMP_FATAL(r))
    1199           0 :                         return r;
    1200           0 :                 if (r < 0)
    1201           0 :                         log_debug_errno(r, "Failed to install namespace restriction rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
    1202             :         }
    1203             : 
    1204           0 :         return 0;
    1205             : }
    1206             : 
    1207           0 : int seccomp_protect_sysctl(void) {
    1208             :         uint32_t arch;
    1209             :         int r;
    1210             : 
    1211           0 :         SECCOMP_FOREACH_LOCAL_ARCH(arch) {
    1212           0 :                 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
    1213             : 
    1214           0 :                 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
    1215             : 
    1216           0 :                 if (IN_SET(arch, SCMP_ARCH_X32, SCMP_ARCH_AARCH64))
    1217             :                         /* No _sysctl syscall */
    1218           0 :                         continue;
    1219             : 
    1220           0 :                 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
    1221           0 :                 if (r < 0)
    1222           0 :                         return r;
    1223             : 
    1224           0 :                 r = seccomp_rule_add_exact(
    1225             :                                 seccomp,
    1226             :                                 SCMP_ACT_ERRNO(EPERM),
    1227             :                                 SCMP_SYS(_sysctl),
    1228             :                                 0);
    1229           0 :                 if (r < 0) {
    1230           0 :                         log_debug_errno(r, "Failed to add _sysctl() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
    1231           0 :                         continue;
    1232             :                 }
    1233             : 
    1234           0 :                 r = seccomp_load(seccomp);
    1235           0 :                 if (ERRNO_IS_SECCOMP_FATAL(r))
    1236           0 :                         return r;
    1237           0 :                 if (r < 0)
    1238           0 :                         log_debug_errno(r, "Failed to install sysctl protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
    1239             :         }
    1240             : 
    1241           0 :         return 0;
    1242             : }
    1243             : 
    1244           0 : int seccomp_restrict_address_families(Set *address_families, bool whitelist) {
    1245             :         uint32_t arch;
    1246             :         int r;
    1247             : 
    1248           0 :         SECCOMP_FOREACH_LOCAL_ARCH(arch) {
    1249           0 :                 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
    1250             :                 bool supported;
    1251             :                 Iterator i;
    1252             : 
    1253           0 :                 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
    1254             : 
    1255           0 :                 switch (arch) {
    1256             : 
    1257           0 :                 case SCMP_ARCH_X86_64:
    1258             :                 case SCMP_ARCH_X32:
    1259             :                 case SCMP_ARCH_ARM:
    1260             :                 case SCMP_ARCH_AARCH64:
    1261             :                 case SCMP_ARCH_PPC:
    1262             :                 case SCMP_ARCH_PPC64:
    1263             :                 case SCMP_ARCH_PPC64LE:
    1264             :                 case SCMP_ARCH_MIPSEL64N32:
    1265             :                 case SCMP_ARCH_MIPS64N32:
    1266             :                 case SCMP_ARCH_MIPSEL64:
    1267             :                 case SCMP_ARCH_MIPS64:
    1268             :                         /* These we know we support (i.e. are the ones that do not use socketcall()) */
    1269           0 :                         supported = true;
    1270           0 :                         break;
    1271             : 
    1272           0 :                 case SCMP_ARCH_S390:
    1273             :                 case SCMP_ARCH_S390X:
    1274             :                 case SCMP_ARCH_X86:
    1275             :                 case SCMP_ARCH_MIPSEL:
    1276             :                 case SCMP_ARCH_MIPS:
    1277             :                 default:
    1278             :                         /* These we either know we don't support (i.e. are the ones that do use socketcall()), or we
    1279             :                          * don't know */
    1280           0 :                         supported = false;
    1281           0 :                         break;
    1282             :                 }
    1283             : 
    1284           0 :                 if (!supported)
    1285           0 :                         continue;
    1286             : 
    1287           0 :                 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
    1288           0 :                 if (r < 0)
    1289           0 :                         return r;
    1290             : 
    1291           0 :                 if (whitelist) {
    1292           0 :                         int af, first = 0, last = 0;
    1293             :                         void *afp;
    1294             : 
    1295             :                         /* If this is a whitelist, we first block the address families that are out of range and then
    1296             :                          * everything that is not in the set. First, we find the lowest and highest address family in
    1297             :                          * the set. */
    1298             : 
    1299           0 :                         SET_FOREACH(afp, address_families, i) {
    1300           0 :                                 af = PTR_TO_INT(afp);
    1301             : 
    1302           0 :                                 if (af <= 0 || af >= af_max())
    1303           0 :                                         continue;
    1304             : 
    1305           0 :                                 if (first == 0 || af < first)
    1306           0 :                                         first = af;
    1307             : 
    1308           0 :                                 if (last == 0 || af > last)
    1309           0 :                                         last = af;
    1310             :                         }
    1311             : 
    1312           0 :                         assert((first == 0) == (last == 0));
    1313             : 
    1314           0 :                         if (first == 0) {
    1315             : 
    1316             :                                 /* No entries in the valid range, block everything */
    1317           0 :                                 r = seccomp_rule_add_exact(
    1318             :                                                 seccomp,
    1319             :                                                 SCMP_ACT_ERRNO(EAFNOSUPPORT),
    1320             :                                                 SCMP_SYS(socket),
    1321             :                                                 0);
    1322           0 :                                 if (r < 0) {
    1323           0 :                                         log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
    1324           0 :                                         continue;
    1325             :                                 }
    1326             : 
    1327             :                         } else {
    1328             : 
    1329             :                                 /* Block everything below the first entry */
    1330           0 :                                 r = seccomp_rule_add_exact(
    1331             :                                                 seccomp,
    1332             :                                                 SCMP_ACT_ERRNO(EAFNOSUPPORT),
    1333             :                                                 SCMP_SYS(socket),
    1334             :                                                 1,
    1335           0 :                                                 SCMP_A0(SCMP_CMP_LT, first));
    1336           0 :                                 if (r < 0) {
    1337           0 :                                         log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
    1338           0 :                                         continue;
    1339             :                                 }
    1340             : 
    1341             :                                 /* Block everything above the last entry */
    1342           0 :                                 r = seccomp_rule_add_exact(
    1343             :                                                 seccomp,
    1344             :                                                 SCMP_ACT_ERRNO(EAFNOSUPPORT),
    1345             :                                                 SCMP_SYS(socket),
    1346             :                                                 1,
    1347           0 :                                                 SCMP_A0(SCMP_CMP_GT, last));
    1348           0 :                                 if (r < 0) {
    1349           0 :                                         log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
    1350           0 :                                         continue;
    1351             :                                 }
    1352             : 
    1353             :                                 /* Block everything between the first and last entry */
    1354           0 :                                 for (af = 1; af < af_max(); af++) {
    1355             : 
    1356           0 :                                         if (set_contains(address_families, INT_TO_PTR(af)))
    1357           0 :                                                 continue;
    1358             : 
    1359           0 :                                         r = seccomp_rule_add_exact(
    1360             :                                                         seccomp,
    1361             :                                                         SCMP_ACT_ERRNO(EAFNOSUPPORT),
    1362             :                                                         SCMP_SYS(socket),
    1363             :                                                         1,
    1364           0 :                                                         SCMP_A0(SCMP_CMP_EQ, af));
    1365           0 :                                         if (r < 0)
    1366           0 :                                                 break;
    1367             :                                 }
    1368           0 :                                 if (r < 0) {
    1369           0 :                                         log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
    1370           0 :                                         continue;
    1371             :                                 }
    1372             :                         }
    1373             : 
    1374             :                 } else {
    1375             :                         void *af;
    1376             : 
    1377             :                         /* If this is a blacklist, then generate one rule for
    1378             :                          * each address family that are then combined in OR
    1379             :                          * checks. */
    1380             : 
    1381           0 :                         SET_FOREACH(af, address_families, i) {
    1382             : 
    1383           0 :                                 r = seccomp_rule_add_exact(
    1384             :                                                 seccomp,
    1385             :                                                 SCMP_ACT_ERRNO(EAFNOSUPPORT),
    1386             :                                                 SCMP_SYS(socket),
    1387             :                                                 1,
    1388           0 :                                                 SCMP_A0(SCMP_CMP_EQ, PTR_TO_INT(af)));
    1389           0 :                                 if (r < 0)
    1390           0 :                                         break;
    1391             :                         }
    1392           0 :                         if (r < 0) {
    1393           0 :                                 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
    1394           0 :                                 continue;
    1395             :                         }
    1396             :                 }
    1397             : 
    1398           0 :                 r = seccomp_load(seccomp);
    1399           0 :                 if (ERRNO_IS_SECCOMP_FATAL(r))
    1400           0 :                         return r;
    1401           0 :                 if (r < 0)
    1402           0 :                         log_debug_errno(r, "Failed to install socket family rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
    1403             :         }
    1404             : 
    1405           0 :         return 0;
    1406             : }
    1407             : 
    1408           0 : int seccomp_restrict_realtime(void) {
    1409             :         static const int permitted_policies[] = {
    1410             :                 SCHED_OTHER,
    1411             :                 SCHED_BATCH,
    1412             :                 SCHED_IDLE,
    1413             :         };
    1414             : 
    1415           0 :         int r, max_policy = 0;
    1416             :         uint32_t arch;
    1417             :         unsigned i;
    1418             : 
    1419             :         /* Determine the highest policy constant we want to allow */
    1420           0 :         for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
    1421           0 :                 if (permitted_policies[i] > max_policy)
    1422           0 :                         max_policy = permitted_policies[i];
    1423             : 
    1424           0 :         SECCOMP_FOREACH_LOCAL_ARCH(arch) {
    1425           0 :                 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
    1426             :                 int p;
    1427             : 
    1428           0 :                 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
    1429             : 
    1430           0 :                 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
    1431           0 :                 if (r < 0)
    1432           0 :                         return r;
    1433             : 
    1434             :                 /* Go through all policies with lower values than that, and block them -- unless they appear in the
    1435             :                  * whitelist. */
    1436           0 :                 for (p = 0; p < max_policy; p++) {
    1437           0 :                         bool good = false;
    1438             : 
    1439             :                         /* Check if this is in the whitelist. */
    1440           0 :                         for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
    1441           0 :                                 if (permitted_policies[i] == p) {
    1442           0 :                                         good = true;
    1443           0 :                                         break;
    1444             :                                 }
    1445             : 
    1446           0 :                         if (good)
    1447           0 :                                 continue;
    1448             : 
    1449             :                         /* Deny this policy */
    1450           0 :                         r = seccomp_rule_add_exact(
    1451             :                                         seccomp,
    1452             :                                         SCMP_ACT_ERRNO(EPERM),
    1453             :                                         SCMP_SYS(sched_setscheduler),
    1454             :                                         1,
    1455           0 :                                         SCMP_A1(SCMP_CMP_EQ, p));
    1456           0 :                         if (r < 0) {
    1457           0 :                                 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
    1458           0 :                                 continue;
    1459             :                         }
    1460             :                 }
    1461             : 
    1462             :                 /* Blacklist all other policies, i.e. the ones with higher values. Note that all comparisons are
    1463             :                  * unsigned here, hence no need no check for < 0 values. */
    1464           0 :                 r = seccomp_rule_add_exact(
    1465             :                                 seccomp,
    1466             :                                 SCMP_ACT_ERRNO(EPERM),
    1467             :                                 SCMP_SYS(sched_setscheduler),
    1468             :                                 1,
    1469           0 :                                 SCMP_A1(SCMP_CMP_GT, max_policy));
    1470           0 :                 if (r < 0) {
    1471           0 :                         log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
    1472           0 :                         continue;
    1473             :                 }
    1474             : 
    1475           0 :                 r = seccomp_load(seccomp);
    1476           0 :                 if (ERRNO_IS_SECCOMP_FATAL(r))
    1477           0 :                         return r;
    1478           0 :                 if (r < 0)
    1479           0 :                         log_debug_errno(r, "Failed to install realtime protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
    1480             :         }
    1481             : 
    1482           0 :         return 0;
    1483             : }
    1484             : 
    1485           0 : static int add_seccomp_syscall_filter(scmp_filter_ctx seccomp,
    1486             :                                       uint32_t arch,
    1487             :                                       int nr,
    1488             :                                       unsigned arg_cnt,
    1489             :                                       const struct scmp_arg_cmp arg) {
    1490             :         int r;
    1491             : 
    1492           0 :         r = seccomp_rule_add_exact(seccomp, SCMP_ACT_ERRNO(EPERM), nr, arg_cnt, arg);
    1493           0 :         if (r < 0) {
    1494           0 :                 _cleanup_free_ char *n = NULL;
    1495             : 
    1496           0 :                 n = seccomp_syscall_resolve_num_arch(arch, nr);
    1497           0 :                 log_debug_errno(r, "Failed to add %s() rule for architecture %s, skipping: %m",
    1498             :                                 strna(n),
    1499             :                                 seccomp_arch_to_string(arch));
    1500             :         }
    1501             : 
    1502           0 :         return r;
    1503             : }
    1504             : 
    1505             : /* For known architectures, verify at compile time that the shm*() syscalls resolve to positive syscall numbers. */
    1506             : #if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__)
    1507             : assert_cc(SCMP_SYS(shmget) > 0);
    1508             : assert_cc(SCMP_SYS(shmat) > 0);
    1509             : assert_cc(SCMP_SYS(shmdt) > 0);
    1510             : #endif
    1511             : 
    1512           0 : int seccomp_memory_deny_write_execute(void) {
    1513             :         uint32_t arch;
    1514             :         int r;
    1515             : 
    1516           0 :         SECCOMP_FOREACH_LOCAL_ARCH(arch) {
    1517           0 :                 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
    1518           0 :                 int filter_syscall = 0, block_syscall = 0, shmat_syscall = 0;
    1519             : 
    1520           0 :                 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
    1521             : 
    1522           0 :                 switch (arch) {
    1523             : 
    1524           0 :                 case SCMP_ARCH_X86:
    1525             :                 case SCMP_ARCH_S390:
    1526           0 :                         filter_syscall = SCMP_SYS(mmap2);
    1527           0 :                         block_syscall = SCMP_SYS(mmap);
    1528           0 :                         shmat_syscall = SCMP_SYS(shmat);
    1529           0 :                         break;
    1530             : 
    1531           0 :                 case SCMP_ARCH_PPC:
    1532             :                 case SCMP_ARCH_PPC64:
    1533             :                 case SCMP_ARCH_PPC64LE:
    1534           0 :                         filter_syscall = SCMP_SYS(mmap);
    1535             : 
    1536             :                         /* Note that shmat() isn't available, and the call is multiplexed through ipc().
    1537             :                          * We ignore that here, which means there's still a way to get writable/executable
    1538             :                          * memory, if an IPC key is mapped like this. That's a pity, but no total loss. */
    1539             : 
    1540           0 :                         break;
    1541             : 
    1542           0 :                 case SCMP_ARCH_ARM:
    1543           0 :                         filter_syscall = SCMP_SYS(mmap2); /* arm has only mmap2 */
    1544           0 :                         shmat_syscall = SCMP_SYS(shmat);
    1545           0 :                         break;
    1546             : 
    1547           0 :                 case SCMP_ARCH_X86_64:
    1548             :                 case SCMP_ARCH_X32:
    1549             :                 case SCMP_ARCH_AARCH64:
    1550             :                 case SCMP_ARCH_S390X:
    1551           0 :                         filter_syscall = SCMP_SYS(mmap); /* amd64, x32, s390x, and arm64 have only mmap */
    1552           0 :                         shmat_syscall = SCMP_SYS(shmat);
    1553           0 :                         break;
    1554             : 
    1555             :                 /* Please add more definitions here, if you port systemd to other architectures! */
    1556             : 
    1557             : #if !defined(__i386__) && !defined(__x86_64__) && !defined(__powerpc__) && !defined(__powerpc64__) && !defined(__arm__) && !defined(__aarch64__) && !defined(__s390__) && !defined(__s390x__)
    1558             : #warning "Consider adding the right mmap() syscall definitions here!"
    1559             : #endif
    1560             :                 }
    1561             : 
    1562             :                 /* Can't filter mmap() on this arch, then skip it */
    1563           0 :                 if (filter_syscall == 0)
    1564           0 :                         continue;
    1565             : 
    1566           0 :                 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
    1567           0 :                 if (r < 0)
    1568           0 :                         return r;
    1569             : 
    1570           0 :                 r = add_seccomp_syscall_filter(seccomp, arch, filter_syscall,
    1571             :                                                1,
    1572           0 :                                                SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC|PROT_WRITE, PROT_EXEC|PROT_WRITE));
    1573           0 :                 if (r < 0)
    1574           0 :                         continue;
    1575             : 
    1576           0 :                 if (block_syscall != 0) {
    1577           0 :                         r = add_seccomp_syscall_filter(seccomp, arch, block_syscall, 0, (const struct scmp_arg_cmp){} );
    1578           0 :                         if (r < 0)
    1579           0 :                                 continue;
    1580             :                 }
    1581             : 
    1582           0 :                 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(mprotect),
    1583             :                                                1,
    1584           0 :                                                SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
    1585           0 :                 if (r < 0)
    1586           0 :                         continue;
    1587             : 
    1588             : #ifdef __NR_pkey_mprotect
    1589           0 :                 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(pkey_mprotect),
    1590             :                                                1,
    1591           0 :                                                SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
    1592           0 :                 if (r < 0)
    1593           0 :                         continue;
    1594             : #endif
    1595             : 
    1596           0 :                 if (shmat_syscall > 0) {
    1597           0 :                         r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(shmat),
    1598             :                                                        1,
    1599           0 :                                                        SCMP_A2(SCMP_CMP_MASKED_EQ, SHM_EXEC, SHM_EXEC));
    1600           0 :                         if (r < 0)
    1601           0 :                                 continue;
    1602             :                 }
    1603             : 
    1604           0 :                 r = seccomp_load(seccomp);
    1605           0 :                 if (ERRNO_IS_SECCOMP_FATAL(r))
    1606           0 :                         return r;
    1607           0 :                 if (r < 0)
    1608           0 :                         log_debug_errno(r, "Failed to install MemoryDenyWriteExecute= rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
    1609             :         }
    1610             : 
    1611           0 :         return 0;
    1612             : }
    1613             : 
    1614           0 : int seccomp_restrict_archs(Set *archs) {
    1615           0 :         _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
    1616             :         Iterator i;
    1617             :         void *id;
    1618             :         int r;
    1619             : 
    1620             :         /* This installs a filter with no rules, but that restricts the system call architectures to the specified
    1621             :          * list.
    1622             :          *
    1623             :          * There are some qualifications. However the most important use is to stop processes from bypassing
    1624             :          * system call restrictions, in case they used a broader (multiplexing) syscall which is only available
    1625             :          * in a non-native architecture. There are no holes in this use case, at least so far.
                     :          *
                     :          * Returns -ENOMEM if the filter context cannot be allocated, a negative error if adding an architecture
                     :          * or setting the NNP attribute fails, and the load error if it is one ERRNO_IS_SECCOMP_FATAL() matches.
                     :          * Returns 0 otherwise, including when loading failed non-fatally (that is only logged at debug level). */
    1626             : 
    1627             :         /* Note libseccomp includes our "native" (current) architecture in the filter by default.
    1628             :          * We do not remove it. For example, our callers expect to be able to call execve() afterwards
    1629             :          * to run a program with the restrictions applied. */
    1630           0 :         seccomp = seccomp_init(SCMP_ACT_ALLOW);
    1631           0 :         if (!seccomp)
    1632           0 :                 return -ENOMEM;
    1633             :         /* Set entries are pointer-encoded as (architecture ID + 1), hence the "- 1". An architecture that is already part of the filter (-EEXIST) is not an error. */
    1634           0 :         SET_FOREACH(id, archs, i) {
    1635           0 :                 r = seccomp_arch_add(seccomp, PTR_TO_UINT32(id) - 1);
    1636           0 :                 if (r < 0 && r != -EEXIST)
    1637           0 :                         return r;
    1638             :         }
    1639             : 
    1640             :         /* The vdso for x32 assumes that x86-64 syscalls are available.  Let's allow them, since
    1641             :          * x32 syscalls should basically match x86-64 for everything except the pointer type.
    1642             :          * The important thing is that you can block the old 32-bit x86 syscalls.
    1643             :          * https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=850047 */
    1644             : 
    1645           0 :         if (seccomp_arch_native() == SCMP_ARCH_X32 ||
    1646           0 :             set_contains(archs, UINT32_TO_PTR(SCMP_ARCH_X32 + 1))) {
    1647             : 
    1648           0 :                 r = seccomp_arch_add(seccomp, SCMP_ARCH_X86_64);
    1649           0 :                 if (r < 0 && r != -EEXIST)
    1650           0 :                         return r;
    1651             :         }
    1652             :         /* NOTE(review): setting SCMP_FLTATR_CTL_NNP to 0 presumably keeps seccomp_load() from enabling no_new_privs itself; confirm against the libseccomp documentation. */
    1653           0 :         r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
    1654           0 :         if (r < 0)
    1655           0 :                 return r;
    1656             :         /* Only load errors matched by ERRNO_IS_SECCOMP_FATAL() are propagated; any other load failure is logged and the function still returns 0. */
    1657           0 :         r = seccomp_load(seccomp);
    1658           0 :         if (ERRNO_IS_SECCOMP_FATAL(r))
    1659           0 :                 return r;
    1660           0 :         if (r < 0)
    1661           0 :                 log_debug_errno(r, "Failed to restrict system call architectures, skipping: %m");
    1662             : 
    1663           0 :         return 0;
    1664             : }
    1665             : 
    1666           0 : int parse_syscall_archs(char **l, Set **archs) {
    1667           0 :         _cleanup_set_free_ Set *_archs = NULL;
    1668             :         char **s;
    1669             :         int r;
    1670             :         /* Converts the architecture names in the string vector "l" into a newly allocated set and hands it to the
                     :          * caller through "*archs". Each architecture ID is stored pointer-encoded as (ID + 1).
                     :          *
                     :          * Returns 0 on success, the error of set_ensure_allocated() if the set cannot be allocated, -EINVAL if
                     :          * a name is not accepted by seccomp_arch_from_string(), and -ENOMEM if inserting into the set fails. On
                     :          * failure "*archs" is left untouched.
                     :          *
                     :          * "_archs" must start out as NULL: its address is passed to set_ensure_allocated(), and the cleanup
                     :          * handler attached to it runs on every return path, including the early error returns. */
    1671           0 :         assert(l);
    1672           0 :         assert(archs);
    1673             : 
    1674           0 :         r = set_ensure_allocated(&_archs, NULL);
    1675           0 :         if (r < 0)
    1676           0 :                 return r;
    1677             : 
    1678           0 :         STRV_FOREACH(s, l) {
    1679             :                 uint32_t a;
    1680             : 
    1681           0 :                 r = seccomp_arch_from_string(*s, &a);
    1682           0 :                 if (r < 0)
    1683           0 :                         return -EINVAL;
    1684             : 
    1685           0 :                 r = set_put(_archs, UINT32_TO_PTR(a + 1));
    1686           0 :                 if (r < 0)
    1687           0 :                         return -ENOMEM;
    1688             :         }
    1689             :         /* Success: pass the set on to the caller, so the cleanup handler has nothing left to free. */
    1690           0 :         *archs = TAKE_PTR(_archs);
    1691             : 
    1692           0 :         return 0;
    1693             : }
    1694             : 
    1695           0 : int seccomp_filter_set_add(Hashmap *filter, bool add, const SyscallFilterSet *set) {
    1696             :         const char *i;
    1697             :         int r;
    1698             :         /* Applies every entry of set->value to "filter". An entry starting with '@' names another filter set, which
                     :          * is looked up and processed recursively; any other entry is resolved as a system call name. With "add"
                     :          * true the resolved call is inserted under the key (id + 1) with the value -1, otherwise that key is
                     :          * removed.
                     :          *
                     :          * Returns 0 on success, -ENXIO if a referenced '@' set is unknown, or a negative error from the recursion
                     :          * or from hashmap_put(). System call names that cannot be resolved are logged and skipped. */
    1699           0 :         assert(set);
    1700             : 
    1701           0 :         NULSTR_FOREACH(i, set->value) {
    1702             : 
    1703           0 :                 if (i[0] == '@') {
    1704             :                         const SyscallFilterSet *more;
    1705             : 
    1706           0 :                         more = syscall_filter_set_find(i);
    1707           0 :                         if (!more)
    1708           0 :                                 return -ENXIO;
    1709             : 
    1710           0 :                         r = seccomp_filter_set_add(filter, add, more);
    1711           0 :                         if (r < 0)
    1712           0 :                                 return r;
    1713             :                 } else {
    1714             :                         int id;
    1715             : 
    1716           0 :                         id = seccomp_syscall_resolve_name(i);
    1717           0 :                         if (id == __NR_SCMP_ERROR) {
    1718           0 :                                 log_debug("Couldn't resolve system call, ignoring: %s", i);
    1719           0 :                                 continue;
    1720             :                         }
    1721             :                         /* The key is stored as (id + 1); the result of the removal is deliberately ignored. */
    1722           0 :                         if (add) {
    1723           0 :                                 r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(-1));
    1724           0 :                                 if (r < 0)
    1725           0 :                                         return r;
    1726             :                         } else
    1727           0 :                                 (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
    1728             :                 }
    1729             :         }
    1730             : 
    1731           0 :         return 0;
    1732             : }
    1733             : 
    1734           0 : int seccomp_lock_personality(unsigned long personality) {
    1735             :         uint32_t arch;
    1736             :         int r;
    1737             :         /* For every local architecture, installs a filter that makes personality() fail with EPERM unless its first
                     :          * argument equals "personality".
                     :          *
                     :          * Returns -EINVAL if "personality" is PERSONALITY_INVALID or larger, the error of seccomp_init_for_arch()
                     :          * if a filter context cannot be set up, and the load error if ERRNO_IS_SECCOMP_FATAL() matches it.
                     :          * Returns 0 otherwise: failing to add the rule or a non-fatal load failure is only logged, and that
                     :          * architecture is skipped. */
    1738           0 :         if (personality >= PERSONALITY_INVALID)
    1739           0 :                 return -EINVAL;
    1740             : 
    1741           0 :         SECCOMP_FOREACH_LOCAL_ARCH(arch) {
    1742           0 :                 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
    1743             : 
    1744           0 :                 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
    1745           0 :                 if (r < 0)
    1746           0 :                         return r;
    1747             : 
    1748           0 :                 r = seccomp_rule_add_exact(
    1749             :                                 seccomp,
    1750             :                                 SCMP_ACT_ERRNO(EPERM),
    1751             :                                 SCMP_SYS(personality),
    1752             :                                 1,
    1753           0 :                                 SCMP_A0(SCMP_CMP_NE, personality));
    1754           0 :                 if (r < 0) {
    1755           0 :                         log_debug_errno(r, "Failed to add personality rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
    1756           0 :                         continue;
    1757             :                 }
    1758             : 
    1759           0 :                 r = seccomp_load(seccomp);
    1760           0 :                 if (ERRNO_IS_SECCOMP_FATAL(r))
    1761           0 :                         return r;
    1762           0 :                 if (r < 0)
    1763           0 :                         log_debug_errno(r, "Failed to enable personality lock for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
    1764             :         }
    1765             : 
    1766           0 :         return 0;
    1767             : }
    1768             : 
    1769           0 : int seccomp_protect_hostname(void) {
    1770             :         uint32_t arch;
    1771             :         int r;
    1772             :         /* For every local architecture, installs a filter that makes sethostname() and setdomainname() fail with
                     :          * EPERM, regardless of their arguments.
                     :          *
                     :          * Returns the error of seccomp_init_for_arch() if a filter context cannot be set up, and the load error
                     :          * if ERRNO_IS_SECCOMP_FATAL() matches it. Returns 0 otherwise: if either rule cannot be added, or the
                     :          * load fails non-fatally, this is logged at debug level and that architecture is skipped. */
    1773           0 :         SECCOMP_FOREACH_LOCAL_ARCH(arch) {
    1774           0 :                 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
    1775             : 
    1776           0 :                 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
    1777           0 :                 if (r < 0)
    1778           0 :                         return r;
    1779             : 
    1780           0 :                 r = seccomp_rule_add_exact(
    1781             :                                 seccomp,
    1782             :                                 SCMP_ACT_ERRNO(EPERM),
    1783             :                                 SCMP_SYS(sethostname),
    1784             :                                 0);
    1785           0 :                 if (r < 0) {
    1786           0 :                         log_debug_errno(r, "Failed to add sethostname() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
    1787           0 :                         continue;
    1788             :                 }
    1789             : 
    1790           0 :                 r = seccomp_rule_add_exact(
    1791             :                                 seccomp,
    1792             :                                 SCMP_ACT_ERRNO(EPERM),
    1793             :                                 SCMP_SYS(setdomainname),
    1794             :                                 0);
    1795           0 :                 if (r < 0) {
    1796           0 :                         log_debug_errno(r, "Failed to add setdomainname() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
    1797           0 :                         continue;
    1798             :                 }
    1799             : 
    1800           0 :                 r = seccomp_load(seccomp);
    1801           0 :                 if (ERRNO_IS_SECCOMP_FATAL(r))
    1802           0 :                         return r;
    1803           0 :                 if (r < 0)
    1804           0 :                         log_debug_errno(r, "Failed to apply hostname restrictions for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
    1805             :         }
    1806             : 
    1807           0 :         return 0;
    1808             : }
    1809             : 
    1810           0 : static int seccomp_restrict_sxid(scmp_filter_ctx seccomp, mode_t m) {
    1811             :         /* Checks the mode_t parameter of the following system calls:
    1812             :          *
    1813             :          *       → chmod() + fchmod() + fchmodat()
    1814             :          *       → open() + creat() + openat()
    1815             :          *       → mkdir() + mkdirat()
    1816             :          *       → mknod() + mknodat()
    1817             :          *
                     :          * For each call a rule is added to "seccomp" that returns EPERM when the mode argument, masked with "m",
                     :          * equals "m" (SCMP_CMP_MASKED_EQ). For open() and openat() the rule additionally requires O_CREAT to be
                     :          * set in the flags argument. A rule that cannot be added is logged at debug level and skipped.
                     :          *
    1818             :          * Returns error if *everything* failed, and 0 otherwise.
    1819             :          */
    1820           0 :         int r = 0;
    1821           0 :         bool any = false;
    1822             : 
    1823           0 :         r = seccomp_rule_add_exact(
    1824             :                         seccomp,
    1825             :                         SCMP_ACT_ERRNO(EPERM),
    1826             :                         SCMP_SYS(chmod),
    1827             :                         1,
    1828           0 :                         SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
    1829           0 :         if (r < 0)
    1830           0 :                 log_debug_errno(r, "Failed to add filter for chmod: %m");
    1831             :         else
    1832           0 :                 any = true;
    1833             : 
    1834           0 :         r = seccomp_rule_add_exact(
    1835             :                         seccomp,
    1836             :                         SCMP_ACT_ERRNO(EPERM),
    1837             :                         SCMP_SYS(fchmod),
    1838             :                         1,
    1839           0 :                         SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
    1840           0 :         if (r < 0)
    1841           0 :                 log_debug_errno(r, "Failed to add filter for fchmod: %m");
    1842             :         else
    1843           0 :                 any = true;
    1844             : 
    1845           0 :         r = seccomp_rule_add_exact(
    1846             :                         seccomp,
    1847             :                         SCMP_ACT_ERRNO(EPERM),
    1848             :                         SCMP_SYS(fchmodat),
    1849             :                         1,
    1850           0 :                         SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
    1851           0 :         if (r < 0)
    1852           0 :                 log_debug_errno(r, "Failed to add filter for fchmodat: %m");
    1853             :         else
    1854           0 :                 any = true;
    1855             : 
    1856           0 :         r = seccomp_rule_add_exact(
    1857             :                         seccomp,
    1858             :                         SCMP_ACT_ERRNO(EPERM),
    1859             :                         SCMP_SYS(mkdir),
    1860             :                         1,
    1861           0 :                         SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
    1862           0 :         if (r < 0)
    1863           0 :                 log_debug_errno(r, "Failed to add filter for mkdir: %m");
    1864             :         else
    1865           0 :                 any = true;
    1866             : 
    1867           0 :         r = seccomp_rule_add_exact(
    1868             :                         seccomp,
    1869             :                         SCMP_ACT_ERRNO(EPERM),
    1870             :                         SCMP_SYS(mkdirat),
    1871             :                         1,
    1872           0 :                         SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
    1873           0 :         if (r < 0)
    1874           0 :                 log_debug_errno(r, "Failed to add filter for mkdirat: %m");
    1875             :         else
    1876           0 :                 any = true;
    1877             : 
    1878           0 :         r = seccomp_rule_add_exact(
    1879             :                         seccomp,
    1880             :                         SCMP_ACT_ERRNO(EPERM),
    1881             :                         SCMP_SYS(mknod),
    1882             :                         1,
    1883           0 :                         SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
    1884           0 :         if (r < 0)
    1885           0 :                 log_debug_errno(r, "Failed to add filter for mknod: %m");
    1886             :         else
    1887           0 :                 any = true;
    1888             : 
    1889           0 :         r = seccomp_rule_add_exact(
    1890             :                         seccomp,
    1891             :                         SCMP_ACT_ERRNO(EPERM),
    1892             :                         SCMP_SYS(mknodat),
    1893             :                         1,
    1894           0 :                         SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
    1895           0 :         if (r < 0)
    1896           0 :                 log_debug_errno(r, "Failed to add filter for mknodat: %m");
    1897             :         else
    1898           0 :                 any = true;
    1899             :         /* The open() rule is compiled in only where SCMP_SYS(open) evaluates to a positive number at build time. */
    1900             : #if SCMP_SYS(open) > 0
    1901           0 :         r = seccomp_rule_add_exact(
    1902             :                         seccomp,
    1903             :                         SCMP_ACT_ERRNO(EPERM),
    1904             :                         SCMP_SYS(open),
    1905             :                         2,
    1906           0 :                         SCMP_A1(SCMP_CMP_MASKED_EQ, O_CREAT, O_CREAT),
    1907           0 :                         SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
    1908           0 :         if (r < 0)
    1909           0 :                 log_debug_errno(r, "Failed to add filter for open: %m");
    1910             :         else
    1911           0 :                 any = true;
    1912             : #endif
    1913             : 
    1914           0 :         r = seccomp_rule_add_exact(
    1915             :                         seccomp,
    1916             :                         SCMP_ACT_ERRNO(EPERM),
    1917             :                         SCMP_SYS(openat),
    1918             :                         2,
    1919           0 :                         SCMP_A2(SCMP_CMP_MASKED_EQ, O_CREAT, O_CREAT),
    1920           0 :                         SCMP_A3(SCMP_CMP_MASKED_EQ, m, m));
    1921           0 :         if (r < 0)
    1922           0 :                 log_debug_errno(r, "Failed to add filter for openat: %m");
    1923             :         else
    1924           0 :                 any = true;
    1925             : 
    1926           0 :         r = seccomp_rule_add_exact(
    1927             :                         seccomp,
    1928             :                         SCMP_ACT_ERRNO(EPERM),
    1929             :                         SCMP_SYS(creat),
    1930             :                         1,
    1931           0 :                         SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
    1932           0 :         if (r < 0)
    1933           0 :                 log_debug_errno(r, "Failed to add filter for creat: %m");
    1934             :         else
    1935           0 :                 any = true;
    1936             :         /* If no rule at all could be added, "r" still holds the error of the last attempt (creat) and is returned. */
    1937           0 :         return any ? 0 : r;
    1938             : }
    1939             : 
    1940           0 : int seccomp_restrict_suid_sgid(void) {
    1941             :         uint32_t arch;
    1942             :         int r, k;
    1943             :         /* For every local architecture, installs a filter built by two passes of seccomp_restrict_sxid(): one for
                     :          * S_ISUID and one for S_ISGID. If both passes fail, nothing is loaded for that architecture.
                     :          *
                     :          * Returns the error of seccomp_init_for_arch() if a filter context cannot be set up, and the load error
                     :          * if ERRNO_IS_SECCOMP_FATAL() matches it. Returns 0 otherwise; rule and non-fatal load failures are only
                     :          * logged at debug level. */
    1944           0 :         SECCOMP_FOREACH_LOCAL_ARCH(arch) {
    1945           0 :                 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
    1946             : 
    1947           0 :                 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
    1948           0 :                 if (r < 0)
    1949           0 :                         return r;
    1950             : 
    1951           0 :                 r = seccomp_restrict_sxid(seccomp, S_ISUID);
    1952           0 :                 if (r < 0)
    1953           0 :                         log_debug_errno(r, "Failed to add suid rule for architecture %s, ignoring: %m", seccomp_arch_to_string(arch));
    1954             :                 /* The sgid result lives in "k", so that is the error that has to be logged here, not the suid result "r". */
    1955           0 :                 k = seccomp_restrict_sxid(seccomp, S_ISGID);
    1956           0 :                 if (k < 0)
    1957           0 :                         log_debug_errno(k, "Failed to add sgid rule for architecture %s, ignoring: %m", seccomp_arch_to_string(arch));
    1958             : 
    1959           0 :                 if (r < 0 && k < 0)
    1960           0 :                         continue;
    1961             : 
    1962           0 :                 r = seccomp_load(seccomp);
    1963           0 :                 if (ERRNO_IS_SECCOMP_FATAL(r))
    1964           0 :                         return r;
    1965           0 :                 if (r < 0)
    1966           0 :                         log_debug_errno(r, "Failed to apply suid/sgid restrictions for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
    1967             :         }
    1968             : 
    1969           0 :         return 0;
    1970             : }
    1971             : 
    1972           0 : uint32_t scmp_act_kill_process(void) {
    1973             : 
    1974             :         /* Returns SCMP_ACT_KILL_PROCESS if it's supported, and SCMP_ACT_KILL_THREAD otherwise. We never
    1975             :          * actually want to use SCMP_ACT_KILL_THREAD as its semantics are nuts (killing arbitrary threads of
    1976             :          * a program is just a bad idea), but on old kernels/old libseccomp it is all we have, and at least
    1977             :          * for single-threaded apps does the right thing. */
    1978             :         /* "Supported" is checked twice: the macro must exist at build time, and seccomp_api_get() must report level 3 or higher at runtime. */
    1979             : #ifdef SCMP_ACT_KILL_PROCESS
    1980           0 :         if (seccomp_api_get() >= 3)
    1981           0 :                 return SCMP_ACT_KILL_PROCESS;
    1982             : #endif
    1983             : 
    1984           0 :         return SCMP_ACT_KILL; /* same as SCMP_ACT_KILL_THREAD */
    1985             : }

Generated by: LCOV version 1.14