LCOV - code coverage report
Current view: top level - core - bpf-firewall.c (source / functions) Hit Total Coverage
Test: main_coverage.info Lines: 26 402 6.5 %
Date: 2019-08-22 15:41:25 Functions: 2 17 11.8 %

          Line data    Source code
       1             : /* SPDX-License-Identifier: LGPL-2.1+ */
       2             : 
       3             : #include <arpa/inet.h>
       4             : #include <assert.h>
       5             : #include <errno.h>
       6             : #include <fcntl.h>
       7             : #include <linux/bpf_insn.h>
       8             : #include <net/ethernet.h>
       9             : #include <net/if.h>
      10             : #include <netinet/ip.h>
      11             : #include <netinet/ip6.h>
      12             : #include <stddef.h>
      13             : #include <stdio.h>
      14             : #include <stdlib.h>
      15             : #include <string.h>
      16             : #include <unistd.h>
      17             : 
      18             : #include "alloc-util.h"
      19             : #include "bpf-firewall.h"
      20             : #include "bpf-program.h"
      21             : #include "fd-util.h"
      22             : #include "ip-address-access.h"
      23             : #include "memory-util.h"
      24             : #include "missing_syscall.h"
      25             : #include "unit.h"
      26             : #include "strv.h"
      27             : #include "virt.h"
      28             : 
      29             : enum {
      30             :         MAP_KEY_PACKETS,
      31             :         MAP_KEY_BYTES,
      32             : };
      33             : 
      34             : enum {
      35             :         ACCESS_ALLOWED = 1,
      36             :         ACCESS_DENIED  = 2,
      37             : };
      38             : 
      39             : /* Compile instructions for one list of addresses, one direction and one specific verdict on matches. */
      40             : 
      41           0 : static int add_lookup_instructions(
      42             :                 BPFProgram *p,
      43             :                 int map_fd,
      44             :                 int protocol,
      45             :                 bool is_ingress,
      46             :                 int verdict) {
      47             : 
      48             :         int r, addr_offset, addr_size;
      49             : 
      50           0 :         assert(p);
      51           0 :         assert(map_fd >= 0);
      52             : 
      53           0 :         switch (protocol) {
      54             : 
      55           0 :         case ETH_P_IP:
      56           0 :                 addr_size = sizeof(uint32_t);
      57           0 :                 addr_offset = is_ingress ?
      58             :                         offsetof(struct iphdr, saddr) :
      59             :                         offsetof(struct iphdr, daddr);
      60           0 :                 break;
      61             : 
      62           0 :         case ETH_P_IPV6:
      63           0 :                 addr_size = 4 * sizeof(uint32_t);
      64           0 :                 addr_offset = is_ingress ?
      65             :                         offsetof(struct ip6_hdr, ip6_src.s6_addr) :
      66             :                         offsetof(struct ip6_hdr, ip6_dst.s6_addr);
      67           0 :                 break;
      68             : 
      69           0 :         default:
      70           0 :                 return -EAFNOSUPPORT;
      71             :         }
      72             : 
      73             :         do {
      74             :                 /* Compare IPv4 with one word instruction (32bit) */
      75           0 :                 struct bpf_insn insn[] = {
      76             :                         /* If skb->protocol != ETH_P_IP, skip this whole block. The offset will be set later. */
      77           0 :                         BPF_JMP_IMM(BPF_JNE, BPF_REG_7, htobe16(protocol), 0),
      78             : 
      79             :                         /*
      80             :                          * Call into BPF_FUNC_skb_load_bytes to load the dst/src IP address
      81             :                          *
      82             :                          * R1: Pointer to the skb
      83             :                          * R2: Data offset
      84             :                          * R3: Destination buffer on the stack (r10 - 4)
      85             :                          * R4: Number of bytes to read (4)
      86             :                          */
      87             : 
      88             :                         BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
      89             :                         BPF_MOV32_IMM(BPF_REG_2, addr_offset),
      90             : 
      91             :                         BPF_MOV64_REG(BPF_REG_3, BPF_REG_10),
      92           0 :                         BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, -addr_size),
      93             : 
      94             :                         BPF_MOV32_IMM(BPF_REG_4, addr_size),
      95             :                         BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_skb_load_bytes),
      96             : 
      97             :                         /*
      98             :                          * Call into BPF_FUNC_map_lookup_elem to see if the address matches any entry in the
      99             :                          * LPM trie map. For this to work, the prefixlen field of 'struct bpf_lpm_trie_key'
     100             :                          * has to be set to the maximum possible value.
     101             :                          *
     102             :                          * On success, the looked up value is stored in R0. For this application, the actual
     103             :                          * value doesn't matter, however; we just set the bit in @verdict in R8 if we found any
     104             :                          * matching value.
     105             :                          */
     106             : 
     107           0 :                         BPF_LD_MAP_FD(BPF_REG_1, map_fd),
     108             :                         BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
     109           0 :                         BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -addr_size - sizeof(uint32_t)),
     110           0 :                         BPF_ST_MEM(BPF_W, BPF_REG_2, 0, addr_size * 8),
     111             : 
     112             :                         BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
     113             :                         BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
     114             :                         BPF_ALU32_IMM(BPF_OR, BPF_REG_8, verdict),
     115             :                 };
     116             : 
     117             :                 /* Jump label fixup */
     118           0 :                 insn[0].off = ELEMENTSOF(insn) - 1;
     119             : 
     120           0 :                 r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn));
     121           0 :                 if (r < 0)
     122           0 :                         return r;
     123             : 
     124             :         } while (false);
     125             : 
     126           0 :         return 0;
     127             : }
     128             : 
     129           0 : static int add_instructions_for_ip_any(
     130             :                 BPFProgram *p,
     131             :                 int verdict) {
     132             :         int r;
     133             : 
     134           0 :         assert(p);
     135             : 
     136           0 :         struct bpf_insn insn[] = {
     137             :                 BPF_ALU32_IMM(BPF_OR, BPF_REG_8, verdict),
     138             :         };
     139             : 
     140           0 :         r = bpf_program_add_instructions(p, insn, 1);
     141           0 :         if (r < 0)
     142           0 :                 return r;
     143             : 
     144           0 :         return 0;
     145             : }
     146             : 
     147           0 : static int bpf_firewall_compile_bpf(
     148             :                 Unit *u,
     149             :                 bool is_ingress,
     150             :                 BPFProgram **ret,
     151             :                 bool ip_allow_any,
     152             :                 bool ip_deny_any) {
     153             : 
     154           0 :         struct bpf_insn pre_insn[] = {
     155             :                 /*
     156             :                  * When the eBPF program is entered, R1 contains the address of the skb.
     157             :                  * However, R1-R5 are scratch registers that are not preserved when calling
     158             :                  * into kernel functions, so we need to save anything that's supposed to
     159             :                  * stay around to R6-R9. Save the skb to R6.
     160             :                  */
     161             :                 BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
     162             : 
     163             :                 /*
     164             :                  * Although we cannot access the skb data directly from eBPF programs used in this
     165             :                  * scenario, the kernel has prepared some fields for us to access through struct __sk_buff.
     166             :                  * Load the protocol (IPv4, IPv6) used by the packet in flight once and cache it in R7
     167             :                  * for later use.
     168             :                  */
     169             :                 BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, offsetof(struct __sk_buff, protocol)),
     170             : 
     171             :                 /*
     172             :                  * R8 is used to keep track of whether any address check has explicitly allowed or denied the packet
     173             :                  * through ACCESS_DENIED or ACCESS_ALLOWED bits. Reset them both to 0 in the beginning.
     174             :                  */
     175             :                 BPF_MOV32_IMM(BPF_REG_8, 0),
     176             :         };
     177             : 
     178             :         /*
     179             :          * The access checkers compiled for the configured allowance and denial lists
     180             :          * write to R8 at runtime. The following code prepares for an early exit that
     181             :          * skip the accounting if the packet is denied.
     182             :          *
     183             :          * R0 = 1
     184             :          * if (R8 == ACCESS_DENIED)
     185             :          *     R0 = 0
     186             :          *
     187             :          * This means that if both ACCESS_DENIED and ACCESS_ALLOWED are set, the packet
     188             :          * is allowed to pass.
     189             :          */
     190           0 :         struct bpf_insn post_insn[] = {
     191             :                 BPF_MOV64_IMM(BPF_REG_0, 1),
     192             :                 BPF_JMP_IMM(BPF_JNE, BPF_REG_8, ACCESS_DENIED, 1),
     193             :                 BPF_MOV64_IMM(BPF_REG_0, 0),
     194             :         };
     195             : 
     196           0 :         _cleanup_(bpf_program_unrefp) BPFProgram *p = NULL;
     197             :         int accounting_map_fd, r;
     198             :         bool access_enabled;
     199             : 
     200           0 :         assert(u);
     201           0 :         assert(ret);
     202             : 
     203           0 :         accounting_map_fd = is_ingress ?
     204           0 :                 u->ip_accounting_ingress_map_fd :
     205             :                 u->ip_accounting_egress_map_fd;
     206             : 
     207           0 :         access_enabled =
     208           0 :                 u->ipv4_allow_map_fd >= 0 ||
     209           0 :                 u->ipv6_allow_map_fd >= 0 ||
     210           0 :                 u->ipv4_deny_map_fd >= 0 ||
     211           0 :                 u->ipv6_deny_map_fd >= 0 ||
     212           0 :                 ip_allow_any ||
     213             :                 ip_deny_any;
     214             : 
     215           0 :         if (accounting_map_fd < 0 && !access_enabled) {
     216           0 :                 *ret = NULL;
     217           0 :                 return 0;
     218             :         }
     219             : 
     220           0 :         r = bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, &p);
     221           0 :         if (r < 0)
     222           0 :                 return r;
     223             : 
     224           0 :         r = bpf_program_add_instructions(p, pre_insn, ELEMENTSOF(pre_insn));
     225           0 :         if (r < 0)
     226           0 :                 return r;
     227             : 
     228           0 :         if (access_enabled) {
     229             :                 /*
     230             :                  * The simple rule this function translates into eBPF instructions is:
     231             :                  *
     232             :                  * - Access will be granted when an address matches an entry in @list_allow
     233             :                  * - Otherwise, access will be denied when an address matches an entry in @list_deny
     234             :                  * - Otherwise, access will be granted
     235             :                  */
     236             : 
     237           0 :                 if (u->ipv4_deny_map_fd >= 0) {
     238           0 :                         r = add_lookup_instructions(p, u->ipv4_deny_map_fd, ETH_P_IP, is_ingress, ACCESS_DENIED);
     239           0 :                         if (r < 0)
     240           0 :                                 return r;
     241             :                 }
     242             : 
     243           0 :                 if (u->ipv6_deny_map_fd >= 0) {
     244           0 :                         r = add_lookup_instructions(p, u->ipv6_deny_map_fd, ETH_P_IPV6, is_ingress, ACCESS_DENIED);
     245           0 :                         if (r < 0)
     246           0 :                                 return r;
     247             :                 }
     248             : 
     249           0 :                 if (u->ipv4_allow_map_fd >= 0) {
     250           0 :                         r = add_lookup_instructions(p, u->ipv4_allow_map_fd, ETH_P_IP, is_ingress, ACCESS_ALLOWED);
     251           0 :                         if (r < 0)
     252           0 :                                 return r;
     253             :                 }
     254             : 
     255           0 :                 if (u->ipv6_allow_map_fd >= 0) {
     256           0 :                         r = add_lookup_instructions(p, u->ipv6_allow_map_fd, ETH_P_IPV6, is_ingress, ACCESS_ALLOWED);
     257           0 :                         if (r < 0)
     258           0 :                                 return r;
     259             :                 }
     260             : 
     261           0 :                 if (ip_allow_any) {
     262           0 :                         r = add_instructions_for_ip_any(p, ACCESS_ALLOWED);
     263           0 :                         if (r < 0)
     264           0 :                                 return r;
     265             :                 }
     266             : 
     267           0 :                 if (ip_deny_any) {
     268           0 :                         r = add_instructions_for_ip_any(p, ACCESS_DENIED);
     269           0 :                         if (r < 0)
     270           0 :                                 return r;
     271             :                 }
     272             :         }
     273             : 
     274           0 :         r = bpf_program_add_instructions(p, post_insn, ELEMENTSOF(post_insn));
     275           0 :         if (r < 0)
     276           0 :                 return r;
     277             : 
     278           0 :         if (accounting_map_fd >= 0) {
     279           0 :                 struct bpf_insn insn[] = {
     280             :                         /*
     281             :                          * If R0 == 0, the packet will be denied; skip the accounting instructions in this case.
     282             :                          * The jump label will be fixed up later.
     283             :                          */
     284             :                         BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 0),
     285             : 
     286             :                         /* Count packets */
     287             :                         BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_PACKETS), /* r0 = 0 */
     288             :                         BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */
     289             :                         BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
     290             :                         BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */
     291           0 :                         BPF_LD_MAP_FD(BPF_REG_1, accounting_map_fd), /* load map fd to r1 */
     292             :                         BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
     293             :                         BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
     294             :                         BPF_MOV64_IMM(BPF_REG_1, 1), /* r1 = 1 */
     295             :                         BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */
     296             : 
     297             :                         /* Count bytes */
     298             :                         BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_BYTES), /* r0 = 1 */
     299             :                         BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */
     300             :                         BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
     301             :                         BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */
     302           0 :                         BPF_LD_MAP_FD(BPF_REG_1, accounting_map_fd),
     303             :                         BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
     304             :                         BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
     305             :                         BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_6, offsetof(struct __sk_buff, len)), /* r1 = skb->len */
     306             :                         BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */
     307             : 
     308             :                         /* Allow the packet to pass */
     309             :                         BPF_MOV64_IMM(BPF_REG_0, 1),
     310             :                 };
     311             : 
     312             :                 /* Jump label fixup */
     313           0 :                 insn[0].off = ELEMENTSOF(insn) - 1;
     314             : 
     315           0 :                 r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn));
     316           0 :                 if (r < 0)
     317           0 :                         return r;
     318             :         }
     319             : 
     320             :         do {
     321             :                 /*
     322             :                  * Exit from the eBPF program, R0 contains the verdict.
     323             :                  * 0 means the packet is denied, 1 means the packet may pass.
     324             :                  */
     325           0 :                 struct bpf_insn insn[] = {
     326             :                         BPF_EXIT_INSN()
     327             :                 };
     328             : 
     329           0 :                 r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn));
     330           0 :                 if (r < 0)
     331           0 :                         return r;
     332             :         } while (false);
     333             : 
     334           0 :         *ret = TAKE_PTR(p);
     335             : 
     336           0 :         return 0;
     337             : }
     338             : 
     339           0 : static int bpf_firewall_count_access_items(IPAddressAccessItem *list, size_t *n_ipv4, size_t *n_ipv6) {
     340             :         IPAddressAccessItem *a;
     341             : 
     342           0 :         assert(n_ipv4);
     343           0 :         assert(n_ipv6);
     344             : 
     345           0 :         LIST_FOREACH(items, a, list) {
     346           0 :                 switch (a->family) {
     347             : 
     348           0 :                 case AF_INET:
     349           0 :                         (*n_ipv4)++;
     350           0 :                         break;
     351             : 
     352           0 :                 case AF_INET6:
     353           0 :                         (*n_ipv6)++;
     354           0 :                         break;
     355             : 
     356           0 :                 default:
     357           0 :                         return -EAFNOSUPPORT;
     358             :                 }
     359             :         }
     360             : 
     361           0 :         return 0;
     362             : }
     363             : 
     364           0 : static int bpf_firewall_add_access_items(
     365             :                 IPAddressAccessItem *list,
     366             :                 int ipv4_map_fd,
     367             :                 int ipv6_map_fd,
     368             :                 int verdict) {
     369             : 
     370             :         struct bpf_lpm_trie_key *key_ipv4, *key_ipv6;
     371           0 :         uint64_t value = verdict;
     372             :         IPAddressAccessItem *a;
     373             :         int r;
     374             : 
     375           0 :         key_ipv4 = alloca0(offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t));
     376           0 :         key_ipv6 = alloca0(offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t) * 4);
     377             : 
     378           0 :         LIST_FOREACH(items, a, list) {
     379           0 :                 switch (a->family) {
     380             : 
     381           0 :                 case AF_INET:
     382           0 :                         key_ipv4->prefixlen = a->prefixlen;
     383           0 :                         memcpy(key_ipv4->data, &a->address, sizeof(uint32_t));
     384             : 
     385           0 :                         r = bpf_map_update_element(ipv4_map_fd, key_ipv4, &value);
     386           0 :                         if (r < 0)
     387           0 :                                 return r;
     388             : 
     389           0 :                         break;
     390             : 
     391           0 :                 case AF_INET6:
     392           0 :                         key_ipv6->prefixlen = a->prefixlen;
     393           0 :                         memcpy(key_ipv6->data, &a->address, 4 * sizeof(uint32_t));
     394             : 
     395           0 :                         r = bpf_map_update_element(ipv6_map_fd, key_ipv6, &value);
     396           0 :                         if (r < 0)
     397           0 :                                 return r;
     398             : 
     399           0 :                         break;
     400             : 
     401           0 :                 default:
     402           0 :                         return -EAFNOSUPPORT;
     403             :                 }
     404             :         }
     405             : 
     406           0 :         return 0;
     407             : }
     408             : 
     409           0 : static int bpf_firewall_prepare_access_maps(
     410             :                 Unit *u,
     411             :                 int verdict,
     412             :                 int *ret_ipv4_map_fd,
     413             :                 int *ret_ipv6_map_fd,
     414             :                 bool *ret_has_any) {
     415             : 
     416           0 :         _cleanup_close_ int ipv4_map_fd = -1, ipv6_map_fd = -1;
     417           0 :         size_t n_ipv4 = 0, n_ipv6 = 0;
     418             :         IPAddressAccessItem *list;
     419             :         Unit *p;
     420             :         int r;
     421             : 
     422           0 :         assert(ret_ipv4_map_fd);
     423           0 :         assert(ret_ipv6_map_fd);
     424           0 :         assert(ret_has_any);
     425             : 
     426           0 :         for (p = u; p; p = UNIT_DEREF(p->slice)) {
     427             :                 CGroupContext *cc;
     428             : 
     429           0 :                 cc = unit_get_cgroup_context(p);
     430           0 :                 if (!cc)
     431           0 :                         continue;
     432             : 
     433           0 :                 list = verdict == ACCESS_ALLOWED ? cc->ip_address_allow : cc->ip_address_deny;
     434             : 
     435           0 :                 bpf_firewall_count_access_items(list, &n_ipv4, &n_ipv6);
     436             : 
     437             :                 /* Skip making the LPM trie map in cases where we are using "any" in order to hack around
     438             :                  * needing CAP_SYS_ADMIN for allocating LPM trie map. */
     439           0 :                 if (ip_address_access_item_is_any(list)) {
     440           0 :                         *ret_has_any = true;
     441           0 :                         return 0;
     442             :                 }
     443             :         }
     444             : 
     445           0 :         if (n_ipv4 > 0) {
     446           0 :                 ipv4_map_fd = bpf_map_new(
     447             :                                 BPF_MAP_TYPE_LPM_TRIE,
     448             :                                 offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t),
     449             :                                 sizeof(uint64_t),
     450             :                                 n_ipv4,
     451             :                                 BPF_F_NO_PREALLOC);
     452           0 :                 if (ipv4_map_fd < 0)
     453           0 :                         return ipv4_map_fd;
     454             :         }
     455             : 
     456           0 :         if (n_ipv6 > 0) {
     457           0 :                 ipv6_map_fd = bpf_map_new(
     458             :                                 BPF_MAP_TYPE_LPM_TRIE,
     459             :                                 offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t)*4,
     460             :                                 sizeof(uint64_t),
     461             :                                 n_ipv6,
     462             :                                 BPF_F_NO_PREALLOC);
     463           0 :                 if (ipv6_map_fd < 0)
     464           0 :                         return ipv6_map_fd;
     465             :         }
     466             : 
     467           0 :         for (p = u; p; p = UNIT_DEREF(p->slice)) {
     468             :                 CGroupContext *cc;
     469             : 
     470           0 :                 cc = unit_get_cgroup_context(p);
     471           0 :                 if (!cc)
     472           0 :                         continue;
     473             : 
     474           0 :                 r = bpf_firewall_add_access_items(verdict == ACCESS_ALLOWED ? cc->ip_address_allow : cc->ip_address_deny,
     475             :                                                   ipv4_map_fd, ipv6_map_fd, verdict);
     476           0 :                 if (r < 0)
     477           0 :                         return r;
     478             :         }
     479             : 
     480           0 :         *ret_ipv4_map_fd = TAKE_FD(ipv4_map_fd);
     481           0 :         *ret_ipv6_map_fd = TAKE_FD(ipv6_map_fd);
     482           0 :         *ret_has_any = false;
     483           0 :         return 0;
     484             : }
     485             : 
     486           0 : static int bpf_firewall_prepare_accounting_maps(Unit *u, bool enabled, int *fd_ingress, int *fd_egress) {
     487             :         int r;
     488             : 
     489           0 :         assert(u);
     490           0 :         assert(fd_ingress);
     491           0 :         assert(fd_egress);
     492             : 
     493           0 :         if (enabled) {
     494           0 :                 if (*fd_ingress < 0) {
     495           0 :                         r = bpf_map_new(BPF_MAP_TYPE_ARRAY, sizeof(int), sizeof(uint64_t), 2, 0);
     496           0 :                         if (r < 0)
     497           0 :                                 return r;
     498             : 
     499           0 :                         *fd_ingress = r;
     500             :                 }
     501             : 
     502           0 :                 if (*fd_egress < 0) {
     503             : 
     504           0 :                         r = bpf_map_new(BPF_MAP_TYPE_ARRAY, sizeof(int), sizeof(uint64_t), 2, 0);
     505           0 :                         if (r < 0)
     506           0 :                                 return r;
     507             : 
     508           0 :                         *fd_egress = r;
     509             :                 }
     510             : 
     511             :         } else {
     512           0 :                 *fd_ingress = safe_close(*fd_ingress);
     513           0 :                 *fd_egress = safe_close(*fd_egress);
     514             : 
     515           0 :                 zero(u->ip_accounting_extra);
     516             :         }
     517             : 
     518           0 :         return 0;
     519             : }
     520             : 
/* (Re)compiles the unit's BPF firewall state: flushes out the previous programs and access maps, rebuilds
 * the allow/deny maps from the unit's IP access lists (leaf units only), keeps the accounting maps, and
 * compiles fresh ingress and egress BPF programs. Returns 0 on success, a negative errno on failure;
 * -EOPNOTSUPP (logged only at debug level) if BPF firewalling cannot be used on this manager. */
int bpf_firewall_compile(Unit *u) {
        CGroupContext *cc;
        int r, supported;
        bool ip_allow_any = false, ip_deny_any = false;

        assert(u);

        cc = unit_get_cgroup_context(u);
        if (!cc)
                return -EINVAL;

        supported = bpf_firewall_supported();
        if (supported < 0)
                return supported;
        if (supported == BPF_FIREWALL_UNSUPPORTED)
                return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP),
                                            "BPF firewalling not supported on this manager, proceeding without.");
        if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI && u->type == UNIT_SLICE)
                /* If BPF_F_ALLOW_MULTI is not supported we don't support any BPF magic on inner nodes (i.e. on slice
                 * units), since that would mean leaf nodes couldn't do any BPF anymore at all. Under the assumption
                 * that BPF is more interesting on leaf nodes we hence avoid it on inner nodes in that case. This is
                 * consistent with old systemd behaviour from before v238, where BPF wasn't supported in inner nodes at
                 * all, either. */
                return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP),
                                            "BPF_F_ALLOW_MULTI is not supported on this manager, not doing BPF firewall on slice units.");

        /* Note that when we compile a new firewall we first flush out the access maps and the BPF programs themselves,
         * but we reuse the accounting maps. That way the firewall in effect always maps to the actual
         * configuration, but we don't flush out the accounting unnecessarily */

        /* Drop the old programs (unref implicitly detaches once the last ref is gone) ... */
        u->ip_bpf_ingress = bpf_program_unref(u->ip_bpf_ingress);
        u->ip_bpf_egress = bpf_program_unref(u->ip_bpf_egress);

        /* ... and the old access maps, both address families and both verdicts. */
        u->ipv4_allow_map_fd = safe_close(u->ipv4_allow_map_fd);
        u->ipv4_deny_map_fd = safe_close(u->ipv4_deny_map_fd);

        u->ipv6_allow_map_fd = safe_close(u->ipv6_allow_map_fd);
        u->ipv6_deny_map_fd = safe_close(u->ipv6_deny_map_fd);

        if (u->type != UNIT_SLICE) {
                /* In inner nodes we only do accounting, we do not actually bother with access control. However, leaf
                 * nodes will incorporate all IP access rules set on all their parent nodes. This has the benefit that
                 * they can optionally cancel out system-wide rules. Since inner nodes can't contain processes this
                 * means that all configured IP access rules *will* take effect on processes, even though we never
                 * compile them for inner nodes. */

                r = bpf_firewall_prepare_access_maps(u, ACCESS_ALLOWED, &u->ipv4_allow_map_fd, &u->ipv6_allow_map_fd, &ip_allow_any);
                if (r < 0)
                        return log_unit_error_errno(u, r, "Preparation of eBPF allow maps failed: %m");

                r = bpf_firewall_prepare_access_maps(u, ACCESS_DENIED, &u->ipv4_deny_map_fd, &u->ipv6_deny_map_fd, &ip_deny_any);
                if (r < 0)
                        return log_unit_error_errno(u, r, "Preparation of eBPF deny maps failed: %m");
        }

        /* Accounting maps are reused if they already exist (see note above). */
        r = bpf_firewall_prepare_accounting_maps(u, cc->ip_accounting, &u->ip_accounting_ingress_map_fd, &u->ip_accounting_egress_map_fd);
        if (r < 0)
                return log_unit_error_errno(u, r, "Preparation of eBPF accounting maps failed: %m");

        r = bpf_firewall_compile_bpf(u, true, &u->ip_bpf_ingress, ip_allow_any, ip_deny_any);
        if (r < 0)
                return log_unit_error_errno(u, r, "Compilation for ingress BPF program failed: %m");

        r = bpf_firewall_compile_bpf(u, false, &u->ip_bpf_egress, ip_allow_any, ip_deny_any);
        if (r < 0)
                return log_unit_error_errno(u, r, "Compilation for egress BPF program failed: %m");

        return 0;
}
     590             : 
     591           0 : DEFINE_PRIVATE_HASH_OPS_WITH_VALUE_DESTRUCTOR(filter_prog_hash_ops, void, trivial_hash_func, trivial_compare_func, BPFProgram, bpf_program_unref);
     592             : 
     593           0 : static int load_bpf_progs_from_fs_to_set(Unit *u, char **filter_paths, Set **set) {
     594             :         char **bpf_fs_path;
     595             : 
     596           0 :         set_clear(*set);
     597             : 
     598           0 :         STRV_FOREACH(bpf_fs_path, filter_paths) {
     599           0 :                 _cleanup_free_ BPFProgram *prog = NULL;
     600             :                 int r;
     601             : 
     602           0 :                 r = bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, &prog);
     603           0 :                 if (r < 0)
     604           0 :                         return log_unit_error_errno(u, r, "Can't allocate CGROUP SKB BPF program: %m");
     605             : 
     606           0 :                 r = bpf_program_load_from_bpf_fs(prog, *bpf_fs_path);
     607           0 :                 if (r < 0)
     608           0 :                         return log_unit_error_errno(u, r, "Loading of ingress BPF program %s failed: %m", *bpf_fs_path);
     609             : 
     610           0 :                 r = set_ensure_allocated(set, &filter_prog_hash_ops);
     611           0 :                 if (r < 0)
     612           0 :                         return log_unit_error_errno(u, r, "Can't allocate BPF program set: %m");
     613             : 
     614           0 :                 r = set_put(*set, prog);
     615           0 :                 if (r < 0)
     616           0 :                         return log_unit_error_errno(u, r, "Can't add program to BPF program set: %m");
     617           0 :                 TAKE_PTR(prog);
     618             :         }
     619             : 
     620           0 :         return 0;
     621             : }
     622             : 
     623           6 : int bpf_firewall_load_custom(Unit *u) {
     624             :         CGroupContext *cc;
     625             :         int r, supported;
     626             : 
     627           6 :         assert(u);
     628             : 
     629           6 :         cc = unit_get_cgroup_context(u);
     630           6 :         if (!cc)
     631           0 :                 return 0;
     632             : 
     633           6 :         if (!(cc->ip_filters_ingress || cc->ip_filters_egress))
     634           6 :                 return 0;
     635             : 
     636           0 :         supported = bpf_firewall_supported();
     637           0 :         if (supported < 0)
     638           0 :                 return supported;
     639             : 
     640           0 :         if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI)
     641           0 :                 return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP), "BPF_F_ALLOW_MULTI not supported on this manager, cannot attach custom BPF programs.");
     642             : 
     643           0 :         r = load_bpf_progs_from_fs_to_set(u, cc->ip_filters_ingress, &u->ip_bpf_custom_ingress);
     644           0 :         if (r < 0)
     645           0 :                 return r;
     646           0 :         r = load_bpf_progs_from_fs_to_set(u, cc->ip_filters_egress, &u->ip_bpf_custom_egress);
     647           0 :         if (r < 0)
     648           0 :                 return r;
     649             : 
     650           0 :         return 0;
     651             : }
     652             : 
     653           0 : static int attach_custom_bpf_progs(Unit *u, const char *path, int attach_type, Set **set, Set **set_installed) {
     654             :         BPFProgram *prog;
     655             :         Iterator i;
     656             :         int r;
     657             : 
     658           0 :         assert(u);
     659             : 
     660           0 :         set_clear(*set_installed);
     661             : 
     662           0 :         SET_FOREACH(prog, *set, i) {
     663           0 :                 r = bpf_program_cgroup_attach(prog, attach_type, path, BPF_F_ALLOW_MULTI);
     664           0 :                 if (r < 0)
     665           0 :                         return log_unit_error_errno(u, r, "Attaching custom egress BPF program to cgroup %s failed: %m", path);
     666             :                 /* Remember that these BPF programs are installed now. */
     667           0 :                 r = set_ensure_allocated(set_installed, &filter_prog_hash_ops);
     668           0 :                 if (r < 0)
     669           0 :                         return log_unit_error_errno(u, r, "Can't allocate BPF program set: %m");
     670             : 
     671           0 :                 r = set_put(*set_installed, prog);
     672           0 :                 if (r < 0)
     673           0 :                         return log_unit_error_errno(u, r, "Can't add program to BPF program set: %m");
     674           0 :                 bpf_program_ref(prog);
     675             :         }
     676             : 
     677           0 :         return 0;
     678             : }
     679             : 
/* Attaches the unit's compiled firewall programs (and any custom programs) to its realized cgroup.
 * Requires that bpf_firewall_compile() ran first. Returns 0 on success, negative errno on failure;
 * -EOPNOTSUPP when BPF firewalling (or a needed feature) is unavailable on this manager. */
int bpf_firewall_install(Unit *u) {
        _cleanup_free_ char *path = NULL;
        CGroupContext *cc;
        int r, supported;
        uint32_t flags;

        assert(u);

        cc = unit_get_cgroup_context(u);
        if (!cc)
                return -EINVAL;
        if (!u->cgroup_path)
                return -EINVAL;
        if (!u->cgroup_realized)
                return -EINVAL;

        supported = bpf_firewall_supported();
        if (supported < 0)
                return supported;
        if (supported == BPF_FIREWALL_UNSUPPORTED) {
                log_unit_debug(u, "BPF firewalling not supported on this manager, proceeding without.");
                return -EOPNOTSUPP;
        }
        if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI && u->type == UNIT_SLICE) {
                log_unit_debug(u, "BPF_F_ALLOW_MULTI is not supported on this manager, not doing BPF firewall on slice units.");
                return -EOPNOTSUPP;
        }
        /* Custom programs coexist with ours on the same cgroup, which needs BPF_F_ALLOW_MULTI. */
        if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI &&
            (!set_isempty(u->ip_bpf_custom_ingress) || !set_isempty(u->ip_bpf_custom_egress)))
                return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP), "BPF_F_ALLOW_MULTI not supported on this manager, cannot attach custom BPF programs.");

        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, NULL, &path);
        if (r < 0)
                return log_unit_error_errno(u, r, "Failed to determine cgroup path: %m");

        /* Slice units and delegated cgroups must allow descendants to attach their own programs too,
         * hence BPF_F_ALLOW_MULTI there (when available). */
        flags = (supported == BPF_FIREWALL_SUPPORTED_WITH_MULTI &&
                 (u->type == UNIT_SLICE || unit_cgroup_delegate(u))) ? BPF_F_ALLOW_MULTI : 0;

        /* Unref the old BPF program (which will implicitly detach it) right before attaching the new program, to
         * minimize the time window when we don't account for IP traffic. */
        u->ip_bpf_egress_installed = bpf_program_unref(u->ip_bpf_egress_installed);
        u->ip_bpf_ingress_installed = bpf_program_unref(u->ip_bpf_ingress_installed);

        if (u->ip_bpf_egress) {
                /* If custom egress programs exist, force BPF_F_ALLOW_MULTI so ours and theirs can coexist. */
                r = bpf_program_cgroup_attach(u->ip_bpf_egress, BPF_CGROUP_INET_EGRESS, path,
                                              flags | (set_isempty(u->ip_bpf_custom_egress) ? 0 : BPF_F_ALLOW_MULTI));
                if (r < 0)
                        return log_unit_error_errno(u, r, "Attaching egress BPF program to cgroup %s failed: %m", path);

                /* Remember that this BPF program is installed now. */
                u->ip_bpf_egress_installed = bpf_program_ref(u->ip_bpf_egress);
        }

        if (u->ip_bpf_ingress) {
                r = bpf_program_cgroup_attach(u->ip_bpf_ingress, BPF_CGROUP_INET_INGRESS, path,
                                              flags | (set_isempty(u->ip_bpf_custom_ingress) ? 0 : BPF_F_ALLOW_MULTI));
                if (r < 0)
                        return log_unit_error_errno(u, r, "Attaching ingress BPF program to cgroup %s failed: %m", path);

                u->ip_bpf_ingress_installed = bpf_program_ref(u->ip_bpf_ingress);
        }

        /* Finally attach any custom (pinned) programs for both directions. */
        r = attach_custom_bpf_progs(u, path, BPF_CGROUP_INET_EGRESS, &u->ip_bpf_custom_egress, &u->ip_bpf_custom_egress_installed);
        if (r < 0)
                return r;

        r = attach_custom_bpf_progs(u, path, BPF_CGROUP_INET_INGRESS, &u->ip_bpf_custom_ingress, &u->ip_bpf_custom_ingress_installed);
        if (r < 0)
                return r;

        return 0;
}
     752             : 
     753           0 : int bpf_firewall_read_accounting(int map_fd, uint64_t *ret_bytes, uint64_t *ret_packets) {
     754             :         uint64_t key, packets;
     755             :         int r;
     756             : 
     757           0 :         if (map_fd < 0)
     758           0 :                 return -EBADF;
     759             : 
     760           0 :         if (ret_packets) {
     761           0 :                 key = MAP_KEY_PACKETS;
     762           0 :                 r = bpf_map_lookup_element(map_fd, &key, &packets);
     763           0 :                 if (r < 0)
     764           0 :                         return r;
     765             :         }
     766             : 
     767           0 :         if (ret_bytes) {
     768           0 :                 key = MAP_KEY_BYTES;
     769           0 :                 r = bpf_map_lookup_element(map_fd, &key, ret_bytes);
     770           0 :                 if (r < 0)
     771           0 :                         return r;
     772             :         }
     773             : 
     774           0 :         if (ret_packets)
     775           0 :                 *ret_packets = packets;
     776             : 
     777           0 :         return 0;
     778             : }
     779             : 
     780           0 : int bpf_firewall_reset_accounting(int map_fd) {
     781           0 :         uint64_t key, value = 0;
     782             :         int r;
     783             : 
     784           0 :         if (map_fd < 0)
     785           0 :                 return -EBADF;
     786             : 
     787           0 :         key = MAP_KEY_PACKETS;
     788           0 :         r = bpf_map_update_element(map_fd, &key, &value);
     789           0 :         if (r < 0)
     790           0 :                 return r;
     791             : 
     792           0 :         key = MAP_KEY_BYTES;
     793           0 :         return bpf_map_update_element(map_fd, &key, &value);
     794             : }
     795             : 
     796             : static int bpf_firewall_unsupported_reason = 0;
     797             : 
/* Probes (once, then caches in a static) whether BPF firewalling works on this system, returning one of
 * BPF_FIREWALL_UNSUPPORTED, BPF_FIREWALL_SUPPORTED or BPF_FIREWALL_SUPPORTED_WITH_MULTI, or a negative
 * errno if the cgroup setup can't even be determined. On unsupported systems the reason is stored in
 * bpf_firewall_unsupported_reason for later reporting. */
int bpf_firewall_supported(void) {
        /* A minimal valid program ("return 1") used only to test whether the kernel accepts
         * BPF_PROG_TYPE_CGROUP_SKB programs at all. */
        struct bpf_insn trivial[] = {
                BPF_MOV64_IMM(BPF_REG_0, 1),
                BPF_EXIT_INSN()
        };

        _cleanup_(bpf_program_unrefp) BPFProgram *program = NULL;
        static int supported = -1; /* -1 = not probed yet; result is cached across calls */
        union bpf_attr attr;
        int r;

        /* Checks whether BPF firewalling is supported. For this, we check the following things:
         *
         * - whether the unified hierarchy is being used
         * - the BPF implementation in the kernel supports BPF_PROG_TYPE_CGROUP_SKB programs, which we require
         * - the BPF implementation in the kernel supports the BPF_PROG_DETACH call, which we require
         */
        if (supported >= 0)
                return supported;

        r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
        if (r < 0)
                return log_error_errno(r, "Can't determine whether the unified hierarchy is used: %m");
        if (r == 0) {
                bpf_firewall_unsupported_reason =
                        log_debug_errno(SYNTHETIC_ERRNO(EUCLEAN),
                                        "Not running with unified cgroups, BPF firewalling is not supported.");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }

        r = bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, &program);
        if (r < 0) {
                bpf_firewall_unsupported_reason =
                        log_debug_errno(r, "Can't allocate CGROUP SKB BPF program, BPF firewalling is not supported: %m");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }

        r = bpf_program_add_instructions(program, trivial, ELEMENTSOF(trivial));
        if (r < 0) {
                bpf_firewall_unsupported_reason =
                        log_debug_errno(r, "Can't add trivial instructions to CGROUP SKB BPF program, BPF firewalling is not supported: %m");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }

        r = bpf_program_load_kernel(program, NULL, 0);
        if (r < 0) {
                bpf_firewall_unsupported_reason =
                        log_debug_errno(r, "Can't load kernel CGROUP SKB BPF program, BPF firewalling is not supported: %m");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }

        /* Unfortunately the kernel allows us to create BPF_PROG_TYPE_CGROUP_SKB programs even when CONFIG_CGROUP_BPF
         * is turned off at kernel compilation time. This sucks of course: why does it allow us to create a cgroup BPF
         * program if we can't do a thing with it later?
         *
         * We detect this case by issuing the BPF_PROG_DETACH bpf() call with invalid file descriptors: if
         * CONFIG_CGROUP_BPF is turned off, then the call will fail early with EINVAL. If it is turned on the
         * parameters are validated however, and that'll fail with EBADF then. */

        attr = (union bpf_attr) {
                .attach_type = BPF_CGROUP_INET_EGRESS,
                .target_fd = -1,
                .attach_bpf_fd = -1,
        };

        if (bpf(BPF_PROG_DETACH, &attr, sizeof(attr)) < 0) {
                if (errno != EBADF) {
                        bpf_firewall_unsupported_reason =
                                log_debug_errno(errno, "Didn't get EBADF from BPF_PROG_DETACH, BPF firewalling is not supported: %m");
                        return supported = BPF_FIREWALL_UNSUPPORTED;
                }

                /* YAY! */
        } else {
                log_debug("Wut? Kernel accepted our invalid BPF_PROG_DETACH call? Something is weird, assuming BPF firewalling is broken and hence not supported.");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }

        /* So now we know that the BPF program is generally available, let's see if BPF_F_ALLOW_MULTI is also supported
         * (which was added in kernel 4.15). We use a similar logic as before, but this time we use the BPF_PROG_ATTACH
         * bpf() call and the BPF_F_ALLOW_MULTI flags value. Since the flags are checked early in the system call we'll
         * get EINVAL if it's not supported, and EBADF as before if it is available. */

        attr = (union bpf_attr) {
                .attach_type = BPF_CGROUP_INET_EGRESS,
                .target_fd = -1,
                .attach_bpf_fd = -1,
                .attach_flags = BPF_F_ALLOW_MULTI,
        };

        if (bpf(BPF_PROG_ATTACH, &attr, sizeof(attr)) < 0) {
                if (errno == EBADF) {
                        log_debug_errno(errno, "Got EBADF when using BPF_F_ALLOW_MULTI, which indicates it is supported. Yay!");
                        return supported = BPF_FIREWALL_SUPPORTED_WITH_MULTI;
                }

                if (errno == EINVAL)
                        log_debug_errno(errno, "Got EINVAL error when using BPF_F_ALLOW_MULTI, which indicates it's not supported.");
                else
                        log_debug_errno(errno, "Got unexpected error when using BPF_F_ALLOW_MULTI, assuming it's not supported: %m");

                return supported = BPF_FIREWALL_SUPPORTED;
        } else {
                log_debug("Wut? Kernel accepted our invalid BPF_PROG_ATTACH+BPF_F_ALLOW_MULTI call? Something is weird, assuming BPF firewalling is broken and hence not supported.");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }
}
     905             : 
     906           0 : void emit_bpf_firewall_warning(Unit *u) {
     907             :         static bool warned = false;
     908             : 
     909           0 :         if (!warned) {
     910           0 :                 bool quiet = bpf_firewall_unsupported_reason == -EPERM && detect_container();
     911             : 
     912           0 :                 log_unit_full(u, quiet ? LOG_DEBUG : LOG_WARNING, bpf_firewall_unsupported_reason,
     913             :                               "unit configures an IP firewall, but %s.\n"
     914             :                               "(This warning is only shown for the first unit using IP firewalling.)",
     915             :                               getuid() != 0 ? "not running as root" :
     916             :                                               "the local system does not support BPF/cgroup firewalling");
     917           0 :                 warned = true;
     918             :         }
     919           0 : }

Generated by: LCOV version 1.14