/* SPDX-License-Identifier: LGPL-2.1+ */

#include <arpa/inet.h>
#include <assert.h>
#include <errno.h>
#include <fcntl.h>
#include <linux/bpf_insn.h>
#include <net/ethernet.h>
#include <net/if.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include "alloc-util.h"
#include "bpf-firewall.h"
#include "bpf-program.h"
#include "fd-util.h"
#include "ip-address-access.h"
#include "memory-util.h"
#include "missing_syscall.h"
#include "unit.h"
#include "strv.h"
#include "virt.h"

enum {
        MAP_KEY_PACKETS,
        MAP_KEY_BYTES,
};

enum {
        ACCESS_ALLOWED = 1,
        ACCESS_DENIED = 2,
};
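
/* ACCESS_ALLOWED and ACCESS_DENIED are used as separate bit values that the generated code ORs into register
 * R8, so a packet that matches entries on both the allow and the deny list ends up with both bits set; the
 * epilogue emitted in bpf_firewall_compile_bpf() below resolves that case in favour of allowing the packet. */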

/* Compile instructions for one list of addresses, one direction and one specific verdict on matches. */

static int add_lookup_instructions(
                BPFProgram *p,
                int map_fd,
                int protocol,
                bool is_ingress,
                int verdict) {

        int r, addr_offset, addr_size;

        assert(p);
        assert(map_fd >= 0);

        switch (protocol) {

        case ETH_P_IP:
                addr_size = sizeof(uint32_t);
                addr_offset = is_ingress ?
                        offsetof(struct iphdr, saddr) :
                        offsetof(struct iphdr, daddr);
                break;

        case ETH_P_IPV6:
                addr_size = 4 * sizeof(uint32_t);
                addr_offset = is_ingress ?
                        offsetof(struct ip6_hdr, ip6_src.s6_addr) :
                        offsetof(struct ip6_hdr, ip6_dst.s6_addr);
                break;

        default:
                return -EAFNOSUPPORT;
        }

        do {
                /* Match the protocol and look up the address in the corresponding LPM trie map */
                struct bpf_insn insn[] = {
                        /* If skb->protocol doesn't match the protocol this block was compiled for, skip the whole block. The jump offset will be set later. */
                        BPF_JMP_IMM(BPF_JNE, BPF_REG_7, htobe16(protocol), 0),

                        /*
                         * Call into BPF_FUNC_skb_load_bytes to load the dst/src IP address
                         *
                         * R1: Pointer to the skb
                         * R2: Data offset
                         * R3: Destination buffer on the stack (r10 - addr_size)
                         * R4: Number of bytes to read (addr_size)
                         */

                        BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
                        BPF_MOV32_IMM(BPF_REG_2, addr_offset),

                        BPF_MOV64_REG(BPF_REG_3, BPF_REG_10),
                        BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, -addr_size),

                        BPF_MOV32_IMM(BPF_REG_4, addr_size),
                        BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_skb_load_bytes),

                        /*
                         * Call into BPF_FUNC_map_lookup_elem to see if the address matches any entry in the
                         * LPM trie map. For this to work, the prefixlen field of 'struct bpf_lpm_trie_key'
                         * has to be set to the maximum possible value.
                         *
                         * On success, the looked up value is stored in R0. For this application, the actual
                         * value doesn't matter, however; we just set the @verdict bit in R8 if we found any
                         * matching value.
                         */

                        BPF_LD_MAP_FD(BPF_REG_1, map_fd),
                        BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
                        BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -addr_size - sizeof(uint32_t)),
                        BPF_ST_MEM(BPF_W, BPF_REG_2, 0, addr_size * 8),
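
                        /* At this point the stack holds the lookup key, laid out like 'struct bpf_lpm_trie_key':
                         * the 32-bit prefixlen at r10 - addr_size - 4 (set above to addr_size * 8, i.e. "consider
                         * the full address"), immediately followed at r10 - addr_size by the address that
                         * BPF_FUNC_skb_load_bytes wrote there. R2 points to that key. */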

                        BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
                        BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
                        BPF_ALU32_IMM(BPF_OR, BPF_REG_8, verdict),
                };

                /* Jump label fixup */
                insn[0].off = ELEMENTSOF(insn) - 1;
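                /* I.e. the protocol check emitted as insn[0] jumps over the remaining ELEMENTSOF(insn) - 1
                 * instructions of this block when the packet's protocol does not match. */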

                r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn));
                if (r < 0)
                        return r;

        } while (false);

        return 0;
}

static int add_instructions_for_ip_any(
                BPFProgram *p,
                int verdict) {
        int r;

        assert(p);

        struct bpf_insn insn[] = {
                BPF_ALU32_IMM(BPF_OR, BPF_REG_8, verdict),
        };

        r = bpf_program_add_instructions(p, insn, 1);
        if (r < 0)
                return r;

        return 0;
}

static int bpf_firewall_compile_bpf(
                Unit *u,
                bool is_ingress,
                BPFProgram **ret,
                bool ip_allow_any,
                bool ip_deny_any) {

        struct bpf_insn pre_insn[] = {
                /*
                 * When the eBPF program is entered, R1 contains the address of the skb.
                 * However, R1-R5 are scratch registers that are not preserved when calling
                 * into kernel functions, so we need to save anything that's supposed to
                 * stay around to R6-R9. Save the skb to R6.
                 */
                BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),

                /*
                 * Although we cannot access the skb data directly from eBPF programs used in this
                 * scenario, the kernel has prepared some fields for us to access through struct __sk_buff.
                 * Load the protocol (IPv4, IPv6) used by the packet in flight once and cache it in R7
                 * for later use.
                 */
                BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, offsetof(struct __sk_buff, protocol)),
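                /* Note: __sk_buff.protocol holds the skb's EtherType in network byte order, which is why
                 * add_lookup_instructions() above compares R7 against htobe16(protocol). */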

                /*
                 * R8 is used to keep track of whether any address check has explicitly allowed or denied the packet
                 * through ACCESS_DENIED or ACCESS_ALLOWED bits. Reset them both to 0 in the beginning.
                 */
                BPF_MOV32_IMM(BPF_REG_8, 0),
        };

        /*
         * The access checkers compiled for the configured allow and deny lists
         * write to R8 at runtime. The following code prepares for an early exit that
         * skips the accounting if the packet is denied.
         *
         * R0 = 1
         * if (R8 == ACCESS_DENIED)
         *         R0 = 0
         *
         * This means that if both ACCESS_DENIED and ACCESS_ALLOWED are set, the packet
         * is allowed to pass.
         */
        struct bpf_insn post_insn[] = {
                BPF_MOV64_IMM(BPF_REG_0, 1),
                BPF_JMP_IMM(BPF_JNE, BPF_REG_8, ACCESS_DENIED, 1),
                BPF_MOV64_IMM(BPF_REG_0, 0),
        };

        _cleanup_(bpf_program_unrefp) BPFProgram *p = NULL;
        int accounting_map_fd, r;
        bool access_enabled;

        assert(u);
        assert(ret);

        accounting_map_fd = is_ingress ?
                u->ip_accounting_ingress_map_fd :
                u->ip_accounting_egress_map_fd;

        access_enabled =
                u->ipv4_allow_map_fd >= 0 ||
                u->ipv6_allow_map_fd >= 0 ||
                u->ipv4_deny_map_fd >= 0 ||
                u->ipv6_deny_map_fd >= 0 ||
                ip_allow_any ||
                ip_deny_any;

        if (accounting_map_fd < 0 && !access_enabled) {
                *ret = NULL;
                return 0;
        }

        r = bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, &p);
        if (r < 0)
                return r;

        r = bpf_program_add_instructions(p, pre_insn, ELEMENTSOF(pre_insn));
        if (r < 0)
                return r;

        if (access_enabled) {
                /*
                 * The simple rule this function translates into eBPF instructions is:
                 *
                 * - Access will be granted when an address matches an entry in @list_allow
                 * - Otherwise, access will be denied when an address matches an entry in @list_deny
                 * - Otherwise, access will be granted
                 */
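
                /* Expressed against the R8 bits set by the lookup blocks added below and the post_insn
                 * epilogue above, that rule comes out as (roughly):
                 *
                 *         R8 = 0;
                 *         if (match(deny list))  R8 |= ACCESS_DENIED;
                 *         if (match(allow list)) R8 |= ACCESS_ALLOWED;
                 *         R0 = (R8 == ACCESS_DENIED) ? 0 : 1;
                 *
                 * i.e. the packet is denied only if the deny list matched and the allow list did not.
                 */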

                if (u->ipv4_deny_map_fd >= 0) {
                        r = add_lookup_instructions(p, u->ipv4_deny_map_fd, ETH_P_IP, is_ingress, ACCESS_DENIED);
                        if (r < 0)
                                return r;
                }

                if (u->ipv6_deny_map_fd >= 0) {
                        r = add_lookup_instructions(p, u->ipv6_deny_map_fd, ETH_P_IPV6, is_ingress, ACCESS_DENIED);
                        if (r < 0)
                                return r;
                }

                if (u->ipv4_allow_map_fd >= 0) {
                        r = add_lookup_instructions(p, u->ipv4_allow_map_fd, ETH_P_IP, is_ingress, ACCESS_ALLOWED);
                        if (r < 0)
                                return r;
                }

                if (u->ipv6_allow_map_fd >= 0) {
                        r = add_lookup_instructions(p, u->ipv6_allow_map_fd, ETH_P_IPV6, is_ingress, ACCESS_ALLOWED);
                        if (r < 0)
                                return r;
                }

                if (ip_allow_any) {
                        r = add_instructions_for_ip_any(p, ACCESS_ALLOWED);
                        if (r < 0)
                                return r;
                }

                if (ip_deny_any) {
                        r = add_instructions_for_ip_any(p, ACCESS_DENIED);
                        if (r < 0)
                                return r;
                }
        }

        r = bpf_program_add_instructions(p, post_insn, ELEMENTSOF(post_insn));
        if (r < 0)
                return r;

        if (accounting_map_fd >= 0) {
                struct bpf_insn insn[] = {
                        /*
                         * If R0 == 0, the packet will be denied; skip the accounting instructions in this case.
                         * The jump label will be fixed up later.
                         */
                        BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 0),

                        /* Count packets */
                        BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_PACKETS), /* r0 = 0 */
                        BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */
                        BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
                        BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */
                        BPF_LD_MAP_FD(BPF_REG_1, accounting_map_fd), /* load map fd to r1 */
                        BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
                        BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
                        BPF_MOV64_IMM(BPF_REG_1, 1), /* r1 = 1 */
                        BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */

                        /* Count bytes */
                        BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_BYTES), /* r0 = 1 */
                        BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */
                        BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
                        BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */
                        BPF_LD_MAP_FD(BPF_REG_1, accounting_map_fd),
                        BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
                        BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
                        BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_6, offsetof(struct __sk_buff, len)), /* r1 = skb->len */
                        BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */

                        /* Allow the packet to pass */
                        BPF_MOV64_IMM(BPF_REG_0, 1),
                };

                /* Jump label fixup */
                insn[0].off = ELEMENTSOF(insn) - 1;

                r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn));
                if (r < 0)
                        return r;
        }

        do {
                /*
                 * Exit from the eBPF program, R0 contains the verdict.
                 * 0 means the packet is denied, 1 means the packet may pass.
                 */
                struct bpf_insn insn[] = {
                        BPF_EXIT_INSN()
                };

                r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn));
                if (r < 0)
                        return r;
        } while (false);

        *ret = TAKE_PTR(p);

        return 0;
}

static int bpf_firewall_count_access_items(IPAddressAccessItem *list, size_t *n_ipv4, size_t *n_ipv6) {
        IPAddressAccessItem *a;

        assert(n_ipv4);
        assert(n_ipv6);

        LIST_FOREACH(items, a, list) {
                switch (a->family) {

                case AF_INET:
                        (*n_ipv4)++;
                        break;

                case AF_INET6:
                        (*n_ipv6)++;
                        break;

                default:
                        return -EAFNOSUPPORT;
                }
        }

        return 0;
}

static int bpf_firewall_add_access_items(
                IPAddressAccessItem *list,
                int ipv4_map_fd,
                int ipv6_map_fd,
                int verdict) {

        struct bpf_lpm_trie_key *key_ipv4, *key_ipv6;
        uint64_t value = verdict;
        IPAddressAccessItem *a;
        int r;

        key_ipv4 = alloca0(offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t));
        key_ipv6 = alloca0(offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t) * 4);
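
        /* The two lookup keys are allocated once, with the fixed 'struct bpf_lpm_trie_key' layout: a 32-bit
         * prefix length followed by 4 bytes of address data for IPv4 and 16 bytes for IPv6, and are then
         * reused for every entry on the list. */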

        LIST_FOREACH(items, a, list) {
                switch (a->family) {

                case AF_INET:
                        key_ipv4->prefixlen = a->prefixlen;
                        memcpy(key_ipv4->data, &a->address, sizeof(uint32_t));

                        r = bpf_map_update_element(ipv4_map_fd, key_ipv4, &value);
                        if (r < 0)
                                return r;

                        break;

                case AF_INET6:
                        key_ipv6->prefixlen = a->prefixlen;
                        memcpy(key_ipv6->data, &a->address, 4 * sizeof(uint32_t));

                        r = bpf_map_update_element(ipv6_map_fd, key_ipv6, &value);
                        if (r < 0)
                                return r;

                        break;

                default:
                        return -EAFNOSUPPORT;
                }
        }

        return 0;
}

static int bpf_firewall_prepare_access_maps(
                Unit *u,
                int verdict,
                int *ret_ipv4_map_fd,
                int *ret_ipv6_map_fd,
                bool *ret_has_any) {

        _cleanup_close_ int ipv4_map_fd = -1, ipv6_map_fd = -1;
        size_t n_ipv4 = 0, n_ipv6 = 0;
        IPAddressAccessItem *list;
        Unit *p;
        int r;

        assert(ret_ipv4_map_fd);
        assert(ret_ipv6_map_fd);
        assert(ret_has_any);

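        /* Walk from the unit up through its parent slices, so that a leaf unit's maps end up containing the
         * IP access lists configured on all of its ancestors as well. */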
        for (p = u; p; p = UNIT_DEREF(p->slice)) {
                CGroupContext *cc;

                cc = unit_get_cgroup_context(p);
                if (!cc)
                        continue;

                list = verdict == ACCESS_ALLOWED ? cc->ip_address_allow : cc->ip_address_deny;

                bpf_firewall_count_access_items(list, &n_ipv4, &n_ipv6);

                /* Skip making the LPM trie map in cases where we are using "any" in order to hack around
                 * needing CAP_SYS_ADMIN for allocating LPM trie maps. */
                if (ip_address_access_item_is_any(list)) {
                        *ret_has_any = true;
                        return 0;
                }
        }

        if (n_ipv4 > 0) {
                ipv4_map_fd = bpf_map_new(
                                BPF_MAP_TYPE_LPM_TRIE,
                                offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t),
                                sizeof(uint64_t),
                                n_ipv4,
                                BPF_F_NO_PREALLOC);
                if (ipv4_map_fd < 0)
                        return ipv4_map_fd;
        }

        if (n_ipv6 > 0) {
                ipv6_map_fd = bpf_map_new(
                                BPF_MAP_TYPE_LPM_TRIE,
                                offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t)*4,
                                sizeof(uint64_t),
                                n_ipv6,
                                BPF_F_NO_PREALLOC);
                if (ipv6_map_fd < 0)
                        return ipv6_map_fd;
        }

        for (p = u; p; p = UNIT_DEREF(p->slice)) {
                CGroupContext *cc;

                cc = unit_get_cgroup_context(p);
                if (!cc)
                        continue;

                r = bpf_firewall_add_access_items(verdict == ACCESS_ALLOWED ? cc->ip_address_allow : cc->ip_address_deny,
                                                  ipv4_map_fd, ipv6_map_fd, verdict);
                if (r < 0)
                        return r;
        }

        *ret_ipv4_map_fd = TAKE_FD(ipv4_map_fd);
        *ret_ipv6_map_fd = TAKE_FD(ipv6_map_fd);
        *ret_has_any = false;
        return 0;
}

static int bpf_firewall_prepare_accounting_maps(Unit *u, bool enabled, int *fd_ingress, int *fd_egress) {
        int r;

        assert(u);
        assert(fd_ingress);
        assert(fd_egress);

        if (enabled) {
                if (*fd_ingress < 0) {
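                        /* Each accounting map is a plain BPF array with exactly two uint64_t counters,
                         * indexed by MAP_KEY_PACKETS and MAP_KEY_BYTES (hence the "2" passed to bpf_map_new()). */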
                        r = bpf_map_new(BPF_MAP_TYPE_ARRAY, sizeof(int), sizeof(uint64_t), 2, 0);
                        if (r < 0)
                                return r;

                        *fd_ingress = r;
                }

                if (*fd_egress < 0) {

                        r = bpf_map_new(BPF_MAP_TYPE_ARRAY, sizeof(int), sizeof(uint64_t), 2, 0);
                        if (r < 0)
                                return r;

                        *fd_egress = r;
                }

        } else {
                *fd_ingress = safe_close(*fd_ingress);
                *fd_egress = safe_close(*fd_egress);

                zero(u->ip_accounting_extra);
        }

        return 0;
}

int bpf_firewall_compile(Unit *u) {
        CGroupContext *cc;
        int r, supported;
        bool ip_allow_any = false, ip_deny_any = false;

        assert(u);

        cc = unit_get_cgroup_context(u);
        if (!cc)
                return -EINVAL;

        supported = bpf_firewall_supported();
        if (supported < 0)
                return supported;
        if (supported == BPF_FIREWALL_UNSUPPORTED)
                return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP),
                                            "BPF firewalling not supported on this manager, proceeding without.");
        if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI && u->type == UNIT_SLICE)
                /* If BPF_F_ALLOW_MULTI is not supported we don't support any BPF magic on inner nodes (i.e. on slice
                 * units), since that would mean leaf nodes couldn't do any BPF anymore at all. Under the assumption
                 * that BPF is more interesting on leaf nodes we hence avoid it on inner nodes in that case. This is
                 * consistent with old systemd behaviour from before v238, where BPF wasn't supported in inner nodes at
                 * all, either. */
                return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP),
                                            "BPF_F_ALLOW_MULTI is not supported on this manager, not doing BPF firewall on slice units.");

        /* Note that when we compile a new firewall we first flush out the access maps and the BPF programs themselves,
         * but we reuse the accounting maps. That way the firewall in effect always maps to the actual
         * configuration, but we don't flush out the accounting unnecessarily. */

        u->ip_bpf_ingress = bpf_program_unref(u->ip_bpf_ingress);
        u->ip_bpf_egress = bpf_program_unref(u->ip_bpf_egress);

        u->ipv4_allow_map_fd = safe_close(u->ipv4_allow_map_fd);
        u->ipv4_deny_map_fd = safe_close(u->ipv4_deny_map_fd);

        u->ipv6_allow_map_fd = safe_close(u->ipv6_allow_map_fd);
        u->ipv6_deny_map_fd = safe_close(u->ipv6_deny_map_fd);

        if (u->type != UNIT_SLICE) {
                /* In inner nodes we only do accounting; we do not actually bother with access control. However, leaf
                 * nodes will incorporate all IP access rules set on all their parent nodes. This has the benefit that
                 * they can optionally cancel out system-wide rules. Since inner nodes can't contain processes this
                 * means that all configured IP access rules *will* take effect on processes, even though we never
                 * compile them for inner nodes. */

                r = bpf_firewall_prepare_access_maps(u, ACCESS_ALLOWED, &u->ipv4_allow_map_fd, &u->ipv6_allow_map_fd, &ip_allow_any);
                if (r < 0)
                        return log_unit_error_errno(u, r, "Preparation of eBPF allow maps failed: %m");

                r = bpf_firewall_prepare_access_maps(u, ACCESS_DENIED, &u->ipv4_deny_map_fd, &u->ipv6_deny_map_fd, &ip_deny_any);
                if (r < 0)
                        return log_unit_error_errno(u, r, "Preparation of eBPF deny maps failed: %m");
        }

        r = bpf_firewall_prepare_accounting_maps(u, cc->ip_accounting, &u->ip_accounting_ingress_map_fd, &u->ip_accounting_egress_map_fd);
        if (r < 0)
                return log_unit_error_errno(u, r, "Preparation of eBPF accounting maps failed: %m");

        r = bpf_firewall_compile_bpf(u, true, &u->ip_bpf_ingress, ip_allow_any, ip_deny_any);
        if (r < 0)
                return log_unit_error_errno(u, r, "Compilation for ingress BPF program failed: %m");

        r = bpf_firewall_compile_bpf(u, false, &u->ip_bpf_egress, ip_allow_any, ip_deny_any);
        if (r < 0)
                return log_unit_error_errno(u, r, "Compilation for egress BPF program failed: %m");

        return 0;
}

DEFINE_PRIVATE_HASH_OPS_WITH_VALUE_DESTRUCTOR(filter_prog_hash_ops, void, trivial_hash_func, trivial_compare_func, BPFProgram, bpf_program_unref);

static int load_bpf_progs_from_fs_to_set(Unit *u, char **filter_paths, Set **set) {
        char **bpf_fs_path;

        set_clear(*set);

        STRV_FOREACH(bpf_fs_path, filter_paths) {
                _cleanup_(bpf_program_unrefp) BPFProgram *prog = NULL;
                int r;

                r = bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, &prog);
                if (r < 0)
                        return log_unit_error_errno(u, r, "Can't allocate CGROUP SKB BPF program: %m");

                r = bpf_program_load_from_bpf_fs(prog, *bpf_fs_path);
                if (r < 0)
                        return log_unit_error_errno(u, r, "Loading of custom BPF program %s failed: %m", *bpf_fs_path);

                r = set_ensure_allocated(set, &filter_prog_hash_ops);
                if (r < 0)
                        return log_unit_error_errno(u, r, "Can't allocate BPF program set: %m");

                r = set_put(*set, prog);
                if (r < 0)
                        return log_unit_error_errno(u, r, "Can't add program to BPF program set: %m");
                TAKE_PTR(prog);
        }

        return 0;
}

int bpf_firewall_load_custom(Unit *u) {
        CGroupContext *cc;
        int r, supported;

        assert(u);

        cc = unit_get_cgroup_context(u);
        if (!cc)
                return 0;

        if (!(cc->ip_filters_ingress || cc->ip_filters_egress))
                return 0;

        supported = bpf_firewall_supported();
        if (supported < 0)
                return supported;

        if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI)
                return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP), "BPF_F_ALLOW_MULTI not supported on this manager, cannot attach custom BPF programs.");

        r = load_bpf_progs_from_fs_to_set(u, cc->ip_filters_ingress, &u->ip_bpf_custom_ingress);
        if (r < 0)
                return r;
        r = load_bpf_progs_from_fs_to_set(u, cc->ip_filters_egress, &u->ip_bpf_custom_egress);
        if (r < 0)
                return r;

        return 0;
}

static int attach_custom_bpf_progs(Unit *u, const char *path, int attach_type, Set **set, Set **set_installed) {
        BPFProgram *prog;
        Iterator i;
        int r;

        assert(u);

        set_clear(*set_installed);

        SET_FOREACH(prog, *set, i) {
                r = bpf_program_cgroup_attach(prog, attach_type, path, BPF_F_ALLOW_MULTI);
                if (r < 0)
                        return log_unit_error_errno(u, r, "Attaching custom BPF program to cgroup %s failed: %m", path);
                /* Remember that these BPF programs are installed now. */
                r = set_ensure_allocated(set_installed, &filter_prog_hash_ops);
                if (r < 0)
                        return log_unit_error_errno(u, r, "Can't allocate BPF program set: %m");

                r = set_put(*set_installed, prog);
                if (r < 0)
                        return log_unit_error_errno(u, r, "Can't add program to BPF program set: %m");
                bpf_program_ref(prog);
        }

        return 0;
}

int bpf_firewall_install(Unit *u) {
        _cleanup_free_ char *path = NULL;
        CGroupContext *cc;
        int r, supported;
        uint32_t flags;

        assert(u);

        cc = unit_get_cgroup_context(u);
        if (!cc)
                return -EINVAL;
        if (!u->cgroup_path)
                return -EINVAL;
        if (!u->cgroup_realized)
                return -EINVAL;

        supported = bpf_firewall_supported();
        if (supported < 0)
                return supported;
        if (supported == BPF_FIREWALL_UNSUPPORTED) {
                log_unit_debug(u, "BPF firewalling not supported on this manager, proceeding without.");
                return -EOPNOTSUPP;
        }
        if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI && u->type == UNIT_SLICE) {
                log_unit_debug(u, "BPF_F_ALLOW_MULTI is not supported on this manager, not doing BPF firewall on slice units.");
                return -EOPNOTSUPP;
        }
        if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI &&
            (!set_isempty(u->ip_bpf_custom_ingress) || !set_isempty(u->ip_bpf_custom_egress)))
                return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP), "BPF_F_ALLOW_MULTI not supported on this manager, cannot attach custom BPF programs.");

        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, NULL, &path);
        if (r < 0)
                return log_unit_error_errno(u, r, "Failed to determine cgroup path: %m");

        flags = (supported == BPF_FIREWALL_SUPPORTED_WITH_MULTI &&
                 (u->type == UNIT_SLICE || unit_cgroup_delegate(u))) ? BPF_F_ALLOW_MULTI : 0;
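
        /* BPF_F_ALLOW_MULTI attaches the program in "multi" mode, i.e. it can coexist with other BPF programs
         * attached to the same cgroup (and with programs attached further up or down the hierarchy), instead
         * of replacing them. */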

        /* Unref the old BPF program (which will implicitly detach it) right before attaching the new program, to
         * minimize the time window when we don't account for IP traffic. */
        u->ip_bpf_egress_installed = bpf_program_unref(u->ip_bpf_egress_installed);
        u->ip_bpf_ingress_installed = bpf_program_unref(u->ip_bpf_ingress_installed);

        if (u->ip_bpf_egress) {
                r = bpf_program_cgroup_attach(u->ip_bpf_egress, BPF_CGROUP_INET_EGRESS, path,
                                              flags | (set_isempty(u->ip_bpf_custom_egress) ? 0 : BPF_F_ALLOW_MULTI));
                if (r < 0)
                        return log_unit_error_errno(u, r, "Attaching egress BPF program to cgroup %s failed: %m", path);

                /* Remember that this BPF program is installed now. */
                u->ip_bpf_egress_installed = bpf_program_ref(u->ip_bpf_egress);
        }

        if (u->ip_bpf_ingress) {
                r = bpf_program_cgroup_attach(u->ip_bpf_ingress, BPF_CGROUP_INET_INGRESS, path,
                                              flags | (set_isempty(u->ip_bpf_custom_ingress) ? 0 : BPF_F_ALLOW_MULTI));
                if (r < 0)
                        return log_unit_error_errno(u, r, "Attaching ingress BPF program to cgroup %s failed: %m", path);

                u->ip_bpf_ingress_installed = bpf_program_ref(u->ip_bpf_ingress);
        }

        r = attach_custom_bpf_progs(u, path, BPF_CGROUP_INET_EGRESS, &u->ip_bpf_custom_egress, &u->ip_bpf_custom_egress_installed);
        if (r < 0)
                return r;

        r = attach_custom_bpf_progs(u, path, BPF_CGROUP_INET_INGRESS, &u->ip_bpf_custom_ingress, &u->ip_bpf_custom_ingress_installed);
        if (r < 0)
                return r;

        return 0;
}

int bpf_firewall_read_accounting(int map_fd, uint64_t *ret_bytes, uint64_t *ret_packets) {
        uint64_t key, packets;
        int r;

        if (map_fd < 0)
                return -EBADF;

        if (ret_packets) {
                key = MAP_KEY_PACKETS;
                r = bpf_map_lookup_element(map_fd, &key, &packets);
                if (r < 0)
                        return r;
        }

        if (ret_bytes) {
                key = MAP_KEY_BYTES;
                r = bpf_map_lookup_element(map_fd, &key, ret_bytes);
                if (r < 0)
                        return r;
        }

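        /* Write the packet counter to the return parameter only now, i.e. after the byte counter lookup also
         * succeeded, so that *ret_packets is left unmodified if reading the byte counter fails. */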
        if (ret_packets)
                *ret_packets = packets;

        return 0;
}

int bpf_firewall_reset_accounting(int map_fd) {
        uint64_t key, value = 0;
        int r;

        if (map_fd < 0)
                return -EBADF;

        key = MAP_KEY_PACKETS;
        r = bpf_map_update_element(map_fd, &key, &value);
        if (r < 0)
                return r;

        key = MAP_KEY_BYTES;
        return bpf_map_update_element(map_fd, &key, &value);
}

static int bpf_firewall_unsupported_reason = 0;

int bpf_firewall_supported(void) {
        struct bpf_insn trivial[] = {
                BPF_MOV64_IMM(BPF_REG_0, 1),
                BPF_EXIT_INSN()
        };

        _cleanup_(bpf_program_unrefp) BPFProgram *program = NULL;
        static int supported = -1;
        union bpf_attr attr;
        int r;

        /* Checks whether BPF firewalling is supported. For this, we check the following things:
         *
         * - whether the unified hierarchy is being used
         * - the BPF implementation in the kernel supports BPF_PROG_TYPE_CGROUP_SKB programs, which we require
         * - the BPF implementation in the kernel supports the BPF_PROG_DETACH call, which we require
         */
        if (supported >= 0)
                return supported;

        r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
        if (r < 0)
                return log_error_errno(r, "Can't determine whether the unified hierarchy is used: %m");
        if (r == 0) {
                bpf_firewall_unsupported_reason =
                        log_debug_errno(SYNTHETIC_ERRNO(EUCLEAN),
                                        "Not running with unified cgroups, BPF firewalling is not supported.");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }

        r = bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, &program);
        if (r < 0) {
                bpf_firewall_unsupported_reason =
                        log_debug_errno(r, "Can't allocate CGROUP SKB BPF program, BPF firewalling is not supported: %m");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }

        r = bpf_program_add_instructions(program, trivial, ELEMENTSOF(trivial));
        if (r < 0) {
                bpf_firewall_unsupported_reason =
                        log_debug_errno(r, "Can't add trivial instructions to CGROUP SKB BPF program, BPF firewalling is not supported: %m");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }

        r = bpf_program_load_kernel(program, NULL, 0);
        if (r < 0) {
                bpf_firewall_unsupported_reason =
                        log_debug_errno(r, "Can't load kernel CGROUP SKB BPF program, BPF firewalling is not supported: %m");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }

        /* Unfortunately the kernel allows us to create BPF_PROG_TYPE_CGROUP_SKB programs even when CONFIG_CGROUP_BPF
         * is turned off at kernel compilation time. This sucks of course: why does it allow us to create a cgroup BPF
         * program if we can't do a thing with it later?
         *
         * We detect this case by issuing the BPF_PROG_DETACH bpf() call with invalid file descriptors: if
         * CONFIG_CGROUP_BPF is turned off, then the call will fail early with EINVAL. If it is turned on the
         * parameters are validated however, and that'll fail with EBADF then. */

        attr = (union bpf_attr) {
                .attach_type = BPF_CGROUP_INET_EGRESS,
                .target_fd = -1,
                .attach_bpf_fd = -1,
        };

        if (bpf(BPF_PROG_DETACH, &attr, sizeof(attr)) < 0) {
                if (errno != EBADF) {
                        bpf_firewall_unsupported_reason =
                                log_debug_errno(errno, "Didn't get EBADF from BPF_PROG_DETACH, BPF firewalling is not supported: %m");
                        return supported = BPF_FIREWALL_UNSUPPORTED;
                }

                /* YAY! */
        } else {
                log_debug("Wut? Kernel accepted our invalid BPF_PROG_DETACH call? Something is weird, assuming BPF firewalling is broken and hence not supported.");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }

        /* So now we know that the BPF program is generally available, let's see if BPF_F_ALLOW_MULTI is also supported
         * (which was added in kernel 4.15). We use a similar logic as before, but this time we use the BPF_PROG_ATTACH
         * bpf() call and the BPF_F_ALLOW_MULTI flags value. Since the flags are checked early in the system call we'll
         * get EINVAL if it's not supported, and EBADF as before if it is available. */

        attr = (union bpf_attr) {
                .attach_type = BPF_CGROUP_INET_EGRESS,
                .target_fd = -1,
                .attach_bpf_fd = -1,
                .attach_flags = BPF_F_ALLOW_MULTI,
        };

        if (bpf(BPF_PROG_ATTACH, &attr, sizeof(attr)) < 0) {
                if (errno == EBADF) {
                        log_debug_errno(errno, "Got EBADF when using BPF_F_ALLOW_MULTI, which indicates it is supported. Yay!");
                        return supported = BPF_FIREWALL_SUPPORTED_WITH_MULTI;
                }

                if (errno == EINVAL)
                        log_debug_errno(errno, "Got EINVAL error when using BPF_F_ALLOW_MULTI, which indicates it's not supported.");
                else
                        log_debug_errno(errno, "Got unexpected error when using BPF_F_ALLOW_MULTI, assuming it's not supported: %m");

                return supported = BPF_FIREWALL_SUPPORTED;
        } else {
                log_debug("Wut? Kernel accepted our invalid BPF_PROG_ATTACH+BPF_F_ALLOW_MULTI call? Something is weird, assuming BPF firewalling is broken and hence not supported.");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }
}

void emit_bpf_firewall_warning(Unit *u) {
        static bool warned = false;

        if (!warned) {
                bool quiet = bpf_firewall_unsupported_reason == -EPERM && detect_container();

                log_unit_full(u, quiet ? LOG_DEBUG : LOG_WARNING, bpf_firewall_unsupported_reason,
                              "unit configures an IP firewall, but %s.\n"
                              "(This warning is only shown for the first unit using IP firewalling.)",
                              getuid() != 0 ? "not running as root" :
                                              "the local system does not support BPF/cgroup firewalling");
                warned = true;
        }
}