/* SPDX-License-Identifier: LGPL-2.1+ */

#include <arpa/inet.h>
#include <assert.h>
#include <errno.h>
#include <fcntl.h>
#include <linux/bpf_insn.h>
#include <net/ethernet.h>
#include <net/if.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include "alloc-util.h"
#include "bpf-firewall.h"
#include "bpf-program.h"
#include "fd-util.h"
#include "ip-address-access.h"
#include "memory-util.h"
#include "missing_syscall.h"
#include "strv.h"
#include "unit.h"
#include "virt.h"

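/* Fixed indices into the two-element accounting array maps: one slot for the packet counter, one for the
 * byte counter (see bpf_firewall_read_accounting() below). */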
enum {
        MAP_KEY_PACKETS,
        MAP_KEY_BYTES,
};

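/* Verdict bits that the generated access-check code ORs into R8 at runtime. Both bits may end up set for
 * the same packet (one list allows it, another denies it); the post-processing instructions in
 * bpf_firewall_compile_bpf() only drop the packet when R8 equals exactly ACCESS_DENIED, so an explicit
 * allow match wins over a deny match. */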
enum {
        ACCESS_ALLOWED = 1,
        ACCESS_DENIED  = 2,
};

/* Compile instructions for one list of addresses, one direction and one specific verdict on matches. */

static int add_lookup_instructions(
                BPFProgram *p,
                int map_fd,
                int protocol,
                bool is_ingress,
                int verdict) {

        int r, addr_offset, addr_size;

        assert(p);
        assert(map_fd >= 0);

        switch (protocol) {

        case ETH_P_IP:
                addr_size = sizeof(uint32_t);
                addr_offset = is_ingress ?
                        offsetof(struct iphdr, saddr) :
                        offsetof(struct iphdr, daddr);
                break;

        case ETH_P_IPV6:
                addr_size = 4 * sizeof(uint32_t);
                addr_offset = is_ingress ?
                        offsetof(struct ip6_hdr, ip6_src.s6_addr) :
                        offsetof(struct ip6_hdr, ip6_dst.s6_addr);
                break;

        default:
                return -EAFNOSUPPORT;
        }

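        /* The instruction block below implements, roughly, the following pseudo-C (a sketch for
         * illustration, not literally emitted code; fp is the frame pointer R10, so for IPv4 the key
         * sits at fp - 8 with its data field at fp - 4):
         *
         *   if (skb->protocol == htobe16(protocol)) {
         *           struct bpf_lpm_trie_key *key = fp - addr_size - 4;
         *           key->prefixlen = addr_size * 8;
         *           skb_load_bytes(skb, addr_offset, key->data, addr_size);
         *           if (map_lookup_elem(map_fd, key))
         *                   r8 |= verdict;
         *   }
         */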
        do {
                /* Compare the address against the LPM trie map: one 32-bit word for IPv4, four words for IPv6. */
                struct bpf_insn insn[] = {
                        /* If skb->protocol != htobe16(protocol), skip this whole block. The offset will be set later. */
                        BPF_JMP_IMM(BPF_JNE, BPF_REG_7, htobe16(protocol), 0),

                        /*
                         * Call into BPF_FUNC_skb_load_bytes to load the dst/src IP address
                         *
                         * R1: Pointer to the skb
                         * R2: Data offset
                         * R3: Destination buffer on the stack (r10 - addr_size)
                         * R4: Number of bytes to read (addr_size)
                         */

                        BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
                        BPF_MOV32_IMM(BPF_REG_2, addr_offset),

                        BPF_MOV64_REG(BPF_REG_3, BPF_REG_10),
                        BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, -addr_size),

                        BPF_MOV32_IMM(BPF_REG_4, addr_size),
                        BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_skb_load_bytes),

                        /*
                         * Call into BPF_FUNC_map_lookup_elem to see if the address matches any entry in the
                         * LPM trie map. For this to work, the prefixlen field of 'struct bpf_lpm_trie_key'
                         * has to be set to the maximum possible value.
                         *
                         * On success, the looked-up value is stored in R0. For this application, the actual
                         * value doesn't matter, however: we just set the @verdict bit in R8 if any matching
                         * value was found.
                         */

                        BPF_LD_MAP_FD(BPF_REG_1, map_fd),
                        BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
                        BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -addr_size - sizeof(uint32_t)),
                        BPF_ST_MEM(BPF_W, BPF_REG_2, 0, addr_size * 8),

                        BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
                        BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
                        BPF_ALU32_IMM(BPF_OR, BPF_REG_8, verdict),
                };

                /* Jump label fixup */
                insn[0].off = ELEMENTSOF(insn) - 1;

                r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn));
                if (r < 0)
                        return r;

        } while (false);

        return 0;
}

static int add_instructions_for_ip_any(
                BPFProgram *p,
                int verdict) {
        int r;

        assert(p);

        struct bpf_insn insn[] = {
                BPF_ALU32_IMM(BPF_OR, BPF_REG_8, verdict),
        };

        r = bpf_program_add_instructions(p, insn, 1);
        if (r < 0)
                return r;

        return 0;
}

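/* Lays out the generated program as: [prologue] [access checks] [verdict post-processing]
 * [optional accounting] [exit]. The prologue caches the skb pointer and protocol, the access checks OR
 * verdict bits into R8, and the accounting code atomically bumps the packet and byte counters for
 * packets that pass. */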
static int bpf_firewall_compile_bpf(
                Unit *u,
                bool is_ingress,
                BPFProgram **ret,
                bool ip_allow_any,
                bool ip_deny_any) {

        struct bpf_insn pre_insn[] = {
                /*
                 * When the eBPF program is entered, R1 contains the address of the skb.
                 * However, R1-R5 are scratch registers that are not preserved when calling
                 * into kernel functions, so we need to save anything that's supposed to
                 * stay around to R6-R9. Save the skb to R6.
                 */
                BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),

                /*
                 * Although we cannot access the skb data directly from eBPF programs used in this
                 * scenario, the kernel has prepared some fields for us to access through struct __sk_buff.
                 * Load the protocol (IPv4, IPv6) used by the packet in flight once and cache it in R7
                 * for later use.
                 */
                BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, offsetof(struct __sk_buff, protocol)),

                /*
                 * R8 is used to keep track of whether any address check has explicitly allowed or denied
                 * the packet through the ACCESS_DENIED or ACCESS_ALLOWED bits. Reset both to 0 in the
                 * beginning.
                 */
                BPF_MOV32_IMM(BPF_REG_8, 0),
        };

        /*
         * The access checkers compiled for the configured allow and deny lists
         * write to R8 at runtime. The following code prepares for an early exit that
         * skips the accounting if the packet is denied.
         *
         *   R0 = 1
         *   if (R8 == ACCESS_DENIED)
         *           R0 = 0
         *
         * This means that if both ACCESS_DENIED and ACCESS_ALLOWED are set, the packet
         * is allowed to pass.
         */
        struct bpf_insn post_insn[] = {
                BPF_MOV64_IMM(BPF_REG_0, 1),
                BPF_JMP_IMM(BPF_JNE, BPF_REG_8, ACCESS_DENIED, 1),
                BPF_MOV64_IMM(BPF_REG_0, 0),
        };

        _cleanup_(bpf_program_unrefp) BPFProgram *p = NULL;
        int accounting_map_fd, r;
        bool access_enabled;

        assert(u);
        assert(ret);

        accounting_map_fd = is_ingress ?
                u->ip_accounting_ingress_map_fd :
                u->ip_accounting_egress_map_fd;

        access_enabled =
                u->ipv4_allow_map_fd >= 0 ||
                u->ipv6_allow_map_fd >= 0 ||
                u->ipv4_deny_map_fd >= 0 ||
                u->ipv6_deny_map_fd >= 0 ||
                ip_allow_any ||
                ip_deny_any;

        if (accounting_map_fd < 0 && !access_enabled) {
                *ret = NULL;
                return 0;
        }

        r = bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, &p);
        if (r < 0)
                return r;

        r = bpf_program_add_instructions(p, pre_insn, ELEMENTSOF(pre_insn));
        if (r < 0)
                return r;

        if (access_enabled) {
                /*
                 * The simple rule this function translates into eBPF instructions is:
                 *
                 * - Access will be granted when an address matches an entry in the allow maps
                 * - Otherwise, access will be denied when an address matches an entry in the deny maps
                 * - Otherwise, access will be granted
                 */

                if (u->ipv4_deny_map_fd >= 0) {
                        r = add_lookup_instructions(p, u->ipv4_deny_map_fd, ETH_P_IP, is_ingress, ACCESS_DENIED);
                        if (r < 0)
                                return r;
                }

                if (u->ipv6_deny_map_fd >= 0) {
                        r = add_lookup_instructions(p, u->ipv6_deny_map_fd, ETH_P_IPV6, is_ingress, ACCESS_DENIED);
                        if (r < 0)
                                return r;
                }

                if (u->ipv4_allow_map_fd >= 0) {
                        r = add_lookup_instructions(p, u->ipv4_allow_map_fd, ETH_P_IP, is_ingress, ACCESS_ALLOWED);
                        if (r < 0)
                                return r;
                }

                if (u->ipv6_allow_map_fd >= 0) {
                        r = add_lookup_instructions(p, u->ipv6_allow_map_fd, ETH_P_IPV6, is_ingress, ACCESS_ALLOWED);
                        if (r < 0)
                                return r;
                }

                if (ip_allow_any) {
                        r = add_instructions_for_ip_any(p, ACCESS_ALLOWED);
                        if (r < 0)
                                return r;
                }

                if (ip_deny_any) {
                        r = add_instructions_for_ip_any(p, ACCESS_DENIED);
                        if (r < 0)
                                return r;
                }
        }

        r = bpf_program_add_instructions(p, post_insn, ELEMENTSOF(post_insn));
        if (r < 0)
                return r;

        if (accounting_map_fd >= 0) {
                struct bpf_insn insn[] = {
                        /*
                         * If R0 == 0, the packet will be denied; skip the accounting instructions in this case.
                         * The jump label will be fixed up later.
                         */
                        BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 0),

                        /* Count packets */
                        BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_PACKETS), /* r0 = 0 */
                        BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */
                        BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
                        BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */
                        BPF_LD_MAP_FD(BPF_REG_1, accounting_map_fd), /* load map fd to r1 */
                        BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
                        BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
                        BPF_MOV64_IMM(BPF_REG_1, 1), /* r1 = 1 */
                        BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd *(u64 *)r0 += r1 */

                        /* Count bytes */
                        BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_BYTES), /* r0 = 1 */
                        BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */
                        BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
                        BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */
                        BPF_LD_MAP_FD(BPF_REG_1, accounting_map_fd),
                        BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
                        BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
                        BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_6, offsetof(struct __sk_buff, len)), /* r1 = skb->len */
                        BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd *(u64 *)r0 += r1 */

                        /* Allow the packet to pass */
                        BPF_MOV64_IMM(BPF_REG_0, 1),
                };

                /* Jump label fixup */
                insn[0].off = ELEMENTSOF(insn) - 1;

                r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn));
                if (r < 0)
                        return r;
        }

        do {
                /*
                 * Exit from the eBPF program, R0 contains the verdict.
                 * 0 means the packet is denied, 1 means the packet may pass.
                 */
                struct bpf_insn insn[] = {
                        BPF_EXIT_INSN()
                };

                r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn));
                if (r < 0)
                        return r;
        } while (false);

        *ret = TAKE_PTR(p);

        return 0;
}

static int bpf_firewall_count_access_items(IPAddressAccessItem *list, size_t *n_ipv4, size_t *n_ipv6) {
        IPAddressAccessItem *a;

        assert(n_ipv4);
        assert(n_ipv6);

        LIST_FOREACH(items, a, list) {
                switch (a->family) {

                case AF_INET:
                        (*n_ipv4)++;
                        break;

                case AF_INET6:
                        (*n_ipv6)++;
                        break;

                default:
                        return -EAFNOSUPPORT;
                }
        }

        return 0;
}

static int bpf_firewall_add_access_items(
                IPAddressAccessItem *list,
                int ipv4_map_fd,
                int ipv6_map_fd,
                int verdict) {

        struct bpf_lpm_trie_key *key_ipv4, *key_ipv6;
        uint64_t value = verdict;
        IPAddressAccessItem *a;
        int r;

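        /* The kernel's LPM trie keys start with a 32-bit prefix length, directly followed by the address
         * bytes (struct bpf_lpm_trie_key from <linux/bpf.h>), so an IPv4 key is 4 + 4 bytes and an IPv6
         * key 4 + 16 bytes. */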
        key_ipv4 = alloca0(offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t));
        key_ipv6 = alloca0(offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t) * 4);

        LIST_FOREACH(items, a, list) {
                switch (a->family) {

                case AF_INET:
                        key_ipv4->prefixlen = a->prefixlen;
                        memcpy(key_ipv4->data, &a->address, sizeof(uint32_t));

                        r = bpf_map_update_element(ipv4_map_fd, key_ipv4, &value);
                        if (r < 0)
                                return r;

                        break;

                case AF_INET6:
                        key_ipv6->prefixlen = a->prefixlen;
                        memcpy(key_ipv6->data, &a->address, 4 * sizeof(uint32_t));

                        r = bpf_map_update_element(ipv6_map_fd, key_ipv6, &value);
                        if (r < 0)
                                return r;

                        break;

                default:
                        return -EAFNOSUPPORT;
                }
        }

        return 0;
}

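/* Builds the allow or deny LPM trie maps for a unit in two passes: first walk the unit and all its
 * parent slices to count the configured entries, then create maps sized exactly for that count, then
 * walk the chain again to fill them in. */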
static int bpf_firewall_prepare_access_maps(
                Unit *u,
                int verdict,
                int *ret_ipv4_map_fd,
                int *ret_ipv6_map_fd,
                bool *ret_has_any) {

        _cleanup_close_ int ipv4_map_fd = -1, ipv6_map_fd = -1;
        size_t n_ipv4 = 0, n_ipv6 = 0;
        IPAddressAccessItem *list;
        Unit *p;
        int r;

        assert(ret_ipv4_map_fd);
        assert(ret_ipv6_map_fd);
        assert(ret_has_any);

        for (p = u; p; p = UNIT_DEREF(p->slice)) {
                CGroupContext *cc;

                cc = unit_get_cgroup_context(p);
                if (!cc)
                        continue;

                list = verdict == ACCESS_ALLOWED ? cc->ip_address_allow : cc->ip_address_deny;

                bpf_firewall_count_access_items(list, &n_ipv4, &n_ipv6);

                /* Skip creating the LPM trie maps if an "any" entry is in the list, to work around
                 * needing CAP_SYS_ADMIN for allocating LPM trie maps. */
                if (ip_address_access_item_is_any(list)) {
                        *ret_has_any = true;
                        return 0;
                }
        }

        if (n_ipv4 > 0) {
                ipv4_map_fd = bpf_map_new(
                                BPF_MAP_TYPE_LPM_TRIE,
                                offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t),
                                sizeof(uint64_t),
                                n_ipv4,
                                BPF_F_NO_PREALLOC);
                if (ipv4_map_fd < 0)
                        return ipv4_map_fd;
        }

        if (n_ipv6 > 0) {
                ipv6_map_fd = bpf_map_new(
                                BPF_MAP_TYPE_LPM_TRIE,
                                offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t) * 4,
                                sizeof(uint64_t),
                                n_ipv6,
                                BPF_F_NO_PREALLOC);
                if (ipv6_map_fd < 0)
                        return ipv6_map_fd;
        }

        for (p = u; p; p = UNIT_DEREF(p->slice)) {
                CGroupContext *cc;

                cc = unit_get_cgroup_context(p);
                if (!cc)
                        continue;

                r = bpf_firewall_add_access_items(verdict == ACCESS_ALLOWED ? cc->ip_address_allow : cc->ip_address_deny,
                                                  ipv4_map_fd, ipv6_map_fd, verdict);
                if (r < 0)
                        return r;
        }

        *ret_ipv4_map_fd = TAKE_FD(ipv4_map_fd);
        *ret_ipv6_map_fd = TAKE_FD(ipv6_map_fd);
        *ret_has_any = false;
        return 0;
}

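/* The accounting maps are plain two-slot array maps (MAP_KEY_PACKETS/MAP_KEY_BYTES). Unlike the access
 * maps they are created lazily and reused across recompilations, so counters survive firewall
 * reconfiguration; when accounting is turned off they are closed and the extra counters are zeroed. */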
static int bpf_firewall_prepare_accounting_maps(Unit *u, bool enabled, int *fd_ingress, int *fd_egress) {
        int r;

        assert(u);
        assert(fd_ingress);
        assert(fd_egress);

        if (enabled) {
                if (*fd_ingress < 0) {
                        r = bpf_map_new(BPF_MAP_TYPE_ARRAY, sizeof(int), sizeof(uint64_t), 2, 0);
                        if (r < 0)
                                return r;

                        *fd_ingress = r;
                }

                if (*fd_egress < 0) {
                        r = bpf_map_new(BPF_MAP_TYPE_ARRAY, sizeof(int), sizeof(uint64_t), 2, 0);
                        if (r < 0)
                                return r;

                        *fd_egress = r;
                }

        } else {
                *fd_ingress = safe_close(*fd_ingress);
                *fd_egress = safe_close(*fd_egress);

                zero(u->ip_accounting_extra);
        }

        return 0;
}

int bpf_firewall_compile(Unit *u) {
        CGroupContext *cc;
        int r, supported;
        bool ip_allow_any = false, ip_deny_any = false;

        assert(u);

        cc = unit_get_cgroup_context(u);
        if (!cc)
                return -EINVAL;

        supported = bpf_firewall_supported();
        if (supported < 0)
                return supported;
        if (supported == BPF_FIREWALL_UNSUPPORTED)
                return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP),
                                            "BPF firewalling not supported on this manager, proceeding without.");
        if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI && u->type == UNIT_SLICE)
                /* If BPF_F_ALLOW_MULTI is not supported we don't support any BPF magic on inner nodes (i.e. on slice
                 * units), since that would mean leaf nodes couldn't do any BPF anymore at all. Under the assumption
                 * that BPF is more interesting on leaf nodes we hence avoid it on inner nodes in that case. This is
                 * consistent with old systemd behaviour from before v238, where BPF wasn't supported in inner nodes at
                 * all, either. */
                return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP),
                                            "BPF_F_ALLOW_MULTI is not supported on this manager, not doing BPF firewall on slice units.");

        /* Note that when we compile a new firewall we first flush out the access maps and the BPF programs themselves,
         * but we reuse the accounting maps. That way the firewall in effect always maps to the actual
         * configuration, but we don't flush out the accounting unnecessarily. */

        u->ip_bpf_ingress = bpf_program_unref(u->ip_bpf_ingress);
        u->ip_bpf_egress = bpf_program_unref(u->ip_bpf_egress);

        u->ipv4_allow_map_fd = safe_close(u->ipv4_allow_map_fd);
        u->ipv4_deny_map_fd = safe_close(u->ipv4_deny_map_fd);

        u->ipv6_allow_map_fd = safe_close(u->ipv6_allow_map_fd);
        u->ipv6_deny_map_fd = safe_close(u->ipv6_deny_map_fd);

        if (u->type != UNIT_SLICE) {
                /* In inner nodes we only do accounting, we do not actually bother with access control. However, leaf
                 * nodes will incorporate all IP access rules set on all their parent nodes. This has the benefit that
                 * they can optionally cancel out system-wide rules. Since inner nodes can't contain processes, this
                 * means that all configured IP access rules *will* take effect on processes, even though we never
                 * compile them for inner nodes. */

                r = bpf_firewall_prepare_access_maps(u, ACCESS_ALLOWED, &u->ipv4_allow_map_fd, &u->ipv6_allow_map_fd, &ip_allow_any);
                if (r < 0)
                        return log_unit_error_errno(u, r, "Preparation of eBPF allow maps failed: %m");

                r = bpf_firewall_prepare_access_maps(u, ACCESS_DENIED, &u->ipv4_deny_map_fd, &u->ipv6_deny_map_fd, &ip_deny_any);
                if (r < 0)
                        return log_unit_error_errno(u, r, "Preparation of eBPF deny maps failed: %m");
        }

        r = bpf_firewall_prepare_accounting_maps(u, cc->ip_accounting, &u->ip_accounting_ingress_map_fd, &u->ip_accounting_egress_map_fd);
        if (r < 0)
                return log_unit_error_errno(u, r, "Preparation of eBPF accounting maps failed: %m");

        r = bpf_firewall_compile_bpf(u, true, &u->ip_bpf_ingress, ip_allow_any, ip_deny_any);
        if (r < 0)
                return log_unit_error_errno(u, r, "Compilation of ingress BPF program failed: %m");

        r = bpf_firewall_compile_bpf(u, false, &u->ip_bpf_egress, ip_allow_any, ip_deny_any);
        if (r < 0)
                return log_unit_error_errno(u, r, "Compilation of egress BPF program failed: %m");

        return 0;
}

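/* Hash ops for sets of custom BPFProgram objects: programs are hashed and compared by pointer identity,
 * and each reference held by a set is released through bpf_program_unref() when the set is cleared or
 * freed. */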
DEFINE_PRIVATE_HASH_OPS_WITH_VALUE_DESTRUCTOR(filter_prog_hash_ops, void, trivial_hash_func, trivial_compare_func, BPFProgram, bpf_program_unref);

static int load_bpf_progs_from_fs_to_set(Unit *u, char **filter_paths, Set **set) {
        char **bpf_fs_path;

        set_clear(*set);

        STRV_FOREACH(bpf_fs_path, filter_paths) {
                _cleanup_(bpf_program_unrefp) BPFProgram *prog = NULL;
                int r;

                r = bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, &prog);
                if (r < 0)
                        return log_unit_error_errno(u, r, "Can't allocate CGROUP SKB BPF program: %m");

                r = bpf_program_load_from_bpf_fs(prog, *bpf_fs_path);
                if (r < 0)
                        return log_unit_error_errno(u, r, "Loading of BPF program %s failed: %m", *bpf_fs_path);

                r = set_ensure_allocated(set, &filter_prog_hash_ops);
                if (r < 0)
                        return log_unit_error_errno(u, r, "Can't allocate BPF program set: %m");

                r = set_put(*set, prog);
                if (r < 0)
                        return log_unit_error_errno(u, r, "Can't add program to BPF program set: %m");
                TAKE_PTR(prog);
        }

        return 0;
}

int bpf_firewall_load_custom(Unit *u) {
        CGroupContext *cc;
        int r, supported;

        assert(u);

        cc = unit_get_cgroup_context(u);
        if (!cc)
                return 0;

        if (!(cc->ip_filters_ingress || cc->ip_filters_egress))
                return 0;

        supported = bpf_firewall_supported();
        if (supported < 0)
                return supported;

        if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI)
                return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP), "BPF_F_ALLOW_MULTI not supported on this manager, cannot attach custom BPF programs.");

        r = load_bpf_progs_from_fs_to_set(u, cc->ip_filters_ingress, &u->ip_bpf_custom_ingress);
        if (r < 0)
                return r;
        r = load_bpf_progs_from_fs_to_set(u, cc->ip_filters_egress, &u->ip_bpf_custom_egress);
        if (r < 0)
                return r;

        return 0;
}

static int attach_custom_bpf_progs(Unit *u, const char *path, int attach_type, Set **set, Set **set_installed) {
        BPFProgram *prog;
        Iterator i;
        int r;

        assert(u);

        set_clear(*set_installed);

        SET_FOREACH(prog, *set, i) {
                r = bpf_program_cgroup_attach(prog, attach_type, path, BPF_F_ALLOW_MULTI);
                if (r < 0)
                        return log_unit_error_errno(u, r, "Attaching custom BPF program to cgroup %s failed: %m", path);

                /* Remember that these BPF programs are installed now. */
                r = set_ensure_allocated(set_installed, &filter_prog_hash_ops);
                if (r < 0)
                        return log_unit_error_errno(u, r, "Can't allocate BPF program set: %m");

                r = set_put(*set_installed, prog);
                if (r < 0)
                        return log_unit_error_errno(u, r, "Can't add program to BPF program set: %m");
                bpf_program_ref(prog);
        }

        return 0;
}

int bpf_firewall_install(Unit *u) {
        _cleanup_free_ char *path = NULL;
        CGroupContext *cc;
        int r, supported;
        uint32_t flags;

        assert(u);

        cc = unit_get_cgroup_context(u);
        if (!cc)
                return -EINVAL;
        if (!u->cgroup_path)
                return -EINVAL;
        if (!u->cgroup_realized)
                return -EINVAL;

        supported = bpf_firewall_supported();
        if (supported < 0)
                return supported;
        if (supported == BPF_FIREWALL_UNSUPPORTED) {
                log_unit_debug(u, "BPF firewalling not supported on this manager, proceeding without.");
                return -EOPNOTSUPP;
        }
        if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI && u->type == UNIT_SLICE) {
                log_unit_debug(u, "BPF_F_ALLOW_MULTI is not supported on this manager, not doing BPF firewall on slice units.");
                return -EOPNOTSUPP;
        }
        if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI &&
            (!set_isempty(u->ip_bpf_custom_ingress) || !set_isempty(u->ip_bpf_custom_egress)))
                return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP), "BPF_F_ALLOW_MULTI not supported on this manager, cannot attach custom BPF programs.");

        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, NULL, &path);
        if (r < 0)
                return log_unit_error_errno(u, r, "Failed to determine cgroup path: %m");

        flags = (supported == BPF_FIREWALL_SUPPORTED_WITH_MULTI &&
                 (u->type == UNIT_SLICE || unit_cgroup_delegate(u))) ? BPF_F_ALLOW_MULTI : 0;

        /* Unref the old BPF program (which will implicitly detach it) right before attaching the new program, to
         * minimize the time window when we don't account for IP traffic. */
        u->ip_bpf_egress_installed = bpf_program_unref(u->ip_bpf_egress_installed);
        u->ip_bpf_ingress_installed = bpf_program_unref(u->ip_bpf_ingress_installed);

        if (u->ip_bpf_egress) {
                r = bpf_program_cgroup_attach(u->ip_bpf_egress, BPF_CGROUP_INET_EGRESS, path,
                                              flags | (set_isempty(u->ip_bpf_custom_egress) ? 0 : BPF_F_ALLOW_MULTI));
                if (r < 0)
                        return log_unit_error_errno(u, r, "Attaching egress BPF program to cgroup %s failed: %m", path);

                /* Remember that this BPF program is installed now. */
                u->ip_bpf_egress_installed = bpf_program_ref(u->ip_bpf_egress);
        }

        if (u->ip_bpf_ingress) {
                r = bpf_program_cgroup_attach(u->ip_bpf_ingress, BPF_CGROUP_INET_INGRESS, path,
                                              flags | (set_isempty(u->ip_bpf_custom_ingress) ? 0 : BPF_F_ALLOW_MULTI));
                if (r < 0)
                        return log_unit_error_errno(u, r, "Attaching ingress BPF program to cgroup %s failed: %m", path);

                u->ip_bpf_ingress_installed = bpf_program_ref(u->ip_bpf_ingress);
        }

        r = attach_custom_bpf_progs(u, path, BPF_CGROUP_INET_EGRESS, &u->ip_bpf_custom_egress, &u->ip_bpf_custom_egress_installed);
        if (r < 0)
                return r;

        r = attach_custom_bpf_progs(u, path, BPF_CGROUP_INET_INGRESS, &u->ip_bpf_custom_ingress, &u->ip_bpf_custom_ingress_installed);
        if (r < 0)
                return r;

        return 0;
}

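/* Reads the counters out of an accounting map. Both output parameters are optional; the packet count is
 * staged in a local variable so that *ret_packets is only updated once all lookups have succeeded. */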
int bpf_firewall_read_accounting(int map_fd, uint64_t *ret_bytes, uint64_t *ret_packets) {
        uint64_t key, packets;
        int r;

        if (map_fd < 0)
                return -EBADF;

        if (ret_packets) {
                key = MAP_KEY_PACKETS;
                r = bpf_map_lookup_element(map_fd, &key, &packets);
                if (r < 0)
                        return r;
        }

        if (ret_bytes) {
                key = MAP_KEY_BYTES;
                r = bpf_map_lookup_element(map_fd, &key, ret_bytes);
                if (r < 0)
                        return r;
        }

        if (ret_packets)
                *ret_packets = packets;

        return 0;
}

int bpf_firewall_reset_accounting(int map_fd) {
        uint64_t key, value = 0;
        int r;

        if (map_fd < 0)
                return -EBADF;

        key = MAP_KEY_PACKETS;
        r = bpf_map_update_element(map_fd, &key, &value);
        if (r < 0)
                return r;

        key = MAP_KEY_BYTES;
        return bpf_map_update_element(map_fd, &key, &value);
}

static int bpf_firewall_unsupported_reason = 0;

int bpf_firewall_supported(void) {
        struct bpf_insn trivial[] = {
                BPF_MOV64_IMM(BPF_REG_0, 1),
                BPF_EXIT_INSN()
        };

        _cleanup_(bpf_program_unrefp) BPFProgram *program = NULL;
        static int supported = -1;
        union bpf_attr attr;
        int r;

        /* Checks whether BPF firewalling is supported. For this, we check the following things:
         *
         * - whether the unified cgroup hierarchy is being used
         * - whether the kernel's BPF implementation supports BPF_PROG_TYPE_CGROUP_SKB programs, which we require
         * - whether the kernel's BPF implementation supports the BPF_PROG_DETACH call, which we require
         */
        if (supported >= 0)
                return supported;

        r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
        if (r < 0)
                return log_error_errno(r, "Can't determine whether the unified hierarchy is used: %m");
        if (r == 0) {
                bpf_firewall_unsupported_reason =
                        log_debug_errno(SYNTHETIC_ERRNO(EUCLEAN),
                                        "Not running with unified cgroups, BPF firewalling is not supported.");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }

        r = bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, &program);
        if (r < 0) {
                bpf_firewall_unsupported_reason =
                        log_debug_errno(r, "Can't allocate CGROUP SKB BPF program, BPF firewalling is not supported: %m");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }

        r = bpf_program_add_instructions(program, trivial, ELEMENTSOF(trivial));
        if (r < 0) {
                bpf_firewall_unsupported_reason =
                        log_debug_errno(r, "Can't add trivial instructions to CGROUP SKB BPF program, BPF firewalling is not supported: %m");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }

        r = bpf_program_load_kernel(program, NULL, 0);
        if (r < 0) {
                bpf_firewall_unsupported_reason =
                        log_debug_errno(r, "Can't load kernel CGROUP SKB BPF program, BPF firewalling is not supported: %m");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }

        /* Unfortunately the kernel allows us to create BPF_PROG_TYPE_CGROUP_SKB programs even when CONFIG_CGROUP_BPF
         * is turned off at kernel compilation time. This sucks of course: why does it allow us to create a cgroup BPF
         * program if we can't do a thing with it later?
         *
         * We detect this case by issuing the BPF_PROG_DETACH bpf() call with invalid file descriptors: if
         * CONFIG_CGROUP_BPF is turned off, then the call will fail early with EINVAL. If it is turned on, the
         * parameters are validated however, and that'll fail with EBADF then. */

        attr = (union bpf_attr) {
                .attach_type = BPF_CGROUP_INET_EGRESS,
                .target_fd = -1,
                .attach_bpf_fd = -1,
        };

        if (bpf(BPF_PROG_DETACH, &attr, sizeof(attr)) < 0) {
                if (errno != EBADF) {
                        bpf_firewall_unsupported_reason =
                                log_debug_errno(errno, "Didn't get EBADF from BPF_PROG_DETACH, BPF firewalling is not supported: %m");
                        return supported = BPF_FIREWALL_UNSUPPORTED;
                }

                /* YAY! */
        } else {
                log_debug("Wut? Kernel accepted our invalid BPF_PROG_DETACH call? Something is weird, assuming BPF firewalling is broken and hence not supported.");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }

        /* So now we know that the BPF program is generally available, let's see if BPF_F_ALLOW_MULTI is also supported
         * (which was added in kernel 4.15). We use a similar logic as before, but this time we use the BPF_PROG_ATTACH
         * bpf() call and the BPF_F_ALLOW_MULTI flags value. Since the flags are checked early in the system call we'll
         * get EINVAL if it's not supported, and EBADF as before if it is available. */

        attr = (union bpf_attr) {
                .attach_type = BPF_CGROUP_INET_EGRESS,
                .target_fd = -1,
                .attach_bpf_fd = -1,
                .attach_flags = BPF_F_ALLOW_MULTI,
        };

        if (bpf(BPF_PROG_ATTACH, &attr, sizeof(attr)) < 0) {
                if (errno == EBADF) {
                        log_debug_errno(errno, "Got EBADF when using BPF_F_ALLOW_MULTI, which indicates it is supported. Yay!");
                        return supported = BPF_FIREWALL_SUPPORTED_WITH_MULTI;
                }

                if (errno == EINVAL)
                        log_debug_errno(errno, "Got EINVAL when using BPF_F_ALLOW_MULTI, which indicates it's not supported.");
                else
                        log_debug_errno(errno, "Got unexpected error when using BPF_F_ALLOW_MULTI, assuming it's not supported: %m");

                return supported = BPF_FIREWALL_SUPPORTED;
        } else {
                log_debug("Wut? Kernel accepted our invalid BPF_PROG_ATTACH+BPF_F_ALLOW_MULTI call? Something is weird, assuming BPF firewalling is broken and hence not supported.");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }
}

void emit_bpf_firewall_warning(Unit *u) {
        static bool warned = false;

        if (!warned) {
                bool quiet = bpf_firewall_unsupported_reason == -EPERM && detect_container() > 0;

                log_unit_full(u, quiet ? LOG_DEBUG : LOG_WARNING, bpf_firewall_unsupported_reason,
                              "unit configures an IP firewall, but %s.\n"
                              "(This warning is only shown for the first unit using IP firewalling.)",
                              getuid() != 0 ? "not running as root" :
                                              "the local system does not support BPF/cgroup firewalling");
                warned = true;
        }
}