systemd
- Per-unit IP access lists and accounting
- Uses BPF_PROG_TYPE_CGROUP_SKB
- Uses BPF_MAP_TYPE_LPM_TRIE
- In-kernel aggregated metrics
- No need for new kernel code
systemd + eBPF
# systemd-run -p IPAccounting=1 -t /bin/sh
Running as unit: run-u50630.service
Press ^] three times within 1s to disconnect TTY.
sh-5.0#
systemd + eBPF: accounting
# systemd-run -p IPAccounting=1 -t /bin/sh
Running as unit: run-u50630.service
Press ^] three times within 1s to disconnect TTY.
sh-5.0# systemctl status run-u50630.service
● run-u50630.service - /bin/sh
Loaded: loaded (/run/systemd/transient/run-u50630.service; transient)
Transient: yes
Active: active (running) since Mon 2019-07-29 15:58:39 -03; 1min 4s ago
Main PID: 9975 (sh)
IP: 0B in, 0B out
Tasks: 3 (limit: 4915)
Memory: 2.7M
CGroup: /system.slice/run-u50630.service
├─ 9975 /bin/sh
├─21423 systemctl status run-u50630.service
└─21424 less
Jul 29 15:58:39 quaco systemd[1]: Started /bin/sh.
sh-5.0#
systemd + eBPF: traffic
sh-5.0# ping 1.1.1.1
PING 1.1.1.1 (1.1.1.1) 56(84) bytes of data.
64 bytes from 1.1.1.1: icmp_seq=1 ttl=59 time=35.1 ms
64 bytes from 1.1.1.1: icmp_seq=2 ttl=59 time=34.9 ms
^C
--- 1.1.1.1 ping statistics ---
2 packets transmitted, 2 received, 0% packet loss, time 2ms
rtt min/avg/max/mdev = 34.939/34.996/35.054/0.195 ms
sh-5.0# systemctl status run-u50630.service
● run-u50630.service - /bin/sh
Loaded: loaded (/run/systemd/transient/run-u50630.service; transient)
Transient: yes
Active: active (running) since Mon 2019-07-29 15:58:39 -03; 7min ago
Main PID: 9975 (sh)
IP: 168B in, 168B out
Tasks: 3 (limit: 4915)
Memory: 42.6M
CGroup: /system.slice/run-u50630.service
├─ 9975 /bin/sh
├─13952 systemctl status run-u50630.service
└─13961 less
Jul 29 15:58:39 quaco systemd[1]: Started /bin/sh.
sh-5.0#
systemd + eBPF: block addresses
# systemd-run -p IPAddressDeny=1.1.1.1 -t /bin/sh
Running as unit: run-u50655.service
Press ^] three times within 1s to disconnect TTY.
sh-5.0# ping -c 2 8.8.8.8
PING 8.8.8.8 (8.8.8.8) 56(84) bytes of data.
64 bytes from 8.8.8.8: icmp_seq=1 ttl=56 time=34.9 ms
64 bytes from 8.8.8.8: icmp_seq=2 ttl=56 time=34.9 ms
--- 8.8.8.8 ping statistics ---
2 packets transmitted, 2 received, 0% packet loss, time 3ms
rtt min/avg/max/mdev = 34.915/34.925/34.936/0.187 ms
sh-5.0# ping -c 2 1.1.1.1
PING 1.1.1.1 (1.1.1.1) 56(84) bytes of data.
ping: sendmsg: Operation not permitted
ping: sendmsg: Operation not permitted
^C
--- 1.1.1.1 ping statistics ---
2 packets transmitted, 0 received, 100% packet loss, time 52ms
sh-5.0#
How could I see that?
# bpftool prog | tail -6
4179: cgroup_skb tag 780fbe87f94e0e9e gpl
loaded_at 2019-07-29T16:09:05-0300 uid 0
xlated 176B jited 127B memlock 4096B map_ids 2893
4180: cgroup_skb tag a1b93ab7d85e4bae gpl
loaded_at 2019-07-29T16:09:05-0300 uid 0
xlated 176B jited 127B memlock 4096B map_ids 2893
# bpftool map | tail -2
2893: lpm_trie flags 0x1
key 8B value 8B max_entries 1 memlock 4096B
#
Map dump
# bpftool map dump id 2893
key: 20 00 00 00 01 01 01 01 value: 02 00 00 00 00 00 00 00
Found 1 element
#
/* Key of an a BPF_MAP_TYPE_LPM_TRIE entry */
struct bpf_lpm_trie_key {
__u32 prefixlen; /* up to 32 for AF_INET, 128 for AF_INET6 */
__u8 data[0]; /* Arbitrary size */
};
eBPF bytecode
# bpftool prog dump xlated tag a1b93ab7d85e4bae
0: (bf) r6 = r1
1: (69) r7 = *(u16 *)(r6 +176)
2: (b4) w8 = 0
3: (55) if r7 != 0x8 goto pc+14
4: (bf) r1 = r6
5: (b4) w2 = 12
6: (bf) r3 = r10
7: (07) r3 += -4
8: (b4) w4 = 4
9: (85) call bpf_skb_load_bytes#6588432
10: (18) r1 = map[id:2893]
12: (bf) r2 = r10
13: (07) r2 += -8
14: (62) *(u32 *)(r2 +0) = 32
15: (85) call trie_lookup_elem#108128
16: (15) if r0 == 0x0 goto pc+1
17: (44) w8 |= 2
18: (b7) r0 = 1
19: (55) if r8 != 0x2 goto pc+1
20: (b7) r0 = 0
21: (95) exit
#
eBPF JITed code
# bpftool prog dump jit tag a1b93ab7d85e4bae
0xffffffffc0b1c1dc:
0: push %rbp
1: mov %rsp,%rbp
4: sub $0x8,%rsp
b: push %rbx
c: push %r13
e: push %r14
10: push %r15
12: pushq $0x0
14: mov %rdi,%rbx
17: movzwq 0xb0(%rbx),%r13
1f: xor %r14d,%r14d
22: cmp $0x8,%r13
26: jne 0x0000000000000068
28: mov %rbx,%rdi
2b: mov $0xc,%esi
30: mov %rbp,%rdx
33: add $0xfffffffffffffffc,%rdx
37: mov $0x4,%ecx
#
eBPF JITed code 2
3c: callq 0xffffffffcbd1efe4
41: movabs $0xffff8d98eb1edd00,%rdi
4b: mov %rbp,%rsi
4e: add $0xfffffffffffffff8,%rsi
52: movl $0x20,0x0(%rsi)
59: callq 0xffffffffcb6f0e34
5e: cmp $0x0,%rax
62: je 0x0000000000000068
64: or $0x2,%r14d
68: mov $0x1,%eax
6d: cmp $0x2,%r14
71: jne 0x0000000000000075
73: xor %eax,%eax
75: pop %rbx
76: pop %r15
78: pop %r14
7a: pop %r13
7c: pop %rbx
7d: leaveq
7e: retq
#
Facebook's Katran
- Network Load Balancer
- L4
- Before: IPVS
- Now: XDP + BPF
- Open Sourced
- https://code.fb.com/open-source/open-sourcing-katran-a-scalable-network-load-balancer/
BTF
- BPF Type Format
- Types, Source/line information
- Function names/signatures
- pahole generates kernel BTF from DWARF
- Uses libbpf to dedup types
- Makes it always available via sysfs
- Like CFI/.eh_frame for unwinding
- Documentation/bpf/btf.rst
sys_bpf & BTF
- libbpf notices .BTF ELF section
- parses it
- Uses BPF_BTF_LOAD command
- Kernel validates it
- Tools use BPF_OBJ_GET_INFO_BY_FD
Pretty Printing of BPF maps
- Retrieve map's BTF info with BPF_OBJ_GET_INFO_BY_FD
- bpftool
- pretty print each of the elements
BPF maps
struct syscall {
bool enabled;
u16 string_args_len[6];
};
bpf_map(syscalls, ARRAY, int, struct syscall, 512);
BPF maps: expanded
struct syscall {
bool enabled;
u16 string_args_len[6];
};
struct bpf_map __attribute__((section("maps"), used)) syscalls = {
.type = BPF_MAP_TYPE_ARRAY,
.key_size = sizeof(int),
.value_size = sizeof(struct syscall),
.max_entries = 512,
};
struct ____btf_map_syscalls {
int key;
struct syscall value;
};
struct ____btf_map_syscalls __attribute__((section(".maps.syscalls"), used))
____btf_map_syscalls = { }
perf trace
- BPF to collect syscall pointer payloads
- BPF maps for filtering
- And for telling what to collect
Putting a bpf prog + maps in place
# perf trace -a -e nanosleep sleep 100h
sleep/15623 nanosleep(0x7ffccede8990, NULL) ...
gpm/14747 nanosleep(0x7ffdd64c5790, 0x7ffdd64c5790) ...
podman/17195 nanosleep(0x7f144a3f0ba8, NULL) = 0
bpftool progs
# bpftool prog | grep tracepoint -A3
4214: tracepoint name sys_enter tag f173133dc38ccf87 gpl
loaded_at 2019-07-29T16:43:06-0300 uid 0
xlated 1344B jited 813B memlock 4096B map_ids 2928,2927,2926,2925
btf_id 1619
4215: tracepoint name sys_exit tag c1bd85c092d6e4aa gpl
loaded_at 2019-07-29T16:43:06-0300 uid 0
xlated 256B jited 162B memlock 4096B map_ids 2927,2926
btf_id 1619
#
bpftool maps
# bpftool map | egrep perf_ -A20
2925: perf_event_array name __augmented_sys flags 0x0
key 4B value 4B max_entries 8 memlock 4096B
2926: array name syscalls flags 0x0
key 4B value 14B max_entries 512 memlock 12288B
btf_id 1619
2927: hash name pids_filtered flags 0x0
key 4B value 1B max_entries 64 memlock 8192B
btf_id 1619
2928: percpu_array name augmented_filen flags 0x0
key 4B value 4168B max_entries 1 memlock 36864B
btf_id 1619
#
Dumping contents of a map
# grep -w nanosleep /tmp/build/perf/arch/x86/include/generated/asm/syscalls_64.c
[35] = "nanosleep",
#
Dumping contents of a map
# bpftool map dump id 2926 | head -40 | tail -10
key: 1e 00 00 00 value: 00 00 00 00 00 00 00 00 00 00 00 00 00 00
key: 1f 00 00 00 value: 00 00 00 00 00 00 00 00 00 00 00 00 00 00
key: 20 00 00 00 value: 00 00 00 00 00 00 00 00 00 00 00 00 00 00
key: 21 00 00 00 value: 00 00 00 00 00 00 00 00 00 00 00 00 00 00
key: 22 00 00 00 value: 00 00 00 00 00 00 00 00 00 00 00 00 00 00
key: 23 00 00 00 value: 01 00 00 00 00 00 00 00 00 00 00 00 00 00
key: 24 00 00 00 value: 00 00 00 00 00 00 00 00 00 00 00 00 00 00
key: 25 00 00 00 value: 00 00 00 00 00 00 00 00 00 00 00 00 00 00
key: 26 00 00 00 value: 00 00 00 00 00 00 00 00 00 00 00 00 00 00
key: 27 00 00 00 value: 00 00 00 00 00 00 00 00 00 00 00 00 00 00
#
Dumping contents of a map with BTF
# bpftool map dump id 2926 | grep '"key": 35,' -B1 -A13
},{
"key": 35,
"value": {
"enabled": true,
"string_args_len": [0,0,0,0,0,0
]
}
},{
"key": 36,
"value": {
"enabled": false,
"string_args_len": [0,0,0,0,0,0
]
}
},{
#
bpftool map lookup
# bpftool map lookup id 2951 key 35
Error: key expected 4 bytes got 1
[root@quaco ~]# bpftool map lookup id 2951 key 35 00 00 00
{
"key": 35,
"value": {
"enabled": true,
"string_args_len": [0,0,0,0,0,0
]
}
}
#
Generating BTF info
- DaveM notices CTF in a Solaris kernel image
- Hands me an initial .h with the main definitions
- pahole gets refactored to support multiple formats
- CTF being the first DWARF companion
- 10 years later: BPF needs this
- BTF
- llvm generates it as well, directly
pahole BTF encoder
- Implemented by Martin Lau @ FB
- Starting from ctf_loader.c
- dwarf loader
- Reads DWARF tags
- Intermediate format
- BTF encoder uses it
- Inserts a new .BTF ELF section
Encoding
$ cat test.c
struct A {
char b;
int a;
};
int test(struct A *t)
{
return t->a;
}
$ gcc -g -c test.c
$ file test.o
test.o: ELF 64-bit LSB relocatable, x86-64, version 1 (SYSV), with debug_info, not stripped
$
DWARF ELF sections
$ readelf -SW test.o | grep \.debug
[Nr] Name Type Addr Off Size
[ 4] .debug_info PROGBITS 0000 051 94
[ 5] .rela.debug_info RELA 0000 408 d8
[ 6] .debug_abbrev PROGBITS 0000 0e5 84
[ 7] .debug_aranges PROGBITS 0000 169 30
[ 8] .rela.debug_aranges RELA 0000 4e0 30
[ 9] .debug_line PROGBITS 0000 199 40
[10] .rela.debug_line RELA 0000 510 18
[11] .debug_str PROGBITS 0000 1d9 69
$
pahole using DWARF
$ pahole test.o
struct A {
char b; /* 0 1 */
/* XXX 3 bytes hole, try to pack */
int a; /* 4 4 */
/* size: 8, cachelines: 1, members: 2 */
/* sum members: 5, holes: 1, sum holes: 3 */
/* last cacheline: 8 bytes */
};
$
pahole encoding BTF
$ pahole -JV test.o
File test.o:
[1] STRUCT A kind_flag=0 size=8 vlen=2
b type_id=2 bits_offset=0
a type_id=3 bits_offset=32
[2] INT char size=1 bit_offset=0 nr_bits=8 encoding=(none)
[3] INT int size=4 bit_offset=0 nr_bits=32 encoding=SIGNED
[4] PTR (anon) type_id=1
$
BTF ELF section
$ readelf -SW test.o | grep \.BTF
[Nr] Name Type Addr Off Size
[19] .BTF PROGBITS 0000 619 7f
$
clang 8 generating BTF
$ clang -target bpf -g -c test.c
$ file test.o
test.o: ELF 64-bit LSB relocatable, eBPF, version 1 (SYSV), with debug_info, not stripped
$ readelf -SW test.o | grep \.BTF
[Nr] Name Type Addr Off Size
[ 8] .BTF PROGBITS 0000 23d dd
[ 9] .BTF.ext PROGBITS 0000 31a 68
[10] .rel.BTF.ext REL 0000 670 40
$
pahole BTF loader
- Reads BTF tags
- Intermediate format
- pretty prints
pahole decoding BTF
$ pahole -F btf test.o
struct A {
char b; /* 0 1 */
/* XXX 3 bytes hole, try to pack */
int a; /* 4 4 */
/* size: 8, cachelines: 1, members: 2 */
/* sum members: 5, holes: 1, sum holes: 3 */
/* last cacheline: 8 bytes */
};
$
kernel loading BTF
- libbpf notices ____btf_map_MAP_NAME
- In a ".maps." prefixed ELF section
- Collects that BTF data
- sys_bpf(fd, BPF_BTF_LOAD, btf_data)
btf_map(name)
#define bpf_map(name, _type, type_key, type_val, _max_entries) \
struct bpf_map SEC("maps") name = { \
.type = BPF_MAP_TYPE_##_type, \
.key_size = sizeof(type_key), \
.value_size = sizeof(type_val), \
.max_entries = _max_entries, \
}; \
struct ____btf_map_##name { \
type_key key; \
type_val value; \
}; \
struct ____btf_map_##name __attribute__((section(".maps." #name), used)) \
____btf_map_##name = { }
kernel validates BTF
- Validates header
- BTF_MAGIC
- BTF_VERSION
- flags
Some validations performed
$ grep btf_verifier_log kernel/bpf/btf.c
btf_verifier_log(env, "Exceeded max num of types");
btf_verifier_log_type(env, t, "nr_bits exceeds %zu",
btf_verifier_log_type(env, t, "nr_bits exceeds type_size");
btf_verifier_log_type(env, t, "Unsupported encoding");
btf_verifier_log_type(env, t, "Invalid type_id");
btf_verifier_log_type(env, t, "Invalid name");
btf_verifier_log_type(env, t, "Invalid btf_info kind_flag");
btf_verifier_log_type(env, t, "Expected size:%zu",
btf_verifier_log_type(env, t, "Loop detected");
btf_verifier_log(env, "Unaligned type_off");
btf_verifier_log(env, "No type found");
btf_verifier_log(env, "String section is not at the end");
btf_verifier_log(env, "Invalid string section");
btf_verifier_log(env, "Section overlap found");
btf_verifier_log(env, "Total section length too long");
btf_verifier_log(env, "Unsupported section found");
btf_verifier_log(env, "hdr_len not found");
btf_verifier_log(env, "btf_header not found");
btf_verifier_log(env, "Unsupported btf_header");
btf_verifier_log(env, "Invalid magic");
btf_verifier_log(env, "Unsupported version");
btf_verifier_log(env, "Unsupported flags");
$
kernel validating BTF (excerpts)
# perf ftrace -G '*btf*' perf trace -e *sleep sleep 1
7) | bpf_btf_load() {
7) | capable() {
7) 1.527 us | }
7) | btf_new_fd() {
7) 0.101 us | btf_sec_info_cmp();
7) | btf_struct_check_meta() {
7) 0.135 us | btf_name_valid_identifier.isra.12();
7) 0.109 us | __btf_verifier_log_type();
7) 0.107 us | btf_name_valid_identifier.isra.12();
7) 0.108 us | btf_verifier_log_member();
7) 3.642 us | }
7) | btf_int_check_meta() {
7) 0.100 us | __btf_verifier_log_type();
7) 0.315 us | }
7) | btf_ref_type_check_meta()
7) + 49.743 us | }
btfdiff
- pahole -F btf file.o
- pahole -F dwarf --flat_arrays file.o
- diff them
- Should produce the same results
- Regression tests
fullcircle
- pfunct --compile file.o
- Using BTF or DWARF
- Build resulting file
- codiff debug info in both
- Should match
- Again: regression tests
Encoding multiple CUs
- Compile Unit
- Encode just one .BTF section
- For multiple object files
- Like vmlinux
- Bump the IDs of types
- To make them unique
- Deduplicate later
Deduplication
- Done by Andrii @ FB
- In libbpf
- Huge reduction of BTF type info wrt DWARF
- Algorithm described in blog post
- URL in last slide
vmlinux BTF in kbuild
- CONFIG_DEBUG_INFO_BTF
- Uses pahole >= v1.13
- Encoding BTF from DWARF
- DWARF: ~195 MiB
- BTF dedup'ed: ~2 MiB
- Does it at the end
pahole
- BTF breathes new life
- The CTF work finally paid off
- Helps validate BTF encoding
- Generates compilable code
- Matching original
- DWARF5 DW_AT_alignment
- __attribute__((__packed__)) detection
- Much faster with BTF
__attribute__((__aligned__(N)))
$ pahole -C task_struct ../build/v5.1-rc4+/vmlinux | tail
/* --- cacheline 103 boundary (6592 bytes) --- */
refcount_t stack_refcount; /* 6600 4 */
/* XXX 4 bytes hole, try to pack */
void * security; /* 6608 8 */
/* XXX 40 bytes hole, try to pack */
/* --- cacheline 104 boundary (6656 bytes) --- */
struct thread_struct thread __attribute__((__aligned__(64))); /* 6656 4352 */
/* size: 11008, cachelines: 172, members: 207 */
/* sum members: 10902, holes: 16, sum holes: 98 */
/* sum bitfield members: 10 bits, bit holes: 2, sum bit holes: 54 bits */
/* paddings: 3, sum paddings: 14 */
/* forced alignments: 6, forced holes: 1, sum forced holes: 40 */
} __attribute__((__aligned__(64)));
CO-RE - Compile once, run everywhere
- Use BTF in BPF programs
- And the vmlinux .BTF section
- To figure out if ABI changed
- New clang feat: relocation field offset records
- __builtin_preserve_access_index(x)
- Move operands while loading
- To new offset or refuse loading
- https://lkml.kernel.org/r/20190730195408.670063-1-andriin@fb.com
Further BTF uses
- BPF spinlocks
- socket local storage
- Will allow ss to dump local storage
What to do with that
- Annotation using binutils' libopcodes
- Special handling of BPF spinlocks in annotation
- Ditto for other features using BTF
perf top
perf annotate
nospectre_v1 + nospectre_v2
perf report investigating XDP
32.46% i40e_clean_rx_irq
28.58% bpf_prog_7372f554109f565f
6.30% xdp_do_redirect
3.97% i40e_napi_poll
3.74% i40e_xmit_xdp_ring
2.50% dma_direct_sync_single_for_cpu
2.11% dev_map_enqueue
2.04% dma_direct_unmap_page
1.86% default_idle
1.73% dma_direct_map_page
1.68% i40e_xdp_xmit
1.57% __xdp_return
1.33% i40e_alloc_rx_buffers
1.18% __dev_map_lookup_elem
1.07% page_frag_free
0.89% percpu_array_map_lookup_elem
BPF loader gotchas
- bpf_prog_7372f554109f565f
- Should use bpf.attr.name
- In addition to BPF tag (7372f554109f565f)
- bpf_prog_f173133dc38ccf87_sys_enter
perf annotate
# perf annotate --stdio2 bpf_prog_819967866022f1e1_sys_enter
Samples: 40 of event 'cycles', 4 kHz, Ev.count (approx.): 12152834, [pcnt: local period]
bpf_prog_819967866022f1e1_sys_enter() bpf_prog_819967866022f1e1_sys_enter
69.13 push %rbp
mov %rsp,%rbp
sub $0x170,%rsp
sub $0x28,%rbp
19.07 mov %rbx,0x0(%rbp)
2.08 mov %r13,0x8(%rbp)
mov %r14,0x10(%rbp)
mov %r15,0x18(%rbp)
xor %eax,%eax
mov %rax,0x20(%rbp)
mov %rdi,%rbx
→ callq *ffffffffd356dda0
3.03 mov %eax,-0x148(%rbp)
mov %rbp,%rsi
add $0xfffffffffffffeb8,%rsi
movabs $0xffff9d5b7133a400,%rdi
1.17 → callq *ffffffffd356f050
...
bpf prog dump bytecode
# bpftool prog dump xlated id 4301
int sys_enter_connect(struct syscall_enter_args * args):
; int sys_enter_connect(struct syscall_enter_args *args)
0: (bf) r6 = r1
1: (b7) r1 = 0
; int key = 0;
2: (63) *(u32 *)(r10 -4) = r1
3: (bf) r2 = r10
;
4: (07) r2 += -4
; struct augmented_args_payload *augmented_args = bpf_map_lookup_elem(&augmented_args_tmp, &key);
5: (18) r1 = map[id:2992]
7: (85) call percpu_array_map_lookup_elem#98896
8: (bf) r7 = r0
9: (b7) r0 = 1
; if (augmented_args == NULL)
10: (15) if r7 == 0x0 goto pc+20
;
11: (61) r1 = *(u32 *)(r6 +32)
12: (b7) r8 = 128
; if (socklen > sizeof(augmented_args->saddr))
13: (25) if r1 > 0x80 goto pc+1
14: (bf) r8 = r1
bpf prog dump bytecode
; const void *sockaddr_arg = (const void *)args->args[1];
15: (79) r3 = *(u64 *)(r6 +24)
; probe_read(&augmented_args->saddr, socklen, sockaddr_arg);
16: (bf) r1 = r7
17: (07) r1 += 64
; probe_read(&augmented_args->saddr, socklen, sockaddr_arg);
18: (bf) r2 = r8
19: (85) call bpf_probe_read#-49200
; return perf_event_output(args, &__augmented_syscalls__, BPF_F_CURRENT_CPU, augmented_args, len + socklen);
20: (07) r8 += 64
; return perf_event_output(args, &__augmented_syscalls__, BPF_F_CURRENT_CPU, augmented_args, len + socklen);
21: (67) r8 <<= 32
22: (77) r8 >>= 32
; return perf_event_output(args, &__augmented_syscalls__, BPF_F_CURRENT_CPU, augmented_args, len + socklen);
23: (bf) r1 = r6
24: (18) r2 = map[id:2987]
26: (18) r3 = 0xffffffff
28: (bf) r4 = r7
29: (bf) r5 = r8
30: (85) call bpf_perf_event_output_tp#-47264
; }
31: (95) exit
#