perf workflow
- record + report
- perf.data
- cross arch analysis
- live: top, annotate, trace
perf record
# perf record -a -e syscalls:sys_enter_futex sleep 10
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 1.645 MB perf.data (242 samples) ]
#
# ls -lah perf.data
-rw-------. 1 root root 1.7M Jan 18 11:38 perf.data
#
perf evlist
# perf evlist
syscalls:sys_enter_futex
# Tip: use 'perf evlist --trace-fields' to show fields for tracepoint events
#
perf evlist
# perf evlist --trace-fields
syscalls:sys_enter_futex: trace_fields: __syscall_nr,uaddr,op,val,utime,uaddr2,val3
#
perf script
# perf script
named 7012 1 09.524377 sys_enter_futex uaddr 0x7fea293940cc op 0x80 val 0 utime 0x0 uaddr2 0x1 val3 0x0
CPU0/KVM 6767 1 11.177722 sys_enter_futex uaddr 0x55a101643560 op 0x80 val 2 utime 0x0 uaddr2 0x55a101643560 val3 0x8
qemu-x86 6721 2 11.177766 sys_enter_futex uaddr 0x55a101643560 op 0x81 val 1 utime 0x7fc45807d4a0 uaddr2 0x0 val3 0x7fc458085f10
CPU0/KVM 6767 1 11.177828 sys_enter_futex uaddr 0x55a101643560 op 0x81 val 1 utime 0x8 uaddr2 0x0 val3 0xffffffff
CPU0/KVM 6767 0 12.141761 sys_enter_futex uaddr 0x55a101643560 op 0x80 val 2 utime 0x0 uaddr2 0x55a101643560 val3 0x2
qemu-x86 6721 2 12.141766 sys_enter_futex uaddr 0x55a101643560 op 0x81 val 1 utime 0x7fc45807d4a0 uaddr2 0x0 val3 0x7fc458085f10
targets
- Same as for 'record', 'top'
- -a, --all-cpus
- -C, --cpu cpu-list
- -p, --pid pid-list
- -t, --tid tid-list
- -u, --uid user>
system wide stracing
# perf trace -e futex --duration 1
200.139 (2999.515 ms) virt-manager/3384 futex(uaddr: 0x7fb9280039d0, op: WAIT_BITSET|PRIV|CLKRT, val3: MATCH_ANY) = 0
3202.723 (3000.439 ms) virt-manager/3384 futex(uaddr: 0x7fb928001950, op: WAIT_BITSET|PRIV|CLKRT, val3: MATCH_ANY) = 0
196.247 (6009.648 ms) libvirtd/1084 futex(uaddr: 0x5573466d9f60, op: WAIT|PRIV) = 0
2029.069 (4999.934 ms) docker-contain/3259 futex(uaddr: 0xd7a718, utime: 0xc42001fed8) = -1 ETIMEDOUT Connection timed out
2029.136 (4999.990 ms) docker-contain/2912 futex(uaddr: 0xc420027510) = 0
2029.166 (4999.912 ms) docker-contain/2895 futex(uaddr: 0xd7acf8, utime: 0x7fcc5761adb8) = 0
2029.156 (4999.940 ms) docker-contain/2898 futex(uaddr: 0xc420058110) = 0
6207.427 (2999.572 ms) virt-manager/3384 futex(uaddr: 0x7fb928001f80, op: WAIT_BITSET|PRIV|CLKRT, val3: MATCH_ANY) = 0
^C#
system wide stracing
- Uses tracepoint filter to exclude itself
- --filter-pids
- To avoid feedback loops
- Difficult in X session
- Combine --filter-pids + -e \!syscalls
- ssh + perf trace
user stracing
- Enumerates threads owned by user
- Test: ping + named
# trace --summary -u named -e *msg,futex sleep 2
Summary of events:
named (26671), 8 events, 9.5%
syscall calls total min avg max stddev
(msec) (msec) (msec) (msec) (%)
--------------- ----- ------- ------ ------- ------- ------
futex 4 0.551 0.000 0.138 0.537 96.59%
named (26672), 10 events, 11.9%
sendmsg 1 0.083 0.083 0.083 0.083 0.00%
recvmsg 2 0.063 0.020 0.032 0.043 37.51%
futex 2 0.004 0.000 0.002 0.004 100.00%
named (26673), 22 events, 26.2%
futex 4 821.565 0.000 205.391 821.555 100.00%
sendmsg 2 0.113 0.043 0.056 0.070 24.05%
recvmsg 5 0.053 0.006 0.011 0.017 21.22%
named (26674), 18 events, 21.4%
futex 7 821.973 0.000 117.425 821.588 99.95%
recvmsg 2 0.013 0.006 0.007 0.007 11.32%
named (26676), 26 events, 31.0%
futex 13 0.287 0.005 0.022 0.064 22.43%
#
pid, no children
# trace -p 3384 --no-inherit -e futex --duration 1 --max-stack 8
2.538 (3001.384 ms) virt-manager/3384 futex(uaddr: 0x7fb928005ee0, op: WAIT_BITSET|PRIV|CLKRT, val3: MATCH_ANY) = 0
do_futex_wait.constprop.1 (/usr/lib64/libpthread-2.25.so)
__new_sem_wait_slow.constprop.0 (/usr/lib64/libpthread-2.25.so)
PyThread_acquire_lock (/usr/lib64/libpython2.7.so.1.0)
[0xffff8046a2c02fd1] (/usr/lib64/libpython2.7.so.1.0)
PyEval_EvalFrameEx (/usr/lib64/libpython2.7.so.1.0)
PyEval_EvalCodeEx (/usr/lib64/libpython2.7.so.1.0)
PyEval_EvalFrameEx (/usr/lib64/libpython2.7.so.1.0)
PyEval_EvalCodeEx (/usr/lib64/libpython2.7.so.1.0)
^C[root@jouet ~]#
strace overhead: baseline
# dd if=/dev/zero of=/dev/null bs=1 count=500k
512000+0 records in
512000+0 records out
512000 bytes (512 kB, 500 KiB) copied, 0.303143 s, 1.7 MB/s
#
strace overhead
# strace -eaccept dd if=/dev/zero of=/dev/null bs=1 count=500k
512000+0 records in
512000+0 records out
512000 bytes (512 kB, 500 KiB) copied, 12.2114 s, 41.9 kB/s
+++ exited with 0 +++
#
strace
- 40x times slower
- In 2015 it was 173x slower
- Worst case, dd emits syscalls as fast as it can
- Brendan Gregg's article mentions 442x slow down, same example (2014)
- ptrace pauses the application twice
- syscall enter and exit
- www.brendangregg.com/blog/2014-05-11/strace-wow-much-syscall.html
perf trace overhead
# perf trace -e accept dd if=/dev/zero of=/dev/null bs=1 count=500k
512000+0 records in
512000+0 records out
512000 bytes (512 kB, 500 KiB) copied, 0.412906 s, 1.2 MB/s
#
perf trace
- 1.36 times slower
- Again worst case, dd emits syscalls as fast as it can
- No workload stop at syscall enter/exit
- Collects info at syscall enter and exit
- Further optimizations possible
- Currently uses raw_syscalls:sys_{enter,exit} tracepoints
- But in this example could have used just syscalls:sys_{enter,exit}_accept
Perf recording
- Ring buffer
- No context switch
- Asyncrhonous
- May lose records
- But will state that
Perf metadata records
- In the ring buffer
- Together with PERF_RECORD_SAMPLE
- PERF_RECORD_COMM
- PERF_RECORD_MMAP
- PERF_RECORD_FORK
- PERF_RECORD_EXIT
- PERF_RECORD_LOST
- PERF_RECORD_CONTEXT_SWITCH
- Others
kernel source tree
- Faster support for new ABI
- New ABI added to kernel + perf tools
- Do not use headers directly
- Keep a copy
- Build time check of UAPI changes: check_headers.sh
kernel source tree
- Automatic creation of simple beautifiers
- Use to automatically create string tables
- For beautifying args
- Allow building the tools on older distros
- Where recent defines/structs/etc are not present
Automatic creation of string tables
- grep + sed on headers
- E.g.: tools/include/uapi/linux/prctl.h>
- tools/perf/trace/beauty/prctl_option.sh>
- Scripts can be used without args, produce table to stdout>
Automatic creation of string tables
$ cd tools/perf/trace/beauty/
$ ls -1 *.sh
arch_errno_names.sh*
drm_ioctl.sh*
kcmp_type.sh*
kvm_ioctl.sh*
madvise_behavior.sh*
perf_ioctl.sh*
pkey_alloc_access_rights.sh*
prctl_option.sh*
sndrv_ctl_ioctl.sh*
sndrv_pcm_ioctl.sh*
socket_ipproto.sh*
vhost_virtio_ioctl.sh*
$
Automatic creation of string tables
# head /tmp/build/perf/trace/beauty/generated/prctl_option_array.c
static const char *prctl_options[] = {
[1] = "SET_PDEATHSIG",
[2] = "GET_PDEATHSIG",
[3] = "GET_DUMPABLE",
[4] = "SET_DUMPABLE",
[5] = "GET_UNALIGN",
[6] = "SET_UNALIGN",
[7] = "GET_KEEPCAPS",
[8] = "SET_KEEPCAPS",
[9] = "GET_FPEMU",
...
#
Automatic creation of string tables
# trace -e prctl
27097.185 (0.007 ms) SSL Cert #196/23393 prctl(option: SET_NAME, arg2: 0x7efce67eb868) = 0
29741.170 (0.009 ms) StreamTrans #1/23394 prctl(option: SET_NAME, arg2: 0x7f46d51fff08) = 0
30086.642 (0.014 ms) IndexedDB #18/23395 prctl(option: SET_NAME, arg2: 0x7efce7747908) = 0
30151.534 (0.009 ms) IndexedDB #19/23396 prctl(option: SET_NAME, arg2: 0x7efce7d2c8e8) = 0
44595.727 (0.010 ms) systemd-udevd/23400 prctl(option: SET_PDEATHSIG, arg2: 15) = 0
Simple beautifiers
- Associated from field types and names
- pid_t shows COMM
- fd shows file name
syscall tracepoints
Field types
# cd /sys/kernel/debug/tracing/events
# cat syscalls/sys_enter_kill/format
name: sys_enter_kill
ID: 161
format:
field:pid_t pid; offset:16; size:8; signed:0;
field:int sig; offset:24; size:8; signed:0;
#
syscalls operating on pids
# cd /sys/kernel/debug/tracing/events/syscalls/
# grep pid_t sys_enter_*/format | head -5
sys_enter_getpgid/format: field:pid_t pid; offset:16; size:8;
sys_enter_getsid/format: field:pid_t pid; offset:16; size:8;
sys_enter_kcmp/format: field:pid_t pid1; offset:16; size:8;
sys_enter_kcmp/format: field:pid_t pid2; offset:24; size:8;
sys_enter_kill/format: field:pid_t pid; offset:16; size:8;
#
Augment what is provided by the tracepoints
# trace -a -e wait4,kill,prlimit64,kcmp sleep 30
1951.840 (3.789 ms) dhclient/17755 wait4(upid: -1, stat_addr: 0x7ffc2da5631c) ...
1955.629 (0.004 ms) nm-dhcp-helper/23015 prlimit64(resource: STACK, old_rlim: 0x7ffff632cc20) = 0
1951.840 (7.986 ms) dhclient/17755 ... [continued]: wait4()) = 23015 (nm-dhcp-helper)
1965.011 (0.002 ms) (spatcher)/23020 prlimit64(resource: NOFILE, new_rlim: 0x55e291bf6fd0) = 0
1965.471 (0.003 ms) systemd/1 kcmp(pid1: 1 (systemd), pid2: 1 (systemd),
idx1: 253<socket:[7330361]> idx2: 254<socket:[7663370]> = 1
1965.476 (0.002 ms) systemd/1 kcmp(pid1: 1 (systemd), pid2: 1 (systemd),
idx1: 252<socket:[7330360]> idx2: 254<socket:[7663370]> = 1
1974.334 (0.004 ms) nm-dispatcher/23020 prlimit64(resource: STACK, old_rlim: 0x7ffc2c307cc0) = 0
1992.604 (0.021 ms) nm-dispatcher/23020 wait4(upid: 23023 (nm-dispatcher),
stat_addr: 0x55cdc5602544, options: NOHANG) = 0
1997.427 (0.004 ms) 00-netreport/23023 prlimit64(resource: NPROC, old_rlim: 0x7ffc752a1010) = 0
Use same libraries, coding style
- rbtree, refcount_t, atomic_t, list.h, etc
- Facilitate hacking on tools/ by kernel folks
- Write kernel facility...
- ... Show how it is used in tools/
- Not just perf, objtool, etc
Specifying syscalls
$ trace -e nanosleep sleep 1
0.000 (1000.195ms) sleep/22896 nanosleep(rqtp: 0x7ffd03270f10) = 0
$
With backtraces!
$ trace -e nanosleep --call-graph dwarf sleep 1
0.000 (1000.143ms) sleep/22906 nanosleep(rqtp: 0x7ffe91fed9c0) = 0
__nanosleep_nocancel (/usr/lib64/libc-2.25.so)
rpl_nanosleep (/usr/bin/sleep)
xnanosleep (/usr/bin/sleep)
main (/usr/bin/sleep)
__libc_start_main (/usr/lib64/libc-2.25.so)
_start (/usr/bin/sleep)
$
callchains
- fp: cheap but not always available
- dwarf: expensive, but works
- dwarf: Collects user stack + registers, per sample
- To reduce the cost: use just in some events
- Reduce the stack sampled
Reducing stack sampled: drastically
$ trace -e nanosleep --call-graph dwarf,128 sleep 1
0.000 (1000.135ms) sleep/23073 nanosleep(rqtp: 0x7ffc90169e70) = 0
__nanosleep_nocancel (/usr/lib64/libc-2.25.so)
rpl_nanosleep (/usr/bin/sleep)
xnanosleep (/usr/bin/sleep)
[0] ([unknown])
$
Reducing stack sampled: 2
$ trace -e nanosleep --call-graph dwarf,256 sleep 1
0.000 (1000.192ms) sleep/23133 nanosleep(rqtp: 0x7fff7fb7e9e0) = 0
__nanosleep_nocancel (/usr/lib64/libc-2.25.so)
rpl_nanosleep (/usr/bin/sleep)
xnanosleep (/usr/bin/sleep)
main (/usr/bin/sleep)
__libc_start_main (/usr/lib64/libc-2.25.so)
[0] ([unknown])
$
Reducing stack sampled: rigthsized
$ trace -e nanosleep --call-graph dwarf,512 sleep 1
0.000 (1000.125ms) sleep/23146 nanosleep(rqtp: 0x7ffd896eea50) = 0
__nanosleep_nocancel (/usr/lib64/libc-2.25.so)
rpl_nanosleep (/usr/bin/sleep)
xnanosleep (/usr/bin/sleep)
main (/usr/bin/sleep)
__libc_start_main (/usr/lib64/libc-2.25.so)
_start (/usr/bin/sleep)
$
Measuring DWARF callgraphs cost
$ perf record -a --call dwarf sleep 5s
[ perf record: Woken up 30 times to write data ]
[ perf record: Captured and wrote 11.043 MB perf.data (1955 samples) ]
$ perf evlist -v
cycles:ppp: sample_freq: 4000,
sample_type: IP|TID|TIME|ADDR|CALLCHAIN|CPU|PERIOD|REGS_USER|STACK_USER|DATA_SRC,
mmap: 1, comm: 1, task: 1, precise_ip: 3, exclude_guest: 1,
exclude_callchain_user: 1, comm_exec: 1, sample_regs_user: 0xff0fff,
sample_stack_user: 8192
$
Measuring DWARF callgraphs cost
$ perf record -a --call dwarf,128 sleep 5s
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 4.033 MB perf.data (1821 samples) ]
$ perf evlist -v
cycles:ppp: ..., sample_stack_user: 128
$
using max-stack
- Use max-stack to reduce sample size and postprocessing cost
- attr.sample_max_stack tells kernel entries to sample
- Ask for callgraphs for just some of the events
- And mix syscalls with other events, say tracepoints
- Use wildcards to type less
$ trace -e *sleep -e sched:*wakeup,sched:*switch/max-stack=8/ sleep 1
1.195 (0.017ms) sleep/23575 nanosleep(rqtp: 0x7ffcf19a17d0) ...
1.211 ( ) sched:sched_switch:sleep:23575 [120] S ==> swapper/2:0 [120])
__schedule ([kernel])
__schedule ([kernel])
schedule ([kernel])
do_nanosleep ([kernel])
hrtimer_nanosleep ([kernel])
sys_nanosleep ([kernel])
do_syscall_64 ([kernel])
return_from_SYSCALL_64 ([kernel])
1001.369 ( ) d_wakeup:sleep:23575 [120] success=1 CPU:002)
1.195 (1000.226ms) sleep/23575 ... [continued]: nanosleep()) = 0
$
syscall args: pointers
# trace -e open touch /tmp/bla
0.000 (0.020ms) touch open(filename: 0x8992e37, flags: CLOEXEC) = 3
0.051 (0.012ms) touch open(filename: 0x8b96640, flags: CLOEXEC) = 3
0.440 (0.018ms) touch open(filename: 0x873cc70, flags: CLOEXEC) = 3
0.532 (0.311ms) touch open(filename: 0x66a9340,
flags: CREAT|NOCTTY|NONBLOCK|WRONLY,
mode: IRUGO|IWUGO) = 3
#
Copying pointer contents
- Use 'perf probe'
- Where it is copied from/to userspace
- Fragile, these places can change over time
- Later will use eBPF for that
Copying that data
# perf probe "vfs_getname=getname_flags:72 pathname=result->name:string"
Added new event:
probe:vfs_getname (on getname_flags:72 with pathname=result->name:string)
You can now use it in all perf tools, such as:
perf record -e probe:vfs_getname -aR sleep 1
#
Tracing syscalls + probe
# trace --no-syscalls -e syscalls:sys_enter_open*,probe:vfs_getname touch /etc/passwd
0.000 probe:vfs_getname:(ffffffff812799e3) pathname="/etc/ld.so.preload")
0.010 syscalls:sys_enter_openat:dfd: 0xffffffffffffff9c, filename: 0x7f19fa31a726, flags: 0x80000, mode: 0x0)
0.013 probe:vfs_getname:(ffffffff812799e3) pathname="/etc/ld.so.cache")
0.032 syscalls:sys_enter_openat:dfd: 0xffffffffffffff9c, filename: 0x7f19fa522ce0, flags: 0x80000, mode: 0x0)
0.034 probe:vfs_getname:(ffffffff812799e3) pathname="/lib64/libc.so.6")
0.258 syscalls:sys_enter_open:filename: 0x7f19fa0c44d0, flags: 0x80000, mode: 0x0)
0.261 probe:vfs_getname:(ffffffff812799e3) pathname="/usr/lib/locale/locale-archive")
0.304 syscalls:sys_enter_openat:dfd: 0xffffffffffffff9c, filename: 0x7ffe33a81379, flags: 0x941, mode: 0x1b6)
0.306 probe:vfs_getname:(ffffffff812799e3) pathname="/etc/passwd")
#
Use probe:vfs_getname when available
# trace --no-inherit -e open* touch /etc/passwd
0.054 (0.036 ms) openat(dfd: CWD, filename: /etc/ld.so.cache, flags: CLOEXEC) = 3
0.159 (0.051 ms) openat(dfd: CWD, filename: /lib64/libc.so.6, flags: CLOEXEC) = 3
1.188 (0.050 ms) open(filename: /usr/lib/locale/locale-archive, flags: CLOEXEC) = 3
1.390 (0.036 ms) openat(dfd: CWD, filename: /etc/passwd,
flags: CREAT|NOCTTY|NONBLOCK|WRONLY, mode: IRUGO|IWUGO) = 3
#
eBPF
- clang + llvm
- sys_bpf()
- Attach to kprobes, tracepoints, others
- Work as filters
- Should be used for filtering by syscall args
- Also to copy pointer payloads for beautification
- Contributed by Wang Nan @ Huawei
eBPF validator
- Checks Control Flow Graph
- No loops: program terminates
- No unreachable instructions
- No out-of-bounds jumps
- No out-of-bounds data access
- Stringent pointer arithmetic for !CAP_SYS_ADMIN
- No reading uninitialized registers
eBPF program types
- Dictates where it can be attached
- Which functions can be called
- If network packet data can be accessed directly
- Type of object passed as first arg to program
Some eBPF program types
BPF_PROG_TYPE_ +
- SOCKET_FILTER
- KPROBE
- should a kprobe fire or not
- TRACEPOINT
- Determine whether a tracepoint should fire
- XDP
- Network pkt filter run from driver rx path
eBPF Integration
# cat filter.c
#include <uapi/linux/bpf.h>
#define SEC(NAME) __attribute__((section(NAME), used))
SEC("func=hrtimer_nanosleep rqtp->tv_nsec")
int func(void *ctx, int err, long nsec)
{
return nsec > 1000;
}
char _license[] SEC("license") = "GPL";
int _version SEC("version") = LINUX_VERSION_CODE;
1000ns
# trace -e *sleep,filter.c usleep 1
0.000 (0.096ms) usleep/7978 nanosleep(rqtp: 0x7fff3ac3cb00) = 0
#
2000ns
# trace -e nanosleep,filter.c usleep 2
0.000 (0.012ms) usleep/7564 nanosleep(rqtp: 0x7ffec11bbd20) ...
0.012 ( ) perf_bpf_probe:func:(ffffffff9f10cb30) tv_nsec=2000)
0.000 (0.071ms) usleep/7564 ... [continued]: nanosleep()) = 0
#
With a backtrace
# trace -e nanosleep,filter.c/max-stack=5/ usleep 1001
0.000 (0.018ms) usleep/963 nanosleep(rqtp: 0x7ffdc6421410) ...
0.018 ( ) perf_bpf_probe:func:(ffffffff9f10cb30) tv_nsec=1001000)
hrtimer_nanosleep ([kernel])
SyS_nanosleep ([kernel])
do_syscall_64 ([kernel])
return_from_SYSCALL_64 ([kernel])
__GI_nanosleep (/usr/lib64/libc-2.25.so)
0.000 (1.095ms) usleep/963 ... [continued]: nanosleep()) = 0
[root@jouet bpf]#
eBPF
- Can attach to tracepoints
- And copy pointer contents
- To perf ring buffer
- For beautification
Hooking into a tracepoint
# cat sys_enter_openat.c
#include "bpf.h"
SEC("syscalls:sys_enter_openat")
int func(void *ctx)
{
char filename[256];
char *filename_ptr = *((char **)(ctx + 24));
int len = bpf_probe_read_str(filename, sizeof(filename), filename_ptr);
if (len > 0 && len < 256) {
perf_event_output(ctx, &__bpf_stdout__, BPF_F_CURRENT_CPU,
&filename, len & (256 - 1));
}
return 0;
}
#
- Offsets from tracefs
- No need for DWARF
- Not hardcoded in 'perf trace'
/*
* /sys/kernel/debug/tracing/events/syscalls/sys_enter_openat/format:
* ...
* field:const char * filename; offset:24; size:8; signed:0;
* ...
* ctx + 24 selects 'filename'
*/
Running it
# trace -e openat,sys_enter_openat.c touch /tmp/bla
LLVM: dumping sys_enter_openat.o
0.000 ( ): __bpf_stdout__:/etc/ld.so.cache....)
0.006 ( 0.005 ms): touch/17545 openat(dfd: CWD, filename: 0x6887726, flags: CLOEXEC) = 3
0.024 ( ): __bpf_stdout__:/lib64/libc.so.6....)
0.026 ( 0.005 ms): touch/17545 openat(dfd: CWD, filename: 0x6a8fce0, flags: CLOEXEC) = 3
0.282 ( ): __bpf_stdout__:/tmp/bla....)
0.283 ( 0.005 ms): touch/17545 openat(dfd: CWD, filename: 0x7cd0413c,
flags: CREAT|NOCTTY|NONBLOCK|WRONLY,
mode: IRUGO|IWUGO) = 3
Running it + vfs_getname
# trace -e openat,sys_enter_openat.c touch /tmp/bla
0.098 ( ): __bpf_stdout__:/etc/ld.so.cache....)
0.108 ( 0.033 ms): touch/24825 openat(dfd: CWD, filename: /etc/ld.so.cache, flags: CLOEXEC) = 3
0.217 ( ): __bpf_stdout__:/lib64/libc.so.6....)
0.222 ( 0.035 ms): touch/24825 openat(dfd: CWD, filename: /lib64/libc.so.6, flags: CLOEXEC) = 3
1.403 ( ): __bpf_stdout__:/tmp/bla....)
1.411 ( 0.038 ms): touch/24825 openat(dfd: CWD, filename: /tmp/bla,
flags: CREAT|NOCTTY|NONBLOCK|WRONLY, mode: IRUGO|IWUGO) = 3
#
Compile errors:
# trace -e sys_enter_open.c
sys_enter_open.c:14:20: error: expected ';' at end of declaration
char *filename_ptr *((char **)(ctx + 16));
^
;
1 error generated.
ERROR: unable to compile sys_enter_open.c
Hint: Check error message shown above.
Hint: You can also pre-compile it into .o using:
clang -target bpf -O2 -c sys_enter_open.c
with proper -I and -D options.
event syntax error: 'sys_enter_open.c'
\___ Failed to load sys_enter_open.c from source:
Error when compiling BPF scriptlet
(add -v to see detail)
#
Pre-compiled eBPF programs
# file sys_enter_openat.o
sys_enter_openat.o: ELF 64-bit LSB relocatable, eBPF, version 1 (SYSV), not stripped
# ls -la sys_enter_openat.o
-rw-r--r--. 1 root root 1112 Jan 28 12:23 sys_enter_openat.o
#
# trace -e openat,sys_enter_openat.o touch /tmp/bla
0.098 ( ): __bpf_stdout__:/etc/ld.so.cache....)
0.108 ( 0.033 ms): touch/24825 openat(dfd: CWD, filename: /etc/ld.so.cache, flags: CLOEXEC) = 3
0.217 ( ): __bpf_stdout__:/lib64/libc.so.6....)
0.222 ( 0.035 ms): touch/24825 openat(dfd: CWD, filename: /lib64/libc.so.6, flags: CLOEXEC) = 3
1.403 ( ): __bpf_stdout__:/tmp/bla....)
1.411 ( 0.038 ms): touch/24825 openat(dfd: CWD, filename: /tmp/bla,
flags: CREAT|NOCTTY|NONBLOCK|WRONLY, mode: IRUGO|IWUGO) = 3
#
Augmented Syscalls
- Replace raw_syscalls:sys_enter
- eBPF equivalent
- Appends pointer contents to existing payload
- Existing beautifiers changed to use it
- When available, continue working without it
- No need for probe:vfs_getname
- Filters
- Extend the kernel without writing "kernel" code
Augmented Syscalls
- bpf-output event
- bpf_perf_event_output
- bpf_probe_read
- bpf_probe_read_str
Augmenting syscalls
struct bpf_map SEC("maps") __augmented__ = {
.type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
}
int syscall_enter(syscall)(struct syscall_enter_open_args *args)
{
struct augmented_open_args augmented;
probe_read(&augmented.args, sizeof(augmented.args), args);
str_len = probe_read_str(&augmented.filename.value,
sizeof(augmented.filename.value),
args->filename_ptr);
perf_event_output(args, &__augmented__, BPF_F_CURRENT_CPU,
&augmented, sizeof(augmented));
return 0;
}
Augmented Syscalls
# cd tools/perf/examples/bpf/
# perf trace -e augmented_syscalls.c cat hello.c
#include <stdio.h>
int syscall_enter(openat)(void *args)
{
puts("Hello, world\n");
return 0;
}
license(GPL);
0.000 cat/31285 openat(dfd: CWD, filename: /etc/ld.so.cache, flags: CLOEXEC)
0.029 cat/31285 openat(dfd: CWD, filename: /lib64/libc.so.6, flags: CLOEXEC)
0.250 cat/31285 open(filename: /usr/lib/locale/locale-archive, flags: CLOEXEC)
0.300 cat/31285 openat(dfd: CWD, filename: hello.c)
#
etcsnoop
char etc[6] = "/etc/";
char filename[32] = "/etc/";
int syscall_enter(open)(struct syscall_enter_open_args *args)
{
probe_read_str(filename, sizeof(filename), args.filename_ptr);
if (__builtin_memcmp(filename, etc, 4) != 0)
return 0;
perf_event_output(...);
return 0;
}
etcsnoop
# perf trace -e etcsnoop.c
422.181 gsd-color/2408 openat(dfd: CWD, filename: /etc/localtime)
5015.914 DNS Res~er #23/22494 open(filename: /etc/hosts, flags: CLOEXEC)
5016.397 DNS Res~er #24/25634 open(filename: /etc/hosts, flags: CLOEXEC)
11140.507 sed/31860 openat(dfd: CWD, filename: /etc/ld.so.cache, flags: CLOEXEC)
11671.005 DNS Res~er #24/25634 open(filename: /etc/hosts, flags: CLOEXEC)
13813.659 sed/31866 openat(dfd: CWD, filename: /etc/ld.so.cache, flags: CLOEXEC)
15070.237 vim/31870 openat(dfd: CWD, filename: /etc/ld.so.cache, flags: CLOEXEC)
15076.928 bash/31870 openat(dfd: CWD, filename: /etc/vimrc)
15101.601 bash/31870 open(filename: /etc/nsswitch.conf, flags: CLOEXEC)
15101.662 bash/31870 openat(dfd: CWD, filename: /etc/ld.so.cache, flags: CLOEXEC)
15101.852 bash/31870 open(filename: /etc/passwd, flags: CLOEXEC)
17583.675 DNS Res~er #23/22494 open(filename: /etc/hosts, flags: CLOEXEC)
^C#