Always Present Type Information
Thanks to BPF: BTF
Arnaldo Carvalho de Melo
acme@redhat.com
Red Hat Inc.
What is this about?
- BPF Type Format
- Always present
- BPF uses it
- A fraction of DWARF size
- A fraction of DWARF contents
- Other tools use it too
- In-kernel snprintf augmented by BTF
- Pretty printing raw data using BTF
Where?
- /sys/kernel/btf/vmlinux
- Modules too: split BTF
What is in there?
- All kernel types
- Kernel ABI: set in stone
- Kernel Internals: always in flux
Let's see
$ pahole rwlock_t
typedef struct {
arch_rwlock_t raw_lock; /* 0 8 */
/* size: 8, cachelines: 1, members: 1 */
/* last cacheline: 8 bytes */
} rwlock_t;
$
Pahole?
- It started as a tool for looking at struct holes
- Let's call it by another name then
Aliases
$ alias typedef=pahole
$ alias struct=pahole
$ alias union=pahole
$ alias enum=pahole
$ typedef rwlock_t
typedef struct {
arch_rwlock_t raw_lock; /* 0 8 */
/* size: 8, cachelines: 1, members: 1 */
/* last cacheline: 8 bytes */
} rwlock_t;
$ struct list_head
struct list_head {
struct list_head * next; /* 0 8 */
struct list_head * prev; /* 8 8 */
/* size: 16, cachelines: 1, members: 2 */
/* last cacheline: 16 bytes */
};
$
Enumerations
$ enum perf_event_type
enum perf_event_type {
PERF_RECORD_MMAP = 1,
PERF_RECORD_LOST = 2,
PERF_RECORD_COMM = 3,
PERF_RECORD_EXIT = 4,
PERF_RECORD_THROTTLE = 5,
PERF_RECORD_UNTHROTTLE = 6,
PERF_RECORD_FORK = 7,
PERF_RECORD_READ = 8,
PERF_RECORD_SAMPLE = 9,
PERF_RECORD_MMAP2 = 10,
PERF_RECORD_AUX = 11,
PERF_RECORD_ITRACE_START = 12,
PERF_RECORD_LOST_SAMPLES = 13,
PERF_RECORD_SWITCH = 14,
PERF_RECORD_SWITCH_CPU_WIDE = 15,
PERF_RECORD_NAMESPACES = 16,
PERF_RECORD_KSYMBOL = 17,
PERF_RECORD_BPF_EVENT = 18,
PERF_RECORD_CGROUP = 19,
PERF_RECORD_TEXT_POKE = 20,
PERF_RECORD_MAX = 21,
};
$
All kernel types
$ typedef --hex --expand_types rwlock_t
typedef struct {
/* typedef arch_rwlock_t */ struct qrwlock {
union {
/* typedef atomic_t */ struct {
int counter; /* 0 0x4 */
} cnts; /* 0 0x4 */
struct {
/* typedef u8 -> __u8 */ unsigned char wlocked; /* 0 0x1 */
/* typedef u8 -> __u8 */ unsigned char __lstate[3]; /* 0x1 0x3 */
}; /* 0 0x4 */
}; /* 0 0x4 */
/* typedef arch_spinlock_t */ struct qspinlock {
union {
/* typedef atomic_t */ struct {
int counter; /* 0x4 0x4 */
} val; /* 0x4 0x4 */
struct {
/* typedef u8 -> __u8 */ unsigned char locked; /* 0x4 0x1 */
/* typedef u8 -> __u8 */ unsigned char pending; /* 0x5 0x1 */
}; /* 0x4 0x2 */
struct {
/* typedef u16 -> __u16 */ short unsigned int locked_pending; /* 0x4 0x2 */
/* typedef u16 -> __u16 */ short unsigned int tail; /* 0x6 0x2 */
}; /* 0x4 0x4 */
}; /* 0x4 0x4 */
} wait_lock; /* 0x4 0x4 */
} raw_lock; /* 0 0x8 */
} rwlock_t;
$
Split BTF
- For kernel modules
- Do not duplicate types
- module BTF refers to kernel's
- Since pahole v1.19
- Since kernel v5.11
Kconfig variables
$ grep BTF ~/git/build/v5.11.0-rc6+.clang/.config
CONFIG_VIDEO_SONY_BTF_MPX=m
CONFIG_DEBUG_INFO_BTF=y
CONFIG_PAHOLE_HAS_SPLIT_BTF=y
CONFIG_DEBUG_INFO_BTF_MODULES=y
$
Lots more files
$ uname -r
5.11.0-rc6.clang+
$ cd /sys/kernel/btf
$ ls -1 | wc -l
136
$ ls -1 | head
ac97_bus
acpi_pad
asus_wmi
bridge
cec
coretemp
crc32c_intel
crc32_pclmul
crct10dif_pclmul
dca
$
What is in there?
$ pahole acpi_pad
libbpf: Invalid BTF string section
pahole: file 'acpi_pad' has no supported type information.
$ pahole --btf_base=vmlinux acpi_pad | head -11
struct gate_struct {
u16 offset_low; /* 0 2 */
u16 segment; /* 2 2 */
struct idt_bits bits; /* 4 2 */
u16 offset_middle; /* 6 2 */
u32 offset_high; /* 8 4 */
u32 reserved; /* 12 4 */
/* size: 16, cachelines: 1, members: 6 */
/* last cacheline: 16 bytes */
};
$
A shortcut
$ pahole /sys/kernel/btf/acpi_pad | head -11
struct gate_struct {
u16 offset_low; /* 0 2 */
u16 segment; /* 2 2 */
struct idt_bits bits; /* 4 2 */
u16 offset_middle; /* 6 2 */
u32 offset_high; /* 8 4 */
u32 reserved; /* 12 4 */
/* size: 16, cachelines: 1, members: 6 */
/* last cacheline: 16 bytes */
};
$
Can I do more?
- Using plain 'struct foo' is powerful
- For developers
- Reconstruct types
- No need for kernel headers
- Matches the running kernel
- Some more?
A request from a coworker
- Joe Lawrence
- Hey, pahole knows about types
- We need to extract module versioning info
- In shell scripts
- Related to kernel live patching
- Can you help?
Pretty printing raw data
- Use type information
- Format stdin
- Arrays
- pahole v1.18
modversion_info
$ pahole -C modversion_info drivers/scsi/sg.ko
struct modversion_info {
long unsigned int crc; /* 0 8 */
char name[56]; /* 8 56 */
/* size: 64, cachelines: 1, members: 2 */
};
$
pretty print it
$ objcopy -O binary --only-section=__versions drivers/scsi/sg.ko versions
$ ls -la versions
-rw-rw-r--. 1 acme acme 7616 Feb 18 09:39 versions
$ pahole --count 3 -C modversion_info drivers/scsi/sg.ko < versions
{
.crc = 148553092,
.name = "module_layout",
},
{
.crc = 1172595067,
.name = "no_llseek",
},
{
.crc = 2722082444,
.name = "param_ops_int",
},
$
Another example: ELF header
$ pahole elf64_hdr
struct elf64_hdr {
unsigned char e_ident[16]; /* 0 16 */
Elf64_Half e_type; /* 16 2 */
Elf64_Half e_machine; /* 18 2 */
Elf64_Word e_version; /* 20 4 */
Elf64_Addr e_entry; /* 24 8 */
Elf64_Off e_phoff; /* 32 8 */
Elf64_Off e_shoff; /* 40 8 */
Elf64_Word e_flags; /* 48 4 */
Elf64_Half e_ehsize; /* 52 2 */
Elf64_Half e_phentsize; /* 54 2 */
Elf64_Half e_phnum; /* 56 2 */
Elf64_Half e_shentsize; /* 58 2 */
Elf64_Half e_shnum; /* 60 2 */
Elf64_Half e_shstrndx; /* 62 2 */
/* size: 64, cachelines: 1, members: 14 */
};
$
An ELF header
$ pahole --count 1 elf64_hdr < /bin/bash
{
.e_ident = { 127, 69, 76, 70, 2, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
.e_type = 3,
.e_machine = 62,
.e_version = 1,
.e_entry = 199248,
.e_phoff = 64,
.e_shoff = 1342344,
.e_flags = 0,
.e_ehsize = 64,
.e_phentsize = 56,
.e_phnum = 13,
.e_shentsize = 64,
.e_shnum = 31,
.e_shstrndx = 30,
},
Another ELF header
$ pahole --count 1 elf64_hdr < /bin/cp
{
.e_ident = { 127, 69, 76, 70, 2, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
.e_type = 3,
.e_machine = 62,
.e_version = 1,
.e_entry = 23792,
.e_phoff = 64,
.e_shoff = 147760,
.e_flags = 0,
.e_ehsize = 64,
.e_phentsize = 56,
.e_phnum = 13,
.e_shentsize = 64,
.e_shnum = 31,
.e_shstrndx = 30,
},
$
Case closed, huh?
- No
- Let's add some more features...
--header
$ pahole --header elf64_hdr < /lib64/libc-2.32.so
{
.e_ident = { 127, 69, 76, 70, 2, 1, 1, 3, 0, 0, 0, 0, 0, 0, 0, 0 },
.e_type = 3,
.e_machine = 62,
.e_version = 1,
.e_entry = 164640,
.e_phoff = 64,
.e_shoff = 3217696,
.e_flags = 0,
.e_ehsize = 64,
.e_phentsize = 56,
.e_phnum = 14,
.e_shentsize = 64,
.e_shnum = 68,
.e_shstrndx = 67,
},
Header variables
- Fields in the header type
- Can be later referenced
- To decode ranges in a file
perf.data header
$ pahole --hex ~/bin/perf --header=perf_file_header < perf.data
{
.magic = 0x32454c4946524550,
.size = 0x68,
.attr_size = 0x88,
.attrs = {
.offset = 0x128,
.size = 0x88,
},
.data = {
.offset = 0x1b0,
.size = 0x3f0,
},
.adds_features = { 0x16717ffc, 0, 0, 0 },
},
$
perf event header: kernel ABI
$ pahole perf_event_header
struct perf_event_header {
__u32 type; /* 0 4 */
__u16 misc; /* 4 2 */
__u16 size; /* 6 2 */
/* size: 8, cachelines: 1, members: 3 */
/* last cacheline: 8 bytes */
};
$
perf event header: in perf tool
$ pahole -C perf_event_header ~/bin/perf
struct perf_event_header {
__u32 type; /* 0 4 */
__u16 misc; /* 4 2 */
__u16 size; /* 6 2 */
/* size: 8, cachelines: 1, members: 3 */
/* last cacheline: 8 bytes */
};
$
Variable sized record
- Well known member names
- type, size
Unpolished records
$ pahole --hex ~/bin/perf --seek_bytes=0x1b0 \
--size_bytes=0x3f0 \
--count 4 \
-C 'perf_event_header(sizeof=size)' < perf.data
{
.type = 0x4f,
.misc = 0,
.size = 0x38,
},
{
.type = 0x49,
.misc = 0,
.size = 0x28,
},
{
.type = 0x4a,
.misc = 0,
.size = 0x20,
},
{
.type = 0x3,
.misc = 0,
.size = 0x28,
},
$
Using header variables
$ pahole --hex ~/bin/perf --header=perf_file_header \
--seek_bytes='$header.data.offset' \
--size_bytes='$header.data.size' \
--count 4 \
-C 'perf_event_header(sizeof=size)' < perf.data
{
.type = 0x4f,
.misc = 0,
.size = 0x38,
},
{
.type = 0x49,
.misc = 0,
.size = 0x28,
},
{
.type = 0x4a,
.misc = 0,
.size = 0x20,
},
{
.type = 0x3,
.misc = 0,
.size = 0x28,
},
$
Enumerations
$ enum --hex perf_event_type
enum perf_event_type {
PERF_RECORD_MMAP = 0x1,
PERF_RECORD_LOST = 0x2,
PERF_RECORD_COMM = 0x3,
PERF_RECORD_EXIT = 0x4,
PERF_RECORD_THROTTLE = 0x5,
PERF_RECORD_UNTHROTTLE = 0x6,
PERF_RECORD_FORK = 0x7,
PERF_RECORD_READ = 0x8,
PERF_RECORD_SAMPLE = 0x9,
PERF_RECORD_MMAP2 = 0xa,
PERF_RECORD_AUX = 0xb,
PERF_RECORD_ITRACE_START = 0xc,
PERF_RECORD_LOST_SAMPLES = 0xd,
PERF_RECORD_SWITCH = 0xe,
PERF_RECORD_SWITCH_CPU_WIDE = 0xf,
PERF_RECORD_NAMESPACES = 0x10,
PERF_RECORD_KSYMBOL = 0x11,
PERF_RECORD_BPF_EVENT = 0x12,
PERF_RECORD_CGROUP = 0x13,
PERF_RECORD_TEXT_POKE = 0x14,
PERF_RECORD_MAX = 0x15,
};
$
Some type enumeration mapped
$ pahole --seek_bytes=0x1b0 --hex ~/bin/perf \
--size_bytes=0x3f0 --skip 1 --count 3 \
-C 'perf_event_header(sizeof,type,type_enum=perf_event_type)' < perf.data
{
.type = 0x49,
.misc = 0,
.size = 0x28,
},
{
.type = 0x4a,
.misc = 0,
.size = 0x20,
},
{
.header = {
.type = PERF_RECORD_COMM,
.misc = 0,
.size = 0x28,
},
.pid = 0x4edf,
.tid = 0x4edf,
.comm = "perf",
},
$
The record types
$ pahole ~/bin/perf -C perf_event
union perf_event {
struct perf_event_header header; /* 0 8 */
struct perf_record_mmap mmap; /* 0 4136 */
struct perf_record_mmap2 mmap2; /* 0 4168 */
struct perf_record_comm comm; /* 0 32 */
struct perf_record_namespaces namespaces; /* 0 24 */
struct perf_record_cgroup cgroup; /* 0 4112 */
struct perf_record_fork fork; /* 0 32 */
struct perf_record_lost lost; /* 0 24 */
struct perf_record_lost_samples lost_samples; /* 0 16 */
struct perf_record_read read; /* 0 48 */
struct perf_record_throttle throttle; /* 0 32 */
struct perf_record_sample sample; /* 0 8 */
struct perf_record_bpf_event bpf; /* 0 24 */
struct perf_record_ksymbol ksymbol; /* 0 280 */
struct perf_record_text_poke_event text_poke; /* 0 24 */
struct perf_record_header_attr attr; /* 0 128 */
<SNIP>
struct perf_record_time_conv time_conv; /* 0 56 */
struct perf_record_header_feature feat; /* 0 16 */
struct perf_record_compressed pack; /* 0 8 */
};
$
What to 'cast' for
$ pahole ~/bin/perf -C perf_record_comm
struct perf_record_comm {
struct perf_event_header header; /* 0 8 */
__u32 pid; /* 8 4 */
__u32 tid; /* 12 4 */
char comm[16]; /* 16 16 */
/* size: 32, cachelines: 1, members: 4 */
/* last cacheline: 32 bytes */
};
$
Lots more, but we end with:
$ pahole ~/bin/perf --header=perf_file_header \
-C 'perf_file_attr(range=attrs),
perf_event_header(range=data,sizeof,type,
type_enum=perf_event_type+perf_user_event_type)' < perf.data
Goals
- Use it to document a file format
- While providing a full pretty printer
- perf report -D
- New records get automagically supported
Future
- Experiment more
- Finish perf.data dissector
- Features not specific to it
- Integrate with perf's libbeauty
- Maps integers in kernel ABIs to strings
- Add some of these features to other tools
- gdb?
- crash?
- Fix bugs found making this presentation :-)
- Presentations at: http://vger.kernel.org/~acme/bpf/