|
acme@redhat.com |
|---|
Date: Mon Nov 15 11:02:37 2021 -0800
From: Eric Dumazet
Move sk_bind_phc next to sk_peer_lock to fill a hole.
@@ -489,5 +489,6 @@ struct sock { u16 sk_busy_poll_budget; #endif spinlock_t sk_peer_lock; + int sk_bind_phc; struct pid *sk_peer_pid; const struct cred *sk_peer_cred; @@ -498,5 +499,4 @@ struct sock { seqlock_t sk_stamp_seq; #endif u16 sk_tsflags; - int sk_bind_phc; u8 sk_shutdown;
$ git log -2 --oneline 1ace2b4d2b4e1db8f 1ace2b4d2b4e1db8 net: shrink struct sock by 8 bytes 1b31debca8328448 ipv6: shrink struct ipcm6_cookie $
$ pahole -C list_head ~/git/build/v5.18-rc6+/vmlinux
struct list_head {
struct list_head * next; /* 0 8 */
struct list_head * prev; /* 8 8 */
/* size: 16, cachelines: 1, members: 2 */
/* last cacheline: 16 bytes */
};
$
# pahole -C _IO_FILE ~/bin/perf | head
struct _IO_FILE {
int _flags; /* 0 4 */
/* XXX 4 bytes hole, try to pack */
char * _IO_read_ptr; /* 8 8 */
char * _IO_read_end; /* 16 8 */
char * _IO_read_base; /* 24 8 */
char * _IO_write_base; /* 32 8 */
char * _IO_write_ptr; /* 40 8 */
#
$ pahole --reorganize task_struct | tail
/* --- cacheline 142 boundary (9088 bytes) was 56 bytes ago --- */
struct thread_struct thread __attribute__((__aligned__(64))); /* 9144 4416 */
/* size: 13560, cachelines: 212, members: 252 */
/* sum members: 13487, holes: 2, sum holes: 57 */
/* sum bitfield members: 79 bits, bit holes: 2, sum bit holes: 49 bits */
/* paddings: 6, sum paddings: 49 */
/* forced alignments: 2, forced holes: 1, sum forced holes: 56 */
/* last cacheline: 56 bytes */
}; /* saved 136 bytes and 2 cachelines! */
$
$ pahole spinlock_t typedef struct spinlock spinlock_t; $
$ pahole spinlock
struct spinlock {
union {
struct raw_spinlock rlock; /* 0 4 */
}; /* 0 4 */
/* size: 4, cachelines: 1, members: 1 */
/* last cacheline: 4 bytes */
};
$
$ pahole -E spinlock
struct spinlock {
union {
struct raw_spinlock {
/* typedef arch_spinlock_t */ struct qspinlock {
union {
/* typedef atomic_t */ struct {
int counter; /* 0 4 */
} val; /* 0 4 */
struct {
/* typedef u8 -> __u8 */ unsigned char locked; /* 0 1 */
/* typedef u8 -> __u8 */ unsigned char pending; /* 1 1 */
}; /* 0 2 */
struct {
/* typedef u16 -> __u16 */ short unsigned int locked_pending; /* 0 2 */
/* typedef u16 -> __u16 */ short unsigned int tail; /* 2 2 */
}; /* 0 4 */
}; /* 0 4 */
} raw_lock; /* 0 4 */
} rlock; /* 0 4 */
}; /* 0 4 */
/* size: 4, cachelines: 1, members: 1 */
/* last cacheline: 4 bytes */
};
$
bpftool btf dump file /sys/kernel/btf/vmlinux format c
Author: Namhyung Kim
Date: Wed May 18 15:47:23 2022 -0700
perf record: Handle argument change in sched_switch
Recently sched_switch tracepoint added a new argument for prev_state, but it's hard to handle the change in a BPF program. Instead, we can check the function prototype in BTF before loading the program.
static void check_sched_switch_args(void)
{
struct btf *btf = bpf_object__btf(skel->obj);
struct btf_type *t1, *t2, *t3;
u32 type_id = btf__find_by_name_kind(btf, "bpf_trace_sched_switch",
BTF_KIND_TYPEDEF);
t1 = btf__type_by_id(btf, type_id);
t2 = btf__type_by_id(btf, t1->type);
if (t3 && btf_is_func_proto(t3) && btf_vlen(t3) == 4) {
// new format: pass prev_state as 4th arg
skel->rodata->has_prev_state = true;
}
}
+++ b/tools/perf/util/bpf_skel/off_cpu.bpf.c
+const volatile bool has_prev_state = false;
+SEC("tp_btf/sched_switch")
+int on_switch(u64 *ctx) {
+ struct task_struct *prev, *next;
+ int prev_state;
+
+ if (!enabled) return 0;
+ prev = (struct task_struct *)ctx[1];
+ next = (struct task_struct *)ctx[2];
+
+ if (has_prev_state)
+ prev_state = (int)ctx[3];
+ else
+ prev_state = get_task_state(prev);
+ return off_cpu_stat(ctx, prev, next, prev_state);
+}
# perf record --off-cpu ^C[ perf record: Woken up 1924 times to write data ] [ perf record: Captured and wrote 483.936 MB perf.data (8857075 samples) ] # ls -la perf.data -rw-------. 1 root root 507845510 May 31 00:45 perf.data #
# bpftool prog | grep on_switch -A4 634: tracing name on_switch tag 3d6d5a513a933c28 gpl loaded_at 2022-05-30T22:37:17+0200 uid 0 xlated 1392B jited 913B memlock 4096B map_ids 497,498,493,494,495,490,491,492 btf_id 602 pids perf(393176) #
# bpftool prog dump jited id 634 int on_switch(u64 * ctx): bpf_prog_3d6d5a513a933c28_on_switch: ; int on_switch(u64 *ctx) 0: nopl 0x0(%rax,%rax,1) 5: xchg %ax,%ax 7: push %rbp 8: mov %rsp,%rbp b: sub $0x38,%rsp 12: push %rbx 13: push %r13 15: push %r14 17: push %r15 19: mov %rdi,%r15 ; if (!enabled) 1c: movabs $0xffffb53a400d2000,%rdi 26: mov 0x0(%rdi),%edi ; if (!enabled) 29: test %rdi,%rdi 2c: je 0x0000000000000386
; next = (struct task_struct *)ctx[2]; 32: mov 0x10(%r15),%r14 ; prev = (struct task_struct *)ctx[1]; 36: mov 0x8(%r15),%rbx ; if (has_prev_state) 3a: movabs $0xffffb53a400f6000,%rdi 44: movzbq 0x0(%rdi),%rdi ; prev_state = (int)ctx[3]; 49: mov $0x1,%edi ; if (bpf_core_field_exists(t->__state)) 4e: mov $0x18,%edi 53: mov %rbx,%rdx 56: add %rdi,%rdx 59: mov %rbp,%rdi ; 5c: add $0xffffffffffffffd8,%rdi ; return BPF_CORE_READ(t, __state); 60: mov $0x4,%esi 65: callq 0xffffffffd5f13f50 ; return BPF_CORE_READ(t, __state); 6a: mov -0x28(%rbp),%r13d
# bpftool map | grep off_cpu -A3 490: array name off_cpu_.rodata flags 0x480 key 4B value 3B max_entries 1 memlock 4096B btf_id 628 frozen pids perf(393176) #
# bpftool map dump id 490
[{
"value": {
".rodata": [{
"has_prev_state": false
},{
"needs_cgroup": false
},{
"uses_cgroup_v1": false
}
]
}
}
#
# perf report --stdio --call-graph=no
# Childr Self Command Shared Object Symbol
# ...... ...... ............... .................. .........................
81.66% 0.00% sched-messaging libc-2.33.so [.] __libc_start_main
81.66% 0.00% sched-messaging perf [.] cmd_bench
81.66% 0.00% sched-messaging perf [.] main
81.66% 0.00% sched-messaging perf [.] run_builtin
81.43% 0.00% sched-messaging perf [.] bench_sched_messaging
40.86% 40.86% sched-messaging libpthread-2.33.so [.] __read
37.66% 37.66% sched-messaging libpthread-2.33.so [.] __write
2.91% 2.91% sched-messaging libc-2.33.so [.] __poll
...
As you can see it spent most of off-cpu time in read and write in
bench_sched_messaging(). The --call-graph=no was added just to make
the output concise here.
LD [M] drivers/media/usb/gspca/gspca_zc3xx.o AR drivers/media/built-in.a AR drivers/built-in.a GEN .version CHK include/generated/compile.h LD vmlinux.o MODPOST vmlinux.symvers MODINFO modules.builtin.modinfo GEN modules.builtin CC .vmlinux.export.o LD .tmp_vmlinux.btf BTF .btf.vmlinux.bin.o LD .tmp_vmlinux.kallsyms1 KSYMS .tmp_vmlinux.kallsyms1.S AS .tmp_vmlinux.kallsyms1.S LD .tmp_vmlinux.kallsyms2 KSYMS .tmp_vmlinux.kallsyms2.S AS .tmp_vmlinux.kallsyms2.S LD vmlinux
LD [M] arch/x86/kvm/kvm-amd.ko BTF [M] arch/x86/kernel/cpu/mce/mce-inject.ko BTF [M] arch/x86/events/rapl.ko LD [M] arch/x86/kvm/kvm-intel.ko LD [M] arch/x86/kvm/kvm.ko BTF [M] arch/x86/kvm/kvm-amd.ko LD [M] crypto/adiantum.ko BTF [M] crypto/adiantum.ko BTF [M] arch/x86/kvm/kvm-intel.ko LD [M] crypto/aegis128.ko BTF [M] crypto/aegis128.ko LD [M] crypto/aes_ti.ko BTF [M] arch/x86/kvm/kvm.ko
$ cat scripts/pahole-flags.sh
#!/bin/sh
# Print the extra pahole options that match the pahole version held in
# pahole_ver (an integer such as 121 for 1.21).  Options are appended to
# whatever extra_paholeopt already contains.

# pahole 1.18 through 1.21 can't handle zero-sized per-CPU vars
if test "${pahole_ver}" -ge "118"; then
	if test "${pahole_ver}" -le "121"; then
		extra_paholeopt="${extra_paholeopt} --skip_encoding_btf_vars"
	fi
fi

# Version-gated additions: --btf_gen_floats from 121 on, -j from 122 on.
test "${pahole_ver}" -ge "121" && extra_paholeopt="${extra_paholeopt} --btf_gen_floats"
test "${pahole_ver}" -ge "122" && extra_paholeopt="${extra_paholeopt} -j"

echo ${extra_paholeopt}
$ scripts/pahole-flags.sh
--btf_gen_floats -j
$
$ pahole --version
v1.23
$
/*
 * Table of lowercase language names, indexed by the matching DW_LANG_*
 * constant through designated initializers.
 * NOTE(review): the SNIP lines presumably stand for entries left out of this
 * excerpt -- they are not part of the real table.
 */
static const char *languages[] = {
[DW_LANG_Ada83] = "ada83",
SNIP
[DW_LANG_C11] = "c11",
[DW_LANG_C89] = "c89",
[DW_LANG_C99] = "c99",
[DW_LANG_C] = "c",
[DW_LANG_Cobol74] = "cobol74",
SNIP
[DW_LANG_C_plus_plus_14] = "c++14",
[DW_LANG_C_plus_plus] = "c++",
[DW_LANG_D] = "d",
[DW_LANG_Dylan] = "dylan",
[DW_LANG_Fortran03] = "fortran03",
SNIP
[DW_LANG_PLI] = "pli",
[DW_LANG_Python] = "python",
[DW_LANG_RenderScript] = "renderscript",
[DW_LANG_Rust] = "rust",
};
/* Describe the object to print: its address and the type id obtained for struct sk_buff. */
static struct btf_ptr b = { };
b.ptr = skb;
b.type_id = __builtin_btf_type_id(struct sk_buff, 1);
/*
 * bpf_snprintf_btf() takes five arguments: output buffer, its size, the
 * btf_ptr, the btf_ptr size, and the BTF_F_* flags word.  A flags value of 0
 * selects the default output shown below.
 */
bpf_snprintf_btf(str, sizeof(str), &b, sizeof(b), 0);
Default output looks like this:
(struct sk_buff){
.transport_header = (__u16)65535,
.mac_header = (__u16)65535,
.end = (sk_buff_data_t)192,
.head = (unsigned char *)0x000000007524fd8b,
.data = (unsigned char *)0x000000007524fd8b,
.truesize = (unsigned int)768,
.users = (refcount_t){
.refs = (atomic_t){
.counter = (int)1,
},
},
}
Flags modifying display are as follows:
- BTF_F_COMPACT: no formatting around type information
- BTF_F_NONAME: no struct/union member names/types
- BTF_F_PTR_RAW: show raw (unobfuscated) pointer values;
equivalent to %px.
- BTF_F_ZERO: show zero-valued struct/union members;
they are not displayed by default
$ pahole --prettify=- --header elf64_hdr < /bin/bash
{
.e_ident = { 127, 69, 76, 70, 2, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
.e_type = 3,
.e_machine = 62,
.e_version = 1,
.e_entry = 204224,
.e_phoff = 64,
.e_shoff = 1388016,
.e_flags = 0,
.e_ehsize = 64,
.e_phentsize = 56,
.e_phnum = 13,
.e_shentsize = 64,
.e_shnum = 32,
.e_shstrndx = 31,
},
$
$ pahole elf64_hdr
struct elf64_hdr {
unsigned char e_ident[16]; /* 0 16 */
Elf64_Half e_type; /* 16 2 */
Elf64_Half e_machine; /* 18 2 */
Elf64_Word e_version; /* 20 4 */
Elf64_Addr e_entry; /* 24 8 */
Elf64_Off e_phoff; /* 32 8 */
Elf64_Off e_shoff; /* 40 8 */
Elf64_Word e_flags; /* 48 4 */
Elf64_Half e_ehsize; /* 52 2 */
Elf64_Half e_phentsize; /* 54 2 */
Elf64_Half e_phnum; /* 56 2 */
Elf64_Half e_shentsize; /* 58 2 */
Elf64_Half e_shnum; /* 60 2 */
Elf64_Half e_shstrndx; /* 62 2 */
/* size: 64, cachelines: 1, members: 14 */
};
$