acme@kernel.org — Red Hat — Kernel Recipes, Paris, 2024
$ pahole -C _IO_FILE ~/bin/perf
struct _IO_FILE {
int _flags; /* 0 4 */
/* XXX 4 bytes hole, try to pack */
char * _IO_read_ptr; /* 8 8 */
char * _IO_read_end; /* 16 8 */
SNIP
/* --- cacheline 2 boundary (128 bytes) --- */
short unsigned int _cur_column; /* 128 2 */
signed char _vtable_offset; /* 130 1 */
char _shortbuf[1]; /* 131 1 */
/* XXX 4 bytes hole, try to pack */
_IO_lock_t * _lock; /* 136 8 */
__off64_t _offset; /* 144 8 */
SNIP
/* --- cacheline 3 boundary (192 bytes) --- */
int _mode; /* 192 4 */
char _unused2[20]; /* 196 20 */
/* size: 216, cachelines: 4, members: 29 */
/* sum members: 208, holes: 2, sum holes: 8 */
/* last cacheline: 24 bytes */
};
$
$ pahole task_struct | tail
	/* XXX last struct has 1 hole, 1 bit hole */

	/* size: 13696, cachelines: 214, members: 269 */
	/* sum members: 13579, holes: 23, sum holes: 101 */
	/* sum bitfield members: 83 bits, bit holes: 2, sum bit holes: 45 bits */
	/* member types with holes: 4, total: 6, bit holes: 2, total: 2 */
	/* paddings: 6, sum paddings: 49 */
	/* forced alignments: 2, forced holes: 2, sum forced holes: 24 */
};
$
commit 99123622050f10ca9148a0fffba2de0afd6cdfff
Author: Eric Dumazet
Date:   Tue Feb 27 19:27:21 2024 +0000

    tcp: remove some holes in struct tcp_sock

    By moving some fields around, this patch shrinks holes size
    from 56 to 32, saving 24 bytes on 64bit arches.

    After the patch pahole gives the following for 'struct tcp_sock':

    /* size: 2304, cachelines: 36, members: 162 */
    /* sum members: 2234, holes: 6, sum holes: 32 */
    /* sum bitfield members: 34 bits, bit holes: 5, sum bit holes: 14 bits */
    /* padding: 32 */
    /* paddings: 3, sum paddings: 10 */
    /* forced alignments: 1, forced holes: 1, sum forced holes: 12 */
$ pahole tcp_sock | grep cacheline_group
	__u8 __cacheline_group_begin__tcp_sock_read_tx[0];    /* 1384 0 */
	__u8 __cacheline_group_end__tcp_sock_read_tx[0];      /* 1424 0 */
	__u8 __cacheline_group_begin__tcp_sock_read_txrx[0];  /* 1424 0 */
	__u8 __cacheline_group_end__tcp_sock_read_txrx[0];    /* 1456 0 */
	__u8 __cacheline_group_begin__tcp_sock_read_rx[0];    /* 1456 0 */
	__u8 __cacheline_group_end__tcp_sock_read_rx[0];      /* 1525 0 */
	__u8 __cacheline_group_begin__tcp_sock_write_tx[0];   /* 1536 0 */
	__u8 __cacheline_group_end__tcp_sock_write_tx[0];     /* 1625 0 */
	__u8 __cacheline_group_begin__tcp_sock_write_txrx[0]; /* 1625 0 */
	__u8 __cacheline_group_end__tcp_sock_write_txrx[0];   /* 1717 0 */
	__u8 __cacheline_group_begin__tcp_sock_write_rx[0];   /* 1720 0 */
	__u8 __cacheline_group_end__tcp_sock_write_rx[0];     /* 1816 0 */
$
static void __init tcp_struct_check(void)
{
/* TX read-mostly hotpath cache lines */
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, max_window);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, rcv_ssthresh);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, reordering);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, notsent_lowat);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, gso_segs);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, lost_skb_hint);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, retransmit_skb_hint);
CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_read_tx, 40);
/*
 * Build-time assertion that MEMBER lies entirely inside the cacheline
 * group GROUP of TYPE, i.e. between the zero-sized
 * __cacheline_group_begin__GROUP and __cacheline_group_end__GROUP
 * marker members (see include/linux/cache.h).
 *
 * Note: the comparison must be '>=' (a member may start exactly at the
 * group's begin marker, which has zero size) and the conjunction must be
 * the logical '&&', matching the kernel's definition; the slide's
 * transcription had dropped one character from each.
 */
#define CACHELINE_ASSERT_GROUP_MEMBER(TYPE, GROUP, MEMBER) \
	BUILD_BUG_ON(!(offsetof(TYPE, MEMBER) >= \
		       offsetofend(TYPE, __cacheline_group_begin__##GROUP) && \
		       offsetofend(TYPE, MEMBER) <= \
		       offsetof(TYPE, __cacheline_group_end__##GROUP)))
$ git grep __cacheline_group_begin | cut -d: -f1 | sort -u
drivers/net/ethernet/intel/idpf/idpf_txrx.h
include/linux/cache.h
include/linux/ipv6.h
include/linux/netdevice.h
include/linux/tcp.h
include/net/libeth/cache.h
include/net/netns/ipv4.h
include/net/page_pool/types.h
include/net/sock.h
scripts/kernel-doc
$
Tests were run on 6.5-rc1.

Efficiency is computed as cpu seconds / throughput (one tcp_rr round trip).
The following result shows the efficiency delta before and after the patch
series is applied.

On AMD platforms with 100Gb/s NIC and 256Mb L3 cache:

IPv4 Flows   with patches      clean kernel      Percent reduction
30k          0.0001736538065   0.0002741191042   -36.65%
20k          0.0001583661752   0.0002712559158   -41.62%
10k          0.0001639148817   0.0002951800751   -44.47%
5k           0.0001859683866   0.0003320642536   -44.00%
1k           0.0002035190546   0.0003152056382   -35.43%

https://lore.kernel.org/netdev/20231129072756.3684495-1-lixiaoyan@google.com/
On Intel platforms with 200Gb/s NIC and 105Mb L3 cache:

IPv6 Flows   with patches      clean kernel      Percent reduction
30k          0.0006296537873   0.0006370427753   -1.16%
20k          0.0003451029365   0.0003628016076   -4.88%
10k          0.0003187646958   0.0003346835645   -4.76%
5k           0.0002954676348   0.000311807592    -5.24%
1k           0.0001909169342   0.0001848069709   3.31%

https://lore.kernel.org/netdev/20231129072756.3684495-1-lixiaoyan@google.com/
From: Coco Li <lixiaoyan@google.com>
Subject: [PATCH v8 0/5] Analyze and Reorganize core Networking Structs
to optimize cacheline consumption
Currently, variable-heavy structs in the networking stack are organized
chronologically, logically and sometimes by cacheline access.
This patch series attempts to reorganize the core networking stack
variables to minimize cacheline consumption during the phase of data
transfer. Specifically, we looked at the TCP/IP stack and the fast
path definition in TCP.
For documentation purposes, we also added new files for each core data
structure we considered, although not all ended up being modified due
to the amount of existing cacheline they span in the fast path. In
the documentation, we recorded all variables we identified on the
fast path and the reasons. We also hope that in the future when
variables are added/modified, the document can be referred to and
updated accordingly to reflect the latest variable organization.
# echo 1 > /proc/sys/vm/drop_caches
# perf mem record find / > /dev/null
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.061 MB perf.data (26 samples) ]
#
# taskset -c 0 \
perf record --weight --data \
--event '{cpu_core/mem-loads-aux/,cpu_core/mem-loads,ldlat=30/P}:S' \
--event cpu_core/mem-stores/ find / > /dev/null
[ perf record: Woken up 20 times to write data ]
[ perf record: Captured and wrote 5.138 MB perf.data (79628 samples) ]
#
# perf evlist
cpu_core/mem-loads-aux/
cpu_core/mem-loads,ldlat=30/P
cpu_core/mem-stores/
dummy:u
#
# perf mem report
# Total Lost Samples: 0
#
# Samples: 25K of event 'cpu_core/mem-loads-aux/'
# Total weight : 1123282
# Sort order : local_weight,mem,sym,dso,symbol_daddr,dso_daddr,snoop,tlb,locked,blocked,local_ins_lat,local_p_stage_cyc
#
# Overhead Samples LocalWeight Mem access Symbol Shared Obj Data Symbol
# ........ ....... ........... ........... .......................... .......... ......................
#
0.50% 1 5635 RAM hit [k] btrfs_bin_search [kernel] [k] 0xffff90b3b9fe0a31
0.22% 1 2504 RAM hit [k] rb_next [kernel] [k] 0xffff90af31bfcda8
0.13% 1 1472 LFB/MAB hit [k] mutex_lock [kernel] [k] 0xffff90adca8c1d18
0.13% 1 1432 LFB/MAB hit [k] btrfs_get_delayed_node [kernel] [k] 0xffff90b4c9a17158
0.12% 1 1376 LFB/MAB hit [k] generic_fillattr [kernel] [k] 0xffff90b422422032
SNIP
0.02% 1 220 L3 hit [k] ktime_get_update_offsets_now [kernel] [k] tk_core+0xc0
SNIP
0.02% 1 216 LFB/MAB hit [k] update_vsyscall [kernel] [k] shadow_timekeeper+0x40
SNIP
0.02% 1 208 LFB/MAB hit [k] _raw_spin_lock [kernel] [k] jiffies_lock+0x0
# perf report --stdio --mem-mode --sort mem
# Samples: 26K of event 'cpu_core/mem-loads,ldlat=30/P'
# Total weight : 1135614
# Sort order : mem
#
# Overhead Memory access
# ........ .............
#
62.32% LFB/MAB hit
24.22% RAM hit
10.28% L1 hit
2.40% L3 hit
0.78% L2 hit
# perf report --dso '[kernel.kallsyms]' --stdio \
--mem-mode --sort sym,ins_lat
# Overhead Symbol INSTR Latency
# ........ ............................ ...................
#
0.50% [k] btrfs_bin_search 5637
0.22% [k] rb_next 2507
0.18% [k] folio_mark_accessed 419
0.18% [k] __d_lookup 405
0.17% [k] __d_lookup_rcu 389
0.14% [k] down_read 41
0.14% [k] __d_lookup_rcu 390
0.13% [k] mutex_lock 1475
0.13% [k] mutex_lock 487
0.13% [k] btrfs_get_delayed_node 1441
0.12% [k] generic_fillattr 703
0.12% [k] generic_fillattr 1378
0.12% [k] folio_mark_accessed 1371
0.12% [k] _raw_spin_lock 33
0.12% [k] btrfs_get_delayed_node 444
0.11% [k] dcache_readdir 1283
0.11% [k] __d_lookup_rcu 431
0.11% [k] folio_mark_accessed 640
#
# perf report --stdio -s type -i perf.data.mem.find
# Total Lost Samples: 0
#
# Samples: 25K of event 'cpu_core/mem-loads-aux/'
# Event count (approx.): 170070020
#
# Overhead Data Type
# ........ .........
18.34% (stack operation)
15.35% struct btrfs_key
10.83% struct
9.13% (unknown)
8.14% int
7.75% unsigned int
3.69% long long unsigned int
3.02% (stack canary)
2.62% struct _ftsent
2.61% struct extent_buffer
2.50% struct extent_buffer*
2.46% struct __va_list_tag
2.15% struct inode
2.12% long unsigned int
1.03% struct btrfs_delayed_node
0.86% struct nameidata
0.82% struct dentry
0.62% struct mnt_idmap*
0.57% struct malloc_chunk
0.54% struct av_decision
0.41% struct btrfs_path
0.36% struct av_decision*
0.34% unsigned char
0.32% struct hlist_bl_head
# perf report --stdio -s type,sym -i perf.data.mem.find
# Total Lost Samples: 0
#
# Samples: 25K of event 'cpu_core/mem-loads-aux/'
# Event count (approx.): 170070020
#
# Overhead Data Type Symbol
# ........ ......... ...............................
12.56% struct btrfs_key [k] btrfs_real_readdir
7.40% int [.] __GI___readdir64
5.98% unsigned int [k] _raw_spin_lock
4.75% (stack operation) [k] locks_remove_posix
3.24% (stack operation) [k] btrfs_verify_level_key
2.77% (stack operation) [k] check_buffer_tree_ref
2.76% struct [k] up_read
2.47% struct extent_buffer* [k] btrfs_search_slot
2.46% struct __va_list_tag [.] __printf_buffer
2.42% struct btrfs_key [k] btrfs_comp_cpu_keys
2.07% struct [k] down_read
1.81% struct extent_buffer [k] release_extent_buffer
1.59% (unknown) [k] memcpy
1.56% struct [k] check_buffer_tree_ref
1.24% (unknown) [k] __srcu_read_unlock
1.16% struct inode [k] generic_fillattr
1.14% unsigned int [k] find_extent_buffer_nolock
1.14% (stack canary) [k] locks_remove_posix
1.04% struct [k] __fput_sync
1.01% struct _ftsent [.] fts_compare_ino.lto_priv.0
0.97% long long unsigned int [k] mutex_lock
0.93% struct _ftsent [.] consider_visiting
0.89% (stack canary) [k] fsnotify
0.86% (stack operation) [k] read_extent_buffer
0.83% (unknown) [k] __srcu_read_lock
0.83% (stack operation) [k] __btrfs_tree_read_lock
0.81% long long unsigned int [k] lockref_put_return
0.79% (unknown) [.] __memmove_avx_unaligned_erms
0.76% (stack canary) [k] btrfs_verify_level_key
# perf report -s type,typeoff --hierarchy --stdio -i perf.data.mem.find
#
# Overhead Data Type / Data Type Offset
SNIP
2.15% struct inode
0.26% struct inode +40 (i_sb)
0.21% struct inode +356 (i_readcount.counter)
0.15% struct inode +56 (i_security)
0.15% struct inode +13 (i_flags)
0.12% struct inode +8 (i_gid.val)
0.12% struct inode +360 (i_fop)
0.11% struct inode +4 (i_uid.val)
0.10% struct inode +72 (i_nlink)
0.09% struct inode +88 (__i_atime.tv_sec)
0.09% struct inode +32 (i_op)
0.09% struct inode +0 (i_mode)
0.09% struct inode +64 (i_ino)
0.08% struct inode +12 (i_flags)
0.07% struct inode +112 (__i_mtime.tv_nsec)
0.07% struct inode +144 (i_blocks)
0.06% struct inode +96 (__i_atime.tv_nsec)
0.05% struct inode +80 (i_size)
0.05% struct inode +76 (i_rdev)
0.05% struct inode +128 (__i_ctime.tv_nsec)
0.04% struct inode +120 (__i_ctime.tv_sec)
0.04% struct inode +140 (i_bytes)
0.04% struct inode +104 (__i_mtime.tv_sec)
0.03% struct inode +142 (i_blkbits)
SNIP
# perf report -s type,typeoff,sym --hierarchy --stdio -i perf.data.mem.find
SNIP
15.35% struct btrfs_key
7.05% struct btrfs_key +0 (objectid)
6.04% [k] btrfs_real_readdir
0.76% [k] btrfs_comp_cpu_keys
0.26% [k] btrfs_bin_search
4.27% struct btrfs_key +9 (offset)
3.31% [k] btrfs_real_readdir
0.94% [k] btrfs_comp_cpu_keys
0.02% [k] btrfs_bin_search
4.03% struct btrfs_key +8 (type)
3.21% [k] btrfs_real_readdir
0.73% [k] btrfs_comp_cpu_keys
0.09% [k] btrfs_bin_search
SNIP
$ perf mem report -T
...
#
# Overhead Samples Memory access Snoop TLB access Data Type
# ........ ....... ............. ..... ............ .........
#
14.84% 22 L1 hit None L1 or L2 hit (unknown)
7.68% 8 LFB/MAB hit None L1 or L2 hit (unknown)
7.17% 3 RAM hit Hit L2 miss (unknown)
6.29% 12 L1 hit None L1 or L2 hit (stack operation)
4.85% 5 RAM hit Hit L1 or L2 hit (unknown)
3.97% 5 LFB/MAB hit None L1 or L2 hit struct psi_group_cpu
3.18% 3 LFB/MAB hit None L1 or L2 hit (stack operation)
2.58% 3 L1 hit None L1 or L2 hit unsigned int
2.36% 2 L1 hit None L1 or L2 hit struct
2.31% 2 L1 hit None L1 or L2 hit struct psi_group_cpu
...
# perf annotate --stdio --data-type
Annotate type: 'struct btrfs_key' in [kernel.kallsyms] (6282 samples):
event[0] = cpu_core/mem-loads-aux/
event[1] = cpu_core/mem-loads,ldlat=30/P
=========================================================
Percent offset size field
100.00 100.00 0 17 struct btrfs_key {
45.93 45.90 0 8 __u64 objectid;
26.26 26.52 8 1 __u8 type;
27.80 27.58 9 8 __u64 offset;
};
root@number:~# strace -e openat pahole btrfs_key |& tail -11
openat(AT_FDCWD, "/sys/kernel/btf/vmlinux", O_RDONLY) = 3
struct btrfs_key {
__u64 objectid; /* 0 8 */
__u8 type; /* 8 1 */
__u64 offset; /* 9 8 */
/* size: 17, cachelines: 1, members: 3 */
/* last cacheline: 17 bytes */
} __attribute__((__packed__));
+++ exited with 0 +++
root@number:~#
# perf --debug type-profile annotate --data-type
find data type for 0x6(reg7) at intel_pmu_handle_irq+0x53
CU for arch/x86/events/intel/core.c (die:0x1b1f23)
frame base: cfa=1 fbreg=7
found "late_ack" in scope=1/1 (die: 0x1da6df) stack_offset=0x60 type_offset=0
variable location: use frame base, offset=0xffffffffffffffa6
type='_Bool' size=0x1 (die:0x1b21d4)
static int intel_pmu_handle_irq(struct pt_regs *regs)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
bool late_ack = hybrid_bit(cpuc->pmu, late_ack);
bool mid_ack = hybrid_bit(cpuc->pmu, mid_ack);
int loops;
find data type for 0(reg1, reg0) at arch_asym_cpu_priority+0x1b
CU for arch/x86/kernel/itmt.c (die:0xed3cc9)
frame base: cfa=1 fbreg=7
scope: [1/1] (die:ed5101)
bb: [0 - 1b]
 var [0] reg5 type='int' size=0x4 (die:0xed3d3e)
 mov [9] reg5 -> reg5 type='int' size=0x4 (die:0xed3d3e)
 mov [c] imm=0x19a38 -> reg0
 mov [13] percpu base reg1
 chk [1b] reg1 offset=0 ok=0 kind=2 cfa
no variable found
/*
 * Return the scheduling priority recorded for @cpu by reading its slot in
 * the per-CPU variable sched_core_priority.  (Quoted kernel excerpt from
 * arch/x86/kernel/itmt.c; used here to illustrate per-CPU variable access
 * that perf's data-type profiler cannot resolve to a named variable.)
 */
int arch_asym_cpu_priority(int cpu)
{
return per_cpu(sched_core_priority, cpu);
}
$ perf annotate --data-type=bpf_map --stdio
Annotate type: 'struct bpf_map' in [kernel.kallsyms] (4 samples):
event[1] = cpu_core/mem-loads,ldlat=30/P
============================================================
Percent offset size field
100.00 0 256 struct bpf_map {
63.12 0 8 struct bpf_map_ops* ops;
0.00 8 8 struct bpf_map* inner_map_meta;
0.00 16 8 void* security;
0.00 24 4 enum bpf_map_type map_type;
36.88 28 4 u32 key_size;
0.00 32 4 u32 value_size;
0.00 36 4 u32 max_entries;
0.00 40 8 u64 map_extra;
0.00 48 4 u32 map_flags;
0.00 52 4 u32 id;
0.00 56 8 struct btf_record* record;
0.00 64 4 int numa_node;
0.00 68 4 u32 btf_key_type_id;
SNIP