Message ID: 20231102032330.1036151-7-chengming.zhou@linux.dev (mailing list archive)
State:      New
Series:     slub: Delay freezing of CPU partial slabs
Hello,

kernel test robot noticed a 34.2% improvement of stress-ng.rawudp.ops_per_sec on:

commit: b73583016198aecef1dea07033a808da7875ede1 ("[PATCH v5 6/9] slub: Delay freezing of partial slabs")
url: https://github.com/intel-lab-lkp/linux/commits/chengming-zhou-linux-dev/slub-Reflow-___slab_alloc/20231102-112748
base: git://git.kernel.org/cgit/linux/kernel/git/vbabka/slab.git for-next
patch link: https://lore.kernel.org/all/20231102032330.1036151-7-chengming.zhou@linux.dev/
patch subject: [PATCH v5 6/9] slub: Delay freezing of partial slabs

testcase: stress-ng
test machine: 64 threads 2 sockets Intel(R) Xeon(R) Gold 6346 CPU @ 3.10GHz (Ice Lake) with 256G memory
parameters:

	nr_threads: 100%
	testtime: 60s
	class: network
	test: rawudp
	cpufreq_governor: performance

Details are as below:
-------------------------------------------------------------------------------------------------->

The kernel config and materials to reproduce are available at:
https://download.01.org/0day-ci/archive/20231114/202311141204.e918dbda-oliver.sang@intel.com

=========================================================================================
class/compiler/cpufreq_governor/kconfig/nr_threads/rootfs/tbox_group/test/testcase/testtime:
  network/gcc-12/performance/x86_64-rhel-8.3/100%/debian-11.1-x86_64-20220510.cgz/lkp-icl-2sp8/rawudp/stress-ng/60s

commit:
  fae950af3a ("slub: Introduce freeze_slab()")
  b735830161 ("slub: Delay freezing of partial slabs")

fae950af3a484ec4  b73583016198aecef1dea07033a
----------------  ---------------------------
       %stddev      %change          %stddev

0.51 ± 15%  +0.1  0.66 ± 3%  mpstat.cpu.all.usr%
30758 ± 11%  +41.0%  43361 ± 4%  vmstat.system.cs
14033150  +13.1%  15877158  numa-numastat.node0.local_node
14069328  +13.1%  15912745  numa-numastat.node0.numa_hit
14246260  +11.6%  15894273  numa-numastat.node1.local_node
14274705  +11.6%  15927700  numa-numastat.node1.numa_hit
14069250  +13.1%  15912955  numa-vmstat.node0.numa_hit
14033072  +13.1%  15877368  numa-vmstat.node0.numa_local
14274619  +11.6%  15927488  numa-vmstat.node1.numa_hit
14246174  +11.6%  15894061  numa-vmstat.node1.numa_local
0.30 ± 19%  +38.5%  0.42 ± 7%  sched_debug.cfs_rq:/.nr_running.stddev
2434 ± 4%  -10.6%  2176 ± 6%  sched_debug.cpu.curr->pid.avg
1079 ± 20%  +36.7%  1474 ± 10%  sched_debug.cpu.curr->pid.stddev
18781 ± 2%  +28.4%  24113 ± 2%  sched_debug.cpu.nr_switches.avg
11540 ± 4%  +41.8%  16360 ± 5%  sched_debug.cpu.nr_switches.min
0.03  +33.3%  0.04  stress-ng.rawudp.MB_recv'd_per_sec
4064768  +34.2%  5456370  stress-ng.rawudp.ops
67734  +34.2%  90929  stress-ng.rawudp.ops_per_sec
757287 ± 2%  +31.6%  996382  stress-ng.time.involuntary_context_switches
1112941 ± 2%  +40.5%  1564161 ± 2%  stress-ng.time.voluntary_context_switches
28346406  +12.3%  31843356  proc-vmstat.numa_hit
28281784  +12.3%  31774341  proc-vmstat.numa_local
103553 ± 2%  +6.3%  110100 ± 3%  proc-vmstat.numa_pte_updates
128452  +10.1%  141426 ± 2%  proc-vmstat.pgactivate
70199954  +10.8%  77804409  proc-vmstat.pgalloc_normal
70032258  +10.8%  77627740  proc-vmstat.pgfree
81626657 ± 11%  +20.5%  98326255 ± 3%  perf-stat.i.branch-misses
8.365e+08 ± 11%  +16.8%  9.77e+08 ± 3%  perf-stat.i.cache-references
31476 ± 12%  +42.8%  44947 ± 4%  perf-stat.i.context-switches
2.46 ± 2%  -5.3%  2.33  perf-stat.i.cpi
7610 ± 11%  +51.7%  11547 ± 3%  perf-stat.i.cpu-migrations
1807 ± 6%  -35.8%  1160 ± 9%  perf-stat.i.metric.K/sec
67795798 ± 11%  +17.2%  79489592 ± 3%  perf-stat.i.node-load-misses
42.11 ± 5%  +2.2  44.32  perf-stat.i.node-store-miss-rate%
53640005 ± 11%  +16.1%  62254015 ± 3%  perf-stat.i.node-store-misses
4.42  +3.9%  4.59  perf-stat.overall.MPKI
0.53  +0.0  0.57  perf-stat.overall.branch-miss-rate%
2.46  -4.0%  2.36  perf-stat.overall.cpi
556.40  -7.6%  513.88  perf-stat.overall.cycles-between-cache-misses
0.41  +4.2%  0.42  perf-stat.overall.ipc
59.01  +0.9  59.88  perf-stat.overall.node-load-miss-rate%
80834242 ± 10%  +19.7%  96751983 ± 2%  perf-stat.ps.branch-misses
3.595e+08 ± 10%  +14.6%  4.121e+08 ± 3%  perf-stat.ps.cache-misses
8.308e+08 ± 10%  +16.1%  9.643e+08 ± 3%  perf-stat.ps.cache-references
31157 ± 10%  +42.0%  44245 ± 3%  perf-stat.ps.context-switches
7566 ± 10%  +50.5%  11389 ± 3%  perf-stat.ps.cpu-migrations
67342566 ± 10%  +16.5%  78472343 ± 2%  perf-stat.ps.node-load-misses
53274045 ± 10%  +15.4%  61453717 ± 2%  perf-stat.ps.node-store-misses
66741521 ± 10%  +13.4%  75684953 ± 2%  perf-stat.ps.node-stores
5.522e+12  +4.0%  5.742e+12  perf-stat.total.instructions
0.40 ± 5%  -28.4%  0.29 ± 8%  perf-sched.sch_delay.avg.ms.exit_to_user_mode_loop.exit_to_user_mode_prepare.syscall_exit_to_user_mode.do_syscall_64
0.58 ± 5%  -9.2%  0.53 ± 4%  perf-sched.sch_delay.avg.ms.schedule_timeout.__skb_wait_for_more_packets.__skb_recv_datagram.skb_recv_datagram
0.37 ± 11%  -38.4%  0.22 ± 16%  perf-sched.sch_delay.avg.ms.schedule_timeout.rcu_gp_fqs_loop.rcu_gp_kthread.kthread
128.50 ± 19%  -30.6%  89.17 ± 18%  perf-sched.sch_delay.max.ms.exit_to_user_mode_loop.exit_to_user_mode_prepare.syscall_exit_to_user_mode.do_syscall_64
0.52 ± 3%  -13.8%  0.45 ± 4%  perf-sched.total_sch_delay.average.ms
9.51 ± 5%  -24.7%  7.16 ± 5%  perf-sched.total_wait_and_delay.average.ms
150858 ± 3%  +33.8%  201844 ± 6%  perf-sched.total_wait_and_delay.count.ms
8.99 ± 5%  -25.3%  6.71 ± 5%  perf-sched.total_wait_time.average.ms
4.32 ± 4%  -34.8%  2.81 ± 7%  perf-sched.wait_and_delay.avg.ms.__cond_resched.dput.__fput.__x64_sys_close.do_syscall_64
3.22 ± 8%  -34.3%  2.12 ± 8%  perf-sched.wait_and_delay.avg.ms.__cond_resched.slab_pre_alloc_hook.constprop.0.kmem_cache_alloc_lru
47.40 ± 6%  -13.9%  40.79 ± 11%  perf-sched.wait_and_delay.avg.ms.__cond_resched.smpboot_thread_fn.kthread.ret_from_fork.ret_from_fork_asm
4.93 ± 4%  -29.4%  3.48 ± 7%  perf-sched.wait_and_delay.avg.ms.exit_to_user_mode_loop.exit_to_user_mode_prepare.syscall_exit_to_user_mode.do_syscall_64
2.14 ± 5%  -16.3%  1.79 ± 4%  perf-sched.wait_and_delay.avg.ms.schedule_timeout.__skb_wait_for_more_packets.__skb_recv_datagram.skb_recv_datagram
508.50 ± 7%  +39.9%  711.50 ± 9%  perf-sched.wait_and_delay.count.__cond_resched.aa_sk_perm.security_socket_recvmsg.sock_recvmsg.__sys_recvfrom
1533 ± 6%  +22.0%  1871 ± 6%  perf-sched.wait_and_delay.count.__cond_resched.dput.__fput.__x64_sys_close.do_syscall_64
1003 ± 4%  +28.4%  1288 ± 6%  perf-sched.wait_and_delay.count.__cond_resched.slab_pre_alloc_hook.constprop.0.kmem_cache_alloc_lru
48937 ± 3%  +28.2%  62737 ± 5%  perf-sched.wait_and_delay.count.exit_to_user_mode_loop.exit_to_user_mode_prepare.syscall_exit_to_user_mode.do_syscall_64
86063 ± 4%  +39.4%  120002 ± 7%  perf-sched.wait_and_delay.count.schedule_timeout.__skb_wait_for_more_packets.__skb_recv_datagram.skb_recv_datagram
6201 ± 3%  +14.1%  7077 ± 11%  perf-sched.wait_and_delay.count.smpboot_thread_fn.kthread.ret_from_fork.ret_from_fork_asm
85.45 ±120%  +384.6%  414.09 ± 82%  perf-sched.wait_and_delay.max.ms.__cond_resched.mutex_lock.perf_poll.do_poll.constprop
22.16 ±105%  +162.1%  58.07 ± 36%  perf-sched.wait_and_delay.max.ms.__cond_resched.stop_one_cpu.sched_exec.bprm_execve.part
1.37 ± 17%  -38.1%  0.85 ± 26%  perf-sched.wait_time.avg.ms.__cond_resched.__kmem_cache_alloc_node.kmalloc_trace.apparmor_sk_alloc_security.security_sk_alloc
2.32 ± 14%  -48.3%  1.20 ± 33%  perf-sched.wait_time.avg.ms.__cond_resched.aa_sk_perm.security_socket_sendmsg.sock_sendmsg.__sys_sendto
4.31 ± 4%  -34.9%  2.81 ± 7%  perf-sched.wait_time.avg.ms.__cond_resched.dput.__fput.__x64_sys_close.do_syscall_64
2.24 ± 11%  -36.7%  1.42 ± 24%  perf-sched.wait_time.avg.ms.__cond_resched.kmem_cache_alloc_node.__alloc_skb.alloc_skb_with_frags.sock_alloc_send_pskb
3.89 ± 41%  -57.2%  1.67 ± 36%  perf-sched.wait_time.avg.ms.__cond_resched.lock_sock_nested.raw_destroy.sk_common_release.inet_release
3.22 ± 8%  -34.2%  2.12 ± 8%  perf-sched.wait_time.avg.ms.__cond_resched.slab_pre_alloc_hook.constprop.0.kmem_cache_alloc_lru
47.20 ± 6%  -14.0%  40.58 ± 11%  perf-sched.wait_time.avg.ms.__cond_resched.smpboot_thread_fn.kthread.ret_from_fork.ret_from_fork_asm
4.52 ± 4%  -29.5%  3.19 ± 7%  perf-sched.wait_time.avg.ms.exit_to_user_mode_loop.exit_to_user_mode_prepare.syscall_exit_to_user_mode.do_syscall_64
5.84 ± 24%  -45.9%  3.16 ± 40%  perf-sched.wait_time.avg.ms.schedule_preempt_disabled.__mutex_lock.constprop.0.ip_ra_control
1.56 ± 5%  -19.0%  1.26 ± 4%  perf-sched.wait_time.avg.ms.schedule_timeout.__skb_wait_for_more_packets.__skb_recv_datagram.skb_recv_datagram
107.91 ± 77%  +281.4%  411.55 ± 82%  perf-sched.wait_time.max.ms.__cond_resched.mutex_lock.perf_poll.do_poll.constprop
7.19  -2.9  4.31 ± 4%  perf-profile.calltrace.cycles-pp.get_partial_node.___slab_alloc.kmem_cache_alloc.skb_clone.raw_v4_input
10.26  -2.8  7.48 ± 3%  perf-profile.calltrace.cycles-pp.___slab_alloc.kmem_cache_alloc.skb_clone.raw_v4_input.ip_protocol_deliver_rcu
12.04  -2.7  9.31 ± 2%  perf-profile.calltrace.cycles-pp.kmem_cache_alloc.skb_clone.raw_v4_input.ip_protocol_deliver_rcu.ip_local_deliver_finish
6.51  -2.7  3.79 ± 5%  perf-profile.calltrace.cycles-pp._raw_spin_lock_irqsave.get_partial_node.___slab_alloc.kmem_cache_alloc.skb_clone
12.55  -2.7  9.83 ± 2%  perf-profile.calltrace.cycles-pp.skb_clone.raw_v4_input.ip_protocol_deliver_rcu.ip_local_deliver_finish.__netif_receive_skb_one_core
6.37  -2.7  3.67 ± 5%  perf-profile.calltrace.cycles-pp.native_queued_spin_lock_slowpath._raw_spin_lock_irqsave.get_partial_node.___slab_alloc.kmem_cache_alloc
6.44 ± 2%  -2.1  4.34 ± 5%  perf-profile.calltrace.cycles-pp.__unfreeze_partials.raw_recvmsg.inet_recvmsg.sock_recvmsg.__sys_recvfrom
3.03 ± 4%  -2.0  0.99 ± 28%  perf-profile.calltrace.cycles-pp.__unfreeze_partials.inet_sock_destruct.__sk_destruct.rcu_do_batch.rcu_core
5.59 ± 2%  -1.9  3.67 ± 6%  perf-profile.calltrace.cycles-pp.native_queued_spin_lock_slowpath._raw_spin_lock_irqsave.__unfreeze_partials.raw_recvmsg.inet_recvmsg
2.81 ± 4%  -1.9  0.92 ± 29%  perf-profile.calltrace.cycles-pp._raw_spin_lock_irqsave.__unfreeze_partials.inet_sock_destruct.__sk_destruct.rcu_do_batch
2.69 ± 4%  -1.9  0.82 ± 30%  perf-profile.calltrace.cycles-pp.native_queued_spin_lock_slowpath._raw_spin_lock_irqsave.__unfreeze_partials.inet_sock_destruct.__sk_destruct
5.88 ± 2%  -1.8  4.07 ± 5%  perf-profile.calltrace.cycles-pp._raw_spin_lock_irqsave.__unfreeze_partials.raw_recvmsg.inet_recvmsg.sock_recvmsg
15.12  -1.3  13.86 ± 2%  perf-profile.calltrace.cycles-pp.inet_sock_destruct.__sk_destruct.rcu_do_batch.rcu_core.__do_softirq
12.74  -1.1  11.66 ± 2%  perf-profile.calltrace.cycles-pp.__sk_destruct.rcu_do_batch.rcu_core.__do_softirq.run_ksoftirqd
13.45  -1.1  12.39 ± 2%  perf-profile.calltrace.cycles-pp.rcu_do_batch.rcu_core.__do_softirq.run_ksoftirqd.smpboot_thread_fn
13.46  -1.1  12.40 ± 2%  perf-profile.calltrace.cycles-pp.rcu_core.__do_softirq.run_ksoftirqd.smpboot_thread_fn.kthread
13.48  -1.1  12.42 ± 2%  perf-profile.calltrace.cycles-pp.smpboot_thread_fn.kthread.ret_from_fork.ret_from_fork_asm
13.49  -1.1  12.43 ± 2%  perf-profile.calltrace.cycles-pp.ret_from_fork_asm
13.49  -1.1  12.43 ± 2%  perf-profile.calltrace.cycles-pp.ret_from_fork.ret_from_fork_asm
13.49  -1.1  12.43 ± 2%  perf-profile.calltrace.cycles-pp.kthread.ret_from_fork.ret_from_fork_asm
13.46  -1.1  12.40 ± 2%  perf-profile.calltrace.cycles-pp.__do_softirq.run_ksoftirqd.smpboot_thread_fn.kthread.ret_from_fork
13.46  -1.1  12.40 ± 2%  perf-profile.calltrace.cycles-pp.run_ksoftirqd.smpboot_thread_fn.kthread.ret_from_fork.ret_from_fork_asm
11.50  -0.6  10.88 ± 2%  perf-profile.calltrace.cycles-pp.raw_recvmsg.inet_recvmsg.sock_recvmsg.__sys_recvfrom.__x64_sys_recvfrom
11.54  -0.6  10.93 ± 2%  perf-profile.calltrace.cycles-pp.inet_recvmsg.sock_recvmsg.__sys_recvfrom.__x64_sys_recvfrom.do_syscall_64
11.72  -0.5  11.18 ± 2%  perf-profile.calltrace.cycles-pp.sock_recvmsg.__sys_recvfrom.__x64_sys_recvfrom.do_syscall_64.entry_SYSCALL_64_after_hwframe
1.18 ± 3%  -0.5  0.71 ± 4%  perf-profile.calltrace.cycles-pp._raw_spin_lock_irqsave.get_partial_node.get_any_partial.___slab_alloc.kmem_cache_alloc
1.15 ± 3%  -0.5  0.68 ± 4%  perf-profile.calltrace.cycles-pp.native_queued_spin_lock_slowpath._raw_spin_lock_irqsave.get_partial_node.get_any_partial.___slab_alloc
1.23 ± 3%  -0.5  0.77 ± 3%  perf-profile.calltrace.cycles-pp.get_partial_node.get_any_partial.___slab_alloc.kmem_cache_alloc.skb_clone
12.05  -0.4  11.61 ± 2%  perf-profile.calltrace.cycles-pp.__sys_recvfrom.__x64_sys_recvfrom.do_syscall_64.entry_SYSCALL_64_after_hwframe.recv
12.07  -0.4  11.64 ± 2%  perf-profile.calltrace.cycles-pp.__x64_sys_recvfrom.do_syscall_64.entry_SYSCALL_64_after_hwframe.recv
1.38 ± 3%  -0.4  0.94 ± 3%  perf-profile.calltrace.cycles-pp.get_any_partial.___slab_alloc.kmem_cache_alloc.skb_clone.raw_v4_input
1.84 ± 2%  -0.3  1.56 ± 6%  perf-profile.calltrace.cycles-pp.__slab_free.inet_sock_destruct.__sk_destruct.rcu_do_batch.rcu_core
0.78 ± 7%  -0.1  0.66 ± 5%  perf-profile.calltrace.cycles-pp.__do_softirq.do_softirq.__local_bh_enable_ip.sk_common_release.inet_release
0.78 ± 7%  -0.1  0.66 ± 5%  perf-profile.calltrace.cycles-pp.do_softirq.__local_bh_enable_ip.sk_common_release.inet_release.__sock_release
0.78 ± 7%  -0.1  0.66 ± 5%  perf-profile.calltrace.cycles-pp.rcu_core.__do_softirq.do_softirq.__local_bh_enable_ip.sk_common_release
0.78 ± 7%  -0.1  0.67 ± 5%  perf-profile.calltrace.cycles-pp.__local_bh_enable_ip.sk_common_release.inet_release.__sock_release.sock_close
0.52  +0.0  0.55  perf-profile.calltrace.cycles-pp.icmp_route_lookup.__icmp_send.__udp4_lib_rcv.ip_protocol_deliver_rcu.ip_local_deliver_finish
0.56  +0.0  0.60 ± 2%  perf-profile.calltrace.cycles-pp.new_inode_pseudo.sock_alloc.__sock_create.__sys_socket.__x64_sys_socket
0.57  +0.0  0.61 ± 2%  perf-profile.calltrace.cycles-pp.sock_alloc.__sock_create.__sys_socket.__x64_sys_socket.do_syscall_64
0.52  +0.0  0.57 ± 2%  perf-profile.calltrace.cycles-pp.alloc_file.alloc_file_pseudo.sock_alloc_file.__sys_socket.__x64_sys_socket
0.73  +0.1  0.78  perf-profile.calltrace.cycles-pp.sock_alloc_file.__sys_socket.__x64_sys_socket.do_syscall_64.entry_SYSCALL_64_after_hwframe
0.72  +0.1  0.78 ± 2%  perf-profile.calltrace.cycles-pp.alloc_file_pseudo.sock_alloc_file.__sys_socket.__x64_sys_socket.do_syscall_64
1.03 ± 3%  +0.1  1.10 ± 3%  perf-profile.calltrace.cycles-pp.sk_filter_trim_cap.sock_queue_rcv_skb_reason.raw_rcv.raw_v4_input.ip_protocol_deliver_rcu
0.59 ± 3%  +0.1  0.68 ± 2%  perf-profile.calltrace.cycles-pp.allocate_slab.___slab_alloc.kmem_cache_alloc.skb_clone.raw_v4_input
1.14  +0.1  1.22  perf-profile.calltrace.cycles-pp.dst_release.ipv4_pktinfo_prepare.raw_rcv.raw_v4_input.ip_protocol_deliver_rcu
1.96  +0.1  2.05  perf-profile.calltrace.cycles-pp.skb_release_data.kfree_skb_reason.inet_sock_destruct.__sk_destruct.rcu_do_batch
1.64  +0.1  1.75 ± 2%  perf-profile.calltrace.cycles-pp.ipv4_pktinfo_prepare.raw_rcv.raw_v4_input.ip_protocol_deliver_rcu.ip_local_deliver_finish
0.42 ± 44%  +0.1  0.55 ± 2%  perf-profile.calltrace.cycles-pp.alloc_empty_file.alloc_file.alloc_file_pseudo.sock_alloc_file.__sys_socket
2.42 ± 3%  +0.1  2.55 ± 3%  perf-profile.calltrace.cycles-pp.icmp_socket_deliver.icmp_unreach.icmp_rcv.ip_protocol_deliver_rcu.ip_local_deliver_finish
2.41 ± 3%  +0.1  2.54 ± 3%  perf-profile.calltrace.cycles-pp.raw_icmp_error.icmp_socket_deliver.icmp_unreach.icmp_rcv.ip_protocol_deliver_rcu
2.47 ± 3%  +0.1  2.60 ± 3%  perf-profile.calltrace.cycles-pp.icmp_unreach.icmp_rcv.ip_protocol_deliver_rcu.ip_local_deliver_finish.__netif_receive_skb_one_core
0.66  +0.1  0.80  perf-profile.calltrace.cycles-pp.copyout._copy_to_iter.__skb_datagram_iter.skb_copy_datagram_iter.raw_recvmsg
2.70 ± 2%  +0.2  2.84 ± 3%  perf-profile.calltrace.cycles-pp.icmp_rcv.ip_protocol_deliver_rcu.ip_local_deliver_finish.__netif_receive_skb_one_core.process_backlog
0.70  +0.2  0.88  perf-profile.calltrace.cycles-pp._copy_to_iter.__skb_datagram_iter.skb_copy_datagram_iter.raw_recvmsg.inet_recvmsg
0.62 ± 7%  +0.2  0.79 ± 11%  perf-profile.calltrace.cycles-pp.free_unref_page.inet_sock_destruct.__sk_destruct.rcu_do_batch.rcu_core
0.66 ± 2%  +0.2  0.85 ± 2%  perf-profile.calltrace.cycles-pp.__check_object_size.simple_copy_to_iter.__skb_datagram_iter.skb_copy_datagram_iter.raw_recvmsg
0.66  +0.2  0.84  perf-profile.calltrace.cycles-pp.skb_release_data.consume_skb.raw_recvmsg.inet_recvmsg.sock_recvmsg
0.70 ± 2%  +0.2  0.90 ± 2%  perf-profile.calltrace.cycles-pp.simple_copy_to_iter.__skb_datagram_iter.skb_copy_datagram_iter.raw_recvmsg.inet_recvmsg
0.94  +0.3  1.24  perf-profile.calltrace.cycles-pp.consume_skb.raw_recvmsg.inet_recvmsg.sock_recvmsg.__sys_recvfrom
0.81  +0.3  1.14 ± 2%  perf-profile.calltrace.cycles-pp._raw_spin_lock_irqsave.__skb_try_recv_datagram.__skb_recv_datagram.skb_recv_datagram.raw_recvmsg
0.99  +0.4  1.37 ± 3%  perf-profile.calltrace.cycles-pp.__skb_try_recv_datagram.__skb_recv_datagram.skb_recv_datagram.raw_recvmsg.inet_recvmsg
1.53  +0.4  1.94 ± 2%  perf-profile.calltrace.cycles-pp.__skb_datagram_iter.skb_copy_datagram_iter.raw_recvmsg.inet_recvmsg.sock_recvmsg
1.55 ± 2%  +0.4  1.97 ± 2%  perf-profile.calltrace.cycles-pp.skb_copy_datagram_iter.raw_recvmsg.inet_recvmsg.sock_recvmsg.__sys_recvfrom
1.10 ± 2%  +0.4  1.52 ± 3%  perf-profile.calltrace.cycles-pp.__skb_recv_datagram.skb_recv_datagram.raw_recvmsg.inet_recvmsg.sock_recvmsg
1.12 ± 2%  +0.4  1.54 ± 3%  perf-profile.calltrace.cycles-pp.skb_recv_datagram.raw_recvmsg.inet_recvmsg.sock_recvmsg.__sys_recvfrom
0.00  +0.5  0.53 ± 2%  perf-profile.calltrace.cycles-pp.alloc_inode.new_inode_pseudo.sock_alloc.__sock_create.__sys_socket
4.85  +0.6  5.40  perf-profile.calltrace.cycles-pp.__copy_skb_header.__skb_clone.raw_v4_input.ip_protocol_deliver_rcu.ip_local_deliver_finish
7.74  +0.6  8.32  perf-profile.calltrace.cycles-pp.kfree_skb_reason.inet_sock_destruct.__sk_destruct.rcu_do_batch.rcu_core
0.00  +0.6  0.61 ± 2%  perf-profile.calltrace.cycles-pp.check_heap_object.__check_object_size.simple_copy_to_iter.__skb_datagram_iter.skb_copy_datagram_iter
0.00  +0.7  0.65 ± 3%  perf-profile.calltrace.cycles-pp.native_queued_spin_lock_slowpath._raw_spin_lock_irqsave.__skb_try_recv_datagram.__skb_recv_datagram.skb_recv_datagram
4.56  +0.7  5.25  perf-profile.calltrace.cycles-pp.sock_def_readable.__sock_queue_rcv_skb.sock_queue_rcv_skb_reason.raw_rcv.raw_v4_input
7.56  +0.7  8.31  perf-profile.calltrace.cycles-pp.__skb_clone.raw_v4_input.ip_protocol_deliver_rcu.ip_local_deliver_finish.__netif_receive_skb_one_core
50.71  +0.8  51.47  perf-profile.calltrace.cycles-pp.raw_v4_input.ip_protocol_deliver_rcu.ip_local_deliver_finish.__netif_receive_skb_one_core.process_backlog
3.67  +0.9  4.59 ± 2%  perf-profile.calltrace.cycles-pp.native_queued_spin_lock_slowpath._raw_spin_lock_irqsave.__sock_queue_rcv_skb.sock_queue_rcv_skb_reason.raw_rcv
55.46  +1.0  56.43  perf-profile.calltrace.cycles-pp.ip_local_deliver_finish.__netif_receive_skb_one_core.process_backlog.__napi_poll.net_rx_action
55.45  +1.0  56.42  perf-profile.calltrace.cycles-pp.ip_protocol_deliver_rcu.ip_local_deliver_finish.__netif_receive_skb_one_core.process_backlog.__napi_poll
58.02  +1.0  59.01  perf-profile.calltrace.cycles-pp.__local_bh_enable_ip.__dev_queue_xmit.ip_finish_output2.raw_send_hdrinc.raw_sendmsg
57.99  +1.0  58.98  perf-profile.calltrace.cycles-pp.__do_softirq.do_softirq.__local_bh_enable_ip.__dev_queue_xmit.ip_finish_output2
58.01  +1.0  59.00  perf-profile.calltrace.cycles-pp.do_softirq.__local_bh_enable_ip.__dev_queue_xmit.ip_finish_output2.raw_send_hdrinc
55.61  +1.0  56.60  perf-profile.calltrace.cycles-pp.__netif_receive_skb_one_core.process_backlog.__napi_poll.net_rx_action.__do_softirq
58.10  +1.0  59.10  perf-profile.calltrace.cycles-pp.ip_finish_output2.raw_send_hdrinc.raw_sendmsg.sock_sendmsg.__sys_sendto
58.09  +1.0  59.09  perf-profile.calltrace.cycles-pp.__dev_queue_xmit.ip_finish_output2.raw_send_hdrinc.raw_sendmsg.sock_sendmsg
55.68  +1.0  56.68  perf-profile.calltrace.cycles-pp.process_backlog.__napi_poll.net_rx_action.__do_softirq.do_softirq
55.70  +1.0  56.70  perf-profile.calltrace.cycles-pp.net_rx_action.__do_softirq.do_softirq.__local_bh_enable_ip.__dev_queue_xmit
55.68  +1.0  56.68  perf-profile.calltrace.cycles-pp.__napi_poll.net_rx_action.__do_softirq.do_softirq.__local_bh_enable_ip
59.51  +1.0  60.54  perf-profile.calltrace.cycles-pp.raw_sendmsg.sock_sendmsg.__sys_sendto.__x64_sys_sendto.do_syscall_64
59.58  +1.0  60.61  perf-profile.calltrace.cycles-pp.sock_sendmsg.__sys_sendto.__x64_sys_sendto.do_syscall_64.entry_SYSCALL_64_after_hwframe
58.64  +1.0  59.68  perf-profile.calltrace.cycles-pp.raw_send_hdrinc.raw_sendmsg.sock_sendmsg.__sys_sendto.__x64_sys_sendto
59.70  +1.0  60.74  perf-profile.calltrace.cycles-pp.__x64_sys_sendto.do_syscall_64.entry_SYSCALL_64_after_hwframe.sendto
59.69  +1.0  60.73  perf-profile.calltrace.cycles-pp.__sys_sendto.__x64_sys_sendto.do_syscall_64.entry_SYSCALL_64_after_hwframe.sendto
4.97  +1.0  6.02 ± 2%  perf-profile.calltrace.cycles-pp._raw_spin_lock_irqsave.__sock_queue_rcv_skb.sock_queue_rcv_skb_reason.raw_rcv.raw_v4_input
59.76  +1.0  60.81  perf-profile.calltrace.cycles-pp.do_syscall_64.entry_SYSCALL_64_after_hwframe.sendto
59.78  +1.1  60.84  perf-profile.calltrace.cycles-pp.entry_SYSCALL_64_after_hwframe.sendto
59.85  +1.1  60.91  perf-profile.calltrace.cycles-pp.sendto
22.21  +2.3  24.51  perf-profile.calltrace.cycles-pp.__sock_queue_rcv_skb.sock_queue_rcv_skb_reason.raw_rcv.raw_v4_input.ip_protocol_deliver_rcu
23.65  +2.4  26.03  perf-profile.calltrace.cycles-pp.sock_queue_rcv_skb_reason.raw_rcv.raw_v4_input.ip_protocol_deliver_rcu.ip_local_deliver_finish
29.20  +2.7  31.86  perf-profile.calltrace.cycles-pp.raw_rcv.raw_v4_input.ip_protocol_deliver_rcu.ip_local_deliver_finish.__netif_receive_skb_one_core
28.65  -5.8  22.86 ± 2%  perf-profile.children.cycles-pp.native_queued_spin_lock_slowpath
24.32  -5.6  18.77 ± 2%  perf-profile.children.cycles-pp._raw_spin_lock_irqsave
10.24 ± 2%  -4.1  6.14 ± 5%  perf-profile.children.cycles-pp.__unfreeze_partials
8.45  -3.3  5.11 ± 4%  perf-profile.children.cycles-pp.get_partial_node
10.66  -2.8  7.90 ± 3%  perf-profile.children.cycles-pp.___slab_alloc
12.63  -2.7  9.92 ± 2%  perf-profile.children.cycles-pp.skb_clone
12.72  -2.7  10.02 ± 2%  perf-profile.children.cycles-pp.kmem_cache_alloc
17.07  -1.4  15.72 ± 2%  perf-profile.children.cycles-pp.inet_sock_destruct
17.58  -1.3  16.26 ± 2%  perf-profile.children.cycles-pp.__sk_destruct
18.54  -1.3  17.26 ± 2%  perf-profile.children.cycles-pp.rcu_do_batch
18.54  -1.3  17.27 ± 2%  perf-profile.children.cycles-pp.rcu_core
13.48  -1.1  12.42 ± 2%  perf-profile.children.cycles-pp.smpboot_thread_fn
13.49  -1.1  12.43 ± 2%  perf-profile.children.cycles-pp.ret_from_fork_asm
13.49  -1.1  12.43 ± 2%  perf-profile.children.cycles-pp.ret_from_fork
13.49  -1.1  12.43 ± 2%  perf-profile.children.cycles-pp.kthread
13.46  -1.1  12.40 ± 2%  perf-profile.children.cycles-pp.run_ksoftirqd
11.51  -0.6  10.89 ± 2%  perf-profile.children.cycles-pp.raw_recvmsg
11.55  -0.6  10.94 ± 2%  perf-profile.children.cycles-pp.inet_recvmsg
11.73  -0.5  11.19 ± 2%  perf-profile.children.cycles-pp.sock_recvmsg
12.06  -0.4  11.63 ± 2%  perf-profile.children.cycles-pp.__sys_recvfrom
12.08  -0.4  11.65 ± 2%  perf-profile.children.cycles-pp.__x64_sys_recvfrom
1.39 ± 3%  -0.4  0.96 ± 3%  perf-profile.children.cycles-pp.get_any_partial
3.16  -0.2  3.00 ± 3%  perf-profile.children.cycles-pp.__slab_free
0.05  +0.0  0.06 ± 6%  perf-profile.children.cycles-pp.syscall_return_via_sysret
0.13 ± 2%  +0.0  0.14 ± 3%  perf-profile.children.cycles-pp.apparmor_capable
0.12 ± 3%  +0.0  0.14 ± 5%  perf-profile.children.cycles-pp.apparmor_file_alloc_security
0.05 ± 7%  +0.0  0.07 ± 7%  perf-profile.children.cycles-pp.__wake_up_common
0.15  +0.0  0.16 ± 3%  perf-profile.children.cycles-pp._raw_spin_trylock
0.34 ± 2%  +0.0  0.36  perf-profile.children.cycles-pp.ip_route_output_key_hash
0.11 ± 6%  +0.0  0.13 ± 2%  perf-profile.children.cycles-pp.rmqueue
0.12 ± 3%  +0.0  0.14 ± 4%  perf-profile.children.cycles-pp.release_sock
0.05  +0.0  0.07 ± 10%  perf-profile.children.cycles-pp.schedule_timeout
0.05  +0.0  0.07 ± 10%  perf-profile.children.cycles-pp.try_to_wake_up
0.08 ± 6%  +0.0  0.09 ± 5%  perf-profile.children.cycles-pp.__virt_addr_valid
0.16 ± 2%  +0.0  0.18 ± 2%  perf-profile.children.cycles-pp.__free_one_page
0.07 ± 6%  +0.0  0.09 ± 5%  perf-profile.children.cycles-pp.exit_to_user_mode_prepare
0.16 ± 5%  +0.0  0.18 ± 3%  perf-profile.children.cycles-pp.get_page_from_freelist
0.32 ± 2%  +0.0  0.34  perf-profile.children.cycles-pp.security_socket_post_create
0.06 ± 7%  +0.0  0.09 ± 5%  perf-profile.children.cycles-pp.stress_rawudp_server
0.24  +0.0  0.26  perf-profile.children.cycles-pp._raw_spin_lock_bh
0.06 ± 6%  +0.0  0.08 ± 8%  perf-profile.children.cycles-pp.__skb_wait_for_more_packets
0.04 ± 44%  +0.0  0.06 ± 7%  perf-profile.children.cycles-pp.__netif_receive_skb_core
0.24  +0.0  0.26 ± 2%  perf-profile.children.cycles-pp.init_file
0.18 ± 4%  +0.0  0.21 ± 2%  perf-profile.children.cycles-pp.__alloc_pages
0.33  +0.0  0.36 ± 3%  perf-profile.children.cycles-pp.sock_alloc_inode
0.11 ± 8%  +0.0  0.14 ± 5%  perf-profile.children.cycles-pp.__fget_light
0.15 ± 2%  +0.0  0.18 ± 2%  perf-profile.children.cycles-pp._raw_spin_unlock_irqrestore
0.44  +0.0  0.47  perf-profile.children.cycles-pp.kmem_cache_alloc_lru
0.29 ± 4%  +0.0  0.32 ± 2%  perf-profile.children.cycles-pp.__sysvec_apic_timer_interrupt
0.14 ± 2%  +0.0  0.17 ± 2%  perf-profile.children.cycles-pp.put_cpu_partial
0.23 ± 6%  +0.0  0.26 ± 5%  perf-profile.children.cycles-pp.tick_sched_timer
0.32 ± 3%  +0.0  0.35 ± 3%  perf-profile.children.cycles-pp.setsockopt
0.08 ± 5%  +0.0  0.12 ± 6%  perf-profile.children.cycles-pp.__schedule
0.07 ± 6%  +0.0  0.10 ± 7%  perf-profile.children.cycles-pp.schedule
0.16 ± 2%  +0.0  0.19 ± 2%  perf-profile.children.cycles-pp.entry_SYSRETQ_unsafe_stack
0.14 ± 6%  +0.0  0.17 ± 4%  perf-profile.children.cycles-pp.sockfd_lookup_light
0.29  +0.0  0.32 ± 4%  perf-profile.children.cycles-pp.setup_object
0.28 ± 4%  +0.0  0.32 ± 3%  perf-profile.children.cycles-pp.hrtimer_interrupt
0.52  +0.0  0.56  perf-profile.children.cycles-pp.icmp_route_lookup
0.49  +0.0  0.53 ± 2%  perf-profile.children.cycles-pp.alloc_inode
0.13 ± 4%  +0.0  0.17 ± 2%  perf-profile.children.cycles-pp.syscall_exit_to_user_mode
0.51  +0.0  0.55 ± 2%  perf-profile.children.cycles-pp.alloc_empty_file
0.22 ± 2%  +0.0  0.26 ± 3%  perf-profile.children.cycles-pp.__entry_text_start
0.57  +0.0  0.61  perf-profile.children.cycles-pp.sock_alloc
0.44  +0.0  0.48  perf-profile.children.cycles-pp.__list_del_entry_valid_or_report
0.56  +0.0  0.61  perf-profile.children.cycles-pp.new_inode_pseudo
0.52  +0.0  0.57 ± 2%  perf-profile.children.cycles-pp.alloc_file
0.00  +0.1  0.05  perf-profile.children.cycles-pp.__ip_finish_output
0.72 ± 2%  +0.1  0.78 ± 2%  perf-profile.children.cycles-pp.alloc_file_pseudo
0.15 ± 3%  +0.1  0.20 ± 2%  perf-profile.children.cycles-pp.is_vmalloc_addr
0.01 ±223%  +0.1  0.06 ± 6%  perf-profile.children.cycles-pp.autoremove_wake_function
0.73  +0.1  0.78  perf-profile.children.cycles-pp.sock_alloc_file
0.16 ± 3%  +0.1  0.22 ± 3%  perf-profile.children.cycles-pp.security_socket_recvmsg
0.20 ± 2%  +0.1  0.27 ± 3%  perf-profile.children.cycles-pp.aa_sk_perm
1.12 ± 3%  +0.1  1.19 ± 3%  perf-profile.children.cycles-pp.sk_filter_trim_cap
0.14 ± 3%  +0.1  0.21 ± 5%  perf-profile.children.cycles-pp.__wake_up_common_lock
0.67  +0.1  0.75  perf-profile.children.cycles-pp.shuffle_freelist
0.12 ± 4%  +0.1  0.20 ± 4%  perf-profile.children.cycles-pp.__list_add_valid_or_report
1.24  +0.1  1.33  perf-profile.children.cycles-pp.dst_release
2.12  +0.1  2.22  perf-profile.children.cycles-pp.kmem_cache_free
0.89 ± 2%  +0.1  1.00  perf-profile.children.cycles-pp.allocate_slab
1.72  +0.1  1.84  perf-profile.children.cycles-pp.ipv4_pktinfo_prepare
2.42 ± 2%  +0.1  2.55 ± 3%  perf-profile.children.cycles-pp.icmp_socket_deliver
2.41 ± 3%  +0.1  2.54 ± 3%  perf-profile.children.cycles-pp.raw_icmp_error
2.47 ± 3%  +0.1  2.60 ± 3%  perf-profile.children.cycles-pp.icmp_unreach
2.70 ± 2%  +0.1  2.85 ± 3%  perf-profile.children.cycles-pp.icmp_rcv
0.67 ± 2%  +0.2  0.83  perf-profile.children.cycles-pp.copyout
0.60 ± 2%  +0.2  0.77 ± 2%  perf-profile.children.cycles-pp.check_heap_object
0.71 ± 2%  +0.2  0.88  perf-profile.children.cycles-pp._copy_to_iter
0.73 ± 2%  +0.2  0.93 ± 2%  perf-profile.children.cycles-pp.__check_object_size
0.71 ± 2%  +0.2  0.91 ± 2%  perf-profile.children.cycles-pp.simple_copy_to_iter
0.82 ± 7%  +0.2  1.03 ± 10%  perf-profile.children.cycles-pp.free_unref_page
2.39  +0.2  2.63  perf-profile.children.cycles-pp.sock_rfree
2.99  +0.3  3.28  perf-profile.children.cycles-pp.skb_release_head_state
1.01  +0.3  1.31  perf-profile.children.cycles-pp.consume_skb
4.47  +0.4  4.85  perf-profile.children.cycles-pp.skb_release_data
1.01  +0.4  1.39 ± 3%  perf-profile.children.cycles-pp.__skb_try_recv_datagram
1.54  +0.4  1.95 ± 2%  perf-profile.children.cycles-pp.__skb_datagram_iter
1.56 ± 2%  +0.4  1.97 ± 2%  perf-profile.children.cycles-pp.skb_copy_datagram_iter
1.11 ± 2%  +0.4  1.53 ± 3%  perf-profile.children.cycles-pp.__skb_recv_datagram
1.12 ± 2%  +0.4  1.55 ± 3%  perf-profile.children.cycles-pp.skb_recv_datagram
4.89  +0.6  5.45  perf-profile.children.cycles-pp.__copy_skb_header
4.58  +0.7  5.27  perf-profile.children.cycles-pp.sock_def_readable
10.70  +0.7  11.42  perf-profile.children.cycles-pp.kfree_skb_reason
7.65  +0.8  8.40  perf-profile.children.cycles-pp.__skb_clone
50.92  +0.8  51.69  perf-profile.children.cycles-pp.raw_v4_input
59.46  +0.8  60.25  perf-profile.children.cycles-pp.do_softirq
59.48  +0.8  60.28  perf-profile.children.cycles-pp.__local_bh_enable_ip
85.69  +0.9  86.58  perf-profile.children.cycles-pp.do_syscall_64
85.79  +0.9  86.70  perf-profile.children.cycles-pp.entry_SYSCALL_64_after_hwframe
55.46  +1.0  56.44  perf-profile.children.cycles-pp.ip_local_deliver_finish
55.45  +1.0  56.43  perf-profile.children.cycles-pp.ip_protocol_deliver_rcu
55.61  +1.0  56.61  perf-profile.children.cycles-pp.__netif_receive_skb_one_core
55.68  +1.0  56.68  perf-profile.children.cycles-pp.__napi_poll
55.71  +1.0  56.71  perf-profile.children.cycles-pp.net_rx_action
55.68  +1.0  56.68  perf-profile.children.cycles-pp.process_backlog
58.22  +1.0  59.22  perf-profile.children.cycles-pp.__dev_queue_xmit
58.26  +1.0  59.26  perf-profile.children.cycles-pp.ip_finish_output2
59.52  +1.0  60.54  perf-profile.children.cycles-pp.raw_sendmsg
58.65  +1.0  59.68  perf-profile.children.cycles-pp.raw_send_hdrinc
59.58  +1.0  60.61  perf-profile.children.cycles-pp.sock_sendmsg
59.71  +1.0  60.75  perf-profile.children.cycles-pp.__x64_sys_sendto
59.69  +1.0  60.74  perf-profile.children.cycles-pp.__sys_sendto
59.87  +1.1  60.93  perf-profile.children.cycles-pp.sendto
22.30  +2.3  24.61  perf-profile.children.cycles-pp.__sock_queue_rcv_skb
23.78  +2.4  26.16  perf-profile.children.cycles-pp.sock_queue_rcv_skb_reason
29.56  +2.7  32.22  perf-profile.children.cycles-pp.raw_rcv
28.12  -5.9  22.26 ± 2%  perf-profile.self.cycles-pp.native_queued_spin_lock_slowpath
0.64  -0.5  0.12 ± 3%  perf-profile.self.cycles-pp.__unfreeze_partials
0.45  -0.2  0.26 ± 3%  perf-profile.self.cycles-pp.get_partial_node
0.05  +0.0  0.06  perf-profile.self.cycles-pp.__dev_queue_xmit
0.05  +0.0  0.06  perf-profile.self.cycles-pp.__cond_resched
0.05  +0.0  0.06  perf-profile.self.cycles-pp.syscall_return_via_sysret
0.06 ± 6%  +0.0  0.07  perf-profile.self.cycles-pp.__check_object_size
0.13  +0.0  0.14 ± 2%  perf-profile.self.cycles-pp.apparmor_capable
0.10 ± 3%  +0.0  0.12 ± 4%  perf-profile.self.cycles-pp.__sk_destruct
0.37  +0.0  0.39  perf-profile.self.cycles-pp.sock_queue_rcv_skb_reason
0.15 ± 2%  +0.0  0.16 ± 2%  perf-profile.self.cycles-pp._raw_spin_trylock
0.06 ± 7%  +0.0  0.08  perf-profile.self.cycles-pp.__virt_addr_valid
0.43  +0.0  0.44  perf-profile.self.cycles-pp.memcg_slab_post_alloc_hook
0.19 ± 3%  +0.0  0.21  perf-profile.self.cycles-pp.ip_route_output_key_hash_rcu
0.10 ± 4%  +0.0  0.12 ± 3%  perf-profile.self.cycles-pp.entry_SYSCALL_64_after_hwframe
0.06 ± 9%  +0.0  0.07 ± 5%  perf-profile.self.cycles-pp.is_vmalloc_addr
0.12 ± 3%  +0.0  0.14  perf-profile.self.cycles-pp._raw_spin_unlock_irqrestore
0.30  +0.0  0.32 ± 2%  perf-profile.self.cycles-pp.apparmor_socket_post_create
0.06 ± 9%  +0.0  0.08 ± 6%  perf-profile.self.cycles-pp.stress_rawudp_server
0.41  +0.0  0.43  perf-profile.self.cycles-pp.skb_clone
0.24  +0.0  0.26  perf-profile.self.cycles-pp._raw_spin_lock_bh
0.07 ± 5%  +0.0  0.10 ± 5%  perf-profile.self.cycles-pp.__skb_try_recv_datagram
0.45  +0.0  0.48 ± 2%  perf-profile.self.cycles-pp._raw_spin_lock
0.36 ± 2%  +0.0  0.38  perf-profile.self.cycles-pp.inet_sock_destruct
0.10 ± 8%  +0.0  0.12 ± 6%  perf-profile.self.cycles-pp.__fget_light
0.14 ± 3%  +0.0  0.16 ± 2%  perf-profile.self.cycles-pp.put_cpu_partial
0.69  +0.0  0.72  perf-profile.self.cycles-pp.raw_rcv
0.03 ± 70%  +0.0  0.06 ± 6%  perf-profile.self.cycles-pp.__netif_receive_skb_core
0.15 ± 2%  +0.0  0.18 ± 4%  perf-profile.self.cycles-pp.get_any_partial
0.11 ± 3%  +0.0  0.15 ± 3%  perf-profile.self.cycles-pp.recv
0.16 ± 3%  +0.0  0.19 ± 4%  perf-profile.self.cycles-pp.entry_SYSRETQ_unsafe_stack
0.12 ± 5%  +0.0  0.16 ± 3%  perf-profile.self.cycles-pp.__skb_datagram_iter
0.47  +0.0  0.51  perf-profile.self.cycles-pp.skb_release_head_state
0.43  +0.0  0.47 ± 2%  perf-profile.self.cycles-pp.__list_del_entry_valid_or_report
0.00  +0.1  0.05  perf-profile.self.cycles-pp._copy_to_iter
0.00  +0.1  0.05 ± 7%  perf-profile.self.cycles-pp.__skb_recv_datagram
0.45 ± 2%  +0.1  0.51 ± 2%  perf-profile.self.cycles-pp.shuffle_freelist
1.12 ± 3%  +0.1  1.18 ± 5%  perf-profile.self.cycles-pp.raw_v4_input
0.16 ± 2%  +0.1  0.22 ± 2%  perf-profile.self.cycles-pp.aa_sk_perm
0.20 ± 2%  +0.1  0.26 ± 3%  perf-profile.self.cycles-pp.__sys_recvfrom
0.11 ± 5%  +0.1  0.19 ± 3%  perf-profile.self.cycles-pp.__list_add_valid_or_report
1.18  +0.1  1.26  perf-profile.self.cycles-pp.dst_release
1.91  +0.1  2.00  perf-profile.self.cycles-pp.kmem_cache_free
0.44  +0.1  0.57 ± 2%  perf-profile.self.cycles-pp.check_heap_object
2.40 ± 3%  +0.1  2.53 ± 3%  perf-profile.self.cycles-pp.raw_icmp_error
0.64 ± 2%  +0.2  0.79  perf-profile.self.cycles-pp.copyout
2.25  +0.2  2.42  perf-profile.self.cycles-pp.__slab_free
0.60 ± 2%  +0.2  0.78 ± 3%  perf-profile.self.cycles-pp.raw_recvmsg
2.75  +0.2  2.94  perf-profile.self.cycles-pp.__skb_clone
2.36  +0.2  2.60  perf-profile.self.cycles-pp.sock_rfree
4.24  +0.4  4.59  perf-profile.self.cycles-pp.kfree_skb_reason
4.24  +0.4  4.60  perf-profile.self.cycles-pp.skb_release_data
2.52  +0.4  2.91  perf-profile.self.cycles-pp._raw_spin_lock_irqsave
12.54  +0.6  13.09  perf-profile.self.cycles-pp.__sock_queue_rcv_skb
4.83  +0.6  5.38  perf-profile.self.cycles-pp.__copy_skb_header
4.40  +0.6  5.03  perf-profile.self.cycles-pp.sock_def_readable
0.91  +0.7  1.60  perf-profile.self.cycles-pp.___slab_alloc

Disclaimer: Results have been estimated based on internal Intel analysis and
are provided for informational purposes only. Any difference in system
hardware or software design or configuration may affect actual performance.
On Thu, Nov 02, 2023 at 03:23:27AM +0000, chengming.zhou@linux.dev wrote:
> From: Chengming Zhou <zhouchengming@bytedance.com>
>
> Now we will freeze slabs when moving them out of node partial list to
> cpu partial list, this method needs two cmpxchg_double operations:
>
> 1. freeze slab (acquire_slab()) under the node list_lock
> 2. get_freelist() when pick used in ___slab_alloc()

Recently -next has been failing to boot on a Raspberry Pi 3 with an arm
multi_v7_defconfig and an NFS rootfs; a bisect appears to point to this
patch (in -next as c8d312e039030edab25836a326bcaeb2a3d4db14) as having
introduced the issue. I've included the full bisect log below.

When we see problems we see RCU stalls while logging in, for example:

debian-testing-armhf login: root (automatic login)
Linux debian-testing-armhf 6.7.0-rc1-00006-gc8d312e03903 #1 SMP @1699864348 armv7l

The programs included with the Debian GNU/Linux system are free software;
the exact distribution terms for each program are described in the
individual files in /usr/share/doc/*/copyright.

Debian GNU/Linux comes with ABSOLUTELY NO WARRANTY, to the extent
permitted by applicable law.

[   46.453323] rcu: INFO: rcu_sched detected stalls on CPUs/tasks:
[   46.459361] rcu: 3-...0: (1 GPs behind) idle=def4/1/0x40000000 softirq=1304/1304 fqs=951
[   46.467669] rcu: (detected by 0, t=2103 jiffies, g=1161, q=499 ncpus=4)
[   46.474472] Sending NMI from CPU 0 to CPUs 3:
[   56.478894] rcu: rcu_sched kthread timer wakeup didn't happen for 1002 jiffies! g1161 f0x0 RCU_GP_WAIT_FQS(5) ->state=0x402
[   56.490195] rcu: Possible timer handling issue on cpu=0 timer-softirq=1650
[   56.497259] rcu: rcu_sched kthread starved for 1005 jiffies! g1161 f0x0 RCU_GP_WAIT_FQS(5) ->state=0x402 ->cpu=0
[   56.507589] rcu: Unless rcu_sched kthread gets sufficient CPU time, OOM is now expected behavior.
[   56.516681] rcu: RCU grace-period kthread stack dump:
[   56.521803] task:rcu_sched state:I stack:0 pid:13 tgid:13 ppid:2 flags:0x00000000
[   56.531267] __schedule from schedule+0x20/0xe8
[   56.535883] schedule from schedule_timeout+0xa0/0x158
[   56.541111] schedule_timeout from rcu_gp_fqs_loop+0x104/0x594
[   56.547048] rcu_gp_fqs_loop from rcu_gp_kthread+0x14c/0x1c0
[   56.552801] rcu_gp_kthread from kthread+0xe0/0xfc
[   56.557674] kthread from ret_from_fork+0x14/0x28
[   56.562457] Exception stack(0xf084dfb0 to 0xf084dff8)
[   56.567584] dfa0: 00000000 00000000 00000000 00000000
[   56.575886] dfc0: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
[   56.584191] dfe0: 00000000 00000000 00000000 00000000 00000013 00000000
[   56.590907] rcu: Stack dump where RCU GP kthread last ran:
[   56.596474] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 6.7.0-rc1-00006-gc8d312e03903 #1
[   56.604515] Hardware name: BCM2835
[   56.607965] PC is at default_idle_call+0x1c/0xb0
[   56.612654] LR is at ct_kernel_enter.constprop.0+0x48/0x11c
[   56.618311] pc : [<c1197054>] lr : [<c1195c98>] psr: 60010013
[   56.624672] sp : c1b01f70 ip : c1d5af7c fp : 00000000
[   56.629974] r10: c19cda60 r9 : 00000000 r8 : 00000000
[   56.635277] r7 : c1b04d50 r6 : c1b04d18 r5 : c1d5b684 r4 : c1b09740
[   56.641902] r3 : 00000000 r2 : 00000000 r1 : 00000001 r0 : 002a3114
[   56.648528] Flags: nZCv IRQs on FIQs on Mode SVC_32 ISA ARM Segment none
[   56.655774] Control: 10c5383d Table: 0237406a DAC: 00000051
[   56.661605] default_idle_call from do_idle+0x208/0x290
[   56.666920] do_idle from cpu_startup_entry+0x28/0x2c
[   56.672059] cpu_startup_entry from rest_init+0xac/0xb0
[   56.677371] rest_init from arch_post_acpi_subsys_init+0x0/0x8
Login ti

A full log for that run can be seen at:

    https://validation.linaro.org/scheduler/job/4017095

Boots to initramfs with the same kernel image seem fine. Other systems,
including other 32 bit arm ones, don't seem to be having similar issues
with this userspace.
I've not investigated beyond running the bisect, the log for which is below:

git bisect start
# good: [64e6d94bfb47ed0732ad06aedf8ec6af5dd2ab84] Merge branch 'for-linux-next-fixes' of git://anongit.freedesktop.org/drm/drm-misc
git bisect good 64e6d94bfb47ed0732ad06aedf8ec6af5dd2ab84
# bad: [5a82d69d48c82e89aef44483d2a129f869f3506a] Add linux-next specific files for 20231120
git bisect bad 5a82d69d48c82e89aef44483d2a129f869f3506a
# good: [ce252a92a867da8a6622463bff637e5f7b904a46] Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/wireless/wireless-next.git
git bisect good ce252a92a867da8a6622463bff637e5f7b904a46
# good: [c22e026efad504e3b056d4436920d173a09c580e] Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/broonie/regulator.git
git bisect good c22e026efad504e3b056d4436920d173a09c580e
# good: [b7fc58ffb105470cb339163cc2b04e3f59387a45] Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/fpga/linux-fpga.git
git bisect good b7fc58ffb105470cb339163cc2b04e3f59387a45
# good: [26f89f0614f03e4016578a992fc2e86b048a5cb4] Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/linusw/linux-pinctrl.git
git bisect good 26f89f0614f03e4016578a992fc2e86b048a5cb4
# good: [602bf18307981f3bfd9ebf19921791a4256d3fd1] Merge branch 'for-6.7' into for-next
git bisect good 602bf18307981f3bfd9ebf19921791a4256d3fd1
# good: [9f16a68069822b1df6bfb8a9ef7258a1e32b25e7] Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/livepatching/livepatching
git bisect good 9f16a68069822b1df6bfb8a9ef7258a1e32b25e7
# good: [3ff57db6f6569ebc2cc333437e7e949749e59424] Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/krisman/unicode.git
git bisect good 3ff57db6f6569ebc2cc333437e7e949749e59424
# bad: [dd374e220ba492f95344a638b1efe5b2744fdd73] slub: Update frozen slabs documentations in the source
git bisect bad dd374e220ba492f95344a638b1efe5b2744fdd73
# good: [a3058965bb35490454953aa2c87ea51004839f2f] slub: Prepare __slab_free() for unfrozen partial slab out of node partial list
git bisect good a3058965bb35490454953aa2c87ea51004839f2f
# bad: [c8d312e039030edab25836a326bcaeb2a3d4db14] slub: Delay freezing of partial slabs
git bisect bad c8d312e039030edab25836a326bcaeb2a3d4db14
# good: [00b15a19ee543f0117cb217fcbab8b7b3fd50677] slub: Introduce freeze_slab()
git bisect good 00b15a19ee543f0117cb217fcbab8b7b3fd50677
# first bad commit: [c8d312e039030edab25836a326bcaeb2a3d4db14] slub: Delay freezing of partial slabs
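[Background for readers of the archive: the "two cmpxchg_double operations"
in the commit message quoted at the top of this report are SLUB's lockless
double-word compare-and-swap on a slab's {freelist, counters} pair. The
sketch below is a rough user-space paraphrase of that pre-series flow, not
the kernel's actual code; the struct layout, the FROZEN bit, and every name
in it are illustrative assumptions. Build with: gcc -mcx16 sketch.c -latomic]

	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	struct slab_core {            /* stand-in for slab->{freelist, counters} */
		void     *freelist;   /* first free object, or NULL              */
		uint64_t  counters;   /* SLUB packs inuse/objects/frozen here    */
	};

	#define FROZEN (1ULL << 63)   /* illustrative "frozen" flag */

	/* 16-byte CAS, roughly what cmpxchg_double compiles to on x86-64 */
	static bool cas_double(struct slab_core *s, struct slab_core old,
			       struct slab_core new)
	{
		return __atomic_compare_exchange(s, &old, &new, false,
						 __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
	}

	/* Pre-series flow: CAS #1 freezes the slab as it is taken off the
	 * node partial list (under list_lock)... */
	static void freeze_slab_sketch(struct slab_core *s)
	{
		struct slab_core old, new;
		do {
			old = *s;             /* torn read is fine: CAS rechecks */
			new = old;
			new.counters |= FROZEN;
		} while (!cas_double(s, old, new));
	}

	/* ...and CAS #2 later grabs the whole freelist in ___slab_alloc().
	 * Delaying the freeze lets the series drop the first CAS for slabs
	 * moved onto the cpu partial list. */
	static void *get_freelist_sketch(struct slab_core *s)
	{
		struct slab_core old, new;
		do {
			old = *s;
			new.freelist = NULL;  /* take every free object */
			new.counters = old.counters;
		} while (!cas_double(s, old, new));
		return old.freelist;
	}

	int main(void)
	{
		struct slab_core s = { .freelist = &s, .counters = 0 };

		freeze_slab_sketch(&s);                 /* CAS #1 */
		void *list = get_freelist_sketch(&s);   /* CAS #2 */
		printf("frozen=%d, took freelist=%p\n",
		       (int)!!(s.counters & FROZEN), list);
		return 0;
	}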
On 2023/11/21 02:49, Mark Brown wrote:
> On Thu, Nov 02, 2023 at 03:23:27AM +0000, chengming.zhou@linux.dev wrote:
>> From: Chengming Zhou <zhouchengming@bytedance.com>
>>
>> Now we will freeze slabs when moving them out of node partial list to
>> cpu partial list, this method needs two cmpxchg_double operations:
>>
>> 1. freeze slab (acquire_slab()) under the node list_lock
>> 2. get_freelist() when pick used in ___slab_alloc()
>
> Recently -next has been failing to boot on a Raspberry Pi 3 with an arm
> multi_v7_defconfig and an NFS rootfs; a bisect appears to point to this
> patch (in -next as c8d312e039030edab25836a326bcaeb2a3d4db14) as having
> introduced the issue. I've included the full bisect log below.
>
> When we see problems we see RCU stalls while logging in, for example:
>
> debian-testing-armhf login: root (automatic login)
> Linux debian-testing-armhf 6.7.0-rc1-00006-gc8d312e03903 #1 SMP @1699864348 armv7l
> The programs included with the Debian GNU/Linux system are free software;
> the exact distribution terms for each program are described in the
> individual files in /usr/share/doc/*/copyright.
> Debian GNU/Linux comes with ABSOLUTELY NO WARRANTY, to the extent
> permitted by applicable law.
> [   46.453323] rcu: INFO: rcu_sched detected stalls on CPUs/tasks:
> [   46.459361] rcu: 3-...0: (1 GPs behind) idle=def4/1/0x40000000 softirq=1304/1304 fqs=951
> [   46.467669] rcu: (detected by 0, t=2103 jiffies, g=1161, q=499 ncpus=4)
> [   46.474472] Sending NMI from CPU 0 to CPUs 3:

IIUC, this should have printed the backtrace of CPU 3, right? It looks
like CPU 3 is the culprit, but we can't see what it's doing from the log.

Thanks!

> [   56.478894] rcu: rcu_sched kthread timer wakeup didn't happen for 1002 jiffies! g1161 f0x0 RCU_GP_WAIT_FQS(5) ->state=0x402
> [   56.490195] rcu: Possible timer handling issue on cpu=0 timer-softirq=1650
> [   56.497259] rcu: rcu_sched kthread starved for 1005 jiffies! g1161 f0x0 RCU_GP_WAIT_FQS(5) ->state=0x402 ->cpu=0
> [   56.507589] rcu: Unless rcu_sched kthread gets sufficient CPU time, OOM is now expected behavior.
> [   56.516681] rcu: RCU grace-period kthread stack dump:
> [   56.521803] task:rcu_sched state:I stack:0 pid:13 tgid:13 ppid:2 flags:0x00000000
> [   56.531267] __schedule from schedule+0x20/0xe8
> [   56.535883] schedule from schedule_timeout+0xa0/0x158
> [   56.541111] schedule_timeout from rcu_gp_fqs_loop+0x104/0x594
> [   56.547048] rcu_gp_fqs_loop from rcu_gp_kthread+0x14c/0x1c0
> [   56.552801] rcu_gp_kthread from kthread+0xe0/0xfc
> [   56.557674] kthread from ret_from_fork+0x14/0x28
> [   56.562457] Exception stack(0xf084dfb0 to 0xf084dff8)
> [   56.567584] dfa0: 00000000 00000000 00000000 00000000
> [   56.575886] dfc0: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
> [   56.584191] dfe0: 00000000 00000000 00000000 00000000 00000013 00000000
> [   56.590907] rcu: Stack dump where RCU GP kthread last ran:
> [   56.596474] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 6.7.0-rc1-00006-gc8d312e03903 #1
> [   56.604515] Hardware name: BCM2835
> [   56.607965] PC is at default_idle_call+0x1c/0xb0
> [   56.612654] LR is at ct_kernel_enter.constprop.0+0x48/0x11c
> [   56.618311] pc : [<c1197054>] lr : [<c1195c98>] psr: 60010013
> [   56.624672] sp : c1b01f70 ip : c1d5af7c fp : 00000000
> [   56.629974] r10: c19cda60 r9 : 00000000 r8 : 00000000
> [   56.635277] r7 : c1b04d50 r6 : c1b04d18 r5 : c1d5b684 r4 : c1b09740
> [   56.641902] r3 : 00000000 r2 : 00000000 r1 : 00000001 r0 : 002a3114
> [   56.648528] Flags: nZCv IRQs on FIQs on Mode SVC_32 ISA ARM Segment none
> [   56.655774] Control: 10c5383d Table: 0237406a DAC: 00000051
> [   56.661605] default_idle_call from do_idle+0x208/0x290
> [   56.666920] do_idle from cpu_startup_entry+0x28/0x2c
> [   56.672059] cpu_startup_entry from rest_init+0xac/0xb0
> [   56.677371] rest_init from arch_post_acpi_subsys_init+0x0/0x8
> Login ti
>
> A full log for that run can be seen at:
>
>     https://validation.linaro.org/scheduler/job/4017095
>
> Boots to initramfs with the same kernel image seem fine. Other systems,
> including other 32 bit arm ones, don't seem to be having similar issues
> with this userspace.
> I've not investigated beyond running the bisect, the log for which is below:
>
> git bisect start
> # good: [64e6d94bfb47ed0732ad06aedf8ec6af5dd2ab84] Merge branch 'for-linux-next-fixes' of git://anongit.freedesktop.org/drm/drm-misc
> git bisect good 64e6d94bfb47ed0732ad06aedf8ec6af5dd2ab84
> # bad: [5a82d69d48c82e89aef44483d2a129f869f3506a] Add linux-next specific files for 20231120
> git bisect bad 5a82d69d48c82e89aef44483d2a129f869f3506a
> # good: [ce252a92a867da8a6622463bff637e5f7b904a46] Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/wireless/wireless-next.git
> git bisect good ce252a92a867da8a6622463bff637e5f7b904a46
> # good: [c22e026efad504e3b056d4436920d173a09c580e] Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/broonie/regulator.git
> git bisect good c22e026efad504e3b056d4436920d173a09c580e
> # good: [b7fc58ffb105470cb339163cc2b04e3f59387a45] Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/fpga/linux-fpga.git
> git bisect good b7fc58ffb105470cb339163cc2b04e3f59387a45
> # good: [26f89f0614f03e4016578a992fc2e86b048a5cb4] Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/linusw/linux-pinctrl.git
> git bisect good 26f89f0614f03e4016578a992fc2e86b048a5cb4
> # good: [602bf18307981f3bfd9ebf19921791a4256d3fd1] Merge branch 'for-6.7' into for-next
> git bisect good 602bf18307981f3bfd9ebf19921791a4256d3fd1
> # good: [9f16a68069822b1df6bfb8a9ef7258a1e32b25e7] Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/livepatching/livepatching
> git bisect good 9f16a68069822b1df6bfb8a9ef7258a1e32b25e7
> # good: [3ff57db6f6569ebc2cc333437e7e949749e59424] Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/krisman/unicode.git
> git bisect good 3ff57db6f6569ebc2cc333437e7e949749e59424
> # bad: [dd374e220ba492f95344a638b1efe5b2744fdd73] slub: Update frozen slabs documentations in the source
> git bisect bad dd374e220ba492f95344a638b1efe5b2744fdd73
> # good: [a3058965bb35490454953aa2c87ea51004839f2f] slub: Prepare __slab_free() for unfrozen partial slab out of node partial list
> git bisect good a3058965bb35490454953aa2c87ea51004839f2f
> # bad: [c8d312e039030edab25836a326bcaeb2a3d4db14] slub: Delay freezing of partial slabs
> git bisect bad c8d312e039030edab25836a326bcaeb2a3d4db14
> # good: [00b15a19ee543f0117cb217fcbab8b7b3fd50677] slub: Introduce freeze_slab()
> git bisect good 00b15a19ee543f0117cb217fcbab8b7b3fd50677
> # first bad commit: [c8d312e039030edab25836a326bcaeb2a3d4db14] slub: Delay freezing of partial slabs
On Tue, Nov 21, 2023 at 08:58:40AM +0800, Chengming Zhou wrote:
> On 2023/11/21 02:49, Mark Brown wrote:
>> On Thu, Nov 02, 2023 at 03:23:27AM +0000, chengming.zhou@linux.dev wrote:

>> When we see problems we see RCU stalls while logging in, for example:

>> [   46.453323] rcu: INFO: rcu_sched detected stalls on CPUs/tasks:
>> [   46.459361] rcu: 3-...0: (1 GPs behind) idle=def4/1/0x40000000 softirq=1304/1304 fqs=951
>> [   46.467669] rcu: (detected by 0, t=2103 jiffies, g=1161, q=499 ncpus=4)
>> [   46.474472] Sending NMI from CPU 0 to CPUs 3:

> IIUC, this should have printed the backtrace of CPU 3, right? It looks
> like CPU 3 is the culprit, but we can't see what it's doing from the log.

AIUI yes, but it looks like we've just completely lost the CPU - there
are more attempts to talk to it visible in the log:

>> A full log for that run can be seen at:
>>
>>     https://validation.linaro.org/scheduler/job/4017095

but none of them appear to cause CPU 3 to respond. Note that 32 bit ARM
is just using a regular IPI rather than something that's actually an NMI,
so this isn't hugely out of the ordinary; I'd guess it's stuck with
interrupts masked.
On 2023/11/21 09:29, Mark Brown wrote:
> On Tue, Nov 21, 2023 at 08:58:40AM +0800, Chengming Zhou wrote:
>> On 2023/11/21 02:49, Mark Brown wrote:
>>> On Thu, Nov 02, 2023 at 03:23:27AM +0000, chengming.zhou@linux.dev wrote:
>
>>> When we see problems we see RCU stalls while logging in, for example:
>
>>> [   46.453323] rcu: INFO: rcu_sched detected stalls on CPUs/tasks:
>>> [   46.459361] rcu: 3-...0: (1 GPs behind) idle=def4/1/0x40000000 softirq=1304/1304 fqs=951
>>> [   46.467669] rcu: (detected by 0, t=2103 jiffies, g=1161, q=499 ncpus=4)
>>> [   46.474472] Sending NMI from CPU 0 to CPUs 3:
>
>> IIUC, this should have printed the backtrace of CPU 3, right? It looks
>> like CPU 3 is the culprit, but we can't see what it's doing from the log.
>
> AIUI yes, but it looks like we've just completely lost the CPU - there
> are more attempts to talk to it visible in the log:
>
>>> A full log for that run can be seen at:
>>>
>>>     https://validation.linaro.org/scheduler/job/4017095
>
> but none of them appear to cause CPU 3 to respond. Note that 32 bit ARM
> is just using a regular IPI rather than something that's actually an NMI,
> so this isn't hugely out of the ordinary; I'd guess it's stuck with
> interrupts masked.

Ah yes, there is no NMI on ARM, so CPU 3 may be running somewhere with
interrupts disabled. I searched the full log but still don't have a clue,
and there is no WARNING or BUG related to SLUB in the log.

I wonder how to reproduce it locally with a QEMU VM, since I don't have
an ARM machine.

Thanks!
On Tue, Nov 21, 2023 at 11:47:26PM +0800, Chengming Zhou wrote:

> Ah yes, there is no NMI on ARM, so CPU 3 may be running somewhere with
> interrupts disabled. I searched the full log but still don't have a clue,
> and there is no WARNING or BUG related to SLUB in the log.

Yeah, nor anything else particularly. I tried turning on some debug
options:

CONFIG_SOFTLOCKUP_DETECTOR=y
CONFIG_DETECT_HUNG_TASK=y
CONFIG_WQ_WATCHDOG=y
CONFIG_DEBUG_PREEMPT=y
CONFIG_DEBUG_LOCKING=y
CONFIG_DEBUG_ATOMIC_SLEEP=y

    https://validation.linaro.org/scheduler/job/4017828

which has some additional warnings related to clock changes, but AFAICT
those come from today's -next rather than the debug stuff:

    https://validation.linaro.org/scheduler/job/4017823

so that's not super helpful.

> I wonder how to reproduce it locally with a QEMU VM, since I don't have
> an ARM machine.

There are sample QEMU jobs available, for example from KernelCI:

    https://storage.kernelci.org/next/master/next-20231120/arm/multi_v7_defconfig/gcc-10/lab-baylibre/baseline-qemu_arm-virt-gicv3.html

(includes the command line, though it's not using Debian testing like my
test was). Note that I'm testing a bunch of platforms with the same
kernel/rootfs combination and it was only the Raspberry Pi 3 which blew
up. It is a bit tight for memory, which might have some influence?

I'm really suspecting this may have made some underlying platform bug
more obvious :/
On 11/21/23 19:21, Mark Brown wrote:
> On Tue, Nov 21, 2023 at 11:47:26PM +0800, Chengming Zhou wrote:
>
>> Ah yes, there is no NMI on ARM, so CPU 3 may be running somewhere with
>> interrupts disabled. I searched the full log but still don't have a clue,
>> and there is no WARNING or BUG related to SLUB in the log.
>
> Yeah, nor anything else particularly. I tried turning on some debug
> options:
>
> CONFIG_SOFTLOCKUP_DETECTOR=y
> CONFIG_DETECT_HUNG_TASK=y
> CONFIG_WQ_WATCHDOG=y
> CONFIG_DEBUG_PREEMPT=y
> CONFIG_DEBUG_LOCKING=y
> CONFIG_DEBUG_ATOMIC_SLEEP=y
>
>     https://validation.linaro.org/scheduler/job/4017828
>
> which has some additional warnings related to clock changes, but AFAICT
> those come from today's -next rather than the debug stuff:
>
>     https://validation.linaro.org/scheduler/job/4017823
>
> so that's not super helpful.

For the record (and to help focus the debugging): we discussed on IRC
that the problem persists with CONFIG_SLUB_CPU_PARTIAL=n:

    https://validation.linaro.org/scheduler/job/4017863

Which limits the scope of where to look, so that's good :)

>> I wonder how to reproduce it locally with a QEMU VM, since I don't have
>> an ARM machine.
>
> There are sample QEMU jobs available, for example from KernelCI:
>
>     https://storage.kernelci.org/next/master/next-20231120/arm/multi_v7_defconfig/gcc-10/lab-baylibre/baseline-qemu_arm-virt-gicv3.html
>
> (includes the command line, though it's not using Debian testing like my
> test was). Note that I'm testing a bunch of platforms with the same
> kernel/rootfs combination and it was only the Raspberry Pi 3 which blew
> up. It is a bit tight for memory, which might have some influence?
>
> I'm really suspecting this may have made some underlying platform bug
> more obvious :/
On 11/20/23 19:49, Mark Brown wrote:
> On Thu, Nov 02, 2023 at 03:23:27AM +0000, chengming.zhou@linux.dev wrote:
>> From: Chengming Zhou <zhouchengming@bytedance.com>
>>
>> Now we will freeze slabs when moving them out of node partial list to
>> cpu partial list, this method needs two cmpxchg_double operations:
>>
>> 1. freeze slab (acquire_slab()) under the node list_lock
>> 2. get_freelist() when pick used in ___slab_alloc()
>
> Recently -next has been failing to boot on a Raspberry Pi 3 with an arm
> multi_v7_defconfig and an NFS rootfs; a bisect appears to point to this
> patch (in -next as c8d312e039030edab25836a326bcaeb2a3d4db14) as having
> introduced the issue. I've included the full bisect log below.
>
> When we see problems we see RCU stalls while logging in, for example:

Can you try this, please?

----8<----
From 000030c1ff055ef6a2ca624d0142f08f3ef19d51 Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Wed, 22 Nov 2023 10:32:41 +0100
Subject: [PATCH] mm/slub: try to fix hangs without cmpxchg64/128

If we don't have cmpxchg64/128 and resort to slab_lock()/slab_unlock(),
which uses PG_locked, we can get RMW races with the newly introduced
slab_set/clear_node_partial() operations that modify PG_workingset, so
all the operations have to be atomic now.

Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
 mm/slub.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/mm/slub.c b/mm/slub.c
index bcb5b2c4e213..f2cdb81ab02e 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -522,7 +522,7 @@ static __always_inline void slab_unlock(struct slab *slab)
 	struct page *page = slab_page(slab);
 
 	VM_BUG_ON_PAGE(PageTail(page), page);
-	__bit_spin_unlock(PG_locked, &page->flags);
+	bit_spin_unlock(PG_locked, &page->flags);
 }
 
 static inline bool
@@ -2127,12 +2127,12 @@ static inline bool slab_test_node_partial(const struct slab *slab)
 
 static inline void slab_set_node_partial(struct slab *slab)
 {
-	__set_bit(PG_workingset, folio_flags(slab_folio(slab), 0));
+	set_bit(PG_workingset, folio_flags(slab_folio(slab), 0));
 }
 
 static inline void slab_clear_node_partial(struct slab *slab)
 {
-	__clear_bit(PG_workingset, folio_flags(slab_folio(slab), 0));
+	clear_bit(PG_workingset, folio_flags(slab_folio(slab), 0));
 }
 
 /*
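[To make the race concrete, here is a minimal user-space sketch of the
failure mode the patch describes: one thread takes a bit spinlock with
atomic RMWs on bit 0 of a flags word while another does __set_bit()-style
plain RMWs on bit 1 of the same word. None of the names are the kernel's
and the interleaving is timing-dependent; when it fires, a stale store
resurrects a released lock bit and the locker spins forever on a lock
nobody holds - matching a CPU stuck with no backtrace. Build with:
gcc -O2 -pthread race.c]

	#include <pthread.h>
	#include <stdio.h>

	#define LOCK_BIT	(1UL << 0)	/* stand-in for PG_locked     */
	#define PARTIAL_BIT	(1UL << 1)	/* stand-in for PG_workingset */

	static volatile unsigned long flags;	/* shared "page->flags" word */

	static void *locker(void *arg)
	{
		(void)arg;
		for (long i = 0; i < 10000000; i++) {
			/* bit_spin_lock(): atomic test-and-set on LOCK_BIT */
			while (__atomic_fetch_or(&flags, LOCK_BIT,
						 __ATOMIC_ACQUIRE) & LOCK_BIT)
				;
			/* bit_spin_unlock(): atomic clear */
			__atomic_fetch_and(&flags, ~LOCK_BIT, __ATOMIC_RELEASE);
		}
		return NULL;
	}

	static void *flipper(void *arg)
	{
		(void)arg;
		for (long i = 0; i < 10000000; i++) {
			/*
			 * __set_bit()/__clear_bit(): plain read-modify-write.
			 * The store can write back a stale LOCK_BIT value
			 * sampled before the locker's last transition,
			 * corrupting the lock word.
			 */
			unsigned long v = flags;
			flags = v | PARTIAL_BIT;
			v = flags;
			flags = v & ~PARTIAL_BIT;
		}
		return NULL;
	}

	int main(void)
	{
		pthread_t a, b;

		pthread_create(&a, NULL, locker, NULL);
		pthread_create(&b, NULL, flipper, NULL);
		pthread_join(b, NULL);
		pthread_join(a, NULL);	/* never returns if the race fired */
		puts("survived this run; the race is timing-dependent");
		return 0;
	}

[Making both flag updates atomic, as the patch does, removes the stale
write-back and hence the hang.]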
On Wed, Nov 22, 2023 at 10:37:39AM +0100, Vlastimil Babka wrote:

> Can you try this, please?

> Subject: [PATCH] mm/slub: try to fix hangs without cmpxchg64/128
>
> If we don't have cmpxchg64/128 and resort to slab_lock()/slab_unlock(),
> which uses PG_locked, we can get RMW races with the newly introduced
> slab_set/clear_node_partial() operations that modify PG_workingset, so
> all the operations have to be atomic now.

That seems to resolve the issue:

    https://validation.linaro.org/scheduler/job/4018096

Tested-by: Mark Brown <broonie@kernel.org>

Thanks!
On 2023/11/22 17:37, Vlastimil Babka wrote:
> On 11/20/23 19:49, Mark Brown wrote:
>> On Thu, Nov 02, 2023 at 03:23:27AM +0000, chengming.zhou@linux.dev wrote:
>>> From: Chengming Zhou <zhouchengming@bytedance.com>
>>>
>>> Now we will freeze slabs when moving them out of node partial list to
>>> cpu partial list, this method needs two cmpxchg_double operations:
>>>
>>> 1. freeze slab (acquire_slab()) under the node list_lock
>>> 2. get_freelist() when pick used in ___slab_alloc()
>>
>> Recently -next has been failing to boot on a Raspberry Pi 3 with an arm
>> multi_v7_defconfig and an NFS rootfs; a bisect appears to point to this
>> patch (in -next as c8d312e039030edab25836a326bcaeb2a3d4db14) as having
>> introduced the issue. I've included the full bisect log below.
>>
>> When we see problems we see RCU stalls while logging in, for example:
>
> Can you try this, please?

Great! I manually disabled __CMPXCHG_DOUBLE to reproduce the problem,
and this patch solves the machine hang.

BTW, I also ran the performance test case on a machine with 128 CPUs:

	stress-ng --rawpkt 128 --rawpkt-ops 100000000

	base	patched
	2.22s	2.35s
	2.21s	3.14s
	2.19s	4.75s

The performance numbers of this atomic version are not stable.

Should I change back to reusing the slab->__unused (mapcount) field?
Or should we check "s->flags & __CMPXCHG_DOUBLE" in
slab_set/clear_node_partial() to avoid using the atomic version?

Thanks!

> ----8<----
> From 000030c1ff055ef6a2ca624d0142f08f3ef19d51 Mon Sep 17 00:00:00 2001
> From: Vlastimil Babka <vbabka@suse.cz>
> Date: Wed, 22 Nov 2023 10:32:41 +0100
> Subject: [PATCH] mm/slub: try to fix hangs without cmpxchg64/128
>
> If we don't have cmpxchg64/128 and resort to slab_lock()/slab_unlock(),
> which uses PG_locked, we can get RMW races with the newly introduced
> slab_set/clear_node_partial() operations that modify PG_workingset, so
> all the operations have to be atomic now.
>
> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> ---
>  mm/slub.c | 6 +++---
>  1 file changed, 3 insertions(+), 3 deletions(-)
>
> diff --git a/mm/slub.c b/mm/slub.c
> index bcb5b2c4e213..f2cdb81ab02e 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -522,7 +522,7 @@ static __always_inline void slab_unlock(struct slab *slab)
>  	struct page *page = slab_page(slab);
>  
>  	VM_BUG_ON_PAGE(PageTail(page), page);
> -	__bit_spin_unlock(PG_locked, &page->flags);
> +	bit_spin_unlock(PG_locked, &page->flags);
>  }
>  
>  static inline bool
> @@ -2127,12 +2127,12 @@ static inline bool slab_test_node_partial(const struct slab *slab)
>  
>  static inline void slab_set_node_partial(struct slab *slab)
>  {
> -	__set_bit(PG_workingset, folio_flags(slab_folio(slab), 0));
> +	set_bit(PG_workingset, folio_flags(slab_folio(slab), 0));
>  }
>  
>  static inline void slab_clear_node_partial(struct slab *slab)
>  {
> -	__clear_bit(PG_workingset, folio_flags(slab_folio(slab), 0));
> +	clear_bit(PG_workingset, folio_flags(slab_folio(slab), 0));
>  }
>  
>  /*
On 11/22/23 12:35, Chengming Zhou wrote:
> On 2023/11/22 17:37, Vlastimil Babka wrote:
>> On 11/20/23 19:49, Mark Brown wrote:
>>> On Thu, Nov 02, 2023 at 03:23:27AM +0000, chengming.zhou@linux.dev wrote:
>>>> From: Chengming Zhou <zhouchengming@bytedance.com>
>>>>
>>>> Now we will freeze slabs when moving them out of node partial list to
>>>> cpu partial list, this method needs two cmpxchg_double operations:
>>>>
>>>> 1. freeze slab (acquire_slab()) under the node list_lock
>>>> 2. get_freelist() when pick used in ___slab_alloc()
>>>
>>> Recently -next has been failing to boot on a Raspberry Pi 3 with an arm
>>> multi_v7_defconfig and a NFS rootfs, a bisect appears to point to this
>>> patch (in -next as c8d312e039030edab25836a326bcaeb2a3d4db14) as having
>>> introduced the issue. I've included the full bisect log below.
>>>
>>> When we see problems we see RCU stalls while logging in, for example:
>>
>> Can you try this, please?
>>
>
> Great! I manually disabled __CMPXCHG_DOUBLE to reproduce the problem,
> and this patch solves the machine hang problem.
>
> BTW, I also ran a performance test case on a machine with 128 CPUs:
>
>   stress-ng --rawpkt 128 --rawpkt-ops 100000000
>
>   base     patched
>   2.22s    2.35s
>   2.21s    3.14s
>   2.19s    4.75s
>
> I found the performance numbers of this atomic version are not stable.

That's weirdly bad. Is that measured also with __CMPXCHG_DOUBLE
disabled, or with just the patch? The PG_workingset flag change should be
uncontended as we are doing it under list_lock, and with __CMPXCHG_DOUBLE
there should be no interference from PG_locked.
On 2023/11/22 19:40, Vlastimil Babka wrote:
> On 11/22/23 12:35, Chengming Zhou wrote:
>> On 2023/11/22 17:37, Vlastimil Babka wrote:
>>> On 11/20/23 19:49, Mark Brown wrote:
>>>> On Thu, Nov 02, 2023 at 03:23:27AM +0000, chengming.zhou@linux.dev wrote:
>>>>> From: Chengming Zhou <zhouchengming@bytedance.com>
>>>>>
>>>>> Now we will freeze slabs when moving them out of node partial list to
>>>>> cpu partial list, this method needs two cmpxchg_double operations:
>>>>>
>>>>> 1. freeze slab (acquire_slab()) under the node list_lock
>>>>> 2. get_freelist() when pick used in ___slab_alloc()
>>>>
>>>> Recently -next has been failing to boot on a Raspberry Pi 3 with an arm
>>>> multi_v7_defconfig and a NFS rootfs, a bisect appears to point to this
>>>> patch (in -next as c8d312e039030edab25836a326bcaeb2a3d4db14) as having
>>>> introduced the issue. I've included the full bisect log below.
>>>>
>>>> When we see problems we see RCU stalls while logging in, for example:
>>>
>>> Can you try this, please?
>>>
>>
>> Great! I manually disabled __CMPXCHG_DOUBLE to reproduce the problem,
>> and this patch solves the machine hang problem.
>>
>> BTW, I also ran a performance test case on a machine with 128 CPUs:
>>
>>   stress-ng --rawpkt 128 --rawpkt-ops 100000000
>>
>>   base     patched
>>   2.22s    2.35s
>>   2.21s    3.14s
>>   2.19s    4.75s
>>
>> I found the performance numbers of this atomic version are not stable.
>
> That's weirdly bad. Is that measured also with __CMPXCHG_DOUBLE
> disabled, or with just the patch? The PG_workingset flag change should be

The performance test was with just the patch.

> uncontended as we are doing it under list_lock, and with __CMPXCHG_DOUBLE
> there should be no interference from PG_locked.
>

Yes, I don't know. Maybe it's related to my kernel config, making the
atomic operations more expensive? I will look again...

I also tested the atomic-optional version below and found the
performance numbers are much more stable.

diff --git a/mm/slub.c b/mm/slub.c
index a307d319e82c..e11d34d51a14 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -531,7 +531,7 @@ static __always_inline void slab_unlock(struct slab *slab)
 	struct page *page = slab_page(slab);
 
 	VM_BUG_ON_PAGE(PageTail(page), page);
-	__bit_spin_unlock(PG_locked, &page->flags);
+	bit_spin_unlock(PG_locked, &page->flags);
 }
 
 static inline bool
@@ -2136,12 +2136,18 @@ static inline bool slab_test_node_partial(const struct slab *slab)
 
 static inline void slab_set_node_partial(struct slab *slab)
 {
-	__set_bit(PG_workingset, folio_flags(slab_folio(slab), 0));
+	if (slab->slab_cache->flags & __CMPXCHG_DOUBLE)
+		__set_bit(PG_workingset, folio_flags(slab_folio(slab), 0));
+	else
+		set_bit(PG_workingset, folio_flags(slab_folio(slab), 0));
 }
 
 static inline void slab_clear_node_partial(struct slab *slab)
 {
-	__clear_bit(PG_workingset, folio_flags(slab_folio(slab), 0));
+	if (slab->slab_cache->flags & __CMPXCHG_DOUBLE)
+		__clear_bit(PG_workingset, folio_flags(slab_folio(slab), 0));
+	else
+		clear_bit(PG_workingset, folio_flags(slab_folio(slab), 0));
 }
On 11/22/23 12:54, Chengming Zhou wrote:
> On 2023/11/22 19:40, Vlastimil Babka wrote:
>> On 11/22/23 12:35, Chengming Zhou wrote:
>>> On 2023/11/22 17:37, Vlastimil Babka wrote:
>>>> On 11/20/23 19:49, Mark Brown wrote:
>>>>> On Thu, Nov 02, 2023 at 03:23:27AM +0000, chengming.zhou@linux.dev wrote:
>>>>>> From: Chengming Zhou <zhouchengming@bytedance.com>
>>>>>>
>>>>>> Now we will freeze slabs when moving them out of node partial list to
>>>>>> cpu partial list, this method needs two cmpxchg_double operations:
>>>>>>
>>>>>> 1. freeze slab (acquire_slab()) under the node list_lock
>>>>>> 2. get_freelist() when pick used in ___slab_alloc()
>>>>>
>>>>> Recently -next has been failing to boot on a Raspberry Pi 3 with an arm
>>>>> multi_v7_defconfig and a NFS rootfs, a bisect appears to point to this
>>>>> patch (in -next as c8d312e039030edab25836a326bcaeb2a3d4db14) as having
>>>>> introduced the issue. I've included the full bisect log below.
>>>>>
>>>>> When we see problems we see RCU stalls while logging in, for example:
>>>>
>>>> Can you try this, please?
>>>>
>>>
>>> Great! I manually disabled __CMPXCHG_DOUBLE to reproduce the problem,
>>> and this patch solves the machine hang problem.
>>>
>>> BTW, I also ran a performance test case on a machine with 128 CPUs:
>>>
>>>   stress-ng --rawpkt 128 --rawpkt-ops 100000000
>>>
>>>   base     patched
>>>   2.22s    2.35s
>>>   2.21s    3.14s
>>>   2.19s    4.75s
>>>
>>> I found the performance numbers of this atomic version are not stable.
>>
>> That's weirdly bad. Is that measured also with __CMPXCHG_DOUBLE
>> disabled, or with just the patch? The PG_workingset flag change should be
>
> The performance test was with just the patch.
>
>> uncontended as we are doing it under list_lock, and with __CMPXCHG_DOUBLE
>> there should be no interference from PG_locked.
>>
>
> Yes, I don't know. Maybe it's related to my kernel config, making the
> atomic operations more expensive? I will look again...

I doubt that can explain going from 2.19s to 4.75s; there must have been
some interference on the machine?

> I also tested the atomic-optional version below and found the
> performance numbers are much more stable.

This gets rather ugly and fragile, so I'd maybe rather go back to the
__unused field approach :/

> diff --git a/mm/slub.c b/mm/slub.c
> index a307d319e82c..e11d34d51a14 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -531,7 +531,7 @@ static __always_inline void slab_unlock(struct slab *slab)
>  	struct page *page = slab_page(slab);
>
>  	VM_BUG_ON_PAGE(PageTail(page), page);
> -	__bit_spin_unlock(PG_locked, &page->flags);
> +	bit_spin_unlock(PG_locked, &page->flags);
>  }
>
>  static inline bool
> @@ -2136,12 +2136,18 @@ static inline bool slab_test_node_partial(const struct slab *slab)
>
>  static inline void slab_set_node_partial(struct slab *slab)
>  {
> -	__set_bit(PG_workingset, folio_flags(slab_folio(slab), 0));
> +	if (slab->slab_cache->flags & __CMPXCHG_DOUBLE)
> +		__set_bit(PG_workingset, folio_flags(slab_folio(slab), 0));
> +	else
> +		set_bit(PG_workingset, folio_flags(slab_folio(slab), 0));
>  }
>
>  static inline void slab_clear_node_partial(struct slab *slab)
>  {
> -	__clear_bit(PG_workingset, folio_flags(slab_folio(slab), 0));
> +	if (slab->slab_cache->flags & __CMPXCHG_DOUBLE)
> +		__clear_bit(PG_workingset, folio_flags(slab_folio(slab), 0));
> +	else
> +		clear_bit(PG_workingset, folio_flags(slab_folio(slab), 0));
>  }
On 2023/11/22 21:19, Vlastimil Babka wrote:
> On 11/22/23 12:54, Chengming Zhou wrote:
>> On 2023/11/22 19:40, Vlastimil Babka wrote:
>>> On 11/22/23 12:35, Chengming Zhou wrote:
>>>> On 2023/11/22 17:37, Vlastimil Babka wrote:
>>>>> On 11/20/23 19:49, Mark Brown wrote:
>>>>>> On Thu, Nov 02, 2023 at 03:23:27AM +0000, chengming.zhou@linux.dev wrote:
>>>>>>> From: Chengming Zhou <zhouchengming@bytedance.com>
>>>>>>>
>>>>>>> Now we will freeze slabs when moving them out of node partial list to
>>>>>>> cpu partial list, this method needs two cmpxchg_double operations:
>>>>>>>
>>>>>>> 1. freeze slab (acquire_slab()) under the node list_lock
>>>>>>> 2. get_freelist() when pick used in ___slab_alloc()
>>>>>>
>>>>>> Recently -next has been failing to boot on a Raspberry Pi 3 with an arm
>>>>>> multi_v7_defconfig and a NFS rootfs, a bisect appears to point to this
>>>>>> patch (in -next as c8d312e039030edab25836a326bcaeb2a3d4db14) as having
>>>>>> introduced the issue. I've included the full bisect log below.
>>>>>>
>>>>>> When we see problems we see RCU stalls while logging in, for example:
>>>>>
>>>>> Can you try this, please?
>>>>>
>>>>
>>>> Great! I manually disabled __CMPXCHG_DOUBLE to reproduce the problem,
>>>> and this patch solves the machine hang problem.
>>>>
>>>> BTW, I also ran a performance test case on a machine with 128 CPUs:
>>>>
>>>>   stress-ng --rawpkt 128 --rawpkt-ops 100000000
>>>>
>>>>   base     patched
>>>>   2.22s    2.35s
>>>>   2.21s    3.14s
>>>>   2.19s    4.75s
>>>>
>>>> I found the performance numbers of this atomic version are not stable.
>>>
>>> That's weirdly bad. Is that measured also with __CMPXCHG_DOUBLE
>>> disabled, or with just the patch? The PG_workingset flag change should be
>>
>> The performance test was with just the patch.
>>
>>> uncontended as we are doing it under list_lock, and with __CMPXCHG_DOUBLE
>>> there should be no interference from PG_locked.
>>>
>>
>> Yes, I don't know. Maybe it's related to my kernel config, making the
>> atomic operations more expensive? I will look again...
>
> I doubt that can explain going from 2.19s to 4.75s; there must have been
> some interference on the machine?
>

Yes, it looks that way. There are some background services on the 128-CPU
machine. Although "stress-ng --rawpkt 128 --rawpkt-ops 100000000" showed
so much regression, I tried other less contended test cases:

1. stress-ng --rawpkt 64 --rawpkt-ops 100000000
2. perf bench sched messaging -g 5 -t -l 100000

The performance numbers of this atomic version are pretty much the same.

So this atomic version should be good in most cases IMHO.

>> I also tested the atomic-optional version below and found the
>> performance numbers are much more stable.
>
> This gets rather ugly and fragile, so I'd maybe rather go back to the
> __unused field approach :/
>

Agree. If we don't want this atomic version, the __unused field approach
seems better.

Thanks!
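For completeness, the slab->__unused alternative both participants mention
could look roughly like this. This is a hypothetical sketch for
illustration only, not the change that was merged (as the next message
shows, the fully atomic page-flag version is what got folded in); the
helper bodies are assumptions built on the thread's mention of the
slab->__unused (mapcount) word:

/*
 * Hypothetical: track "slab is on a node partial list" in the otherwise
 * unused slab->__unused (_mapcount) word instead of PG_workingset, so it
 * never shares a read-modify-write with PG_locked in page->flags.
 */
static inline bool slab_test_node_partial(const struct slab *slab)
{
	return READ_ONCE(slab->__unused);
}

static inline void slab_set_node_partial(struct slab *slab)
{
	WRITE_ONCE(slab->__unused, 1);
}

static inline void slab_clear_node_partial(struct slab *slab)
{
	WRITE_ONCE(slab->__unused, 0);
}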
On 11/22/23 15:28, Chengming Zhou wrote:
>
> Yes, it looks that way. There are some background services on the 128-CPU
> machine. Although "stress-ng --rawpkt 128 --rawpkt-ops 100000000" showed
> so much regression, I tried other less contended test cases:
>
> 1. stress-ng --rawpkt 64 --rawpkt-ops 100000000
> 2. perf bench sched messaging -g 5 -t -l 100000
>
> The performance numbers of this atomic version are pretty much the same.
>
> So this atomic version should be good in most cases IMHO.

OK, I will fold in the fix using the full atomic version.

>>> I also tested the atomic-optional version below and found the
>>> performance numbers are much more stable.
>>
>> This gets rather ugly and fragile, so I'd maybe rather go back to the
>> __unused field approach :/
>>
>
> Agree. If we don't want this atomic version, the __unused field approach
> seems better.
>
> Thanks!
>
On Thu, Nov 2, 2023 at 12:25 PM <chengming.zhou@linux.dev> wrote: > > From: Chengming Zhou <zhouchengming@bytedance.com> > > Now we will freeze slabs when moving them out of node partial list to > cpu partial list, this method needs two cmpxchg_double operations: > > 1. freeze slab (acquire_slab()) under the node list_lock > 2. get_freelist() when pick used in ___slab_alloc() > > Actually we don't need to freeze when moving slabs out of node partial > list, we can delay freezing to when use slab freelist in ___slab_alloc(), > so we can save one cmpxchg_double(). > > And there are other good points: > - The moving of slabs between node partial list and cpu partial list > becomes simpler, since we don't need to freeze or unfreeze at all. > > - The node list_lock contention would be less, since we don't need to > freeze any slab under the node list_lock. > > We can achieve this because there is no concurrent path would manipulate > the partial slab list except the __slab_free() path, which is now > serialized by slab_test_node_partial() under the list_lock. > > Since the slab returned by get_partial() interfaces is not frozen anymore > and no freelist is returned in the partial_context, so we need to use the > introduced freeze_slab() to freeze it and get its freelist. > > Similarly, the slabs on the CPU partial list are not frozen anymore, > we need to freeze_slab() on it before use. > > We can now delete acquire_slab() as it became unused. > > Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com> > Reviewed-by: Vlastimil Babka <vbabka@suse.cz> > Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> > --- > mm/slub.c | 113 +++++++++++------------------------------------------- > 1 file changed, 23 insertions(+), 90 deletions(-) > > diff --git a/mm/slub.c b/mm/slub.c > index edf567971679..bcb5b2c4e213 100644 > --- a/mm/slub.c > +++ b/mm/slub.c > @@ -2234,51 +2234,6 @@ static void *alloc_single_from_new_slab(struct kmem_cache *s, > return object; > } > > -/* > - * Remove slab from the partial list, freeze it and > - * return the pointer to the freelist. > - * > - * Returns a list of objects or NULL if it fails. > - */ > -static inline void *acquire_slab(struct kmem_cache *s, > - struct kmem_cache_node *n, struct slab *slab, > - int mode) Nit: alloc_single_from_partial()'s comment still refers to acquire_slab(). > -{ > - void *freelist; > - unsigned long counters; > - struct slab new; > - > - lockdep_assert_held(&n->list_lock); > - > - /* > - * Zap the freelist and set the frozen bit. > - * The old freelist is the list of objects for the > - * per cpu allocation list. 
> - */ > - freelist = slab->freelist; > - counters = slab->counters; > - new.counters = counters; > - if (mode) { > - new.inuse = slab->objects; > - new.freelist = NULL; > - } else { > - new.freelist = freelist; > - } > - > - VM_BUG_ON(new.frozen); > - new.frozen = 1; > - > - if (!__slab_update_freelist(s, slab, > - freelist, counters, > - new.freelist, new.counters, > - "acquire_slab")) > - return NULL; > - > - remove_partial(n, slab); > - WARN_ON(!freelist); > - return freelist; > -} > - > #ifdef CONFIG_SLUB_CPU_PARTIAL > static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain); > #else > @@ -2295,7 +2250,6 @@ static struct slab *get_partial_node(struct kmem_cache *s, > struct partial_context *pc) > { > struct slab *slab, *slab2, *partial = NULL; > - void *object = NULL; > unsigned long flags; > unsigned int partial_slabs = 0; > > @@ -2314,7 +2268,7 @@ static struct slab *get_partial_node(struct kmem_cache *s, > continue; > > if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) { > - object = alloc_single_from_partial(s, n, slab, > + void *object = alloc_single_from_partial(s, n, slab, > pc->orig_size); > if (object) { > partial = slab; > @@ -2324,13 +2278,10 @@ static struct slab *get_partial_node(struct kmem_cache *s, > continue; > } > > - object = acquire_slab(s, n, slab, object == NULL); > - if (!object) > - break; > + remove_partial(n, slab); > > if (!partial) { > partial = slab; > - pc->object = object; > stat(s, ALLOC_FROM_PARTIAL); > } else { > put_cpu_partial(s, slab, 0); > @@ -2629,9 +2580,6 @@ static void __unfreeze_partials(struct kmem_cache *s, struct slab *partial_slab) > unsigned long flags = 0; > > while (partial_slab) { > - struct slab new; > - struct slab old; > - > slab = partial_slab; > partial_slab = slab->next; > > @@ -2644,23 +2592,7 @@ static void __unfreeze_partials(struct kmem_cache *s, struct slab *partial_slab) > spin_lock_irqsave(&n->list_lock, flags); > } > > - do { > - > - old.freelist = slab->freelist; > - old.counters = slab->counters; > - VM_BUG_ON(!old.frozen); > - > - new.counters = old.counters; > - new.freelist = old.freelist; > - > - new.frozen = 0; > - > - } while (!__slab_update_freelist(s, slab, > - old.freelist, old.counters, > - new.freelist, new.counters, > - "unfreezing slab")); > - > - if (unlikely(!new.inuse && n->nr_partial >= s->min_partial)) { > + if (unlikely(!slab->inuse && n->nr_partial >= s->min_partial)) { > slab->next = slab_to_discard; > slab_to_discard = slab; > } else { > @@ -3167,7 +3099,6 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, > node = NUMA_NO_NODE; > goto new_slab; > } > -redo: > > if (unlikely(!node_match(slab, node))) { > /* > @@ -3243,7 +3174,8 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, > > new_slab: > > - if (slub_percpu_partial(c)) { > +#ifdef CONFIG_SLUB_CPU_PARTIAL > + while (slub_percpu_partial(c)) { > local_lock_irqsave(&s->cpu_slab->lock, flags); > if (unlikely(c->slab)) { > local_unlock_irqrestore(&s->cpu_slab->lock, flags); > @@ -3255,12 +3187,22 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, > goto new_objects; > } > > - slab = c->slab = slub_percpu_partial(c); > + slab = slub_percpu_partial(c); > slub_set_percpu_partial(c, slab); > local_unlock_irqrestore(&s->cpu_slab->lock, flags); > stat(s, CPU_PARTIAL_ALLOC); > - goto redo; > + > + if (unlikely(!node_match(slab, node) || > + !pfmemalloc_match(slab, gfpflags))) { > + slab->next = NULL; > + __unfreeze_partials(s, slab); > + 
continue; > + } > + > + freelist = freeze_slab(s, slab); > + goto retry_load_slab; > } > +#endif > > new_objects: > > @@ -3268,8 +3210,8 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, > pc.orig_size = orig_size; > slab = get_partial(s, node, &pc); > if (slab) { > - freelist = pc.object; > if (kmem_cache_debug(s)) { > + freelist = pc.object; > /* > * For debug caches here we had to go through > * alloc_single_from_partial() so just store the > @@ -3281,6 +3223,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, > return freelist; > } > > + freelist = freeze_slab(s, slab); > goto retry_load_slab; > } > > @@ -3682,18 +3625,8 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, > was_frozen = new.frozen; > new.inuse -= cnt; > if ((!new.inuse || !prior) && !was_frozen) { > - > - if (kmem_cache_has_cpu_partial(s) && !prior) { > - > - /* > - * Slab was on no list before and will be > - * partially empty > - * We can defer the list move and instead > - * freeze it. > - */ > - new.frozen = 1; > - > - } else { /* Needs to be taken off a list */ > + /* Needs to be taken off a list */ > + if (!kmem_cache_has_cpu_partial(s) || prior) { > > n = get_node(s, slab_nid(slab)); > /* > @@ -3723,9 +3656,9 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, > * activity can be necessary. > */ > stat(s, FREE_FROZEN); > - } else if (new.frozen) { > + } else if (kmem_cache_has_cpu_partial(s) && !prior) { > /* > - * If we just froze the slab then put it onto the > + * If we started with a full slab then put it onto the > * per cpu partial list. > */ > put_cpu_partial(s, slab, 1); > -- Looks good to me, Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Thanks! > 2.20.1 >
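To summarize what the reviewed patch does to the control flow, here is a
compilable toy model. It is an editorial sketch under heavy simplification:
there is no locking, no per-cpu data, and the real freeze_slab() cmpxchg
covers the freelist and counters together; the names mirror mm/slub.c but
every body here is a stand-in.

/* delayed_freeze_model.c - toy model of the flow after "Delay freezing".
 * Before: taking a slab off the node partial list needed one cmpxchg
 * (acquire_slab()) and using it needed another (get_freelist()).
 * After: list moves are plain list operations, and the single cmpxchg
 * happens in freeze_slab() at the point of use.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define FROZEN 1UL

struct slab {
	struct slab *next;
	_Atomic unsigned long counters;	/* frozen bit, CAS'd as one word */
};

static struct slab *node_partial;	/* protected by n->list_lock in reality */
static struct slab *cpu_partial;	/* per-cpu in the real code */

/* remove_partial(): just a list operation now, no cmpxchg under list_lock */
static struct slab *take_from_node_partial(void)
{
	struct slab *s = node_partial;
	if (s)
		node_partial = s->next;
	return s;
}

/* put_cpu_partial(): also just a list operation; the slab stays unfrozen */
static void put_on_cpu_partial(struct slab *s)
{
	s->next = cpu_partial;
	cpu_partial = s;
}

/* freeze_slab(): the one remaining atomic transition, at the point of use */
static bool freeze_slab(struct slab *s)
{
	unsigned long old = atomic_load(&s->counters);

	do {
		if (old & FROZEN)
			return false;	/* already frozen by someone else */
	} while (!atomic_compare_exchange_weak(&s->counters, &old, old | FROZEN));
	return true;
}

int main(void)
{
	struct slab a = { .counters = 0 }, b = { .counters = 0 };

	a.next = &b;
	node_partial = &a;

	put_on_cpu_partial(take_from_node_partial());	/* no cmpxchg here */

	struct slab *s = cpu_partial;			/* ___slab_alloc() picks it */
	cpu_partial = s->next;
	printf("froze slab at use time: %s\n", freeze_slab(s) ? "yes" : "raced");
	return 0;
}

The point of the series is visible in main(): both list moves are pointer
manipulation only, and the sole atomic transition happens once, when the
slab's freelist is about to be consumed.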
On 2023/12/3 14:53, Hyeonggon Yoo wrote: > On Thu, Nov 2, 2023 at 12:25 PM <chengming.zhou@linux.dev> wrote: >> >> From: Chengming Zhou <zhouchengming@bytedance.com> >> >> Now we will freeze slabs when moving them out of node partial list to >> cpu partial list, this method needs two cmpxchg_double operations: >> >> 1. freeze slab (acquire_slab()) under the node list_lock >> 2. get_freelist() when pick used in ___slab_alloc() >> >> Actually we don't need to freeze when moving slabs out of node partial >> list, we can delay freezing to when use slab freelist in ___slab_alloc(), >> so we can save one cmpxchg_double(). >> >> And there are other good points: >> - The moving of slabs between node partial list and cpu partial list >> becomes simpler, since we don't need to freeze or unfreeze at all. >> >> - The node list_lock contention would be less, since we don't need to >> freeze any slab under the node list_lock. >> >> We can achieve this because there is no concurrent path would manipulate >> the partial slab list except the __slab_free() path, which is now >> serialized by slab_test_node_partial() under the list_lock. >> >> Since the slab returned by get_partial() interfaces is not frozen anymore >> and no freelist is returned in the partial_context, so we need to use the >> introduced freeze_slab() to freeze it and get its freelist. >> >> Similarly, the slabs on the CPU partial list are not frozen anymore, >> we need to freeze_slab() on it before use. >> >> We can now delete acquire_slab() as it became unused. >> >> Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com> >> Reviewed-by: Vlastimil Babka <vbabka@suse.cz> >> Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> >> --- >> mm/slub.c | 113 +++++++++++------------------------------------------- >> 1 file changed, 23 insertions(+), 90 deletions(-) >> >> diff --git a/mm/slub.c b/mm/slub.c >> index edf567971679..bcb5b2c4e213 100644 >> --- a/mm/slub.c >> +++ b/mm/slub.c >> @@ -2234,51 +2234,6 @@ static void *alloc_single_from_new_slab(struct kmem_cache *s, >> return object; >> } >> >> -/* >> - * Remove slab from the partial list, freeze it and >> - * return the pointer to the freelist. >> - * >> - * Returns a list of objects or NULL if it fails. >> - */ >> -static inline void *acquire_slab(struct kmem_cache *s, >> - struct kmem_cache_node *n, struct slab *slab, >> - int mode) > > Nit: alloc_single_from_partial()'s comment still refers to acquire_slab(). > Ah, right! It should be changed to remove_partial(). diff --git a/mm/slub.c b/mm/slub.c index 437485a2408d..623c17a4cdd6 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2463,7 +2463,7 @@ static inline void remove_partial(struct kmem_cache_node *n, } /* - * Called only for kmem_cache_debug() caches instead of acquire_slab(), with a + * Called only for kmem_cache_debug() caches instead of remove_partial(), with a * slab from the n->partial list. Remove only a single object from the slab, do * the alloc_debug_processing() checks and leave the slab on the list, or move * it to full list if it was the last free object. Hi Vlastimil, could you please help to fold it? Thanks! >> -{ >> - void *freelist; >> - unsigned long counters; >> - struct slab new; >> - >> - lockdep_assert_held(&n->list_lock); >> - >> - /* >> - * Zap the freelist and set the frozen bit. >> - * The old freelist is the list of objects for the >> - * per cpu allocation list. 
>> - */ >> - freelist = slab->freelist; >> - counters = slab->counters; >> - new.counters = counters; >> - if (mode) { >> - new.inuse = slab->objects; >> - new.freelist = NULL; >> - } else { >> - new.freelist = freelist; >> - } >> - >> - VM_BUG_ON(new.frozen); >> - new.frozen = 1; >> - >> - if (!__slab_update_freelist(s, slab, >> - freelist, counters, >> - new.freelist, new.counters, >> - "acquire_slab")) >> - return NULL; >> - >> - remove_partial(n, slab); >> - WARN_ON(!freelist); >> - return freelist; >> -} >> - >> #ifdef CONFIG_SLUB_CPU_PARTIAL >> static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain); >> #else >> @@ -2295,7 +2250,6 @@ static struct slab *get_partial_node(struct kmem_cache *s, >> struct partial_context *pc) >> { >> struct slab *slab, *slab2, *partial = NULL; >> - void *object = NULL; >> unsigned long flags; >> unsigned int partial_slabs = 0; >> >> @@ -2314,7 +2268,7 @@ static struct slab *get_partial_node(struct kmem_cache *s, >> continue; >> >> if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) { >> - object = alloc_single_from_partial(s, n, slab, >> + void *object = alloc_single_from_partial(s, n, slab, >> pc->orig_size); >> if (object) { >> partial = slab; >> @@ -2324,13 +2278,10 @@ static struct slab *get_partial_node(struct kmem_cache *s, >> continue; >> } >> >> - object = acquire_slab(s, n, slab, object == NULL); >> - if (!object) >> - break; >> + remove_partial(n, slab); >> >> if (!partial) { >> partial = slab; >> - pc->object = object; >> stat(s, ALLOC_FROM_PARTIAL); >> } else { >> put_cpu_partial(s, slab, 0); >> @@ -2629,9 +2580,6 @@ static void __unfreeze_partials(struct kmem_cache *s, struct slab *partial_slab) >> unsigned long flags = 0; >> >> while (partial_slab) { >> - struct slab new; >> - struct slab old; >> - >> slab = partial_slab; >> partial_slab = slab->next; >> >> @@ -2644,23 +2592,7 @@ static void __unfreeze_partials(struct kmem_cache *s, struct slab *partial_slab) >> spin_lock_irqsave(&n->list_lock, flags); >> } >> >> - do { >> - >> - old.freelist = slab->freelist; >> - old.counters = slab->counters; >> - VM_BUG_ON(!old.frozen); >> - >> - new.counters = old.counters; >> - new.freelist = old.freelist; >> - >> - new.frozen = 0; >> - >> - } while (!__slab_update_freelist(s, slab, >> - old.freelist, old.counters, >> - new.freelist, new.counters, >> - "unfreezing slab")); >> - >> - if (unlikely(!new.inuse && n->nr_partial >= s->min_partial)) { >> + if (unlikely(!slab->inuse && n->nr_partial >= s->min_partial)) { >> slab->next = slab_to_discard; >> slab_to_discard = slab; >> } else { >> @@ -3167,7 +3099,6 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, >> node = NUMA_NO_NODE; >> goto new_slab; >> } >> -redo: >> >> if (unlikely(!node_match(slab, node))) { >> /* >> @@ -3243,7 +3174,8 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, >> >> new_slab: >> >> - if (slub_percpu_partial(c)) { >> +#ifdef CONFIG_SLUB_CPU_PARTIAL >> + while (slub_percpu_partial(c)) { >> local_lock_irqsave(&s->cpu_slab->lock, flags); >> if (unlikely(c->slab)) { >> local_unlock_irqrestore(&s->cpu_slab->lock, flags); >> @@ -3255,12 +3187,22 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, >> goto new_objects; >> } >> >> - slab = c->slab = slub_percpu_partial(c); >> + slab = slub_percpu_partial(c); >> slub_set_percpu_partial(c, slab); >> local_unlock_irqrestore(&s->cpu_slab->lock, flags); >> stat(s, CPU_PARTIAL_ALLOC); >> - goto redo; >> + >> + if 
(unlikely(!node_match(slab, node) || >> + !pfmemalloc_match(slab, gfpflags))) { >> + slab->next = NULL; >> + __unfreeze_partials(s, slab); >> + continue; >> + } >> + >> + freelist = freeze_slab(s, slab); >> + goto retry_load_slab; >> } >> +#endif >> >> new_objects: >> >> @@ -3268,8 +3210,8 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, >> pc.orig_size = orig_size; >> slab = get_partial(s, node, &pc); >> if (slab) { >> - freelist = pc.object; >> if (kmem_cache_debug(s)) { >> + freelist = pc.object; >> /* >> * For debug caches here we had to go through >> * alloc_single_from_partial() so just store the >> @@ -3281,6 +3223,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, >> return freelist; >> } >> >> + freelist = freeze_slab(s, slab); >> goto retry_load_slab; >> } >> >> @@ -3682,18 +3625,8 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, >> was_frozen = new.frozen; >> new.inuse -= cnt; >> if ((!new.inuse || !prior) && !was_frozen) { >> - >> - if (kmem_cache_has_cpu_partial(s) && !prior) { >> - >> - /* >> - * Slab was on no list before and will be >> - * partially empty >> - * We can defer the list move and instead >> - * freeze it. >> - */ >> - new.frozen = 1; >> - >> - } else { /* Needs to be taken off a list */ >> + /* Needs to be taken off a list */ >> + if (!kmem_cache_has_cpu_partial(s) || prior) { >> >> n = get_node(s, slab_nid(slab)); >> /* >> @@ -3723,9 +3656,9 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, >> * activity can be necessary. >> */ >> stat(s, FREE_FROZEN); >> - } else if (new.frozen) { >> + } else if (kmem_cache_has_cpu_partial(s) && !prior) { >> /* >> - * If we just froze the slab then put it onto the >> + * If we started with a full slab then put it onto the >> * per cpu partial list. >> */ >> put_cpu_partial(s, slab, 1); >> -- > > Looks good to me, > Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> > > Thanks! > >> 2.20.1 >>
On 12/3/23 11:15, Chengming Zhou wrote: > On 2023/12/3 14:53, Hyeonggon Yoo wrote: >> On Thu, Nov 2, 2023 at 12:25 PM <chengming.zhou@linux.dev> wrote: >>> >>> From: Chengming Zhou <zhouchengming@bytedance.com> >>> >>> Now we will freeze slabs when moving them out of node partial list to >>> cpu partial list, this method needs two cmpxchg_double operations: >>> >>> 1. freeze slab (acquire_slab()) under the node list_lock >>> 2. get_freelist() when pick used in ___slab_alloc() >>> >>> Actually we don't need to freeze when moving slabs out of node partial >>> list, we can delay freezing to when use slab freelist in ___slab_alloc(), >>> so we can save one cmpxchg_double(). >>> >>> And there are other good points: >>> - The moving of slabs between node partial list and cpu partial list >>> becomes simpler, since we don't need to freeze or unfreeze at all. >>> >>> - The node list_lock contention would be less, since we don't need to >>> freeze any slab under the node list_lock. >>> >>> We can achieve this because there is no concurrent path would manipulate >>> the partial slab list except the __slab_free() path, which is now >>> serialized by slab_test_node_partial() under the list_lock. >>> >>> Since the slab returned by get_partial() interfaces is not frozen anymore >>> and no freelist is returned in the partial_context, so we need to use the >>> introduced freeze_slab() to freeze it and get its freelist. >>> >>> Similarly, the slabs on the CPU partial list are not frozen anymore, >>> we need to freeze_slab() on it before use. >>> >>> We can now delete acquire_slab() as it became unused. >>> >>> Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com> >>> Reviewed-by: Vlastimil Babka <vbabka@suse.cz> >>> Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> >>> --- >>> mm/slub.c | 113 +++++++++++------------------------------------------- >>> 1 file changed, 23 insertions(+), 90 deletions(-) >>> >>> diff --git a/mm/slub.c b/mm/slub.c >>> index edf567971679..bcb5b2c4e213 100644 >>> --- a/mm/slub.c >>> +++ b/mm/slub.c >>> @@ -2234,51 +2234,6 @@ static void *alloc_single_from_new_slab(struct kmem_cache *s, >>> return object; >>> } >>> >>> -/* >>> - * Remove slab from the partial list, freeze it and >>> - * return the pointer to the freelist. >>> - * >>> - * Returns a list of objects or NULL if it fails. >>> - */ >>> -static inline void *acquire_slab(struct kmem_cache *s, >>> - struct kmem_cache_node *n, struct slab *slab, >>> - int mode) >> >> Nit: alloc_single_from_partial()'s comment still refers to acquire_slab(). >> > > Ah, right! It should be changed to remove_partial(). > > diff --git a/mm/slub.c b/mm/slub.c > index 437485a2408d..623c17a4cdd6 100644 > --- a/mm/slub.c > +++ b/mm/slub.c > @@ -2463,7 +2463,7 @@ static inline void remove_partial(struct kmem_cache_node *n, > } > > /* > - * Called only for kmem_cache_debug() caches instead of acquire_slab(), with a > + * Called only for kmem_cache_debug() caches instead of remove_partial(), with a > * slab from the n->partial list. Remove only a single object from the slab, do > * the alloc_debug_processing() checks and leave the slab on the list, or move > * it to full list if it was the last free object. > > Hi Vlastimil, could you please help to fold it? Done, thanks.
diff --git a/mm/slub.c b/mm/slub.c
index edf567971679..bcb5b2c4e213 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2234,51 +2234,6 @@ static void *alloc_single_from_new_slab(struct kmem_cache *s,
 	return object;
 }
 
-/*
- * Remove slab from the partial list, freeze it and
- * return the pointer to the freelist.
- *
- * Returns a list of objects or NULL if it fails.
- */
-static inline void *acquire_slab(struct kmem_cache *s,
-		struct kmem_cache_node *n, struct slab *slab,
-		int mode)
-{
-	void *freelist;
-	unsigned long counters;
-	struct slab new;
-
-	lockdep_assert_held(&n->list_lock);
-
-	/*
-	 * Zap the freelist and set the frozen bit.
-	 * The old freelist is the list of objects for the
-	 * per cpu allocation list.
-	 */
-	freelist = slab->freelist;
-	counters = slab->counters;
-	new.counters = counters;
-	if (mode) {
-		new.inuse = slab->objects;
-		new.freelist = NULL;
-	} else {
-		new.freelist = freelist;
-	}
-
-	VM_BUG_ON(new.frozen);
-	new.frozen = 1;
-
-	if (!__slab_update_freelist(s, slab,
-			freelist, counters,
-			new.freelist, new.counters,
-			"acquire_slab"))
-		return NULL;
-
-	remove_partial(n, slab);
-	WARN_ON(!freelist);
-	return freelist;
-}
-
 #ifdef CONFIG_SLUB_CPU_PARTIAL
 static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain);
 #else
@@ -2295,7 +2250,6 @@ static struct slab *get_partial_node(struct kmem_cache *s,
 		struct partial_context *pc)
 {
 	struct slab *slab, *slab2, *partial = NULL;
-	void *object = NULL;
 	unsigned long flags;
 	unsigned int partial_slabs = 0;
 
@@ -2314,7 +2268,7 @@ static struct slab *get_partial_node(struct kmem_cache *s,
 			continue;
 
 		if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) {
-			object = alloc_single_from_partial(s, n, slab,
+			void *object = alloc_single_from_partial(s, n, slab,
 							pc->orig_size);
 			if (object) {
 				partial = slab;
@@ -2324,13 +2278,10 @@ static struct slab *get_partial_node(struct kmem_cache *s,
 			continue;
 		}
 
-		object = acquire_slab(s, n, slab, object == NULL);
-		if (!object)
-			break;
+		remove_partial(n, slab);
 
 		if (!partial) {
 			partial = slab;
-			pc->object = object;
 			stat(s, ALLOC_FROM_PARTIAL);
 		} else {
 			put_cpu_partial(s, slab, 0);
@@ -2629,9 +2580,6 @@ static void __unfreeze_partials(struct kmem_cache *s, struct slab *partial_slab)
 	unsigned long flags = 0;
 
 	while (partial_slab) {
-		struct slab new;
-		struct slab old;
-
 		slab = partial_slab;
 		partial_slab = slab->next;
 
@@ -2644,23 +2592,7 @@ static void __unfreeze_partials(struct kmem_cache *s, struct slab *partial_slab)
 			spin_lock_irqsave(&n->list_lock, flags);
 		}
 
-		do {
-
-			old.freelist = slab->freelist;
-			old.counters = slab->counters;
-			VM_BUG_ON(!old.frozen);
-
-			new.counters = old.counters;
-			new.freelist = old.freelist;
-
-			new.frozen = 0;
-
-		} while (!__slab_update_freelist(s, slab,
-				old.freelist, old.counters,
-				new.freelist, new.counters,
-				"unfreezing slab"));
-
-		if (unlikely(!new.inuse && n->nr_partial >= s->min_partial)) {
+		if (unlikely(!slab->inuse && n->nr_partial >= s->min_partial)) {
 			slab->next = slab_to_discard;
 			slab_to_discard = slab;
 		} else {
@@ -3167,7 +3099,6 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 		node = NUMA_NO_NODE;
 		goto new_slab;
 	}
-redo:
 
 	if (unlikely(!node_match(slab, node))) {
 		/*
@@ -3243,7 +3174,8 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 
 new_slab:
 
-	if (slub_percpu_partial(c)) {
+#ifdef CONFIG_SLUB_CPU_PARTIAL
+	while (slub_percpu_partial(c)) {
 		local_lock_irqsave(&s->cpu_slab->lock, flags);
 		if (unlikely(c->slab)) {
 			local_unlock_irqrestore(&s->cpu_slab->lock, flags);
@@ -3255,12 +3187,22 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 			goto new_objects;
 		}
 
-		slab = c->slab = slub_percpu_partial(c);
+		slab = slub_percpu_partial(c);
 		slub_set_percpu_partial(c, slab);
 		local_unlock_irqrestore(&s->cpu_slab->lock, flags);
 		stat(s, CPU_PARTIAL_ALLOC);
-		goto redo;
+
+		if (unlikely(!node_match(slab, node) ||
+			     !pfmemalloc_match(slab, gfpflags))) {
+			slab->next = NULL;
+			__unfreeze_partials(s, slab);
+			continue;
+		}
+
+		freelist = freeze_slab(s, slab);
+		goto retry_load_slab;
 	}
+#endif
 
 new_objects:
 
@@ -3268,8 +3210,8 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 	pc.orig_size = orig_size;
 	slab = get_partial(s, node, &pc);
 	if (slab) {
-		freelist = pc.object;
 		if (kmem_cache_debug(s)) {
+			freelist = pc.object;
 			/*
 			 * For debug caches here we had to go through
 			 * alloc_single_from_partial() so just store the
@@ -3281,6 +3223,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 			return freelist;
 		}
 
+		freelist = freeze_slab(s, slab);
 		goto retry_load_slab;
 	}
 
@@ -3682,18 +3625,8 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab,
 		was_frozen = new.frozen;
 		new.inuse -= cnt;
 		if ((!new.inuse || !prior) && !was_frozen) {
-
-			if (kmem_cache_has_cpu_partial(s) && !prior) {
-
-				/*
-				 * Slab was on no list before and will be
-				 * partially empty
-				 * We can defer the list move and instead
-				 * freeze it.
-				 */
-				new.frozen = 1;
-
-			} else { /* Needs to be taken off a list */
+			/* Needs to be taken off a list */
+			if (!kmem_cache_has_cpu_partial(s) || prior) {
 
 				n = get_node(s, slab_nid(slab));
 				/*
@@ -3723,9 +3656,9 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab,
 		 * activity can be necessary.
 		 */
 		stat(s, FREE_FROZEN);
-	} else if (new.frozen) {
+	} else if (kmem_cache_has_cpu_partial(s) && !prior) {
 		/*
-		 * If we just froze the slab then put it onto the
+		 * If we started with a full slab then put it onto the
 		 * per cpu partial list.
 		 */
 		put_cpu_partial(s, slab, 1);
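As a closing note on the __slab_free() hunks above, the post-patch decision
can be condensed into a small truth table. The sketch below is an editorial
model, not kernel code; it deliberately ignores the empty-slab discard path
and the real counters/freelist update, and free_destination() is an invented
name:

/* free_dest_model.c - where does a slab go after __slab_free(), post-patch? */
#include <stdbool.h>
#include <stdio.h>

enum dest { WAS_CPU_SLAB, TO_CPU_PARTIAL, VIA_NODE_LIST_LOCK };

/*
 * prior  == the slab already had free objects before this free;
 * !prior == the slab was full, so it was on no list at all.
 * With delayed freezing, "frozen" now only ever means "some CPU's active
 * slab", so the old "defer the list move by freezing" branch is gone.
 */
static enum dest free_destination(bool was_frozen, bool prior,
				  bool has_cpu_partial)
{
	if (was_frozen)
		return WAS_CPU_SLAB;		/* stat(s, FREE_FROZEN) */
	if (has_cpu_partial && !prior)
		return TO_CPU_PARTIAL;		/* put_cpu_partial(), unfrozen */
	return VIA_NODE_LIST_LOCK;		/* may move on/off node partial list,
						 * serialized by slab_test_node_partial() */
}

int main(void)
{
	const char *name[] = { "stays someone's cpu slab",
			       "to cpu partial list (unfrozen)",
			       "handled under node list_lock" };

	for (int f = 0; f <= 1; f++)
		for (int p = 0; p <= 1; p++)
			printf("was_frozen=%d prior=%d -> %s\n",
			       f, p, name[free_destination(f, p, true)]);
	return 0;
}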