Message ID | 20231116022411.2250072-4-yosryahmed@google.com (mailing list archive)
---|---
State | New
Series | mm: memcg: subtree stats flushing and thresholds
Hello,

kernel test robot noticed a -30.2% regression of will-it-scale.per_thread_ops on:

commit: c7fbfc7b4e089c4a9b292b1973a42a5761c1342f ("[PATCH v3 3/5] mm: memcg: make stats flushing threshold per-memcg")
url: https://github.com/intel-lab-lkp/linux/commits/Yosry-Ahmed/mm-memcg-change-flush_next_time-to-flush_last_time/20231116-103300
base: https://git.kernel.org/cgit/linux/kernel/git/akpm/mm.git mm-everything
patch link: https://lore.kernel.org/all/20231116022411.2250072-4-yosryahmed@google.com/
patch subject: [PATCH v3 3/5] mm: memcg: make stats flushing threshold per-memcg

testcase: will-it-scale
test machine: 104 threads 2 sockets (Skylake) with 192G memory
parameters:

        nr_task: 50%
        mode: thread
        test: fallocate2
        cpufreq_governor: performance

If you fix the issue in a separate patch/commit (i.e. not just a new version of the same patch/commit), kindly add the following tags
| Reported-by: kernel test robot <oliver.sang@intel.com>
| Closes: https://lore.kernel.org/oe-lkp/202311221542.973f16ad-oliver.sang@intel.com

Details are as below:
-------------------------------------------------------------------------------------------------->

The kernel config and materials to reproduce are available at:
https://download.01.org/0day-ci/archive/20231122/202311221542.973f16ad-oliver.sang@intel.com

=========================================================================================
compiler/cpufreq_governor/kconfig/mode/nr_task/rootfs/tbox_group/test/testcase:
  gcc-12/performance/x86_64-rhel-8.3/thread/50%/debian-11.1-x86_64-20220510.cgz/lkp-skl-fpga01/fallocate2/will-it-scale

commit:
  c5caa5bb03 ("mm: memcg: move vmstats structs definition above flushing code")
  c7fbfc7b4e ("mm: memcg: make stats flushing threshold per-memcg")

c5caa5bb0376e3e5 c7fbfc7b4e089c4a9b292b1973a
---------------- ---------------------------
         %stddev     %change         %stddev
             \          |                \
      1.84             -0.5        1.37 ±  9%  mpstat.cpu.all.usr%
      0.08            -25.0%       0.06        turbostat.IPC
      3121             -9.2%       2835 ±  5%  vmstat.system.cs
     78.17 ± 12%      +96.6%     153.67 ± 18%  perf-c2c.DRAM.local
    504.17 ±  6%      +34.4%     677.50 ±  4%  perf-c2c.DRAM.remote
   3980762            -30.2%    2777359        will-it-scale.52.threads
     76552            -30.2%      53410        will-it-scale.per_thread_ops
   3980762            -30.2%    2777359        will-it-scale.workload
 1.192e+09 ±  2%      -30.2%  8.324e+08 ±  3%  numa-numastat.node0.local_node
 1.192e+09 ±  2%      -30.2%  8.324e+08 ±  3%  numa-numastat.node0.numa_hit
 1.215e+09 ±  2%      -30.3%  8.471e+08 ±  3%  numa-numastat.node1.local_node
 1.215e+09 ±  2%      -30.3%  8.474e+08 ±  3%  numa-numastat.node1.numa_hit
 1.192e+09 ±  2%      -30.2%  8.324e+08 ±  3%  numa-vmstat.node0.numa_hit
 1.192e+09 ±  2%      -30.2%  8.324e+08 ±  3%  numa-vmstat.node0.numa_local
 1.215e+09 ±  2%      -30.3%  8.474e+08 ±  3%  numa-vmstat.node1.numa_hit
 1.215e+09 ±  2%      -30.3%  8.471e+08 ±  3%  numa-vmstat.node1.numa_local
     31404             -1.6%      30913        proc-vmstat.nr_slab_reclaimable
 2.408e+09            -30.2%   1.68e+09        proc-vmstat.numa_hit
 2.407e+09            -30.2%   1.68e+09        proc-vmstat.numa_local
 2.404e+09            -30.2%  1.678e+09        proc-vmstat.pgalloc_normal
 2.403e+09            -30.2%  1.678e+09        proc-vmstat.pgfree
      0.05 ±  8%      -27.3%       0.04 ±  4%  perf-sched.wait_and_delay.avg.ms.__cond_resched.shmem_fallocate.vfs_fallocate.__x64_sys_fallocate.do_syscall_64
      0.05 ± 10%      -24.9%       0.04 ±  8%  perf-sched.wait_and_delay.avg.ms.__cond_resched.shmem_inode_acct_blocks.shmem_alloc_and_add_folio.shmem_get_folio_gfp.shmem_fallocate
      0.05 ±  8%      -27.2%       0.04 ±  5%  perf-sched.wait_and_delay.avg.ms.__cond_resched.shmem_undo_range.shmem_setattr.notify_change.do_truncate
      1.14            +14.1%       1.30        perf-sched.wait_and_delay.avg.ms.schedule_timeout.__wait_for_common.wait_for_completion_state.kernel_clone
    198.38 ±  3%      +16.5%     231.12 ±  3%  perf-sched.wait_and_delay.avg.ms.smpboot_thread_fn.kthread.ret_from_fork.ret_from_fork_asm
      1563 ±  5%      -11.4%       1384 ±  5%  perf-sched.wait_and_delay.count.__cond_resched.shmem_fallocate.vfs_fallocate.__x64_sys_fallocate.do_syscall_64
      1677 ±  5%      -18.7%       1364 ±  4%  perf-sched.wait_and_delay.count.__cond_resched.shmem_undo_range.shmem_setattr.notify_change.do_truncate
      3815 ±  2%      -14.5%       3260 ±  2%  perf-sched.wait_and_delay.count.smpboot_thread_fn.kthread.ret_from_fork.ret_from_fork_asm
      0.51 ±  5%      -32.3%       0.35 ± 16%  perf-sched.wait_and_delay.max.ms.__cond_resched.shmem_fallocate.vfs_fallocate.__x64_sys_fallocate.do_syscall_64
      0.47 ± 11%      -33.3%       0.31 ± 20%  perf-sched.wait_and_delay.max.ms.__cond_resched.shmem_inode_acct_blocks.shmem_alloc_and_add_folio.shmem_get_folio_gfp.shmem_fallocate
      2.37            +13.0%       2.68 ±  2%  perf-sched.wait_and_delay.max.ms.schedule_timeout.__wait_for_common.wait_for_completion_state.kernel_clone
      0.05 ±  8%      -27.3%       0.04 ±  4%  perf-sched.wait_time.avg.ms.__cond_resched.shmem_fallocate.vfs_fallocate.__x64_sys_fallocate.do_syscall_64
      0.05 ± 10%      -24.9%       0.04 ±  8%  perf-sched.wait_time.avg.ms.__cond_resched.shmem_inode_acct_blocks.shmem_alloc_and_add_folio.shmem_get_folio_gfp.shmem_fallocate
      0.05 ±  8%      -27.2%       0.04 ±  5%  perf-sched.wait_time.avg.ms.__cond_resched.shmem_undo_range.shmem_setattr.notify_change.do_truncate
      1.14            +14.1%       1.30        perf-sched.wait_time.avg.ms.schedule_timeout.__wait_for_common.wait_for_completion_state.kernel_clone
    198.37 ±  3%      +16.5%     231.11 ±  3%  perf-sched.wait_time.avg.ms.smpboot_thread_fn.kthread.ret_from_fork.ret_from_fork_asm
      0.39 ± 31%      -72.9%       0.11 ± 28%  perf-sched.wait_time.max.ms.__cond_resched.__alloc_pages.alloc_pages_mpol.shmem_alloc_folio.shmem_alloc_and_add_folio
      0.51 ±  5%      -32.3%       0.35 ± 16%  perf-sched.wait_time.max.ms.__cond_resched.shmem_fallocate.vfs_fallocate.__x64_sys_fallocate.do_syscall_64
      0.47 ± 11%      -33.3%       0.31 ± 20%  perf-sched.wait_time.max.ms.__cond_resched.shmem_inode_acct_blocks.shmem_alloc_and_add_folio.shmem_get_folio_gfp.shmem_fallocate
      2.37            +13.1%       2.68 ±  2%  perf-sched.wait_time.max.ms.schedule_timeout.__wait_for_common.wait_for_completion_state.kernel_clone
      0.82 ± 14%     +174.7%       2.24 ± 30%  perf-stat.i.MPKI
 8.476e+09            -27.7%  6.127e+09 ± 10%  perf-stat.i.branch-instructions
  55486131            -28.1%   39884260 ±  6%  perf-stat.i.branch-misses
     14.80 ±  2%       +6.2       20.96 ±  7%  perf-stat.i.cache-miss-rate%
  30690945 ±  3%      +79.9%   55207216 ± 10%  perf-stat.i.cache-misses
 2.066e+08            +24.2%  2.567e+08 ±  7%  perf-stat.i.cache-references
      3070             -9.7%       2772 ±  5%  perf-stat.i.context-switches
      3.58 ±  2%      +39.7%       5.00 ± 11%  perf-stat.i.cpi
      4688 ±  3%      -47.9%       2442 ±  4%  perf-stat.i.cycles-between-cache-misses
   4098916            -29.7%    2879809        perf-stat.i.dTLB-load-misses
 1.052e+10            -27.5%   7.63e+09 ± 10%  perf-stat.i.dTLB-loads
 5.845e+09            -30.7%  4.051e+09 ± 10%  perf-stat.i.dTLB-stores
     77.61             -6.0       71.56        perf-stat.i.iTLB-load-miss-rate%
   4058819            -32.5%    2739054 ±  8%  perf-stat.i.iTLB-load-misses
 4.089e+10            -28.3%  2.932e+10 ± 10%  perf-stat.i.instructions
      0.28            -26.8%       0.21 ±  5%  perf-stat.i.ipc
    240.84            -27.9%     173.57 ± 10%  perf-stat.i.metric.M/sec
   3814721 ±  3%      +72.2%    6569712 ± 10%  perf-stat.i.node-load-misses
    407308 ±  7%      +72.0%     700502 ± 18%  perf-stat.i.node-loads
   1323090 ±  2%      -28.1%     951590 ± 12%  perf-stat.i.node-store-misses
     36568 ±  2%      -20.7%      29014 ± 12%  perf-stat.i.node-stores
      0.75 ±  3%     +151.0%       1.88        perf-stat.overall.MPKI
     14.85 ±  2%       +6.6       21.47 ±  3%  perf-stat.overall.cache-miss-rate%
      3.53            +33.8%       4.72        perf-stat.overall.cpi
      4704 ±  3%      -46.8%       2505        perf-stat.overall.cycles-between-cache-misses
     77.62             -6.2       71.39        perf-stat.overall.iTLB-load-miss-rate%
      0.28            -25.3%       0.21        perf-stat.overall.ipc
   3121462             +7.4%    3353425        perf-stat.overall.path-length
 8.451e+09            -27.6%  6.119e+09 ± 10%  perf-stat.ps.branch-instructions
  55320195            -28.0%   39804925 ±  6%  perf-stat.ps.branch-misses
  30594557 ±  3%      +80.2%   55116821 ±  9%  perf-stat.ps.cache-misses
 2.059e+08            +24.4%  2.561e+08 ±  6%  perf-stat.ps.cache-references
      3059             -9.6%       2765 ±  5%  perf-stat.ps.context-switches
   4085949            -29.7%    2871251        perf-stat.ps.dTLB-load-misses
 1.049e+10            -27.4%   7.62e+09 ± 10%  perf-stat.ps.dTLB-loads
 5.828e+09            -30.6%  4.046e+09 ± 10%  perf-stat.ps.dTLB-stores
   4046367            -32.4%    2734227 ±  7%  perf-stat.ps.iTLB-load-misses
 4.077e+10            -28.2%  2.928e+10 ± 10%  perf-stat.ps.instructions
   3802900 ±  3%      +72.5%    6559980 ± 10%  perf-stat.ps.node-load-misses
    406123 ±  7%      +72.2%     699397 ± 17%  perf-stat.ps.node-loads
   1319155 ±  2%      -28.0%     950261 ± 12%  perf-stat.ps.node-store-misses
     36542 ±  2%      -20.6%      29007 ± 11%  perf-stat.ps.node-stores
 1.243e+13            -25.0%  9.313e+12        perf-stat.total.instructions
      1.26 ±  2%       -0.4        0.91        perf-profile.calltrace.cycles-pp.syscall_return_via_sysret.fallocate64
      1.22             -0.3        0.88 ±  2%  perf-profile.calltrace.cycles-pp.shmem_alloc_folio.shmem_alloc_and_add_folio.shmem_get_folio_gfp.shmem_fallocate.vfs_fallocate
      0.92 ±  2%       -0.3        0.62 ±  3%  perf-profile.calltrace.cycles-pp.shmem_inode_acct_blocks.shmem_alloc_and_add_folio.shmem_get_folio_gfp.shmem_fallocate.vfs_fallocate
      1.04             -0.3        0.76 ±  2%  perf-profile.calltrace.cycles-pp.alloc_pages_mpol.shmem_alloc_folio.shmem_alloc_and_add_folio.shmem_get_folio_gfp.shmem_fallocate
      0.80             -0.2        0.58 ±  3%  perf-profile.calltrace.cycles-pp.__alloc_pages.alloc_pages_mpol.shmem_alloc_folio.shmem_alloc_and_add_folio.shmem_get_folio_gfp
      1.25 ±  2%       -0.2        1.07        perf-profile.calltrace.cycles-pp.folio_batch_move_lru.lru_add_drain_cpu.__folio_batch_release.shmem_undo_range.shmem_setattr
      1.25 ±  2%       -0.2        1.07        perf-profile.calltrace.cycles-pp.lru_add_drain_cpu.__folio_batch_release.shmem_undo_range.shmem_setattr.notify_change
      1.23 ±  2%       -0.2        1.06        perf-profile.calltrace.cycles-pp.folio_lruvec_lock_irqsave.folio_batch_move_lru.lru_add_drain_cpu.__folio_batch_release.shmem_undo_range
      1.23 ±  2%       -0.2        1.06        perf-profile.calltrace.cycles-pp._raw_spin_lock_irqsave.folio_lruvec_lock_irqsave.folio_batch_move_lru.lru_add_drain_cpu.__folio_batch_release
      1.23 ±  2%       -0.2        1.05        perf-profile.calltrace.cycles-pp.native_queued_spin_lock_slowpath._raw_spin_lock_irqsave.folio_lruvec_lock_irqsave.folio_batch_move_lru.lru_add_drain_cpu
      1.16 ±  2%       -0.1        1.02 ±  2%  perf-profile.calltrace.cycles-pp.syscall_exit_to_user_mode.do_syscall_64.entry_SYSCALL_64_after_hwframe.fallocate64
      0.68             +0.1        0.75 ±  2%  perf-profile.calltrace.cycles-pp.__mem_cgroup_uncharge_list.release_pages.__folio_batch_release.shmem_undo_range.shmem_setattr
      1.07             +0.1        1.18 ±  2%  perf-profile.calltrace.cycles-pp.lru_add_fn.folio_batch_move_lru.folio_add_lru.shmem_alloc_and_add_folio.shmem_get_folio_gfp
      2.95             +0.3        3.21 ±  2%  perf-profile.calltrace.cycles-pp.truncate_inode_folio.shmem_undo_range.shmem_setattr.notify_change.do_truncate
      2.60             +0.4        2.95        perf-profile.calltrace.cycles-pp.filemap_remove_folio.truncate_inode_folio.shmem_undo_range.shmem_setattr.notify_change
      2.27             +0.4        2.71 ±  2%  perf-profile.calltrace.cycles-pp.__filemap_remove_folio.filemap_remove_folio.truncate_inode_folio.shmem_undo_range.shmem_setattr
      1.38 ±  3%       +0.5        1.85 ±  5%  perf-profile.calltrace.cycles-pp.get_mem_cgroup_from_mm.__mem_cgroup_charge.shmem_alloc_and_add_folio.shmem_get_folio_gfp.shmem_fallocate
      2.29 ±  2%       +0.6        2.90 ±  2%  perf-profile.calltrace.cycles-pp.shmem_add_to_page_cache.shmem_alloc_and_add_folio.shmem_get_folio_gfp.shmem_fallocate.vfs_fallocate
      0.00             +0.6        0.63 ±  2%  perf-profile.calltrace.cycles-pp.__mod_memcg_lruvec_state.release_pages.__folio_batch_release.shmem_undo_range.shmem_setattr
      0.00             +0.7        0.74 ±  3%  perf-profile.calltrace.cycles-pp.__mod_memcg_lruvec_state.lru_add_fn.folio_batch_move_lru.folio_add_lru.shmem_alloc_and_add_folio
      1.30             +0.8        2.07 ±  3%  perf-profile.calltrace.cycles-pp.filemap_unaccount_folio.__filemap_remove_folio.filemap_remove_folio.truncate_inode_folio.shmem_undo_range
      0.73 ±  2%       +0.8        1.53 ±  2%  perf-profile.calltrace.cycles-pp.__mod_memcg_lruvec_state.__mod_lruvec_page_state.filemap_unaccount_folio.__filemap_remove_folio.filemap_remove_folio
      1.23             +0.8        2.04 ±  3%  perf-profile.calltrace.cycles-pp.__mod_lruvec_page_state.filemap_unaccount_folio.__filemap_remove_folio.filemap_remove_folio.truncate_inode_folio
      0.00             +0.8        0.82 ±  2%  perf-profile.calltrace.cycles-pp.__count_memcg_events.mem_cgroup_commit_charge.__mem_cgroup_charge.shmem_alloc_and_add_folio.shmem_get_folio_gfp
      1.39 ±  2%       +0.9        2.32 ±  2%  perf-profile.calltrace.cycles-pp.__mod_lruvec_page_state.shmem_add_to_page_cache.shmem_alloc_and_add_folio.shmem_get_folio_gfp.shmem_fallocate
      0.59 ±  2%       +0.9        1.53 ±  2%  perf-profile.calltrace.cycles-pp.__mod_memcg_lruvec_state.__mod_lruvec_page_state.shmem_add_to_page_cache.shmem_alloc_and_add_folio.shmem_get_folio_gfp
     38.12             +1.0       39.16        perf-profile.calltrace.cycles-pp.vfs_fallocate.__x64_sys_fallocate.do_syscall_64.entry_SYSCALL_64_after_hwframe.fallocate64
      0.62 ±  4%       +1.1        1.71 ±  3%  perf-profile.calltrace.cycles-pp.mem_cgroup_commit_charge.__mem_cgroup_charge.shmem_alloc_and_add_folio.shmem_get_folio_gfp.shmem_fallocate
     37.61             +1.2       38.80        perf-profile.calltrace.cycles-pp.shmem_fallocate.vfs_fallocate.__x64_sys_fallocate.do_syscall_64.entry_SYSCALL_64_after_hwframe
     36.54             +1.5       38.02        perf-profile.calltrace.cycles-pp.shmem_get_folio_gfp.shmem_fallocate.vfs_fallocate.__x64_sys_fallocate.do_syscall_64
     35.97             +1.6       37.60        perf-profile.calltrace.cycles-pp.shmem_alloc_and_add_folio.shmem_get_folio_gfp.shmem_fallocate.vfs_fallocate.__x64_sys_fallocate
      2.48 ±  3%       +2.3        4.80 ±  4%  perf-profile.calltrace.cycles-pp.__mem_cgroup_charge.shmem_alloc_and_add_folio.shmem_get_folio_gfp.shmem_fallocate.vfs_fallocate
      1.28 ±  2%       -0.4        0.92        perf-profile.children.cycles-pp.syscall_return_via_sysret
      1.23             -0.3        0.88 ±  2%  perf-profile.children.cycles-pp.shmem_alloc_folio
      0.95 ±  2%       -0.3        0.64 ±  3%  perf-profile.children.cycles-pp.shmem_inode_acct_blocks
      1.07             -0.3        0.77 ±  3%  perf-profile.children.cycles-pp.alloc_pages_mpol
      0.86 ±  2%       -0.3        0.58 ±  2%  perf-profile.children.cycles-pp.xas_store
      0.84             -0.2        0.61 ±  3%  perf-profile.children.cycles-pp.__alloc_pages
      1.26 ±  2%       -0.2        1.08        perf-profile.children.cycles-pp.lru_add_drain_cpu
      0.61 ±  3%       -0.2        0.43        perf-profile.children.cycles-pp.__entry_text_start
      0.56 ±  2%       -0.2        0.40 ±  3%  perf-profile.children.cycles-pp.free_unref_page_list
      0.26 ±  7%       -0.2        0.11 ±  5%  perf-profile.children.cycles-pp.__list_add_valid_or_report
      1.19 ±  2%       -0.1        1.04 ±  2%  perf-profile.children.cycles-pp.syscall_exit_to_user_mode
      0.45 ±  4%       -0.1        0.31        perf-profile.children.cycles-pp.__mod_lruvec_state
      0.48 ±  2%       -0.1        0.35 ±  2%  perf-profile.children.cycles-pp.get_page_from_freelist
      0.38 ±  5%       -0.1        0.27 ±  2%  perf-profile.children.cycles-pp.xas_load
      0.38 ±  2%       -0.1        0.27 ±  2%  perf-profile.children.cycles-pp._raw_spin_lock
      0.33 ±  4%       -0.1        0.23 ±  2%  perf-profile.children.cycles-pp.__mod_node_page_state
      0.42 ±  2%       -0.1        0.32 ±  5%  perf-profile.children.cycles-pp.find_lock_entries
      0.32 ±  2%       -0.1        0.23 ±  2%  perf-profile.children.cycles-pp.__dquot_alloc_space
      0.33 ±  2%       -0.1        0.24 ±  3%  perf-profile.children.cycles-pp.rmqueue
      0.24 ±  3%       -0.1        0.17 ±  4%  perf-profile.children.cycles-pp.xas_descend
      0.23 ±  3%       -0.1        0.16 ±  4%  perf-profile.children.cycles-pp.xas_init_marks
      0.25 ±  3%       -0.1        0.17 ±  2%  perf-profile.children.cycles-pp.xas_clear_mark
      0.23 ±  2%       -0.1        0.16 ±  5%  perf-profile.children.cycles-pp.__cond_resched
      0.28 ±  5%       -0.1        0.22 ±  2%  perf-profile.children.cycles-pp.filemap_get_entry
      0.24 ±  3%       -0.1        0.18 ±  4%  perf-profile.children.cycles-pp.truncate_cleanup_folio
      0.16 ±  4%       -0.1        0.10 ±  4%  perf-profile.children.cycles-pp.xas_find_conflict
      0.09 ±  7%       -0.1        0.03 ± 70%  perf-profile.children.cycles-pp.mem_cgroup_update_lru_size
      0.18             -0.1        0.12 ±  6%  perf-profile.children.cycles-pp.shmem_recalc_inode
      0.18 ±  2%       -0.1        0.12 ±  3%  perf-profile.children.cycles-pp.folio_unlock
      0.17 ±  4%       -0.1        0.12 ±  3%  perf-profile.children.cycles-pp.free_unref_page_prepare
      0.16 ±  6%       -0.1        0.11 ±  4%  perf-profile.children.cycles-pp.security_file_permission
      0.13 ±  7%       -0.0        0.08 ± 13%  perf-profile.children.cycles-pp.security_vm_enough_memory_mm
      0.20 ±  4%       -0.0        0.15 ±  2%  perf-profile.children.cycles-pp.free_unref_page_commit
      0.16 ±  5%       -0.0        0.11 ±  3%  perf-profile.children.cycles-pp.noop_dirty_folio
      0.15 ±  5%       -0.0        0.11 ±  4%  perf-profile.children.cycles-pp.file_modified
      0.12 ± 10%       -0.0        0.08        perf-profile.children.cycles-pp.__percpu_counter_limited_add
      0.19 ±  5%       -0.0        0.14 ±  5%  perf-profile.children.cycles-pp._raw_spin_lock_irq
      0.11 ± 12%       -0.0        0.06 ± 17%  perf-profile.children.cycles-pp.cap_vm_enough_memory
      0.14 ±  5%       -0.0        0.10 ±  6%  perf-profile.children.cycles-pp.__fget_light
      0.14 ±  7%       -0.0        0.10 ±  4%  perf-profile.children.cycles-pp.apparmor_file_permission
      0.14 ±  2%       -0.0        0.10 ±  4%  perf-profile.children.cycles-pp.__folio_cancel_dirty
      0.12 ±  3%       -0.0        0.08 ±  4%  perf-profile.children.cycles-pp.entry_SYSCALL_64_safe_stack
      0.11 ± 10%       -0.0        0.08 ± 16%  perf-profile.children.cycles-pp.__vm_enough_memory
      0.11 ±  8%       -0.0        0.08 ±  4%  perf-profile.children.cycles-pp.xas_start
      0.11 ±  3%       -0.0        0.08 ±  4%  perf-profile.children.cycles-pp.__fsnotify_parent
      0.18 ±  2%       -0.0        0.14 ±  6%  perf-profile.children.cycles-pp.__list_del_entry_valid_or_report
      0.08 ±  6%       -0.0        0.04 ± 45%  perf-profile.children.cycles-pp.__get_file_rcu
      0.12 ±  7%       -0.0        0.09 ±  5%  perf-profile.children.cycles-pp.inode_add_bytes
      0.11 ±  4%       -0.0        0.08 ±  8%  perf-profile.children.cycles-pp._raw_spin_trylock
      0.08 ±  6%       -0.0        0.05 ± 45%  perf-profile.children.cycles-pp.percpu_counter_add_batch
      0.10             -0.0        0.07 ±  9%  perf-profile.children.cycles-pp.inode_needs_update_time
      0.09 ±  7%       -0.0        0.06 ±  6%  perf-profile.children.cycles-pp.get_pfnblock_flags_mask
      0.07 ±  6%       -0.0        0.05 ± 45%  perf-profile.children.cycles-pp.shmem_is_huge
      0.09 ±  7%       -0.0        0.07 ±  7%  perf-profile.children.cycles-pp.entry_SYSRETQ_unsafe_stack
      0.08 ±  4%       -0.0        0.06 ±  8%  perf-profile.children.cycles-pp.policy_nodemask
      0.19 ±  3%       -0.0        0.17 ±  4%  perf-profile.children.cycles-pp.try_charge_memcg
      0.08 ±  8%       -0.0        0.06 ±  8%  perf-profile.children.cycles-pp.down_write
      0.09 ±  7%       -0.0        0.07 ±  5%  perf-profile.children.cycles-pp.xas_create
      0.09 ±  7%       -0.0        0.07 ±  7%  perf-profile.children.cycles-pp.filemap_free_folio
      0.08 ±  4%       -0.0        0.06 ±  6%  perf-profile.children.cycles-pp.xas_find
      0.07 ±  5%       +0.0        0.09 ±  5%  perf-profile.children.cycles-pp.propagate_protected_usage
      0.25             +0.0        0.28 ±  2%  perf-profile.children.cycles-pp.uncharge_folio
      0.43 ±  2%       +0.0        0.47 ±  2%  perf-profile.children.cycles-pp.uncharge_batch
      0.69             +0.1        0.75 ±  2%  perf-profile.children.cycles-pp.__mem_cgroup_uncharge_list
      1.10             +0.1        1.20 ±  2%  perf-profile.children.cycles-pp.lru_add_fn
      2.96             +0.3        3.21        perf-profile.children.cycles-pp.truncate_inode_folio
      2.60             +0.4        2.96        perf-profile.children.cycles-pp.filemap_remove_folio
      2.29             +0.4        2.73 ±  2%  perf-profile.children.cycles-pp.__filemap_remove_folio
      1.39 ±  3%       +0.5        1.85 ±  5%  perf-profile.children.cycles-pp.get_mem_cgroup_from_mm
      2.34 ±  2%       +0.6        2.93 ±  2%  perf-profile.children.cycles-pp.shmem_add_to_page_cache
      0.18 ±  5%       +0.7        0.92 ±  2%  perf-profile.children.cycles-pp.__count_memcg_events
      1.32             +0.8        2.07 ±  3%  perf-profile.children.cycles-pp.filemap_unaccount_folio
     38.14             +1.0       39.17        perf-profile.children.cycles-pp.vfs_fallocate
      0.64 ±  4%       +1.1        1.72 ±  3%  perf-profile.children.cycles-pp.mem_cgroup_commit_charge
     37.63             +1.2       38.81        perf-profile.children.cycles-pp.shmem_fallocate
     36.57             +1.5       38.05        perf-profile.children.cycles-pp.shmem_get_folio_gfp
     36.04             +1.6       37.65        perf-profile.children.cycles-pp.shmem_alloc_and_add_folio
      2.66             +1.7        4.38 ±  3%  perf-profile.children.cycles-pp.__mod_lruvec_page_state
      2.49 ±  2%       +2.3        4.80 ±  4%  perf-profile.children.cycles-pp.__mem_cgroup_charge
      1.99 ±  2%       +2.5        4.46        perf-profile.children.cycles-pp.__mod_memcg_lruvec_state
      1.28 ±  2%       -0.4        0.92        perf-profile.self.cycles-pp.syscall_return_via_sysret
      0.69 ±  2%       -0.2        0.50 ±  2%  perf-profile.self.cycles-pp.entry_SYSCALL_64_after_hwframe
      0.54 ±  2%       -0.2        0.36 ±  2%  perf-profile.self.cycles-pp.release_pages
      0.47 ±  2%       -0.2        0.31 ±  2%  perf-profile.self.cycles-pp.xas_store
      0.53 ±  3%       -0.2        0.37 ±  2%  perf-profile.self.cycles-pp.__entry_text_start
      0.36 ±  3%       -0.2        0.21 ±  2%  perf-profile.self.cycles-pp.shmem_add_to_page_cache
      0.26 ±  8%       -0.2        0.11 ±  5%  perf-profile.self.cycles-pp.__list_add_valid_or_report
      1.14 ±  2%       -0.1        1.01 ±  2%  perf-profile.self.cycles-pp.syscall_exit_to_user_mode
      0.40 ±  4%       -0.1        0.28 ±  4%  perf-profile.self.cycles-pp.lru_add_fn
      0.37 ±  2%       -0.1        0.26        perf-profile.self.cycles-pp._raw_spin_lock
      0.32 ±  3%       -0.1        0.22 ±  2%  perf-profile.self.cycles-pp.__mod_node_page_state
      0.35 ±  3%       -0.1        0.25 ±  2%  perf-profile.self.cycles-pp.shmem_fallocate
      0.50 ±  2%       -0.1        0.40 ±  3%  perf-profile.self.cycles-pp.folio_batch_move_lru
      0.34 ±  3%       -0.1        0.26 ±  5%  perf-profile.self.cycles-pp.find_lock_entries
      0.28 ±  2%       -0.1        0.20 ±  5%  perf-profile.self.cycles-pp.__alloc_pages
      0.22 ±  2%       -0.1        0.16 ±  3%  perf-profile.self.cycles-pp.xas_clear_mark
      0.21 ±  3%       -0.1        0.15 ±  4%  perf-profile.self.cycles-pp.shmem_alloc_and_add_folio
      0.18 ±  3%       -0.1        0.12 ±  5%  perf-profile.self.cycles-pp.free_unref_page_list
      0.22 ±  3%       -0.1        0.16 ±  3%  perf-profile.self.cycles-pp.xas_descend
      0.20 ±  6%       -0.1        0.14 ±  2%  perf-profile.self.cycles-pp.__dquot_alloc_space
      0.18 ±  4%       -0.1        0.12 ±  6%  perf-profile.self.cycles-pp.shmem_inode_acct_blocks
      0.21 ±  5%       -0.1        0.15 ±  5%  perf-profile.self.cycles-pp.vfs_fallocate
      0.18 ±  2%       -0.1        0.12 ±  3%  perf-profile.self.cycles-pp.__filemap_remove_folio
      0.17 ±  4%       -0.1        0.12 ±  3%  perf-profile.self.cycles-pp.folio_unlock
      0.20 ±  3%       -0.0        0.14 ±  5%  perf-profile.self.cycles-pp.shmem_get_folio_gfp
      0.16 ±  5%       -0.0        0.12 ±  6%  perf-profile.self.cycles-pp.__cond_resched
      0.17 ±  5%       -0.0        0.12 ±  3%  perf-profile.self.cycles-pp._raw_spin_lock_irq
      0.14 ±  4%       -0.0        0.10 ±  5%  perf-profile.self.cycles-pp.xas_load
      0.15 ±  2%       -0.0        0.10 ±  4%  perf-profile.self.cycles-pp.get_page_from_freelist
      0.16 ±  4%       -0.0        0.12 ±  4%  perf-profile.self.cycles-pp.alloc_pages_mpol
      0.12 ±  6%       -0.0        0.08 ±  6%  perf-profile.self.cycles-pp.rmqueue
      0.13 ±  6%       -0.0        0.09 ±  4%  perf-profile.self.cycles-pp._raw_spin_lock_irqsave
      0.13 ±  8%       -0.0        0.09 ±  4%  perf-profile.self.cycles-pp.apparmor_file_permission
      0.15 ±  3%       -0.0        0.11 ±  3%  perf-profile.self.cycles-pp.noop_dirty_folio
      0.10 ± 10%       -0.0        0.06 ± 17%  perf-profile.self.cycles-pp.cap_vm_enough_memory
      0.07 ±  8%       -0.0        0.03 ± 70%  perf-profile.self.cycles-pp.percpu_counter_add_batch
      0.12 ± 10%       -0.0        0.08 ±  4%  perf-profile.self.cycles-pp.__percpu_counter_limited_add
      0.12 ±  4%       -0.0        0.08 ±  8%  perf-profile.self.cycles-pp.folio_add_lru
      0.10 ±  3%       -0.0        0.06 ±  7%  perf-profile.self.cycles-pp.xas_init_marks
      0.10 ±  9%       -0.0        0.07        perf-profile.self.cycles-pp.xas_start
      0.13 ±  3%       -0.0        0.10 ±  5%  perf-profile.self.cycles-pp.filemap_remove_folio
      0.11 ±  6%       -0.0        0.08 ±  6%  perf-profile.self.cycles-pp.__fsnotify_parent
      0.16 ±  4%       -0.0        0.12 ±  4%  perf-profile.self.cycles-pp.free_unref_page_commit
      0.12 ±  4%       -0.0        0.08 ±  4%  perf-profile.self.cycles-pp.__mod_lruvec_state
      0.11 ±  3%       -0.0        0.08 ±  7%  perf-profile.self.cycles-pp.fallocate64
      0.08 ±  6%       -0.0        0.04 ± 45%  perf-profile.self.cycles-pp.__get_file_rcu
      0.11 ±  4%       -0.0        0.08 ±  8%  perf-profile.self.cycles-pp.__folio_cancel_dirty
      0.11 ±  3%       -0.0        0.08 ±  6%  perf-profile.self.cycles-pp._raw_spin_trylock
      0.11 ±  6%       -0.0        0.08 ±  6%  perf-profile.self.cycles-pp.truncate_cleanup_folio
      0.06 ±  7%       -0.0        0.03 ± 70%  perf-profile.self.cycles-pp.__fget_light
      0.17 ±  3%       -0.0        0.14 ±  5%  perf-profile.self.cycles-pp.__list_del_entry_valid_or_report
      0.09 ±  5%       -0.0        0.06 ±  7%  perf-profile.self.cycles-pp.filemap_get_entry
      0.22 ±  6%       -0.0        0.19 ±  3%  perf-profile.self.cycles-pp.page_counter_uncharge
      0.09 ±  7%       -0.0        0.06 ±  6%  perf-profile.self.cycles-pp.get_pfnblock_flags_mask
      0.08 ±  5%       -0.0        0.06 ±  8%  perf-profile.self.cycles-pp.free_unref_page_prepare
      0.07 ± 11%       -0.0        0.04 ± 44%  perf-profile.self.cycles-pp.shmem_is_huge
      0.08 ±  6%       -0.0        0.05 ±  7%  perf-profile.self.cycles-pp.__x64_sys_fallocate
      0.08 ±  7%       -0.0        0.06 ±  6%  perf-profile.self.cycles-pp.entry_SYSRETQ_unsafe_stack
      0.09 ±  7%       -0.0        0.07 ±  7%  perf-profile.self.cycles-pp.filemap_free_folio
      0.08 ±  8%       -0.0        0.06        perf-profile.self.cycles-pp.shmem_alloc_folio
      0.12 ±  6%       -0.0        0.10 ±  6%  perf-profile.self.cycles-pp.try_charge_memcg
      0.07 ±  5%       +0.0        0.09 ±  5%  perf-profile.self.cycles-pp.propagate_protected_usage
      0.24             +0.0        0.27 ±  3%  perf-profile.self.cycles-pp.uncharge_folio
      0.46 ±  4%       +0.4        0.86 ±  5%  perf-profile.self.cycles-pp.mem_cgroup_commit_charge
      1.38 ±  3%       +0.5        1.84 ±  5%  perf-profile.self.cycles-pp.get_mem_cgroup_from_mm
      0.16 ±  3%       +0.7        0.90 ±  2%  perf-profile.self.cycles-pp.__count_memcg_events
      0.28 ±  3%       +0.8        1.06 ±  5%  perf-profile.self.cycles-pp.__mem_cgroup_charge
      1.86 ±  2%       +2.5        4.36        perf-profile.self.cycles-pp.__mod_memcg_lruvec_state

Disclaimer:
Results have been estimated based on internal Intel analysis and are provided for informational purposes only. Any difference in system hardware or software design or configuration may affect actual performance.
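For context, the fallocate2 testcase amounts to each thread repeatedly allocating and then truncating a tmpfs-backed file, which is why the shmem folio charge/uncharge paths dominate the profile above. A minimal sketch of the hot loop, assuming /tmp is tmpfs on the test machine and using a hypothetical allocation size (the real will-it-scale harness additionally pins threads and reports iterations per second):

/*
 * Minimal model of the will-it-scale fallocate2 hot loop (a sketch, not
 * the actual harness): repeatedly allocate and truncate a tmpfs-backed
 * file. Each iteration charges folios to the memcg on fallocate() and
 * uncharges them on ftruncate(), exercising the paths profiled above.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#define ALLOC_SIZE (128 * 1024)	/* assumed size; the real test picks its own */
#define ITERATIONS 1000000L

int main(void)
{
	char path[] = "/tmp/wis-fallocate-XXXXXX";	/* assumes /tmp is tmpfs */
	int fd = mkstemp(path);

	if (fd < 0) {
		perror("mkstemp");
		return 1;
	}
	unlink(path);	/* file lives only as long as the fd does */

	for (long i = 0; i < ITERATIONS; i++) {
		/* kernel side: shmem_fallocate() -> shmem_alloc_and_add_folio() */
		if (fallocate(fd, 0, 0, ALLOC_SIZE) < 0) {
			perror("fallocate");
			break;
		}
		/* kernel side: shmem_setattr() -> shmem_undo_range() */
		if (ftruncate(fd, 0) < 0) {
			perror("ftruncate");
			break;
		}
	}
	close(fd);
	return 0;
}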
On Wed, Nov 22, 2023 at 5:54 AM kernel test robot <oliver.sang@intel.com> wrote:
>
> Hello,
>
> kernel test robot noticed a -30.2% regression of will-it-scale.per_thread_ops on:
>
> commit: c7fbfc7b4e089c4a9b292b1973a42a5761c1342f ("[PATCH v3 3/5] mm: memcg: make stats flushing threshold per-memcg")
> url: https://github.com/intel-lab-lkp/linux/commits/Yosry-Ahmed/mm-memcg-change-flush_next_time-to-flush_last_time/20231116-103300
> base: https://git.kernel.org/cgit/linux/kernel/git/akpm/mm.git mm-everything
> patch link: https://lore.kernel.org/all/20231116022411.2250072-4-yosryahmed@google.com/
> patch subject: [PATCH v3 3/5] mm: memcg: make stats flushing threshold per-memcg
>
> testcase: will-it-scale
> test machine: 104 threads 2 sockets (Skylake) with 192G memory
> parameters:
>
>         nr_task: 50%
>         mode: thread
>         test: fallocate2
>         cpufreq_governor: performance

This regression was also reported in v2, and I explicitly mention it in the
cover letter here:
https://lore.kernel.org/lkml/20231116022411.2250072-1-yosryahmed@google.com/

In a nutshell, I think this microbenchmark regression does not represent real
workloads. On the other hand, there are demonstrated benefits on real
workloads from this series in terms of stats reading time.
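For context, the "stats reading time" weighed here is the latency of a userspace read of a cgroup's memory.stat, since such a read can trigger a stats flush in the kernel. A rough sketch of measuring it follows; the cgroup path is an assumption and depends on your own v2 hierarchy:

/*
 * Hedged sketch (not from the patch set): time one read of a cgroup's
 * memory.stat, the userspace operation whose latency this series aims
 * to keep bounded. Adjust the path for your cgroup hierarchy.
 */
#include <fcntl.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

int main(void)
{
	char buf[16384];
	struct timespec t0, t1;
	int fd = open("/sys/fs/cgroup/system.slice/memory.stat", O_RDONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}

	clock_gettime(CLOCK_MONOTONIC, &t0);
	if (read(fd, buf, sizeof(buf)) < 0)	/* may flush memcg stats in-kernel */
		perror("read");
	clock_gettime(CLOCK_MONOTONIC, &t1);
	close(fd);

	printf("memory.stat read took %ld ns\n",
	       (long)(t1.tv_sec - t0.tv_sec) * 1000000000L +
	       (t1.tv_nsec - t0.tv_nsec));
	return 0;
}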
hi, Yosry Ahmed,

On Mon, Nov 27, 2023 at 01:13:44PM -0800, Yosry Ahmed wrote:
> On Wed, Nov 22, 2023 at 5:54 AM kernel test robot <oliver.sang@intel.com> wrote:
> >
> > Hello,
> >
> > kernel test robot noticed a -30.2% regression of will-it-scale.per_thread_ops on:
> >
> > commit: c7fbfc7b4e089c4a9b292b1973a42a5761c1342f ("[PATCH v3 3/5] mm: memcg: make stats flushing threshold per-memcg")
> > url: https://github.com/intel-lab-lkp/linux/commits/Yosry-Ahmed/mm-memcg-change-flush_next_time-to-flush_last_time/20231116-103300
> > base: https://git.kernel.org/cgit/linux/kernel/git/akpm/mm.git mm-everything
> > patch link: https://lore.kernel.org/all/20231116022411.2250072-4-yosryahmed@google.com/
> > patch subject: [PATCH v3 3/5] mm: memcg: make stats flushing threshold per-memcg
> >
> > testcase: will-it-scale
> > test machine: 104 threads 2 sockets (Skylake) with 192G memory
> > parameters:
> >
> >         nr_task: 50%
> >         mode: thread
> >         test: fallocate2
> >         cpufreq_governor: performance
>
> This regression was also reported in v2, and I explicitly mention it
> in the cover letter here:
> https://lore.kernel.org/lkml/20231116022411.2250072-1-yosryahmed@google.com/

got it. this also reminds us to read the cover letter of a patch set in the
future. Thanks!

> In a nutshell, I think this microbenchmark regression does not
> represent real workloads. On the other hand, there are demonstrated
> benefits on real workloads from this series in terms of stats reading
> time.

ok, if there are future versions of this patch, or when it is merged, we will
ignore similar results.

just a small question: since we focus on microbenchmarks, if we find other
regressions (or improvements) on tests other than will-it-scale::fallocate,
do you want us to send a report, or just ignore them?
On Mon, Nov 27, 2023 at 5:46 PM Oliver Sang <oliver.sang@intel.com> wrote:
>
> hi, Yosry Ahmed,
>
> On Mon, Nov 27, 2023 at 01:13:44PM -0800, Yosry Ahmed wrote:
> > On Wed, Nov 22, 2023 at 5:54 AM kernel test robot <oliver.sang@intel.com> wrote:
> > >
> > > Hello,
> > >
> > > kernel test robot noticed a -30.2% regression of will-it-scale.per_thread_ops on:
> > >
> > > commit: c7fbfc7b4e089c4a9b292b1973a42a5761c1342f ("[PATCH v3 3/5] mm: memcg: make stats flushing threshold per-memcg")
> > > url: https://github.com/intel-lab-lkp/linux/commits/Yosry-Ahmed/mm-memcg-change-flush_next_time-to-flush_last_time/20231116-103300
> > > base: https://git.kernel.org/cgit/linux/kernel/git/akpm/mm.git mm-everything
> > > patch link: https://lore.kernel.org/all/20231116022411.2250072-4-yosryahmed@google.com/
> > > patch subject: [PATCH v3 3/5] mm: memcg: make stats flushing threshold per-memcg
> > >
> > > testcase: will-it-scale
> > > test machine: 104 threads 2 sockets (Skylake) with 192G memory
> > > parameters:
> > >
> > >         nr_task: 50%
> > >         mode: thread
> > >         test: fallocate2
> > >         cpufreq_governor: performance
> >
> > This regression was also reported in v2, and I explicitly mention it
> > in the cover letter here:
> > https://lore.kernel.org/lkml/20231116022411.2250072-1-yosryahmed@google.com/
>
> got it. this also reminds us to read the cover letter of a patch set in the
> future. Thanks!
>
> > In a nutshell, I think this microbenchmark regression does not
> > represent real workloads. On the other hand, there are demonstrated
> > benefits on real workloads from this series in terms of stats reading
> > time.
>
> ok, if there are future versions of this patch, or when it is merged, we will
> ignore similar results.
>
> just a small question: since we focus on microbenchmarks, if we find other
> regressions (or improvements) on tests other than will-it-scale::fallocate,
> do you want us to send a report, or just ignore them?

I think it would be useful to know if there are regressions/improvements in
other microbenchmarks, at least to investigate whether they represent real
regressions.
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 5ae2a8f04be45..74db05237775d 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -630,6 +630,9 @@ struct memcg_vmstats_percpu {
 	/* Cgroup1: threshold notifications & softlimit tree updates */
 	unsigned long nr_page_events;
 	unsigned long targets[MEM_CGROUP_NTARGETS];
+
+	/* Stats updates since the last flush */
+	unsigned int stats_updates;
 };

 struct memcg_vmstats {
@@ -644,6 +647,9 @@ struct memcg_vmstats {
 	/* Pending child counts during tree propagation */
 	long state_pending[MEMCG_NR_STAT];
 	unsigned long events_pending[NR_MEMCG_EVENTS];
+
+	/* Stats updates since the last flush */
+	atomic64_t stats_updates;
 };

 /*
@@ -663,9 +669,7 @@ struct memcg_vmstats {
  */
 static void flush_memcg_stats_dwork(struct work_struct *w);
 static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork);
-static DEFINE_PER_CPU(unsigned int, stats_updates);
 static atomic_t stats_flush_ongoing = ATOMIC_INIT(0);
-static atomic_t stats_flush_threshold = ATOMIC_INIT(0);
 static u64 flush_last_time;

 #define FLUSH_TIME (2UL*HZ)
@@ -692,26 +696,37 @@ static void memcg_stats_unlock(void)
 	preempt_enable_nested();
 }

+static bool memcg_should_flush_stats(struct mem_cgroup *memcg)
+{
+	return atomic64_read(&memcg->vmstats->stats_updates) >
+		MEMCG_CHARGE_BATCH * num_online_cpus();
+}
+
 static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
 {
+	int cpu = smp_processor_id();
 	unsigned int x;

 	if (!val)
 		return;

-	cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id());
+	cgroup_rstat_updated(memcg->css.cgroup, cpu);
+
+	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
+		x = __this_cpu_add_return(memcg->vmstats_percpu->stats_updates,
+					  abs(val));
+
+		if (x < MEMCG_CHARGE_BATCH)
+			continue;

-	x = __this_cpu_add_return(stats_updates, abs(val));
-	if (x > MEMCG_CHARGE_BATCH) {
 		/*
-		 * If stats_flush_threshold exceeds the threshold
-		 * (>num_online_cpus()), cgroup stats update will be triggered
-		 * in __mem_cgroup_flush_stats(). Increasing this var further
-		 * is redundant and simply adds overhead in atomic update.
+		 * If @memcg is already flush-able, increasing stats_updates is
+		 * redundant. Avoid the overhead of the atomic update.
 		 */
-		if (atomic_read(&stats_flush_threshold) <= num_online_cpus())
-			atomic_add(x / MEMCG_CHARGE_BATCH, &stats_flush_threshold);
-		__this_cpu_write(stats_updates, 0);
+		if (!memcg_should_flush_stats(memcg))
+			atomic64_add(x, &memcg->vmstats->stats_updates);
+		__this_cpu_write(memcg->vmstats_percpu->stats_updates, 0);
 	}
 }

@@ -730,13 +745,12 @@ static void do_flush_stats(void)

 	cgroup_rstat_flush(root_mem_cgroup->css.cgroup);

-	atomic_set(&stats_flush_threshold, 0);
 	atomic_set(&stats_flush_ongoing, 0);
 }

 void mem_cgroup_flush_stats(void)
 {
-	if (atomic_read(&stats_flush_threshold) > num_online_cpus())
+	if (memcg_should_flush_stats(root_mem_cgroup))
 		do_flush_stats();
 }

@@ -750,8 +764,8 @@ void mem_cgroup_flush_stats_ratelimited(void)
 static void flush_memcg_stats_dwork(struct work_struct *w)
 {
 	/*
-	 * Always flush here so that flushing in latency-sensitive paths is
-	 * as cheap as possible.
+	 * Deliberately ignore memcg_should_flush_stats() here so that flushing
+	 * in latency-sensitive paths is as cheap as possible.
 	 */
 	do_flush_stats();
 	queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME);
@@ -5784,6 +5798,10 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
 			}
 		}
 	}
+	statc->stats_updates = 0;
+	/* We are in a per-cpu loop here, only do the atomic write once */
+	if (atomic64_read(&memcg->vmstats->stats_updates))
+		atomic64_set(&memcg->vmstats->stats_updates, 0);
 }

 #ifdef CONFIG_MMU
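To summarize the mechanism for readers skimming the diff: the patch replaces the single global stats_updates counter with a per-memcg counter that is batched per CPU and propagated up the cgroup hierarchy, and a memcg becomes flush-able once its pending updates exceed MEMCG_CHARGE_BATCH * num_online_cpus(). That hierarchical walk in the hot update path is also where the robot's profile shows the added cost (e.g. __mod_memcg_lruvec_state self time growing from ~1.86% to ~4.36%). Below is a simplified userspace model of the pattern, with plain arrays and C11 atomics standing in for the kernel's per-CPU and rstat machinery, and BATCH/NCPUS as stand-ins for MEMCG_CHARGE_BATCH and num_online_cpus(); it is a sketch of the idea, not kernel code:

/*
 * Userspace model of the per-memcg flushing threshold: each CPU batches
 * updates locally and only touches the shared atomic once per BATCH,
 * walking up the hierarchy exactly as memcg_rstat_updated() does.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdlib.h>

#define BATCH 64	/* stands in for MEMCG_CHARGE_BATCH */
#define NCPUS 8		/* stands in for num_online_cpus() */

struct memcg {
	struct memcg *parent;
	atomic_long stats_updates;	/* hierarchy-wide pending updates */
	long percpu_updates[NCPUS];	/* per-CPU pending batch */
};

/* Mirrors memcg_should_flush_stats(). */
static bool should_flush(struct memcg *memcg)
{
	return atomic_load(&memcg->stats_updates) > (long)BATCH * NCPUS;
}

/* Mirrors memcg_rstat_updated(): batch per CPU, propagate to ancestors. */
static void stats_updated(struct memcg *memcg, int cpu, long val)
{
	for (; memcg; memcg = memcg->parent) {
		long x = (memcg->percpu_updates[cpu] += labs(val));

		if (x < BATCH)
			continue;
		/* Once already flush-able, skip the redundant atomic add. */
		if (!should_flush(memcg))
			atomic_fetch_add(&memcg->stats_updates, x);
		memcg->percpu_updates[cpu] = 0;
	}
}

int main(void)
{
	struct memcg root = { 0 };
	struct memcg child = { .parent = &root };

	/* 100k single-page updates from rotating CPUs cross the threshold. */
	for (int i = 0; i < 100000; i++)
		stats_updated(&child, i % NCPUS, 1);

	return should_flush(&root) ? 0 : 1;	/* root saw the updates too */
}

The design tradeoff the thread is debating is visible in the loop: every stats update now walks all ancestors and dirties their per-CPU cachelines, which a tight microbenchmark like fallocate2 feels acutely, while readers gain the ability to skip flushing subtrees whose pending updates are below the threshold.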