[v3,2/2] kernel/fork: group allocation/free of per-cpu counters for mm struct

Message ID 20230823050609.2228718-3-mjguzik@gmail.com (mailing list archive)
State New
Series execve scalability issues, part 1

Commit Message

Mateusz Guzik Aug. 23, 2023, 5:06 a.m. UTC
A trivial execve scalability test which tries to be very friendly
(statically linked binaries, all separate) is predominantly bottlenecked
by back-to-back per-cpu counter allocations which serialize on global
locks.

Ease the pain by allocating and freeing them in one go.

Bench can be found here:
http://apollo.backplane.com/DFlyMisc/doexec.c

$ cc -static -O2 -o static-doexec doexec.c
$ ./static-doexec $(nproc)

Even at a very modest scale of 26 cores (ops/s):
before:	133543.63
after:	186061.81 (+39%)

Even with the patch these allocations remain a significant problem, but
the primary bottleneck shifts to page release handling.

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
---
 kernel/fork.c | 14 +++-----------
 1 file changed, 3 insertions(+), 11 deletions(-)
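
At a glance, the change replaces a per-counter init loop with one grouped
call; a minimal sketch of the two forms, using the field and constant names
from the patch (percpu_counter_init_many() is the group API introduced in
patch 1/2 of this series):

	/* Before: one percpu allocation per counter; each call takes
	 * the percpu allocator's global locks in turn. */
	for (i = 0; i < NR_MM_COUNTERS; i++)
		if (percpu_counter_init(&mm->rss_stat[i], 0, GFP_KERNEL_ACCOUNT))
			goto fail_pcpu;

	/* After: one call, one percpu allocation backing the whole
	 * rss_stat array. */
	if (percpu_counter_init_many(mm->rss_stat, 0, GFP_KERNEL_ACCOUNT,
				     NR_MM_COUNTERS))
		goto fail_pcpu;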

Comments

Dennis Zhou Aug. 24, 2023, 6:28 a.m. UTC | #1
On Wed, Aug 23, 2023 at 07:06:09AM +0200, Mateusz Guzik wrote:
> A trivial execve scalability test which tries to be very friendly
> (statically linked binaries, all separate) is predominantly bottlenecked
> by back-to-back per-cpu counter allocations which serialize on global
> locks.
> 
> Ease the pain by allocating and freeing them in one go.
> 
> Bench can be found here:
> http://apollo.backplane.com/DFlyMisc/doexec.c
> 
> $ cc -static -O2 -o static-doexec doexec.c
> $ ./static-doexec $(nproc)
> 
> Even at a very modest scale of 26 cores (ops/s):
> before:	133543.63
> after:	186061.81 (+39%)
> 
> Even with the patch these allocations remain a significant problem, but
> the primary bottleneck shifts to page release handling.
> 
> Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>

Same message as for 1/2. I'm happy with this, just a minor reflow. I'll
take this for-6.6 unless there are other comments / objections to that.

I'll run a few tests myself too tomorrow just for validation.

Reviewed-by: Dennis Zhou <dennis@kernel.org>

Thanks,
Dennis

> ---
>  kernel/fork.c | 14 +++-----------
>  1 file changed, 3 insertions(+), 11 deletions(-)
> 
> diff --git a/kernel/fork.c b/kernel/fork.c
> index d2e12b6d2b18..4f0ada33457e 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -909,8 +909,6 @@ static void cleanup_lazy_tlbs(struct mm_struct *mm)
>   */
>  void __mmdrop(struct mm_struct *mm)
>  {
> -	int i;
> -
>  	BUG_ON(mm == &init_mm);
>  	WARN_ON_ONCE(mm == current->mm);
>  
> @@ -925,9 +923,8 @@ void __mmdrop(struct mm_struct *mm)
>  	put_user_ns(mm->user_ns);
>  	mm_pasid_drop(mm);
>  	mm_destroy_cid(mm);
> +	percpu_counter_destroy_many(mm->rss_stat, NR_MM_COUNTERS);
>  
> -	for (i = 0; i < NR_MM_COUNTERS; i++)
> -		percpu_counter_destroy(&mm->rss_stat[i]);
>  	free_mm(mm);
>  }
>  EXPORT_SYMBOL_GPL(__mmdrop);
> @@ -1252,8 +1249,6 @@ static void mm_init_uprobes_state(struct mm_struct *mm)
>  static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
>  	struct user_namespace *user_ns)
>  {
> -	int i;
> -
>  	mt_init_flags(&mm->mm_mt, MM_MT_FLAGS);
>  	mt_set_external_lock(&mm->mm_mt, &mm->mmap_lock);
>  	atomic_set(&mm->mm_users, 1);
> @@ -1301,17 +1296,14 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
>  	if (mm_alloc_cid(mm))
>  		goto fail_cid;
>  
> -	for (i = 0; i < NR_MM_COUNTERS; i++)
> -		if (percpu_counter_init(&mm->rss_stat[i], 0, GFP_KERNEL_ACCOUNT))
> -			goto fail_pcpu;
> +	if (percpu_counter_init_many(mm->rss_stat, 0, GFP_KERNEL_ACCOUNT, NR_MM_COUNTERS))
> +		goto fail_pcpu;
>  
>  	mm->user_ns = get_user_ns(user_ns);
>  	lru_gen_init_mm(mm);
>  	return mm;
>  
>  fail_pcpu:
> -	while (i > 0)
> -		percpu_counter_destroy(&mm->rss_stat[--i]);
>  	mm_destroy_cid(mm);
>  fail_cid:
>  	destroy_context(mm);
> -- 
> 2.41.0
>
kernel test robot Sept. 6, 2023, 8:25 a.m. UTC | #2
Hello,

kernel test robot noticed a -8.2% improvement of phoronix-test-suite.osbench.LaunchPrograms.us_per_event on:


commit: 9d32938c115580bfff128a926d704199d2f33ba3 ("[PATCH v3 2/2] kernel/fork: group allocation/free of per-cpu counters for mm struct")
url: https://github.com/intel-lab-lkp/linux/commits/Mateusz-Guzik/pcpcntr-add-group-allocation-free/20230823-130803
base: https://git.kernel.org/cgit/linux/kernel/git/dennis/percpu.git for-next
patch link: https://lore.kernel.org/all/20230823050609.2228718-3-mjguzik@gmail.com/
patch subject: [PATCH v3 2/2] kernel/fork: group allocation/free of per-cpu counters for mm struct

testcase: phoronix-test-suite
test machine: 96 threads 2 sockets Intel(R) Xeon(R) Gold 6252 CPU @ 2.10GHz (Cascade Lake) with 512G memory
parameters:

	test: osbench-1.0.2
	option_a: Launch Programs
	cpufreq_governor: performance

Details are as below:
-------------------------------------------------------------------------------------------------->


The kernel config and materials to reproduce are available at:
https://download.01.org/0day-ci/archive/20230906/202309061504.7e645826-oliver.sang@intel.com

=========================================================================================
compiler/cpufreq_governor/kconfig/option_a/rootfs/tbox_group/test/testcase:
  gcc-12/performance/x86_64-rhel-8.3/Launch Programs/debian-x86_64-phoronix/lkp-csl-2sp7/osbench-1.0.2/phoronix-test-suite

commit: 
  1db50472c8 ("pcpcntr: add group allocation/free")
  9d32938c11 ("kernel/fork: group allocation/free of per-cpu counters for mm struct")

1db50472c8bc1d34 9d32938c115580bfff128a926d7 
---------------- --------------------------- 
         %stddev     %change         %stddev
             \          |                \  
      3.00           +33.3%       4.00        vmstat.procs.r
     14111            +5.7%      14918        vmstat.system.cs
      2114            +1.1%       2136        turbostat.Bzy_MHz
      1.67            +0.2        1.83        turbostat.C1E%
    121.98            +5.1%     128.24        turbostat.PkgWatt
     98.05            -8.2%      90.02        phoronix-test-suite.osbench.LaunchPrograms.us_per_event
     16246 ±  4%      +6.1%      17243        phoronix-test-suite.time.involuntary_context_switches
   9791476            +9.2%   10689455        phoronix-test-suite.time.minor_page_faults
    311.33            +9.3%     340.33        phoronix-test-suite.time.percent_of_cpu_this_job_got
     83.40 ±  2%      +9.2%      91.07 ±  2%  phoronix-test-suite.time.system_time
    151333            +8.6%     164355        phoronix-test-suite.time.voluntary_context_switches
      3225            -5.5%       3046 ±  5%  proc-vmstat.nr_page_table_pages
   9150454            +8.0%    9884178        proc-vmstat.numa_hit
   9088660            +8.7%    9882518        proc-vmstat.numa_local
   9971116            +8.3%   10802925        proc-vmstat.pgalloc_normal
  10202032            +8.8%   11099649        proc-vmstat.pgfault
   9845338            +8.4%   10676360        proc-vmstat.pgfree
    207049           +10.3%     228380 ±  8%  proc-vmstat.pgreuse
 1.947e+09            +5.0%  2.045e+09        perf-stat.i.branch-instructions
  52304206            +4.4%   54610501        perf-stat.i.branch-misses
      9.06 ±  2%      +0.5        9.52        perf-stat.i.cache-miss-rate%
  19663522 ±  3%     +10.0%   21634645        perf-stat.i.cache-misses
 1.658e+08            +3.6%  1.717e+08        perf-stat.i.cache-references
     14769            +6.2%      15691        perf-stat.i.context-switches
 1.338e+10            +6.2%   1.42e+10        perf-stat.i.cpu-cycles
   3112873 ±  3%     -12.5%    2724690 ±  3%  perf-stat.i.dTLB-load-misses
 2.396e+09            +5.5%  2.528e+09        perf-stat.i.dTLB-loads
      0.11 ±  4%      -0.0        0.10 ±  2%  perf-stat.i.dTLB-store-miss-rate%
   1003394 ±  6%     -14.0%     862768 ±  5%  perf-stat.i.dTLB-store-misses
  1.25e+09            +6.0%  1.325e+09        perf-stat.i.dTLB-stores
     71.16            -1.3       69.88        perf-stat.i.iTLB-load-miss-rate%
   1872082            +8.2%    2025999        perf-stat.i.iTLB-loads
 9.606e+09            +5.4%  1.012e+10        perf-stat.i.instructions
     23.37 ±  5%     +30.6%      30.53 ±  4%  perf-stat.i.major-faults
      0.14            +6.2%       0.15        perf-stat.i.metric.GHz
     59.39            +5.4%      62.61        perf-stat.i.metric.M/sec
    249517           +10.0%     274572        perf-stat.i.minor-faults
   5081285            +6.0%    5385686 ±  4%  perf-stat.i.node-load-misses
    565117 ±  3%      +8.1%     610682 ±  3%  perf-stat.i.node-loads
    249541           +10.0%     274602        perf-stat.i.page-faults
     17.27            -1.7%      16.98        perf-stat.overall.MPKI
     11.85 ±  2%      +0.7       12.59        perf-stat.overall.cache-miss-rate%
      0.13 ±  2%      -0.0        0.11 ±  2%  perf-stat.overall.dTLB-load-miss-rate%
      0.08 ±  7%      -0.0        0.07 ±  4%  perf-stat.overall.dTLB-store-miss-rate%
     67.26            -1.1       66.12        perf-stat.overall.iTLB-load-miss-rate%
 1.895e+09            +5.0%   1.99e+09        perf-stat.ps.branch-instructions
  50921385            +4.4%   53146828        perf-stat.ps.branch-misses
  19140130 ±  3%     +10.0%   21047707        perf-stat.ps.cache-misses
 1.615e+08            +3.5%  1.672e+08        perf-stat.ps.cache-references
     14376            +6.2%      15266        perf-stat.ps.context-switches
 1.303e+10            +6.1%  1.383e+10        perf-stat.ps.cpu-cycles
   3033019 ±  3%     -12.5%    2654269 ±  3%  perf-stat.ps.dTLB-load-misses
 2.332e+09            +5.5%   2.46e+09        perf-stat.ps.dTLB-loads
    976773 ±  6%     -14.1%     839517 ±  5%  perf-stat.ps.dTLB-store-misses
 1.217e+09            +6.0%  1.289e+09        perf-stat.ps.dTLB-stores
   1822198            +8.2%    1971115        perf-stat.ps.iTLB-loads
 9.349e+09            +5.3%  9.846e+09        perf-stat.ps.instructions
     22.75 ±  5%     +30.5%      29.69 ±  4%  perf-stat.ps.major-faults
    242831           +10.0%     267074        perf-stat.ps.minor-faults
   4945101            +5.9%    5238638 ±  4%  perf-stat.ps.node-load-misses
    550029 ±  3%      +8.0%     594116 ±  3%  perf-stat.ps.node-loads
    242854           +10.0%     267104        perf-stat.ps.page-faults
 3.719e+11            +4.4%  3.883e+11        perf-stat.total.instructions

Disclaimer:
Results have been estimated based on internal Intel analysis and are provided
for informational purposes only. Any difference in system hardware or software
design or configuration may affect actual performance.
Patch

diff --git a/kernel/fork.c b/kernel/fork.c
index d2e12b6d2b18..4f0ada33457e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -909,8 +909,6 @@ static void cleanup_lazy_tlbs(struct mm_struct *mm)
  */
 void __mmdrop(struct mm_struct *mm)
 {
-	int i;
-
 	BUG_ON(mm == &init_mm);
 	WARN_ON_ONCE(mm == current->mm);
 
@@ -925,9 +923,8 @@ void __mmdrop(struct mm_struct *mm)
 	put_user_ns(mm->user_ns);
 	mm_pasid_drop(mm);
 	mm_destroy_cid(mm);
+	percpu_counter_destroy_many(mm->rss_stat, NR_MM_COUNTERS);
 
-	for (i = 0; i < NR_MM_COUNTERS; i++)
-		percpu_counter_destroy(&mm->rss_stat[i]);
 	free_mm(mm);
 }
 EXPORT_SYMBOL_GPL(__mmdrop);
@@ -1252,8 +1249,6 @@ static void mm_init_uprobes_state(struct mm_struct *mm)
 static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 	struct user_namespace *user_ns)
 {
-	int i;
-
 	mt_init_flags(&mm->mm_mt, MM_MT_FLAGS);
 	mt_set_external_lock(&mm->mm_mt, &mm->mmap_lock);
 	atomic_set(&mm->mm_users, 1);
@@ -1301,17 +1296,14 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 	if (mm_alloc_cid(mm))
 		goto fail_cid;
 
-	for (i = 0; i < NR_MM_COUNTERS; i++)
-		if (percpu_counter_init(&mm->rss_stat[i], 0, GFP_KERNEL_ACCOUNT))
-			goto fail_pcpu;
+	if (percpu_counter_init_many(mm->rss_stat, 0, GFP_KERNEL_ACCOUNT, NR_MM_COUNTERS))
+		goto fail_pcpu;
 
 	mm->user_ns = get_user_ns(user_ns);
 	lru_gen_init_mm(mm);
 	return mm;
 
 fail_pcpu:
-	while (i > 0)
-		percpu_counter_destroy(&mm->rss_stat[--i]);
 	mm_destroy_cid(mm);
 fail_cid:
 	destroy_context(mm);
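
One point implicit in the diff: the unwinding loop at fail_pcpu could be
dropped because the grouped init draws the per-CPU storage for the entire
array from a single allocation, so a failure leaves no counter initialized
and there is nothing to unwind. A self-contained sketch of the same pattern
for a hypothetical structure (struct foo and its helpers are illustrative,
not from the patch):

	#include <linux/percpu_counter.h>
	#include <linux/slab.h>

	#define NR_FOO_COUNTERS 4

	struct foo {
		struct percpu_counter stat[NR_FOO_COUNTERS];
	};

	static struct foo *foo_alloc(void)
	{
		struct foo *f = kzalloc(sizeof(*f), GFP_KERNEL);

		if (!f)
			return NULL;
		/* One percpu allocation backs all four counters; on
		 * failure nothing is initialized, so only kfree() is
		 * needed here. */
		if (percpu_counter_init_many(f->stat, 0, GFP_KERNEL,
					     NR_FOO_COUNTERS)) {
			kfree(f);
			return NULL;
		}
		return f;
	}

	static void foo_free(struct foo *f)
	{
		percpu_counter_destroy_many(f->stat, NR_FOO_COUNTERS);
		kfree(f);
	}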