diff mbox series

[PATCHv3] arm64/mm: save memory access in check_and_switch_context() fast switch path

Message ID 1594389852-19949-1-git-send-email-kernelfans@gmail.com (mailing list archive)
State Mainlined
Commit c4885bbb3afee80f41d39a33e49881a18e500f47
Headers show
Series [PATCHv3] arm64/mm: save memory access in check_and_switch_context() fast switch path | expand

Commit Message

Pingfan Liu July 10, 2020, 2:04 p.m. UTC
On arm64, smp_processor_id() reads a per-cpu `cpu_number` variable,
using the per-cpu offset stored in the tpidr_el1 system register. In
some cases we generate a per-cpu address with a sequence like:

  cpu_ptr = &per_cpu(ptr, smp_processor_id());

Which potentially incurs a cache miss for both `cpu_number` and the
in-memory `__per_cpu_offset` array. This can be written more optimally
as:

  cpu_ptr = this_cpu_ptr(ptr);

Which only needs the offset from tpidr_el1, and does not need to
load from memory.

The following two test cases show a small performance improvement measured
on a 46-CPU Qualcomm machine with a 5.8.0-rc4 kernel.

Test 1: (about 0.3% improvement)
    #cat b.sh
    make clean && make all -j138
    #perf stat --repeat 10 --null --sync sh b.sh

    - before this patch
     Performance counter stats for 'sh b.sh' (10 runs):

                298.62 +- 1.86 seconds time elapsed  ( +-  0.62% )

    - after this patch
     Performance counter stats for 'sh b.sh' (10 runs):

               297.734 +- 0.954 seconds time elapsed  ( +-  0.32% )

Test 2: (about 1.69% improvement)
     'perf stat -r 10 perf bench sched messaging'
        Then sum the total time of 'sched/messaging' manually.

    - before this patch
      total 0.707 sec for 10 times
    - after this patch
      total 0.695 sec for 10 times

Signed-off-by: Pingfan Liu <kernelfans@gmail.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Steve Capper <steve.capper@arm.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Vladimir Murzin <vladimir.murzin@arm.com>
Cc: Jean-Philippe Brucker <jean-philippe@linaro.org>
To: linux-arm-kernel@lists.infradead.org
---
v2 -> v3: improve commit log with performance result
 arch/arm64/include/asm/mmu_context.h |  6 ++----
 arch/arm64/mm/context.c              | 10 ++++++----
 2 files changed, 8 insertions(+), 8 deletions(-)

Comments

Mark Rutland July 30, 2020, 11:40 a.m. UTC | #1
On Fri, Jul 10, 2020 at 10:04:12PM +0800, Pingfan Liu wrote:
> On arm64, smp_processor_id() reads a per-cpu `cpu_number` variable,
> using the per-cpu offset stored in the tpidr_el1 system register. In
> some cases we generate a per-cpu address with a sequence like:
> 
>   cpu_ptr = &per_cpu(ptr, smp_processor_id());
> 
> Which potentially incurs a cache miss for both `cpu_number` and the
> in-memory `__per_cpu_offset` array. This can be written more optimally
> as:
> 
>   cpu_ptr = this_cpu_ptr(ptr);
> 
> Which only needs the offset from tpidr_el1, and does not need to
> load from memory.
> 
> The following two test cases show a small performance improvement measured
> on a 46-cpus qualcomm machine with 5.8.0-rc4 kernel.
> 
> Test 1: (about 0.3% improvement)
>     #cat b.sh
>     make clean && make all -j138
>     #perf stat --repeat 10 --null --sync sh b.sh
> 
>     - before this patch
>      Performance counter stats for 'sh b.sh' (10 runs):
> 
>                 298.62 +- 1.86 seconds time elapsed  ( +-  0.62% )
> 
>     - after this patch
>      Performance counter stats for 'sh b.sh' (10 runs):
> 
>                297.734 +- 0.954 seconds time elapsed  ( +-  0.32% )
> 
> Test 2: (about 1.69% improvement)
>      'perf stat -r 10 perf bench sched messaging'
>         Then sum the total time of 'sched/messaging' by manual.
> 
>     - before this patch
>       total 0.707 sec for 10 times
>     - after this patch
>       totol 0.695 sec for 10 times
> 
> Signed-off-by: Pingfan Liu <kernelfans@gmail.com>
> Cc: Catalin Marinas <catalin.marinas@arm.com>
> Cc: Will Deacon <will@kernel.org>
> Cc: Steve Capper <steve.capper@arm.com>
> Cc: Mark Rutland <mark.rutland@arm.com>
> Cc: Vladimir Murzin <vladimir.murzin@arm.com>
> Cc: Jean-Philippe Brucker <jean-philippe@linaro.org>
> To: linux-arm-kernel@lists.infradead.org

The patch looks sound, so FWIW:

Acked-by: Mark Rutland <mark.rutland@arm.com>

... I'll leave it to Catalin and Will to decide whether to pick this up.

Mark.

> ---
> v2 -> v3: improve commit log with performance result
>  arch/arm64/include/asm/mmu_context.h |  6 ++----
>  arch/arm64/mm/context.c              | 10 ++++++----
>  2 files changed, 8 insertions(+), 8 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h
> index ab46187..808c3be 100644
> --- a/arch/arm64/include/asm/mmu_context.h
> +++ b/arch/arm64/include/asm/mmu_context.h
> @@ -175,7 +175,7 @@ static inline void cpu_replace_ttbr1(pgd_t *pgdp)
>   * take CPU migration into account.
>   */
>  #define destroy_context(mm)		do { } while(0)
> -void check_and_switch_context(struct mm_struct *mm, unsigned int cpu);
> +void check_and_switch_context(struct mm_struct *mm);
>  
>  #define init_new_context(tsk,mm)	({ atomic64_set(&(mm)->context.id, 0); 0; })
>  
> @@ -214,8 +214,6 @@ enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
>  
>  static inline void __switch_mm(struct mm_struct *next)
>  {
> -	unsigned int cpu = smp_processor_id();
> -
>  	/*
>  	 * init_mm.pgd does not contain any user mappings and it is always
>  	 * active for kernel addresses in TTBR1. Just set the reserved TTBR0.
> @@ -225,7 +223,7 @@ static inline void __switch_mm(struct mm_struct *next)
>  		return;
>  	}
>  
> -	check_and_switch_context(next, cpu);
> +	check_and_switch_context(next);
>  }
>  
>  static inline void
> diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c
> index d702d60..a206655 100644
> --- a/arch/arm64/mm/context.c
> +++ b/arch/arm64/mm/context.c
> @@ -198,9 +198,10 @@ static u64 new_context(struct mm_struct *mm)
>  	return idx2asid(asid) | generation;
>  }
>  
> -void check_and_switch_context(struct mm_struct *mm, unsigned int cpu)
> +void check_and_switch_context(struct mm_struct *mm)
>  {
>  	unsigned long flags;
> +	unsigned int cpu;
>  	u64 asid, old_active_asid;
>  
>  	if (system_supports_cnp())
> @@ -222,9 +223,9 @@ void check_and_switch_context(struct mm_struct *mm, unsigned int cpu)
>  	 *   relaxed xchg in flush_context will treat us as reserved
>  	 *   because atomic RmWs are totally ordered for a given location.
>  	 */
> -	old_active_asid = atomic64_read(&per_cpu(active_asids, cpu));
> +	old_active_asid = atomic64_read(this_cpu_ptr(&active_asids));
>  	if (old_active_asid && asid_gen_match(asid) &&
> -	    atomic64_cmpxchg_relaxed(&per_cpu(active_asids, cpu),
> +	    atomic64_cmpxchg_relaxed(this_cpu_ptr(&active_asids),
>  				     old_active_asid, asid))
>  		goto switch_mm_fastpath;
>  
> @@ -236,10 +237,11 @@ void check_and_switch_context(struct mm_struct *mm, unsigned int cpu)
>  		atomic64_set(&mm->context.id, asid);
>  	}
>  
> +	cpu = smp_processor_id();
>  	if (cpumask_test_and_clear_cpu(cpu, &tlb_flush_pending))
>  		local_flush_tlb_all();
>  
> -	atomic64_set(&per_cpu(active_asids, cpu), asid);
> +	atomic64_set(this_cpu_ptr(&active_asids), asid);
>  	raw_spin_unlock_irqrestore(&cpu_asid_lock, flags);
>  
>  switch_mm_fastpath:
> -- 
> 2.7.5
>
diff mbox series

Patch

diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h
index ab46187..808c3be 100644
--- a/arch/arm64/include/asm/mmu_context.h
+++ b/arch/arm64/include/asm/mmu_context.h
@@ -175,7 +175,7 @@  static inline void cpu_replace_ttbr1(pgd_t *pgdp)
  * take CPU migration into account.
  */
 #define destroy_context(mm)		do { } while(0)
-void check_and_switch_context(struct mm_struct *mm, unsigned int cpu);
+void check_and_switch_context(struct mm_struct *mm);
 
 #define init_new_context(tsk,mm)	({ atomic64_set(&(mm)->context.id, 0); 0; })
 
@@ -214,8 +214,6 @@  enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
 
 static inline void __switch_mm(struct mm_struct *next)
 {
-	unsigned int cpu = smp_processor_id();
-
 	/*
 	 * init_mm.pgd does not contain any user mappings and it is always
 	 * active for kernel addresses in TTBR1. Just set the reserved TTBR0.
@@ -225,7 +223,7 @@  static inline void __switch_mm(struct mm_struct *next)
 		return;
 	}
 
-	check_and_switch_context(next, cpu);
+	check_and_switch_context(next);
 }
 
 static inline void
diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c
index d702d60..a206655 100644
--- a/arch/arm64/mm/context.c
+++ b/arch/arm64/mm/context.c
@@ -198,9 +198,10 @@  static u64 new_context(struct mm_struct *mm)
 	return idx2asid(asid) | generation;
 }
 
-void check_and_switch_context(struct mm_struct *mm, unsigned int cpu)
+void check_and_switch_context(struct mm_struct *mm)
 {
 	unsigned long flags;
+	unsigned int cpu;
 	u64 asid, old_active_asid;
 
 	if (system_supports_cnp())
@@ -222,9 +223,9 @@  void check_and_switch_context(struct mm_struct *mm, unsigned int cpu)
 	 *   relaxed xchg in flush_context will treat us as reserved
 	 *   because atomic RmWs are totally ordered for a given location.
 	 */
-	old_active_asid = atomic64_read(&per_cpu(active_asids, cpu));
+	old_active_asid = atomic64_read(this_cpu_ptr(&active_asids));
 	if (old_active_asid && asid_gen_match(asid) &&
-	    atomic64_cmpxchg_relaxed(&per_cpu(active_asids, cpu),
+	    atomic64_cmpxchg_relaxed(this_cpu_ptr(&active_asids),
 				     old_active_asid, asid))
 		goto switch_mm_fastpath;
 
@@ -236,10 +237,11 @@  void check_and_switch_context(struct mm_struct *mm, unsigned int cpu)
 		atomic64_set(&mm->context.id, asid);
 	}
 
+	cpu = smp_processor_id();
 	if (cpumask_test_and_clear_cpu(cpu, &tlb_flush_pending))
 		local_flush_tlb_all();
 
-	atomic64_set(&per_cpu(active_asids, cpu), asid);
+	atomic64_set(this_cpu_ptr(&active_asids), asid);
 	raw_spin_unlock_irqrestore(&cpu_asid_lock, flags);
 
 switch_mm_fastpath: