
[v2] RISC-V: Probe misaligned access speed in parallel

Message ID 20230920193801.3035093-1-evan@rivosinc.com (mailing list archive)
State Superseded
Commit 0c37376609da25bf57e89acfa0e6f8c43beb2f3e
Series [v2] RISC-V: Probe misaligned access speed in parallel

Checks

Context Check Description
conchuod/cover_letter success Single patches do not need cover letters
conchuod/tree_selection success Guessed tree name to be fixes at HEAD b06fab003ae1
conchuod/fixes_present success Fixes tag present in non-next series
conchuod/maintainers_pattern success MAINTAINERS pattern errors before the patch: 5 and now 5
conchuod/verify_signedoff success Signed-off-by tag matches author and committer
conchuod/kdoc success Errors and warnings before: 0 this patch: 0
conchuod/build_rv64_clang_allmodconfig success Errors and warnings before: 9 this patch: 9
conchuod/module_param success Was 0 now: 0
conchuod/build_rv64_gcc_allmodconfig success Errors and warnings before: 9 this patch: 9
conchuod/build_rv32_defconfig success Build OK
conchuod/dtb_warn_rv64 success Errors and warnings before: 29 this patch: 29
conchuod/header_inline success No static functions without inline keyword in header files
conchuod/checkpatch warning CHECK: Unnecessary parentheses around 'system_state == SYSTEM_RUNNING'
conchuod/build_rv64_nommu_k210_defconfig success Build OK
conchuod/verify_fixes success Fixes tag looks correct
conchuod/build_rv64_nommu_virt_defconfig success Build OK
conchuod/patch-1-test-13 success .github/scripts/patches/verify_signedoff.sh
conchuod/patch-1-test-1 success .github/scripts/patches/build_rv32_defconfig.sh
conchuod/patch-1-test-2 success .github/scripts/patches/build_rv64_clang_allmodconfig.sh
conchuod/patch-1-test-3 success .github/scripts/patches/build_rv64_gcc_allmodconfig.sh
conchuod/patch-1-test-4 success .github/scripts/patches/build_rv64_nommu_k210_defconfig.sh
conchuod/patch-1-test-5 success .github/scripts/patches/build_rv64_nommu_virt_defconfig.sh
conchuod/patch-1-test-6 warning .github/scripts/patches/checkpatch.sh
conchuod/patch-1-test-7 success .github/scripts/patches/dtb_warn_rv64.sh
conchuod/patch-1-test-8 success .github/scripts/patches/header_inline.sh
conchuod/patch-1-test-9 success .github/scripts/patches/kdoc.sh
conchuod/patch-1-test-10 success .github/scripts/patches/module_param.sh
conchuod/patch-1-test-11 success .github/scripts/patches/verify_fixes.sh
conchuod/patch-1-test-12 success .github/scripts/patches/verify_signedoff.sh
conchuod/vmtest-for-next-PR warning PR summary

Commit Message

Evan Green Sept. 20, 2023, 7:38 p.m. UTC
Probing for misaligned access speed takes about 0.06 seconds. On a
system with 64 cores, doing this in smp_callin() means it's done
serially, extending boot time by 3.8 seconds. That's a lot of boot time.

Instead of measuring each CPU serially, let's do the measurements on
all CPUs in parallel. If we disable preemption on all CPUs, the
jiffies stop ticking, so we can do this in stages of 1) everybody
except core 0, then 2) core 0.

The measurement call in smp_callin() stays around, but is now
conditionalized to only run if a new CPU shows up after the round of
in-parallel measurements has run. The goal is to have the measurement
call not run during boot or suspend/resume, but only on a hotplug
addition.

Reported-by: Jisheng Zhang <jszhang@kernel.org>
Closes: https://lore.kernel.org/all/mhng-9359993d-6872-4134-83ce-c97debe1cf9a@palmer-ri-x1c9/T/#mae9b8f40016f9df428829d33360144dc5026bcbf
Fixes: 584ea6564bca ("RISC-V: Probe for unaligned access speed")
Signed-off-by: Evan Green <evan@rivosinc.com>
Reviewed-by: Andrew Jones <ajones@ventanamicro.com>
Tested-by: Andrew Jones <ajones@ventanamicro.com>

---

Changes in v2:
 - Removed new global, used system_state == SYSTEM_RUNNING instead
   (Jisheng)
 - Added tags

 arch/riscv/include/asm/cpufeature.h |  2 +-
 arch/riscv/kernel/cpufeature.c      | 22 +++++++++++++++++-----
 arch/riscv/kernel/smpboot.c         | 11 ++++++++++-
 3 files changed, 28 insertions(+), 7 deletions(-)
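
For context, the probe being parallelized here (from commit 584ea6564bca) times misaligned copies against a jiffies-bounded window. The simplified C sketch below is illustrative rather than the exact kernel code (the real probe records the best cycle count per copy using dedicated word/byte copy helpers); it shows why at least one CPU must keep servicing the timer tick while the others spin with preemption disabled, and hence why the measurement is staged as "everyone but CPU 0, then CPU 0".

#include <linux/jiffies.h>
#include <linux/processor.h>
#include <linux/string.h>
#include <linux/types.h>

/*
 * Sync to a jiffy edge, then count how many copies complete before the
 * window expires. If every CPU, including the one that handles the timer
 * interrupt, sits in a loop like this with preemption disabled, jiffies
 * never advances and the window never closes.
 */
static u64 copies_per_window(void *dst, const void *src, size_t len)
{
	unsigned long start = jiffies, end;
	u64 iterations = 0;

	while (jiffies == start)
		cpu_relax();			/* wait for a fresh jiffy edge */

	end = jiffies + 16;			/* illustrative window length */
	while (time_before(jiffies, end)) {
		memcpy(dst, src, len);		/* stand-in for the unaligned copy helpers */
		iterations++;
	}

	return iterations;
}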

Comments

Conor Dooley Sept. 20, 2023, 9:03 p.m. UTC | #1
Yo,

On Wed, Sep 20, 2023 at 12:38:01PM -0700, Evan Green wrote:
> Probing for misaligned access speed takes about 0.06 seconds. On a
> system with 64 cores, doing this in smp_callin() means it's done
> serially, extending boot time by 3.8 seconds. That's a lot of boot time.
> 
> Instead of measuring each CPU serially, let's do the measurements on
> all CPUs in parallel. If we disable preemption on all CPUs, the
> jiffies stop ticking, so we can do this in stages of 1) everybody
> except core 0, then 2) core 0.
> 
> The measurement call in smp_callin() stays around, but is now
> conditionalized to only run if a new CPU shows up after the round of
> in-parallel measurements has run. The goal is to have the measurement
> call not run during boot or suspend/resume, but only on a hotplug
> addition.
> 
> Reported-by: Jisheng Zhang <jszhang@kernel.org>
> Closes: https://lore.kernel.org/all/mhng-9359993d-6872-4134-83ce-c97debe1cf9a@palmer-ri-x1c9/T/#mae9b8f40016f9df428829d33360144dc5026bcbf
> Fixes: 584ea6564bca ("RISC-V: Probe for unaligned access speed")
> Signed-off-by: Evan Green <evan@rivosinc.com>
> Reviewed-by: Andrew Jones <ajones@ventanamicro.com>
> Tested-by: Andrew Jones <ajones@ventanamicro.com>
> 
> ---
> 
> Changes in v2:
>  - Removed new global, used system_state == SYSTEM_RUNNING instead
>    (Jisheng)
>  - Added tags
> 
>  arch/riscv/include/asm/cpufeature.h |  2 +-
>  arch/riscv/kernel/cpufeature.c      | 22 +++++++++++++++++-----
>  arch/riscv/kernel/smpboot.c         | 11 ++++++++++-
>  3 files changed, 28 insertions(+), 7 deletions(-)
> 
> diff --git a/arch/riscv/include/asm/cpufeature.h b/arch/riscv/include/asm/cpufeature.h
> index d0345bd659c9..b139796392d0 100644
> --- a/arch/riscv/include/asm/cpufeature.h
> +++ b/arch/riscv/include/asm/cpufeature.h
> @@ -30,6 +30,6 @@ DECLARE_PER_CPU(long, misaligned_access_speed);
>  /* Per-cpu ISA extensions. */
>  extern struct riscv_isainfo hart_isa[NR_CPUS];
>  
> -void check_unaligned_access(int cpu);
> +int check_unaligned_access(void *unused);
>  
>  #endif
> diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c
> index 1cfbba65d11a..40bb854fcb96 100644
> --- a/arch/riscv/kernel/cpufeature.c
> +++ b/arch/riscv/kernel/cpufeature.c
> @@ -556,8 +556,9 @@ unsigned long riscv_get_elf_hwcap(void)
>  	return hwcap;
>  }
>  
> -void check_unaligned_access(int cpu)
> +int check_unaligned_access(void *unused)
>  {
> +	int cpu = smp_processor_id();
>  	u64 start_cycles, end_cycles;
>  	u64 word_cycles;
>  	u64 byte_cycles;
> @@ -571,7 +572,7 @@ void check_unaligned_access(int cpu)
>  	page = alloc_pages(GFP_NOWAIT, get_order(MISALIGNED_BUFFER_SIZE));
>  	if (!page) {
>  		pr_warn("Can't alloc pages to measure memcpy performance");
> -		return;
> +		return 0;

Dumb question maybe, but I am limited setup wise at the moment due to
a hardware failure which makes checking stuff hard, why the signature
change? Requirement for on_each_cpu()?

>  	}
>  
>  	/* Make an unaligned destination buffer. */
> @@ -643,15 +644,26 @@ void check_unaligned_access(int cpu)
>  
>  out:
>  	__free_pages(page, get_order(MISALIGNED_BUFFER_SIZE));
> +	return 0;
> +}
> +
> +static void check_unaligned_access_nonboot_cpu(void *param)
> +{
> +	if (smp_processor_id() != 0)
> +		check_unaligned_access(param);
>  }
>  
> -static int check_unaligned_access_boot_cpu(void)
> +static int check_unaligned_access_all_cpus(void)
>  {
> -	check_unaligned_access(0);
> +	/* Check everybody except 0, who stays behind to tend jiffies. */
> +	on_each_cpu(check_unaligned_access_nonboot_cpu, NULL, 1);
> +
> +	/* Check core 0. */
> +	smp_call_on_cpu(0, check_unaligned_access, NULL, true);
>  	return 0;

Why does this function return an int if it can only return 0?

Cheers,
Conor.

>  }
>  
> -arch_initcall(check_unaligned_access_boot_cpu);
> +arch_initcall(check_unaligned_access_all_cpus);
>  
>  #ifdef CONFIG_RISCV_ALTERNATIVE
>  /*
> diff --git a/arch/riscv/kernel/smpboot.c b/arch/riscv/kernel/smpboot.c
> index 1b8da4e40a4d..a014955b8699 100644
> --- a/arch/riscv/kernel/smpboot.c
> +++ b/arch/riscv/kernel/smpboot.c
> @@ -27,6 +27,7 @@
>  #include <linux/sched/mm.h>
>  #include <asm/cpu_ops.h>
>  #include <asm/cpufeature.h>
> +#include <asm/hwprobe.h>
>  #include <asm/irq.h>
>  #include <asm/mmu_context.h>
>  #include <asm/numa.h>
> @@ -246,7 +247,15 @@ asmlinkage __visible void smp_callin(void)
>  
>  	numa_add_cpu(curr_cpuid);
>  	set_cpu_online(curr_cpuid, 1);
> -	check_unaligned_access(curr_cpuid);
> +
> +	/*
> +	 * Boot-time misaligned access speed measurements are done in parallel
> +	 * in an initcall. Only measure here for hotplug.
> +	 */
> +	if ((system_state == SYSTEM_RUNNING) &&
> +	    (per_cpu(misaligned_access_speed, curr_cpuid) == RISCV_HWPROBE_MISALIGNED_UNKNOWN)) {
> +		check_unaligned_access(NULL);
> +	}
>  
>  	if (has_vector()) {
>  		if (riscv_v_setup_vsize())
> -- 
> 2.34.1
>
Atish Patra Sept. 20, 2023, 9:27 p.m. UTC | #2
On Wed, Sep 20, 2023 at 2:04 PM Conor Dooley <conor@kernel.org> wrote:
>
> Yo,
>
> On Wed, Sep 20, 2023 at 12:38:01PM -0700, Evan Green wrote:
> > Probing for misaligned access speed takes about 0.06 seconds. On a
> > system with 64 cores, doing this in smp_callin() means it's done
> > serially, extending boot time by 3.8 seconds. That's a lot of boot time.
> >
> > Instead of measuring each CPU serially, let's do the measurements on
> > all CPUs in parallel. If we disable preemption on all CPUs, the
> > jiffies stop ticking, so we can do this in stages of 1) everybody
> > except core 0, then 2) core 0.
> >
> > The measurement call in smp_callin() stays around, but is now
> > conditionalized to only run if a new CPU shows up after the round of
> > in-parallel measurements has run. The goal is to have the measurement
> > call not run during boot or suspend/resume, but only on a hotplug
> > addition.
> >
> > Reported-by: Jisheng Zhang <jszhang@kernel.org>
> > Closes: https://lore.kernel.org/all/mhng-9359993d-6872-4134-83ce-c97debe1cf9a@palmer-ri-x1c9/T/#mae9b8f40016f9df428829d33360144dc5026bcbf
> > Fixes: 584ea6564bca ("RISC-V: Probe for unaligned access speed")
> > Signed-off-by: Evan Green <evan@rivosinc.com>
> > Reviewed-by: Andrew Jones <ajones@ventanamicro.com>
> > Tested-by: Andrew Jones <ajones@ventanamicro.com>
> >
> > ---
> >
> > Changes in v2:
> >  - Removed new global, used system_state == SYSTEM_RUNNING instead
> >    (Jisheng)
> >  - Added tags
> >
> >  arch/riscv/include/asm/cpufeature.h |  2 +-
> >  arch/riscv/kernel/cpufeature.c      | 22 +++++++++++++++++-----
> >  arch/riscv/kernel/smpboot.c         | 11 ++++++++++-
> >  3 files changed, 28 insertions(+), 7 deletions(-)
> >
> > diff --git a/arch/riscv/include/asm/cpufeature.h b/arch/riscv/include/asm/cpufeature.h
> > index d0345bd659c9..b139796392d0 100644
> > --- a/arch/riscv/include/asm/cpufeature.h
> > +++ b/arch/riscv/include/asm/cpufeature.h
> > @@ -30,6 +30,6 @@ DECLARE_PER_CPU(long, misaligned_access_speed);
> >  /* Per-cpu ISA extensions. */
> >  extern struct riscv_isainfo hart_isa[NR_CPUS];
> >
> > -void check_unaligned_access(int cpu);
> > +int check_unaligned_access(void *unused);
> >
> >  #endif
> > diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c
> > index 1cfbba65d11a..40bb854fcb96 100644
> > --- a/arch/riscv/kernel/cpufeature.c
> > +++ b/arch/riscv/kernel/cpufeature.c
> > @@ -556,8 +556,9 @@ unsigned long riscv_get_elf_hwcap(void)
> >       return hwcap;
> >  }
> >
> > -void check_unaligned_access(int cpu)
> > +int check_unaligned_access(void *unused)
> >  {
> > +     int cpu = smp_processor_id();
> >       u64 start_cycles, end_cycles;
> >       u64 word_cycles;
> >       u64 byte_cycles;
> > @@ -571,7 +572,7 @@ void check_unaligned_access(int cpu)
> >       page = alloc_pages(GFP_NOWAIT, get_order(MISALIGNED_BUFFER_SIZE));
> >       if (!page) {
> >               pr_warn("Can't alloc pages to measure memcpy performance");
> > -             return;
> > +             return 0;
>
> Dumb question maybe, but I am limited setup wise at the moment due to
> a hardware failure which makes checking stuff hard, why the signature
> change? Requirement for on_each_cpu()?
>

Requirement for smp_call_on_cpu.

> >       }
> >
> >       /* Make an unaligned destination buffer. */
> > @@ -643,15 +644,26 @@ void check_unaligned_access(int cpu)
> >
> >  out:
> >       __free_pages(page, get_order(MISALIGNED_BUFFER_SIZE));
> > +     return 0;
> > +}
> > +
> > +static void check_unaligned_access_nonboot_cpu(void *param)
> > +{
> > +     if (smp_processor_id() != 0)
> > +             check_unaligned_access(param);
> >  }
> >
> > -static int check_unaligned_access_boot_cpu(void)
> > +static int check_unaligned_access_all_cpus(void)
> >  {
> > -     check_unaligned_access(0);
> > +     /* Check everybody except 0, who stays behind to tend jiffies. */
> > +     on_each_cpu(check_unaligned_access_nonboot_cpu, NULL, 1);
> > +
> > +     /* Check core 0. */
> > +     smp_call_on_cpu(0, check_unaligned_access, NULL, true);
> >       return 0;
>
> Why does this function return an int if it can only return 0?
>

Should we define a check_unaligned_access_boot_cpu to avoid the return
type change ?
We can also get rid of the unused parameter in the
check_unaligned_access function.

I also noticed on_each_cpu invokes the function within preempt_disable/enable.
check_unaligned_access will invoke it again. It's probably harmless
but there is no need for non-boot cpus.

We can just have preempt_disable/enable around check_unaligned_access
in the check_unaligned_access_boot_cpu function.

> Cheers,
> Conor.
>
> >  }
> >
> > -arch_initcall(check_unaligned_access_boot_cpu);
> > +arch_initcall(check_unaligned_access_all_cpus);
> >
> >  #ifdef CONFIG_RISCV_ALTERNATIVE
> >  /*
> > diff --git a/arch/riscv/kernel/smpboot.c b/arch/riscv/kernel/smpboot.c
> > index 1b8da4e40a4d..a014955b8699 100644
> > --- a/arch/riscv/kernel/smpboot.c
> > +++ b/arch/riscv/kernel/smpboot.c
> > @@ -27,6 +27,7 @@
> >  #include <linux/sched/mm.h>
> >  #include <asm/cpu_ops.h>
> >  #include <asm/cpufeature.h>
> > +#include <asm/hwprobe.h>
> >  #include <asm/irq.h>
> >  #include <asm/mmu_context.h>
> >  #include <asm/numa.h>
> > @@ -246,7 +247,15 @@ asmlinkage __visible void smp_callin(void)
> >
> >       numa_add_cpu(curr_cpuid);
> >       set_cpu_online(curr_cpuid, 1);
> > -     check_unaligned_access(curr_cpuid);
> > +
> > +     /*
> > +      * Boot-time misaligned access speed measurements are done in parallel
> > +      * in an initcall. Only measure here for hotplug.
> > +      */
> > +     if ((system_state == SYSTEM_RUNNING) &&
> > +         (per_cpu(misaligned_access_speed, curr_cpuid) == RISCV_HWPROBE_MISALIGNED_UNKNOWN)) {
> > +             check_unaligned_access(NULL);
> > +     }
> >
> >       if (has_vector()) {
> >               if (riscv_v_setup_vsize())
> > --
> > 2.34.1
> >
> _______________________________________________
> linux-riscv mailing list
> linux-riscv@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-riscv
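
A minimal sketch of the alternative Atish seems to be suggesting above, shown only for illustration (names and placement are hypothetical, not from the patch): keep check_unaligned_access(int cpu) with its original signature, and absorb the smp_call_on_cpu() calling convention plus the preempt_disable()/enable() in a small boot-CPU wrapper instead.

static int check_unaligned_access_boot_cpu(void *unused)
{
	/* Pin ourselves while measuring; smp_call_on_cpu() already keeps us on CPU 0. */
	preempt_disable();
	check_unaligned_access(smp_processor_id());
	preempt_enable();

	return 0;
}

	/* ... in check_unaligned_access_all_cpus(): */
	smp_call_on_cpu(0, check_unaligned_access_boot_cpu, NULL, true);
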
Evan Green Sept. 20, 2023, 10:06 p.m. UTC | #3
On Wed, Sep 20, 2023 at 2:27 PM Atish Patra <atishp@atishpatra.org> wrote:
>
> On Wed, Sep 20, 2023 at 2:04 PM Conor Dooley <conor@kernel.org> wrote:
> >
> > Yo,
> >
> > On Wed, Sep 20, 2023 at 12:38:01PM -0700, Evan Green wrote:
> > > Probing for misaligned access speed takes about 0.06 seconds. On a
> > > system with 64 cores, doing this in smp_callin() means it's done
> > > serially, extending boot time by 3.8 seconds. That's a lot of boot time.
> > >
> > > Instead of measuring each CPU serially, let's do the measurements on
> > > all CPUs in parallel. If we disable preemption on all CPUs, the
> > > jiffies stop ticking, so we can do this in stages of 1) everybody
> > > except core 0, then 2) core 0.
> > >
> > > The measurement call in smp_callin() stays around, but is now
> > > conditionalized to only run if a new CPU shows up after the round of
> > > in-parallel measurements has run. The goal is to have the measurement
> > > call not run during boot or suspend/resume, but only on a hotplug
> > > addition.
> > >
> > > Reported-by: Jisheng Zhang <jszhang@kernel.org>
> > > Closes: https://lore.kernel.org/all/mhng-9359993d-6872-4134-83ce-c97debe1cf9a@palmer-ri-x1c9/T/#mae9b8f40016f9df428829d33360144dc5026bcbf
> > > Fixes: 584ea6564bca ("RISC-V: Probe for unaligned access speed")
> > > Signed-off-by: Evan Green <evan@rivosinc.com>
> > > Reviewed-by: Andrew Jones <ajones@ventanamicro.com>
> > > Tested-by: Andrew Jones <ajones@ventanamicro.com>
> > >
> > > ---
> > >
> > > Changes in v2:
> > >  - Removed new global, used system_state == SYSTEM_RUNNING instead
> > >    (Jisheng)
> > >  - Added tags
> > >
> > >  arch/riscv/include/asm/cpufeature.h |  2 +-
> > >  arch/riscv/kernel/cpufeature.c      | 22 +++++++++++++++++-----
> > >  arch/riscv/kernel/smpboot.c         | 11 ++++++++++-
> > >  3 files changed, 28 insertions(+), 7 deletions(-)
> > >
> > > diff --git a/arch/riscv/include/asm/cpufeature.h b/arch/riscv/include/asm/cpufeature.h
> > > index d0345bd659c9..b139796392d0 100644
> > > --- a/arch/riscv/include/asm/cpufeature.h
> > > +++ b/arch/riscv/include/asm/cpufeature.h
> > > @@ -30,6 +30,6 @@ DECLARE_PER_CPU(long, misaligned_access_speed);
> > >  /* Per-cpu ISA extensions. */
> > >  extern struct riscv_isainfo hart_isa[NR_CPUS];
> > >
> > > -void check_unaligned_access(int cpu);
> > > +int check_unaligned_access(void *unused);
> > >
> > >  #endif
> > > diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c
> > > index 1cfbba65d11a..40bb854fcb96 100644
> > > --- a/arch/riscv/kernel/cpufeature.c
> > > +++ b/arch/riscv/kernel/cpufeature.c
> > > @@ -556,8 +556,9 @@ unsigned long riscv_get_elf_hwcap(void)
> > >       return hwcap;
> > >  }
> > >
> > > -void check_unaligned_access(int cpu)
> > > +int check_unaligned_access(void *unused)
> > >  {
> > > +     int cpu = smp_processor_id();
> > >       u64 start_cycles, end_cycles;
> > >       u64 word_cycles;
> > >       u64 byte_cycles;
> > > @@ -571,7 +572,7 @@ void check_unaligned_access(int cpu)
> > >       page = alloc_pages(GFP_NOWAIT, get_order(MISALIGNED_BUFFER_SIZE));
> > >       if (!page) {
> > >               pr_warn("Can't alloc pages to measure memcpy performance");
> > > -             return;
> > > +             return 0;
> >
> > Dumb question maybe, but I am limited setup wise at the moment due to
> > a hardware failure which makes checking stuff hard, why the signature
> > change? Requirement for on_each_cpu()?
> >
>
> Requirement for smp_call_on_cpu.

Right.

>
> > >       }
> > >
> > >       /* Make an unaligned destination buffer. */
> > > @@ -643,15 +644,26 @@ void check_unaligned_access(int cpu)
> > >
> > >  out:
> > >       __free_pages(page, get_order(MISALIGNED_BUFFER_SIZE));
> > > +     return 0;
> > > +}
> > > +
> > > +static void check_unaligned_access_nonboot_cpu(void *param)
> > > +{
> > > +     if (smp_processor_id() != 0)
> > > +             check_unaligned_access(param);
> > >  }
> > >
> > > -static int check_unaligned_access_boot_cpu(void)
> > > +static int check_unaligned_access_all_cpus(void)
> > >  {
> > > -     check_unaligned_access(0);
> > > +     /* Check everybody except 0, who stays behind to tend jiffies. */
> > > +     on_each_cpu(check_unaligned_access_nonboot_cpu, NULL, 1);
> > > +
> > > +     /* Check core 0. */
> > > +     smp_call_on_cpu(0, check_unaligned_access, NULL, true);
> > >       return 0;
> >
> > Why does this function return an int if it can only return 0?
> >

This is a requirement on the initcall_t function pointer type.

>
> Should we define a check_unaligned_access_boot_cpu to avoid the return
> type change ?
> We can also get rid of the unused parameter in the
> check_unaligned_access function.

I don't think it matters too much either way. In my mind it sorta
shifts the ballast around but doesn't really decrease it.

>
> I also noticed on_each_cpu invokes the function within preempt_disable/enable.
> check_unaligned_access will invoke it again. It's probably harmless
> but there is no need for non-boot cpus.
>
> We can just have preempt_disable/enable around check_unaligned_access
> in the check_unaligned_access_boot_cpu function.

It is harmless. I haven't walked fully through smp_call_on_cpu(), but
I'd be intuitively quite surprised if preemption were enabled on those
callbacks too (as you'd be saying "run my function on this cpu, but no
guarantee it will stay there!"). So in theory I should be able to
remove the preempt_disable/enable entirely. As a side note: I added
the smp_call_on_cpu() because I couldn't convince myself the initcalls
would stay on cpu 0. In practice they always seem to be.


-Evan
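
For reference, the prototypes behind the signature discussion above, roughly as they appear (slightly simplified) in include/linux/init.h and include/linux/smp.h in kernels of this vintage: smp_call_on_cpu() wants an int (*)(void *) callback (hence the new parameter and return value of check_unaligned_access()), on_each_cpu() wants a void (*)(void *) callback (hence the separate non-boot wrapper), and the initcall itself must return int.

/* include/linux/init.h */
typedef int (*initcall_t)(void);

/* include/linux/smp.h */
typedef void (*smp_call_func_t)(void *info);
void on_each_cpu(smp_call_func_t func, void *info, int wait);
int smp_call_on_cpu(unsigned int cpu, int (*func)(void *), void *par, bool phys);
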
Conor Dooley Sept. 20, 2023, 10:25 p.m. UTC | #4
On Wed, Sep 20, 2023 at 03:06:45PM -0700, Evan Green wrote:
> On Wed, Sep 20, 2023 at 2:27 PM Atish Patra <atishp@atishpatra.org> wrote:
> >
> > On Wed, Sep 20, 2023 at 2:04 PM Conor Dooley <conor@kernel.org> wrote:
> > >
> > > Yo,
> > >
> > > On Wed, Sep 20, 2023 at 12:38:01PM -0700, Evan Green wrote:
> > > > Probing for misaligned access speed takes about 0.06 seconds. On a
> > > > system with 64 cores, doing this in smp_callin() means it's done
> > > > serially, extending boot time by 3.8 seconds. That's a lot of boot time.
> > > >
> > > > Instead of measuring each CPU serially, let's do the measurements on
> > > > all CPUs in parallel. If we disable preemption on all CPUs, the
> > > > jiffies stop ticking, so we can do this in stages of 1) everybody
> > > > except core 0, then 2) core 0.
> > > >
> > > > The measurement call in smp_callin() stays around, but is now
> > > > conditionalized to only run if a new CPU shows up after the round of
> > > > in-parallel measurements has run. The goal is to have the measurement
> > > > call not run during boot or suspend/resume, but only on a hotplug
> > > > addition.
> > > >
> > > > Reported-by: Jisheng Zhang <jszhang@kernel.org>
> > > > Closes: https://lore.kernel.org/all/mhng-9359993d-6872-4134-83ce-c97debe1cf9a@palmer-ri-x1c9/T/#mae9b8f40016f9df428829d33360144dc5026bcbf
> > > > Fixes: 584ea6564bca ("RISC-V: Probe for unaligned access speed")
> > > > Signed-off-by: Evan Green <evan@rivosinc.com>
> > > > Reviewed-by: Andrew Jones <ajones@ventanamicro.com>
> > > > Tested-by: Andrew Jones <ajones@ventanamicro.com>
> > > >
> > > > ---
> > > >
> > > > Changes in v2:
> > > >  - Removed new global, used system_state == SYSTEM_RUNNING instead
> > > >    (Jisheng)
> > > >  - Added tags
> > > >
> > > >  arch/riscv/include/asm/cpufeature.h |  2 +-
> > > >  arch/riscv/kernel/cpufeature.c      | 22 +++++++++++++++++-----
> > > >  arch/riscv/kernel/smpboot.c         | 11 ++++++++++-
> > > >  3 files changed, 28 insertions(+), 7 deletions(-)
> > > >
> > > > diff --git a/arch/riscv/include/asm/cpufeature.h b/arch/riscv/include/asm/cpufeature.h
> > > > index d0345bd659c9..b139796392d0 100644
> > > > --- a/arch/riscv/include/asm/cpufeature.h
> > > > +++ b/arch/riscv/include/asm/cpufeature.h
> > > > @@ -30,6 +30,6 @@ DECLARE_PER_CPU(long, misaligned_access_speed);
> > > >  /* Per-cpu ISA extensions. */
> > > >  extern struct riscv_isainfo hart_isa[NR_CPUS];
> > > >
> > > > -void check_unaligned_access(int cpu);
> > > > +int check_unaligned_access(void *unused);
> > > >
> > > >  #endif
> > > > diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c
> > > > index 1cfbba65d11a..40bb854fcb96 100644
> > > > --- a/arch/riscv/kernel/cpufeature.c
> > > > +++ b/arch/riscv/kernel/cpufeature.c
> > > > @@ -556,8 +556,9 @@ unsigned long riscv_get_elf_hwcap(void)
> > > >       return hwcap;
> > > >  }
> > > >
> > > > -void check_unaligned_access(int cpu)
> > > > +int check_unaligned_access(void *unused)
> > > >  {
> > > > +     int cpu = smp_processor_id();
> > > >       u64 start_cycles, end_cycles;
> > > >       u64 word_cycles;
> > > >       u64 byte_cycles;
> > > > @@ -571,7 +572,7 @@ void check_unaligned_access(int cpu)
> > > >       page = alloc_pages(GFP_NOWAIT, get_order(MISALIGNED_BUFFER_SIZE));
> > > >       if (!page) {
> > > >               pr_warn("Can't alloc pages to measure memcpy performance");
> > > > -             return;
> > > > +             return 0;
> > >
> > > Dumb question maybe, but I am limited setup wise at the moment due to
> > > a hardware failure which makes checking stuff hard, why the signature
> > > change? Requirement for on_each_cpu()?
> > >
> >
> > Requirement for smp_call_on_cpu.
> 
> Right.
> 
> >
> > > >       }
> > > >
> > > >       /* Make an unaligned destination buffer. */
> > > > @@ -643,15 +644,26 @@ void check_unaligned_access(int cpu)
> > > >
> > > >  out:
> > > >       __free_pages(page, get_order(MISALIGNED_BUFFER_SIZE));
> > > > +     return 0;
> > > > +}
> > > > +
> > > > +static void check_unaligned_access_nonboot_cpu(void *param)
> > > > +{
> > > > +     if (smp_processor_id() != 0)
> > > > +             check_unaligned_access(param);
> > > >  }
> > > >
> > > > -static int check_unaligned_access_boot_cpu(void)
> > > > +static int check_unaligned_access_all_cpus(void)
> > > >  {
> > > > -     check_unaligned_access(0);
> > > > +     /* Check everybody except 0, who stays behind to tend jiffies. */
> > > > +     on_each_cpu(check_unaligned_access_nonboot_cpu, NULL, 1);
> > > > +
> > > > +     /* Check core 0. */
> > > > +     smp_call_on_cpu(0, check_unaligned_access, NULL, true);
> > > >       return 0;
> > >
> > > Why does this function return an int if it can only return 0?
> > >
> 
> This is a requirement on the initcall_t function pointer type.

Ahh great, thanks for the explanations!
Atish Patra Sept. 20, 2023, 10:57 p.m. UTC | #5
On Wed, Sep 20, 2023 at 3:07 PM Evan Green <evan@rivosinc.com> wrote:
>
> On Wed, Sep 20, 2023 at 2:27 PM Atish Patra <atishp@atishpatra.org> wrote:
> >
> > On Wed, Sep 20, 2023 at 2:04 PM Conor Dooley <conor@kernel.org> wrote:
> > >
> > > Yo,
> > >
> > > On Wed, Sep 20, 2023 at 12:38:01PM -0700, Evan Green wrote:
> > > > Probing for misaligned access speed takes about 0.06 seconds. On a
> > > > system with 64 cores, doing this in smp_callin() means it's done
> > > > serially, extending boot time by 3.8 seconds. That's a lot of boot time.
> > > >
> > > > Instead of measuring each CPU serially, let's do the measurements on
> > > > all CPUs in parallel. If we disable preemption on all CPUs, the
> > > > jiffies stop ticking, so we can do this in stages of 1) everybody
> > > > except core 0, then 2) core 0.
> > > >
> > > > The measurement call in smp_callin() stays around, but is now
> > > > conditionalized to only run if a new CPU shows up after the round of
> > > > in-parallel measurements has run. The goal is to have the measurement
> > > > call not run during boot or suspend/resume, but only on a hotplug
> > > > addition.
> > > >
> > > > Reported-by: Jisheng Zhang <jszhang@kernel.org>
> > > > Closes: https://lore.kernel.org/all/mhng-9359993d-6872-4134-83ce-c97debe1cf9a@palmer-ri-x1c9/T/#mae9b8f40016f9df428829d33360144dc5026bcbf
> > > > Fixes: 584ea6564bca ("RISC-V: Probe for unaligned access speed")
> > > > Signed-off-by: Evan Green <evan@rivosinc.com>
> > > > Reviewed-by: Andrew Jones <ajones@ventanamicro.com>
> > > > Tested-by: Andrew Jones <ajones@ventanamicro.com>
> > > >
> > > > ---
> > > >
> > > > Changes in v2:
> > > >  - Removed new global, used system_state == SYSTEM_RUNNING instead
> > > >    (Jisheng)
> > > >  - Added tags
> > > >
> > > >  arch/riscv/include/asm/cpufeature.h |  2 +-
> > > >  arch/riscv/kernel/cpufeature.c      | 22 +++++++++++++++++-----
> > > >  arch/riscv/kernel/smpboot.c         | 11 ++++++++++-
> > > >  3 files changed, 28 insertions(+), 7 deletions(-)
> > > >
> > > > diff --git a/arch/riscv/include/asm/cpufeature.h b/arch/riscv/include/asm/cpufeature.h
> > > > index d0345bd659c9..b139796392d0 100644
> > > > --- a/arch/riscv/include/asm/cpufeature.h
> > > > +++ b/arch/riscv/include/asm/cpufeature.h
> > > > @@ -30,6 +30,6 @@ DECLARE_PER_CPU(long, misaligned_access_speed);
> > > >  /* Per-cpu ISA extensions. */
> > > >  extern struct riscv_isainfo hart_isa[NR_CPUS];
> > > >
> > > > -void check_unaligned_access(int cpu);
> > > > +int check_unaligned_access(void *unused);
> > > >
> > > >  #endif
> > > > diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c
> > > > index 1cfbba65d11a..40bb854fcb96 100644
> > > > --- a/arch/riscv/kernel/cpufeature.c
> > > > +++ b/arch/riscv/kernel/cpufeature.c
> > > > @@ -556,8 +556,9 @@ unsigned long riscv_get_elf_hwcap(void)
> > > >       return hwcap;
> > > >  }
> > > >
> > > > -void check_unaligned_access(int cpu)
> > > > +int check_unaligned_access(void *unused)
> > > >  {
> > > > +     int cpu = smp_processor_id();
> > > >       u64 start_cycles, end_cycles;
> > > >       u64 word_cycles;
> > > >       u64 byte_cycles;
> > > > @@ -571,7 +572,7 @@ void check_unaligned_access(int cpu)
> > > >       page = alloc_pages(GFP_NOWAIT, get_order(MISALIGNED_BUFFER_SIZE));
> > > >       if (!page) {
> > > >               pr_warn("Can't alloc pages to measure memcpy performance");
> > > > -             return;
> > > > +             return 0;
> > >
> > > Dumb question maybe, but I am limited setup wise at the moment due to
> > > a hardware failure which makes checking stuff hard, why the signature
> > > change? Requirement for on_each_cpu()?
> > >
> >
> > Requirement for smp_call_on_cpu.
>
> Right.
>
> >
> > > >       }
> > > >
> > > >       /* Make an unaligned destination buffer. */
> > > > @@ -643,15 +644,26 @@ void check_unaligned_access(int cpu)
> > > >
> > > >  out:
> > > >       __free_pages(page, get_order(MISALIGNED_BUFFER_SIZE));
> > > > +     return 0;
> > > > +}
> > > > +
> > > > +static void check_unaligned_access_nonboot_cpu(void *param)
> > > > +{
> > > > +     if (smp_processor_id() != 0)
> > > > +             check_unaligned_access(param);
> > > >  }
> > > >
> > > > -static int check_unaligned_access_boot_cpu(void)
> > > > +static int check_unaligned_access_all_cpus(void)
> > > >  {
> > > > -     check_unaligned_access(0);
> > > > +     /* Check everybody except 0, who stays behind to tend jiffies. */
> > > > +     on_each_cpu(check_unaligned_access_nonboot_cpu, NULL, 1);
> > > > +
> > > > +     /* Check core 0. */
> > > > +     smp_call_on_cpu(0, check_unaligned_access, NULL, true);
> > > >       return 0;
> > >
> > > Why does this function return an int if it can only return 0?
> > >
>
> This is a requirement on the initcall_t function pointer type.
>
> >
> > Should we define a check_unaligned_access_boot_cpu to avoid the return
> > type change ?
> > We can also get rid of the unused parameter in the
> > check_unaligned_access function.
>
> I don't think it matters too much either way. In my mind it sorta
> shifts the ballast around but doesn't really decrease it.
>

Yeah. But it is just a bit cleaner to avoid unused arguments if we can.

> >
> > I also noticed on_each_cpu invokes the function within preempt_disable/enable.
> > check_unaligned_access will invoke it again. It's probably harmless
> > but there is no need for non-boot cpus.
> >
> > We can just have preempt_disable/enable around check_unaligned_access
> > in the check_unaligned_access_boot_cpu function.
>
> It is harmless. I haven't walked fully through smp_call_on_cpu(), but
> I'd be intuitively quite surprised if preemption were enabled on those
> callbacks too (as you'd be saying "run my function on this cpu, but no
> guarantee it will stay there!"). So in theory I should be able to

I didn't see anything explicitly disable/enable preemption on smp_call_on_cpu.
As per my understanding, your queued function is guaranteed to run on that cpu
as it is queued on the cpu specific wq but it may be preempted by
something else.

For probing alignment speed, you just care about running it on that
cpu. Correct ?

> remove the preempt_disable/enable entirely. As a side note: I added
> the smp_call_on_cpu() because I couldn't convince myself the initcalls
> would stay on cpu 0. In practice they always seem to be.
>

I think that's the correct way to do it as the scheduler is active
now. There is no guarantee that
it will stay on cpu0 as you mentioned.

>
> -Evan
Evan Green Sept. 20, 2023, 11:54 p.m. UTC | #6
On Wed, Sep 20, 2023 at 3:57 PM Atish Patra <atishp@atishpatra.org> wrote:
>
> On Wed, Sep 20, 2023 at 3:07 PM Evan Green <evan@rivosinc.com> wrote:
> >
> > On Wed, Sep 20, 2023 at 2:27 PM Atish Patra <atishp@atishpatra.org> wrote:
> > >
> > > On Wed, Sep 20, 2023 at 2:04 PM Conor Dooley <conor@kernel.org> wrote:
> > > >
> > > > Yo,
> > > >
> > > > On Wed, Sep 20, 2023 at 12:38:01PM -0700, Evan Green wrote:
> > > > > Probing for misaligned access speed takes about 0.06 seconds. On a
> > > > > system with 64 cores, doing this in smp_callin() means it's done
> > > > > serially, extending boot time by 3.8 seconds. That's a lot of boot time.
> > > > >
> > > > > Instead of measuring each CPU serially, let's do the measurements on
> > > > > all CPUs in parallel. If we disable preemption on all CPUs, the
> > > > > jiffies stop ticking, so we can do this in stages of 1) everybody
> > > > > except core 0, then 2) core 0.
> > > > >
> > > > > The measurement call in smp_callin() stays around, but is now
> > > > > conditionalized to only run if a new CPU shows up after the round of
> > > > > in-parallel measurements has run. The goal is to have the measurement
> > > > > call not run during boot or suspend/resume, but only on a hotplug
> > > > > addition.
> > > > >
> > > > > Reported-by: Jisheng Zhang <jszhang@kernel.org>
> > > > > Closes: https://lore.kernel.org/all/mhng-9359993d-6872-4134-83ce-c97debe1cf9a@palmer-ri-x1c9/T/#mae9b8f40016f9df428829d33360144dc5026bcbf
> > > > > Fixes: 584ea6564bca ("RISC-V: Probe for unaligned access speed")
> > > > > Signed-off-by: Evan Green <evan@rivosinc.com>
> > > > > Reviewed-by: Andrew Jones <ajones@ventanamicro.com>
> > > > > Tested-by: Andrew Jones <ajones@ventanamicro.com>
> > > > >
> > > > > ---
> > > > >
> > > > > Changes in v2:
> > > > >  - Removed new global, used system_state == SYSTEM_RUNNING instead
> > > > >    (Jisheng)
> > > > >  - Added tags
> > > > >
> > > > >  arch/riscv/include/asm/cpufeature.h |  2 +-
> > > > >  arch/riscv/kernel/cpufeature.c      | 22 +++++++++++++++++-----
> > > > >  arch/riscv/kernel/smpboot.c         | 11 ++++++++++-
> > > > >  3 files changed, 28 insertions(+), 7 deletions(-)
> > > > >
> > > > > diff --git a/arch/riscv/include/asm/cpufeature.h b/arch/riscv/include/asm/cpufeature.h
> > > > > index d0345bd659c9..b139796392d0 100644
> > > > > --- a/arch/riscv/include/asm/cpufeature.h
> > > > > +++ b/arch/riscv/include/asm/cpufeature.h
> > > > > @@ -30,6 +30,6 @@ DECLARE_PER_CPU(long, misaligned_access_speed);
> > > > >  /* Per-cpu ISA extensions. */
> > > > >  extern struct riscv_isainfo hart_isa[NR_CPUS];
> > > > >
> > > > > -void check_unaligned_access(int cpu);
> > > > > +int check_unaligned_access(void *unused);
> > > > >
> > > > >  #endif
> > > > > diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c
> > > > > index 1cfbba65d11a..40bb854fcb96 100644
> > > > > --- a/arch/riscv/kernel/cpufeature.c
> > > > > +++ b/arch/riscv/kernel/cpufeature.c
> > > > > @@ -556,8 +556,9 @@ unsigned long riscv_get_elf_hwcap(void)
> > > > >       return hwcap;
> > > > >  }
> > > > >
> > > > > -void check_unaligned_access(int cpu)
> > > > > +int check_unaligned_access(void *unused)
> > > > >  {
> > > > > +     int cpu = smp_processor_id();
> > > > >       u64 start_cycles, end_cycles;
> > > > >       u64 word_cycles;
> > > > >       u64 byte_cycles;
> > > > > @@ -571,7 +572,7 @@ void check_unaligned_access(int cpu)
> > > > >       page = alloc_pages(GFP_NOWAIT, get_order(MISALIGNED_BUFFER_SIZE));
> > > > >       if (!page) {
> > > > >               pr_warn("Can't alloc pages to measure memcpy performance");
> > > > > -             return;
> > > > > +             return 0;
> > > >
> > > > Dumb question maybe, but I am limited setup wise at the moment due to
> > > > a hardware failure which makes checking stuff hard, why the signature
> > > > change? Requirement for on_each_cpu()?
> > > >
> > >
> > > Requirement for smp_call_on_cpu.
> >
> > Right.
> >
> > >
> > > > >       }
> > > > >
> > > > >       /* Make an unaligned destination buffer. */
> > > > > @@ -643,15 +644,26 @@ void check_unaligned_access(int cpu)
> > > > >
> > > > >  out:
> > > > >       __free_pages(page, get_order(MISALIGNED_BUFFER_SIZE));
> > > > > +     return 0;
> > > > > +}
> > > > > +
> > > > > +static void check_unaligned_access_nonboot_cpu(void *param)
> > > > > +{
> > > > > +     if (smp_processor_id() != 0)
> > > > > +             check_unaligned_access(param);
> > > > >  }
> > > > >
> > > > > -static int check_unaligned_access_boot_cpu(void)
> > > > > +static int check_unaligned_access_all_cpus(void)
> > > > >  {
> > > > > -     check_unaligned_access(0);
> > > > > +     /* Check everybody except 0, who stays behind to tend jiffies. */
> > > > > +     on_each_cpu(check_unaligned_access_nonboot_cpu, NULL, 1);
> > > > > +
> > > > > +     /* Check core 0. */
> > > > > +     smp_call_on_cpu(0, check_unaligned_access, NULL, true);
> > > > >       return 0;
> > > >
> > > > Why does this function return an int if it can only return 0?
> > > >
> >
> > This is a requirement on the initcall_t function pointer type.
> >
> > >
> > > Should we define a check_unaligned_access_boot_cpu to avoid the return
> > > type change ?
> > > We can also get rid of the unused parameter in the
> > > check_unaligned_access function.
> >
> > I don't think it matters too much either way. In my mind it sorta
> > shifts the ballast around but doesn't really decrease it.
> >
>
> Yeah. But it is just a bit cleaner to avoid unused arguments if we can.

The unused argument will still be there, just now in the form of a
wrapper function. The size of the mess seems about the same to me, but
now with an additional layer of indirection. OTOH, I'm not really
opposed to it either. If you think it's important I'll do it, but if
not my preference is to keep it as is.

>
> > >
> > > I also noticed on_each_cpu invokes the function within preempt_disable/enable.
> > > check_unaligned_access will invoke it again. It's probably harmless
> > > but there is no need for non-boot cpus.
> > >
> > > We can just have preempt_disable/enable around check_unaligned_access
> > > in the check_unaligned_access_boot_cpu function.
> >
> > It is harmless. I haven't walked fully through smp_call_on_cpu(), but
> > I'd be intuitively quite surprised if preemption were enabled on those
> > callbacks too (as you'd be saying "run my function on this cpu, but no
> > guarantee it will stay there!"). So in theory I should be able to
>
> I didn't see anything explicitly disable/enable preemption on smp_call_on_cpu.
> As per my understanding, your queued function is guaranteed to run on that cpu
> as it is queued on the cpu specific wq but it may be preempted by
> something else.
>
> For probing alignment speed, you just care about running it on that
> cpu. Correct ?

For this we care both about not migrating to other CPUs, and also
secondarily minimizing disturbances while the test is being run.
Usually I equate pre-emption with migration, but in this case I think
the worker threads are bound to that CPU. So I'll keep the
preempt_disable/enable where it is, since it's harmless for CPUs other
than 0, but useful for 0. I also like it for readability as it
highlights the critical section (as a reader, "is preemption disabled"
would be one of my first questions when studying this).

-Evan
Jisheng Zhang Sept. 21, 2023, 2:41 a.m. UTC | #7
On Wed, Sep 20, 2023 at 12:38:01PM -0700, Evan Green wrote:
> Probing for misaligned access speed takes about 0.06 seconds. On a
> system with 64 cores, doing this in smp_callin() means it's done
> serially, extending boot time by 3.8 seconds. That's a lot of boot time.
> 
> Instead of measuring each CPU serially, let's do the measurements on
> all CPUs in parallel. If we disable preemption on all CPUs, the
> jiffies stop ticking, so we can do this in stages of 1) everybody
> except core 0, then 2) core 0.
> 
> The measurement call in smp_callin() stays around, but is now
> conditionalized to only run if a new CPU shows up after the round of
> in-parallel measurements has run. The goal is to have the measurement
> call not run during boot or suspend/resume, but only on a hotplug
> addition.
> 
> Reported-by: Jisheng Zhang <jszhang@kernel.org>
> Closes: https://lore.kernel.org/all/mhng-9359993d-6872-4134-83ce-c97debe1cf9a@palmer-ri-x1c9/T/#mae9b8f40016f9df428829d33360144dc5026bcbf
> Fixes: 584ea6564bca ("RISC-V: Probe for unaligned access speed")
> Signed-off-by: Evan Green <evan@rivosinc.com>
> Reviewed-by: Andrew Jones <ajones@ventanamicro.com>
> Tested-by: Andrew Jones <ajones@ventanamicro.com>

Tested-by: Jisheng Zhang <jszhang@kernel.org>

> 
> ---
> 
> Changes in v2:
>  - Removed new global, used system_state == SYSTEM_RUNNING instead
>    (Jisheng)
>  - Added tags
> 
>  arch/riscv/include/asm/cpufeature.h |  2 +-
>  arch/riscv/kernel/cpufeature.c      | 22 +++++++++++++++++-----
>  arch/riscv/kernel/smpboot.c         | 11 ++++++++++-
>  3 files changed, 28 insertions(+), 7 deletions(-)
> 
> diff --git a/arch/riscv/include/asm/cpufeature.h b/arch/riscv/include/asm/cpufeature.h
> index d0345bd659c9..b139796392d0 100644
> --- a/arch/riscv/include/asm/cpufeature.h
> +++ b/arch/riscv/include/asm/cpufeature.h
> @@ -30,6 +30,6 @@ DECLARE_PER_CPU(long, misaligned_access_speed);
>  /* Per-cpu ISA extensions. */
>  extern struct riscv_isainfo hart_isa[NR_CPUS];
>  
> -void check_unaligned_access(int cpu);
> +int check_unaligned_access(void *unused);
>  
>  #endif
> diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c
> index 1cfbba65d11a..40bb854fcb96 100644
> --- a/arch/riscv/kernel/cpufeature.c
> +++ b/arch/riscv/kernel/cpufeature.c
> @@ -556,8 +556,9 @@ unsigned long riscv_get_elf_hwcap(void)
>  	return hwcap;
>  }
>  
> -void check_unaligned_access(int cpu)
> +int check_unaligned_access(void *unused)
>  {
> +	int cpu = smp_processor_id();
>  	u64 start_cycles, end_cycles;
>  	u64 word_cycles;
>  	u64 byte_cycles;
> @@ -571,7 +572,7 @@ void check_unaligned_access(int cpu)
>  	page = alloc_pages(GFP_NOWAIT, get_order(MISALIGNED_BUFFER_SIZE));
>  	if (!page) {
>  		pr_warn("Can't alloc pages to measure memcpy performance");
> -		return;
> +		return 0;
>  	}
>  
>  	/* Make an unaligned destination buffer. */
> @@ -643,15 +644,26 @@ void check_unaligned_access(int cpu)
>  
>  out:
>  	__free_pages(page, get_order(MISALIGNED_BUFFER_SIZE));
> +	return 0;
> +}
> +
> +static void check_unaligned_access_nonboot_cpu(void *param)
> +{
> +	if (smp_processor_id() != 0)
> +		check_unaligned_access(param);
>  }
>  
> -static int check_unaligned_access_boot_cpu(void)
> +static int check_unaligned_access_all_cpus(void)
>  {
> -	check_unaligned_access(0);
> +	/* Check everybody except 0, who stays behind to tend jiffies. */
> +	on_each_cpu(check_unaligned_access_nonboot_cpu, NULL, 1);
> +
> +	/* Check core 0. */
> +	smp_call_on_cpu(0, check_unaligned_access, NULL, true);
>  	return 0;
>  }
>  
> -arch_initcall(check_unaligned_access_boot_cpu);
> +arch_initcall(check_unaligned_access_all_cpus);
>  
>  #ifdef CONFIG_RISCV_ALTERNATIVE
>  /*
> diff --git a/arch/riscv/kernel/smpboot.c b/arch/riscv/kernel/smpboot.c
> index 1b8da4e40a4d..a014955b8699 100644
> --- a/arch/riscv/kernel/smpboot.c
> +++ b/arch/riscv/kernel/smpboot.c
> @@ -27,6 +27,7 @@
>  #include <linux/sched/mm.h>
>  #include <asm/cpu_ops.h>
>  #include <asm/cpufeature.h>
> +#include <asm/hwprobe.h>
>  #include <asm/irq.h>
>  #include <asm/mmu_context.h>
>  #include <asm/numa.h>
> @@ -246,7 +247,15 @@ asmlinkage __visible void smp_callin(void)
>  
>  	numa_add_cpu(curr_cpuid);
>  	set_cpu_online(curr_cpuid, 1);
> -	check_unaligned_access(curr_cpuid);
> +
> +	/*
> +	 * Boot-time misaligned access speed measurements are done in parallel
> +	 * in an initcall. Only measure here for hotplug.
> +	 */
> +	if ((system_state == SYSTEM_RUNNING) &&
> +	    (per_cpu(misaligned_access_speed, curr_cpuid) == RISCV_HWPROBE_MISALIGNED_UNKNOWN)) {
> +		check_unaligned_access(NULL);
> +	}
>  
>  	if (has_vector()) {
>  		if (riscv_v_setup_vsize())
> -- 
> 2.34.1
>
David Laight Sept. 21, 2023, 10:22 a.m. UTC | #8
...
> > For probing alignment speed, you just care about running it on that
> > cpu. Correct ?
> 
> For this we care both about not migrating to other CPUs, and also
> secondarily minimizing disturbances while the test is being run.
> Usually I equate pre-emption with migration, but in this case I think
> the worker threads are bound to that CPU. So I'll keep the
> preempt_disable/enable where it is, since it's harmless for CPUs other
> than 0, but useful for 0. I also like it for readability as it
> highlights the critical section (as a reader, "is preemption disabled"
> would be one of my first questions when studying this).

You need to disable pre-emption to get any kind of meaningful answer.

But why do you need to run the test on more than the boot cpu?
If you've a heterogenous mix of cpu any code that looks at the answer
is going to behave incorrectly unless it has also disabled pre-emption
or is bound to a cpu.

One obvious use of the result is to setup some static branches.
But that assumes all cpu are the same.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
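
A hypothetical sketch of the kind of consumer David is describing, not something in this patch: gate a fast path behind a static key that is only enabled once every online CPU has measured "fast", so the branch stays valid regardless of which CPU the code later runs on. The helper name and the point at which it would be called are made up for illustration.

#include <linux/cpumask.h>
#include <linux/jump_label.h>
#include <linux/percpu.h>
#include <asm/cpufeature.h>
#include <asm/hwprobe.h>

static DEFINE_STATIC_KEY_FALSE(fast_misaligned_access);

/* Hypothetical: call once all CPUs have been probed. */
static void update_fast_misaligned_key(void)
{
	int cpu;

	for_each_online_cpu(cpu) {
		/* One slow or unknown CPU rules out the uniform fast path. */
		if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_FAST)
			return;
	}

	static_branch_enable(&fast_misaligned_access);
}

Callers would then test static_branch_likely(&fast_misaligned_access) to pick between a misaligned-tolerant copy and a byte-wise fallback.
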
Evan Green Sept. 21, 2023, 4:44 p.m. UTC | #9
On Thu, Sep 21, 2023 at 3:22 AM David Laight <David.Laight@aculab.com> wrote:
>
> ...
> > > For probing alignment speed, you just care about running it on that
> > > cpu. Correct ?
> >
> > For this we care both about not migrating to other CPUs, and also
> > secondarily minimizing disturbances while the test is being run.
> > Usually I equate pre-emption with migration, but in this case I think
> > the worker threads are bound to that CPU. So I'll keep the
> > preempt_disable/enable where it is, since it's harmless for CPUs other
> > than 0, but useful for 0. I also like it for readability as it
> > highlights the critical section (as a reader, "is preemption disabled"
> > would be one of my first questions when studying this).
>
> You need to disable pre-emption to get any kind of meaningful answer.
>
> But why do you need to run the test on more than the boot cpu?
> If you've a heterogenous mix of cpu any code that looks at the answer
> is going to behave incorrectly unless it has also disabled pre-emption
> or is bound to a cpu.

I don't think it's safe to assume misaligned access speed is the same
across all cores. In a big.little combination I can easily imagine the
big cores having fast misaligned access and the slow cores not having
it (though hopefully the slow cores don't kick it to firmware). Since
this info is presented to usermode per-cpu, I'd like it to be correct.

>
> One obvious use of the result is to setup some static branches.
> But that assumes all cpu are the same.

Right, this could be used to set up static branches, or in an ifunc
selector. This is why we provide pre-computed answers for "all CPUs"
in hwprobe. If the situation I describe above did happen, code asking
on behalf of all CPUs would come back "unknown", which is true (though
would probably default to the byte-copy version). More specific
environments might choose to curate their scheduling a bit more, and
this info per-core would be useful there.

-Evan
Evan Green Sept. 21, 2023, 4:49 p.m. UTC | #10
On Thu, Sep 21, 2023 at 9:44 AM Evan Green <evan@rivosinc.com> wrote:
>
> On Thu, Sep 21, 2023 at 3:22 AM David Laight <David.Laight@aculab.com> wrote:
> >
> > ...
> > > > For probing alignment speed, you just care about running it on that
> > > > cpu. Correct ?
> > >
> > > For this we care both about not migrating to other CPUs, and also
> > > secondarily minimizing disturbances while the test is being run.
> > > Usually I equate pre-emption with migration, but in this case I think
> > > the worker threads are bound to that CPU. So I'll keep the
> > > preempt_disable/enable where it is, since it's harmless for CPUs other
> > > than 0, but useful for 0. I also like it for readability as it
> > > highlights the critical section (as a reader, "is preemption disabled"
> > > would be one of my first questions when studying this).
> >
> > You need to disable pre-emption to get any kind of meaningful answer.
> >
> > But why do you need to run the test on more than the boot cpu?
> > If you've a heterogenous mix of cpu any code that looks at the answer
> > is going to behave incorrectly unless it has also disabled pre-emption
> > or is bound to a cpu.
>
> I don't think it's safe to assume misaligned access speed is the same
> across all cores. In a big.little combination I can easily imagine the
> big cores having fast misaligned access and the slow cores not having
> it (though hopefully the slow cores don't kick it to firmware). Since
> this info is presented to usermode per-cpu, I'd like it to be correct.
>
> >
> > One obvious use of the result is to setup some static branches.
> > But that assumes all cpu are the same.
>
> Right, this could be used to set up static branches, or in an ifunc
> selector. This is why we provide pre-computed answers for "all CPUs"
> in hwprobe. If the situation I describe above did happen, code asking

Correction: Not exactly precomputed answers, but cached vDSO data
capable of quickly answering usermode queries for systems with
homogeneous CPUs.
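
For completeness, a small userspace sketch of the query path Evan describes, using the raw syscall rather than a vDSO or libc wrapper and assuming kernel headers new enough to define __NR_riscv_hwprobe and RISCV_HWPROBE_KEY_CPUPERF_0 (v6.4 or later). Passing cpusetsize == 0 and cpus == NULL asks about all online CPUs at once, which corresponds to the "all CPUs" answer discussed above.

#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <asm/hwprobe.h>

int main(void)
{
	struct riscv_hwprobe pair = { .key = RISCV_HWPROBE_KEY_CPUPERF_0 };

	if (syscall(__NR_riscv_hwprobe, &pair, 1, 0, NULL, 0) != 0) {
		perror("riscv_hwprobe");
		return 1;
	}

	switch (pair.value & RISCV_HWPROBE_MISALIGNED_MASK) {
	case RISCV_HWPROBE_MISALIGNED_FAST:
		printf("misaligned accesses: fast on all online CPUs\n");
		break;
	case RISCV_HWPROBE_MISALIGNED_SLOW:
		printf("misaligned accesses: slow on all online CPUs\n");
		break;
	default:
		printf("misaligned accesses: unknown, emulated, or mixed\n");
		break;
	}

	return 0;
}
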
Conor Dooley Sept. 29, 2023, 11:19 a.m. UTC | #11
On Wed, Sep 20, 2023 at 11:25:02PM +0100, Conor Dooley wrote:

> Ahh great, thanks for the explanations!


Just to make it clear that the questions I had were answered,
Acked-by: Conor Dooley <conor.dooley@microchip.com>

Thanks,
Conor.
patchwork-bot+linux-riscv@kernel.org Nov. 2, 2023, 8:20 p.m. UTC | #12
Hello:

This patch was applied to riscv/linux.git (for-next)
by Palmer Dabbelt <palmer@rivosinc.com>:

On Wed, 20 Sep 2023 12:38:01 -0700 you wrote:
> Probing for misaligned access speed takes about 0.06 seconds. On a
> system with 64 cores, doing this in smp_callin() means it's done
> serially, extending boot time by 3.8 seconds. That's a lot of boot time.
> 
> Instead of measuring each CPU serially, let's do the measurements on
> all CPUs in parallel. If we disable preemption on all CPUs, the
> jiffies stop ticking, so we can do this in stages of 1) everybody
> except core 0, then 2) core 0.
> 
> [...]

Here is the summary with links:
  - [v2] RISC-V: Probe misaligned access speed in parallel
    https://git.kernel.org/riscv/c/0c37376609da

You are awesome, thank you!

Patch

diff --git a/arch/riscv/include/asm/cpufeature.h b/arch/riscv/include/asm/cpufeature.h
index d0345bd659c9..b139796392d0 100644
--- a/arch/riscv/include/asm/cpufeature.h
+++ b/arch/riscv/include/asm/cpufeature.h
@@ -30,6 +30,6 @@  DECLARE_PER_CPU(long, misaligned_access_speed);
 /* Per-cpu ISA extensions. */
 extern struct riscv_isainfo hart_isa[NR_CPUS];
 
-void check_unaligned_access(int cpu);
+int check_unaligned_access(void *unused);
 
 #endif
diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c
index 1cfbba65d11a..40bb854fcb96 100644
--- a/arch/riscv/kernel/cpufeature.c
+++ b/arch/riscv/kernel/cpufeature.c
@@ -556,8 +556,9 @@  unsigned long riscv_get_elf_hwcap(void)
 	return hwcap;
 }
 
-void check_unaligned_access(int cpu)
+int check_unaligned_access(void *unused)
 {
+	int cpu = smp_processor_id();
 	u64 start_cycles, end_cycles;
 	u64 word_cycles;
 	u64 byte_cycles;
@@ -571,7 +572,7 @@  void check_unaligned_access(int cpu)
 	page = alloc_pages(GFP_NOWAIT, get_order(MISALIGNED_BUFFER_SIZE));
 	if (!page) {
 		pr_warn("Can't alloc pages to measure memcpy performance");
-		return;
+		return 0;
 	}
 
 	/* Make an unaligned destination buffer. */
@@ -643,15 +644,26 @@  void check_unaligned_access(int cpu)
 
 out:
 	__free_pages(page, get_order(MISALIGNED_BUFFER_SIZE));
+	return 0;
+}
+
+static void check_unaligned_access_nonboot_cpu(void *param)
+{
+	if (smp_processor_id() != 0)
+		check_unaligned_access(param);
 }
 
-static int check_unaligned_access_boot_cpu(void)
+static int check_unaligned_access_all_cpus(void)
 {
-	check_unaligned_access(0);
+	/* Check everybody except 0, who stays behind to tend jiffies. */
+	on_each_cpu(check_unaligned_access_nonboot_cpu, NULL, 1);
+
+	/* Check core 0. */
+	smp_call_on_cpu(0, check_unaligned_access, NULL, true);
 	return 0;
 }
 
-arch_initcall(check_unaligned_access_boot_cpu);
+arch_initcall(check_unaligned_access_all_cpus);
 
 #ifdef CONFIG_RISCV_ALTERNATIVE
 /*
diff --git a/arch/riscv/kernel/smpboot.c b/arch/riscv/kernel/smpboot.c
index 1b8da4e40a4d..a014955b8699 100644
--- a/arch/riscv/kernel/smpboot.c
+++ b/arch/riscv/kernel/smpboot.c
@@ -27,6 +27,7 @@ 
 #include <linux/sched/mm.h>
 #include <asm/cpu_ops.h>
 #include <asm/cpufeature.h>
+#include <asm/hwprobe.h>
 #include <asm/irq.h>
 #include <asm/mmu_context.h>
 #include <asm/numa.h>
@@ -246,7 +247,15 @@  asmlinkage __visible void smp_callin(void)
 
 	numa_add_cpu(curr_cpuid);
 	set_cpu_online(curr_cpuid, 1);
-	check_unaligned_access(curr_cpuid);
+
+	/*
+	 * Boot-time misaligned access speed measurements are done in parallel
+	 * in an initcall. Only measure here for hotplug.
+	 */
+	if ((system_state == SYSTEM_RUNNING) &&
+	    (per_cpu(misaligned_access_speed, curr_cpuid) == RISCV_HWPROBE_MISALIGNED_UNKNOWN)) {
+		check_unaligned_access(NULL);
+	}
 
 	if (has_vector()) {
 		if (riscv_v_setup_vsize())