[v2,2/2] ARM: Replace calls to __aeabi_{u}idiv with udiv/sdiv instructions

Message ID 1448488264-23400-3-git-send-email-sboyd@codeaurora.org (mailing list archive)
State New, archived

Commit Message

Stephen Boyd Nov. 25, 2015, 9:51 p.m. UTC
The ARM compiler inserts calls to __aeabi_uidiv() and
__aeabi_idiv() when it needs to perform division on signed and
unsigned integers. If a processor has support for the udiv and
sdiv division instructions the calls to these support routines
can be replaced with those instructions. Now that recordmcount
records the locations of calls to these library functions in
two sections (one for udiv and one for sdiv), iterate over these
sections early at boot and patch the call sites with the
appropriate division instruction when we determine that the
processor supports the division instructions. Using the division
instructions should be faster and less power intensive than
running the support code.
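
For illustration, in ARM mode a call site goes from

    mov   r0, r4            @ dividend
    mov   r1, r5            @ divisor
    bl    __aeabi_uidiv     @ quotient returned in r0

to

    mov   r0, r4
    mov   r1, r5
    udiv  r0, r0, r1        @ the bl word is overwritten in place

(the surrounding moves and registers here are only illustrative; only
the 4-byte bl itself is rewritten).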

Cc: Nicolas Pitre <nico@fluxnic.net>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Måns Rullgård <mans@mansr.com>
Cc: Thomas Petazzoni <thomas.petazzoni@free-electrons.com>
Signed-off-by: Stephen Boyd <sboyd@codeaurora.org>
---
 arch/arm/Kconfig               | 14 +++++++++
 arch/arm/include/asm/cputype.h |  8 ++++-
 arch/arm/include/asm/setup.h   |  3 ++
 arch/arm/kernel/module.c       | 40 +++++++++++++++++++++++++
 arch/arm/kernel/setup.c        | 68 ++++++++++++++++++++++++++++++++++++++++++
 arch/arm/kernel/vmlinux.lds.S  | 13 ++++++++
 6 files changed, 145 insertions(+), 1 deletion(-)

Comments

Nicolas Pitre Nov. 25, 2015, 11:09 p.m. UTC | #1
On Wed, 25 Nov 2015, Stephen Boyd wrote:

> The ARM compiler inserts calls to __aeabi_uidiv() and
> __aeabi_idiv() when it needs to perform division on signed and
> unsigned integers. If a processor has support for the udiv and
> sdiv division instructions the calls to these support routines
> can be replaced with those instructions. Now that recordmcount
> records the locations of calls to these library functions in
> two sections (one for udiv and one for sdiv), iterate over these
> sections early at boot and patch the call sites with the
> appropriate division instruction when we determine that the
> processor supports the division instructions. Using the division
> instructions should be faster and less power intensive than
> running the support code.

A few remarks:

1) The code assumes unconditional branches to __aeabi_idiv and 
   __aeabi_uidiv. What if there are conditional branches? Also, tail 
   call optimizations will generate a straight b opcode rather than a bl 
   and patching those will obviously have catastrophic results.  I think 
   you should validate the presence of a bl before patching over it (a 
   sketch of such a check follows after these remarks).

2) For those cases where a call to __aeabi_uidiv and __aeabi_idiv is not 
   patched due to (1), you could patch a udiv/sdiv plus "bx lr" 
   at those function entry points too.

3) In fact I was wondering if the overhead of the branch and back is 
   really significant compared to the non-trivial cost of an idiv 
   instruction and all the complex infrastructure required to patch 
   those branches directly, and consequently if the performance 
   difference is actually worth it versus simply doing (2) alone.

4) Please add some printing to the boot log (debug level should be fine)  
   about the fact that you did modify n branch instances with a div 
   insn. That _could_ turn out to be a useful clue when debugging kernel 
   "strangeties".

> Cc: Nicolas Pitre <nico@fluxnic.net>
> Cc: Arnd Bergmann <arnd@arndb.de>
> Cc: Steven Rostedt <rostedt@goodmis.org>
> Cc: Måns Rullgård <mans@mansr.com>
> Cc: Thomas Petazzoni <thomas.petazzoni@free-electrons.com>
> Signed-off-by: Stephen Boyd <sboyd@codeaurora.org>
> ---
>  arch/arm/Kconfig               | 14 +++++++++
>  arch/arm/include/asm/cputype.h |  8 ++++-
>  arch/arm/include/asm/setup.h   |  3 ++
>  arch/arm/kernel/module.c       | 40 +++++++++++++++++++++++++
>  arch/arm/kernel/setup.c        | 68 ++++++++++++++++++++++++++++++++++++++++++
>  arch/arm/kernel/vmlinux.lds.S  | 13 ++++++++
>  6 files changed, 145 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
> index 0365cbbc9179..aa8bc7da6331 100644
> --- a/arch/arm/Kconfig
> +++ b/arch/arm/Kconfig
> @@ -1617,6 +1617,20 @@ config AEABI
>  
>  	  To use this you need GCC version 4.0.0 or later.
>  
> +config ARM_PATCH_UIDIV
> +	bool "Runtime patch calls to __aeabi_{u}idiv() with udiv/sdiv"
> +	depends on CPU_32v7 && !XIP_KERNEL && AEABI
> +	help
> +	  Some v7 CPUs have support for the udiv and sdiv instructions
> +	  that can be used in place of calls to __aeabi_uidiv and __aeabi_idiv
> +	  functions provided by the ARM runtime ABI.
> +
> +	  Enabling this option allows the kernel to modify itself to replace
> +	  branches to these library functions with the udiv and sdiv
> +	  instructions themselves. Typically this will be faster and less
> +	  power intensive than running the library support code to do
> +	  integer division.
> +
>  config OABI_COMPAT
>  	bool "Allow old ABI binaries to run with this kernel (EXPERIMENTAL)"
>  	depends on AEABI && !THUMB2_KERNEL
> diff --git a/arch/arm/include/asm/cputype.h b/arch/arm/include/asm/cputype.h
> index 85e374f873ac..48c77d422a0d 100644
> --- a/arch/arm/include/asm/cputype.h
> +++ b/arch/arm/include/asm/cputype.h
> @@ -250,8 +250,14 @@ static inline int cpu_is_pj4(void)
>  
>  	return 0;
>  }
> +
> +static inline bool __attribute_const__ cpu_is_pj4_nomp(void)
> +{
> +	return read_cpuid_part() == 0x56005810;
> +}
>  #else
> -#define cpu_is_pj4()	0
> +#define cpu_is_pj4()		0
> +#define cpu_is_pj4_nomp()	0
>  #endif
>  
>  static inline int __attribute_const__ cpuid_feature_extract_field(u32 features,
> diff --git a/arch/arm/include/asm/setup.h b/arch/arm/include/asm/setup.h
> index e0adb9f1bf94..a514552d5cbd 100644
> --- a/arch/arm/include/asm/setup.h
> +++ b/arch/arm/include/asm/setup.h
> @@ -25,4 +25,7 @@ extern int arm_add_memory(u64 start, u64 size);
>  extern void early_print(const char *str, ...);
>  extern void dump_machine_table(void);
>  
> +extern void patch_udiv(void *addr, size_t size);
> +extern void patch_sdiv(void *addr, size_t size);
> +
>  #endif
> diff --git a/arch/arm/kernel/module.c b/arch/arm/kernel/module.c
> index efdddcb97dd1..aa59e5cfe6a0 100644
> --- a/arch/arm/kernel/module.c
> +++ b/arch/arm/kernel/module.c
> @@ -20,7 +20,9 @@
>  #include <linux/string.h>
>  #include <linux/gfp.h>
>  
> +#include <asm/hwcap.h>
>  #include <asm/pgtable.h>
> +#include <asm/setup.h>
>  #include <asm/sections.h>
>  #include <asm/smp_plat.h>
>  #include <asm/unwind.h>
> @@ -51,6 +53,38 @@ void *module_alloc(unsigned long size)
>  }
>  #endif
>  
> +#ifdef CONFIG_ARM_PATCH_UIDIV
> +static int module_patch_aeabi_uidiv(unsigned long loc, const Elf32_Sym *sym)
> +{
> +	extern char __aeabi_uidiv[], __aeabi_idiv[];
> +	unsigned long udiv_addr = (unsigned long)__aeabi_uidiv;
> +	unsigned long sdiv_addr = (unsigned long)__aeabi_idiv;
> +	unsigned int mask;
> +
> +	if (IS_ENABLED(CONFIG_THUMB2_KERNEL))
> +		mask = HWCAP_IDIVT;
> +	else
> +		mask = HWCAP_IDIVA;
> +
> +	if (elf_hwcap & mask) {
> +		if (sym->st_value == udiv_addr) {
> +			patch_udiv(&loc, sizeof(loc));
> +			return 1;
> +		} else if (sym->st_value == sdiv_addr) {
> +			patch_sdiv(&loc, sizeof(loc));
> +			return 1;
> +		}
> +	}
> +
> +	return 0;
> +}
> +#else
> +static int module_patch_aeabi_uidiv(unsigned long loc, const Elf32_Sym *sym)
> +{
> +	return 0;
> +}
> +#endif
> +
>  int
>  apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex,
>  	       unsigned int relindex, struct module *module)
> @@ -109,6 +143,9 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex,
>  				return -ENOEXEC;
>  			}
>  
> +			if (module_patch_aeabi_uidiv(loc, sym))
> +				break;
> +
>  			offset = __mem_to_opcode_arm(*(u32 *)loc);
>  			offset = (offset & 0x00ffffff) << 2;
>  			if (offset & 0x02000000)
> @@ -195,6 +232,9 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex,
>  				return -ENOEXEC;
>  			}
>  
> +			if (module_patch_aeabi_uidiv(loc, sym))
> +				break;
> +
>  			upper = __mem_to_opcode_thumb16(*(u16 *)loc);
>  			lower = __mem_to_opcode_thumb16(*(u16 *)(loc + 2));
>  
> diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c
> index 20edd349d379..39a46059d5e8 100644
> --- a/arch/arm/kernel/setup.c
> +++ b/arch/arm/kernel/setup.c
> @@ -32,6 +32,7 @@
>  #include <linux/compiler.h>
>  #include <linux/sort.h>
>  #include <linux/psci.h>
> +#include <linux/module.h>
>  
>  #include <asm/unified.h>
>  #include <asm/cp15.h>
> @@ -375,6 +376,72 @@ void __init early_print(const char *str, ...)
>  	printk("%s", buf);
>  }
>  
> +#ifdef CONFIG_ARM_PATCH_UIDIV
> +/* "sdiv r0, r0, r1" or "mrc p6, 1, r0, CR0, CR1, 4" if we're on pj4 w/o MP */
> +static u32 __attribute_const__ sdiv_instruction(void)
> +{
> +	if (IS_ENABLED(CONFIG_THUMB2_KERNEL)) {
> +		if (cpu_is_pj4_nomp())
> +			return __opcode_to_mem_thumb32(0xee300691);
> +		return __opcode_to_mem_thumb32(0xfb90f0f1);
> +	}
> +
> +	if (cpu_is_pj4_nomp())
> +		return __opcode_to_mem_arm(0xee300691);
> +	return __opcode_to_mem_arm(0xe710f110);
> +}
> +
> +/* "udiv r0, r0, r1" or "mrc p6, 1, r0, CR0, CR1, 0" if we're on pj4 w/o MP */
> +static u32 __attribute_const__ udiv_instruction(void)
> +{
> +	if (IS_ENABLED(CONFIG_THUMB2_KERNEL)) {
> +		if (cpu_is_pj4_nomp())
> +			return __opcode_to_mem_thumb32(0xee300611);
> +		return __opcode_to_mem_thumb32(0xfbb0f0f1);
> +	}
> +
> +	if (cpu_is_pj4_nomp())
> +		return __opcode_to_mem_arm(0xee300611);
> +	return __opcode_to_mem_arm(0xe730f110);
> +}
> +
> +static void __init_or_module patch(u32 **addr, size_t count, u32 insn)
> +{
> +	for (; count != 0; count -= 4)
> +		**addr++ = insn;
> +}
> +
> +void __init_or_module patch_udiv(void *addr, size_t size)
> +{
> +	patch(addr, size, udiv_instruction());
> +}
> +
> +void __init_or_module patch_sdiv(void *addr, size_t size)
> +{
> +	return patch(addr, size, sdiv_instruction());
> +}
> +
> +static void __init patch_aeabi_uidiv(void)
> +{
> +	extern char __start_udiv_loc[], __stop_udiv_loc[];
> +	extern char __start_idiv_loc[], __stop_idiv_loc[];
> +	unsigned int mask;
> +
> +	if (IS_ENABLED(CONFIG_THUMB2_KERNEL))
> +		mask = HWCAP_IDIVT;
> +	else
> +		mask = HWCAP_IDIVA;
> +
> +	if (!(elf_hwcap & mask))
> +		return;
> +
> +	patch_udiv(__start_udiv_loc, __stop_udiv_loc - __start_udiv_loc);
> +	patch_sdiv(__start_idiv_loc, __stop_idiv_loc - __start_idiv_loc);
> +}
> +#else
> +static void __init patch_aeabi_uidiv(void) { }
> +#endif
> +
>  static void __init cpuid_init_hwcaps(void)
>  {
>  	int block;
> @@ -642,6 +709,7 @@ static void __init setup_processor(void)
>  	elf_hwcap = list->elf_hwcap;
>  
>  	cpuid_init_hwcaps();
> +	patch_aeabi_uidiv();
>  
>  #ifndef CONFIG_ARM_THUMB
>  	elf_hwcap &= ~(HWCAP_THUMB | HWCAP_IDIVT);
> diff --git a/arch/arm/kernel/vmlinux.lds.S b/arch/arm/kernel/vmlinux.lds.S
> index 8b60fde5ce48..bc87a2e04e6f 100644
> --- a/arch/arm/kernel/vmlinux.lds.S
> +++ b/arch/arm/kernel/vmlinux.lds.S
> @@ -28,6 +28,18 @@
>  	*(.hyp.idmap.text)						\
>  	VMLINUX_SYMBOL(__hyp_idmap_text_end) = .;
>  
> +#ifdef CONFIG_ARM_PATCH_UIDIV
> +#define UIDIV_REC	. = ALIGN(8);					\
> +			VMLINUX_SYMBOL(__start_udiv_loc) = .;		\
> +			*(__udiv_loc)					\
> +			VMLINUX_SYMBOL(__stop_udiv_loc) = .;		\
> +			VMLINUX_SYMBOL(__start_idiv_loc) = .;		\
> +			*(__idiv_loc)					\
> +			VMLINUX_SYMBOL(__stop_idiv_loc) = .;
> +#else
> +#define UIDIV_REC
> +#endif
> +
>  #ifdef CONFIG_HOTPLUG_CPU
>  #define ARM_CPU_DISCARD(x)
>  #define ARM_CPU_KEEP(x)		x
> @@ -210,6 +222,7 @@ SECTIONS
>  	.init.data : {
>  #ifndef CONFIG_XIP_KERNEL
>  		INIT_DATA
> +		UIDIV_REC
>  #endif
>  		INIT_SETUP(16)
>  		INIT_CALLS
> -- 
> The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
> a Linux Foundation Collaborative Project
> 
>
Russell King - ARM Linux Nov. 26, 2015, 12:05 a.m. UTC | #2
On Wed, Nov 25, 2015 at 06:09:13PM -0500, Nicolas Pitre wrote:
> 3) In fact I was wondering if the overhead of the branch and back is 
>    really significant compared to the non trivial cost of a idiv 
>    instruction and all the complex infrastructure required to patch 
>    those branches directly, and consequently if the performance 
>    difference is actually worth it versus simply doing (2) alone.

I definitely agree with you on this, given that the CPUs which are
going to benefit from idiv are modern CPUs with a branch predictor
(and if it's not predicting such unconditional calls and returns, it's
not much use as a branch predictor!)

I think what we need to see is the performance of existing kernels,
vs patching the idiv instructions at every callsite, vs patching
the called function itself.

> > +#ifdef CONFIG_ARM_PATCH_UIDIV
> > +/* "sdiv r0, r0, r1" or "mrc p6, 1, r0, CR0, CR1, 4" if we're on pj4 w/o MP */
> > +static u32 __attribute_const__ sdiv_instruction(void)
> > +{
> > +	if (IS_ENABLED(CONFIG_THUMB2_KERNEL)) {
> > +		if (cpu_is_pj4_nomp())
> > +			return __opcode_to_mem_thumb32(0xee300691);
> > +		return __opcode_to_mem_thumb32(0xfb90f0f1);
> > +	}
> > +
> > +	if (cpu_is_pj4_nomp())
> > +		return __opcode_to_mem_arm(0xee300691);
> > +	return __opcode_to_mem_arm(0xe710f110);
> > +}
> > +
> > +/* "udiv r0, r0, r1" or "mrc p6, 1, r0, CR0, CR1, 0" if we're on pj4 w/o MP */
> > +static u32 __attribute_const__ udiv_instruction(void)
> > +{
> > +	if (IS_ENABLED(CONFIG_THUMB2_KERNEL)) {
> > +		if (cpu_is_pj4_nomp())
> > +			return __opcode_to_mem_thumb32(0xee300611);
> > +		return __opcode_to_mem_thumb32(0xfbb0f0f1);
> > +	}
> > +
> > +	if (cpu_is_pj4_nomp())
> > +		return __opcode_to_mem_arm(0xee300611);
> > +	return __opcode_to_mem_arm(0xe730f110);
> > +}

Any reason the above aren't marked with __init_or_module as well, as
the compiler can choose not to inline them?

> > +
> > +static void __init_or_module patch(u32 **addr, size_t count, u32 insn)
> > +{
> > +	for (; count != 0; count -= 4)
> > +		**addr++ = insn;
> > +}
> > +
> > +void __init_or_module patch_udiv(void *addr, size_t size)
> > +{
> > +	patch(addr, size, udiv_instruction());
> > +}
> > +
> > +void __init_or_module patch_sdiv(void *addr, size_t size)
> > +{
> > +	return patch(addr, size, sdiv_instruction());
> > +}
> > +
> > +static void __init patch_aeabi_uidiv(void)
> > +{
> > +	extern char __start_udiv_loc[], __stop_udiv_loc[];
> > +	extern char __start_idiv_loc[], __stop_idiv_loc[];
> > +	unsigned int mask;
> > +
> > +	if (IS_ENABLED(CONFIG_THUMB2_KERNEL))
> > +		mask = HWCAP_IDIVT;
> > +	else
> > +		mask = HWCAP_IDIVA;
> > +
> > +	if (!(elf_hwcap & mask))
> > +		return;
> > +
> > +	patch_udiv(__start_udiv_loc, __stop_udiv_loc - __start_udiv_loc);
> > +	patch_sdiv(__start_idiv_loc, __stop_idiv_loc - __start_idiv_loc);

I'm left really concerned about this.  We're modifying code with all
the caches on, and the above is not only missing any coherency of the
I/D paths, it's also missing any branch predictor maintenance.  So, if
we've executed any divisions at this point, the predictor could already
have predicted one of these branches that's being modified.
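
The minimal maintenance for a single ARM-mode site would be something
along these lines (a sketch only, assuming site is a u32 * to the call
site and insn is already in memory order; the existing patch_text()
helper wraps this kind of sequence and also copes with Thumb-2):

    *site = insn;
    flush_icache_range((unsigned long)site,
                       (unsigned long)site + sizeof(*site));
    isb();    /* don't execute anything stale already in the pipeline */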
Måns Rullgård Nov. 26, 2015, 12:07 a.m. UTC | #3
Nicolas Pitre <nico@fluxnic.net> writes:

> On Wed, 25 Nov 2015, Stephen Boyd wrote:
>
>> The ARM compiler inserts calls to __aeabi_uidiv() and
>> __aeabi_idiv() when it needs to perform division on signed and
>> unsigned integers. If a processor has support for the udiv and
>> sdiv division instructions the calls to these support routines
>> can be replaced with those instructions. Now that recordmcount
>> records the locations of calls to these library functions in
>> two sections (one for udiv and one for sdiv), iterate over these
>> sections early at boot and patch the call sites with the
>> appropriate division instruction when we determine that the
>> processor supports the division instructions. Using the division
>> instructions should be faster and less power intensive than
>> running the support code.
>
> A few remarks:
>
> 1) The code assumes unconditional branches to __aeabi_idiv and 
>    __aeabi_uidiv. What if there are conditional branches? Also, tail 
>    call optimizations will generate a straight b opcode rather than a bl 
>    and patching those will obviously have catastrophic results.  I think 
>    you should validate the presence of a bl before patching over it.

I did a quick check on a compiled kernel I had nearby, and there are no
conditional or tail calls to those functions, so although they should
obviously be checked for correctness, performance is unlikely to matter
for those.

However, there are almost half as many calls to __aeabi_{u}idivmod as to
the plain div functions, 129 vs 228 for signed and unsigned combined.
For best results, these functions should also be patched with the
hardware instructions.  Obviously the call sites for these can't be
patched.
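
For the unsigned case the hardware-assisted entry point is tiny; a
sketch, assuming the AEABI convention of quotient returned in r0 and
remainder in r1:

    @ sketch of a udiv-based __aeabi_uidivmod
    @ in:  r0 = numerator, r1 = denominator
    udiv    r2, r0, r1
    mls     r1, r2, r1, r0      @ remainder = numerator - quotient * denominator
    mov     r0, r2
    bx      lr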

> 2) For those cases where a call to __aeabi_uidiv and __aeabi_idiv is not 
>    patched due to (1), you could patch a uidiv/idiv plus "bx lr" 
>    at those function entry points too.
>
> 3) In fact I was wondering if the overhead of the branch and back is 
>    really significant compared to the non trivial cost of a idiv 
>    instruction and all the complex infrastructure required to patch 
>    those branches directly, and consequently if the performance 
>    difference is actually worth it versus simply doing (2) alone.

Depending on the operands, the div instruction can take as few as 3
cycles on a Cortex-A7.
Russell King - ARM Linux Nov. 26, 2015, 12:08 a.m. UTC | #4
On Wed, Nov 25, 2015 at 01:51:04PM -0800, Stephen Boyd wrote:
> diff --git a/arch/arm/include/asm/cputype.h b/arch/arm/include/asm/cputype.h
> index 85e374f873ac..48c77d422a0d 100644
> --- a/arch/arm/include/asm/cputype.h
> +++ b/arch/arm/include/asm/cputype.h
> @@ -250,8 +250,14 @@ static inline int cpu_is_pj4(void)
>  
>  	return 0;
>  }
> +
> +static inline bool __attribute_const__ cpu_is_pj4_nomp(void)
> +{
> +	return read_cpuid_part() == 0x56005810;

One other thing here, we really ought to avoid adding more magic numbers
to this file.  We have the ARM processors nicely #defined, but not a lot
else.  Maybe it's time we continued with the nice definitions for
these part numbers?
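
Something along these lines, next to the existing ARM_CPU_PART_*
definitions (the macro name below is only illustrative, not from the
patch):

    #define ARM_CPU_PART_PJ4_NOMP   0x56005810

    static inline bool __attribute_const__ cpu_is_pj4_nomp(void)
    {
            return read_cpuid_part() == ARM_CPU_PART_PJ4_NOMP;
    }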
Nicolas Pitre Nov. 26, 2015, 12:44 a.m. UTC | #5
On Thu, 26 Nov 2015, Måns Rullgård wrote:

> Nicolas Pitre <nico@fluxnic.net> writes:
> 
> > On Wed, 25 Nov 2015, Stephen Boyd wrote:
> >
> >> The ARM compiler inserts calls to __aeabi_uidiv() and
> >> __aeabi_idiv() when it needs to perform division on signed and
> >> unsigned integers. If a processor has support for the udiv and
> >> sdiv division instructions the calls to these support routines
> >> can be replaced with those instructions. Now that recordmcount
> >> records the locations of calls to these library functions in
> >> two sections (one for udiv and one for sdiv), iterate over these
> >> sections early at boot and patch the call sites with the
> >> appropriate division instruction when we determine that the
> >> processor supports the division instructions. Using the division
> >> instructions should be faster and less power intensive than
> >> running the support code.
> >
> > A few remarks:
> >
> > 1) The code assumes unconditional branches to __aeabi_idiv and 
> >    __aeabi_uidiv. What if there are conditional branches? Also, tail 
> >    call optimizations will generate a straight b opcode rather than a bl 
> >    and patching those will obviously have catastrophic results.  I think 
> >    you should validate the presence of a bl before patching over it.
> 
> I did a quick check on a compiled kernel I had nearby, and there are no
> conditional or tail calls to those functions, so although they should
> obviously be checked for correctness, performance is unlikely to matter
> for those.

I'm more worried about correctness than performance.  This kind of 
patching should be done defensively.

> However, there are almost half as many calls to __aeabi_{u}idivmod as to
> the plain div functions, 129 vs 228 for signed and unsigned combined.
> For best results, these functions should also be patched with the
> hardware instructions.  Obviously the call sites for these can't be
> patched.

Current __aeabi_{u}idivmod implementations are simple wrappers around a 
call to __aeabi_{u}idiv so they'd get the benefit automatically 
regardless of the chosen approach.

> > 2) For those cases where a call to __aeabi_uidiv and __aeabi_idiv is not 
> >    patched due to (1), you could patch a uidiv/idiv plus "bx lr" 
> >    at those function entry points too.
> >
> > 3) In fact I was wondering if the overhead of the branch and back is 
> >    really significant compared to the non trivial cost of a idiv 
> >    instruction and all the complex infrastructure required to patch 
> >    those branches directly, and consequently if the performance 
> >    difference is actually worth it versus simply doing (2) alone.
> 
> Depending on the operands, the div instruction can take as few as 3
> cycles on a Cortex-A7.

Even the current software-based implementation can produce a result in
about 5 simple ALU instructions, depending on the operands.

The average cycle count is more important than the easy-way-out case.
And then there is the question of how significant the two branches
around it are compared to an idiv alone from directly patching every
call site.


Nicolas
Måns Rullgård Nov. 26, 2015, 12:50 a.m. UTC | #6
Nicolas Pitre <nico@fluxnic.net> writes:

> On Thu, 26 Nov 2015, Måns Rullgård wrote:
>
>> Nicolas Pitre <nico@fluxnic.net> writes:
>> 
>> > 3) In fact I was wondering if the overhead of the branch and back is 
>> >    really significant compared to the non trivial cost of a idiv 
>> >    instruction and all the complex infrastructure required to patch 
>> >    those branches directly, and consequently if the performance 
>> >    difference is actually worth it versus simply doing (2) alone.
>> 
>> Depending on the operands, the div instruction can take as few as 3
>> cycles on a Cortex-A7.
>
> Even the current software based implementation can produce a result with 
> about 5 simple ALU instructions depending on the operands.
>
> The average cycle count is more important than the easy-way-out case. 
> And then how significant the two branches around it are compared to idiv 
> alone from direct patching of every call to it.

If not calling the function saves an I-cache miss, the benefit can be
substantial.  No, I have no proof of this being a problem, but it's
something that could happen.

Of course, none of this is going to be as good as letting the compiler
generate div instructions directly.
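
For comparison, on a target known to have idiv this is just a compiler
flag; roughly (recent GCC assumed, output trimmed):

    $ cat div.c
    unsigned int u32div(unsigned int a, unsigned int b) { return a / b; }
    $ arm-linux-gnueabihf-gcc -O2 -march=armv7ve -S -o - div.c
    ...
    u32div:
            udiv    r0, r0, r1
            bx      lr
    ...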
Russell King - ARM Linux Nov. 26, 2015, 1:28 a.m. UTC | #7
On Thu, Nov 26, 2015 at 12:50:08AM +0000, Måns Rullgård wrote:
> If not calling the function saves an I-cache miss, the benefit can be
> substantial.  No, I have no proof of this being a problem, but it's
> something that could happen.

That's a simplistic view of modern CPUs.

As I've already said, modern CPUs not only have branch prediction,
they also have speculative instruction fetching and speculative data
prefetching - which the CPUs that have idiv support will have.

With such features, the branch predictor is able to learn that the
branch will be taken, and because of the speculative instruction
fetching, it can bring the cache line in so that it has the
instructions it needs with minimal stalling or, if working correctly,
without stalling the CPU pipeline at all.
Måns Rullgård Nov. 26, 2015, 2:19 a.m. UTC | #8
Russell King - ARM Linux <linux@arm.linux.org.uk> writes:

> On Thu, Nov 26, 2015 at 12:50:08AM +0000, Måns Rullgård wrote:
>> If not calling the function saves an I-cache miss, the benefit can be
>> substantial.  No, I have no proof of this being a problem, but it's
>> something that could happen.
>
> That's a simplistic view of modern CPUs.
>
> As I've already said, modern CPUs which have branch prediction, but
> they also have speculative instruction fetching and speculative data
> prefetching - which the CPUs which have idiv support will have.
>
> With such features, the branch predictor is able to learn that the
> branch will be taken, and because of the speculative instruction
> fetching, it can bring the cache line in so that it has the
> instructions it needs with minimal or, if working correctly,
> without stalling the CPU pipeline.

It doesn't matter how many fancy features the CPU has.  Executing more
branches and using more cache lines puts additional pressure on those
resources, reducing overall performance.  Besides, the performance
counters readily show that the prediction is nowhere near as perfect as
you seem to believe.
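
For what it's worth, this is easy to measure on a real workload,
assuming the PMU exposes the generic branch events:

    $ perf stat -e branches,branch-misses -a -- sleep 10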
Nicolas Pitre Nov. 26, 2015, 5:32 a.m. UTC | #9
On Thu, 26 Nov 2015, Måns Rullgård wrote:

> Russell King - ARM Linux <linux@arm.linux.org.uk> writes:
> 
> > On Thu, Nov 26, 2015 at 12:50:08AM +0000, Måns Rullgård wrote:
> >> If not calling the function saves an I-cache miss, the benefit can be
> >> substantial.  No, I have no proof of this being a problem, but it's
> >> something that could happen.
> >
> > That's a simplistic view of modern CPUs.
> >
> > As I've already said, modern CPUs which have branch prediction, but
> > they also have speculative instruction fetching and speculative data
> > prefetching - which the CPUs which have idiv support will have.
> >
> > With such features, the branch predictor is able to learn that the
> > branch will be taken, and because of the speculative instruction
> > fetching, it can bring the cache line in so that it has the
> > instructions it needs with minimal or, if working correctly,
> > without stalling the CPU pipeline.
> 
> It doesn't matter how many fancy features the CPU has.  Executing more
> branches and using more cache lines puts additional pressure on those
> resources, reducing overall performance.  Besides, the performance
> counters readily show that the prediction is nothing near as perfect as
> you seem to believe.

OK... Let's try to come up with actual numbers.

We know that letting gcc emit idiv by itself is the ultimate solution. 
And it is free of maintenance on our side besides passing the 
appropriate argument to gcc of course. So this is worth doing.

For the case where you have a set of target machines in your kernel that 
may or may not have idiv, then the first step should be to patch 
__aeabi_uidiv and __aeabi_idiv.  This is a pretty small and simple 
change that might turn out to be more than good enough. It is necessary 
anyway as the full patching solution does not cover all cases.
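
Patching the entry points themselves only takes a couple of
instructions; a sketch for the ARM-mode unsigned case, reusing the
helpers from the patch (0xe12fff1e is "bx lr"):

    extern char __aeabi_uidiv[];
    u32 *fn = (u32 *)__aeabi_uidiv;

    fn[0] = udiv_instruction();                 /* udiv r0, r0, r1 */
    fn[1] = __opcode_to_mem_arm(0xe12fff1e);    /* bx lr */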

Then, IMHO, it would be a good idea to get performance numbers to 
compare that first step and the full patching solution. Of course the 
full patching will yield better performance. It has to. But if the 
difference is not significant enough, then it might not be worth 
introducing the implied complexity into mainline.  And it is not because 
the approach is bad. In fact I think this is a very cool hack. But it 
comes with a cost in maintenance and that cost has to be justified.

Just to have an idea, I produced the attached micro benchmark. I tested 
on a TC2 forced to a single Cortex-A15 core and I got those results:

Testing INLINE_DIV ...

real    0m7.182s
user    0m7.170s
sys     0m0.000s

Testing PATCHED_DIV ...

real    0m7.181s
user    0m7.170s
sys     0m0.000s

Testing OUTOFLINE_DIV ...

real    0m7.181s
user    0m7.170s
sys     0m0.005s

Testing LIBGCC_DIV ...

real    0m18.659s
user    0m18.635s
sys     0m0.000s

As you can see, whether the div is inline or out-of-line, whether 
arguments are moved into r0-r1 or not, makes no difference at all on a 
Cortex-A15.

Now forcing it onto a Cortex-A7 core:

Testing INLINE_DIV ...

real    0m8.917s
user    0m8.895s
sys     0m0.005s

Testing PATCHED_DIV ...

real    0m11.666s
user    0m11.645s
sys     0m0.000s

Testing OUTOFLINE_DIV ...

real    0m13.065s
user    0m13.025s
sys     0m0.000s

Testing LIBGCC_DIV ...

real    0m51.815s
user    0m51.750s
sys     0m0.005s

So on a Cortex-A7 the various overheads become visible. How significant 
is it in practice with normal kernel usage? I don't know.


Nicolas

[attachment: build-and-run script]
#!/bin/sh
set -e
for test in INLINE_DIV PATCHED_DIV OUTOFLINE_DIV LIBGCC_DIV; do
  gcc -o divtest_$test divtest.S -D$test
  echo "Testing $test ..."
  time ./divtest_$test
  echo
  rm -f divtest_$test
done

[attachment: divtest.S]
.arm
	.arch_extension idiv

	.globl main
main:

	stmfd	sp!, {r4, r5, lr}

	mov	r4, #17
1:	mov	r5, #1

2:
#if defined(INLINE_DIV)

	udiv	r0, r4, r5

#elif defined(OUTOFLINE_DIV)

	mov	r0, r4
	mov	r1, r5
	bl	my_div

#elif defined(PATCHED_DIV)

	mov	r0, r4
	mov	r1, r5
	udiv	r0, r0, r1

#elif defined(LIBGCC_DIV)

	mov	r0, r4
	mov	r1, r5
	bl	__aeabi_uidiv

#else
#error "define INLINE_DIV, OUTOFLINE_DIV or LIBGCC_DIV"
#endif

	add	r5, r5, #1
	cmp	r4, r5
	bhs	2b
	adds	r4, r4, r4, lsl #1
	bpl	1b

	mov	r0, #0
	ldmfd	sp!, {r4, r5, pc}

	.space 1024

my_div:

	udiv	r0, r0, r1
	bx	lr
Måns Rullgård Nov. 26, 2015, 12:41 p.m. UTC | #10
Nicolas Pitre <nico@fluxnic.net> writes:

> On Thu, 26 Nov 2015, Måns Rullgård wrote:
>
>> Russell King - ARM Linux <linux@arm.linux.org.uk> writes:
>> 
>> > On Thu, Nov 26, 2015 at 12:50:08AM +0000, Måns Rullgård wrote:
>> >> If not calling the function saves an I-cache miss, the benefit can be
>> >> substantial.  No, I have no proof of this being a problem, but it's
>> >> something that could happen.
>> >
>> > That's a simplistic view of modern CPUs.
>> >
>> > As I've already said, modern CPUs which have branch prediction, but
>> > they also have speculative instruction fetching and speculative data
>> > prefetching - which the CPUs which have idiv support will have.
>> >
>> > With such features, the branch predictor is able to learn that the
>> > branch will be taken, and because of the speculative instruction
>> > fetching, it can bring the cache line in so that it has the
>> > instructions it needs with minimal or, if working correctly,
>> > without stalling the CPU pipeline.
>> 
>> It doesn't matter how many fancy features the CPU has.  Executing more
>> branches and using more cache lines puts additional pressure on those
>> resources, reducing overall performance.  Besides, the performance
>> counters readily show that the prediction is nothing near as perfect as
>> you seem to believe.
>
> OK... Let's try to come up with actual numbers.
>
> We know that letting gcc emit idiv by itself is the ultimate solution. 
> And it is free of maintenance on our side besides passing the 
> appropriate argument to gcc of course. So this is worth doing.
>
> For the case where you have a set of target machines in your kernel that 
> may or may not have idiv, then the first step should be to patch 
> __aeabi_uidiv and __aeabi_idiv.  This is a pretty small and simple 
> change that might turn out to be more than good enough. It is necessary 
> anyway as the full patching solution does not cover all cases.
>
> Then, IMHO, it would be a good idea to get performance numbers to 
> compare that first step and the full patching solution. Of course the 
> full patching will yield better performance. It has to. But if the 
> difference is not significant enough, then it might not be worth 
> introducing the implied complexity into mainline.  And it is not because 
> the approach is bad. In fact I think this is a very cool hack. But it 
> comes with a cost in maintenance and that cost has to be justified.
>
> Just to have an idea, I produced the attached micro benchmark. I tested 
> on a TC2 forced to a single Cortex-A15 core and I got those results:
>
> Testing INLINE_DIV ...
>
> real    0m7.182s
> user    0m7.170s
> sys     0m0.000s
>
> Testing PATCHED_DIV ...
>
> real    0m7.181s
> user    0m7.170s
> sys     0m0.000s
>
> Testing OUTOFLINE_DIV ...
>
> real    0m7.181s
> user    0m7.170s
> sys     0m0.005s
>
> Testing LIBGCC_DIV ...
>
> real    0m18.659s
> user    0m18.635s
> sys     0m0.000s
>
> As you can see, whether the div is inline or out-of-line, whether 
> arguments are moved into r0-r1 or not, makes no difference at all on a 
> Cortex-A15.
>
> Now forcing it onto a Cortex-A7 core:
>
> Testing INLINE_DIV ...
>
> real    0m8.917s
> user    0m8.895s
> sys     0m0.005s
>
> Testing PATCHED_DIV ...
>
> real    0m11.666s
> user    0m11.645s
> sys     0m0.000s
>
> Testing OUTOFLINE_DIV ...
>
> real    0m13.065s
> user    0m13.025s
> sys     0m0.000s
>
> Testing LIBGCC_DIV ...
>
> real    0m51.815s
> user    0m51.750s
> sys     0m0.005s
>
> So on A cortex-A7 the various overheads become visible. How significant 
> is it in practice with normal kernel usage? I don't know.

Bear in mind that in a trivial test like this, everything fits in L1
caches and branch prediction works perfectly.  It would be more
informative to measure the effect on a load that already has some cache
and branch prediction misses.

Patch

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 0365cbbc9179..aa8bc7da6331 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -1617,6 +1617,20 @@  config AEABI
 
 	  To use this you need GCC version 4.0.0 or later.
 
+config ARM_PATCH_UIDIV
+	bool "Runtime patch calls to __aeabi_{u}idiv() with udiv/sdiv"
+	depends on CPU_32v7 && !XIP_KERNEL && AEABI
+	help
+	  Some v7 CPUs have support for the udiv and sdiv instructions
+	  that can be used in place of calls to __aeabi_uidiv and __aeabi_idiv
+	  functions provided by the ARM runtime ABI.
+
+	  Enabling this option allows the kernel to modify itself to replace
+	  branches to these library functions with the udiv and sdiv
+	  instructions themselves. Typically this will be faster and less
+	  power intensive than running the library support code to do
+	  integer division.
+
 config OABI_COMPAT
 	bool "Allow old ABI binaries to run with this kernel (EXPERIMENTAL)"
 	depends on AEABI && !THUMB2_KERNEL
diff --git a/arch/arm/include/asm/cputype.h b/arch/arm/include/asm/cputype.h
index 85e374f873ac..48c77d422a0d 100644
--- a/arch/arm/include/asm/cputype.h
+++ b/arch/arm/include/asm/cputype.h
@@ -250,8 +250,14 @@  static inline int cpu_is_pj4(void)
 
 	return 0;
 }
+
+static inline bool __attribute_const__ cpu_is_pj4_nomp(void)
+{
+	return read_cpuid_part() == 0x56005810;
+}
 #else
-#define cpu_is_pj4()	0
+#define cpu_is_pj4()		0
+#define cpu_is_pj4_nomp()	0
 #endif
 
 static inline int __attribute_const__ cpuid_feature_extract_field(u32 features,
diff --git a/arch/arm/include/asm/setup.h b/arch/arm/include/asm/setup.h
index e0adb9f1bf94..a514552d5cbd 100644
--- a/arch/arm/include/asm/setup.h
+++ b/arch/arm/include/asm/setup.h
@@ -25,4 +25,7 @@  extern int arm_add_memory(u64 start, u64 size);
 extern void early_print(const char *str, ...);
 extern void dump_machine_table(void);
 
+extern void patch_udiv(void *addr, size_t size);
+extern void patch_sdiv(void *addr, size_t size);
+
 #endif
diff --git a/arch/arm/kernel/module.c b/arch/arm/kernel/module.c
index efdddcb97dd1..aa59e5cfe6a0 100644
--- a/arch/arm/kernel/module.c
+++ b/arch/arm/kernel/module.c
@@ -20,7 +20,9 @@ 
 #include <linux/string.h>
 #include <linux/gfp.h>
 
+#include <asm/hwcap.h>
 #include <asm/pgtable.h>
+#include <asm/setup.h>
 #include <asm/sections.h>
 #include <asm/smp_plat.h>
 #include <asm/unwind.h>
@@ -51,6 +53,38 @@  void *module_alloc(unsigned long size)
 }
 #endif
 
+#ifdef CONFIG_ARM_PATCH_UIDIV
+static int module_patch_aeabi_uidiv(unsigned long loc, const Elf32_Sym *sym)
+{
+	extern char __aeabi_uidiv[], __aeabi_idiv[];
+	unsigned long udiv_addr = (unsigned long)__aeabi_uidiv;
+	unsigned long sdiv_addr = (unsigned long)__aeabi_idiv;
+	unsigned int mask;
+
+	if (IS_ENABLED(CONFIG_THUMB2_KERNEL))
+		mask = HWCAP_IDIVT;
+	else
+		mask = HWCAP_IDIVA;
+
+	if (elf_hwcap & mask) {
+		if (sym->st_value == udiv_addr) {
+			patch_udiv(&loc, sizeof(loc));
+			return 1;
+		} else if (sym->st_value == sdiv_addr) {
+			patch_sdiv(&loc, sizeof(loc));
+			return 1;
+		}
+	}
+
+	return 0;
+}
+#else
+static int module_patch_aeabi_uidiv(unsigned long loc, const Elf32_Sym *sym)
+{
+	return 0;
+}
+#endif
+
 int
 apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex,
 	       unsigned int relindex, struct module *module)
@@ -109,6 +143,9 @@  apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex,
 				return -ENOEXEC;
 			}
 
+			if (module_patch_aeabi_uidiv(loc, sym))
+				break;
+
 			offset = __mem_to_opcode_arm(*(u32 *)loc);
 			offset = (offset & 0x00ffffff) << 2;
 			if (offset & 0x02000000)
@@ -195,6 +232,9 @@  apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex,
 				return -ENOEXEC;
 			}
 
+			if (module_patch_aeabi_uidiv(loc, sym))
+				break;
+
 			upper = __mem_to_opcode_thumb16(*(u16 *)loc);
 			lower = __mem_to_opcode_thumb16(*(u16 *)(loc + 2));
 
diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c
index 20edd349d379..39a46059d5e8 100644
--- a/arch/arm/kernel/setup.c
+++ b/arch/arm/kernel/setup.c
@@ -32,6 +32,7 @@ 
 #include <linux/compiler.h>
 #include <linux/sort.h>
 #include <linux/psci.h>
+#include <linux/module.h>
 
 #include <asm/unified.h>
 #include <asm/cp15.h>
@@ -375,6 +376,72 @@  void __init early_print(const char *str, ...)
 	printk("%s", buf);
 }
 
+#ifdef CONFIG_ARM_PATCH_UIDIV
+/* "sdiv r0, r0, r1" or "mrc p6, 1, r0, CR0, CR1, 4" if we're on pj4 w/o MP */
+static u32 __attribute_const__ sdiv_instruction(void)
+{
+	if (IS_ENABLED(CONFIG_THUMB2_KERNEL)) {
+		if (cpu_is_pj4_nomp())
+			return __opcode_to_mem_thumb32(0xee300691);
+		return __opcode_to_mem_thumb32(0xfb90f0f1);
+	}
+
+	if (cpu_is_pj4_nomp())
+		return __opcode_to_mem_arm(0xee300691);
+	return __opcode_to_mem_arm(0xe710f110);
+}
+
+/* "udiv r0, r0, r1" or "mrc p6, 1, r0, CR0, CR1, 0" if we're on pj4 w/o MP */
+static u32 __attribute_const__ udiv_instruction(void)
+{
+	if (IS_ENABLED(CONFIG_THUMB2_KERNEL)) {
+		if (cpu_is_pj4_nomp())
+			return __opcode_to_mem_thumb32(0xee300611);
+		return __opcode_to_mem_thumb32(0xfbb0f0f1);
+	}
+
+	if (cpu_is_pj4_nomp())
+		return __opcode_to_mem_arm(0xee300611);
+	return __opcode_to_mem_arm(0xe730f110);
+}
+
+static void __init_or_module patch(u32 **addr, size_t count, u32 insn)
+{
+	for (; count != 0; count -= 4)
+		**addr++ = insn;
+}
+
+void __init_or_module patch_udiv(void *addr, size_t size)
+{
+	patch(addr, size, udiv_instruction());
+}
+
+void __init_or_module patch_sdiv(void *addr, size_t size)
+{
+	return patch(addr, size, sdiv_instruction());
+}
+
+static void __init patch_aeabi_uidiv(void)
+{
+	extern char __start_udiv_loc[], __stop_udiv_loc[];
+	extern char __start_idiv_loc[], __stop_idiv_loc[];
+	unsigned int mask;
+
+	if (IS_ENABLED(CONFIG_THUMB2_KERNEL))
+		mask = HWCAP_IDIVT;
+	else
+		mask = HWCAP_IDIVA;
+
+	if (!(elf_hwcap & mask))
+		return;
+
+	patch_udiv(__start_udiv_loc, __stop_udiv_loc - __start_udiv_loc);
+	patch_sdiv(__start_idiv_loc, __stop_idiv_loc - __start_idiv_loc);
+}
+#else
+static void __init patch_aeabi_uidiv(void) { }
+#endif
+
 static void __init cpuid_init_hwcaps(void)
 {
 	int block;
@@ -642,6 +709,7 @@  static void __init setup_processor(void)
 	elf_hwcap = list->elf_hwcap;
 
 	cpuid_init_hwcaps();
+	patch_aeabi_uidiv();
 
 #ifndef CONFIG_ARM_THUMB
 	elf_hwcap &= ~(HWCAP_THUMB | HWCAP_IDIVT);
diff --git a/arch/arm/kernel/vmlinux.lds.S b/arch/arm/kernel/vmlinux.lds.S
index 8b60fde5ce48..bc87a2e04e6f 100644
--- a/arch/arm/kernel/vmlinux.lds.S
+++ b/arch/arm/kernel/vmlinux.lds.S
@@ -28,6 +28,18 @@ 
 	*(.hyp.idmap.text)						\
 	VMLINUX_SYMBOL(__hyp_idmap_text_end) = .;
 
+#ifdef CONFIG_ARM_PATCH_UIDIV
+#define UIDIV_REC	. = ALIGN(8);					\
+			VMLINUX_SYMBOL(__start_udiv_loc) = .;		\
+			*(__udiv_loc)					\
+			VMLINUX_SYMBOL(__stop_udiv_loc) = .;		\
+			VMLINUX_SYMBOL(__start_idiv_loc) = .;		\
+			*(__idiv_loc)					\
+			VMLINUX_SYMBOL(__stop_idiv_loc) = .;
+#else
+#define UIDIV_REC
+#endif
+
 #ifdef CONFIG_HOTPLUG_CPU
 #define ARM_CPU_DISCARD(x)
 #define ARM_CPU_KEEP(x)		x
@@ -210,6 +222,7 @@  SECTIONS
 	.init.data : {
 #ifndef CONFIG_XIP_KERNEL
 		INIT_DATA
+		UIDIV_REC
 #endif
 		INIT_SETUP(16)
 		INIT_CALLS