diff mbox series

[v2] ARM: Optimise copy_{from/to}_user for !CPU_USE_DOMAINS

Message ID 20181012131406.18601-1-vincent.whitchurch@axis.com (mailing list archive)
State New, archived
Headers show
Series [v2] ARM: Optimise copy_{from/to}_user for !CPU_USE_DOMAINS | expand

Commit Message

Vincent Whitchurch Oct. 12, 2018, 1:14 p.m. UTC
ARMv6+ processors do not use CONFIG_CPU_USE_DOMAINS and use privileged
ldr/str instructions in copy_{from/to}_user.  They are currently
unnecessarily using single ldr/str instructions and can use ldm/stm
instructions instead like memcpy does (but with appropriate fixup
tables).

This speeds up a "dd if=foo of=bar bs=32k" on a tmpfs filesystem by
about 4% on my Cortex-A9.

before:134217728 bytes (128.0MB) copied, 0.543848 seconds, 235.4MB/s
before:134217728 bytes (128.0MB) copied, 0.538610 seconds, 237.6MB/s
before:134217728 bytes (128.0MB) copied, 0.544356 seconds, 235.1MB/s
before:134217728 bytes (128.0MB) copied, 0.544364 seconds, 235.1MB/s
before:134217728 bytes (128.0MB) copied, 0.537130 seconds, 238.3MB/s
before:134217728 bytes (128.0MB) copied, 0.533443 seconds, 240.0MB/s
before:134217728 bytes (128.0MB) copied, 0.545691 seconds, 234.6MB/s
before:134217728 bytes (128.0MB) copied, 0.534695 seconds, 239.4MB/s
before:134217728 bytes (128.0MB) copied, 0.540561 seconds, 236.8MB/s
before:134217728 bytes (128.0MB) copied, 0.541025 seconds, 236.6MB/s

 after:134217728 bytes (128.0MB) copied, 0.520445 seconds, 245.9MB/s
 after:134217728 bytes (128.0MB) copied, 0.527846 seconds, 242.5MB/s
 after:134217728 bytes (128.0MB) copied, 0.519510 seconds, 246.4MB/s
 after:134217728 bytes (128.0MB) copied, 0.527231 seconds, 242.8MB/s
 after:134217728 bytes (128.0MB) copied, 0.525030 seconds, 243.8MB/s
 after:134217728 bytes (128.0MB) copied, 0.524236 seconds, 244.2MB/s
 after:134217728 bytes (128.0MB) copied, 0.523659 seconds, 244.4MB/s
 after:134217728 bytes (128.0MB) copied, 0.525018 seconds, 243.8MB/s
 after:134217728 bytes (128.0MB) copied, 0.519249 seconds, 246.5MB/s
 after:134217728 bytes (128.0MB) copied, 0.518527 seconds, 246.9MB/s

Signed-off-by: Vincent Whitchurch <vincent.whitchurch@axis.com>
---
v2: Group *_SHIFT #defines with respective .macro implementations

 arch/arm/include/asm/assembler.h |  6 ++++--
 arch/arm/lib/copy_from_user.S    | 27 ++++++++++++++++++++++++++-
 arch/arm/lib/copy_to_user.S      | 31 ++++++++++++++++++++++++++-----
 3 files changed, 56 insertions(+), 8 deletions(-)

Comments

Nicolas Pitre Oct. 12, 2018, 2:55 p.m. UTC | #1
On Fri, 12 Oct 2018, Vincent Whitchurch wrote:

> ARMv6+ processors do not use CONFIG_CPU_USE_DOMAINS and use privileged
> ldr/str instructions in copy_{from/to}_user.  They are currently
> unnecessarily using single ldr/str instructions and can use ldm/stm
> instructions instead like memcpy does (but with appropriate fixup
> tables).
> 
> This speeds up a "dd if=foo of=bar bs=32k" on a tmpfs filesystem by
> about 4% on my Cortex-A9.
> 
> before:134217728 bytes (128.0MB) copied, 0.543848 seconds, 235.4MB/s
> before:134217728 bytes (128.0MB) copied, 0.538610 seconds, 237.6MB/s
> before:134217728 bytes (128.0MB) copied, 0.544356 seconds, 235.1MB/s
> before:134217728 bytes (128.0MB) copied, 0.544364 seconds, 235.1MB/s
> before:134217728 bytes (128.0MB) copied, 0.537130 seconds, 238.3MB/s
> before:134217728 bytes (128.0MB) copied, 0.533443 seconds, 240.0MB/s
> before:134217728 bytes (128.0MB) copied, 0.545691 seconds, 234.6MB/s
> before:134217728 bytes (128.0MB) copied, 0.534695 seconds, 239.4MB/s
> before:134217728 bytes (128.0MB) copied, 0.540561 seconds, 236.8MB/s
> before:134217728 bytes (128.0MB) copied, 0.541025 seconds, 236.6MB/s
> 
>  after:134217728 bytes (128.0MB) copied, 0.520445 seconds, 245.9MB/s
>  after:134217728 bytes (128.0MB) copied, 0.527846 seconds, 242.5MB/s
>  after:134217728 bytes (128.0MB) copied, 0.519510 seconds, 246.4MB/s
>  after:134217728 bytes (128.0MB) copied, 0.527231 seconds, 242.8MB/s
>  after:134217728 bytes (128.0MB) copied, 0.525030 seconds, 243.8MB/s
>  after:134217728 bytes (128.0MB) copied, 0.524236 seconds, 244.2MB/s
>  after:134217728 bytes (128.0MB) copied, 0.523659 seconds, 244.4MB/s
>  after:134217728 bytes (128.0MB) copied, 0.525018 seconds, 243.8MB/s
>  after:134217728 bytes (128.0MB) copied, 0.519249 seconds, 246.5MB/s
>  after:134217728 bytes (128.0MB) copied, 0.518527 seconds, 246.9MB/s
> 
> Signed-off-by: Vincent Whitchurch <vincent.whitchurch@axis.com>
> ---
> v2: Group *_SHIFT #defines with respective .macro implementations

Reviewed-by: Nicolas Pitre <nico@linaro.org>

Please submit your patch here:
http://www.arm.linux.org.uk/developer/patches/

>  arch/arm/include/asm/assembler.h |  6 ++++--
>  arch/arm/lib/copy_from_user.S    | 27 ++++++++++++++++++++++++++-
>  arch/arm/lib/copy_to_user.S      | 31 ++++++++++++++++++++++++++-----
>  3 files changed, 56 insertions(+), 8 deletions(-)
> 
> diff --git a/arch/arm/include/asm/assembler.h b/arch/arm/include/asm/assembler.h
> index b17ee03d280b..da16d31c7ef9 100644
> --- a/arch/arm/include/asm/assembler.h
> +++ b/arch/arm/include/asm/assembler.h
> @@ -243,13 +243,15 @@
>  	.endm
>  #endif
>  
> -#define USER(x...)				\
> +#define USERL(l, x...)				\
>  9999:	x;					\
>  	.pushsection __ex_table,"a";		\
>  	.align	3;				\
> -	.long	9999b,9001f;			\
> +	.long	9999b,l;			\
>  	.popsection
>  
> +#define USER(x...)	USERL(9001f, x)
> +
>  #ifdef CONFIG_SMP
>  #define ALT_SMP(instr...)					\
>  9998:	instr
> diff --git a/arch/arm/lib/copy_from_user.S b/arch/arm/lib/copy_from_user.S
> index a826df3d3814..fca3a7037725 100644
> --- a/arch/arm/lib/copy_from_user.S
> +++ b/arch/arm/lib/copy_from_user.S
> @@ -34,12 +34,13 @@
>   *	Number of bytes NOT copied.
>   */
>  
> +#ifdef CONFIG_CPU_USE_DOMAINS
> +
>  #ifndef CONFIG_THUMB2_KERNEL
>  #define LDR1W_SHIFT	0
>  #else
>  #define LDR1W_SHIFT	1
>  #endif
> -#define STR1W_SHIFT	0
>  
>  	.macro ldr1w ptr reg abort
>  	ldrusr	\reg, \ptr, 4, abort=\abort
> @@ -61,6 +62,30 @@
>  	ldrusr	\reg, \ptr, 1, \cond, abort=\abort
>  	.endm
>  
> +#else
> +
> +#define LDR1W_SHIFT	0
> +
> +	.macro ldr1w ptr reg abort
> +	USERL(\abort, W(ldr) \reg, [\ptr], #4)
> +	.endm
> +
> +	.macro ldr4w ptr reg1 reg2 reg3 reg4 abort
> +	USERL(\abort, ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4})
> +	.endm
> +
> +	.macro ldr8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
> +	USERL(\abort, ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8})
> +	.endm
> +
> +	.macro ldr1b ptr reg cond=al abort
> +	USERL(\abort, ldr\cond\()b \reg, [\ptr], #1)
> +	.endm
> +
> +#endif /* CONFIG_CPU_USE_DOMAINS */
> +
> +#define STR1W_SHIFT	0
> +
>  	.macro str1w ptr reg abort
>  	W(str) \reg, [\ptr], #4
>  	.endm
> diff --git a/arch/arm/lib/copy_to_user.S b/arch/arm/lib/copy_to_user.S
> index caf5019d8161..d6fe31fb9bc4 100644
> --- a/arch/arm/lib/copy_to_user.S
> +++ b/arch/arm/lib/copy_to_user.S
> @@ -35,11 +35,6 @@
>   */
>  
>  #define LDR1W_SHIFT	0
> -#ifndef CONFIG_THUMB2_KERNEL
> -#define STR1W_SHIFT	0
> -#else
> -#define STR1W_SHIFT	1
> -#endif
>  
>  	.macro ldr1w ptr reg abort
>  	W(ldr) \reg, [\ptr], #4
> @@ -57,6 +52,14 @@
>  	ldr\cond\()b \reg, [\ptr], #1
>  	.endm
>  
> +#ifdef CONFIG_CPU_USE_DOMAINS
> +
> +#ifndef CONFIG_THUMB2_KERNEL
> +#define STR1W_SHIFT	0
> +#else
> +#define STR1W_SHIFT	1
> +#endif
> +
>  	.macro str1w ptr reg abort
>  	strusr	\reg, \ptr, 4, abort=\abort
>  	.endm
> @@ -76,6 +79,24 @@
>  	strusr	\reg, \ptr, 1, \cond, abort=\abort
>  	.endm
>  
> +#else
> +
> +#define STR1W_SHIFT	0
> +
> +	.macro str1w ptr reg abort
> +	USERL(\abort, W(str) \reg, [\ptr], #4)
> +	.endm
> +
> +	.macro str8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
> +	USERL(\abort, stmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8})
> +	.endm
> +
> +	.macro str1b ptr reg cond=al abort
> +	USERL(\abort, str\cond\()b \reg, [\ptr], #1)
> +	.endm
> +
> +#endif /* CONFIG_CPU_USE_DOMAINS */
> +
>  	.macro enter reg1 reg2
>  	mov	r3, #0
>  	stmdb	sp!, {r0, r2, r3, \reg1, \reg2}
> -- 
> 2.11.0
> 
>
Vincent Whitchurch Oct. 19, 2018, 9 a.m. UTC | #2
On Fri, Oct 12, 2018 at 03:14:06PM +0200, Vincent Whitchurch wrote:
> +	.macro ldr1b ptr reg cond=al abort
> +	USERL(\abort, ldr\cond\()b \reg, [\ptr], #1)
> +	.endm

This is broken on Thumb-2 since the fault handler is installed on the
implicitly-inserted IT instead of the LDR.  I will send a v3.
diff mbox series

Patch

diff --git a/arch/arm/include/asm/assembler.h b/arch/arm/include/asm/assembler.h
index b17ee03d280b..da16d31c7ef9 100644
--- a/arch/arm/include/asm/assembler.h
+++ b/arch/arm/include/asm/assembler.h
@@ -243,13 +243,15 @@ 
 	.endm
 #endif
 
-#define USER(x...)				\
+#define USERL(l, x...)				\
 9999:	x;					\
 	.pushsection __ex_table,"a";		\
 	.align	3;				\
-	.long	9999b,9001f;			\
+	.long	9999b,l;			\
 	.popsection
 
+#define USER(x...)	USERL(9001f, x)
+
 #ifdef CONFIG_SMP
 #define ALT_SMP(instr...)					\
 9998:	instr
diff --git a/arch/arm/lib/copy_from_user.S b/arch/arm/lib/copy_from_user.S
index a826df3d3814..fca3a7037725 100644
--- a/arch/arm/lib/copy_from_user.S
+++ b/arch/arm/lib/copy_from_user.S
@@ -34,12 +34,13 @@ 
  *	Number of bytes NOT copied.
  */
 
+#ifdef CONFIG_CPU_USE_DOMAINS
+
 #ifndef CONFIG_THUMB2_KERNEL
 #define LDR1W_SHIFT	0
 #else
 #define LDR1W_SHIFT	1
 #endif
-#define STR1W_SHIFT	0
 
 	.macro ldr1w ptr reg abort
 	ldrusr	\reg, \ptr, 4, abort=\abort
@@ -61,6 +62,30 @@ 
 	ldrusr	\reg, \ptr, 1, \cond, abort=\abort
 	.endm
 
+#else
+
+#define LDR1W_SHIFT	0
+
+	.macro ldr1w ptr reg abort
+	USERL(\abort, W(ldr) \reg, [\ptr], #4)
+	.endm
+
+	.macro ldr4w ptr reg1 reg2 reg3 reg4 abort
+	USERL(\abort, ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4})
+	.endm
+
+	.macro ldr8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
+	USERL(\abort, ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8})
+	.endm
+
+	.macro ldr1b ptr reg cond=al abort
+	USERL(\abort, ldr\cond\()b \reg, [\ptr], #1)
+	.endm
+
+#endif /* CONFIG_CPU_USE_DOMAINS */
+
+#define STR1W_SHIFT	0
+
 	.macro str1w ptr reg abort
 	W(str) \reg, [\ptr], #4
 	.endm
diff --git a/arch/arm/lib/copy_to_user.S b/arch/arm/lib/copy_to_user.S
index caf5019d8161..d6fe31fb9bc4 100644
--- a/arch/arm/lib/copy_to_user.S
+++ b/arch/arm/lib/copy_to_user.S
@@ -35,11 +35,6 @@ 
  */
 
 #define LDR1W_SHIFT	0
-#ifndef CONFIG_THUMB2_KERNEL
-#define STR1W_SHIFT	0
-#else
-#define STR1W_SHIFT	1
-#endif
 
 	.macro ldr1w ptr reg abort
 	W(ldr) \reg, [\ptr], #4
@@ -57,6 +52,14 @@ 
 	ldr\cond\()b \reg, [\ptr], #1
 	.endm
 
+#ifdef CONFIG_CPU_USE_DOMAINS
+
+#ifndef CONFIG_THUMB2_KERNEL
+#define STR1W_SHIFT	0
+#else
+#define STR1W_SHIFT	1
+#endif
+
 	.macro str1w ptr reg abort
 	strusr	\reg, \ptr, 4, abort=\abort
 	.endm
@@ -76,6 +79,24 @@ 
 	strusr	\reg, \ptr, 1, \cond, abort=\abort
 	.endm
 
+#else
+
+#define STR1W_SHIFT	0
+
+	.macro str1w ptr reg abort
+	USERL(\abort, W(str) \reg, [\ptr], #4)
+	.endm
+
+	.macro str8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
+	USERL(\abort, stmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8})
+	.endm
+
+	.macro str1b ptr reg cond=al abort
+	USERL(\abort, str\cond\()b \reg, [\ptr], #1)
+	.endm
+
+#endif /* CONFIG_CPU_USE_DOMAINS */
+
 	.macro enter reg1 reg2
 	mov	r3, #0
 	stmdb	sp!, {r0, r2, r3, \reg1, \reg2}