
[v3] ARM: Optimise copy_{from/to}_user for !CPU_USE_DOMAINS

Message ID 20181022073426.24856-1-vincent.whitchurch@axis.com (mailing list archive)
State New, archived

Commit Message

Vincent Whitchurch Oct. 22, 2018, 7:34 a.m. UTC
ARMv6+ processors do not use CONFIG_CPU_USE_DOMAINS, so
copy_{from/to}_user access user memory with ordinary privileged
ldr/str instructions.  These routines currently use single ldr/str
instructions unnecessarily; they can use ldm/stm instructions
instead, as memcpy does (but with appropriate fixup tables).
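
The fixup works the same way as for the single accesses: the address
of the ldm/stm is recorded in the exception table so that a fault
branches to the abort handler.  A minimal sketch of the idea
(registers and the 9001 fixup label are purely illustrative):

	@ Illustrative only: one faultable ldmia replaces four single
	@ ldr instructions, at the cost of one exception-table entry.
9999:	ldmia	r1!, {r3, r4, r5, r6}	@ may fault on user access
	.pushsection __ex_table,"a"
	.align	3
	.long	9999b, 9001f		@ fault at 9999b -> fixup at 9001
	.popsection
	...
9001:	@ fixup: compute the number of bytes not copied and return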

This speeds up a "dd if=foo of=bar bs=32k" on a tmpfs filesystem by
about 4% on my Cortex-A9.

before:134217728 bytes (128.0MB) copied, 0.543848 seconds, 235.4MB/s
before:134217728 bytes (128.0MB) copied, 0.538610 seconds, 237.6MB/s
before:134217728 bytes (128.0MB) copied, 0.544356 seconds, 235.1MB/s
before:134217728 bytes (128.0MB) copied, 0.544364 seconds, 235.1MB/s
before:134217728 bytes (128.0MB) copied, 0.537130 seconds, 238.3MB/s
before:134217728 bytes (128.0MB) copied, 0.533443 seconds, 240.0MB/s
before:134217728 bytes (128.0MB) copied, 0.545691 seconds, 234.6MB/s
before:134217728 bytes (128.0MB) copied, 0.534695 seconds, 239.4MB/s
before:134217728 bytes (128.0MB) copied, 0.540561 seconds, 236.8MB/s
before:134217728 bytes (128.0MB) copied, 0.541025 seconds, 236.6MB/s

 after:134217728 bytes (128.0MB) copied, 0.520445 seconds, 245.9MB/s
 after:134217728 bytes (128.0MB) copied, 0.527846 seconds, 242.5MB/s
 after:134217728 bytes (128.0MB) copied, 0.519510 seconds, 246.4MB/s
 after:134217728 bytes (128.0MB) copied, 0.527231 seconds, 242.8MB/s
 after:134217728 bytes (128.0MB) copied, 0.525030 seconds, 243.8MB/s
 after:134217728 bytes (128.0MB) copied, 0.524236 seconds, 244.2MB/s
 after:134217728 bytes (128.0MB) copied, 0.523659 seconds, 244.4MB/s
 after:134217728 bytes (128.0MB) copied, 0.525018 seconds, 243.8MB/s
 after:134217728 bytes (128.0MB) copied, 0.519249 seconds, 246.5MB/s
 after:134217728 bytes (128.0MB) copied, 0.518527 seconds, 246.9MB/s

Signed-off-by: Vincent Whitchurch <vincent.whitchurch@axis.com>
---
v3: Explicitly add IT instructions to fix fault handling on Thumb-2
v2: Group *_SHIFT #defines with respective .macro implementations
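
In Thumb-2 a conditional instruction must be covered by an IT block;
if the assembler inserts one implicitly inside the USERL() expansion,
the exception-table label can end up pointing at the IT rather than
at the faulting load.  Emitting the IT explicitly before USERL()
keeps the recorded address on the access itself.  A sketch of the v3
pattern (ldr1b_sketch is a hypothetical name; the real macros are
ldr1b/str1b in the patch below):

	@ Hypothetical macro name; mirrors the ldr1b added by the patch.
	.macro ldr1b_sketch ptr reg cond=al abort
	.ifnc	\cond, al
	it	\cond			@ predicate the next instruction
	.endif
	USERL(\abort, ldr\cond\()b \reg, [\ptr], #1)
	.endm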

 arch/arm/include/asm/assembler.h |  6 ++++--
 arch/arm/lib/copy_from_user.S    | 30 +++++++++++++++++++++++++++++-
 arch/arm/lib/copy_to_user.S      | 34 +++++++++++++++++++++++++++++-----
 3 files changed, 62 insertions(+), 8 deletions(-)

Comments

Nicolas Pitre Oct. 22, 2018, 3:19 p.m. UTC | #1
On Mon, 22 Oct 2018, Vincent Whitchurch wrote:

> [...]
> v3: Explicitly add IT instructions to fix fault handling on Thumb-2

Sorry if this is starting to look like bikeshedding, but...

> +	.macro ldr1b ptr reg cond=al abort
> +	.ifnc	\cond, al
> +	it	\cond
> +	.endif
> +	USERL(\abort, ldr\cond\()b \reg, [\ptr], #1)
> +	.endm

You could simply factor the ldr1b macro definition out of the
CONFIG_CPU_USE_DOMAINS conditional to reuse the existing one, which
makes use of ldrusr, where this IT issue is already covered for you.
The ldr1b and str1b don't actually need to be different.


Nicolas
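
Concretely, the suggestion amounts to a single shared definition
outside the #ifdef, reusing ldrusr (a sketch only, mirroring the
existing CONFIG_CPU_USE_DOMAINS version shown in the patch):

	@ Sketch of the suggestion, not part of the posted patch: one
	@ ldr1b for both configurations.  ldrusr already emits the IT
	@ instruction when a condition is passed.
	.macro ldr1b ptr reg cond=al abort
	ldrusr	\reg, \ptr, 1, \cond, abort=\abort
	.endm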

Patch

diff --git a/arch/arm/include/asm/assembler.h b/arch/arm/include/asm/assembler.h
index b17ee03d280b..da16d31c7ef9 100644
--- a/arch/arm/include/asm/assembler.h
+++ b/arch/arm/include/asm/assembler.h
@@ -243,13 +243,15 @@ 
 	.endm
 #endif
 
-#define USER(x...)				\
+#define USERL(l, x...)				\
 9999:	x;					\
 	.pushsection __ex_table,"a";		\
 	.align	3;				\
-	.long	9999b,9001f;			\
+	.long	9999b,l;			\
 	.popsection
 
+#define USER(x...)	USERL(9001f, x)
+
 #ifdef CONFIG_SMP
 #define ALT_SMP(instr...)					\
 9998:	instr
diff --git a/arch/arm/lib/copy_from_user.S b/arch/arm/lib/copy_from_user.S
index a826df3d3814..216f5c5956a1 100644
--- a/arch/arm/lib/copy_from_user.S
+++ b/arch/arm/lib/copy_from_user.S
@@ -34,12 +34,13 @@ 
  *	Number of bytes NOT copied.
  */
 
+#ifdef CONFIG_CPU_USE_DOMAINS
+
 #ifndef CONFIG_THUMB2_KERNEL
 #define LDR1W_SHIFT	0
 #else
 #define LDR1W_SHIFT	1
 #endif
-#define STR1W_SHIFT	0
 
 	.macro ldr1w ptr reg abort
 	ldrusr	\reg, \ptr, 4, abort=\abort
@@ -61,6 +62,33 @@ 
 	ldrusr	\reg, \ptr, 1, \cond, abort=\abort
 	.endm
 
+#else
+
+#define LDR1W_SHIFT	0
+
+	.macro ldr1w ptr reg abort
+	USERL(\abort, W(ldr) \reg, [\ptr], #4)
+	.endm
+
+	.macro ldr4w ptr reg1 reg2 reg3 reg4 abort
+	USERL(\abort, ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4})
+	.endm
+
+	.macro ldr8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
+	USERL(\abort, ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8})
+	.endm
+
+	.macro ldr1b ptr reg cond=al abort
+	.ifnc	\cond, al
+	it	\cond
+	.endif
+	USERL(\abort, ldr\cond\()b \reg, [\ptr], #1)
+	.endm
+
+#endif /* CONFIG_CPU_USE_DOMAINS */
+
+#define STR1W_SHIFT	0
+
 	.macro str1w ptr reg abort
 	W(str) \reg, [\ptr], #4
 	.endm
diff --git a/arch/arm/lib/copy_to_user.S b/arch/arm/lib/copy_to_user.S
index caf5019d8161..ce92f282feb4 100644
--- a/arch/arm/lib/copy_to_user.S
+++ b/arch/arm/lib/copy_to_user.S
@@ -35,11 +35,6 @@ 
  */
 
 #define LDR1W_SHIFT	0
-#ifndef CONFIG_THUMB2_KERNEL
-#define STR1W_SHIFT	0
-#else
-#define STR1W_SHIFT	1
-#endif
 
 	.macro ldr1w ptr reg abort
 	W(ldr) \reg, [\ptr], #4
@@ -57,6 +52,14 @@ 
 	ldr\cond\()b \reg, [\ptr], #1
 	.endm
 
+#ifdef CONFIG_CPU_USE_DOMAINS
+
+#ifndef CONFIG_THUMB2_KERNEL
+#define STR1W_SHIFT	0
+#else
+#define STR1W_SHIFT	1
+#endif
+
 	.macro str1w ptr reg abort
 	strusr	\reg, \ptr, 4, abort=\abort
 	.endm
@@ -76,6 +79,27 @@ 
 	strusr	\reg, \ptr, 1, \cond, abort=\abort
 	.endm
 
+#else
+
+#define STR1W_SHIFT	0
+
+	.macro str1w ptr reg abort
+	USERL(\abort, W(str) \reg, [\ptr], #4)
+	.endm
+
+	.macro str8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
+	USERL(\abort, stmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8})
+	.endm
+
+	.macro str1b ptr reg cond=al abort
+	.ifnc	\cond, al
+	it	\cond
+	.endif
+	USERL(\abort, str\cond\()b \reg, [\ptr], #1)
+	.endm
+
+#endif /* CONFIG_CPU_USE_DOMAINS */
+
 	.macro enter reg1 reg2
 	mov	r3, #0
 	stmdb	sp!, {r0, r2, r3, \reg1, \reg2}