Message ID | 20211109120336.3561463-3-ardb@kernel.org (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | arm64: use SHA3 instructions to speed up XOR | expand |
Hi Ard, I trust you on the algorithm but some minor issues below. On Tue, Nov 09, 2021 at 01:03:36PM +0100, Ard Biesheuvel wrote: > diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig > index 6f2d3e31fb54..14354acba5b4 100644 > --- a/arch/arm64/Kconfig > +++ b/arch/arm64/Kconfig > @@ -2034,6 +2034,9 @@ config SYSVIPC_COMPAT > def_bool y > depends on COMPAT && SYSVIPC > > +config CC_HAVE_SHA3 > + def_bool $(cc-option, -march=armv8.2-a+sha3) Is it the compiler or the assembler that we need to support this? I think it's sufficient to only check the latter. I'd also move it to the ARMv8.2 section. > + > menu "Power management options" > > source "kernel/power/Kconfig" > diff --git a/arch/arm64/lib/xor-neon.c b/arch/arm64/lib/xor-neon.c > index ee4795f3e166..0415cb94c781 100644 > --- a/arch/arm64/lib/xor-neon.c > +++ b/arch/arm64/lib/xor-neon.c > @@ -172,6 +172,135 @@ void xor_arm64_neon_5(unsigned long bytes, unsigned long *p1, > } > EXPORT_SYMBOL(xor_arm64_neon_5); > > +static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r) > +{ > + uint64x2_t res; > + > + asm(".arch armv8.2-a+sha3 \n" > + "eor3 %0.16b, %1.16b, %2.16b, %3.16b" > + : "=w"(res) : "w"(p), "w"(q), "w"(r)); > + return res; > +} The .arch here may confuse the compiler/assembler since it overrides any other .arch. I think this diff on top would do but I haven't extensively tested it. I can fold it in if you give it a try: diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 5adae54c98d8..c5104e8829e5 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1545,6 +1545,12 @@ endmenu menu "ARMv8.2 architectural features" +config AS_HAS_ARMV8_2 + def_bool $(cc-option,-Wa$(comma)-march=armv8.2-a) + +config AS_HAS_SHA3 + def_bool $(as-instr,.arch armv8.2-a+sha3) + config ARM64_PMEM bool "Enable support for persistent memory" select ARCH_HAS_PMEM_API @@ -2032,9 +2038,6 @@ config SYSVIPC_COMPAT def_bool y depends on COMPAT && SYSVIPC -config CC_HAVE_SHA3 - def_bool $(cc-option, -march=armv8.2-a+sha3) - menu "Power management options" source "kernel/power/Kconfig" diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile index e8cfc5868aa8..2f1de88651e6 100644 --- a/arch/arm64/Makefile +++ b/arch/arm64/Makefile @@ -58,6 +58,11 @@ stack_protector_prepare: prepare0 include/generated/asm-offsets.h)) endif +ifeq ($(CONFIG_AS_HAS_ARMV8_2), y) +# make sure to pass the newest target architecture to -march. +asm-arch := armv8.2-a +endif + # Ensure that if the compiler supports branch protection we default it # off, this will be overridden if we are using branch protection. branch-prot-flags-y += $(call cc-option,-mbranch-protection=none) diff --git a/arch/arm64/lib/xor-neon.c b/arch/arm64/lib/xor-neon.c index 0415cb94c781..2ca823825363 100644 --- a/arch/arm64/lib/xor-neon.c +++ b/arch/arm64/lib/xor-neon.c @@ -176,7 +176,7 @@ static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r) { uint64x2_t res; - asm(".arch armv8.2-a+sha3 \n" + asm(ARM64_ASM_PREAMBLE ".arch_extension sha3\n" "eor3 %0.16b, %1.16b, %2.16b, %3.16b" : "=w"(res) : "w"(p), "w"(q), "w"(r)); return res; @@ -311,7 +311,7 @@ EXPORT_STATIC_CALL(xor_arm64_5); static int __init xor_neon_init(void) { - if (IS_ENABLED(CONFIG_CC_HAVE_SHA3) && cpu_have_named_feature(SHA3)) { + if (IS_ENABLED(CONFIG_AS_HAS_SHA3) && cpu_have_named_feature(SHA3)) { static_call_update(xor_arm64_3, xor_arm64_eor3_3); static_call_update(xor_arm64_4, xor_arm64_eor3_4); static_call_update(xor_arm64_5, xor_arm64_eor3_5);
On Mon, 13 Dec 2021 at 14:25, Catalin Marinas <catalin.marinas@arm.com> wrote: > > Hi Ard, > > I trust you on the algorithm but some minor issues below. > > On Tue, Nov 09, 2021 at 01:03:36PM +0100, Ard Biesheuvel wrote: > > diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig > > index 6f2d3e31fb54..14354acba5b4 100644 > > --- a/arch/arm64/Kconfig > > +++ b/arch/arm64/Kconfig > > @@ -2034,6 +2034,9 @@ config SYSVIPC_COMPAT > > def_bool y > > depends on COMPAT && SYSVIPC > > > > +config CC_HAVE_SHA3 > > + def_bool $(cc-option, -march=armv8.2-a+sha3) > > Is it the compiler or the assembler that we need to support this? I > think it's sufficient to only check the latter. > > I'd also move it to the ARMv8.2 section. > > > + > > menu "Power management options" > > > > source "kernel/power/Kconfig" > > diff --git a/arch/arm64/lib/xor-neon.c b/arch/arm64/lib/xor-neon.c > > index ee4795f3e166..0415cb94c781 100644 > > --- a/arch/arm64/lib/xor-neon.c > > +++ b/arch/arm64/lib/xor-neon.c > > @@ -172,6 +172,135 @@ void xor_arm64_neon_5(unsigned long bytes, unsigned long *p1, > > } > > EXPORT_SYMBOL(xor_arm64_neon_5); > > > > +static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r) > > +{ > > + uint64x2_t res; > > + > > + asm(".arch armv8.2-a+sha3 \n" > > + "eor3 %0.16b, %1.16b, %2.16b, %3.16b" > > + : "=w"(res) : "w"(p), "w"(q), "w"(r)); > > + return res; > > +} > > The .arch here may confuse the compiler/assembler since it overrides any > other .arch. I think this diff on top would do but I haven't extensively > tested it. I can fold it in if you give it a try: > I was going to respin this without the static_call changes, since those are not going to land anytime soon, and for this code, it doesn't really matter anyway. I'll fold in your diff and test it as well. > diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig > index 5adae54c98d8..c5104e8829e5 100644 > --- a/arch/arm64/Kconfig > +++ b/arch/arm64/Kconfig > @@ -1545,6 +1545,12 @@ endmenu > > menu "ARMv8.2 architectural features" > > +config AS_HAS_ARMV8_2 > + def_bool $(cc-option,-Wa$(comma)-march=armv8.2-a) > + > +config AS_HAS_SHA3 > + def_bool $(as-instr,.arch armv8.2-a+sha3) > + > config ARM64_PMEM > bool "Enable support for persistent memory" > select ARCH_HAS_PMEM_API > @@ -2032,9 +2038,6 @@ config SYSVIPC_COMPAT > def_bool y > depends on COMPAT && SYSVIPC > > -config CC_HAVE_SHA3 > - def_bool $(cc-option, -march=armv8.2-a+sha3) > - > menu "Power management options" > > source "kernel/power/Kconfig" > diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile > index e8cfc5868aa8..2f1de88651e6 100644 > --- a/arch/arm64/Makefile > +++ b/arch/arm64/Makefile > @@ -58,6 +58,11 @@ stack_protector_prepare: prepare0 > include/generated/asm-offsets.h)) > endif > > +ifeq ($(CONFIG_AS_HAS_ARMV8_2), y) > +# make sure to pass the newest target architecture to -march. > +asm-arch := armv8.2-a > +endif > + > # Ensure that if the compiler supports branch protection we default it > # off, this will be overridden if we are using branch protection. > branch-prot-flags-y += $(call cc-option,-mbranch-protection=none) > diff --git a/arch/arm64/lib/xor-neon.c b/arch/arm64/lib/xor-neon.c > index 0415cb94c781..2ca823825363 100644 > --- a/arch/arm64/lib/xor-neon.c > +++ b/arch/arm64/lib/xor-neon.c > @@ -176,7 +176,7 @@ static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r) > { > uint64x2_t res; > > - asm(".arch armv8.2-a+sha3 \n" > + asm(ARM64_ASM_PREAMBLE ".arch_extension sha3\n" > "eor3 %0.16b, %1.16b, %2.16b, %3.16b" > : "=w"(res) : "w"(p), "w"(q), "w"(r)); > return res; > @@ -311,7 +311,7 @@ EXPORT_STATIC_CALL(xor_arm64_5); > > static int __init xor_neon_init(void) > { > - if (IS_ENABLED(CONFIG_CC_HAVE_SHA3) && cpu_have_named_feature(SHA3)) { > + if (IS_ENABLED(CONFIG_AS_HAS_SHA3) && cpu_have_named_feature(SHA3)) { > static_call_update(xor_arm64_3, xor_arm64_eor3_3); > static_call_update(xor_arm64_4, xor_arm64_eor3_4); > static_call_update(xor_arm64_5, xor_arm64_eor3_5); > > -- > Catalin
On Mon, Dec 13, 2021 at 02:33:21PM +0100, Ard Biesheuvel wrote: > On Mon, 13 Dec 2021 at 14:25, Catalin Marinas <catalin.marinas@arm.com> wrote: > > On Tue, Nov 09, 2021 at 01:03:36PM +0100, Ard Biesheuvel wrote: > > > diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig > > > index 6f2d3e31fb54..14354acba5b4 100644 > > > --- a/arch/arm64/Kconfig > > > +++ b/arch/arm64/Kconfig > > > @@ -2034,6 +2034,9 @@ config SYSVIPC_COMPAT > > > def_bool y > > > depends on COMPAT && SYSVIPC > > > > > > +config CC_HAVE_SHA3 > > > + def_bool $(cc-option, -march=armv8.2-a+sha3) > > > > Is it the compiler or the assembler that we need to support this? I > > think it's sufficient to only check the latter. > > > > I'd also move it to the ARMv8.2 section. > > > > > + > > > menu "Power management options" > > > > > > source "kernel/power/Kconfig" > > > diff --git a/arch/arm64/lib/xor-neon.c b/arch/arm64/lib/xor-neon.c > > > index ee4795f3e166..0415cb94c781 100644 > > > --- a/arch/arm64/lib/xor-neon.c > > > +++ b/arch/arm64/lib/xor-neon.c > > > @@ -172,6 +172,135 @@ void xor_arm64_neon_5(unsigned long bytes, unsigned long *p1, > > > } > > > EXPORT_SYMBOL(xor_arm64_neon_5); > > > > > > +static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r) > > > +{ > > > + uint64x2_t res; > > > + > > > + asm(".arch armv8.2-a+sha3 \n" > > > + "eor3 %0.16b, %1.16b, %2.16b, %3.16b" > > > + : "=w"(res) : "w"(p), "w"(q), "w"(r)); > > > + return res; > > > +} > > > > The .arch here may confuse the compiler/assembler since it overrides any > > other .arch. I think this diff on top would do but I haven't extensively > > tested it. I can fold it in if you give it a try: > > I was going to respin this without the static_call changes, since > those are not going to land anytime soon, I thought the generic implementation still works, though not the most efficient. > and for this code, it > doesn't really matter anyway. I'll fold in your diff and test it as > well. Sounds fine to me.
On Mon, 13 Dec 2021 at 16:05, Catalin Marinas <catalin.marinas@arm.com> wrote: > > On Mon, Dec 13, 2021 at 02:33:21PM +0100, Ard Biesheuvel wrote: > > On Mon, 13 Dec 2021 at 14:25, Catalin Marinas <catalin.marinas@arm.com> wrote: > > > On Tue, Nov 09, 2021 at 01:03:36PM +0100, Ard Biesheuvel wrote: > > > > diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig > > > > index 6f2d3e31fb54..14354acba5b4 100644 > > > > --- a/arch/arm64/Kconfig > > > > +++ b/arch/arm64/Kconfig > > > > @@ -2034,6 +2034,9 @@ config SYSVIPC_COMPAT > > > > def_bool y > > > > depends on COMPAT && SYSVIPC > > > > > > > > +config CC_HAVE_SHA3 > > > > + def_bool $(cc-option, -march=armv8.2-a+sha3) > > > > > > Is it the compiler or the assembler that we need to support this? I > > > think it's sufficient to only check the latter. > > > > > > I'd also move it to the ARMv8.2 section. > > > > > > > + > > > > menu "Power management options" > > > > > > > > source "kernel/power/Kconfig" > > > > diff --git a/arch/arm64/lib/xor-neon.c b/arch/arm64/lib/xor-neon.c > > > > index ee4795f3e166..0415cb94c781 100644 > > > > --- a/arch/arm64/lib/xor-neon.c > > > > +++ b/arch/arm64/lib/xor-neon.c > > > > @@ -172,6 +172,135 @@ void xor_arm64_neon_5(unsigned long bytes, unsigned long *p1, > > > > } > > > > EXPORT_SYMBOL(xor_arm64_neon_5); > > > > > > > > +static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r) > > > > +{ > > > > + uint64x2_t res; > > > > + > > > > + asm(".arch armv8.2-a+sha3 \n" > > > > + "eor3 %0.16b, %1.16b, %2.16b, %3.16b" > > > > + : "=w"(res) : "w"(p), "w"(q), "w"(r)); > > > > + return res; > > > > +} > > > > > > The .arch here may confuse the compiler/assembler since it overrides any > > > other .arch. I think this diff on top would do but I haven't extensively > > > tested it. I can fold it in if you give it a try: > > > > I was going to respin this without the static_call changes, since > > those are not going to land anytime soon, > > I thought the generic implementation still works, though not the most > efficient. > It does work, but the existing code already uses function pointers, so at this point, it is just unneeded churn. > > and for this code, it > > doesn't really matter anyway. I'll fold in your diff and test it as > > well. > > Sounds fine to me. > > -- > Catalin
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 6f2d3e31fb54..14354acba5b4 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -2034,6 +2034,9 @@ config SYSVIPC_COMPAT def_bool y depends on COMPAT && SYSVIPC +config CC_HAVE_SHA3 + def_bool $(cc-option, -march=armv8.2-a+sha3) + menu "Power management options" source "kernel/power/Kconfig" diff --git a/arch/arm64/lib/xor-neon.c b/arch/arm64/lib/xor-neon.c index ee4795f3e166..0415cb94c781 100644 --- a/arch/arm64/lib/xor-neon.c +++ b/arch/arm64/lib/xor-neon.c @@ -172,6 +172,135 @@ void xor_arm64_neon_5(unsigned long bytes, unsigned long *p1, } EXPORT_SYMBOL(xor_arm64_neon_5); +static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r) +{ + uint64x2_t res; + + asm(".arch armv8.2-a+sha3 \n" + "eor3 %0.16b, %1.16b, %2.16b, %3.16b" + : "=w"(res) : "w"(p), "w"(q), "w"(r)); + return res; +} + +static void xor_arm64_eor3_3(unsigned long bytes, unsigned long *p1, + unsigned long *p2, unsigned long *p3) +{ + uint64_t *dp1 = (uint64_t *)p1; + uint64_t *dp2 = (uint64_t *)p2; + uint64_t *dp3 = (uint64_t *)p3; + + register uint64x2_t v0, v1, v2, v3; + long lines = bytes / (sizeof(uint64x2_t) * 4); + + do { + /* p1 ^= p2 ^ p3 */ + v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0), + vld1q_u64(dp3 + 0)); + v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2), + vld1q_u64(dp3 + 2)); + v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4), + vld1q_u64(dp3 + 4)); + v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6), + vld1q_u64(dp3 + 6)); + + /* store */ + vst1q_u64(dp1 + 0, v0); + vst1q_u64(dp1 + 2, v1); + vst1q_u64(dp1 + 4, v2); + vst1q_u64(dp1 + 6, v3); + + dp1 += 8; + dp2 += 8; + dp3 += 8; + } while (--lines > 0); +} + +static void xor_arm64_eor3_4(unsigned long bytes, unsigned long *p1, + unsigned long *p2, unsigned long *p3, + unsigned long *p4) +{ + uint64_t *dp1 = (uint64_t *)p1; + uint64_t *dp2 = (uint64_t *)p2; + uint64_t *dp3 = (uint64_t *)p3; + uint64_t *dp4 = (uint64_t *)p4; + + register uint64x2_t v0, v1, v2, v3; + long lines = bytes / (sizeof(uint64x2_t) * 4); + + do { + /* p1 ^= p2 ^ p3 */ + v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0), + vld1q_u64(dp3 + 0)); + v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2), + vld1q_u64(dp3 + 2)); + v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4), + vld1q_u64(dp3 + 4)); + v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6), + vld1q_u64(dp3 + 6)); + + /* p1 ^= p4 */ + v0 = veorq_u64(v0, vld1q_u64(dp4 + 0)); + v1 = veorq_u64(v1, vld1q_u64(dp4 + 2)); + v2 = veorq_u64(v2, vld1q_u64(dp4 + 4)); + v3 = veorq_u64(v3, vld1q_u64(dp4 + 6)); + + /* store */ + vst1q_u64(dp1 + 0, v0); + vst1q_u64(dp1 + 2, v1); + vst1q_u64(dp1 + 4, v2); + vst1q_u64(dp1 + 6, v3); + + dp1 += 8; + dp2 += 8; + dp3 += 8; + dp4 += 8; + } while (--lines > 0); +} + +static void xor_arm64_eor3_5(unsigned long bytes, unsigned long *p1, + unsigned long *p2, unsigned long *p3, + unsigned long *p4, unsigned long *p5) +{ + uint64_t *dp1 = (uint64_t *)p1; + uint64_t *dp2 = (uint64_t *)p2; + uint64_t *dp3 = (uint64_t *)p3; + uint64_t *dp4 = (uint64_t *)p4; + uint64_t *dp5 = (uint64_t *)p5; + + register uint64x2_t v0, v1, v2, v3; + long lines = bytes / (sizeof(uint64x2_t) * 4); + + do { + /* p1 ^= p2 ^ p3 */ + v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0), + vld1q_u64(dp3 + 0)); + v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2), + vld1q_u64(dp3 + 2)); + v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4), + vld1q_u64(dp3 + 4)); + v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6), + vld1q_u64(dp3 + 6)); + + /* p1 ^= p4 ^ p5 */ + v0 = eor3(v0, vld1q_u64(dp4 + 0), vld1q_u64(dp5 + 0)); + v1 = eor3(v1, vld1q_u64(dp4 + 2), vld1q_u64(dp5 + 2)); + v2 = eor3(v2, vld1q_u64(dp4 + 4), vld1q_u64(dp5 + 4)); + v3 = eor3(v3, vld1q_u64(dp4 + 6), vld1q_u64(dp5 + 6)); + + /* store */ + vst1q_u64(dp1 + 0, v0); + vst1q_u64(dp1 + 2, v1); + vst1q_u64(dp1 + 4, v2); + vst1q_u64(dp1 + 6, v3); + + dp1 += 8; + dp2 += 8; + dp3 += 8; + dp4 += 8; + dp5 += 8; + } while (--lines > 0); +} + DEFINE_STATIC_CALL(xor_arm64_3, xor_arm64_neon_3); DEFINE_STATIC_CALL(xor_arm64_4, xor_arm64_neon_4); DEFINE_STATIC_CALL(xor_arm64_5, xor_arm64_neon_5); @@ -180,6 +309,22 @@ EXPORT_STATIC_CALL(xor_arm64_3); EXPORT_STATIC_CALL(xor_arm64_4); EXPORT_STATIC_CALL(xor_arm64_5); +static int __init xor_neon_init(void) +{ + if (IS_ENABLED(CONFIG_CC_HAVE_SHA3) && cpu_have_named_feature(SHA3)) { + static_call_update(xor_arm64_3, xor_arm64_eor3_3); + static_call_update(xor_arm64_4, xor_arm64_eor3_4); + static_call_update(xor_arm64_5, xor_arm64_eor3_5); + } + return 0; +} +module_init(xor_neon_init); + +static void __exit xor_neon_exit(void) +{ +} +module_exit(xor_neon_exit); + MODULE_AUTHOR("Jackie Liu <liuyun01@kylinos.cn>"); MODULE_DESCRIPTION("ARMv8 XOR Extensions"); MODULE_LICENSE("GPL");
Use the EOR3 instruction to implement xor_blocks() if the instruction is available, which is the case if the CPU implements the SHA-3 extension. This is about 20% faster on Apple M1 when using the 5-way version. Signed-off-by: Ard Biesheuvel <ardb@kernel.org> --- arch/arm64/Kconfig | 3 + arch/arm64/lib/xor-neon.c | 145 ++++++++++++++++++++ 2 files changed, 148 insertions(+)