
[2/2] arm64/xor: use EOR3 instructions when available

Message ID 20211109120336.3561463-3-ardb@kernel.org
State New, archived
Series arm64: use SHA3 instructions to speed up XOR

Commit Message

Ard Biesheuvel Nov. 9, 2021, 12:03 p.m. UTC
Use the EOR3 instruction to implement xor_blocks() if the instruction is
available, which is the case if the CPU implements the SHA-3 extension.
This is about 20% faster on Apple M1 when using the 5-way version.

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 arch/arm64/Kconfig        |   3 +
 arch/arm64/lib/xor-neon.c | 145 ++++++++++++++++++++
 2 files changed, 148 insertions(+)
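
As a quick aside (an editorial illustration, not part of the patch): the win comes from EOR3 being a single three-input XOR. For the 5-way update p1 ^= p2 ^ p3 ^ p4 ^ p5, each 16-byte chunk needs four plain EORs but only two EOR3s, so the XOR work per 64-byte loop iteration drops from 16 to 8 instructions; with the loads and stores unchanged, that is roughly in line with the ~20% figure quoted above. A user-space sketch using NEON intrinsics (veor3q_u64 is the ACLE intrinsic form, assuming a toolchain built with +sha3; the patch itself uses inline asm so the rest of the kernel can still be built without the extension):

#include <arm_neon.h>

/* One 16-byte chunk of p1 ^= p2 ^ p3 ^ p4 ^ p5 with 2-input XOR: four EORs. */
static uint64x2_t xor5_neon(uint64x2_t p1, uint64x2_t p2, uint64x2_t p3,
			    uint64x2_t p4, uint64x2_t p5)
{
	return veorq_u64(veorq_u64(veorq_u64(p1, p2), veorq_u64(p3, p4)), p5);
}

/* The same chunk with the SHA-3 EOR3 instruction: two 3-input XORs. */
static uint64x2_t xor5_eor3(uint64x2_t p1, uint64x2_t p2, uint64x2_t p3,
			    uint64x2_t p4, uint64x2_t p5)
{
	return veor3q_u64(veor3q_u64(p1, p2, p3), p4, p5);
}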

Comments

Catalin Marinas Dec. 13, 2021, 1:24 p.m. UTC | #1
Hi Ard,

I trust you on the algorithm but some minor issues below.

On Tue, Nov 09, 2021 at 01:03:36PM +0100, Ard Biesheuvel wrote:
> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> index 6f2d3e31fb54..14354acba5b4 100644
> --- a/arch/arm64/Kconfig
> +++ b/arch/arm64/Kconfig
> @@ -2034,6 +2034,9 @@ config SYSVIPC_COMPAT
>  	def_bool y
>  	depends on COMPAT && SYSVIPC
>  
> +config CC_HAVE_SHA3
> +	def_bool $(cc-option, -march=armv8.2-a+sha3)

Is it the compiler or the assembler that needs to support this? I
think it's sufficient to only check the latter.

I'd also move it to the ARMv8.2 section.

> +
>  menu "Power management options"
>  
>  source "kernel/power/Kconfig"
> diff --git a/arch/arm64/lib/xor-neon.c b/arch/arm64/lib/xor-neon.c
> index ee4795f3e166..0415cb94c781 100644
> --- a/arch/arm64/lib/xor-neon.c
> +++ b/arch/arm64/lib/xor-neon.c
> @@ -172,6 +172,135 @@ void xor_arm64_neon_5(unsigned long bytes, unsigned long *p1,
>  }
>  EXPORT_SYMBOL(xor_arm64_neon_5);
>  
> +static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r)
> +{
> +	uint64x2_t res;
> +
> +	asm(".arch	armv8.2-a+sha3		\n"
> +	    "eor3 %0.16b, %1.16b, %2.16b, %3.16b"
> +	    : "=w"(res) : "w"(p), "w"(q), "w"(r));
> +	return res;
> +}

The .arch here may confuse the compiler/assembler since it overrides any
other .arch. I think this diff on top would do but I haven't extensively
tested it. I can fold it in if you give it a try:

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 5adae54c98d8..c5104e8829e5 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -1545,6 +1545,12 @@ endmenu
 
 menu "ARMv8.2 architectural features"
 
+config AS_HAS_ARMV8_2
+	def_bool $(cc-option,-Wa$(comma)-march=armv8.2-a)
+
+config AS_HAS_SHA3
+	def_bool $(as-instr,.arch armv8.2-a+sha3)
+
 config ARM64_PMEM
 	bool "Enable support for persistent memory"
 	select ARCH_HAS_PMEM_API
@@ -2032,9 +2038,6 @@ config SYSVIPC_COMPAT
 	def_bool y
 	depends on COMPAT && SYSVIPC
 
-config CC_HAVE_SHA3
-	def_bool $(cc-option, -march=armv8.2-a+sha3)
-
 menu "Power management options"
 
 source "kernel/power/Kconfig"
diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile
index e8cfc5868aa8..2f1de88651e6 100644
--- a/arch/arm64/Makefile
+++ b/arch/arm64/Makefile
@@ -58,6 +58,11 @@ stack_protector_prepare: prepare0
 					include/generated/asm-offsets.h))
 endif
 
+ifeq ($(CONFIG_AS_HAS_ARMV8_2), y)
+# make sure to pass the newest target architecture to -march.
+asm-arch := armv8.2-a
+endif
+
 # Ensure that if the compiler supports branch protection we default it
 # off, this will be overridden if we are using branch protection.
 branch-prot-flags-y += $(call cc-option,-mbranch-protection=none)
diff --git a/arch/arm64/lib/xor-neon.c b/arch/arm64/lib/xor-neon.c
index 0415cb94c781..2ca823825363 100644
--- a/arch/arm64/lib/xor-neon.c
+++ b/arch/arm64/lib/xor-neon.c
@@ -176,7 +176,7 @@ static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r)
 {
 	uint64x2_t res;
 
-	asm(".arch	armv8.2-a+sha3		\n"
+	asm(ARM64_ASM_PREAMBLE ".arch_extension sha3\n"
 	    "eor3 %0.16b, %1.16b, %2.16b, %3.16b"
 	    : "=w"(res) : "w"(p), "w"(q), "w"(r));
 	return res;
@@ -311,7 +311,7 @@ EXPORT_STATIC_CALL(xor_arm64_5);
 
 static int __init xor_neon_init(void)
 {
-	if (IS_ENABLED(CONFIG_CC_HAVE_SHA3) && cpu_have_named_feature(SHA3)) {
+	if (IS_ENABLED(CONFIG_AS_HAS_SHA3) && cpu_have_named_feature(SHA3)) {
 		static_call_update(xor_arm64_3, xor_arm64_eor3_3);
 		static_call_update(xor_arm64_4, xor_arm64_eor3_4);
 		static_call_update(xor_arm64_5, xor_arm64_eor3_5);
Ard Biesheuvel Dec. 13, 2021, 1:33 p.m. UTC | #2
On Mon, 13 Dec 2021 at 14:25, Catalin Marinas <catalin.marinas@arm.com> wrote:
>
> Hi Ard,
>
> I trust you on the algorithm but some minor issues below.
>
> On Tue, Nov 09, 2021 at 01:03:36PM +0100, Ard Biesheuvel wrote:
> > diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> > index 6f2d3e31fb54..14354acba5b4 100644
> > --- a/arch/arm64/Kconfig
> > +++ b/arch/arm64/Kconfig
> > @@ -2034,6 +2034,9 @@ config SYSVIPC_COMPAT
> >       def_bool y
> >       depends on COMPAT && SYSVIPC
> >
> > +config CC_HAVE_SHA3
> > +     def_bool $(cc-option, -march=armv8.2-a+sha3)
>
> Is it the compiler or the assembler that we need to support this? I
> think it's sufficient to only check the latter.
>
> I'd also move it to the ARMv8.2 section.
>
> > +
> >  menu "Power management options"
> >
> >  source "kernel/power/Kconfig"
> > diff --git a/arch/arm64/lib/xor-neon.c b/arch/arm64/lib/xor-neon.c
> > index ee4795f3e166..0415cb94c781 100644
> > --- a/arch/arm64/lib/xor-neon.c
> > +++ b/arch/arm64/lib/xor-neon.c
> > @@ -172,6 +172,135 @@ void xor_arm64_neon_5(unsigned long bytes, unsigned long *p1,
> >  }
> >  EXPORT_SYMBOL(xor_arm64_neon_5);
> >
> > +static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r)
> > +{
> > +     uint64x2_t res;
> > +
> > +     asm(".arch      armv8.2-a+sha3          \n"
> > +         "eor3 %0.16b, %1.16b, %2.16b, %3.16b"
> > +         : "=w"(res) : "w"(p), "w"(q), "w"(r));
> > +     return res;
> > +}
>
> The .arch here may confuse the compiler/assembler since it overrides any
> other .arch. I think this diff on top would do but I haven't extensively
> tested it. I can fold it in if you give it a try:
>

I was going to respin this without the static_call changes, since
those are not going to land anytime soon, and for this code, it
doesn't really matter anyway. I'll fold in your diff and test it as
well.


> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> index 5adae54c98d8..c5104e8829e5 100644
> --- a/arch/arm64/Kconfig
> +++ b/arch/arm64/Kconfig
> @@ -1545,6 +1545,12 @@ endmenu
>
>  menu "ARMv8.2 architectural features"
>
> +config AS_HAS_ARMV8_2
> +       def_bool $(cc-option,-Wa$(comma)-march=armv8.2-a)
> +
> +config AS_HAS_SHA3
> +       def_bool $(as-instr,.arch armv8.2-a+sha3)
> +
>  config ARM64_PMEM
>         bool "Enable support for persistent memory"
>         select ARCH_HAS_PMEM_API
> @@ -2032,9 +2038,6 @@ config SYSVIPC_COMPAT
>         def_bool y
>         depends on COMPAT && SYSVIPC
>
> -config CC_HAVE_SHA3
> -       def_bool $(cc-option, -march=armv8.2-a+sha3)
> -
>  menu "Power management options"
>
>  source "kernel/power/Kconfig"
> diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile
> index e8cfc5868aa8..2f1de88651e6 100644
> --- a/arch/arm64/Makefile
> +++ b/arch/arm64/Makefile
> @@ -58,6 +58,11 @@ stack_protector_prepare: prepare0
>                                         include/generated/asm-offsets.h))
>  endif
>
> +ifeq ($(CONFIG_AS_HAS_ARMV8_2), y)
> +# make sure to pass the newest target architecture to -march.
> +asm-arch := armv8.2-a
> +endif
> +
>  # Ensure that if the compiler supports branch protection we default it
>  # off, this will be overridden if we are using branch protection.
>  branch-prot-flags-y += $(call cc-option,-mbranch-protection=none)
> diff --git a/arch/arm64/lib/xor-neon.c b/arch/arm64/lib/xor-neon.c
> index 0415cb94c781..2ca823825363 100644
> --- a/arch/arm64/lib/xor-neon.c
> +++ b/arch/arm64/lib/xor-neon.c
> @@ -176,7 +176,7 @@ static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r)
>  {
>         uint64x2_t res;
>
> -       asm(".arch      armv8.2-a+sha3          \n"
> +       asm(ARM64_ASM_PREAMBLE ".arch_extension sha3\n"
>             "eor3 %0.16b, %1.16b, %2.16b, %3.16b"
>             : "=w"(res) : "w"(p), "w"(q), "w"(r));
>         return res;
> @@ -311,7 +311,7 @@ EXPORT_STATIC_CALL(xor_arm64_5);
>
>  static int __init xor_neon_init(void)
>  {
> -       if (IS_ENABLED(CONFIG_CC_HAVE_SHA3) && cpu_have_named_feature(SHA3)) {
> +       if (IS_ENABLED(CONFIG_AS_HAS_SHA3) && cpu_have_named_feature(SHA3)) {
>                 static_call_update(xor_arm64_3, xor_arm64_eor3_3);
>                 static_call_update(xor_arm64_4, xor_arm64_eor3_4);
>                 static_call_update(xor_arm64_5, xor_arm64_eor3_5);
>
> --
> Catalin
Catalin Marinas Dec. 13, 2021, 3:05 p.m. UTC | #3
On Mon, Dec 13, 2021 at 02:33:21PM +0100, Ard Biesheuvel wrote:
> On Mon, 13 Dec 2021 at 14:25, Catalin Marinas <catalin.marinas@arm.com> wrote:
> > On Tue, Nov 09, 2021 at 01:03:36PM +0100, Ard Biesheuvel wrote:
> > > diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> > > index 6f2d3e31fb54..14354acba5b4 100644
> > > --- a/arch/arm64/Kconfig
> > > +++ b/arch/arm64/Kconfig
> > > @@ -2034,6 +2034,9 @@ config SYSVIPC_COMPAT
> > >       def_bool y
> > >       depends on COMPAT && SYSVIPC
> > >
> > > +config CC_HAVE_SHA3
> > > +     def_bool $(cc-option, -march=armv8.2-a+sha3)
> >
> > Is it the compiler or the assembler that we need to support this? I
> > think it's sufficient to only check the latter.
> >
> > I'd also move it to the ARMv8.2 section.
> >
> > > +
> > >  menu "Power management options"
> > >
> > >  source "kernel/power/Kconfig"
> > > diff --git a/arch/arm64/lib/xor-neon.c b/arch/arm64/lib/xor-neon.c
> > > index ee4795f3e166..0415cb94c781 100644
> > > --- a/arch/arm64/lib/xor-neon.c
> > > +++ b/arch/arm64/lib/xor-neon.c
> > > @@ -172,6 +172,135 @@ void xor_arm64_neon_5(unsigned long bytes, unsigned long *p1,
> > >  }
> > >  EXPORT_SYMBOL(xor_arm64_neon_5);
> > >
> > > +static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r)
> > > +{
> > > +     uint64x2_t res;
> > > +
> > > +     asm(".arch      armv8.2-a+sha3          \n"
> > > +         "eor3 %0.16b, %1.16b, %2.16b, %3.16b"
> > > +         : "=w"(res) : "w"(p), "w"(q), "w"(r));
> > > +     return res;
> > > +}
> >
> > The .arch here may confuse the compiler/assembler since it overrides any
> > other .arch. I think this diff on top would do but I haven't extensively
> > tested it. I can fold it in if you give it a try:
> 
> I was going to respin this without the static_call changes, since
> those are not going to land anytime soon,

I thought the generic implementation still works, though not the most
efficient.

> and for this code, it
> doesn't really matter anyway. I'll fold in your diff and test it as
> well.

Sounds fine to me.
Ard Biesheuvel Dec. 13, 2021, 3:10 p.m. UTC | #4
On Mon, 13 Dec 2021 at 16:05, Catalin Marinas <catalin.marinas@arm.com> wrote:
>
> On Mon, Dec 13, 2021 at 02:33:21PM +0100, Ard Biesheuvel wrote:
> > On Mon, 13 Dec 2021 at 14:25, Catalin Marinas <catalin.marinas@arm.com> wrote:
> > > On Tue, Nov 09, 2021 at 01:03:36PM +0100, Ard Biesheuvel wrote:
> > > > diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> > > > index 6f2d3e31fb54..14354acba5b4 100644
> > > > --- a/arch/arm64/Kconfig
> > > > +++ b/arch/arm64/Kconfig
> > > > @@ -2034,6 +2034,9 @@ config SYSVIPC_COMPAT
> > > >       def_bool y
> > > >       depends on COMPAT && SYSVIPC
> > > >
> > > > +config CC_HAVE_SHA3
> > > > +     def_bool $(cc-option, -march=armv8.2-a+sha3)
> > >
> > > Is it the compiler or the assembler that we need to support this? I
> > > think it's sufficient to only check the latter.
> > >
> > > I'd also move it to the ARMv8.2 section.
> > >
> > > > +
> > > >  menu "Power management options"
> > > >
> > > >  source "kernel/power/Kconfig"
> > > > diff --git a/arch/arm64/lib/xor-neon.c b/arch/arm64/lib/xor-neon.c
> > > > index ee4795f3e166..0415cb94c781 100644
> > > > --- a/arch/arm64/lib/xor-neon.c
> > > > +++ b/arch/arm64/lib/xor-neon.c
> > > > @@ -172,6 +172,135 @@ void xor_arm64_neon_5(unsigned long bytes, unsigned long *p1,
> > > >  }
> > > >  EXPORT_SYMBOL(xor_arm64_neon_5);
> > > >
> > > > +static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r)
> > > > +{
> > > > +     uint64x2_t res;
> > > > +
> > > > +     asm(".arch      armv8.2-a+sha3          \n"
> > > > +         "eor3 %0.16b, %1.16b, %2.16b, %3.16b"
> > > > +         : "=w"(res) : "w"(p), "w"(q), "w"(r));
> > > > +     return res;
> > > > +}
> > >
> > > The .arch here may confuse the compiler/assembler since it overrides any
> > > other .arch. I think this diff on top would do but I haven't extensively
> > > tested it. I can fold it in if you give it a try:
> >
> > I was going to respin this without the static_call changes, since
> > those are not going to land anytime soon,
>
> I thought the generic implementation still works, though not the most
> efficient.
>

It does work, but the existing code already uses function pointers, so
at this point, it is just unneeded churn.

> > and for this code, it
> > doesn't really matter anyway. I'll fold in your diff and test it as
> > well.
>
> Sounds fine to me.
>
> --
> Catalin
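
(Editorial note before the patch itself: the respin discussed above drops the static_call plumbing and keeps the existing function-pointer dispatch. A rough sketch of that shape, assuming the xor_block_template hooks from include/linux/raid/xor.h; the template name and wiring below are illustrative, not the code that was eventually merged, and the .do_2 hook and kernel_neon_begin()/end() wrappers are omitted for brevity:)

/* Default to the plain NEON routines ... */
static struct xor_block_template xor_block_arm64_sketch = {
	.name	= "arm64_neon",
	.do_3	= xor_arm64_neon_3,
	.do_4	= xor_arm64_neon_4,
	.do_5	= xor_arm64_neon_5,
};

static int __init xor_neon_init(void)
{
	/* ... and patch in the EOR3 variants when the CPU has SHA-3. */
	if (IS_ENABLED(CONFIG_AS_HAS_SHA3) && cpu_have_named_feature(SHA3)) {
		xor_block_arm64_sketch.do_3 = xor_arm64_eor3_3;
		xor_block_arm64_sketch.do_4 = xor_arm64_eor3_4;
		xor_block_arm64_sketch.do_5 = xor_arm64_eor3_5;
	}
	return 0;
}
module_init(xor_neon_init);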

Patch

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 6f2d3e31fb54..14354acba5b4 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -2034,6 +2034,9 @@ config SYSVIPC_COMPAT
 	def_bool y
 	depends on COMPAT && SYSVIPC
 
+config CC_HAVE_SHA3
+	def_bool $(cc-option, -march=armv8.2-a+sha3)
+
 menu "Power management options"
 
 source "kernel/power/Kconfig"
diff --git a/arch/arm64/lib/xor-neon.c b/arch/arm64/lib/xor-neon.c
index ee4795f3e166..0415cb94c781 100644
--- a/arch/arm64/lib/xor-neon.c
+++ b/arch/arm64/lib/xor-neon.c
@@ -172,6 +172,135 @@ void xor_arm64_neon_5(unsigned long bytes, unsigned long *p1,
 }
 EXPORT_SYMBOL(xor_arm64_neon_5);
 
+static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r)
+{
+	uint64x2_t res;
+
+	asm(".arch	armv8.2-a+sha3		\n"
+	    "eor3 %0.16b, %1.16b, %2.16b, %3.16b"
+	    : "=w"(res) : "w"(p), "w"(q), "w"(r));
+	return res;
+}
+
+static void xor_arm64_eor3_3(unsigned long bytes, unsigned long *p1,
+			     unsigned long *p2, unsigned long *p3)
+{
+	uint64_t *dp1 = (uint64_t *)p1;
+	uint64_t *dp2 = (uint64_t *)p2;
+	uint64_t *dp3 = (uint64_t *)p3;
+
+	register uint64x2_t v0, v1, v2, v3;
+	long lines = bytes / (sizeof(uint64x2_t) * 4);
+
+	do {
+		/* p1 ^= p2 ^ p3 */
+		v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
+			  vld1q_u64(dp3 + 0));
+		v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
+			  vld1q_u64(dp3 + 2));
+		v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
+			  vld1q_u64(dp3 + 4));
+		v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
+			  vld1q_u64(dp3 + 6));
+
+		/* store */
+		vst1q_u64(dp1 + 0, v0);
+		vst1q_u64(dp1 + 2, v1);
+		vst1q_u64(dp1 + 4, v2);
+		vst1q_u64(dp1 + 6, v3);
+
+		dp1 += 8;
+		dp2 += 8;
+		dp3 += 8;
+	} while (--lines > 0);
+}
+
+static void xor_arm64_eor3_4(unsigned long bytes, unsigned long *p1,
+			     unsigned long *p2, unsigned long *p3,
+			     unsigned long *p4)
+{
+	uint64_t *dp1 = (uint64_t *)p1;
+	uint64_t *dp2 = (uint64_t *)p2;
+	uint64_t *dp3 = (uint64_t *)p3;
+	uint64_t *dp4 = (uint64_t *)p4;
+
+	register uint64x2_t v0, v1, v2, v3;
+	long lines = bytes / (sizeof(uint64x2_t) * 4);
+
+	do {
+		/* p1 ^= p2 ^ p3 */
+		v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
+			  vld1q_u64(dp3 + 0));
+		v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
+			  vld1q_u64(dp3 + 2));
+		v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
+			  vld1q_u64(dp3 + 4));
+		v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
+			  vld1q_u64(dp3 + 6));
+
+		/* p1 ^= p4 */
+		v0 = veorq_u64(v0, vld1q_u64(dp4 + 0));
+		v1 = veorq_u64(v1, vld1q_u64(dp4 + 2));
+		v2 = veorq_u64(v2, vld1q_u64(dp4 + 4));
+		v3 = veorq_u64(v3, vld1q_u64(dp4 + 6));
+
+		/* store */
+		vst1q_u64(dp1 + 0, v0);
+		vst1q_u64(dp1 + 2, v1);
+		vst1q_u64(dp1 + 4, v2);
+		vst1q_u64(dp1 + 6, v3);
+
+		dp1 += 8;
+		dp2 += 8;
+		dp3 += 8;
+		dp4 += 8;
+	} while (--lines > 0);
+}
+
+static void xor_arm64_eor3_5(unsigned long bytes, unsigned long *p1,
+			     unsigned long *p2, unsigned long *p3,
+			     unsigned long *p4, unsigned long *p5)
+{
+	uint64_t *dp1 = (uint64_t *)p1;
+	uint64_t *dp2 = (uint64_t *)p2;
+	uint64_t *dp3 = (uint64_t *)p3;
+	uint64_t *dp4 = (uint64_t *)p4;
+	uint64_t *dp5 = (uint64_t *)p5;
+
+	register uint64x2_t v0, v1, v2, v3;
+	long lines = bytes / (sizeof(uint64x2_t) * 4);
+
+	do {
+		/* p1 ^= p2 ^ p3 */
+		v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
+			  vld1q_u64(dp3 + 0));
+		v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
+			  vld1q_u64(dp3 + 2));
+		v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
+			  vld1q_u64(dp3 + 4));
+		v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
+			  vld1q_u64(dp3 + 6));
+
+		/* p1 ^= p4 ^ p5 */
+		v0 = eor3(v0, vld1q_u64(dp4 + 0), vld1q_u64(dp5 + 0));
+		v1 = eor3(v1, vld1q_u64(dp4 + 2), vld1q_u64(dp5 + 2));
+		v2 = eor3(v2, vld1q_u64(dp4 + 4), vld1q_u64(dp5 + 4));
+		v3 = eor3(v3, vld1q_u64(dp4 + 6), vld1q_u64(dp5 + 6));
+
+		/* store */
+		vst1q_u64(dp1 + 0, v0);
+		vst1q_u64(dp1 + 2, v1);
+		vst1q_u64(dp1 + 4, v2);
+		vst1q_u64(dp1 + 6, v3);
+
+		dp1 += 8;
+		dp2 += 8;
+		dp3 += 8;
+		dp4 += 8;
+		dp5 += 8;
+	} while (--lines > 0);
+}
+
 DEFINE_STATIC_CALL(xor_arm64_3, xor_arm64_neon_3);
 DEFINE_STATIC_CALL(xor_arm64_4, xor_arm64_neon_4);
 DEFINE_STATIC_CALL(xor_arm64_5, xor_arm64_neon_5);
@@ -180,6 +309,22 @@ EXPORT_STATIC_CALL(xor_arm64_3);
 EXPORT_STATIC_CALL(xor_arm64_4);
 EXPORT_STATIC_CALL(xor_arm64_5);
 
+static int __init xor_neon_init(void)
+{
+	if (IS_ENABLED(CONFIG_CC_HAVE_SHA3) && cpu_have_named_feature(SHA3)) {
+		static_call_update(xor_arm64_3, xor_arm64_eor3_3);
+		static_call_update(xor_arm64_4, xor_arm64_eor3_4);
+		static_call_update(xor_arm64_5, xor_arm64_eor3_5);
+	}
+	return 0;
+}
+module_init(xor_neon_init);
+
+static void __exit xor_neon_exit(void)
+{
+}
+module_exit(xor_neon_exit);
+
 MODULE_AUTHOR("Jackie Liu <liuyun01@kylinos.cn>");
 MODULE_DESCRIPTION("ARMv8 XOR Extensions");
 MODULE_LICENSE("GPL");
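
For completeness, an illustrative (hypothetical) caller: nothing changes on the consumer side, since users such as the RAID code go through the generic xor_blocks() entry point in crypto/xor.c, which dispatches to the do_N hook of whichever template the kernel selected as fastest:

#include <linux/raid/xor.h>

/* XOR two source buffers into dst; with two sources this lands in the
 * template's do_3() hook, i.e. the EOR3 version on SHA-3 capable CPUs.
 * len is expected to be a multiple of the 64-byte inner-loop stride. */
static void xor_two_into(void *dst, void *src1, void *src2, unsigned int len)
{
	void *srcs[] = { src1, src2 };

	xor_blocks(2, len, dst, srcs);
}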