[RFT] crypto: arm64/ghash - add NEON accelerated fallback for 64-bit PMULL

Message ID: 20170703102919.21714-1-ard.biesheuvel@linaro.org (mailing list archive)
State: New, archived

Commit Message

Ard Biesheuvel July 3, 2017, 10:29 a.m. UTC
Implement a NEON fallback for systems that support NEON but lack the
optional 64x64->128 polynomial multiplication instruction that is part
of the ARMv8 Crypto Extensions. It is based on the paper "Fast Software
Polynomial Multiplication on ARM Processors Using the NEON Engine" by
Danilo Camara, Conrado Gouvea, Julio Lopez and Ricardo Dahab
(https://hal.inria.fr/hal-01506572).
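
To make the underlying idea concrete: a 64x64->128 carry-less multiply
can be assembled from the 8x8->16 carry-less multiplies that the baseline
NEON PMULL instruction does provide, by XOR-accumulating the byte products
at their shift offsets. A minimal scalar sketch in C of what the fallback
computes (not how it schedules it); clmul8() and clmul64_by_bytes() are
hypothetical names used for illustration only:

	#include <stdint.h>

	/* 8x8 -> 16 bit carry-less multiply: the per-lane operation of
	 * the 8-bit NEON PMULL that all ARMv8 NEON units implement */
	static uint16_t clmul8(uint8_t a, uint8_t b)
	{
		uint16_t r = 0;

		for (int i = 0; i < 8; i++)
			if ((b >> i) & 1)
				r ^= (uint16_t)a << i;
		return r;
	}

	/* 64x64 -> 128 bit carry-less multiply built from the 64 byte
	 * products; res[0] holds the low half, res[1] the high half */
	static void clmul64_by_bytes(uint64_t a, uint64_t b, uint64_t res[2])
	{
		res[0] = res[1] = 0;

		for (int i = 0; i < 8; i++) {
			for (int j = 0; j < 8; j++) {
				uint64_t p = clmul8(a >> (8 * i), b >> (8 * j));
				int sh = 8 * (i + j);

				if (sh < 64) {
					res[0] ^= p << sh;
					if (sh > 48)	/* straddles the halves */
						res[1] ^= p >> (64 - sh);
				} else {
					res[1] ^= p << (sh - 64);
				}
			}
		}
	}

Roughly speaking, the trick from the paper is to compute those byte
products eight lanes at a time from byte-rotated copies of the operands
(the A1..A3/B1..B4 values in the asm below) and to mask off lanes that
would otherwise alias, which is what the k00_16/k32_48 constants and the
uzp/zip/ext shuffles take care of.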

On a low-end core such as the Cortex-A53 found in the Raspberry Pi 3, the
NEON-based implementation is ~2.8x faster than the table-based one, and it
also runs in constant time, making it less vulnerable to timing attacks.
When combined with the bit-sliced NEON implementation of AES-CTR, AES-GCM
throughput increases by ~75% (from 58 down to 33 cycles per byte).
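
To spell out the arithmetic behind the ~75% figure:

	58 / 33 ≈ 1.76x the previous throughput, i.e. ~75% more bytes
	processed per unit time (a ~43% drop in per-byte cost).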

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---
Note that this is the arm64 counterpart of the patch
"crypto: arm/ghash - add NEON accelerated fallback for vmull.p64"

Raw numbers for a 1.2 GHz Cortex-A53 (Raspberry Pi 3) after the patch.

This patch applies on top of the patch "crypto: arm64/gcm - implement
native driver using v8 Crypto Extensions", which can be found here:
http://www.mail-archive.com/linux-crypto@vger.kernel.org/msg26385.html

 arch/arm64/crypto/ghash-ce-core.S | 161 +++++++++++++++++---
 arch/arm64/crypto/ghash-ce-glue.c |  36 ++++-
 2 files changed, 170 insertions(+), 27 deletions(-)

Patch

diff --git a/arch/arm64/crypto/ghash-ce-core.S b/arch/arm64/crypto/ghash-ce-core.S
index cb22459eba85..8a789f6154fc 100644
--- a/arch/arm64/crypto/ghash-ce-core.S
+++ b/arch/arm64/crypto/ghash-ce-core.S
@@ -1,7 +1,7 @@ 
 /*
  * Accelerated GHASH implementation with ARMv8 PMULL instructions.
  *
- * Copyright (C) 2014 Linaro Ltd. <ard.biesheuvel@linaro.org>
+ * Copyright (C) 2014 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License version 2 as published
@@ -11,24 +11,119 @@ 
 #include <linux/linkage.h>
 #include <asm/assembler.h>
 
-	SHASH	.req	v0
-	SHASH2	.req	v1
-	T1	.req	v2
-	T2	.req	v3
-	MASK	.req	v4
-	XL	.req	v5
-	XM	.req	v6
-	XH	.req	v7
-	IN1	.req	v7
+	SHASH		.req	v0
+	SHASH2		.req	v1
+	T1		.req	v2
+	T2		.req	v3
+	MASK		.req	v4
+	XL		.req	v5
+	XM		.req	v6
+	XH		.req	v7
+	IN1		.req	v7
+
+	k00_16		.req	v8
+	k32_48		.req	v9
+
+	t3		.req	v10
+	t4		.req	v11
+	t5		.req	v12
+	t6		.req	v13
+	t7		.req	v14
+	t8		.req	v15
+	t9		.req	v16
+
+	perm1		.req	v17
+	perm2		.req	v18
+	perm3		.req	v19
+	perm4		.req	v20
 
 	.text
 	.arch		armv8-a+crypto
 
-	/*
-	 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
-	 *			   struct ghash_key const *k, const char *head)
-	 */
-ENTRY(pmull_ghash_update)
+	.macro		__pmull_p64, rd, rn, rm, i
+	.ifb		\i
+	pmull		\rd\().1q, \rn\().1d, \rm\().1d
+	.else
+	pmull2		\rd\().1q, \rn\().2d, \rm\().2d
+	.endif
+	.endm
+
+	.macro		__pmull_p8, rq, ad, bd, i
+	.ifb		\i
+	ext		t4.8b, \ad\().8b, \ad\().8b, #1		// A1
+	ext		t8.8b, \bd\().8b, \bd\().8b, #1		// B1
+	ext		t5.8b, \ad\().8b, \ad\().8b, #2		// A2
+	ext		t7.8b, \bd\().8b, \bd\().8b, #2		// B2
+	ext		t6.8b, \ad\().8b, \ad\().8b, #3		// A3
+	ext		t9.8b, \bd\().8b, \bd\().8b, #3		// B3
+	ext		t3.8b, \bd\().8b, \bd\().8b, #4		// B4
+
+	pmull		t4.8h, t4.8b, \bd\().8b			// F = A1*B
+	pmull		t8.8h, \ad\().8b, t8.8b			// E = A*B1
+	pmull		t5.8h, t5.8b, \bd\().8b			// H = A2*B
+	pmull		t7.8h, \ad\().8b, t7.8b			// G = A*B2
+	pmull		t6.8h, t6.8b, \bd\().8b			// J = A3*B
+	pmull		t9.8h, \ad\().8b, t9.8b			// I = A*B3
+	pmull		t3.8h, \ad\().8b, t3.8b			// K = A*B4
+	pmull		\rq\().8h, \ad\().8b, \bd\().8b		// D = A*B
+	.else
+	tbl		t4.16b, {\ad\().16b}, perm1.16b		// A1
+	tbl		t8.16b, {\bd\().16b}, perm1.16b		// B1
+	tbl		t5.16b, {\ad\().16b}, perm2.16b		// A2
+	tbl		t7.16b, {\bd\().16b}, perm2.16b		// B2
+	tbl		t6.16b, {\ad\().16b}, perm3.16b		// A3
+	tbl		t9.16b, {\bd\().16b}, perm3.16b		// B3
+	tbl		t3.16b, {\bd\().16b}, perm4.16b		// B4
+
+	pmull2		t4.8h, t4.16b, \bd\().16b		// F = A1*B
+	pmull2		t8.8h, \ad\().16b, t8.16b		// E = A*B1
+	pmull2		t5.8h, t5.16b, \bd\().16b		// H = A2*B
+	pmull2		t7.8h, \ad\().16b, t7.16b		// G = A*B2
+	pmull2		t6.8h, t6.16b, \bd\().16b		// J = A3*B
+	pmull2		t9.8h, \ad\().16b, t9.16b		// I = A*B3
+	pmull2		t3.8h, \ad\().16b, t3.16b		// K = A*B4
+	pmull2		\rq\().8h, \ad\().16b, \bd\().16b	// D = A*B
+	.endif
+
+	eor		t4.16b, t4.16b, t8.16b			// L = E + F
+	eor		t5.16b, t5.16b, t7.16b			// M = G + H
+	eor		t6.16b, t6.16b, t9.16b			// N = I + J
+
+	uzp1		t8.2d, t4.2d, t5.2d
+	uzp2		t4.2d, t4.2d, t5.2d
+	uzp1		t7.2d, t6.2d, t3.2d
+	uzp2		t6.2d, t6.2d, t3.2d
+
+	// t4 = (L) (P0 + P1) << 8
+	// t5 = (M) (P2 + P3) << 16
+	eor		t8.16b, t8.16b, t4.16b
+	and		t4.16b, t4.16b, k32_48.16b
+
+	// t6 = (N) (P4 + P5) << 24
+	// t7 = (K) (P6 + P7) << 32
+	eor		t7.16b, t7.16b, t6.16b
+	and		t6.16b, t6.16b, k00_16.16b
+
+	eor		t8.16b, t8.16b, t4.16b
+	eor		t7.16b, t7.16b, t6.16b
+
+	zip2		t5.2d, t8.2d, t4.2d
+	zip1		t4.2d, t8.2d, t4.2d
+	zip2		t3.2d, t7.2d, t6.2d
+	zip1		t6.2d, t7.2d, t6.2d
+
+	ext		t4.16b, t4.16b, t4.16b, #15
+	ext		t5.16b, t5.16b, t5.16b, #14
+	ext		t6.16b, t6.16b, t6.16b, #13
+	ext		t3.16b, t3.16b, t3.16b, #12
+
+	eor		t4.16b, t4.16b, t5.16b
+	eor		t6.16b, t6.16b, t3.16b
+	eor		\rq\().16b, \rq\().16b, t4.16b
+	eor		\rq\().16b, \rq\().16b, t6.16b
+	.endm
+
+	.macro		__pmull_ghash, pm
 	ld1		{SHASH.2d}, [x3]
 	ld1		{XL.2d}, [x1]
 	movi		MASK.16b, #0xe1
@@ -52,23 +147,23 @@  CPU_LE(	rev64		T1.16b, T1.16b	)
 	eor		T1.16b, T1.16b, T2.16b
 	eor		XL.16b, XL.16b, IN1.16b
 
-	pmull2		XH.1q, SHASH.2d, XL.2d		// a1 * b1
+	\pm		XH, SHASH, XL, 2		// a1 * b1
 	eor		T1.16b, T1.16b, XL.16b
-	pmull		XL.1q, SHASH.1d, XL.1d		// a0 * b0
-	pmull		XM.1q, SHASH2.1d, T1.1d		// (a1 + a0)(b1 + b0)
+	\pm 		XL, SHASH, XL			// a0 * b0
+	\pm 		XM, SHASH2, T1			// (a1 + a0)(b1 + b0)
 
 	ext		T1.16b, XL.16b, XH.16b, #8
 	eor		T2.16b, XL.16b, XH.16b
 	eor		XM.16b, XM.16b, T1.16b
 	eor		XM.16b, XM.16b, T2.16b
-	pmull		T2.1q, XL.1d, MASK.1d
+	\pm		T2, XL, MASK
 
 	mov		XH.d[0], XM.d[1]
 	mov		XM.d[1], XL.d[0]
 
 	eor		XL.16b, XM.16b, T2.16b
 	ext		T2.16b, XL.16b, XL.16b, #8
-	pmull		XL.1q, XL.1d, MASK.1d
+	\pm		XL, XL, MASK
 	eor		T2.16b, T2.16b, XH.16b
 	eor		XL.16b, XL.16b, T2.16b
 
@@ -76,7 +171,31 @@  CPU_LE(	rev64		T1.16b, T1.16b	)
 
 	st1		{XL.2d}, [x1]
 	ret
-ENDPROC(pmull_ghash_update)
+	.endm
+
+	/*
+	 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
+	 *			   struct ghash_key const *k, const char *head)
+	 */
+ENTRY(pmull_ghash_update_p64)
+	__pmull_ghash	__pmull_p64
+ENDPROC(pmull_ghash_update_p64)
+
+ENTRY(pmull_ghash_update_p8)
+	// k00_16 := 0x0000000000000000_000000000000ffff
+	// k32_48 := 0x00000000ffffffff_0000ffffffffffff
+	movi		k32_48.2d, #0xffffffff
+	mov		k32_48.h[2], k32_48.h[0]
+	ushr		k00_16.2d, k32_48.2d, #32
+
+	mov_q		x5, 0x080f0e0d0c0b0a09
+	dup		perm1.2d, x5
+	ext		perm2.16b, perm1.16b, perm1.16b, #1
+	ext		perm3.16b, perm1.16b, perm1.16b, #2
+	ext		perm4.16b, perm1.16b, perm1.16b, #3
+
+	__pmull_ghash	__pmull_p8
+ENDPROC(pmull_ghash_update_p8)
 
 	KS		.req	v8
 	CTR		.req	v9
diff --git a/arch/arm64/crypto/ghash-ce-glue.c b/arch/arm64/crypto/ghash-ce-glue.c
index 524dd5a5aca1..6bf08e4d84fe 100644
--- a/arch/arm64/crypto/ghash-ce-glue.c
+++ b/arch/arm64/crypto/ghash-ce-glue.c
@@ -26,6 +26,7 @@ 
 MODULE_DESCRIPTION("GHASH and AES-GCM using ARMv8 Crypto Extensions");
 MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
 MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_CRYPTO("ghash");
 
 #define GHASH_BLOCK_SIZE	16
 #define GHASH_DIGEST_SIZE	16
@@ -48,8 +49,17 @@  struct gcm_aes_ctx {
 	struct ghash_key	ghash_key;
 };
 
-asmlinkage void pmull_ghash_update(int blocks, u64 dg[], const char *src,
-				   struct ghash_key const *k, const char *head);
+asmlinkage void pmull_ghash_update_p64(int blocks, u64 dg[], const char *src,
+				       struct ghash_key const *k,
+				       const char *head);
+
+asmlinkage void pmull_ghash_update_p8(int blocks, u64 dg[], const char *src,
+				      struct ghash_key const *k,
+				      const char *head);
+
+static void (*pmull_ghash_update)(int blocks, u64 dg[], const char *src,
+				  struct ghash_key const *k,
+				  const char *head);
 
 asmlinkage void pmull_gcm_encrypt(int blocks, u64 dg[], u8 dst[],
 				  const u8 src[], struct ghash_key const *k,
@@ -554,9 +564,18 @@  static int __init ghash_ce_mod_init(void)
 {
 	int ret;
 
-	ret = crypto_register_aead(&gcm_aes_alg);
-	if (ret)
-		return ret;
+	if (!(elf_hwcap & HWCAP_ASIMD))
+		return -ENODEV;
+
+	if (elf_hwcap & HWCAP_PMULL) {
+		pmull_ghash_update = pmull_ghash_update_p64;
+
+		ret = crypto_register_aead(&gcm_aes_alg);
+		if (ret)
+			return ret;
+	} else {
+		pmull_ghash_update = pmull_ghash_update_p8;
+	}
 
 	ret = crypto_register_shash(&ghash_alg);
 	if (ret)
@@ -570,5 +589,10 @@  static void __exit ghash_ce_mod_exit(void)
 	crypto_unregister_aead(&gcm_aes_alg);
 }
 
-module_cpu_feature_match(PMULL, ghash_ce_mod_init);
+static const struct cpu_feature ghash_cpu_feature[] = {
+	{ cpu_feature(PMULL) }, { }
+};
+MODULE_DEVICE_TABLE(cpu, ghash_cpu_feature);
+
+module_init(ghash_ce_mod_init);
 module_exit(ghash_ce_mod_exit);
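
For checking the asm against a specification: the movi MASK.16b, #0xe1
constant and the final folding steps implement the GHASH reduction modulo
x^128 + x^7 + x^2 + x + 1 in its bit-reflected form. Below is a bitwise,
illustration-only C reference of the GF(2^128) multiply, following the
NIST SP 800-38D definition; ghash_mul() is a hypothetical helper name,
not part of this patch:

	#include <stdint.h>
	#include <string.h>

	/* Z = X * H in GF(2^128) as GHASH defines it: blocks are
	 * big-endian and "bit 0" is the MSB of byte 0 */
	static void ghash_mul(uint8_t z[16], const uint8_t x[16],
			      const uint8_t h[16])
	{
		uint8_t v[16], r[16] = { 0 };

		memcpy(v, h, 16);
		for (int i = 0; i < 128; i++) {
			/* r ^= v whenever bit i of x is set */
			if (x[i / 8] & (0x80 >> (i % 8)))
				for (int j = 0; j < 16; j++)
					r[j] ^= v[j];

			/* v *= x: one right shift in GHASH bit order,
			 * folding the dropped bit back in with 0xe1 -
			 * the same constant loaded into MASK above */
			int carry = v[15] & 1;

			for (int j = 15; j > 0; j--)
				v[j] = (v[j] >> 1) | (v[j - 1] << 7);
			v[0] >>= 1;
			if (carry)
				v[0] ^= 0xe1;
		}
		memcpy(z, r, 16);
	}

The Karatsuba-style split in the shared macro (a1 * b1, a0 * b0,
(a1 + a0)(b1 + b0)) plus the two multiplies against MASK compute the same
function in constant time, one block at a time.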