Patchwork [4/5] crypto/arm64: sha3 - new implementation based on special instructions

login
register
mail settings
Submitter Ard Biesheuvel
Date Jan. 12, 2018, 1:15 p.m.
Message ID <20180112131522.25663-5-ard.biesheuvel@linaro.org>
Download mbox | patch
Permalink /patch/10160827/
State Superseded
Delegated to: Herbert Xu
Headers show

Comments

Ard Biesheuvel - Jan. 12, 2018, 1:15 p.m.
Implement the various flavours of SHA3 using the new optional
EOR3/RAX1/XAR/BCAX instructions introduced by ARMv8.2.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---
 arch/arm64/crypto/Kconfig        |   6 +
 arch/arm64/crypto/Makefile       |   3 +
 arch/arm64/crypto/sha3-ce-core.S | 224 ++++++++++++++++++++
 arch/arm64/crypto/sha3-ce-glue.c | 156 ++++++++++++++
 4 files changed, 389 insertions(+)

Patch

diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig
index aad288f4b9de..4f2974687606 100644
--- a/arch/arm64/crypto/Kconfig
+++ b/arch/arm64/crypto/Kconfig
@@ -35,6 +35,12 @@  config CRYPTO_SHA512_ARM64_CE
 	select CRYPTO_HASH
 	select CRYPTO_SHA512_ARM64
 
+config CRYPTO_SHA3_ARM64_CE
+	tristate "SHA3 digest algorithm (ARMv8 Crypto Extensions)"
+	depends on KERNEL_MODE_NEON
+	select CRYPTO_HASH
+	select CRYPTO_SHA3
+
 config CRYPTO_GHASH_ARM64_CE
 	tristate "GHASH/AES-GCM using ARMv8 Crypto Extensions"
 	depends on KERNEL_MODE_NEON
diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile
index d7573d31d397..04eaf8b78816 100644
--- a/arch/arm64/crypto/Makefile
+++ b/arch/arm64/crypto/Makefile
@@ -17,6 +17,9 @@  sha2-ce-y := sha2-ce-glue.o sha2-ce-core.o
 obj-$(CONFIG_CRYPTO_SHA512_ARM64_CE) += sha512-ce.o
 sha512-ce-y := sha512-ce-glue.o sha512-ce-core.o
 
+obj-$(CONFIG_CRYPTO_SHA3_ARM64_CE) += sha3-ce.o
+sha3-ce-y := sha3-ce-glue.o sha3-ce-core.o
+
 obj-$(CONFIG_CRYPTO_GHASH_ARM64_CE) += ghash-ce.o
 ghash-ce-y := ghash-ce-glue.o ghash-ce-core.o
 
diff --git a/arch/arm64/crypto/sha3-ce-core.S b/arch/arm64/crypto/sha3-ce-core.S
new file mode 100644
index 000000000000..b0b3d68ef3d3
--- /dev/null
+++ b/arch/arm64/crypto/sha3-ce-core.S
@@ -0,0 +1,224 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * sha3-ce-core.S - core SHA-3 transform using v8.2 Crypto Extensions
+ *
+ * Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+	.text
+
+	.irp		b,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
+	.set		.Lv\b\().2d, \b		/* symbol .LvN.2d = N: operand text vN.2d resolves to register number N */
+	.set		.Lv\b\().16b, \b	/* same mapping for the vN.16b spelling */
+	.endr					/* the macros below OR these numbers into .inst words */
+
+	.macro		eor3, rd, rn, ra, rm
+	.inst		0xce000000 | .L\rd | (.L\rn << 5) | (.L\ra << 10) | (.L\rm << 16)	/* EOR3: rd = rn ^ rm ^ ra */
+	.endm
+
+	.macro		rax1, rd, rn, rm
+	.inst		0xce608c00 | .L\rd | (.L\rn << 5) | (.L\rm << 16)	/* RAX1: rd = rn ^ rol64(rm, 1) */
+	.endm
+
+	.macro		bcax, rd, rn, ra, rm
+	.inst		0xce200000 | .L\rd | (.L\rn << 5) | (.L\ra << 10) | (.L\rm << 16)	/* BCAX: rd = rn ^ (rm & ~ra); NOTE: 3rd macro argument lands in the Ra field, unlike the architectural Vd, Vn, Vm, Va order */
+	.endm
+
+	.macro		xar, rd, rn, rm, imm6
+	.inst		0xce800000 | .L\rd | (.L\rn << 5) | ((\imm6) << 10) | (.L\rm << 16)	/* XAR: rd = ror64(rn ^ rm, imm6) */
+	.endm
+
+	/* For each of the w2 input blocks: XOR it into the state, run 24 rounds.
+	 * sha3_ce_transform(u64 *st, const u8 *data, int blocks, int dg_size);
+	 */
+ENTRY(sha3_ce_transform)
+	/* load state: st[0..24] -> low 64 bits of v0-v24 */
+	mov		x8, x0			// walk st[] with x8; x0 is reused for the store-back
+	ld1		{ v0.1d- v3.1d}, [x8], #32
+	ld1		{ v4.1d- v7.1d}, [x8], #32
+	ld1		{ v8.1d-v11.1d}, [x8], #32
+	ld1		{v12.1d-v15.1d}, [x8], #32
+	ld1		{v16.1d-v19.1d}, [x8], #32
+	ld1		{v20.1d-v23.1d}, [x8], #32
+	ld1		{v24.1d}, [x8]
+
+0:	sub		w2, w2, #1		// per-block loop; count drops before the test below, so blocks must be non-zero
+	mov		w8, #24			// w8 = rounds left for this block
+	adr_l		x9, .Lsha3_rcon		// x9 = next round constant
+
+	/* load input: lanes 0-6 (56 bytes) are absorbed for every variant */
+	ld1		{v25.8b-v28.8b}, [x1], #32
+	ld1		{v29.8b-v31.8b}, [x1], #24
+	eor		v0.8b, v0.8b, v25.8b
+	eor		v1.8b, v1.8b, v26.8b
+	eor		v2.8b, v2.8b, v27.8b
+	eor		v3.8b, v3.8b, v28.8b
+	eor		v4.8b, v4.8b, v29.8b
+	eor		v5.8b, v5.8b, v30.8b
+	eor		v6.8b, v6.8b, v31.8b
+
+	tbnz		x3, #6, 2f		// dg_size bit 6 set: SHA3-512
+
+	ld1		{v25.8b-v28.8b}, [x1], #32	// lanes 7-12 (48 more bytes)
+	ld1		{v29.8b-v30.8b}, [x1], #16
+	eor		 v7.8b,  v7.8b, v25.8b
+	eor		 v8.8b,  v8.8b, v26.8b
+	eor		 v9.8b,  v9.8b, v27.8b
+	eor		v10.8b, v10.8b, v28.8b
+	eor		v11.8b, v11.8b, v29.8b
+	eor		v12.8b, v12.8b, v30.8b
+
+	tbnz		x3, #4, 1f		// bit 4 set: SHA3-384 or SHA3-224
+
+	// SHA3-256: lanes 13-16
+	ld1		{v25.8b-v28.8b}, [x1], #32
+	eor		v13.8b, v13.8b, v25.8b
+	eor		v14.8b, v14.8b, v26.8b
+	eor		v15.8b, v15.8b, v27.8b
+	eor		v16.8b, v16.8b, v28.8b
+	b		3f
+
+1:	tbz		x3, #2, 3f		// bit 2 cleared? SHA3-384, input is complete
+
+	// SHA3-224: lanes 13-17
+	ld1		{v25.8b-v28.8b}, [x1], #32
+	ld1		{v29.8b}, [x1], #8
+	eor		v13.8b, v13.8b, v25.8b
+	eor		v14.8b, v14.8b, v26.8b
+	eor		v15.8b, v15.8b, v27.8b
+	eor		v16.8b, v16.8b, v28.8b
+	eor		v17.8b, v17.8b, v29.8b
+	b		3f
+
+	// SHA3-512: lanes 7-8
+2:	ld1		{v25.8b-v26.8b}, [x1], #16
+	eor		 v7.8b,  v7.8b, v25.8b
+	eor		 v8.8b,  v8.8b, v26.8b
+
+3:	sub		w8, w8, #1		// round loop
+
+	eor3		v25.16b,  v0.16b,  v5.16b, v10.16b	// theta: v25-v29 = XOR of lanes i, i+5, ..., i+20
+	eor3		v25.16b, v25.16b, v15.16b, v20.16b
+	eor3		v26.16b,  v1.16b,  v6.16b, v11.16b
+	eor3		v26.16b, v26.16b, v16.16b, v21.16b
+	eor3		v27.16b,  v2.16b,  v7.16b, v12.16b
+	eor3		v27.16b, v27.16b, v17.16b, v22.16b
+	eor3		v28.16b,  v3.16b,  v8.16b, v13.16b
+	eor3		v28.16b, v28.16b, v18.16b, v23.16b
+	eor3		v29.16b,  v4.16b,  v9.16b, v14.16b
+	eor3		v29.16b, v29.16b, v19.16b, v24.16b
+
+	rax1		v30.2d, v29.2d, v26.2d		// bc[0]
+	rax1		v31.2d, v27.2d, v29.2d		// bc[3]
+	rax1		v29.2d, v25.2d, v27.2d		// bc[1]
+	rax1		v27.2d, v28.2d, v25.2d		// bc[4]
+	rax1		v25.2d, v26.2d, v28.2d		// bc[2]
+
+	eor		 v0.8b,  v0.8b, v30.8b		// lane 0 stays in place and is not rotated
+	mov		v26.16b, v1.16b			// old lane 1 is overwritten next but consumed last
+	xar		 v1.2d,  v6.2d, v29.2d, (64 - 44)	// rho/pi: rotate left by n, written as (64 - n) in the imm6 field
+	xar		 v6.2d,  v9.2d, v27.2d, (64 - 20)
+	xar		 v9.2d, v22.2d, v25.2d, (64 - 61)
+	xar		v22.2d, v14.2d, v27.2d, (64 - 39)
+	xar		v14.2d, v20.2d, v30.2d, (64 - 18)
+	xar		v20.2d,  v2.2d, v25.2d, (64 - 62)
+	xar		 v2.2d, v12.2d, v25.2d, (64 - 43)
+	xar		v12.2d, v13.2d, v31.2d, (64 - 25)
+	xar		v13.2d, v19.2d, v27.2d, (64 - 8)
+	xar		v19.2d, v23.2d, v31.2d, (64 - 56)
+	xar		v23.2d, v15.2d, v30.2d, (64 - 41)
+	xar		v15.2d,  v4.2d, v27.2d, (64 - 27)
+	xar		 v4.2d, v24.2d, v27.2d, (64 - 14)
+	xar		v24.2d, v21.2d, v29.2d, (64 - 2)
+	xar		v21.2d,  v8.2d, v31.2d, (64 - 55)
+	xar		 v8.2d, v16.2d, v29.2d, (64 - 45)
+	xar		v16.2d,  v5.2d, v30.2d, (64 - 36)
+	xar		 v5.2d,  v3.2d, v31.2d, (64 - 28)
+	xar		 v3.2d, v18.2d, v31.2d, (64 - 21)
+	xar		v18.2d, v17.2d, v25.2d, (64 - 15)
+	xar		v17.2d, v11.2d, v29.2d, (64 - 10)
+	xar		v11.2d,  v7.2d, v25.2d, (64 - 6)
+	xar		 v7.2d, v10.2d, v30.2d, (64 - 3)
+	xar		v10.2d, v26.2d, v29.2d, (64 - 1)
+
+	ld1		{v27.1d}, [x9], #8		// v27 = constant for this round
+
+	bcax		v25.16b,  v0.16b,  v1.16b,  v2.16b	// chi on lanes 0-4; v25/v26 hold new lanes 0/1 while the old ones are still inputs
+	bcax		v26.16b,  v1.16b,  v2.16b,  v3.16b
+	bcax		 v2.16b,  v2.16b,  v3.16b,  v4.16b
+	bcax		 v3.16b,  v3.16b,  v4.16b,  v0.16b
+	bcax		 v4.16b,  v4.16b,  v0.16b,  v1.16b
+	mov		 v0.16b, v25.16b
+	mov		 v1.16b, v26.16b
+
+	bcax		v25.16b,  v5.16b,  v6.16b,  v7.16b	// chi on lanes 5-9
+	bcax		v26.16b,  v6.16b,  v7.16b,  v8.16b
+	bcax		 v7.16b,  v7.16b,  v8.16b,  v9.16b
+	bcax		 v8.16b,  v8.16b,  v9.16b,  v5.16b
+	bcax		 v9.16b,  v9.16b,  v5.16b,  v6.16b
+	mov		 v5.16b, v25.16b
+	mov		 v6.16b, v26.16b
+
+	bcax		v25.16b, v10.16b, v11.16b, v12.16b	// chi on lanes 10-14
+	bcax		v26.16b, v11.16b, v12.16b, v13.16b
+	bcax		v12.16b, v12.16b, v13.16b, v14.16b
+	bcax		v13.16b, v13.16b, v14.16b, v10.16b
+	bcax		v14.16b, v14.16b, v10.16b, v11.16b
+	mov		v10.16b, v25.16b
+	mov		v11.16b, v26.16b
+
+	bcax		v25.16b, v15.16b, v16.16b, v17.16b	// chi on lanes 15-19
+	bcax		v26.16b, v16.16b, v17.16b, v18.16b
+	bcax		v17.16b, v17.16b, v18.16b, v19.16b
+	bcax		v18.16b, v18.16b, v19.16b, v15.16b
+	bcax		v19.16b, v19.16b, v15.16b, v16.16b
+	mov		v15.16b, v25.16b
+	mov		v16.16b, v26.16b
+
+	bcax		v25.16b, v20.16b, v21.16b, v22.16b	// chi on lanes 20-24
+	bcax		v26.16b, v21.16b, v22.16b, v23.16b
+	bcax		v22.16b, v22.16b, v23.16b, v24.16b
+	bcax		v23.16b, v23.16b, v24.16b, v20.16b
+	bcax		v24.16b, v24.16b, v20.16b, v21.16b
+	mov		v20.16b, v25.16b
+	mov		v21.16b, v26.16b
+
+	eor		v0.8b, v0.8b, v27.8b		// iota: fold the round constant into lane 0
+
+	cbnz		w8, 3b			// more rounds for this block
+	cbnz		w2, 0b			// more blocks to absorb
+
+	/* save state: low 64 bits of v0-v24 -> st[0..24] */
+	mov		x8, x0
+	st1		{ v0.1d- v3.1d}, [x8], #32
+	st1		{ v4.1d- v7.1d}, [x8], #32
+	st1		{ v8.1d-v11.1d}, [x8], #32
+	st1		{v12.1d-v15.1d}, [x8], #32
+	st1		{v16.1d-v19.1d}, [x8], #32
+	st1		{v20.1d-v23.1d}, [x8], #32
+	st1		{v24.1d}, [x8]
+	ret
+ENDPROC(sha3_ce_transform)
+
+	.section	".rodata", "a"
+	.align		4
+.Lsha3_rcon:		// 24 64-bit round constants; sha3_ce_transform reads one per round through x9
+	.quad		0x0000000000000001, 0x0000000000008082
+	.quad		0x800000000000808a, 0x8000000080008000
+	.quad		0x000000000000808b, 0x0000000080000001
+	.quad		0x8000000080008081, 0x8000000000008009
+	.quad		0x000000000000008a, 0x0000000000000088
+	.quad		0x0000000080008009, 0x000000008000000a
+	.quad		0x000000008000808b, 0x800000000000008b
+	.quad		0x8000000000008089, 0x8000000000008003
+	.quad		0x8000000000008002, 0x8000000000000080
+	.quad		0x000000000000800a, 0x800000008000000a
+	.quad		0x8000000080008081, 0x8000000000008080
+	.quad		0x0000000080000001, 0x8000000080008008
diff --git a/arch/arm64/crypto/sha3-ce-glue.c b/arch/arm64/crypto/sha3-ce-glue.c
new file mode 100644
index 000000000000..a81377c16f1c
--- /dev/null
+++ b/arch/arm64/crypto/sha3-ce-glue.c
@@ -0,0 +1,156 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * sha3-ce-glue.c - SHA3 using ARMv8 Crypto Extensions
+ *
+ * Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <asm/neon.h>
+#include <asm/simd.h>
+#include <asm/unaligned.h>
+#include <crypto/internal/hash.h>
+#include <crypto/sha3.h>
+#include <linux/cpufeature.h>
+#include <linux/crypto.h>
+#include <linux/module.h>
+
+MODULE_DESCRIPTION("SHA3 secure hash using ARMv8 Crypto Extensions");
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL v2");
+
+asmlinkage void sha3_ce_transform(u64 *st, const u8 *data, int blocks,
+				  int md_len);
+
+static int sha3_ce_update(struct shash_desc *desc, const u8 *data,
+			  unsigned int len)
+{
+	struct sha3_state *state = shash_desc_ctx(desc);
+	unsigned int md_len = crypto_shash_digestsize(desc->tfm);
+	int head, nblocks;
+
+	/* SIMD is not usable right now: hand over to crypto_sha3_update(). */
+	if (!may_use_simd())
+		return crypto_sha3_update(desc, data, len);
+
+	/* Still short of one full block: just buffer the input. */
+	if (state->partial + len < state->rsiz)
+		goto stash;
+
+	if (state->partial) {
+		/* Top up the buffered partial block and absorb it. */
+		head = state->rsiz - state->partial;
+		memcpy(state->buf + state->partial, data, head);
+		kernel_neon_begin();
+		sha3_ce_transform(state->st, state->buf, 1, md_len);
+		kernel_neon_end();
+		data += head;
+		len -= head;
+		state->partial = 0;
+	}
+	/* Absorb the remaining whole blocks straight from the caller. */
+	nblocks = len / state->rsiz;
+	len %= state->rsiz;
+	if (nblocks) {
+		kernel_neon_begin();
+		sha3_ce_transform(state->st, data, nblocks, md_len);
+		kernel_neon_end();
+		data += nblocks * state->rsiz;
+	}
+stash:
+	if (len) {
+		memcpy(state->buf + state->partial, data, len);
+		state->partial += len;
+	}
+	return 0;
+}
+
+static int sha3_ce_final(struct shash_desc *desc, u8 *out)
+{
+	struct sha3_state *state = shash_desc_ctx(desc);
+	unsigned int md_len = crypto_shash_digestsize(desc->tfm);
+	unsigned int lanes = md_len / 8;
+	unsigned int n;
+
+	if (!may_use_simd())
+		return crypto_sha3_final(desc, out);
+
+	/* Pad: 0x06 after the data, zero fill, 0x80 into the last rate byte. */
+	state->buf[state->partial++] = 0x06;
+	memset(state->buf + state->partial, 0, state->rsiz - state->partial);
+	state->buf[state->rsiz - 1] |= 0x80;
+
+	kernel_neon_begin();
+	sha3_ce_transform(state->st, state->buf, 1, md_len);
+	kernel_neon_end();
+	/* Whole 64-bit LE lanes first, then a 32-bit tail if (size & 4). */
+	for (n = 0; n < lanes; n++)
+		put_unaligned_le64(state->st[n], out + 8 * n);
+	if (md_len & 4)
+		put_unaligned_le32(state->st[lanes], out + 8 * lanes);
+
+	*state = (struct sha3_state){};	/* wipe the whole descriptor state */
+	return 0;
+}
+
+static struct shash_alg algs[] = { {	/* one entry per SHA-3 digest size */
+	.digestsize		= SHA3_224_DIGEST_SIZE,
+	.init			= crypto_sha3_init,
+	.update			= sha3_ce_update,
+	.final			= sha3_ce_final,
+	.descsize		= sizeof(struct sha3_state),
+	.base.cra_name		= "sha3-224",
+	.base.cra_driver_name	= "sha3-224-ce",
+	.base.cra_priority	= 200,	/* outrank the generic driver; shash registration sets the type flag itself */
+	.base.cra_blocksize	= SHA3_224_BLOCK_SIZE,
+	.base.cra_module	= THIS_MODULE,
+}, {
+	.digestsize		= SHA3_256_DIGEST_SIZE,
+	.init			= crypto_sha3_init,
+	.update			= sha3_ce_update,
+	.final			= sha3_ce_final,
+	.descsize		= sizeof(struct sha3_state),
+	.base.cra_name		= "sha3-256",
+	.base.cra_driver_name	= "sha3-256-ce",
+	.base.cra_priority	= 200,
+	.base.cra_blocksize	= SHA3_256_BLOCK_SIZE,
+	.base.cra_module	= THIS_MODULE,
+}, {
+	.digestsize		= SHA3_384_DIGEST_SIZE,
+	.init			= crypto_sha3_init,
+	.update			= sha3_ce_update,
+	.final			= sha3_ce_final,
+	.descsize		= sizeof(struct sha3_state),
+	.base.cra_name		= "sha3-384",
+	.base.cra_driver_name	= "sha3-384-ce",
+	.base.cra_priority	= 200,
+	.base.cra_blocksize	= SHA3_384_BLOCK_SIZE,
+	.base.cra_module	= THIS_MODULE,
+}, {
+	.digestsize		= SHA3_512_DIGEST_SIZE,
+	.init			= crypto_sha3_init,
+	.update			= sha3_ce_update,
+	.final			= sha3_ce_final,
+	.descsize		= sizeof(struct sha3_state),
+	.base.cra_name		= "sha3-512",
+	.base.cra_driver_name	= "sha3-512-ce",
+	.base.cra_priority	= 200,
+	.base.cra_blocksize	= SHA3_512_BLOCK_SIZE,
+	.base.cra_module	= THIS_MODULE,
+} };
+
+static int __init sha3_ce_mod_init(void)	/* registers every entry of algs[] */
+{
+	return crypto_register_shashes(algs, ARRAY_SIZE(algs));
+}
+
+static void __exit sha3_ce_mod_fini(void)	/* unregisters every entry of algs[] */
+{
+	crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
+}
+
+module_cpu_feature_match(SHA3, sha3_ce_mod_init);
+module_exit(sha3_ce_mod_fini);