@@ -1,7 +1,7 @@
/*
* Accelerated GHASH implementation with ARMv8 PMULL instructions.
*
- * Copyright (C) 2014 Linaro Ltd. <ard.biesheuvel@linaro.org>
+ * Copyright (C) 2014 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published
@@ -11,24 +11,119 @@
#include <linux/linkage.h>
#include <asm/assembler.h>
- SHASH .req v0
- SHASH2 .req v1
- T1 .req v2
- T2 .req v3
- MASK .req v4
- XL .req v5
- XM .req v6
- XH .req v7
- IN1 .req v7
+ SHASH .req v0
+ SHASH2 .req v1
+ T1 .req v2
+ T2 .req v3
+ MASK .req v4
+ XL .req v5
+ XM .req v6
+ XH .req v7
+ IN1 .req v7
+
+ k00_16 .req v8 // AND mask used by __pmull_p8; loaded in pmull_ghash_update_p8
+ k32_48 .req v9 // AND mask used by __pmull_p8; loaded in pmull_ghash_update_p8
+
+ t3 .req v10 // t3-t9: scratch registers overwritten by every __pmull_p8 expansion
+ t4 .req v11
+ t5 .req v12
+ t6 .req v13
+ t7 .req v14
+ t8 .req v15
+ t9 .req v16
+
+ perm1 .req v17 // perm1-perm4: tbl index vectors for the upper-half path of __pmull_p8
+ perm2 .req v18
+ perm3 .req v19
+ perm4 .req v20
.text
.arch armv8-a+crypto
- /*
- * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
- * struct ghash_key const *k, const char *head)
- */
-ENTRY(pmull_ghash_update)
+ .macro __pmull_p64, rd, rn, rm, i // \rd = polynomial product of one 64-bit lane of \rn and \rm
+ .ifb \i // \i left blank: use the low 64-bit lanes
+ pmull \rd\().1q, \rn\().1d, \rm\().1d
+ .else // \i given (any non-blank value): use the upper 64-bit lanes
+ pmull2 \rd\().1q, \rn\().2d, \rm\().2d
+ .endif
+ .endm
+
+ .macro __pmull_p8, rq, ad, bd, i // same interface as __pmull_p64, built from 8x8->16 bit pmull; overwrites t3-t9
+ .ifb \i // \i left blank: operate on the low 64 bits of \ad and \bd
+ ext t4.8b, \ad\().8b, \ad\().8b, #1 // A1: A byte-rotated by 1
+ ext t8.8b, \bd\().8b, \bd\().8b, #1 // B1: B byte-rotated by 1
+ ext t5.8b, \ad\().8b, \ad\().8b, #2 // A2: A byte-rotated by 2
+ ext t7.8b, \bd\().8b, \bd\().8b, #2 // B2: B byte-rotated by 2
+ ext t6.8b, \ad\().8b, \ad\().8b, #3 // A3: A byte-rotated by 3
+ ext t9.8b, \bd\().8b, \bd\().8b, #3 // B3: B byte-rotated by 3
+ ext t3.8b, \bd\().8b, \bd\().8b, #4 // B4: B byte-rotated by 4
+
+ pmull t4.8h, t4.8b, \bd\().8b // F = A1*B
+ pmull t8.8h, \ad\().8b, t8.8b // E = A*B1
+ pmull t5.8h, t5.8b, \bd\().8b // H = A2*B
+ pmull t7.8h, \ad\().8b, t7.8b // G = A*B2
+ pmull t6.8h, t6.8b, \bd\().8b // J = A3*B
+ pmull t9.8h, \ad\().8b, t9.8b // I = A*B3
+ pmull t3.8h, \ad\().8b, t3.8b // K = A*B4
+ pmull \rq\().8h, \ad\().8b, \bd\().8b // D = A*B
+ .else // \i given: operate on the upper 64 bits; rotations come from tbl with perm1-perm4
+ tbl t4.16b, {\ad\().16b}, perm1.16b // A1
+ tbl t8.16b, {\bd\().16b}, perm1.16b // B1
+ tbl t5.16b, {\ad\().16b}, perm2.16b // A2
+ tbl t7.16b, {\bd\().16b}, perm2.16b // B2
+ tbl t6.16b, {\ad\().16b}, perm3.16b // A3
+ tbl t9.16b, {\bd\().16b}, perm3.16b // B3
+ tbl t3.16b, {\bd\().16b}, perm4.16b // B4
+
+ pmull2 t4.8h, t4.16b, \bd\().16b // F = A1*B
+ pmull2 t8.8h, \ad\().16b, t8.16b // E = A*B1
+ pmull2 t5.8h, t5.16b, \bd\().16b // H = A2*B
+ pmull2 t7.8h, \ad\().16b, t7.16b // G = A*B2
+ pmull2 t6.8h, t6.16b, \bd\().16b // J = A3*B
+ pmull2 t9.8h, \ad\().16b, t9.16b // I = A*B3
+ pmull2 t3.8h, \ad\().16b, t3.16b // K = A*B4
+ pmull2 \rq\().8h, \ad\().16b, \bd\().16b // D = A*B
+ .endif
+
+ eor t4.16b, t4.16b, t8.16b // L = E + F
+ eor t5.16b, t5.16b, t7.16b // M = G + H
+ eor t6.16b, t6.16b, t9.16b // N = I + J
+
+ uzp1 t8.2d, t4.2d, t5.2d // t8 = { L.d[0], M.d[0] }
+ uzp2 t4.2d, t4.2d, t5.2d // t4 = { L.d[1], M.d[1] }
+ uzp1 t7.2d, t6.2d, t3.2d // t7 = { N.d[0], K.d[0] }
+ uzp2 t6.2d, t6.2d, t3.2d // t6 = { N.d[1], K.d[1] }
+
+ // t4 = (L) (P0 + P1) << 8
+ // t5 = (M) (P2 + P3) << 16
+ eor t8.16b, t8.16b, t4.16b // t8 = low halves ^ high halves of L, M
+ and t4.16b, t4.16b, k32_48.16b // keep only the high-half bits selected by k32_48
+
+ // t6 = (N) (P4 + P5) << 24
+ // t7 = (K) (P6 + P7) << 32
+ eor t7.16b, t7.16b, t6.16b // t7 = low halves ^ high halves of N, K
+ and t6.16b, t6.16b, k00_16.16b // keep only the high-half bits selected by k00_16
+
+ eor t8.16b, t8.16b, t4.16b // t8 = low ^ (high & ~k32_48)
+ eor t7.16b, t7.16b, t6.16b // t7 = low ^ (high & ~k00_16)
+
+ zip2 t5.2d, t8.2d, t4.2d // t5 = { t8.d[1], t4.d[1] }: adjusted M
+ zip1 t4.2d, t8.2d, t4.2d // t4 = { t8.d[0], t4.d[0] }: adjusted L
+ zip2 t3.2d, t7.2d, t6.2d // t3 = { t7.d[1], t6.d[1] }: adjusted K
+ zip1 t6.2d, t7.2d, t6.2d // t6 = { t7.d[0], t6.d[0] }: adjusted N
+
+ ext t4.16b, t4.16b, t4.16b, #15 // byte-rotate L by one position (the "<< 8" above)
+ ext t5.16b, t5.16b, t5.16b, #14 // byte-rotate M by two positions ("<< 16")
+ ext t6.16b, t6.16b, t6.16b, #13 // byte-rotate N by three positions ("<< 24")
+ ext t3.16b, t3.16b, t3.16b, #12 // byte-rotate K by four positions ("<< 32")
+
+ eor t4.16b, t4.16b, t5.16b // combine the L and M terms
+ eor t6.16b, t6.16b, t3.16b // combine the N and K terms
+ eor \rq\().16b, \rq\().16b, t4.16b // fold both pairs into D to form the result
+ eor \rq\().16b, \rq\().16b, t6.16b
+ .endm
+
+ .macro __pmull_ghash, pm
ld1 {SHASH.2d}, [x3]
ld1 {XL.2d}, [x1]
movi MASK.16b, #0xe1
@@ -52,23 +147,23 @@ CPU_LE( rev64 T1.16b, T1.16b )
eor T1.16b, T1.16b, T2.16b
eor XL.16b, XL.16b, IN1.16b
- pmull2 XH.1q, SHASH.2d, XL.2d // a1 * b1
+ \pm XH, SHASH, XL, 2 // a1 * b1
eor T1.16b, T1.16b, XL.16b
- pmull XL.1q, SHASH.1d, XL.1d // a0 * b0
- pmull XM.1q, SHASH2.1d, T1.1d // (a1 + a0)(b1 + b0)
+ \pm XL, SHASH, XL // a0 * b0
+ \pm XM, SHASH2, T1 // (a1 + a0)(b1 + b0)
ext T1.16b, XL.16b, XH.16b, #8
eor T2.16b, XL.16b, XH.16b
eor XM.16b, XM.16b, T1.16b
eor XM.16b, XM.16b, T2.16b
- pmull T2.1q, XL.1d, MASK.1d
+ \pm T2, XL, MASK
mov XH.d[0], XM.d[1]
mov XM.d[1], XL.d[0]
eor XL.16b, XM.16b, T2.16b
ext T2.16b, XL.16b, XL.16b, #8
- pmull XL.1q, XL.1d, MASK.1d
+ \pm XL, XL, MASK
eor T2.16b, T2.16b, XH.16b
eor XL.16b, XL.16b, T2.16b
@@ -76,7 +171,31 @@ CPU_LE( rev64 T1.16b, T1.16b )
st1 {XL.2d}, [x1]
ret
-ENDPROC(pmull_ghash_update)
+ .endm
+
+ /*
+ * void pmull_ghash_update_p64(int blocks, u64 dg[], const char *src,
+ * struct ghash_key const *k, const char *head)
+ */
+ENTRY(pmull_ghash_update_p64)
+ __pmull_ghash __pmull_p64 // GHASH body expanded with the 64-bit pmull/pmull2 multiply
+ENDPROC(pmull_ghash_update_p64)
+
+ENTRY(pmull_ghash_update_p8) // same arguments as pmull_ghash_update_p64; uses x5 as scratch
+ // k00_16 := 0x0000000000000000_000000000000ffff
+ // k32_48 := 0x00000000ffffffff_0000ffffffffffff
+ movi k32_48.2d, #0xffffffff // both 64-bit lanes = 0x00000000ffffffff
+ mov k32_48.h[2], k32_48.h[0] // widen the low lane to 0x0000ffffffffffff
+ ushr k00_16.2d, k32_48.2d, #32 // each lane shifted right by 32: 0xffff in the low lane, 0 in the high lane
+
+ mov_q x5, 0x080f0e0d0c0b0a09 // tbl indices 9..15,8: upper 8 source bytes rotated by one
+ dup perm1.2d, x5 // same index pattern in both halves
+ ext perm2.16b, perm1.16b, perm1.16b, #1 // indices for a rotation by two bytes
+ ext perm3.16b, perm1.16b, perm1.16b, #2 // indices for a rotation by three bytes
+ ext perm4.16b, perm1.16b, perm1.16b, #3 // indices for a rotation by four bytes
+
+ __pmull_ghash __pmull_p8 // GHASH body expanded with the 8-bit pmull based multiply
+ENDPROC(pmull_ghash_update_p8)
KS .req v8
CTR .req v9
@@ -26,6 +26,7 @@
MODULE_DESCRIPTION("GHASH and AES-GCM using ARMv8 Crypto Extensions");
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_CRYPTO("ghash");
#define GHASH_BLOCK_SIZE 16
#define GHASH_DIGEST_SIZE 16
@@ -48,8 +49,17 @@ struct gcm_aes_ctx {
struct ghash_key ghash_key;
};
-asmlinkage void pmull_ghash_update(int blocks, u64 dg[], const char *src,
- struct ghash_key const *k, const char *head);
+asmlinkage void pmull_ghash_update_p64(int blocks, u64 dg[], const char *src,
+ struct ghash_key const *k,
+ const char *head); /* assembly routine using the 64-bit PMULL instruction */
+
+asmlinkage void pmull_ghash_update_p8(int blocks, u64 dg[], const char *src,
+ struct ghash_key const *k,
+ const char *head); /* assembly routine using only 8-bit polynomial multiplies */
+
+static void (*pmull_ghash_update)(int blocks, u64 dg[], const char *src,
+ struct ghash_key const *k,
+ const char *head); /* points at one of the two routines above; assigned in ghash_ce_mod_init() */
asmlinkage void pmull_gcm_encrypt(int blocks, u64 dg[], u8 dst[],
const u8 src[], struct ghash_key const *k,
@@ -554,9 +564,18 @@ static int __init ghash_ce_mod_init(void)
{
int ret;
- ret = crypto_register_aead(&gcm_aes_alg);
- if (ret)
- return ret;
+ if (!(elf_hwcap & HWCAP_ASIMD))
+ return -ENODEV;
+
+ if (elf_hwcap & HWCAP_PMULL) {
+ pmull_ghash_update = pmull_ghash_update_p64;
+
+ ret = crypto_register_aead(&gcm_aes_alg);
+ if (ret)
+ return ret;
+ } else {
+ pmull_ghash_update = pmull_ghash_update_p8;
+ }
ret = crypto_register_shash(&ghash_alg);
if (ret)
@@ -570,5 +589,10 @@ static void __exit ghash_ce_mod_exit(void)
crypto_unregister_aead(&gcm_aes_alg);
}
-module_cpu_feature_match(PMULL, ghash_ce_mod_init);
+static const struct cpu_feature ghash_cpu_feature[] = {
+ { cpu_feature(PMULL) }, { } /* PMULL entry followed by an empty terminator */
+};
+MODULE_DEVICE_TABLE(cpu, ghash_cpu_feature); /* NOTE(review): presumably kept so the module still autoloads on PMULL-capable CPUs -- confirm */
+
+module_init(ghash_ce_mod_init); /* plain init hook; the hwcap checks are done inside ghash_ce_mod_init() */
module_exit(ghash_ce_mod_exit);
Implement a NEON fallback for systems that do support NEON but have no support for the optional 64x64->128 polynomial multiplication instruction that is part of the ARMv8 Crypto Extensions. It is based on the paper "Fast Software Polynomial Multiplication on ARM Processors Using the NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and Ricardo Dahab (https://hal.inria.fr/hal-01506572). On a low-end core such as the Cortex-A53 found in the Raspberry Pi3, the NEON-based implementation is ~2.8x faster than the table-based one, and is time invariant as well, making it less vulnerable to timing attacks. When combined with the bit-sliced NEON implementation of AES-CTR, the AES-GCM performance increases by 75% (from 58 to 33 cycles per byte). Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> --- Note that this is the arm64 counterpart of the patch "crypto: arm/ghash - add NEON accelerated fallback for vmull.p64". Raw numbers for a 1.2 GHz Cortex-A53 (Raspberry Pi3) after the patch. This patch applies onto the patch "crypto: arm64/gcm - implement native driver using v8 Crypto Extensions" which can be found here: http://www.mail-archive.com/linux-crypto@vger.kernel.org/msg26385.html arch/arm64/crypto/ghash-ce-core.S | 161 +++++++++++++++++--- arch/arm64/crypto/ghash-ce-glue.c | 36 ++++- 2 files changed, 170 insertions(+), 27 deletions(-)