@@ -99,6 +99,7 @@ config CRYPTO_GHASH_ARM_CE
depends on KERNEL_MODE_NEON
select CRYPTO_HASH
select CRYPTO_CRYPTD
+ select CRYPTO_GF128MUL
help
Use an implementation of GHASH (used by the GCM AEAD chaining mode)
that uses the 64x64 to 128 bit polynomial multiplication (vmull.p64)
@@ -63,6 +63,27 @@
k48 .req d31
SHASH2_p64 .req d31
+ HH .req q10
+ HH3 .req q11
+ HH4 .req q12
+ HH34 .req q13
+
+ HH_L .req d20
+ HH_H .req d21
+ HH3_L .req d22
+ HH3_H .req d23
+ HH4_L .req d24
+ HH4_H .req d25
+ HH34_L .req d26
+ HH34_H .req d27
+ SHASH2_H .req d29
+
+ XL2 .req q5
+ XM2 .req q6
+ XH2 .req q7
+ XL3 .req q8
+ XM3 .req q9
+
.text
.fpu crypto-neon-fp-armv8
@@ -175,12 +196,76 @@
beq 0f
vld1.64 {T1}, [ip]
teq r0, #0
- b 1f
+ b 3f
+
+0: .ifc \pn, p64 // 4 blocks per pass, only assembled for the p64 variant
+ tst r0, #3 // skip until #blocks is a
+ bne 2f // round multiple of 4
+ // per pass: XL = (XL ^ in0) * HH4 ^ in1 * HH3 ^ in2 * HH ^ in3 * SHASH, reduced once
+1: vld1.8 {XL2-XM2}, [r2]! // load input blocks 0 and 1
+ vld1.8 {XL3}, [r2]! // load input block 2
+ vrev64.8 T1, XL2 // byte-reverse each 64-bit half of block 0
+
+ subs r0, r0, #4 // flags are consumed by the beq at the bottom of the loop
+
+ vext.8 T2, T1, T1, #8 // T2 = T1 with its two 64-bit halves swapped
+ veor T1_H, T1_H, XL_L
+ veor XL, XL, T2 // fold block 0 into the accumulator XL
+ // (XL ^ block 0) times HH4
+ vmull.p64 XH, HH4_H, XL_H // a1 * b1
+ veor T1_H, T1_H, XL_H // T1_H = XL_L ^ XL_H of the updated XL
+ vmull.p64 XL, HH4_L, XL_L // a0 * b0
+ vmull.p64 XM, HH34_H, T1_H // (a1 + a0)(b1 + b0)
+ // block 1 times HH3
+ vrev64.8 T1, XM2
+
+ vmull.p64 XH2, HH3_H, T1_L // a1 * b1
+ veor T1_L, T1_L, T1_H // b1 + b0
+ vmull.p64 XL2, HH3_L, T1_H // a0 * b0
+ vmull.p64 XM2, HH34_L, T1_L // (a1 + a0)(b1 + b0)
+ // block 2 times HH; XL3/XM3 serve as scratch for the products
+ vrev64.8 T1, XL3
+
+ vmull.p64 XL3, HH_H, T1_L // a1 * b1
+ veor T1_L, T1_L, T1_H // b1 + b0
+ veor XH2, XH2, XL3 // accumulate high product of block 2
+ vmull.p64 XL3, HH_L, T1_H // a0 * b0
+ vmull.p64 XM3, SHASH2_H, T1_L // (a1 + a0)(b1 + b0)
+
+ vld1.8 {T1}, [r2]! // load input block 3
+ veor XL2, XL2, XL3 // accumulate low product of block 2
+ vrev64.8 T1, T1
+ veor XM2, XM2, XM3 // accumulate middle product of block 2
+ // block 3 times SHASH
+ vmull.p64 XL3, SHASH_H, T1_L // a1 * b1
+ veor T1_L, T1_L, T1_H // b1 + b0
+ veor XH2, XH2, XL3 // accumulate high product of block 3
+ vmull.p64 XL3, SHASH_L, T1_H // a0 * b0
+ vmull.p64 XM3, SHASH2_p64, T1_L // (a1 + a0)(b1 + b0)
-0: vld1.64 {T1}, [r2]!
+ veor XL2, XL2, XL3 // accumulate low product of block 3
+ veor XM2, XM2, XM3 // accumulate middle product of block 3
+ // merge the sums for blocks 1-3 into the block 0 products
+ veor XL, XL, XL2
+ veor XH, XH, XH2
+ veor XM, XM, XM2
+ // XM ^= XL ^ XH
+ veor T1, XL, XH
+ veor XM, XM, T1
+
+ __pmull_reduce_p64 // macro defined outside this hunk
+
+ veor T1, T1, XH
+ veor XL, XL, T1
+
+ beq 4f // count reached zero at the subs above: go store XL
+ b 1b // otherwise handle the next 4 blocks
+ .endif
+
+2: vld1.64 {T1}, [r2]!
subs r0, r0, #1
-1: /* multiply XL by SHASH in GF(2^128) */
+3: /* multiply XL by SHASH in GF(2^128) */
#ifndef CONFIG_CPU_BIG_ENDIAN
vrev64.8 T1, T1
#endif
@@ -203,7 +288,7 @@
bne 0b
- vst1.64 {XL}, [r1]
+4: vst1.64 {XL}, [r1]
bx lr
.endm
@@ -212,8 +297,14 @@
* struct ghash_key const *k, const char *head)
*/
ENTRY(pmull_ghash_update_p64)
- vld1.64 {SHASH}, [r3]
+ vld1.64 {SHASH}, [r3]!
+ vld1.64 {HH}, [r3]!
+ vld1.64 {HH3-HH4}, [r3]
+
veor SHASH2_p64, SHASH_L, SHASH_H
+ veor SHASH2_H, HH_L, HH_H
+ veor HH34_L, HH3_L, HH3_H
+ veor HH34_H, HH4_L, HH4_H
vmov.i8 MASK, #0xe1
vshl.u64 MASK, MASK, #57
@@ -1,7 +1,7 @@
/*
* Accelerated GHASH implementation with ARMv8 vmull.p64 instructions.
*
- * Copyright (C) 2015 Linaro Ltd. <ard.biesheuvel@linaro.org>
+ * Copyright (C) 2015 - 2018 Linaro Ltd. <ard.biesheuvel@linaro.org>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published
@@ -28,8 +28,10 @@ MODULE_ALIAS_CRYPTO("ghash");
#define GHASH_DIGEST_SIZE 16
struct ghash_key {
- u64 a;
- u64 b;
+ u64 h[2]; /* key H as written by ghash_reflect(); the p64 asm reads these fields back to back, keep the order */
+ u64 h2[2]; /* H^2, same representation */
+ u64 h3[2]; /* H^3, same representation */
+ u64 h4[2]; /* H^4, same representation */
};
struct ghash_desc_ctx {
@@ -117,26 +119,40 @@ static int ghash_final(struct shash_desc *desc, u8 *dst)
return 0;
}
+static void ghash_reflect(u64 h[], const be128 *k) /* h[0..1] = *k shifted left by one bit, with wrap-around carry */
+{ /* h[0] is derived from k->b, h[1] from k->a; both in CPU byte order */
+ u64 carry = be64_to_cpu(k->a) >> 63; /* top bit of k->a, shifted out below */
+ /* 128-bit left shift by one; the shifted-out bit re-enters at bit 0 of h[0] */
+ h[0] = (be64_to_cpu(k->b) << 1) | carry;
+ h[1] = (be64_to_cpu(k->a) << 1) | (be64_to_cpu(k->b) >> 63);
+ /* a shifted-out bit additionally flips fixed bits in h[1] (presumably the field reduction term) */
+ if (carry)
+ h[1] ^= 0xc200000000000000UL; /* NOTE(review): 64-bit value; where long is 32 bits the compiler widens it, ULL would state that explicitly */
+}
+
static int ghash_setkey(struct crypto_shash *tfm,
const u8 *inkey, unsigned int keylen)
{
struct ghash_key *key = crypto_shash_ctx(tfm);
- u64 a, b;
+ be128 h, k; /* k: the key H as supplied; h: running power of H */
if (keylen != GHASH_BLOCK_SIZE) {
crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
return -EINVAL;
}
- /* perform multiplication by 'x' in GF(2^128) */
- b = get_unaligned_be64(inkey);
- a = get_unaligned_be64(inkey + 8);
+ memcpy(&k, inkey, GHASH_BLOCK_SIZE); /* copy the raw key bytes into a be128 */
+ ghash_reflect(key->h, &k); /* key->h <- H */
+ /* precompute H^2, H^3 and H^4 as well, for the 4-blocks-per-pass asm loop */
+ h = k;
+ gf128mul_lle(&h, &k); /* h <- h * k = H^2 */
+ ghash_reflect(key->h2, &h);
- key->a = (a << 1) | (b >> 63);
- key->b = (b << 1) | (a >> 63);
+ gf128mul_lle(&h, &k); /* h <- H^3 */
+ ghash_reflect(key->h3, &h);
- if (b >> 63)
- key->b ^= 0xc200000000000000UL;
+ gf128mul_lle(&h, &k); /* h <- H^4 */
+ ghash_reflect(key->h4, &h);
return 0;
}
Speed up the GHASH algorithm based on 64-bit polynomial multiplication by adding support for 4-way aggregation. This improves throughput by ~60% on Cortex-A53, from 1.70 cycles per byte to 1.05 cycles per byte. Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> --- arch/arm/crypto/Kconfig | 1 + arch/arm/crypto/ghash-ce-core.S | 101 ++++++++++++++++++++++++++++++-- arch/arm/crypto/ghash-ce-glue.c | 38 ++++++++---- 3 files changed, 124 insertions(+), 16 deletions(-)