@@ -60,10 +60,16 @@ static bool aegis128_do_simd(void)
bool crypto_aegis128_have_simd(void);
void crypto_aegis128_update_simd(struct aegis_state *state, const void *msg);
+void crypto_aegis128_init_simd(struct aegis_state *state,
+ const union aegis_block *key,
+ const u8 *iv);
void crypto_aegis128_encrypt_chunk_simd(struct aegis_state *state, u8 *dst,
const u8 *src, unsigned int size);
void crypto_aegis128_decrypt_chunk_simd(struct aegis_state *state, u8 *dst,
const u8 *src, unsigned int size);
+void crypto_aegis128_final_simd(struct aegis_state *state,
+ union aegis_block *tag_xor,
+ u64 assoclen, u64 cryptlen);
static void crypto_aegis128_update(struct aegis_state *state)
{
@@ -395,17 +401,21 @@ static int crypto_aegis128_encrypt(struct aead_request *req)
struct skcipher_walk walk;
struct aegis_state state;
- crypto_aegis128_init(&state, &ctx->key, req->iv);
- crypto_aegis128_process_ad(&state, req->src, req->assoclen);
-
skcipher_walk_aead_encrypt(&walk, req, false);
- if (aegis128_do_simd())
+ if (aegis128_do_simd()) {
+ crypto_aegis128_init_simd(&state, &ctx->key, req->iv);
+ crypto_aegis128_process_ad(&state, req->src, req->assoclen);
crypto_aegis128_process_crypt(&state, req, &walk,
crypto_aegis128_encrypt_chunk_simd);
- else
+ crypto_aegis128_final_simd(&state, &tag, req->assoclen,
+ cryptlen);
+ } else {
+ crypto_aegis128_init(&state, &ctx->key, req->iv);
+ crypto_aegis128_process_ad(&state, req->src, req->assoclen);
crypto_aegis128_process_crypt(&state, req, &walk,
crypto_aegis128_encrypt_chunk);
- crypto_aegis128_final(&state, &tag, req->assoclen, cryptlen);
+ crypto_aegis128_final(&state, &tag, req->assoclen, cryptlen);
+ }
scatterwalk_map_and_copy(tag.bytes, req->dst, req->assoclen + cryptlen,
authsize, 1);
@@ -426,17 +436,21 @@ static int crypto_aegis128_decrypt(struct aead_request *req)
scatterwalk_map_and_copy(tag.bytes, req->src, req->assoclen + cryptlen,
authsize, 0);
- crypto_aegis128_init(&state, &ctx->key, req->iv);
- crypto_aegis128_process_ad(&state, req->src, req->assoclen);
-
skcipher_walk_aead_decrypt(&walk, req, false);
- if (aegis128_do_simd())
+ if (aegis128_do_simd()) {
+ crypto_aegis128_init_simd(&state, &ctx->key, req->iv);
+ crypto_aegis128_process_ad(&state, req->src, req->assoclen);
crypto_aegis128_process_crypt(&state, req, &walk,
crypto_aegis128_decrypt_chunk_simd);
- else
+ crypto_aegis128_final_simd(&state, &tag, req->assoclen,
+ cryptlen);
+ } else {
+ crypto_aegis128_init(&state, &ctx->key, req->iv);
+ crypto_aegis128_process_ad(&state, req->src, req->assoclen);
crypto_aegis128_process_crypt(&state, req, &walk,
crypto_aegis128_decrypt_chunk);
- crypto_aegis128_final(&state, &tag, req->assoclen, cryptlen);
+ crypto_aegis128_final(&state, &tag, req->assoclen, cryptlen);
+ }
return crypto_memneq(tag.bytes, zeros, authsize) ? -EBADMSG : 0;
}
@@ -132,6 +132,34 @@ void preload_sbox(void)
:: "r"(crypto_aes_sbox));
}
+void crypto_aegis128_init_neon(void *state, const void *key, const void *iv)
+{
+ static const uint8_t const0[] = {
+ 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d,
+ 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62,
+ };
+ static const uint8_t const1[] = {
+ 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1,
+ 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd,
+ };
+ uint8x16_t k = vld1q_u8(key);
+ uint8x16_t kiv = k ^ vld1q_u8(iv);
+ struct aegis128_state st = {{
+ kiv,
+ vld1q_u8(const1),
+ vld1q_u8(const0),
+ k ^ vld1q_u8(const0),
+ k ^ vld1q_u8(const1),
+ }};
+ int i;
+
+ for (i = 0; i < 5; i++) {
+ st = aegis128_update_neon(st, k);
+ st = aegis128_update_neon(st, kiv);
+ }
+ aegis128_save_state_neon(st, state);
+}
+
void crypto_aegis128_update_neon(void *state, const void *msg)
{
struct aegis128_state st = aegis128_load_state_neon(state);
@@ -210,3 +238,24 @@ void crypto_aegis128_decrypt_chunk_neon(void *state, void *dst, const void *src,
aegis128_save_state_neon(st, state);
}
+
+void crypto_aegis128_final_neon(void *state, void *tag_xor,
+ uint64_t assocbits, uint64_t cryptbits)
+{
+ struct aegis128_state st = aegis128_load_state_neon(state);
+ union {
+ uint64_t pair[2];
+ uint8x16_t v;
+ } t = {{ assocbits, cryptbits }};
+ uint8x16_t tag;
+ int i;
+
+ t.v ^= st.v[3];
+
+ for (i = 0; i < 7; i++)
+ st = aegis128_update_neon(st, t.v);
+
+ tag = vld1q_u8(tag_xor);
+ tag ^= st.v[0] ^ st.v[1] ^ st.v[2] ^ st.v[3] ^ st.v[4];
+ vst1q_u8(tag_xor, tag);
+}
@@ -8,11 +8,14 @@
#include "aegis.h"
+void crypto_aegis128_init_neon(void *state, const void *key, const u8 *iv);
void crypto_aegis128_update_neon(void *state, const void *msg);
void crypto_aegis128_encrypt_chunk_neon(void *state, void *dst, const void *src,
unsigned int size);
void crypto_aegis128_decrypt_chunk_neon(void *state, void *dst, const void *src,
unsigned int size);
+void crypto_aegis128_final_neon(void *state, void *tag_xor,
+ uint64_t assocbits, uint64_t cryptbits);
int aegis128_have_aes_insn __ro_after_init;
@@ -25,6 +28,15 @@ bool crypto_aegis128_have_simd(void)
return IS_ENABLED(CONFIG_ARM64);
}
+void crypto_aegis128_init_simd(union aegis_block *state,
+ const union aegis_block *key,
+ const u8 *iv)
+{
+ kernel_neon_begin();
+ crypto_aegis128_init_neon(state, key, iv);
+ kernel_neon_end();
+}
+
void crypto_aegis128_update_simd(union aegis_block *state, const void *msg)
{
kernel_neon_begin();
@@ -47,3 +59,13 @@ void crypto_aegis128_decrypt_chunk_simd(union aegis_block *state, u8 *dst,
crypto_aegis128_decrypt_chunk_neon(state, dst, src, size);
kernel_neon_end();
}
+
+void crypto_aegis128_final_simd(union aegis_block *state,
+ union aegis_block *tag_xor,
+ u64 assoclen, u64 cryptlen)
+{
+ kernel_neon_begin();
+ crypto_aegis128_final_neon(state, tag_xor, cpu_to_le64(8 * assoclen),
+ cpu_to_le64(8 * cryptlen));
+ kernel_neon_end();
+}
In order to speed up aegis128 processing even more, duplicate the init() and final() routines as SIMD versions in their entirety. This results in a 2x speedup on ARM Cortex-A57 for ~1500 byte inputs (using AES instructions). Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> --- crypto/aegis128-core.c | 38 ++++++++++----- crypto/aegis128-neon-inner.c | 49 ++++++++++++++++++++ crypto/aegis128-neon.c | 22 +++++++++ 3 files changed, 97 insertions(+), 12 deletions(-)