diff mbox series

[1/2] crypto: arm64/chacha - fix chacha_4block_xor_neon() for big endian

Message ID: 20190223065408.6279-2-ebiggers@kernel.org (mailing list archive)
State: Accepted
Delegated to: Herbert Xu
Headers show
Series crypto: arm64/chacha - fix for big endian | expand

Commit Message

Eric Biggers Feb. 23, 2019, 6:54 a.m. UTC
From: Eric Biggers <ebiggers@google.com>

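The change to encrypt a fifth ChaCha block using scalar instructions
caused the chacha20-neon, xchacha20-neon, and xchacha12-neon self-tests
to start failing on big endian arm64 kernels.  The bug is that the
keystream block produced in 32-bit scalar registers is directly XOR'd
with the data words, which are loaded and stored in native endianness.
ChaCha defines its keystream as a sequence of little endian 32-bit
words, so this only gives the right result when the native endianness
is little endian.  In big endian mode the data bytes end up XOR'd with
the wrong bytes of each keystream word.  Fix it by byte-swapping the
keystream words (using 'rev' wrapped in CPU_BE()) in big endian mode,
after the final additions that produce them and before they are
combined with the data.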

Fixes: 2fe55987b262 ("crypto: arm64/chacha - use combined SIMD/ALU routine for more speed")
Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 arch/arm64/crypto/chacha-neon-core.S | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

Comments

Ard Biesheuvel Feb. 23, 2019, 9:21 a.m. UTC | #1
On Sat, 23 Feb 2019 at 07:54, Eric Biggers <ebiggers@kernel.org> wrote:
>
> From: Eric Biggers <ebiggers@google.com>
>
> The change to encrypt a fifth ChaCha block using scalar instructions
> caused the chacha20-neon, xchacha20-neon, and xchacha12-neon self-tests
> to start failing on big endian arm64 kernels.  The bug is that the
> keystream block produced in 32-bit scalar registers is directly XOR'd
> with the data words, which are loaded and stored in native endianness.
> Thus in big endian mode the data bytes end up XOR'd with the wrong
> bytes.  Fix it by byte-swapping the keystream words in big endian mode.
>
> Fixes: 2fe55987b262 ("crypto: arm64/chacha - use combined SIMD/ALU routine for more speed")
> Signed-off-by: Eric Biggers <ebiggers@google.com>

Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>

> ---
>  arch/arm64/crypto/chacha-neon-core.S | 16 ++++++++++++++++
>  1 file changed, 16 insertions(+)
>
> diff --git a/arch/arm64/crypto/chacha-neon-core.S b/arch/arm64/crypto/chacha-neon-core.S
> index 021bb9e9784b2..bfb80e10ff7b0 100644
> --- a/arch/arm64/crypto/chacha-neon-core.S
> +++ b/arch/arm64/crypto/chacha-neon-core.S
> @@ -532,6 +532,10 @@ ENTRY(chacha_4block_xor_neon)
>         add             v3.4s, v3.4s, v19.4s
>           add           a2, a2, w8
>           add           a3, a3, w9
> +CPU_BE(          rev           a0, a0          )
> +CPU_BE(          rev           a1, a1          )
> +CPU_BE(          rev           a2, a2          )
> +CPU_BE(          rev           a3, a3          )
>
>         ld4r            {v24.4s-v27.4s}, [x0], #16
>         ld4r            {v28.4s-v31.4s}, [x0]
> @@ -552,6 +556,10 @@ ENTRY(chacha_4block_xor_neon)
>         add             v7.4s, v7.4s, v23.4s
>           add           a6, a6, w8
>           add           a7, a7, w9
> +CPU_BE(          rev           a4, a4          )
> +CPU_BE(          rev           a5, a5          )
> +CPU_BE(          rev           a6, a6          )
> +CPU_BE(          rev           a7, a7          )
>
>         // x8[0-3] += s2[0]
>         // x9[0-3] += s2[1]
> @@ -569,6 +577,10 @@ ENTRY(chacha_4block_xor_neon)
>         add             v11.4s, v11.4s, v27.4s
>           add           a10, a10, w8
>           add           a11, a11, w9
> +CPU_BE(          rev           a8, a8          )
> +CPU_BE(          rev           a9, a9          )
> +CPU_BE(          rev           a10, a10        )
> +CPU_BE(          rev           a11, a11        )
>
>         // x12[0-3] += s3[0]
>         // x13[0-3] += s3[1]
> @@ -586,6 +598,10 @@ ENTRY(chacha_4block_xor_neon)
>         add             v15.4s, v15.4s, v31.4s
>           add           a14, a14, w8
>           add           a15, a15, w9
> +CPU_BE(          rev           a12, a12        )
> +CPU_BE(          rev           a13, a13        )
> +CPU_BE(          rev           a14, a14        )
> +CPU_BE(          rev           a15, a15        )
>
>         // interleave 32-bit words in state n, n+1
>           ldp           w6, w7, [x2], #64
> --
> 2.20.1
>
diff mbox series

Patch

diff --git a/arch/arm64/crypto/chacha-neon-core.S b/arch/arm64/crypto/chacha-neon-core.S
index 021bb9e9784b2..bfb80e10ff7b0 100644
--- a/arch/arm64/crypto/chacha-neon-core.S
+++ b/arch/arm64/crypto/chacha-neon-core.S
@@ -532,6 +532,10 @@  ENTRY(chacha_4block_xor_neon)
 	add		v3.4s, v3.4s, v19.4s
 	  add		a2, a2, w8
 	  add		a3, a3, w9
+CPU_BE(	  rev		a0, a0		)
+CPU_BE(	  rev		a1, a1		)
+CPU_BE(	  rev		a2, a2		)
+CPU_BE(	  rev		a3, a3		)
 
 	ld4r		{v24.4s-v27.4s}, [x0], #16
 	ld4r		{v28.4s-v31.4s}, [x0]
@@ -552,6 +556,10 @@  ENTRY(chacha_4block_xor_neon)
 	add		v7.4s, v7.4s, v23.4s
 	  add		a6, a6, w8
 	  add		a7, a7, w9
+CPU_BE(	  rev		a4, a4		)
+CPU_BE(	  rev		a5, a5		)
+CPU_BE(	  rev		a6, a6		)
+CPU_BE(	  rev		a7, a7		)
 
 	// x8[0-3] += s2[0]
 	// x9[0-3] += s2[1]
@@ -569,6 +577,10 @@  ENTRY(chacha_4block_xor_neon)
 	add		v11.4s, v11.4s, v27.4s
 	  add		a10, a10, w8
 	  add		a11, a11, w9
+CPU_BE(	  rev		a8, a8		)
+CPU_BE(	  rev		a9, a9		)
+CPU_BE(	  rev		a10, a10	)
+CPU_BE(	  rev		a11, a11	)
 
 	// x12[0-3] += s3[0]
 	// x13[0-3] += s3[1]
@@ -586,6 +598,10 @@  ENTRY(chacha_4block_xor_neon)
 	add		v15.4s, v15.4s, v31.4s
 	  add		a14, a14, w8
 	  add		a15, a15, w9
+CPU_BE(	  rev		a12, a12	)
+CPU_BE(	  rev		a13, a13	)
+CPU_BE(	  rev		a14, a14	)
+CPU_BE(	  rev		a15, a15	)
 
 	// interleave 32-bit words in state n, n+1
 	  ldp		w6, w7, [x2], #64