diff mbox series

[riscv/for-next] crypto: riscv - parallelize AES-CBC decryption

Message ID 20240208060851.154129-1-ebiggers@kernel.org (mailing list archive)
State Not Applicable
Delegated to: Herbert Xu
Headers show
Series [riscv/for-next] crypto: riscv - parallelize AES-CBC decryption | expand

Commit Message

Eric Biggers Feb. 8, 2024, 6:08 a.m. UTC
From: Eric Biggers <ebiggers@google.com>

Since CBC decryption is parallelizable, make the RISC-V implementation
of AES-CBC decryption process multiple blocks at a time, instead of
processing the blocks one by one.  This should improve performance.

Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 arch/riscv/crypto/aes-riscv64-zvkned.S | 24 +++++++++++++++---------
 1 file changed, 15 insertions(+), 9 deletions(-)


base-commit: cb4ede926134a65bc3bf90ed58dace8451d7e759

Comments

Jerry Shih Feb. 10, 2024, 3:25 p.m. UTC | #1
On Feb 8, 2024, at 14:08, Eric Biggers <ebiggers@kernel.org> wrote:
> From: Eric Biggers <ebiggers@google.com>
> 
> Since CBC decryption is parallelizable, make the RISC-V implementation
> of AES-CBC decryption process multiple blocks at a time, instead of
> processing the blocks one by one.  This should improve performance.
> 
> Signed-off-by: Eric Biggers <ebiggers@google.com>
> ---
> arch/riscv/crypto/aes-riscv64-zvkned.S | 24 +++++++++++++++---------
> 1 file changed, 15 insertions(+), 9 deletions(-)
> 
> diff --git a/arch/riscv/crypto/aes-riscv64-zvkned.S b/arch/riscv/crypto/aes-riscv64-zvkned.S
> index 78d4e1186c074..43541aad6386c 100644
> --- a/arch/riscv/crypto/aes-riscv64-zvkned.S
> +++ b/arch/riscv/crypto/aes-riscv64-zvkned.S
> @@ -132,33 +132,39 @@ SYM_FUNC_END(aes_ecb_decrypt_zvkned)
> 	addi		INP, INP, 16
> 	addi		OUTP, OUTP, 16
> 	addi		LEN, LEN, -16
> 	bnez		LEN, 1b
> 
> 	vse32.v		v16, (IVP)	// Store next IV
> 	ret
> .endm
> 
> .macro	aes_cbc_decrypt	keylen
> +	srli		LEN, LEN, 2	// Convert LEN from bytes to words
> 	vle32.v		v16, (IVP)	// Load IV
> 1:
> -	vle32.v		v17, (INP)	// Load ciphertext block
> -	vmv.v.v		v18, v17	// Save ciphertext block
> -	aes_decrypt	v17, \keylen	// Decrypt
> -	vxor.vv		v17, v17, v16	// XOR with IV or prev ciphertext block
> -	vse32.v		v17, (OUTP)	// Store plaintext block
> -	vmv.v.v		v16, v18	// Next "IV" is prev ciphertext block
> -	addi		INP, INP, 16
> -	addi		OUTP, OUTP, 16
> -	addi		LEN, LEN, -16
> +	vsetvli		t0, LEN, e32, m4, ta, ma
> +	vle32.v		v20, (INP)	// Load ciphertext blocks
> +	vslideup.vi	v16, v20, 4	// Setup prev ciphertext blocks
> +	addi		t1, t0, -4
> +	vslidedown.vx	v24, v20, t1	// Save last ciphertext block

Do we need to setup the `e32, len=t0` for next IV?
I think we only need 128bit IV (with VL=4).

> +	aes_decrypt	v20, \keylen	// Decrypt the blocks
> +	vxor.vv		v20, v20, v16	// XOR with prev ciphertext blocks
> +	vse32.v		v20, (OUTP)	// Store plaintext blocks
> +	vmv.v.v		v16, v24	// Next "IV" is last ciphertext block

Same VL issue here.

> +	slli		t1, t0, 2	// Words to bytes
> +	add		INP, INP, t1
> +	add		OUTP, OUTP, t1
> +	sub		LEN, LEN, t0
> 	bnez		LEN, 1b
> 
> +	vsetivli	zero, 4, e32, m1, ta, ma
> 	vse32.v		v16, (IVP)	// Store next IV
> 	ret
> .endm
> 
> // void aes_cbc_encrypt_zvkned(const struct crypto_aes_ctx *key,
> //			       const u8 *in, u8 *out, size_t len, u8 iv[16]);
> //
> // |len| must be nonzero and a multiple of 16 (AES_BLOCK_SIZE).
> SYM_FUNC_START(aes_cbc_encrypt_zvkned)
> 	aes_begin	KEYP, 128f, 192f
> 
> base-commit: cb4ede926134a65bc3bf90ed58dace8451d7e759
> -- 
> 2.43.0
>
Eric Biggers Feb. 10, 2024, 6:12 p.m. UTC | #2
On Sat, Feb 10, 2024 at 11:25:27PM +0800, Jerry Shih wrote:
> > .macro	aes_cbc_decrypt	keylen
> > +	srli		LEN, LEN, 2	// Convert LEN from bytes to words
> > 	vle32.v		v16, (IVP)	// Load IV
> > 1:
> > -	vle32.v		v17, (INP)	// Load ciphertext block
> > -	vmv.v.v		v18, v17	// Save ciphertext block
> > -	aes_decrypt	v17, \keylen	// Decrypt
> > -	vxor.vv		v17, v17, v16	// XOR with IV or prev ciphertext block
> > -	vse32.v		v17, (OUTP)	// Store plaintext block
> > -	vmv.v.v		v16, v18	// Next "IV" is prev ciphertext block
> > -	addi		INP, INP, 16
> > -	addi		OUTP, OUTP, 16
> > -	addi		LEN, LEN, -16
> > +	vsetvli		t0, LEN, e32, m4, ta, ma
> > +	vle32.v		v20, (INP)	// Load ciphertext blocks
> > +	vslideup.vi	v16, v20, 4	// Setup prev ciphertext blocks
> > +	addi		t1, t0, -4
> > +	vslidedown.vx	v24, v20, t1	// Save last ciphertext block
> 
> Do we need to setup the `e32, len=t0` for next IV?
> I think we only need 128bit IV (with VL=4).
> 
> > +	aes_decrypt	v20, \keylen	// Decrypt the blocks
> > +	vxor.vv		v20, v20, v16	// XOR with prev ciphertext blocks
> > +	vse32.v		v20, (OUTP)	// Store plaintext blocks
> > +	vmv.v.v		v16, v24	// Next "IV" is last ciphertext block
> 
> Same VL issue here.

It's true that the vslidedown.vx and vmv.v.v only need vl=4.  But it also works
fine with vl unchanged.  It just results in some extra data being moved in the
registers.  My hypothesis is that this is going to be faster than having the
three extra instructions per loop iteration to change the vl to 4 twice.

I still have no real hardware to test on, so I have no quantitative data.  All I
can do is go with my instinct which is that the shorter version will be better.

If you have access to a real CPU that supports the RISC-V vector crypto
extensions, I'd be interested in the performance you get from each variant.
(Of course, different RISC-V CPU implementations may have quite different
performance characteristics, so that still won't be definitive.)

Here is the alternative variant given as a diff from this patch:

diff --git a/arch/riscv/crypto/aes-riscv64-zvkned.S b/arch/riscv/crypto/aes-riscv64-zvkned.S
index 43541aad6386c..ef380771f606a 100644
--- a/arch/riscv/crypto/aes-riscv64-zvkned.S
+++ b/arch/riscv/crypto/aes-riscv64-zvkned.S
@@ -146,10 +146,13 @@ SYM_FUNC_END(aes_ecb_decrypt_zvkned)
 	vle32.v		v20, (INP)	// Load ciphertext blocks
 	vslideup.vi	v16, v20, 4	// Setup prev ciphertext blocks
 	addi		t1, t0, -4
+	vsetivli	zero, 4, e32, m4, ta, ma
 	vslidedown.vx	v24, v20, t1	// Save last ciphertext block
+	vsetvli		t0, LEN, e32, m4, ta, ma
 	aes_decrypt	v20, \keylen	// Decrypt the blocks
 	vxor.vv		v20, v20, v16	// XOR with prev ciphertext blocks
 	vse32.v		v20, (OUTP)	// Store plaintext blocks
+	vsetivli	zero, 4, e32, m4, ta, ma
 	vmv.v.v		v16, v24	// Next "IV" is last ciphertext block
 	slli		t1, t0, 2	// Words to bytes
 	add		INP, INP, t1
@@ -157,7 +160,6 @@ SYM_FUNC_END(aes_ecb_decrypt_zvkned)
 	sub		LEN, LEN, t0
 	bnez		LEN, 1b
 
-	vsetivli	zero, 4, e32, m1, ta, ma
 	vse32.v		v16, (IVP)	// Store next IV
 	ret
 .endm

A third variant would be to just replace vmv.v.v with vmv1r.v.

In general, this level of micro-optimization probably needs to be wait until
there are a variety of CPUs to test on.  We know that parallelizing the
algorithms is helpful, so we should do that, as this patch does.  But the
effects of small variations in the instruction sequences are currently unclear.

- Eric
Jerry Shih Feb. 26, 2024, 1:40 a.m. UTC | #3
On Feb 11, 2024, at 02:12, Eric Biggers <ebiggers@kernel.org> wrote:
> On Sat, Feb 10, 2024 at 11:25:27PM +0800, Jerry Shih wrote:
>>> .macro	aes_cbc_decrypt	keylen
>>> +	srli		LEN, LEN, 2	// Convert LEN from bytes to words
>>> 	vle32.v		v16, (IVP)	// Load IV
>>> 1:
>>> -	vle32.v		v17, (INP)	// Load ciphertext block
>>> -	vmv.v.v		v18, v17	// Save ciphertext block
>>> -	aes_decrypt	v17, \keylen	// Decrypt
>>> -	vxor.vv		v17, v17, v16	// XOR with IV or prev ciphertext block
>>> -	vse32.v		v17, (OUTP)	// Store plaintext block
>>> -	vmv.v.v		v16, v18	// Next "IV" is prev ciphertext block
>>> -	addi		INP, INP, 16
>>> -	addi		OUTP, OUTP, 16
>>> -	addi		LEN, LEN, -16
>>> +	vsetvli		t0, LEN, e32, m4, ta, ma
>>> +	vle32.v		v20, (INP)	// Load ciphertext blocks
>>> +	vslideup.vi	v16, v20, 4	// Setup prev ciphertext blocks
>>> +	addi		t1, t0, -4
>>> +	vslidedown.vx	v24, v20, t1	// Save last ciphertext block
>> 
>> Do we need to setup the `e32, len=t0` for next IV?
>> I think we only need 128bit IV (with VL=4).
>> 
>>> +	aes_decrypt	v20, \keylen	// Decrypt the blocks
>>> +	vxor.vv		v20, v20, v16	// XOR with prev ciphertext blocks
>>> +	vse32.v		v20, (OUTP)	// Store plaintext blocks
>>> +	vmv.v.v		v16, v24	// Next "IV" is last ciphertext block
>> 
>> Same VL issue here.
> 
> It's true that the vslidedown.vx and vmv.v.v only need vl=4.  But it also works
> fine with vl unchanged.  It just results in some extra data being moved in the
> registers.  My hypothesis is that this is going to be faster than having the
> three extra instructions per loop iteration to change the vl to 4 twice.
> 
> I still have no real hardware to test on, so I have no quantitative data.  All I
> can do is go with my instinct which is that the shorter version will be better.
> 
> If you have access to a real CPU that supports the RISC-V vector crypto
> extensions, I'd be interested in the performance you get from each variant.
> (Of course, different RISC-V CPU implementations may have quite different
> performance characteristics, so that still won't be definitive.)

Hi Eric,
Thank you. I think the extra vl doesn't affect performance significantly. The main
tasks are still the aes body.
The original implementation is enough right now.

> In general, this level of micro-optimization probably needs to be wait until
> there are a variety of CPUs to test on.  We know that parallelizing the
> algorithms is helpful, so we should do that, as this patch does.  But the
> effects of small variations in the instruction sequences are currently unclear.
> 
> - Eric
Palmer Dabbelt March 20, 2024, 1:48 a.m. UTC | #4
On Sat, 10 Feb 2024 10:12:40 PST (-0800), ebiggers@kernel.org wrote:
> On Sat, Feb 10, 2024 at 11:25:27PM +0800, Jerry Shih wrote:
>> > .macro	aes_cbc_decrypt	keylen
>> > +	srli		LEN, LEN, 2	// Convert LEN from bytes to words
>> > 	vle32.v		v16, (IVP)	// Load IV
>> > 1:
>> > -	vle32.v		v17, (INP)	// Load ciphertext block
>> > -	vmv.v.v		v18, v17	// Save ciphertext block
>> > -	aes_decrypt	v17, \keylen	// Decrypt
>> > -	vxor.vv		v17, v17, v16	// XOR with IV or prev ciphertext block
>> > -	vse32.v		v17, (OUTP)	// Store plaintext block
>> > -	vmv.v.v		v16, v18	// Next "IV" is prev ciphertext block
>> > -	addi		INP, INP, 16
>> > -	addi		OUTP, OUTP, 16
>> > -	addi		LEN, LEN, -16
>> > +	vsetvli		t0, LEN, e32, m4, ta, ma
>> > +	vle32.v		v20, (INP)	// Load ciphertext blocks
>> > +	vslideup.vi	v16, v20, 4	// Setup prev ciphertext blocks
>> > +	addi		t1, t0, -4
>> > +	vslidedown.vx	v24, v20, t1	// Save last ciphertext block
>>
>> Do we need to setup the `e32, len=t0` for next IV?
>> I think we only need 128bit IV (with VL=4).
>>
>> > +	aes_decrypt	v20, \keylen	// Decrypt the blocks
>> > +	vxor.vv		v20, v20, v16	// XOR with prev ciphertext blocks
>> > +	vse32.v		v20, (OUTP)	// Store plaintext blocks
>> > +	vmv.v.v		v16, v24	// Next "IV" is last ciphertext block
>>
>> Same VL issue here.
>
> It's true that the vslidedown.vx and vmv.v.v only need vl=4.  But it also works
> fine with vl unchanged.  It just results in some extra data being moved in the
> registers.  My hypothesis is that this is going to be faster than having the
> three extra instructions per loop iteration to change the vl to 4 twice.
>
> I still have no real hardware to test on, so I have no quantitative data.  All I
> can do is go with my instinct which is that the shorter version will be better.
>
> If you have access to a real CPU that supports the RISC-V vector crypto
> extensions, I'd be interested in the performance you get from each variant.
> (Of course, different RISC-V CPU implementations may have quite different
> performance characteristics, so that still won't be definitive.)

We're stacking up a lot of stuff with HW-dependent performance 
questions, I think it's fine to just take what's reasonably simple for 
now.  If we try to speculate about what future hardware might do we're 
just going to go crazy with possibilities, IMO we're way better off just 
optimizing as things show up.

> Here is the alternative variant given as a diff from this patch:
>
> diff --git a/arch/riscv/crypto/aes-riscv64-zvkned.S b/arch/riscv/crypto/aes-riscv64-zvkned.S
> index 43541aad6386c..ef380771f606a 100644
> --- a/arch/riscv/crypto/aes-riscv64-zvkned.S
> +++ b/arch/riscv/crypto/aes-riscv64-zvkned.S
> @@ -146,10 +146,13 @@ SYM_FUNC_END(aes_ecb_decrypt_zvkned)
>  	vle32.v		v20, (INP)	// Load ciphertext blocks
>  	vslideup.vi	v16, v20, 4	// Setup prev ciphertext blocks
>  	addi		t1, t0, -4
> +	vsetivli	zero, 4, e32, m4, ta, ma
>  	vslidedown.vx	v24, v20, t1	// Save last ciphertext block
> +	vsetvli		t0, LEN, e32, m4, ta, ma
>  	aes_decrypt	v20, \keylen	// Decrypt the blocks
>  	vxor.vv		v20, v20, v16	// XOR with prev ciphertext blocks
>  	vse32.v		v20, (OUTP)	// Store plaintext blocks
> +	vsetivli	zero, 4, e32, m4, ta, ma
>  	vmv.v.v		v16, v24	// Next "IV" is last ciphertext block
>  	slli		t1, t0, 2	// Words to bytes
>  	add		INP, INP, t1
> @@ -157,7 +160,6 @@ SYM_FUNC_END(aes_ecb_decrypt_zvkned)
>  	sub		LEN, LEN, t0
>  	bnez		LEN, 1b
>
> -	vsetivli	zero, 4, e32, m1, ta, ma
>  	vse32.v		v16, (IVP)	// Store next IV
>  	ret
>  .endm
>
> A third variant would be to just replace vmv.v.v with vmv1r.v.
>
> In general, this level of micro-optimization probably needs to be wait until
> there are a variety of CPUs to test on.  We know that parallelizing the
> algorithms is helpful, so we should do that, as this patch does.  But the
> effects of small variations in the instruction sequences are currently unclear.

Ya, I agree.  So I'm fine with this, it's a base and we can always 
improve it when there's something concrete to run on.

> - Eric
patchwork-bot+linux-riscv@kernel.org March 20, 2024, 8:50 p.m. UTC | #5
Hello:

This patch was applied to riscv/linux.git (for-next)
by Palmer Dabbelt <palmer@rivosinc.com>:

On Wed,  7 Feb 2024 22:08:51 -0800 you wrote:
> From: Eric Biggers <ebiggers@google.com>
> 
> Since CBC decryption is parallelizable, make the RISC-V implementation
> of AES-CBC decryption process multiple blocks at a time, instead of
> processing the blocks one by one.  This should improve performance.
> 
> Signed-off-by: Eric Biggers <ebiggers@google.com>
> 
> [...]

Here is the summary with links:
  - [riscv/for-next] crypto: riscv - parallelize AES-CBC decryption
    https://git.kernel.org/riscv/c/da215b089b5d

You are awesome, thank you!
diff mbox series

Patch

diff --git a/arch/riscv/crypto/aes-riscv64-zvkned.S b/arch/riscv/crypto/aes-riscv64-zvkned.S
index 78d4e1186c074..43541aad6386c 100644
--- a/arch/riscv/crypto/aes-riscv64-zvkned.S
+++ b/arch/riscv/crypto/aes-riscv64-zvkned.S
@@ -132,33 +132,39 @@  SYM_FUNC_END(aes_ecb_decrypt_zvkned)
 	addi		INP, INP, 16
 	addi		OUTP, OUTP, 16
 	addi		LEN, LEN, -16
 	bnez		LEN, 1b
 
 	vse32.v		v16, (IVP)	// Store next IV
 	ret
 .endm
 
 .macro	aes_cbc_decrypt	keylen
+	srli		LEN, LEN, 2	// Convert LEN from bytes to words
 	vle32.v		v16, (IVP)	// Load IV
 1:
-	vle32.v		v17, (INP)	// Load ciphertext block
-	vmv.v.v		v18, v17	// Save ciphertext block
-	aes_decrypt	v17, \keylen	// Decrypt
-	vxor.vv		v17, v17, v16	// XOR with IV or prev ciphertext block
-	vse32.v		v17, (OUTP)	// Store plaintext block
-	vmv.v.v		v16, v18	// Next "IV" is prev ciphertext block
-	addi		INP, INP, 16
-	addi		OUTP, OUTP, 16
-	addi		LEN, LEN, -16
+	vsetvli		t0, LEN, e32, m4, ta, ma
+	vle32.v		v20, (INP)	// Load ciphertext blocks
+	vslideup.vi	v16, v20, 4	// Setup prev ciphertext blocks
+	addi		t1, t0, -4
+	vslidedown.vx	v24, v20, t1	// Save last ciphertext block
+	aes_decrypt	v20, \keylen	// Decrypt the blocks
+	vxor.vv		v20, v20, v16	// XOR with prev ciphertext blocks
+	vse32.v		v20, (OUTP)	// Store plaintext blocks
+	vmv.v.v		v16, v24	// Next "IV" is last ciphertext block
+	slli		t1, t0, 2	// Words to bytes
+	add		INP, INP, t1
+	add		OUTP, OUTP, t1
+	sub		LEN, LEN, t0
 	bnez		LEN, 1b
 
+	vsetivli	zero, 4, e32, m1, ta, ma
 	vse32.v		v16, (IVP)	// Store next IV
 	ret
 .endm
 
 // void aes_cbc_encrypt_zvkned(const struct crypto_aes_ctx *key,
 //			       const u8 *in, u8 *out, size_t len, u8 iv[16]);
 //
 // |len| must be nonzero and a multiple of 16 (AES_BLOCK_SIZE).
 SYM_FUNC_START(aes_cbc_encrypt_zvkned)
 	aes_begin	KEYP, 128f, 192f