crypto: x86/aes-xts - access round keys using single-byte offsets

Message ID 20240409000154.29799-1-ebiggers@kernel.org (mailing list archive)
State Accepted
Delegated to: Herbert Xu
Series crypto: x86/aes-xts - access round keys using single-byte offsets

Commit Message

Eric Biggers April 9, 2024, 12:01 a.m. UTC
From: Eric Biggers <ebiggers@google.com>

Access the AES round keys using offsets -7*16 through 7*16, instead of
0*16 through 14*16.  This allows VEX-encoded instructions to address all
round keys using 1-byte offsets, whereas before some needed 4-byte
offsets.  This decreases the code size of aes-xts-avx-x86_64.o by 4.2%.

Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 arch/x86/crypto/aes-xts-avx-x86_64.S | 81 +++++++++++++++-------------
 1 file changed, 44 insertions(+), 37 deletions(-)


base-commit: 4ad27a8be9dbefd4820da0f60da879d512b2f659
prerequisite-patch-id: 8d09ed747039f5e718ac7267e2a15e22504aa7f3
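
For context on where the 4.2% comes from (an editorial illustration, not part of the patch): a VEX memory operand can carry either a 1-byte (disp8) or a 4-byte (disp32) displacement, and the assembler must pick disp32 whenever the offset falls outside [-128, 127].  Before this change the 8th and later round keys sat at offsets 8*16 = 128 and up, forcing disp32; after biasing KEY by +7*16, every round key is within [-112, 112].  A minimal standalone sketch that can be assembled and inspected with objdump (the file name and label below are made up):

	// disp_demo.S - assemble with "as -o disp_demo.o disp_demo.S", then run
	// "objdump -d disp_demo.o" to compare the two encodings.
	.text
	.globl	disp_demo
disp_demo:
	// 8*16 = 128 does not fit in a signed byte, so this instruction is
	// emitted with a 4-byte (disp32) displacement.
	vaesenc		8*16(%rdi), %xmm0, %xmm0
	// With the pointer pre-biased by +7*16, the same round key sits at
	// (8-7)*16 = 16, which fits in a signed byte (disp8) and saves 3
	// bytes of displacement on this instruction.
	vaesenc		(8-7)*16(%rdi), %xmm0, %xmm0
	ret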

Comments

Ard Biesheuvel April 9, 2024, 9:12 a.m. UTC | #1
On Tue, 9 Apr 2024 at 02:02, Eric Biggers <ebiggers@kernel.org> wrote:
>
> From: Eric Biggers <ebiggers@google.com>
>
> Access the AES round keys using offsets -7*16 through 7*16, instead of
> 0*16 through 14*16.  This allows VEX-encoded instructions to address all
> round keys using 1-byte offsets, whereas before some needed 4-byte
> offsets.  This decreases the code size of aes-xts-avx-x86_64.o by 4.2%.
>
> Signed-off-by: Eric Biggers <ebiggers@google.com>

Nice optimization!

Do you think we might be able to macrofy this a bit so we can use zero
based indexing for the round keys, and hide the arithmetic?


Eric Biggers April 9, 2024, 12:11 p.m. UTC | #2
On Tue, Apr 09, 2024 at 11:12:11AM +0200, Ard Biesheuvel wrote:
> On Tue, 9 Apr 2024 at 02:02, Eric Biggers <ebiggers@kernel.org> wrote:
> >
> > From: Eric Biggers <ebiggers@google.com>
> >
> > Access the AES round keys using offsets -7*16 through 7*16, instead of
> > 0*16 through 14*16.  This allows VEX-encoded instructions to address all
> > round keys using 1-byte offsets, whereas before some needed 4-byte
> > offsets.  This decreases the code size of aes-xts-avx-x86_64.o by 4.2%.
> >
> > Signed-off-by: Eric Biggers <ebiggers@google.com>
> 
> Nice optimization!
> 
> Do you think we might be able to macrofy this a bit so we can use zero
> based indexing for the round keys, and hide the arithmetic?
> 
> 

There are two alternatives I considered: defining variables KEYOFF0 through
KEYOFF14 and writing the offsets as KEYOFF\i(KEY), or defining one variable
KEYOFF and writing the offsets as \i*16-KEYOFF(KEY).  I think I slightly prefer
the current patch where it's less abstracted out, though.  It makes it clear the
offsets really are single-byte, and also index 7 is the exact mid-point so going
from -7 to 7 still feels fairly natural.  If we wanted to do something more
complex like use different offsets for AVX vs. AVX512, then we'd need the
abstraction to handle that, but it doesn't seem useful to do that.

- Eric
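
For readers following the thread, here is a rough sketch of the first alternative Eric describes (per-index KEYOFF symbols), written against the same GAS macro conventions the file already uses.  It is purely illustrative; neither the symbol names nor the .irp loop appear in the actual patch:

	// Hypothetical: define one absolute symbol per round-key index, with
	// the -7*16 bias folded in, so call sites stay zero-based.
	.irp	i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
	.set	KEYOFF\i, (\i - 7) * 16
	.endr

	// A call site inside _vaes_1x would then read, for example:
	//	_vaes		\enc, \last, KEYOFF\i(KEY), \data
	// instead of:
	//	_vaes		\enc, \last, (\i-7)*16(KEY), \data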
Ard Biesheuvel April 9, 2024, 12:44 p.m. UTC | #3
On Tue, 9 Apr 2024 at 14:11, Eric Biggers <ebiggers@kernel.org> wrote:
>
> On Tue, Apr 09, 2024 at 11:12:11AM +0200, Ard Biesheuvel wrote:
> > On Tue, 9 Apr 2024 at 02:02, Eric Biggers <ebiggers@kernel.org> wrote:
> > >
> > > From: Eric Biggers <ebiggers@google.com>
> > >
> > > Access the AES round keys using offsets -7*16 through 7*16, instead of
> > > 0*16 through 14*16.  This allows VEX-encoded instructions to address all
> > > round keys using 1-byte offsets, whereas before some needed 4-byte
> > > offsets.  This decreases the code size of aes-xts-avx-x86_64.o by 4.2%.
> > >
> > > Signed-off-by: Eric Biggers <ebiggers@google.com>
> >
> > Nice optimization!
> >
> > Do you think we might be able to macrofy this a bit so we can use zero
> > based indexing for the round keys, and hide the arithmetic?
> >
> >
>
> There are two alternatives I considered: defining variables KEYOFF0 through
> KEYOFF14 and writing the offsets as KEYOFF\i(KEY), or defining one variable
> KEYOFF and writing the offsets as \i*16-KEYOFF(KEY).  I think I slightly prefer
> the current patch where it's less abstracted out, though.  It makes it clear the
> offsets really are single-byte, and also index 7 is the exact mid-point so going
> from -7 to 7 still feels fairly natural.  If we wanted to do something more
> complex like use different offsets for AVX vs. AVX512, then we'd need the
> abstraction to handle that, but it doesn't seem useful to do that.
>

Fair enough.
Herbert Xu April 19, 2024, 10:59 a.m. UTC | #4
Eric Biggers <ebiggers@kernel.org> wrote:
> From: Eric Biggers <ebiggers@google.com>
> 
> Access the AES round keys using offsets -7*16 through 7*16, instead of
> 0*16 through 14*16.  This allows VEX-encoded instructions to address all
> round keys using 1-byte offsets, whereas before some needed 4-byte
> offsets.  This decreases the code size of aes-xts-avx-x86_64.o by 4.2%.
> 
> Signed-off-by: Eric Biggers <ebiggers@google.com>
> ---
> arch/x86/crypto/aes-xts-avx-x86_64.S | 81 +++++++++++++++-------------
> 1 file changed, 44 insertions(+), 37 deletions(-)

Patch applied.  Thanks.

Patch

diff --git a/arch/x86/crypto/aes-xts-avx-x86_64.S b/arch/x86/crypto/aes-xts-avx-x86_64.S
index fcaf64a2f8c6..95e412e7601d 100644
--- a/arch/x86/crypto/aes-xts-avx-x86_64.S
+++ b/arch/x86/crypto/aes-xts-avx-x86_64.S
@@ -80,11 +80,11 @@ 
 	.byte	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
 .text
 
 // Function parameters
 .set	KEY,		%rdi	// Initially points to crypto_aes_ctx, then is
-				// advanced to point directly to the round keys
+				// advanced to point directly to 7th round key
 .set	SRC,		%rsi	// Pointer to next source data
 .set	DST,		%rdx	// Pointer to next destination data
 .set	LEN,		%rcx	// Remaining length in bytes
 .set	TWEAK,		%r8	// Pointer to next tweak
 
@@ -406,28 +406,28 @@ 
 .endif
 .endm
 
 // Load the round keys: just the first one if !USE_AVX10, otherwise all of them.
 .macro	_load_round_keys
-	_vbroadcast128	0*16(KEY), KEY0
+	_vbroadcast128	-7*16(KEY), KEY0
 .if USE_AVX10
-	_vbroadcast128	1*16(KEY), KEY1
-	_vbroadcast128	2*16(KEY), KEY2
-	_vbroadcast128	3*16(KEY), KEY3
-	_vbroadcast128	4*16(KEY), KEY4
-	_vbroadcast128	5*16(KEY), KEY5
-	_vbroadcast128	6*16(KEY), KEY6
-	_vbroadcast128	7*16(KEY), KEY7
-	_vbroadcast128	8*16(KEY), KEY8
-	_vbroadcast128	9*16(KEY), KEY9
-	_vbroadcast128	10*16(KEY), KEY10
+	_vbroadcast128	-6*16(KEY), KEY1
+	_vbroadcast128	-5*16(KEY), KEY2
+	_vbroadcast128	-4*16(KEY), KEY3
+	_vbroadcast128	-3*16(KEY), KEY4
+	_vbroadcast128	-2*16(KEY), KEY5
+	_vbroadcast128	-1*16(KEY), KEY6
+	_vbroadcast128	0*16(KEY), KEY7
+	_vbroadcast128	1*16(KEY), KEY8
+	_vbroadcast128	2*16(KEY), KEY9
+	_vbroadcast128	3*16(KEY), KEY10
 	// Note: if it's AES-128 or AES-192, the last several round keys won't
 	// be used.  We do the loads anyway to save a conditional jump.
-	_vbroadcast128	11*16(KEY), KEY11
-	_vbroadcast128	12*16(KEY), KEY12
-	_vbroadcast128	13*16(KEY), KEY13
-	_vbroadcast128	14*16(KEY), KEY14
+	_vbroadcast128	4*16(KEY), KEY11
+	_vbroadcast128	5*16(KEY), KEY12
+	_vbroadcast128	6*16(KEY), KEY13
+	_vbroadcast128	7*16(KEY), KEY14
 .endif
 .endm
 
 // Do a single round of AES encryption (if \enc==1) or decryption (if \enc==0)
 // on the block(s) in \data using the round key(s) in \key.  The register length
@@ -454,13 +454,13 @@ 
 .macro _vaes_1x		enc, last, i, xmm_suffix, data
 .if USE_AVX10
 	_vaes		\enc, \last, KEY\i\xmm_suffix, \data
 .else
 .ifnb \xmm_suffix
-	_vaes		\enc, \last, \i*16(KEY), \data
+	_vaes		\enc, \last, (\i-7)*16(KEY), \data
 .else
-	_vbroadcast128	\i*16(KEY), V4
+	_vbroadcast128	(\i-7)*16(KEY), V4
 	_vaes		\enc, \last, V4, \data
 .endif
 .endif
 .endm
 
@@ -475,11 +475,11 @@ 
 	_vaes		\enc, \last, KEY\i, V1
 	_tweak_step	(2*(\i-1) + 1)
 	_vaes		\enc, \last, KEY\i, V2
 	_vaes		\enc, \last, KEY\i, V3
 .else
-	_vbroadcast128	\i*16(KEY), V4
+	_vbroadcast128	(\i-7)*16(KEY), V4
 	_tweak_step	(2*(\i-1))
 	_vaes		\enc, \last, V4, V0
 	_vaes		\enc, \last, V4, V1
 	_tweak_step	(2*(\i-1) + 1)
 	_vaes		\enc, \last, V4, V2
@@ -526,13 +526,19 @@ 
 	_define_aliases
 
 	// Load the AES key length: 16 (AES-128), 24 (AES-192), or 32 (AES-256).
 	movl		480(KEY), KEYLEN
 
-	// If decrypting, advance KEY to the decryption round keys.
-.if !\enc
-	add		$240, KEY
+	// Advance KEY to point to the 7th encryption round key (if encrypting)
+	// or the 7th decryption round key (if decrypting).  This makes the
+	// offset to any round key be in the range [-112, 112], fitting in a
+	// signed byte.  This shortens VEX-encoded instructions that access the
+	// 8th and later round keys which otherwise would need 4-byte offsets.
+.if \enc
+	add		$7*16, KEY
+.else
+	add		$(15+7)*16, KEY
 .endif
 
 	// Check whether the data length is a multiple of the AES block length.
 	test		$15, LEN
 	jnz		.Lneed_cts\@
@@ -751,40 +757,41 @@ 
 
 // void aes_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key,
 //			   u8 iv[AES_BLOCK_SIZE]);
 SYM_TYPED_FUNC_START(aes_xts_encrypt_iv)
 	vmovdqu		(%rsi), %xmm0
-	vpxor		0*16(%rdi), %xmm0, %xmm0
+	add		$7*16, %rdi
+	vpxor		-7*16(%rdi), %xmm0, %xmm0
+	vaesenc		-6*16(%rdi), %xmm0, %xmm0
+	vaesenc		-5*16(%rdi), %xmm0, %xmm0
+	vaesenc		-4*16(%rdi), %xmm0, %xmm0
+	vaesenc		-3*16(%rdi), %xmm0, %xmm0
+	vaesenc		-2*16(%rdi), %xmm0, %xmm0
+	vaesenc		-1*16(%rdi), %xmm0, %xmm0
+	vaesenc		0*16(%rdi), %xmm0, %xmm0
 	vaesenc		1*16(%rdi), %xmm0, %xmm0
 	vaesenc		2*16(%rdi), %xmm0, %xmm0
+	cmpl		$24, 480-(7*16)(%rdi)
+	jle		.Lencrypt_iv_aes_128_or_192
 	vaesenc		3*16(%rdi), %xmm0, %xmm0
 	vaesenc		4*16(%rdi), %xmm0, %xmm0
 	vaesenc		5*16(%rdi), %xmm0, %xmm0
 	vaesenc		6*16(%rdi), %xmm0, %xmm0
-	vaesenc		7*16(%rdi), %xmm0, %xmm0
-	vaesenc		8*16(%rdi), %xmm0, %xmm0
-	vaesenc		9*16(%rdi), %xmm0, %xmm0
-	cmpl		$24, 480(%rdi)
-	jle		.Lencrypt_iv_aes_128_or_192
-	vaesenc		10*16(%rdi), %xmm0, %xmm0
-	vaesenc		11*16(%rdi), %xmm0, %xmm0
-	vaesenc		12*16(%rdi), %xmm0, %xmm0
-	vaesenc		13*16(%rdi), %xmm0, %xmm0
-	vaesenclast	14*16(%rdi), %xmm0, %xmm0
+	vaesenclast	7*16(%rdi), %xmm0, %xmm0
 .Lencrypt_iv_done:
 	vmovdqu		%xmm0, (%rsi)
 	RET
 
 	// Out-of-line handling of AES-128 and AES-192
 .Lencrypt_iv_aes_128_or_192:
 	jz		.Lencrypt_iv_aes_192
-	vaesenclast	10*16(%rdi), %xmm0, %xmm0
+	vaesenclast	3*16(%rdi), %xmm0, %xmm0
 	jmp		.Lencrypt_iv_done
 .Lencrypt_iv_aes_192:
-	vaesenc		10*16(%rdi), %xmm0, %xmm0
-	vaesenc		11*16(%rdi), %xmm0, %xmm0
-	vaesenclast	12*16(%rdi), %xmm0, %xmm0
+	vaesenc		3*16(%rdi), %xmm0, %xmm0
+	vaesenc		4*16(%rdi), %xmm0, %xmm0
+	vaesenclast	5*16(%rdi), %xmm0, %xmm0
 	jmp		.Lencrypt_iv_done
 SYM_FUNC_END(aes_xts_encrypt_iv)
 
 // Below are the actual AES-XTS encryption and decryption functions,
 // instantiated from the above macro.  They all have the following prototype: