Message ID | 20240409000154.29799-1-ebiggers@kernel.org (mailing list archive) |
---|---|
State | Accepted |
Delegated to: | Herbert Xu |
Series | crypto: x86/aes-xts - access round keys using single-byte offsets |
On Tue, 9 Apr 2024 at 02:02, Eric Biggers <ebiggers@kernel.org> wrote:
>
> From: Eric Biggers <ebiggers@google.com>
>
> Access the AES round keys using offsets -7*16 through 7*16, instead of
> 0*16 through 14*16. This allows VEX-encoded instructions to address all
> round keys using 1-byte offsets, whereas before some needed 4-byte
> offsets. This decreases the code size of aes-xts-avx-x86_64.o by 4.2%.
>
> Signed-off-by: Eric Biggers <ebiggers@google.com>

Nice optimization!

Do you think we might be able to macrofy this a bit so we can use zero
based indexing for the round keys, and hide the arithmetic?

> ---
> arch/x86/crypto/aes-xts-avx-x86_64.S | 81 +++++++++++++++-------------
> 1 file changed, 44 insertions(+), 37 deletions(-)
>
> diff --git a/arch/x86/crypto/aes-xts-avx-x86_64.S b/arch/x86/crypto/aes-xts-avx-x86_64.S
> index fcaf64a2f8c6..95e412e7601d 100644
> --- a/arch/x86/crypto/aes-xts-avx-x86_64.S
> +++ b/arch/x86/crypto/aes-xts-avx-x86_64.S
> @@ -80,11 +80,11 @@
> .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
> .text
>
> // Function parameters
> .set KEY, %rdi // Initially points to crypto_aes_ctx, then is
> - // advanced to point directly to the round keys
> + // advanced to point directly to 7th round key
> .set SRC, %rsi // Pointer to next source data
> .set DST, %rdx // Pointer to next destination data
> .set LEN, %rcx // Remaining length in bytes
> .set TWEAK, %r8 // Pointer to next tweak
>
> @@ -406,28 +406,28 @@
> .endif
> .endm
>
> // Load the round keys: just the first one if !USE_AVX10, otherwise all of them.
> .macro _load_round_keys
> - _vbroadcast128 0*16(KEY), KEY0
> + _vbroadcast128 -7*16(KEY), KEY0
> .if USE_AVX10
> - _vbroadcast128 1*16(KEY), KEY1
> - _vbroadcast128 2*16(KEY), KEY2
> - _vbroadcast128 3*16(KEY), KEY3
> - _vbroadcast128 4*16(KEY), KEY4
> - _vbroadcast128 5*16(KEY), KEY5
> - _vbroadcast128 6*16(KEY), KEY6
> - _vbroadcast128 7*16(KEY), KEY7
> - _vbroadcast128 8*16(KEY), KEY8
> - _vbroadcast128 9*16(KEY), KEY9
> - _vbroadcast128 10*16(KEY), KEY10
> + _vbroadcast128 -6*16(KEY), KEY1
> + _vbroadcast128 -5*16(KEY), KEY2
> + _vbroadcast128 -4*16(KEY), KEY3
> + _vbroadcast128 -3*16(KEY), KEY4
> + _vbroadcast128 -2*16(KEY), KEY5
> + _vbroadcast128 -1*16(KEY), KEY6
> + _vbroadcast128 0*16(KEY), KEY7
> + _vbroadcast128 1*16(KEY), KEY8
> + _vbroadcast128 2*16(KEY), KEY9
> + _vbroadcast128 3*16(KEY), KEY10
> // Note: if it's AES-128 or AES-192, the last several round keys won't
> // be used. We do the loads anyway to save a conditional jump.
> - _vbroadcast128 11*16(KEY), KEY11
> - _vbroadcast128 12*16(KEY), KEY12
> - _vbroadcast128 13*16(KEY), KEY13
> - _vbroadcast128 14*16(KEY), KEY14
> + _vbroadcast128 4*16(KEY), KEY11
> + _vbroadcast128 5*16(KEY), KEY12
> + _vbroadcast128 6*16(KEY), KEY13
> + _vbroadcast128 7*16(KEY), KEY14
> .endif
> .endm
>
> // Do a single round of AES encryption (if \enc==1) or decryption (if \enc==0)
> // on the block(s) in \data using the round key(s) in \key. The register length
> @@ -454,13 +454,13 @@
> .macro _vaes_1x enc, last, i, xmm_suffix, data
> .if USE_AVX10
> _vaes \enc, \last, KEY\i\xmm_suffix, \data
> .else
> .ifnb \xmm_suffix
> - _vaes \enc, \last, \i*16(KEY), \data
> + _vaes \enc, \last, (\i-7)*16(KEY), \data
> .else
> - _vbroadcast128 \i*16(KEY), V4
> + _vbroadcast128 (\i-7)*16(KEY), V4
> _vaes \enc, \last, V4, \data
> .endif
> .endif
> .endm
>
> @@ -475,11 +475,11 @@
> _vaes \enc, \last, KEY\i, V1
> _tweak_step (2*(\i-1) + 1)
> _vaes \enc, \last, KEY\i, V2
> _vaes \enc, \last, KEY\i, V3
> .else
> - _vbroadcast128 \i*16(KEY), V4
> + _vbroadcast128 (\i-7)*16(KEY), V4
> _tweak_step (2*(\i-1))
> _vaes \enc, \last, V4, V0
> _vaes \enc, \last, V4, V1
> _tweak_step (2*(\i-1) + 1)
> _vaes \enc, \last, V4, V2
> @@ -526,13 +526,19 @@
> _define_aliases
>
> // Load the AES key length: 16 (AES-128), 24 (AES-192), or 32 (AES-256).
> movl 480(KEY), KEYLEN
>
> - // If decrypting, advance KEY to the decryption round keys.
> -.if !\enc
> - add $240, KEY
> + // Advance KEY to point to the 7th encryption round key (if encrypting)
> + // or the 7th decryption round key (if decrypting). This makes the
> + // offset to any round key be in the range [-112, 112], fitting in a
> + // signed byte. This shortens VEX-encoded instructions that access the
> + // 8th and later round keys which otherwise would need 4-byte offsets.
> +.if \enc
> + add $7*16, KEY
> +.else
> + add $(15+7)*16, KEY
> .endif
>
> // Check whether the data length is a multiple of the AES block length.
> test $15, LEN
> jnz .Lneed_cts\@
> @@ -751,40 +757,41 @@
>
> // void aes_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key,
> // u8 iv[AES_BLOCK_SIZE]);
> SYM_TYPED_FUNC_START(aes_xts_encrypt_iv)
> vmovdqu (%rsi), %xmm0
> - vpxor 0*16(%rdi), %xmm0, %xmm0
> + add $7*16, %rdi
> + vpxor -7*16(%rdi), %xmm0, %xmm0
> + vaesenc -6*16(%rdi), %xmm0, %xmm0
> + vaesenc -5*16(%rdi), %xmm0, %xmm0
> + vaesenc -4*16(%rdi), %xmm0, %xmm0
> + vaesenc -3*16(%rdi), %xmm0, %xmm0
> + vaesenc -2*16(%rdi), %xmm0, %xmm0
> + vaesenc -1*16(%rdi), %xmm0, %xmm0
> + vaesenc 0*16(%rdi), %xmm0, %xmm0
> vaesenc 1*16(%rdi), %xmm0, %xmm0
> vaesenc 2*16(%rdi), %xmm0, %xmm0
> + cmpl $24, 480-(7*16)(%rdi)
> + jle .Lencrypt_iv_aes_128_or_192
> vaesenc 3*16(%rdi), %xmm0, %xmm0
> vaesenc 4*16(%rdi), %xmm0, %xmm0
> vaesenc 5*16(%rdi), %xmm0, %xmm0
> vaesenc 6*16(%rdi), %xmm0, %xmm0
> - vaesenc 7*16(%rdi), %xmm0, %xmm0
> - vaesenc 8*16(%rdi), %xmm0, %xmm0
> - vaesenc 9*16(%rdi), %xmm0, %xmm0
> - cmpl $24, 480(%rdi)
> - jle .Lencrypt_iv_aes_128_or_192
> - vaesenc 10*16(%rdi), %xmm0, %xmm0
> - vaesenc 11*16(%rdi), %xmm0, %xmm0
> - vaesenc 12*16(%rdi), %xmm0, %xmm0
> - vaesenc 13*16(%rdi), %xmm0, %xmm0
> - vaesenclast 14*16(%rdi), %xmm0, %xmm0
> + vaesenclast 7*16(%rdi), %xmm0, %xmm0
> .Lencrypt_iv_done:
> vmovdqu %xmm0, (%rsi)
> RET
>
> // Out-of-line handling of AES-128 and AES-192
> .Lencrypt_iv_aes_128_or_192:
> jz .Lencrypt_iv_aes_192
> - vaesenclast 10*16(%rdi), %xmm0, %xmm0
> + vaesenclast 3*16(%rdi), %xmm0, %xmm0
> jmp .Lencrypt_iv_done
> .Lencrypt_iv_aes_192:
> - vaesenc 10*16(%rdi), %xmm0, %xmm0
> - vaesenc 11*16(%rdi), %xmm0, %xmm0
> - vaesenclast 12*16(%rdi), %xmm0, %xmm0
> + vaesenc 3*16(%rdi), %xmm0, %xmm0
> + vaesenc 4*16(%rdi), %xmm0, %xmm0
> + vaesenclast 5*16(%rdi), %xmm0, %xmm0
> jmp .Lencrypt_iv_done
> SYM_FUNC_END(aes_xts_encrypt_iv)
>
> // Below are the actual AES-XTS encryption and decryption functions,
> // instantiated from the above macro. They all have the following prototype:
>
> base-commit: 4ad27a8be9dbefd4820da0f60da879d512b2f659
> prerequisite-patch-id: 8d09ed747039f5e718ac7267e2a15e22504aa7f3
> --
> 2.44.0
>
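For context on where the 4.2% comes from: a VEX-encoded instruction whose memory operand has a displacement in [-128, 127] can use a 1-byte disp8 field, while anything outside that range needs a full 4-byte disp32. A rough illustration (not taken from the patch; the saving is per instruction):

        // Old zero-based layout: the 8th round key sits at 8*16 = 128,
        // just outside the disp8 range, forcing a 4-byte displacement.
        vaesenc         8*16(%rdi), %xmm0, %xmm0        // disp32 form

        // With KEY biased by 7*16, the same round key is at 1*16 = 16,
        // which fits in a single displacement byte (3 bytes shorter).
        vaesenc         1*16(%rdi), %xmm0, %xmm0        // disp8 form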
On Tue, Apr 09, 2024 at 11:12:11AM +0200, Ard Biesheuvel wrote:
> On Tue, 9 Apr 2024 at 02:02, Eric Biggers <ebiggers@kernel.org> wrote:
> >
> > From: Eric Biggers <ebiggers@google.com>
> >
> > Access the AES round keys using offsets -7*16 through 7*16, instead of
> > 0*16 through 14*16. This allows VEX-encoded instructions to address all
> > round keys using 1-byte offsets, whereas before some needed 4-byte
> > offsets. This decreases the code size of aes-xts-avx-x86_64.o by 4.2%.
> >
> > Signed-off-by: Eric Biggers <ebiggers@google.com>
>
> Nice optimization!
>
> Do you think we might be able to macrofy this a bit so we can use zero
> based indexing for the round keys, and hide the arithmetic?
>
>

There are two alternatives I considered: defining variables KEYOFF0 through
KEYOFF14 and writing the offsets as KEYOFF\i(KEY), or defining one variable
KEYOFF and writing the offsets as \i*16-KEYOFF(KEY). I think I slightly prefer
the current patch where it's less abstracted out, though. It makes it clear the
offsets really are single-byte, and also index 7 is the exact mid-point so going
from -7 to 7 still feels fairly natural. If we wanted to do something more
complex like use different offsets for AVX vs. AVX512, then we'd need the
abstraction to handle that, but it doesn't seem useful to do that.

- Eric
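For reference, the second alternative described above (a single KEYOFF bias) might look roughly like the sketch below; the wrapper macro name is made up and this was never posted as an actual patch:

        // KEY is still advanced to the 7th round key, as in the patch;
        // KEYOFF records that bias so call sites can stay zero-based.
        .set    KEYOFF, 7*16

        // Hypothetical wrapper around the existing _vbroadcast128 macro:
        .macro  _vbroadcast_round_key i, dst
                _vbroadcast128  \i*16-KEYOFF(KEY), \dst
        .endm

        // _load_round_keys would then read, e.g.:
        //      _vbroadcast_round_key   0, KEY0
        //      ...
        //      _vbroadcast_round_key   14, KEY14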
On Tue, 9 Apr 2024 at 14:11, Eric Biggers <ebiggers@kernel.org> wrote:
>
> On Tue, Apr 09, 2024 at 11:12:11AM +0200, Ard Biesheuvel wrote:
> > On Tue, 9 Apr 2024 at 02:02, Eric Biggers <ebiggers@kernel.org> wrote:
> > >
> > > From: Eric Biggers <ebiggers@google.com>
> > >
> > > Access the AES round keys using offsets -7*16 through 7*16, instead of
> > > 0*16 through 14*16. This allows VEX-encoded instructions to address all
> > > round keys using 1-byte offsets, whereas before some needed 4-byte
> > > offsets. This decreases the code size of aes-xts-avx-x86_64.o by 4.2%.
> > >
> > > Signed-off-by: Eric Biggers <ebiggers@google.com>
> >
> > Nice optimization!
> >
> > Do you think we might be able to macrofy this a bit so we can use zero
> > based indexing for the round keys, and hide the arithmetic?
> >
> >
>
> There are two alternatives I considered: defining variables KEYOFF0 through
> KEYOFF14 and writing the offsets as KEYOFF\i(KEY), or defining one variable
> KEYOFF and writing the offsets as \i*16-KEYOFF(KEY). I think I slightly prefer
> the current patch where it's less abstracted out, though. It makes it clear the
> offsets really are single-byte, and also index 7 is the exact mid-point so going
> from -7 to 7 still feels fairly natural. If we wanted to do something more
> complex like use different offsets for AVX vs. AVX512, then we'd need the
> abstraction to handle that, but it doesn't seem useful to do that.
>

Fair enough.
Eric Biggers <ebiggers@kernel.org> wrote:
> From: Eric Biggers <ebiggers@google.com>
>
> Access the AES round keys using offsets -7*16 through 7*16, instead of
> 0*16 through 14*16. This allows VEX-encoded instructions to address all
> round keys using 1-byte offsets, whereas before some needed 4-byte
> offsets. This decreases the code size of aes-xts-avx-x86_64.o by 4.2%.
>
> Signed-off-by: Eric Biggers <ebiggers@google.com>
> ---
> arch/x86/crypto/aes-xts-avx-x86_64.S | 81 +++++++++++++++-------------
> 1 file changed, 44 insertions(+), 37 deletions(-)

Patch applied. Thanks.
diff --git a/arch/x86/crypto/aes-xts-avx-x86_64.S b/arch/x86/crypto/aes-xts-avx-x86_64.S
index fcaf64a2f8c6..95e412e7601d 100644
--- a/arch/x86/crypto/aes-xts-avx-x86_64.S
+++ b/arch/x86/crypto/aes-xts-avx-x86_64.S
@@ -80,11 +80,11 @@
 .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
 .text

 // Function parameters
 .set KEY, %rdi // Initially points to crypto_aes_ctx, then is
- // advanced to point directly to the round keys
+ // advanced to point directly to 7th round key
 .set SRC, %rsi // Pointer to next source data
 .set DST, %rdx // Pointer to next destination data
 .set LEN, %rcx // Remaining length in bytes
 .set TWEAK, %r8 // Pointer to next tweak

@@ -406,28 +406,28 @@
 .endif
 .endm

 // Load the round keys: just the first one if !USE_AVX10, otherwise all of them.
 .macro _load_round_keys
- _vbroadcast128 0*16(KEY), KEY0
+ _vbroadcast128 -7*16(KEY), KEY0
 .if USE_AVX10
- _vbroadcast128 1*16(KEY), KEY1
- _vbroadcast128 2*16(KEY), KEY2
- _vbroadcast128 3*16(KEY), KEY3
- _vbroadcast128 4*16(KEY), KEY4
- _vbroadcast128 5*16(KEY), KEY5
- _vbroadcast128 6*16(KEY), KEY6
- _vbroadcast128 7*16(KEY), KEY7
- _vbroadcast128 8*16(KEY), KEY8
- _vbroadcast128 9*16(KEY), KEY9
- _vbroadcast128 10*16(KEY), KEY10
+ _vbroadcast128 -6*16(KEY), KEY1
+ _vbroadcast128 -5*16(KEY), KEY2
+ _vbroadcast128 -4*16(KEY), KEY3
+ _vbroadcast128 -3*16(KEY), KEY4
+ _vbroadcast128 -2*16(KEY), KEY5
+ _vbroadcast128 -1*16(KEY), KEY6
+ _vbroadcast128 0*16(KEY), KEY7
+ _vbroadcast128 1*16(KEY), KEY8
+ _vbroadcast128 2*16(KEY), KEY9
+ _vbroadcast128 3*16(KEY), KEY10
 // Note: if it's AES-128 or AES-192, the last several round keys won't
 // be used. We do the loads anyway to save a conditional jump.
- _vbroadcast128 11*16(KEY), KEY11
- _vbroadcast128 12*16(KEY), KEY12
- _vbroadcast128 13*16(KEY), KEY13
- _vbroadcast128 14*16(KEY), KEY14
+ _vbroadcast128 4*16(KEY), KEY11
+ _vbroadcast128 5*16(KEY), KEY12
+ _vbroadcast128 6*16(KEY), KEY13
+ _vbroadcast128 7*16(KEY), KEY14
 .endif
 .endm

 // Do a single round of AES encryption (if \enc==1) or decryption (if \enc==0)
 // on the block(s) in \data using the round key(s) in \key. The register length
@@ -454,13 +454,13 @@
 .macro _vaes_1x enc, last, i, xmm_suffix, data
 .if USE_AVX10
 _vaes \enc, \last, KEY\i\xmm_suffix, \data
 .else
 .ifnb \xmm_suffix
- _vaes \enc, \last, \i*16(KEY), \data
+ _vaes \enc, \last, (\i-7)*16(KEY), \data
 .else
- _vbroadcast128 \i*16(KEY), V4
+ _vbroadcast128 (\i-7)*16(KEY), V4
 _vaes \enc, \last, V4, \data
 .endif
 .endif
 .endm

@@ -475,11 +475,11 @@
 _vaes \enc, \last, KEY\i, V1
 _tweak_step (2*(\i-1) + 1)
 _vaes \enc, \last, KEY\i, V2
 _vaes \enc, \last, KEY\i, V3
 .else
- _vbroadcast128 \i*16(KEY), V4
+ _vbroadcast128 (\i-7)*16(KEY), V4
 _tweak_step (2*(\i-1))
 _vaes \enc, \last, V4, V0
 _vaes \enc, \last, V4, V1
 _tweak_step (2*(\i-1) + 1)
 _vaes \enc, \last, V4, V2
@@ -526,13 +526,19 @@
 _define_aliases

 // Load the AES key length: 16 (AES-128), 24 (AES-192), or 32 (AES-256).
 movl 480(KEY), KEYLEN

- // If decrypting, advance KEY to the decryption round keys.
-.if !\enc
- add $240, KEY
+ // Advance KEY to point to the 7th encryption round key (if encrypting)
+ // or the 7th decryption round key (if decrypting). This makes the
+ // offset to any round key be in the range [-112, 112], fitting in a
+ // signed byte. This shortens VEX-encoded instructions that access the
+ // 8th and later round keys which otherwise would need 4-byte offsets.
+.if \enc
+ add $7*16, KEY
+.else
+ add $(15+7)*16, KEY
 .endif

 // Check whether the data length is a multiple of the AES block length.
 test $15, LEN
 jnz .Lneed_cts\@
@@ -751,40 +757,41 @@

 // void aes_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key,
 // u8 iv[AES_BLOCK_SIZE]);
 SYM_TYPED_FUNC_START(aes_xts_encrypt_iv)
 vmovdqu (%rsi), %xmm0
- vpxor 0*16(%rdi), %xmm0, %xmm0
+ add $7*16, %rdi
+ vpxor -7*16(%rdi), %xmm0, %xmm0
+ vaesenc -6*16(%rdi), %xmm0, %xmm0
+ vaesenc -5*16(%rdi), %xmm0, %xmm0
+ vaesenc -4*16(%rdi), %xmm0, %xmm0
+ vaesenc -3*16(%rdi), %xmm0, %xmm0
+ vaesenc -2*16(%rdi), %xmm0, %xmm0
+ vaesenc -1*16(%rdi), %xmm0, %xmm0
+ vaesenc 0*16(%rdi), %xmm0, %xmm0
 vaesenc 1*16(%rdi), %xmm0, %xmm0
 vaesenc 2*16(%rdi), %xmm0, %xmm0
+ cmpl $24, 480-(7*16)(%rdi)
+ jle .Lencrypt_iv_aes_128_or_192
 vaesenc 3*16(%rdi), %xmm0, %xmm0
 vaesenc 4*16(%rdi), %xmm0, %xmm0
 vaesenc 5*16(%rdi), %xmm0, %xmm0
 vaesenc 6*16(%rdi), %xmm0, %xmm0
- vaesenc 7*16(%rdi), %xmm0, %xmm0
- vaesenc 8*16(%rdi), %xmm0, %xmm0
- vaesenc 9*16(%rdi), %xmm0, %xmm0
- cmpl $24, 480(%rdi)
- jle .Lencrypt_iv_aes_128_or_192
- vaesenc 10*16(%rdi), %xmm0, %xmm0
- vaesenc 11*16(%rdi), %xmm0, %xmm0
- vaesenc 12*16(%rdi), %xmm0, %xmm0
- vaesenc 13*16(%rdi), %xmm0, %xmm0
- vaesenclast 14*16(%rdi), %xmm0, %xmm0
+ vaesenclast 7*16(%rdi), %xmm0, %xmm0
 .Lencrypt_iv_done:
 vmovdqu %xmm0, (%rsi)
 RET

 // Out-of-line handling of AES-128 and AES-192
 .Lencrypt_iv_aes_128_or_192:
 jz .Lencrypt_iv_aes_192
- vaesenclast 10*16(%rdi), %xmm0, %xmm0
+ vaesenclast 3*16(%rdi), %xmm0, %xmm0
 jmp .Lencrypt_iv_done
 .Lencrypt_iv_aes_192:
- vaesenc 10*16(%rdi), %xmm0, %xmm0
- vaesenc 11*16(%rdi), %xmm0, %xmm0
- vaesenclast 12*16(%rdi), %xmm0, %xmm0
+ vaesenc 3*16(%rdi), %xmm0, %xmm0
+ vaesenc 4*16(%rdi), %xmm0, %xmm0
+ vaesenclast 5*16(%rdi), %xmm0, %xmm0
 jmp .Lencrypt_iv_done
 SYM_FUNC_END(aes_xts_encrypt_iv)

 // Below are the actual AES-XTS encryption and decryption functions,
 // instantiated from the above macro. They all have the following prototype:
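The comment added in the @@ -526 hunk is easy to sanity-check: with KEY biased to the 7th round key, round key i lives at (i-7)*16(KEY), so the displacements run from -7*16 = -112 up to 7*16 = 112, all within a signed byte. A throwaway assembly-time assertion along these lines (illustrative only, not part of the patch) would confirm it:

        // Illustrative only: check at assembly time that every round-key
        // displacement relative to the biased KEY pointer fits in disp8.
        .irp i, 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
        .if ((\i - 7) * 16) < -128 || ((\i - 7) * 16) > 127
        .error "round key offset does not fit in a signed byte"
        .endif
        .endr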