[2/3] crypto: x86/crc32c - access 32-bit arguments as 32-bit

Message ID	20241014042447.50197-3-ebiggers@kernel.org (mailing list archive)
State	New
Headers	show Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id BE7BC1369AA; Mon, 14 Oct 2024 04:25:32 +0000 (UTC) From: Eric Biggers <ebiggers@kernel.org> To: linux-crypto@vger.kernel.org Cc: x86@kernel.org, linux-kernel@vger.kernel.org, Ard Biesheuvel <ardb@kernel.org>, Josh Poimboeuf <jpoimboe@kernel.org>, Peter Zijlstra <peterz@infradead.org> Subject: [PATCH 2/3] crypto: x86/crc32c - access 32-bit arguments as 32-bit Date: Sun, 13 Oct 2024 21:24:46 -0700 Message-ID: <20241014042447.50197-3-ebiggers@kernel.org> In-Reply-To: <20241014042447.50197-1-ebiggers@kernel.org> References: <20241014042447.50197-1-ebiggers@kernel.org> Precedence: bulk MIME-Version: 1.0 Content-Transfer-Encoding: 8bit
Series	crypto: x86/crc32c - jump table elimination and other cleanups (expand): [0/3] crypto: x86/crc32c - jump table elimination and other cleanups; [1/3] crypto: x86/crc32c - simplify code for handling fewer than 200 bytes; [2/3] crypto: x86/crc32c - access 32-bit arguments as 32-bit; [3/3] crypto: x86/crc32c - eliminate jump table and excessive unrolling

diff --git a/arch/x86/crypto/crc32c-intel_glue.c b/arch/x86/crypto/crc32c-intel_glue.c index feccb5254c7e5..52c5d47ef5a14 100644 --- a/arch/x86/crypto/crc32c-intel_glue.c +++ b/arch/x86/crypto/crc32c-intel_glue.c @@ -39,11 +39,11 @@ * size is >= 512 to account * for fpu state save/restore overhead. */ #define CRC32C_PCL_BREAKEVEN 512 -asmlinkage unsigned int crc_pcl(const u8 *buffer, int len, +asmlinkage unsigned int crc_pcl(const u8 *buffer, unsigned int len, unsigned int crc_init); #endif /* CONFIG_X86_64 */ static u32 crc32c_intel_le_hw_byte(u32 crc, unsigned char const *data, size_t length) { diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S index 466cea4943963..bbf860e90951d 100644 --- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S +++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S @@ -58,11 +58,11 @@ # Define threshold below which buffers are considered "small" and routed to # regular CRC code that does not interleave the CRC instructions. 
#define SMALL_SIZE 200 -# unsigned int crc_pcl(u8 *buffer, int len, unsigned int crc_init); +# unsigned int crc_pcl(const u8 *buffer, unsigned int len, unsigned int crc_init); .text SYM_FUNC_START(crc_pcl) #define bufp rdi #define bufp_dw %edi @@ -70,18 +70,15 @@ SYM_FUNC_START(crc_pcl) #define bufp_b %dil #define bufptmp %rcx #define block_0 %rcx #define block_1 %rdx #define block_2 %r11 -#define len %rsi -#define len_dw %esi -#define len_w %si -#define len_b %sil -#define crc_init_arg %rdx +#define len %esi +#define crc_init_arg %edx #define tmp %rbx -#define crc_init %r8 -#define crc_init_dw %r8d +#define crc_init %r8d +#define crc_init_q %r8 #define crc1 %r9 #define crc2 %r10 pushq %rbx pushq %rdi @@ -105,13 +102,13 @@ SYM_FUNC_START(crc_pcl) .Ldo_align: #### Calculate CRC of unaligned bytes of the buffer (if any) movq (bufptmp), tmp # load a quadward from the buffer add %bufp, bufptmp # align buffer pointer for quadword # processing - sub %bufp, len # update buffer length + sub bufp_dw, len # update buffer length .Lalign_loop: - crc32b %bl, crc_init_dw # compute crc32 of 1-byte + crc32b %bl, crc_init # compute crc32 of 1-byte shr $8, tmp # get next byte dec %bufp jne .Lalign_loop .Lproc_block: @@ -119,19 +116,18 @@ SYM_FUNC_START(crc_pcl) ################################################################ ## 2) PROCESS BLOCKS: ################################################################ ## compute num of bytes to be processed - movq len, tmp # save num bytes in tmp - cmpq $128*24, len + cmp $128*24, len jae .Lfull_block .Lcontinue_block: ## len < 128*24 movq $2731, %rax # 2731 = ceil(2^16 / 24) - mul len_dw + mul len shrq $16, %rax ## eax contains floor(bytes / 24) = num 24-byte chunks to do ## process rax 24-byte chunks (128 >= rax >= 0) @@ -174,21 +170,21 @@ SYM_FUNC_START(crc_pcl) .rept 128-1 .altmacro LABEL crc_ %i .noaltmacro ENDBR - crc32q -i*8(block_0), crc_init + crc32q -i*8(block_0), crc_init_q crc32q -i*8(block_1), crc1 crc32q -i*8(block_2), crc2 
i=(i-1) .endr .altmacro LABEL crc_ %i .noaltmacro ENDBR - crc32q -i*8(block_0), crc_init + crc32q -i*8(block_0), crc_init_q crc32q -i*8(block_1), crc1 # SKIP crc32 -i*8(block_2), crc2 ; Don't do this one yet mov block_2, block_0 @@ -198,66 +194,65 @@ LABEL crc_ %i lea (K_table-8)(%rip), %bufp # first entry is for idx 1 shlq $3, %rax # rax *= 8 pmovzxdq (%bufp,%rax), %xmm0 # 2 consts: K1:K2 leal (%eax,%eax,2), %eax # rax *= 3 (total *24) - subq %rax, tmp # tmp -= rax*24 + sub %eax, len # len -= rax*24 - movq crc_init, %xmm1 # CRC for block 1 + movq crc_init_q, %xmm1 # CRC for block 1 pclmulqdq $0x00, %xmm0, %xmm1 # Multiply by K2 movq crc1, %xmm2 # CRC for block 2 pclmulqdq $0x10, %xmm0, %xmm2 # Multiply by K1 pxor %xmm2,%xmm1 movq %xmm1, %rax xor -i*8(block_2), %rax - mov crc2, crc_init - crc32 %rax, crc_init + mov crc2, crc_init_q + crc32 %rax, crc_init_q ################################################################ ## 5) Check for end: ################################################################ LABEL crc_ 0 ENDBR - mov tmp, len - cmp $128*24, tmp + cmp $128*24, len jae .Lfull_block - cmp $SMALL_SIZE, tmp + cmp $SMALL_SIZE, len jae .Lcontinue_block ####################################################################### ## 6) Process any remainder without interleaving: ####################################################################### .Lsmall: test len, len jz .Ldone - mov len_dw, %eax + mov len, %eax shr $3, %eax jz .Ldo_dword .Ldo_qwords: - crc32q (bufptmp), crc_init + crc32q (bufptmp), crc_init_q add $8, bufptmp dec %eax jnz .Ldo_qwords .Ldo_dword: - test $4, len_dw + test $4, len jz .Ldo_word - crc32l (bufptmp), crc_init_dw + crc32l (bufptmp), crc_init add $4, bufptmp .Ldo_word: - test $2, len_dw + test $2, len jz .Ldo_byte - crc32w (bufptmp), crc_init_dw + crc32w (bufptmp), crc_init add $2, bufptmp .Ldo_byte: - test $1, len_dw + test $1, len jz .Ldone - crc32b (bufptmp), crc_init_dw + crc32b (bufptmp), crc_init .Ldone: - movq crc_init, %rax + mov 
crc_init, %eax popq %rsi popq %rdi popq %rbx RET SYM_FUNC_END(crc_pcl)

[2/3] crypto: x86/crc32c - access 32-bit arguments as 32-bit

Commit Message

Patch