From patchwork Tue Apr 9 12:42:15 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Eric Biggers X-Patchwork-Id: 13622439 X-Patchwork-Delegate: herbert@gondor.apana.org.au Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id E15CF12D748; Tue, 9 Apr 2024 12:44:08 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1712666649; cv=none; b=NCLGSvZxpa2Ibs9uDaKfIfoFcLZpLIst+Al8lw9CUqVVurPQ/18rjH1rPRNTUu7NkU6qgDNdrgY4FXkyu7EUpjK7qY+sl3i+JMurx/aqAUz2vz4SuktNBOP3aF4qOV3CiyzNZ6z4N8sIcu2iFAfpMoMMR9DJjmIWlYmCpCO2ndE= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1712666649; c=relaxed/simple; bh=OEEVk8vQIWgxEd5k50yk4jBZXqyGIckJ1aq747YlYQg=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=kEB9G3moJn6OKabPZknXAOJOhNZkkxnujjpNvwXMfeO2IthlgQljyig0j2ZFEBEHDWXtAuURyjKZBuqRu6F1GT/KTrftrnCF5RZqkrCwfDf9Aeu818q8F2QJKrTwcK241MEnEHye2Da8vST0PCIukBZkqs1WsscvEnMJCDScIAk= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=XHiYsDwG; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="XHiYsDwG" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 7FEEFC43390; Tue, 9 Apr 2024 12:44:08 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1712666648; bh=OEEVk8vQIWgxEd5k50yk4jBZXqyGIckJ1aq747YlYQg=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=XHiYsDwGat0m/RITmeyA9h36YhV1IQESQUD0T3OHgVUuxgW6xtmoKVR9P6tQKD43j sbvlMvYKT7MXSNDZkjVzudg/cJY2Gg+vmLy9un3GnJrNGi661QO6sM6wW8o4J54A3C IfcTA71toCF3dNKfSLAnLDbT04r8a3C5dURUvnFFHDTTOfaCWjncT0m+6B+js14zR6 T14MvV9uFJBdRy8SAWI0+4RfA4jwuTcOWnpod7CxU67IPyMrhcVCZBnPsfiPbyjpbC 3I7Hc0rRkLo2RdWcOCZDrZoOddZldSqa/0DufsIzTvUIzfFslO8uCNlnCGGdfv3Pfs eLt1muFcMOw3A== From: Eric Biggers To: linux-crypto@vger.kernel.org Cc: Stefan Kanthak , linux-kernel@vger.kernel.org Subject: [PATCH 1/2] crypto: x86/sha256-ni - convert to use rounds macros Date: Tue, 9 Apr 2024 08:42:15 -0400 Message-ID: <20240409124216.9261-2-ebiggers@kernel.org> X-Mailer: git-send-email 2.44.0 In-Reply-To: <20240409124216.9261-1-ebiggers@kernel.org> References: <20240409124216.9261-1-ebiggers@kernel.org> Precedence: bulk X-Mailing-List: linux-crypto@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Eric Biggers To avoid source code duplication, do the SHA-256 rounds using macros. This reduces the length of sha256_ni_asm.S by 148 lines while still producing the exact same object file. Signed-off-by: Eric Biggers --- arch/x86/crypto/sha256_ni_asm.S | 216 +++++--------------------------- 1 file changed, 34 insertions(+), 182 deletions(-) diff --git a/arch/x86/crypto/sha256_ni_asm.S b/arch/x86/crypto/sha256_ni_asm.S index 537b6dcd7ed8..e485520e3b49 100644 --- a/arch/x86/crypto/sha256_ni_asm.S +++ b/arch/x86/crypto/sha256_ni_asm.S @@ -74,23 +74,50 @@ #define SHUF_MASK %xmm8 #define ABEF_SAVE %xmm9 #define CDGH_SAVE %xmm10 +.macro do_4rounds i, m0, m1, m2, m3 +.if \i < 16 + movdqu \i*4(DATA_PTR), MSG + pshufb SHUF_MASK, MSG + movdqa MSG, \m0 +.else + movdqa \m0, MSG +.endif + paddd \i*4(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 +.if \i >= 12 && \i < 60 + movdqa \m0, MSGTMP4 + palignr $4, \m3, MSGTMP4 + paddd MSGTMP4, \m1 + sha256msg2 \m0, \m1 +.endif + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 +.if \i >= 4 && \i < 52 + sha256msg1 \m0, \m3 +.endif +.endm + +.macro do_16rounds i + do_4rounds (\i + 0), MSGTMP0, MSGTMP1, MSGTMP2, MSGTMP3 + do_4rounds (\i + 4), MSGTMP1, MSGTMP2, MSGTMP3, MSGTMP0 + do_4rounds (\i + 8), MSGTMP2, MSGTMP3, MSGTMP0, MSGTMP1 + do_4rounds (\i + 12), MSGTMP3, MSGTMP0, MSGTMP1, MSGTMP2 +.endm + /* * Intel SHA Extensions optimized implementation of a SHA-256 update function * * The function takes a pointer to the current hash values, a pointer to the * input data, and a number of 64 byte blocks to process. Once all blocks have * been processed, the digest pointer is updated with the resulting hash value. * The function only processes complete blocks, there is no functionality to * store partial blocks. All message padding and hash value initialization must * be done outside the update function. * - * The indented lines in the loop are instructions related to rounds processing. - * The non-indented lines are instructions related to the message schedule. - * * void sha256_ni_transform(uint32_t *digest, const void *data, uint32_t numBlocks); * digest : pointer to digest * data: pointer to input data * numBlocks: Number of blocks to process @@ -123,189 +150,14 @@ SYM_TYPED_FUNC_START(sha256_ni_transform) .Lloop0: /* Save hash values for addition after rounds */ movdqa STATE0, ABEF_SAVE movdqa STATE1, CDGH_SAVE - /* Rounds 0-3 */ - movdqu 0*16(DATA_PTR), MSG - pshufb SHUF_MASK, MSG - movdqa MSG, MSGTMP0 - paddd 0*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - - /* Rounds 4-7 */ - movdqu 1*16(DATA_PTR), MSG - pshufb SHUF_MASK, MSG - movdqa MSG, MSGTMP1 - paddd 1*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - sha256msg1 MSGTMP1, MSGTMP0 - - /* Rounds 8-11 */ - movdqu 2*16(DATA_PTR), MSG - pshufb SHUF_MASK, MSG - movdqa MSG, MSGTMP2 - paddd 2*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - sha256msg1 MSGTMP2, MSGTMP1 - - /* Rounds 12-15 */ - movdqu 3*16(DATA_PTR), MSG - pshufb SHUF_MASK, MSG - movdqa MSG, MSGTMP3 - paddd 3*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - movdqa MSGTMP3, MSGTMP4 - palignr $4, MSGTMP2, MSGTMP4 - paddd MSGTMP4, MSGTMP0 - sha256msg2 MSGTMP3, MSGTMP0 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - sha256msg1 MSGTMP3, MSGTMP2 - - /* Rounds 16-19 */ - movdqa MSGTMP0, MSG - paddd 4*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - movdqa MSGTMP0, MSGTMP4 - palignr $4, MSGTMP3, MSGTMP4 - paddd MSGTMP4, MSGTMP1 - sha256msg2 MSGTMP0, MSGTMP1 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - sha256msg1 MSGTMP0, MSGTMP3 - - /* Rounds 20-23 */ - movdqa MSGTMP1, MSG - paddd 5*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - movdqa MSGTMP1, MSGTMP4 - palignr $4, MSGTMP0, MSGTMP4 - paddd MSGTMP4, MSGTMP2 - sha256msg2 MSGTMP1, MSGTMP2 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - sha256msg1 MSGTMP1, MSGTMP0 - - /* Rounds 24-27 */ - movdqa MSGTMP2, MSG - paddd 6*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - movdqa MSGTMP2, MSGTMP4 - palignr $4, MSGTMP1, MSGTMP4 - paddd MSGTMP4, MSGTMP3 - sha256msg2 MSGTMP2, MSGTMP3 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - sha256msg1 MSGTMP2, MSGTMP1 - - /* Rounds 28-31 */ - movdqa MSGTMP3, MSG - paddd 7*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - movdqa MSGTMP3, MSGTMP4 - palignr $4, MSGTMP2, MSGTMP4 - paddd MSGTMP4, MSGTMP0 - sha256msg2 MSGTMP3, MSGTMP0 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - sha256msg1 MSGTMP3, MSGTMP2 - - /* Rounds 32-35 */ - movdqa MSGTMP0, MSG - paddd 8*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - movdqa MSGTMP0, MSGTMP4 - palignr $4, MSGTMP3, MSGTMP4 - paddd MSGTMP4, MSGTMP1 - sha256msg2 MSGTMP0, MSGTMP1 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - sha256msg1 MSGTMP0, MSGTMP3 - - /* Rounds 36-39 */ - movdqa MSGTMP1, MSG - paddd 9*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - movdqa MSGTMP1, MSGTMP4 - palignr $4, MSGTMP0, MSGTMP4 - paddd MSGTMP4, MSGTMP2 - sha256msg2 MSGTMP1, MSGTMP2 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - sha256msg1 MSGTMP1, MSGTMP0 - - /* Rounds 40-43 */ - movdqa MSGTMP2, MSG - paddd 10*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - movdqa MSGTMP2, MSGTMP4 - palignr $4, MSGTMP1, MSGTMP4 - paddd MSGTMP4, MSGTMP3 - sha256msg2 MSGTMP2, MSGTMP3 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - sha256msg1 MSGTMP2, MSGTMP1 - - /* Rounds 44-47 */ - movdqa MSGTMP3, MSG - paddd 11*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - movdqa MSGTMP3, MSGTMP4 - palignr $4, MSGTMP2, MSGTMP4 - paddd MSGTMP4, MSGTMP0 - sha256msg2 MSGTMP3, MSGTMP0 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - sha256msg1 MSGTMP3, MSGTMP2 - - /* Rounds 48-51 */ - movdqa MSGTMP0, MSG - paddd 12*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - movdqa MSGTMP0, MSGTMP4 - palignr $4, MSGTMP3, MSGTMP4 - paddd MSGTMP4, MSGTMP1 - sha256msg2 MSGTMP0, MSGTMP1 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - sha256msg1 MSGTMP0, MSGTMP3 - - /* Rounds 52-55 */ - movdqa MSGTMP1, MSG - paddd 13*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - movdqa MSGTMP1, MSGTMP4 - palignr $4, MSGTMP0, MSGTMP4 - paddd MSGTMP4, MSGTMP2 - sha256msg2 MSGTMP1, MSGTMP2 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - - /* Rounds 56-59 */ - movdqa MSGTMP2, MSG - paddd 14*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - movdqa MSGTMP2, MSGTMP4 - palignr $4, MSGTMP1, MSGTMP4 - paddd MSGTMP4, MSGTMP3 - sha256msg2 MSGTMP2, MSGTMP3 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - - /* Rounds 60-63 */ - movdqa MSGTMP3, MSG - paddd 15*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 + do_16rounds 0 + do_16rounds 16 + do_16rounds 32 + do_16rounds 48 /* Add current hash values with previously saved */ paddd ABEF_SAVE, STATE0 paddd CDGH_SAVE, STATE1 From patchwork Tue Apr 9 12:42:16 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Eric Biggers X-Patchwork-Id: 13622440 X-Patchwork-Delegate: herbert@gondor.apana.org.au Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id A96BC12E1D6; Tue, 9 Apr 2024 12:44:09 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1712666649; cv=none; b=HhFwI3JnVP2U/EKkAHCHOJmcYgwfokQhx9SDmcy4pCyF+gcq9T+r+EluJEcLjoJ76SLHkw2qdSQXRdJx2mwBbCnDlkMBOgQx8OFpd+rv9bQl2pNjAihJQWp7Cfqtg5squIOVCFkrt/Tbo/cOA9M3fSyfspHHlK3p6nQAv2XsZ0o= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1712666649; c=relaxed/simple; bh=NRWqtq//XuFFbL2CASYOLSgyi5eKB8/yl5Vbq3TCI5Q=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=LlEKk98wBJFdWd8f/Hu5RIRWKwgL8yeVwFL7KUgN5JESQnoezCnAOyLu+7NgtfbYaslwS+Wm/JsTb32szHVt8uPYmcUwIedb26RSKKNDVYdBSio4Yfh53X5MRX2WAcI5sFhMsxFSm7hAkKuoy3hDHyid9Bza2ffsPk64oGbZOBc= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=V3nXCZZN; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="V3nXCZZN" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 02470C433F1; Tue, 9 Apr 2024 12:44:08 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1712666649; bh=NRWqtq//XuFFbL2CASYOLSgyi5eKB8/yl5Vbq3TCI5Q=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=V3nXCZZNxfBFFHMWQJaqFKEPnSW4UV5AE0bMyPLCds3R2cxiTrQz2UCyCNdWMZmD1 erzK9AC/FK3MMNcrH05df3GrO6fJ4xoiSEFqklmU/96VdXwdRXBfu/0qvZEmIl6z4F a1p6Qf3M7ppT1IvWDTzt+s2XQ4Z2skG6yTQkwO6Wc0p/5AOlby1XMMw8nYSTSPDNkF IiNlkXTU00w7trLFyN9tTzZsSYLhW+4ko+Gw/+hVd0p9HpIsBo0yKnq57ciXtdfrR/ rfqXCvBVwPm0DrVd82nHHmDc5j1Atc1HR5j5K1upjBVOnJRyEaPRVqs5DZguuYQuaJ R4JeK5j/OADKw== From: Eric Biggers To: linux-crypto@vger.kernel.org Cc: Stefan Kanthak , linux-kernel@vger.kernel.org Subject: [PATCH 2/2] crypto: x86/sha256-ni - optimize code size Date: Tue, 9 Apr 2024 08:42:16 -0400 Message-ID: <20240409124216.9261-3-ebiggers@kernel.org> X-Mailer: git-send-email 2.44.0 In-Reply-To: <20240409124216.9261-1-ebiggers@kernel.org> References: <20240409124216.9261-1-ebiggers@kernel.org> Precedence: bulk X-Mailing-List: linux-crypto@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Eric Biggers - Load the SHA-256 round constants relative to a pointer that points into the middle of the constants rather than to the beginning. Since x86 instructions use signed offsets, this decreases the instruction length required to access some of the later round constants. - Use punpcklqdq or punpckhqdq instead of longer instructions such as pshufd, pblendw, and palignr. This doesn't harm performance. The end result is that sha256_ni_transform shrinks from 839 bytes to 791 bytes, with no loss in performance. Suggested-by: Stefan Kanthak Signed-off-by: Eric Biggers --- arch/x86/crypto/sha256_ni_asm.S | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/arch/x86/crypto/sha256_ni_asm.S b/arch/x86/crypto/sha256_ni_asm.S index e485520e3b49..4d373069448d 100644 --- a/arch/x86/crypto/sha256_ni_asm.S +++ b/arch/x86/crypto/sha256_ni_asm.S @@ -82,19 +82,19 @@ pshufb SHUF_MASK, MSG movdqa MSG, \m0 .else movdqa \m0, MSG .endif - paddd \i*4(SHA256CONSTANTS), MSG + paddd (\i-32)*4(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 .if \i >= 12 && \i < 60 movdqa \m0, MSGTMP4 palignr $4, \m3, MSGTMP4 paddd MSGTMP4, \m1 sha256msg2 \m0, \m1 .endif - pshufd $0x0E, MSG, MSG + punpckhqdq MSG, MSG sha256rnds2 STATE1, STATE0 .if \i >= 4 && \i < 52 sha256msg1 \m0, \m3 .endif .endm @@ -133,21 +133,21 @@ SYM_TYPED_FUNC_START(sha256_ni_transform) /* * load initial hash values * Need to reorder these appropriately * DCBA, HGFE -> ABEF, CDGH */ - movdqu 0*16(DIGEST_PTR), STATE0 - movdqu 1*16(DIGEST_PTR), STATE1 + movdqu 0*16(DIGEST_PTR), STATE0 /* DCBA */ + movdqu 1*16(DIGEST_PTR), STATE1 /* HGFE */ - pshufd $0xB1, STATE0, STATE0 /* CDAB */ - pshufd $0x1B, STATE1, STATE1 /* EFGH */ movdqa STATE0, MSGTMP4 - palignr $8, STATE1, STATE0 /* ABEF */ - pblendw $0xF0, MSGTMP4, STATE1 /* CDGH */ + punpcklqdq STATE1, STATE0 /* FEBA */ + punpckhqdq MSGTMP4, STATE1 /* DCHG */ + pshufd $0x1B, STATE0, STATE0 /* ABEF */ + pshufd $0xB1, STATE1, STATE1 /* CDGH */ movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK - lea K256(%rip), SHA256CONSTANTS + lea K256+32*4(%rip), SHA256CONSTANTS .Lloop0: /* Save hash values for addition after rounds */ movdqa STATE0, ABEF_SAVE movdqa STATE1, CDGH_SAVE @@ -165,18 +165,18 @@ SYM_TYPED_FUNC_START(sha256_ni_transform) add $64, DATA_PTR cmp NUM_BLKS, DATA_PTR jne .Lloop0 /* Write hash values back in the correct order */ - pshufd $0x1B, STATE0, STATE0 /* FEBA */ - pshufd $0xB1, STATE1, STATE1 /* DCHG */ movdqa STATE0, MSGTMP4 - pblendw $0xF0, STATE1, STATE0 /* DCBA */ - palignr $8, MSGTMP4, STATE1 /* HGFE */ + punpcklqdq STATE1, STATE0 /* GHEF */ + punpckhqdq MSGTMP4, STATE1 /* ABCD */ + pshufd $0xB1, STATE0, STATE0 /* HGFE */ + pshufd $0x1B, STATE1, STATE1 /* DCBA */ - movdqu STATE0, 0*16(DIGEST_PTR) - movdqu STATE1, 1*16(DIGEST_PTR) + movdqu STATE1, 0*16(DIGEST_PTR) + movdqu STATE0, 1*16(DIGEST_PTR) .Ldone_hash: RET SYM_FUNC_END(sha256_ni_transform)