diff --git a/arch/x86/crypto/nhpoly1305-avx2-glue.c b/arch/x86/crypto/nhpoly1305-avx2-glue.c
--- a/arch/x86/crypto/nhpoly1305-avx2-glue.c
+++ b/arch/x86/crypto/nhpoly1305-avx2-glue.c
@@ -22,15 +22,21 @@ static int nhpoly1305_avx2_update(struct shash_desc *desc,
if (srclen < 64 || !crypto_simd_usable())
return crypto_nhpoly1305_update(desc, src, srclen);
- do {
- unsigned int n = min_t(unsigned int, srclen, SZ_4K);
+ kernel_fpu_begin();
+ for (;;) {
+ const unsigned int chunk = min(srclen, 4096U);
+
+ crypto_nhpoly1305_update_helper(desc, src, chunk, nh_avx2);
+ srclen -= chunk;
+
+ if (!srclen)
+ break;
+
+ src += chunk;
+ kernel_fpu_yield();
+ }
+ kernel_fpu_end();
- kernel_fpu_begin();
- crypto_nhpoly1305_update_helper(desc, src, n, nh_avx2);
- kernel_fpu_end();
- src += n;
- srclen -= n;
- } while (srclen);
return 0;
}
diff --git a/arch/x86/crypto/nhpoly1305-sse2-glue.c b/arch/x86/crypto/nhpoly1305-sse2-glue.c
--- a/arch/x86/crypto/nhpoly1305-sse2-glue.c
+++ b/arch/x86/crypto/nhpoly1305-sse2-glue.c
@@ -22,15 +22,21 @@ static int nhpoly1305_sse2_update(struct shash_desc *desc,
if (srclen < 64 || !crypto_simd_usable())
return crypto_nhpoly1305_update(desc, src, srclen);
- do {
- unsigned int n = min_t(unsigned int, srclen, SZ_4K);
+ kernel_fpu_begin();
+ for (;;) {
+ const unsigned int chunk = min(srclen, 4096U);
+
+ crypto_nhpoly1305_update_helper(desc, src, chunk, nh_sse2);
+ srclen -= chunk;
+
+ if (!srclen)
+ break;
+
+ src += chunk;
+ kernel_fpu_yield();
+ }
+ kernel_fpu_end();
- kernel_fpu_begin();
- crypto_nhpoly1305_update_helper(desc, src, n, nh_sse2);
- kernel_fpu_end();
- src += n;
- srclen -= n;
- } while (srclen);
return 0;
}
diff --git a/arch/x86/crypto/poly1305_glue.c b/arch/x86/crypto/poly1305_glue.c
--- a/arch/x86/crypto/poly1305_glue.c
+++ b/arch/x86/crypto/poly1305_glue.c
@@ -15,20 +15,13 @@
#include <asm/intel-family.h>
#include <asm/simd.h>
-asmlinkage void poly1305_init_x86_64(void *ctx,
- const u8 key[POLY1305_BLOCK_SIZE]);
-asmlinkage void poly1305_blocks_x86_64(void *ctx, const u8 *inp,
- const size_t len, const u32 padbit);
-asmlinkage void poly1305_emit_x86_64(void *ctx, u8 mac[POLY1305_DIGEST_SIZE],
- const u32 nonce[4]);
-asmlinkage void poly1305_emit_avx(void *ctx, u8 mac[POLY1305_DIGEST_SIZE],
- const u32 nonce[4]);
-asmlinkage void poly1305_blocks_avx(void *ctx, const u8 *inp, const size_t len,
- const u32 padbit);
-asmlinkage void poly1305_blocks_avx2(void *ctx, const u8 *inp, const size_t len,
- const u32 padbit);
-asmlinkage void poly1305_blocks_avx512(void *ctx, const u8 *inp,
- const size_t len, const u32 padbit);
+asmlinkage void poly1305_init_x86_64(void *ctx, const u8 key[POLY1305_BLOCK_SIZE]);
+asmlinkage void poly1305_blocks_x86_64(void *ctx, const u8 *inp, unsigned int len, u32 padbit);
+asmlinkage void poly1305_emit_x86_64(void *ctx, u8 mac[POLY1305_DIGEST_SIZE], const u32 nonce[4]);
+asmlinkage void poly1305_emit_avx(void *ctx, u8 mac[POLY1305_DIGEST_SIZE], const u32 nonce[4]);
+asmlinkage void poly1305_blocks_avx(void *ctx, const u8 *inp, unsigned int len, const u32 padbit);
+asmlinkage void poly1305_blocks_avx2(void *ctx, const u8 *inp, unsigned int len, u32 padbit);
+asmlinkage void poly1305_blocks_avx512(void *ctx, const u8 *inp, unsigned int len, u32 padbit);
static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx);
static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx2);
@@ -86,7 +79,7 @@ static void poly1305_simd_init(void *ctx, const u8 key[POLY1305_BLOCK_SIZE])
poly1305_init_x86_64(ctx, key);
}
-static void poly1305_simd_blocks(void *ctx, const u8 *inp, size_t len,
+static void poly1305_simd_blocks(void *ctx, const u8 *inp, unsigned int len,
const u32 padbit)
{
struct poly1305_arch_internal *state = ctx;
@@ -103,21 +96,25 @@ static void poly1305_simd_blocks(void *ctx, const u8 *inp, size_t len,
return;
}
- do {
- const size_t bytes = min_t(size_t, len, SZ_4K);
+ kernel_fpu_begin();
+ for (;;) {
+ const unsigned int chunk = min(len, 4096U);
- kernel_fpu_begin();
if (IS_ENABLED(CONFIG_AS_AVX512) && static_branch_likely(&poly1305_use_avx512))
- poly1305_blocks_avx512(ctx, inp, bytes, padbit);
+ poly1305_blocks_avx512(ctx, inp, chunk, padbit);
else if (static_branch_likely(&poly1305_use_avx2))
- poly1305_blocks_avx2(ctx, inp, bytes, padbit);
+ poly1305_blocks_avx2(ctx, inp, chunk, padbit);
else
- poly1305_blocks_avx(ctx, inp, bytes, padbit);
- kernel_fpu_end();
+ poly1305_blocks_avx(ctx, inp, chunk, padbit);
+ len -= chunk;
- len -= bytes;
- inp += bytes;
- } while (len);
+ if (!len)
+ break;
+
+ inp += chunk;
+ kernel_fpu_yield();
+ }
+ kernel_fpu_end();
}
static void poly1305_simd_emit(void *ctx, u8 mac[POLY1305_DIGEST_SIZE],
diff --git a/arch/x86/crypto/polyval-clmulni_glue.c b/arch/x86/crypto/polyval-clmulni_glue.c
--- a/arch/x86/crypto/polyval-clmulni_glue.c
+++ b/arch/x86/crypto/polyval-clmulni_glue.c
@@ -45,8 +45,8 @@ struct polyval_desc_ctx {
u32 bytes;
};
-asmlinkage void clmul_polyval_update(const struct polyval_tfm_ctx *keys,
- const u8 *in, size_t nblocks, u8 *accumulator);
+asmlinkage void clmul_polyval_update(const struct polyval_tfm_ctx *keys, const u8 *in,
+ unsigned int nblocks, u8 *accumulator);
asmlinkage void clmul_polyval_mul(u8 *op1, const u8 *op2);
static inline struct polyval_tfm_ctx *polyval_tfm_ctx(struct crypto_shash *tfm)
@@ -55,27 +55,40 @@ static inline struct polyval_tfm_ctx *polyval_tfm_ctx(struct crypto_shash *tfm)
}
static void internal_polyval_update(const struct polyval_tfm_ctx *keys,
- const u8 *in, size_t nblocks, u8 *accumulator)
+ const u8 *in, unsigned int nblocks, u8 *accumulator)
{
- if (likely(crypto_simd_usable())) {
- kernel_fpu_begin();
- clmul_polyval_update(keys, in, nblocks, accumulator);
- kernel_fpu_end();
- } else {
+ if (!crypto_simd_usable()) {
polyval_update_non4k(keys->key_powers[NUM_KEY_POWERS-1], in,
nblocks, accumulator);
+ return;
}
+
+ kernel_fpu_begin();
+ for (;;) {
+ const unsigned int chunks = min(nblocks, 4096U / POLYVAL_BLOCK_SIZE);
+
+ clmul_polyval_update(keys, in, chunks, accumulator);
+ nblocks -= chunks;
+
+ if (!nblocks)
+ break;
+
+ in += chunks * POLYVAL_BLOCK_SIZE;
+ kernel_fpu_yield();
+ }
+ kernel_fpu_end();
}
static void internal_polyval_mul(u8 *op1, const u8 *op2)
{
- if (likely(crypto_simd_usable())) {
- kernel_fpu_begin();
- clmul_polyval_mul(op1, op2);
- kernel_fpu_end();
- } else {
+ if (!crypto_simd_usable()) {
polyval_mul_non4k(op1, op2);
+ return;
}
+
+ kernel_fpu_begin();
+ clmul_polyval_mul(op1, op2);
+ kernel_fpu_end();
}
static int polyval_x86_setkey(struct crypto_shash *tfm,
@@ -113,7 +126,6 @@ static int polyval_x86_update(struct shash_desc *desc,
struct polyval_desc_ctx *dctx = shash_desc_ctx(desc);
const struct polyval_tfm_ctx *tctx = polyval_tfm_ctx(desc->tfm);
u8 *pos;
- unsigned int nblocks;
unsigned int n;
if (dctx->bytes) {
@@ -131,9 +143,9 @@ static int polyval_x86_update(struct shash_desc *desc,
tctx->key_powers[NUM_KEY_POWERS-1]);
}
- while (srclen >= POLYVAL_BLOCK_SIZE) {
- /* Allow rescheduling every 4K bytes. */
- nblocks = min(srclen, 4096U) / POLYVAL_BLOCK_SIZE;
+ if (srclen >= POLYVAL_BLOCK_SIZE) {
+ const unsigned int nblocks = srclen / POLYVAL_BLOCK_SIZE;
+
internal_polyval_update(tctx, src, nblocks, dctx->buffer);
srclen -= nblocks * POLYVAL_BLOCK_SIZE;
src += nblocks * POLYVAL_BLOCK_SIZE;
The x86 assembly language implementations using SIMD process data between
kernel_fpu_begin() and kernel_fpu_end() calls. That disables scheduler
preemption, which prevents the CPU core from being used by other threads.

The update() and finup() functions might be called to process large
quantities of data, which can result in RCU stalls and soft lockups.

Rather than break the processing into 4 KiB passes, each of which
unilaterally calls kernel_fpu_begin() and kernel_fpu_end(), periodically
check if the kernel scheduler wants to run something else on the CPU.
If so, yield the kernel FPU context and let the scheduler intervene.

Suggested-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Robert Elliott <elliott@hpe.com>
---
 arch/x86/crypto/nhpoly1305-avx2-glue.c | 22 +++++++-----
 arch/x86/crypto/nhpoly1305-sse2-glue.c | 22 +++++++-----
 arch/x86/crypto/poly1305_glue.c        | 47 ++++++++++++--------------
 arch/x86/crypto/polyval-clmulni_glue.c | 46 +++++++++++++++----------
 4 files changed, 79 insertions(+), 58 deletions(-)
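For reference, kernel_fpu_yield() is used here but not defined by this patch,
so it is assumed to be provided elsewhere in the series. A minimal sketch of a
helper with the behavior described above, assuming it can be built on the
existing need_resched(), cond_resched(), kernel_fpu_end(), and
kernel_fpu_begin() primitives, might look like the following (illustration
only, not the actual definition):

/*
 * Illustrative sketch only -- assumes kernel_fpu_yield() is built on the
 * existing scheduler and FPU primitives.  If the scheduler wants the CPU,
 * drop the FPU context so preemption is re-enabled, let something else
 * run, then reclaim the FPU for the next chunk.
 */
#include <asm/fpu/api.h>	/* kernel_fpu_begin(), kernel_fpu_end() */
#include <linux/sched.h>	/* need_resched(), cond_resched() */

static inline void kernel_fpu_yield(void)
{
	if (need_resched()) {
		kernel_fpu_end();
		cond_resched();
		kernel_fpu_begin();
	}
}

Compared with the old pattern of wrapping every 4 KiB pass in its own
kernel_fpu_begin()/kernel_fpu_end() pair, this only pays the cost of dropping
and reacquiring the FPU state when a reschedule is actually pending.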