@@ -12,46 +12,47 @@
#include <linux/sizes.h>
#include <asm/cpufeature.h>
-#include <asm/fpu/api.h>
#include <asm/processor.h>
#include <asm/simd.h>
-asmlinkage void blake2s_compress_ssse3(struct blake2s_state *state,
- const u8 *block, const size_t nblocks,
- const u32 inc);
-asmlinkage void blake2s_compress_avx512(struct blake2s_state *state,
- const u8 *block, const size_t nblocks,
- const u32 inc);
+asmlinkage void blake2s_compress_ssse3(struct blake2s_state *state, const u8 *data,
+ unsigned int nblocks, u32 inc);
+asmlinkage void blake2s_compress_avx512(struct blake2s_state *state, const u8 *data,
+ unsigned int nblocks, u32 inc);
static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_ssse3);
static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_avx512);
-void blake2s_compress(struct blake2s_state *state, const u8 *block,
- size_t nblocks, const u32 inc)
+void blake2s_compress(struct blake2s_state *state, const u8 *data,
+ unsigned int nblocks, const u32 inc)
{
/* SIMD disables preemption, so relax after processing each page. */
BUILD_BUG_ON(SZ_4K / BLAKE2S_BLOCK_SIZE < 8);
if (!static_branch_likely(&blake2s_use_ssse3) || !may_use_simd()) {
- blake2s_compress_generic(state, block, nblocks, inc);
+ blake2s_compress_generic(state, data, nblocks, inc);
return;
}
- do {
- const size_t blocks = min_t(size_t, nblocks,
- SZ_4K / BLAKE2S_BLOCK_SIZE);
+ kernel_fpu_begin();
+ for (;;) {
+ const unsigned int chunks = min(nblocks, 4096U / BLAKE2S_BLOCK_SIZE);
- kernel_fpu_begin();
if (IS_ENABLED(CONFIG_AS_AVX512) &&
static_branch_likely(&blake2s_use_avx512))
- blake2s_compress_avx512(state, block, blocks, inc);
+ blake2s_compress_avx512(state, data, chunks, inc);
else
- blake2s_compress_ssse3(state, block, blocks, inc);
- kernel_fpu_end();
+ blake2s_compress_ssse3(state, data, chunks, inc);
- nblocks -= blocks;
- block += blocks * BLAKE2S_BLOCK_SIZE;
- } while (nblocks);
+ nblocks -= chunks;
+
+ if (!nblocks)
+ break;
+
+ data += chunks * BLAKE2S_BLOCK_SIZE;
+ kernel_fpu_yield();
+ }
+ kernel_fpu_end();
}
EXPORT_SYMBOL(blake2s_compress);
@@ -10,11 +10,11 @@
#include <crypto/blake2s.h>
#include <linux/string.h>
-void blake2s_compress_generic(struct blake2s_state *state, const u8 *block,
- size_t nblocks, const u32 inc);
+void blake2s_compress_generic(struct blake2s_state *state, const u8 *data,
+ unsigned int nblocks, u32 inc);
-void blake2s_compress(struct blake2s_state *state, const u8 *block,
- size_t nblocks, const u32 inc);
+void blake2s_compress(struct blake2s_state *state, const u8 *data,
+ unsigned int nblocks, u32 inc);
bool blake2s_selftest(void);
@@ -37,12 +37,12 @@ static inline void blake2s_increment_counter(struct blake2s_state *state,
state->t[1] += (state->t[0] < inc);
}
-void blake2s_compress(struct blake2s_state *state, const u8 *block,
- size_t nblocks, const u32 inc)
+void blake2s_compress(struct blake2s_state *state, const u8 *data,
+ unsigned int nblocks, u32 inc)
__weak __alias(blake2s_compress_generic);
-void blake2s_compress_generic(struct blake2s_state *state, const u8 *block,
- size_t nblocks, const u32 inc)
+void blake2s_compress_generic(struct blake2s_state *state, const u8 *data,
+ unsigned int nblocks, u32 inc)
{
u32 m[16];
u32 v[16];
@@ -53,7 +53,7 @@ void blake2s_compress_generic(struct blake2s_state *state, const u8 *block,
while (nblocks > 0) {
blake2s_increment_counter(state, inc);
- memcpy(m, block, BLAKE2S_BLOCK_SIZE);
+ memcpy(m, data, BLAKE2S_BLOCK_SIZE);
le32_to_cpu_array(m, ARRAY_SIZE(m));
memcpy(v, state->h, 32);
v[ 8] = BLAKE2S_IV0;
@@ -103,7 +103,7 @@ void blake2s_compress_generic(struct blake2s_state *state, const u8 *block,
for (i = 0; i < 8; ++i)
state->h[i] ^= v[i] ^ v[i + 8];
- block += BLAKE2S_BLOCK_SIZE;
+ data += BLAKE2S_BLOCK_SIZE;
--nblocks;
}
}
The x86 assembly language implementations using SIMD process data between kernel_fpu_begin() and kernel_fpu_end() calls. That disables scheduler preemption, which prevents the CPU core from being used by other threads. The update() and finup() functions might be called to process large quantities of data, which can result in RCU stalls and soft lockups. Rather than break the processing into 4 KiB passes, each of which unilaterally calls kernel_fpu_begin() and kernel_fpu_end(), periodically check if the kernel scheduler wants to run something else on the CPU. If so, yield the kernel FPU context and let the scheduler intervene. Adjust the type of the length arguments everywhere to be unsigned int rather than size_t to avoid typecasts. Suggested-by: Herbert Xu <herbert@gondor.apana.org.au> Signed-off-by: Robert Elliott <elliott@hpe.com> --- arch/x86/crypto/blake2s-glue.c | 41 ++++++++++++++++--------------- include/crypto/internal/blake2s.h | 8 +++--- lib/crypto/blake2s-generic.c | 12 ++++----- 3 files changed, 31 insertions(+), 30 deletions(-)