
[RFC,3/7] crypto: x86/crc - limit FPU preemption

Message ID: 20221006223151.22159-4-elliott@hpe.com
State: Superseded
Delegated to: Herbert Xu
Series: crypto: x86 - fix RCU stalls

Commit Message

Elliott, Robert (Servers) Oct. 6, 2022, 10:31 p.m. UTC
As done by the ECB and CBC helpers in arch/x86/crypto/ecb_cbc_helpers.h,
limit the number of bytes processed between kernel_fpu_begin() and
kernel_fpu_end() calls.

Those functions call preempt_disable() and preempt_enable(), so the CPU
core is unavailable for scheduling for as long as the SIMD code runs.
With large buffers, that leads to RCU stall warnings like:
    rcu: INFO: rcu_preempt detected expedited stalls on CPUs/tasks: {12-... } 22 jiffies s: 277 root: 0x1/.
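
The loop shape shared by all three drivers is sketched below as
stand-alone user-space C. fpu_begin(), fpu_end(), and simd_crc_step()
are hypothetical stand-ins for kernel_fpu_begin(), kernel_fpu_end(),
and the PCLMULQDQ primitives, and min() is open-coded because
linux/minmax.h is not available outside the kernel:

#include <stdint.h>
#include <stdio.h>

#define FPU_BYTES 4096U	/* max bytes per kernel_fpu_begin/end section */

#define min(a, b) ((a) < (b) ? (a) : (b))

/* stand-ins for kernel_fpu_begin()/kernel_fpu_end(), which disable and
 * re-enable preemption around kernel SIMD usage
 */
static void fpu_begin(void) { }
static void fpu_end(void) { }

/* stand-in for a SIMD primitive such as crc_pcl(); a trivial byte sum,
 * since only the call shape matters for this sketch
 */
static uint32_t simd_crc_step(const uint8_t *p, unsigned int len, uint32_t crc)
{
	while (len--)
		crc += *p++;
	return crc;
}

/* process at most FPU_BYTES per kernel_fpu_begin()/kernel_fpu_end()
 * pair, so preemption is never disabled for longer than one chunk
 */
static uint32_t chunked_crc(const uint8_t *data, unsigned int len, uint32_t crc)
{
	unsigned int chunk;

	do {
		chunk = min(len, FPU_BYTES);
		len -= chunk;

		fpu_begin();
		crc = simd_crc_step(data, chunk, crc);
		fpu_end();

		data += chunk;
	} while (len);

	return crc;
}

int main(void)
{
	static uint8_t buf[3 * FPU_BYTES + 5];

	printf("crc: %u\n", chunked_crc(buf, sizeof(buf), 0));
	return 0;
}

With FPU_BYTES = 4096 and the PCLMULQDQ routines sustaining roughly a
byte per cycle or better, each preemption-off window should be on the
order of a microsecond, far below the RCU stall detector thresholds.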

Fixes: 78c37d191dd6 ("crypto: crc32 - add crc32 pclmulqdq implementation and wrappers for table implementation")
Fixes: 6a8ce1ef3940 ("crypto: crc32c - Optimize CRC32C calculation with PCLMULQDQ instruction")
Fixes: 0b95a7f85718 ("crypto: crct10dif - Glue code to cast accelerated CRCT10DIF assembly as a crypto transform")
Suggested-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Robert Elliott <elliott@hpe.com>
---
 arch/x86/crypto/crc32-pclmul_glue.c     | 18 ++++++++++----
 arch/x86/crypto/crc32c-intel_glue.c     | 33 +++++++++++++++++++++-----
 arch/x86/crypto/crct10dif-pclmul_glue.c | 32 ++++++++++++++++++++-----
 3 files changed, 67 insertions(+), 16 deletions(-)

Patch

diff --git a/arch/x86/crypto/crc32-pclmul_glue.c b/arch/x86/crypto/crc32-pclmul_glue.c
index 288200fe7b4e..7cf65dc726c4 100644
--- a/arch/x86/crypto/crc32-pclmul_glue.c
+++ b/arch/x86/crypto/crc32-pclmul_glue.c
@@ -49,6 +49,8 @@ 
 #define SCALE_F			16L	/* size of xmm register */
 #define SCALE_F_MASK		(SCALE_F - 1)
 
+#define FPU_BYTES 4096U /* avoid kernel_fpu_begin/end scheduler/rcu stalls */
+
 u32 crc32_pclmul_le_16(unsigned char const *buffer, size_t len, u32 crc32);
 
 static u32 __attribute__((pure))
@@ -57,6 +59,7 @@  static u32 __attribute__((pure))
 	unsigned int iquotient;
 	unsigned int iremainder;
 	unsigned int prealign;
+	unsigned int chunk;
 
 	if (len < PCLMUL_MIN_LEN + SCALE_F_MASK || !crypto_simd_usable())
 		return crc32_le(crc, p, len);
@@ -73,12 +76,19 @@  static u32 __attribute__((pure))
 	iquotient = len & (~SCALE_F_MASK);
 	iremainder = len & SCALE_F_MASK;
 
-	kernel_fpu_begin();
-	crc = crc32_pclmul_le_16(p, iquotient, crc);
-	kernel_fpu_end();
+	do {
+		chunk = min(iquotient, FPU_BYTES);
+		iquotient -= chunk;
+
+		kernel_fpu_begin();
+		crc = crc32_pclmul_le_16(p, chunk, crc);
+		kernel_fpu_end();
+
+		p += chunk;
+	} while (iquotient);
 
 	if (iremainder)
-		crc = crc32_le(crc, p + iquotient, iremainder);
+		crc = crc32_le(crc, p, iremainder);
 
 	return crc;
 }
diff --git a/arch/x86/crypto/crc32c-intel_glue.c b/arch/x86/crypto/crc32c-intel_glue.c
index c5c965b694c6..b277c215f0fb 100644
--- a/arch/x86/crypto/crc32c-intel_glue.c
+++ b/arch/x86/crypto/crc32c-intel_glue.c
@@ -44,6 +44,8 @@ 
  */
 #define CRC32C_PCL_BREAKEVEN	512
 
+#define FPU_BYTES 4096U /* avoid kernel_fpu_begin/end scheduler/rcu stalls */
+
 asmlinkage unsigned int crc_pcl(const u8 *buffer, int len,
 				unsigned int crc_init);
 #endif /* CONFIG_X86_64 */
@@ -155,15 +157,23 @@  static int crc32c_pcl_intel_update(struct shash_desc *desc, const u8 *data,
 			       unsigned int len)
 {
 	u32 *crcp = shash_desc_ctx(desc);
+	unsigned int chunk;
 
 	/*
 	 * use faster PCL version if datasize is large enough to
 	 * overcome kernel fpu state save/restore overhead
 	 */
 	if (len >= CRC32C_PCL_BREAKEVEN && crypto_simd_usable()) {
-		kernel_fpu_begin();
-		*crcp = crc_pcl(data, len, *crcp);
-		kernel_fpu_end();
+		do {
+			chunk = min(len, FPU_BYTES);
+			len -= chunk;
+
+			kernel_fpu_begin();
+			*crcp = crc_pcl(data, chunk, *crcp);
+			kernel_fpu_end();
+
+			data += chunk;
+		} while (len);
 	} else
 		*crcp = crc32c_intel_le_hw(*crcp, data, len);
 	return 0;
@@ -172,10 +182,21 @@  static int crc32c_pcl_intel_update(struct shash_desc *desc, const u8 *data,
 static int __crc32c_pcl_intel_finup(u32 *crcp, const u8 *data, unsigned int len,
 				u8 *out)
 {
+	unsigned int chunk;
+	u32 crc = *crcp;
+
 	if (len >= CRC32C_PCL_BREAKEVEN && crypto_simd_usable()) {
-		kernel_fpu_begin();
-		*(__le32 *)out = ~cpu_to_le32(crc_pcl(data, len, *crcp));
-		kernel_fpu_end();
+		do {
+			chunk = min(len, FPU_BYTES);
+			len -= chunk;
+
+			kernel_fpu_begin();
+			crc = crc_pcl(data, chunk, crc);
+			kernel_fpu_end();
+
+			data += chunk;
+		} while (len);
+		*(__le32 *)out = ~cpu_to_le32(crc);
 	} else
 		*(__le32 *)out =
 			~cpu_to_le32(crc32c_intel_le_hw(*crcp, data, len));
diff --git a/arch/x86/crypto/crct10dif-pclmul_glue.c b/arch/x86/crypto/crct10dif-pclmul_glue.c
index 7c5a32282d51..bcd362df6b62 100644
--- a/arch/x86/crypto/crct10dif-pclmul_glue.c
+++ b/arch/x86/crypto/crct10dif-pclmul_glue.c
@@ -36,6 +36,8 @@ 
 #include <asm/cpu_device_id.h>
 #include <asm/simd.h>
 
+#define FPU_BYTES 4096U /* avoid kernel_fpu_begin/end scheduler/rcu stalls */
+
 asmlinkage u16 crc_t10dif_pcl(u16 init_crc, const u8 *buf, size_t len);
 
 struct chksum_desc_ctx {
@@ -55,11 +57,19 @@  static int chksum_update(struct shash_desc *desc, const u8 *data,
 			 unsigned int length)
 {
 	struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
+	unsigned int chunk;
 
 	if (length >= 16 && crypto_simd_usable()) {
-		kernel_fpu_begin();
-		ctx->crc = crc_t10dif_pcl(ctx->crc, data, length);
-		kernel_fpu_end();
+		do {
+			chunk = min(length, FPU_BYTES);
+			length -= chunk;
+
+			kernel_fpu_begin();
+			ctx->crc = crc_t10dif_pcl(ctx->crc, data, chunk);
+			kernel_fpu_end();
+
+			data += chunk;
+		} while (length);
 	} else
 		ctx->crc = crc_t10dif_generic(ctx->crc, data, length);
 	return 0;
@@ -75,10 +85,20 @@  static int chksum_final(struct shash_desc *desc, u8 *out)
 
 static int __chksum_finup(__u16 crc, const u8 *data, unsigned int len, u8 *out)
 {
+	unsigned int chunk;
+
 	if (len >= 16 && crypto_simd_usable()) {
-		kernel_fpu_begin();
-		*(__u16 *)out = crc_t10dif_pcl(crc, data, len);
-		kernel_fpu_end();
+		do {
+			chunk = min(len, FPU_BYTES);
+			len -= chunk;
+
+			kernel_fpu_begin();
+			crc = crc_t10dif_pcl(crc, data, chunk);
+			kernel_fpu_end();
+
+			data += chunk;
+		} while (len);
+		*(__u16 *)out = crc;
 	} else
 		*(__u16 *)out = crc_t10dif_generic(crc, data, len);
 	return 0;