diff mbox series

[v2,04/19] crypto: x86/sha - limit FPU preemption

Message ID 20221012215931.3896-5-elliott@hpe.com (mailing list archive)
State Changes Requested
Delegated to: Herbert Xu
Headers show
Series crypto: x86 - fix RCU stalls | expand

Commit Message

Elliott, Robert (Servers) Oct. 12, 2022, 9:59 p.m. UTC
As done by the ECB and CBC helpers in arch/x86/crypt/ecb_cbc_helpers.h,
limit the number of bytes processed between kernel_fpu_begin() and
kernel_fpu_end() calls.

Those functions call preempt_disable() and preempt_enable(), so
the CPU core is unavailable for scheduling while running.

This leads to "rcu_preempt detected expedited stalls" with stack dumps
pointing to the optimized hash function if the module is loaded and
used a lot:
    rcu: INFO: rcu_preempt detected expedited stalls on CPUs/tasks: ...

For example, that can occur during boot with the stack track pointing
to the sha512-x86 function if the system set to use SHA-512 for
module signing. The call trace includes:
    module_sig_check
    mod_verify_sig
    pkcs7_verify
    pkcs7_digest
    sha512_finup
    sha512_base_do_update

Fixes: 66be89515888 ("crypto: sha1 - SSSE3 based SHA1 implementation for x86-64")
Fixes: 8275d1aa6422 ("crypto: sha256 - Create module providing optimized SHA256 routines using SSSE3, AVX or AVX2 instructions.")
Fixes: 87de4579f92d ("crypto: sha512 - Create module providing optimized SHA512 routines using SSSE3, AVX or AVX2 instructions.")
Fixes: aa031b8f702e ("crypto: x86/sha512 - load based on CPU features")
Suggested-by: Herbert Xu <herbert@gondor.apana.org.au>
Reviewed-by: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Robert Elliott <elliott@hpe.com>
---
 arch/x86/crypto/sha1_ssse3_glue.c   | 32 ++++++++++++++++++++++++-----
 arch/x86/crypto/sha256_ssse3_glue.c | 32 ++++++++++++++++++++++++-----
 arch/x86/crypto/sha512_ssse3_glue.c | 32 ++++++++++++++++++++++++-----
 3 files changed, 81 insertions(+), 15 deletions(-)

Comments

Jason A. Donenfeld Oct. 13, 2022, 12:41 a.m. UTC | #1
On Wed, Oct 12, 2022 at 04:59:16PM -0500, Robert Elliott wrote:
> As done by the ECB and CBC helpers in arch/x86/crypt/ecb_cbc_helpers.h,
> limit the number of bytes processed between kernel_fpu_begin() and
> kernel_fpu_end() calls.
> 
> Those functions call preempt_disable() and preempt_enable(), so
> the CPU core is unavailable for scheduling while running.
> 
> This leads to "rcu_preempt detected expedited stalls" with stack dumps
> pointing to the optimized hash function if the module is loaded and
> used a lot:
>     rcu: INFO: rcu_preempt detected expedited stalls on CPUs/tasks: ...
> 
> For example, that can occur during boot with the stack track pointing
> to the sha512-x86 function if the system set to use SHA-512 for
> module signing. The call trace includes:
>     module_sig_check
>     mod_verify_sig
>     pkcs7_verify
>     pkcs7_digest
>     sha512_finup
>     sha512_base_do_update
> 
> Fixes: 66be89515888 ("crypto: sha1 - SSSE3 based SHA1 implementation for x86-64")
> Fixes: 8275d1aa6422 ("crypto: sha256 - Create module providing optimized SHA256 routines using SSSE3, AVX or AVX2 instructions.")
> Fixes: 87de4579f92d ("crypto: sha512 - Create module providing optimized SHA512 routines using SSSE3, AVX or AVX2 instructions.")
> Fixes: aa031b8f702e ("crypto: x86/sha512 - load based on CPU features")
> Suggested-by: Herbert Xu <herbert@gondor.apana.org.au>
> Reviewed-by: Tim Chen <tim.c.chen@linux.intel.com>
> Signed-off-by: Robert Elliott <elliott@hpe.com>
> ---
>  arch/x86/crypto/sha1_ssse3_glue.c   | 32 ++++++++++++++++++++++++-----
>  arch/x86/crypto/sha256_ssse3_glue.c | 32 ++++++++++++++++++++++++-----
>  arch/x86/crypto/sha512_ssse3_glue.c | 32 ++++++++++++++++++++++++-----
>  3 files changed, 81 insertions(+), 15 deletions(-)
> 
> diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c
> index 44340a1139e0..a9f5779b41ca 100644
> --- a/arch/x86/crypto/sha1_ssse3_glue.c
> +++ b/arch/x86/crypto/sha1_ssse3_glue.c
> @@ -26,6 +26,8 @@
>  #include <crypto/sha1_base.h>
>  #include <asm/simd.h>
>  
> +#define FPU_BYTES 4096U /* avoid kernel_fpu_begin/end scheduler/rcu stalls */

Declare this inside the function it's used as an untyped enum, and give
it a better name, like BYTES_PER_FPU.
Eric Biggers Oct. 13, 2022, 5:57 a.m. UTC | #2
On Wed, Oct 12, 2022 at 04:59:16PM -0500, Robert Elliott wrote:
> diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c
> index 44340a1139e0..a9f5779b41ca 100644
> --- a/arch/x86/crypto/sha1_ssse3_glue.c
> +++ b/arch/x86/crypto/sha1_ssse3_glue.c
> @@ -26,6 +26,8 @@
>  #include <crypto/sha1_base.h>
>  #include <asm/simd.h>
>  
> +#define FPU_BYTES 4096U /* avoid kernel_fpu_begin/end scheduler/rcu stalls */
> +
>  static int sha1_update(struct shash_desc *desc, const u8 *data,
>  			     unsigned int len, sha1_block_fn *sha1_xform)
>  {
> @@ -41,9 +43,18 @@ static int sha1_update(struct shash_desc *desc, const u8 *data,
>  	 */
>  	BUILD_BUG_ON(offsetof(struct sha1_state, state) != 0);
>  
> -	kernel_fpu_begin();
> -	sha1_base_do_update(desc, data, len, sha1_xform);
> -	kernel_fpu_end();
> +	do {
> +		unsigned int chunk = min(len, FPU_BYTES);
> +
> +		if (chunk) {
> +			kernel_fpu_begin();
> +			sha1_base_do_update(desc, data, chunk, sha1_xform);
> +			kernel_fpu_end();
> +		}
> +
> +		len -= chunk;
> +		data += chunk;
> +	} while (len);

'len' can't be 0 at the beginning of this loop, so the 'if (chunk)' check isn't
needed.  And it wouldn't make sense even if 'len' could be 0, since a while loop
could just be used in that case.

- Eric
Herbert Xu Oct. 13, 2022, 6:04 a.m. UTC | #3
On Wed, Oct 12, 2022 at 10:57:04PM -0700, Eric Biggers wrote:
>
> 'len' can't be 0 at the beginning of this loop, so the 'if (chunk)' check isn't
> needed.  And it wouldn't make sense even if 'len' could be 0, since a while loop
> could just be used in that case.

I don't see anything preventing len from being zero if this gets
called directly by a user of the Crypto API through crypto_shash_update.
But yes a while loop would be a lot cleaner.

Thanks,
Eric Biggers Oct. 13, 2022, 6:08 a.m. UTC | #4
On Thu, Oct 13, 2022 at 02:04:43PM +0800, Herbert Xu wrote:
> On Wed, Oct 12, 2022 at 10:57:04PM -0700, Eric Biggers wrote:
> >
> > 'len' can't be 0 at the beginning of this loop, so the 'if (chunk)' check isn't
> > needed.  And it wouldn't make sense even if 'len' could be 0, since a while loop
> > could just be used in that case.
> 
> I don't see anything preventing len from being zero if this gets
> called directly by a user of the Crypto API through crypto_shash_update.
> But yes a while loop would be a lot cleaner.
> 

When len == 0, the following path is taken instead:

        if (!crypto_simd_usable() ||
            (sctx->count % SHA1_BLOCK_SIZE) + len < SHA1_BLOCK_SIZE)
                return crypto_sha1_update(desc, data, len);

- Eric
Herbert Xu Oct. 13, 2022, 7:50 a.m. UTC | #5
On Wed, Oct 12, 2022 at 11:08:53PM -0700, Eric Biggers wrote:
.
> When len == 0, the following path is taken instead:
> 
>         if (!crypto_simd_usable() ||
>             (sctx->count % SHA1_BLOCK_SIZE) + len < SHA1_BLOCK_SIZE)
>                 return crypto_sha1_update(desc, data, len);

Good point, I missed that.

Thanks!
Elliott, Robert (Servers) Oct. 13, 2022, 9:50 p.m. UTC | #6
> > +#define FPU_BYTES 4096U /* avoid kernel_fpu_begin/end scheduler/rcu stalls
> */
> 
> Declare this inside the function it's used as an untyped enum, and give
> it a better name, like BYTES_PER_FPU.

I agree that's a better name, and will use something like that depending
on how other discussions go. I was trying to keep it as a one-line change
within 80 characters, expecting some churn before it's finalized.
Elliott, Robert (Servers) Oct. 13, 2022, 10:41 p.m. UTC | #7
> > diff --git a/arch/x86/crypto/sha1_ssse3_glue.c
...
> > +	do {
> > +		unsigned int chunk = min(len, FPU_BYTES);
> > +
> > +		if (chunk) {
> > +			kernel_fpu_begin();
> > +			sha1_base_do_update(desc, data, chunk, sha1_xform);
> > +			kernel_fpu_end();
> > +		}
> > +
> > +		len -= chunk;
> > +		data += chunk;
> > +	} while (len);
> 
> 'len' can't be 0 at the beginning of this loop, so the 'if (chunk)' check
> isn't needed.  And it wouldn't make sense even if 'len' could be 0, since
> a while loop could just be used in that case.

Thanks, I'll remove that if from all the sha functions, since they do
have that protective check upfront. I'll review the 0 byte handling in
all of them.
David Laight Oct. 14, 2022, 11:01 a.m. UTC | #8
From: Jason A. Donenfeld
> Sent: 13 October 2022 01:42
...
> > diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c
> > index 44340a1139e0..a9f5779b41ca 100644
> > --- a/arch/x86/crypto/sha1_ssse3_glue.c
> > +++ b/arch/x86/crypto/sha1_ssse3_glue.c
> > @@ -26,6 +26,8 @@
> >  #include <crypto/sha1_base.h>
> >  #include <asm/simd.h>
> >
> > +#define FPU_BYTES 4096U /* avoid kernel_fpu_begin/end scheduler/rcu stalls */
> 
> Declare this inside the function it's used as an untyped enum, and give
> it a better name, like BYTES_PER_FPU.

Isn't 'bytes' the wrong unit anyway?
At least it ought to be 'clocks' so it can be divided by the
(approximate) 'clocks per byte' of the algorithm.

Something like a crc is likely to be far faster than AES.

Clearly the actual required units are microseconds.
But depending on the actual cpu frequency is a bit hard.
And people running faster cpu may want lower latency anyway.
So a typical slow cpu frequency is probably ok.

The actual architecture dependant constant really ought
to be defined with kernel_fpu_begin().

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
diff mbox series

Patch

diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c
index 44340a1139e0..a9f5779b41ca 100644
--- a/arch/x86/crypto/sha1_ssse3_glue.c
+++ b/arch/x86/crypto/sha1_ssse3_glue.c
@@ -26,6 +26,8 @@ 
 #include <crypto/sha1_base.h>
 #include <asm/simd.h>
 
+#define FPU_BYTES 4096U /* avoid kernel_fpu_begin/end scheduler/rcu stalls */
+
 static int sha1_update(struct shash_desc *desc, const u8 *data,
 			     unsigned int len, sha1_block_fn *sha1_xform)
 {
@@ -41,9 +43,18 @@  static int sha1_update(struct shash_desc *desc, const u8 *data,
 	 */
 	BUILD_BUG_ON(offsetof(struct sha1_state, state) != 0);
 
-	kernel_fpu_begin();
-	sha1_base_do_update(desc, data, len, sha1_xform);
-	kernel_fpu_end();
+	do {
+		unsigned int chunk = min(len, FPU_BYTES);
+
+		if (chunk) {
+			kernel_fpu_begin();
+			sha1_base_do_update(desc, data, chunk, sha1_xform);
+			kernel_fpu_end();
+		}
+
+		len -= chunk;
+		data += chunk;
+	} while (len);
 
 	return 0;
 }
@@ -54,9 +65,20 @@  static int sha1_finup(struct shash_desc *desc, const u8 *data,
 	if (!crypto_simd_usable())
 		return crypto_sha1_finup(desc, data, len, out);
 
+	do {
+		unsigned int chunk = min(len, FPU_BYTES);
+
+		if (chunk) {
+			kernel_fpu_begin();
+			sha1_base_do_update(desc, data, chunk, sha1_xform);
+			kernel_fpu_end();
+		}
+
+		len -= chunk;
+		data += chunk;
+	} while (len);
+
 	kernel_fpu_begin();
-	if (len)
-		sha1_base_do_update(desc, data, len, sha1_xform);
 	sha1_base_do_finalize(desc, sha1_xform);
 	kernel_fpu_end();
 
diff --git a/arch/x86/crypto/sha256_ssse3_glue.c b/arch/x86/crypto/sha256_ssse3_glue.c
index 3a5f6be7dbba..322c8aa907af 100644
--- a/arch/x86/crypto/sha256_ssse3_glue.c
+++ b/arch/x86/crypto/sha256_ssse3_glue.c
@@ -40,6 +40,8 @@ 
 #include <linux/string.h>
 #include <asm/simd.h>
 
+#define FPU_BYTES 4096U /* avoid kernel_fpu_begin/end scheduler/rcu stalls */
+
 asmlinkage void sha256_transform_ssse3(struct sha256_state *state,
 				       const u8 *data, int blocks);
 
@@ -58,9 +60,18 @@  static int _sha256_update(struct shash_desc *desc, const u8 *data,
 	 */
 	BUILD_BUG_ON(offsetof(struct sha256_state, state) != 0);
 
-	kernel_fpu_begin();
-	sha256_base_do_update(desc, data, len, sha256_xform);
-	kernel_fpu_end();
+	do {
+		unsigned int chunk = min(len, FPU_BYTES);
+
+		if (chunk) {
+			kernel_fpu_begin();
+			sha256_base_do_update(desc, data, chunk, sha256_xform);
+			kernel_fpu_end();
+		}
+
+		len -= chunk;
+		data += chunk;
+	} while (len);
 
 	return 0;
 }
@@ -71,9 +82,20 @@  static int sha256_finup(struct shash_desc *desc, const u8 *data,
 	if (!crypto_simd_usable())
 		return crypto_sha256_finup(desc, data, len, out);
 
+	do {
+		unsigned int chunk = min(len, FPU_BYTES);
+
+		if (chunk) {
+			kernel_fpu_begin();
+			sha256_base_do_update(desc, data, chunk, sha256_xform);
+			kernel_fpu_end();
+		}
+
+		len -= chunk;
+		data += chunk;
+	} while (len);
+
 	kernel_fpu_begin();
-	if (len)
-		sha256_base_do_update(desc, data, len, sha256_xform);
 	sha256_base_do_finalize(desc, sha256_xform);
 	kernel_fpu_end();
 
diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c
index 6d3b85e53d0e..fd5075a32613 100644
--- a/arch/x86/crypto/sha512_ssse3_glue.c
+++ b/arch/x86/crypto/sha512_ssse3_glue.c
@@ -39,6 +39,8 @@ 
 #include <asm/cpu_device_id.h>
 #include <asm/simd.h>
 
+#define FPU_BYTES 4096U /* avoid kernel_fpu_begin/end scheduler/rcu stalls */
+
 asmlinkage void sha512_transform_ssse3(struct sha512_state *state,
 				       const u8 *data, int blocks);
 
@@ -57,9 +59,18 @@  static int sha512_update(struct shash_desc *desc, const u8 *data,
 	 */
 	BUILD_BUG_ON(offsetof(struct sha512_state, state) != 0);
 
-	kernel_fpu_begin();
-	sha512_base_do_update(desc, data, len, sha512_xform);
-	kernel_fpu_end();
+	do {
+		unsigned int chunk = min(len, FPU_BYTES);
+
+		if (chunk) {
+			kernel_fpu_begin();
+			sha512_base_do_update(desc, data, chunk, sha512_xform);
+			kernel_fpu_end();
+		}
+
+		len -= chunk;
+		data += chunk;
+	} while (len);
 
 	return 0;
 }
@@ -70,9 +81,20 @@  static int sha512_finup(struct shash_desc *desc, const u8 *data,
 	if (!crypto_simd_usable())
 		return crypto_sha512_finup(desc, data, len, out);
 
+	do {
+		unsigned int chunk = min(len, FPU_BYTES);
+
+		if (chunk) {
+			kernel_fpu_begin();
+			sha512_base_do_update(desc, data, chunk, sha512_xform);
+			kernel_fpu_end();
+		}
+
+		len -= chunk;
+		data += chunk;
+	} while (len);
+
 	kernel_fpu_begin();
-	if (len)
-		sha512_base_do_update(desc, data, len, sha512_xform);
 	sha512_base_do_finalize(desc, sha512_xform);
 	kernel_fpu_end();