[2/2] crypto: scompress: Use per-CPU struct instead multiple variables

Message ID 20190329130956.15935-2-bigeasy@linutronix.de (mailing list archive)
State Accepted
Delegated to: Herbert Xu
Series [1/2] crypto: scompress: return proper error code for allocation failure

Commit Message

Sebastian Andrzej Siewior March 29, 2019, 1:09 p.m. UTC
Two per-CPU variables are allocated as pointers to per-CPU memory,
which are then used as scratch buffers.
We can be smarter about this and instead use a per-CPU struct that
already contains the pointers, so that only the scratch buffers
themselves need to be allocated.
Add a lock to the struct. By doing so we can avoid the get_cpu()
call and gain lockdep coverage (if enabled) to ensure that the lock
is always acquired in the right context. On non-preemptible kernels the
lock vanishes.
It is okay to use raw_cpu_ptr() to get a pointer to the struct since it
is protected by the spinlock.

The diffstat of this patch is negative, and according to size(1) on
scompress.o:
   text    data     bss     dec     hex filename
   1847     160      24    2031     7ef dbg_before.o
   1754     232       4    1990     7c6 dbg_after.o
   1799      64      24    1887     75f no_dbg-before.o
   1703      88       4    1795     703 no_dbg-after.o

The overall size difference is also negative. The combined data+bss
increase is only four bytes without lockdep.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
 crypto/scompress.c | 137 ++++++++++++++++++++-------------------------
 1 file changed, 60 insertions(+), 77 deletions(-)
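
The shape of the change, condensed from the diff at the bottom of this
page: the two dynamically allocated per-CPU pointer variables become one
statically defined per-CPU struct whose lock is initialized at build
time, so only the two scratch buffers still need runtime allocation
(presumably also why bss shrinks while data grows in the size(1) numbers
above):

	struct scomp_scratch {
		spinlock_t	lock;
		void		*src;
		void		*dst;
	};

	/* lives in the per-CPU data section; the lock needs no runtime init */
	static DEFINE_PER_CPU(struct scomp_scratch, scomp_scratch) = {
		.lock = __SPIN_LOCK_UNLOCKED(scomp_scratch.lock),
	};

	/* fast path: the lock replaces get_cpu()/put_cpu() */
	scratch = raw_cpu_ptr(&scomp_scratch);
	spin_lock(&scratch->lock);
	/* ... use scratch->src and scratch->dst ... */
	spin_unlock(&scratch->lock);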

Comments

Guenter Roeck April 11, 2019, 4:07 a.m. UTC | #1
Hi Sebastian,

On Fri, Mar 29, 2019 at 02:09:56PM +0100, Sebastian Andrzej Siewior wrote:
> […]
> 

Unfortunately, this patch causes random crashes.

[   13.084225] Unable to handle kernel NULL pointer dereference at virtual address 0000000000000000
[   13.084687] Mem abort info:
[   13.084821]   ESR = 0x96000044
[   13.085008]   Exception class = DABT (current EL), IL = 32 bits
[   13.085189]   SET = 0, FnV = 0
[   13.085331]   EA = 0, S1PTW = 0
[   13.085468] Data abort info:
[   13.086018]   ISV = 0, ISS = 0x00000044
[   13.086185]   CM = 0, WnR = 1
[   13.086379] [0000000000000000] user address but active_mm is swapper
[   13.087032] Internal error: Oops: 96000044 [#1] PREEMPT SMP
[   13.087322] Modules linked in:
[   13.087645] Process cryptomgr_test (pid: 165, stack limit = 0x(____ptrval____))
[   13.088038] CPU: 1 PID: 165 Comm: cryptomgr_test Not tainted 5.1.0-rc4-next-20190410 #1
[   13.088250] Hardware name: ZynqMP ZCU102 Rev1.0 (DT)
[   13.088530] pstate: 80000005 (Nzcv daif -PAN -UAO)
[   13.088740] pc : __memcpy+0xc0/0x180
[   13.088887] lr : scatterwalk_copychunks+0xd4/0x1e8
[   13.089033] sp : ffff000012f2ba50
[   13.089139] x29: ffff000012f2ba50 x28: ffff80007a7c0500 
[   13.089324] x27: 0000000000000000 x26: ffff000012f2bae8 
[   13.089460] x25: 0000000000000046 x24: 0000000000000046 
[   13.089643] x23: ffff80007a004440 x22: 0000000000000046 
[   13.089815] x21: 0000000000000000 x20: 000081ffffffffff 
[   13.089974] x19: 0000000000000000 x18: 00000000000007ff 
[   13.090111] x17: 0000000000000000 x16: 0000000000000000 
[   13.090264] x15: ffff0000130ff748 x14: 0000000000000000 
[   13.090414] x13: ffff000011c509e8 x12: ffff000011a46980 
[   13.090569] x11: ffff000011a46000 x10: 0000000000000001 
[   13.090736] x9 : 0000000000000000 x8 : 20646e6120776f6e 
[   13.090885] x7 : 207375206e696f4a x6 : 0000000000000000 
[   13.091022] x5 : ffff0000117bf808 x4 : 0000000000000000 
[   13.091168] x3 : 0000000000000000 x2 : ffffffffffffffc6 
[   13.091313] x1 : ffff80007a62e610 x0 : 0000000000000000 
[   13.091554] Call trace:
[   13.091682]  __memcpy+0xc0/0x180
[   13.091810]  scatterwalk_map_and_copy+0x88/0xf0
[   13.091956]  scomp_acomp_comp_decomp+0x94/0x140
[   13.092083]  scomp_acomp_compress+0x10/0x18
[   13.092201]  test_acomp+0x158/0x5c0
[   13.092307]  alg_test_comp+0x298/0x440
[   13.092420]  alg_test.part.8+0xbc/0x398
[   13.092542]  alg_test+0x3c/0x68
[   13.092642]  cryptomgr_test+0x44/0x50
[   13.092754]  kthread+0x128/0x130
[   13.092867]  ret_from_fork+0x10/0x18
[   13.093158] Code: 14000028 f1020042 5400024a a8c12027 (a88120c7) 

This is seen with an arm64 image running on qemu with machine xlnx-zcu102
and two CPUs, and crypto test options enabled. It happens roughly every
other boot. Reverting the patch fixes the problem. Bisect log (from
crypto/master) is attached.

Guenter

---
# bad: [eda69b0c06bc615f4b055d560ed19001619e611a] crypto: testmgr - add panic_on_fail module parameter
# good: [9e98c678c2d6ae3a17cb2de55d17f69dddaa231b] Linux 5.1-rc1
git bisect start 'HEAD' 'v5.1-rc1'
# good: [52c899ec472e88e33c31c33bea844217c0963a05] crypto: ccp - Make ccp_register_rsa_alg static
git bisect good 52c899ec472e88e33c31c33bea844217c0963a05
# good: [6a4d1b18ef00a7b182740b7b4d8a0fcd317368f8] crypto: scompress - return proper error code for allocation failure
git bisect good 6a4d1b18ef00a7b182740b7b4d8a0fcd317368f8
# bad: [307508d1072979f4435416f87936f87eaeb82054] crypto: crct10dif-generic - fix use via crypto_shash_digest()
git bisect bad 307508d1072979f4435416f87936f87eaeb82054
# bad: [8316da02e3e07b0da9b2d812a619b5513c7f59d2] crypto: ccp - Use kmemdup in ccp_copy_and_save_keypart()
git bisect bad 8316da02e3e07b0da9b2d812a619b5513c7f59d2
# bad: [61abc356bf310d346d2d469cb009f6d4334f34de] crypto: aes - Use ___cacheline_aligned for aes data
git bisect bad 61abc356bf310d346d2d469cb009f6d4334f34de
# bad: [71052dcf4be70be4077817297dcde7b155e745f2] crypto: scompress - Use per-CPU struct instead multiple variables
git bisect bad 71052dcf4be70be4077817297dcde7b155e745f2
# first bad commit: [71052dcf4be70be4077817297dcde7b155e745f2] crypto: scompress - Use per-CPU struct instead multiple variables
Eric Biggers April 11, 2019, 4:39 a.m. UTC | #2
On Wed, Apr 10, 2019 at 09:07:35PM -0700, Guenter Roeck wrote:
> Hi Sebastian,
> 
> On Fri, Mar 29, 2019 at 02:09:56PM +0100, Sebastian Andrzej Siewior wrote:
> > […]
> 
> Unfortunately, this patch causes random crashes.
> 
> [crash log snipped; identical to the report above]
> 
> This is seen with an arm64 image running on qemu with machine xlnx-zcu102
> and two CPUs, and crypto test options enabled. It happens roughly every
> other boot. Reverting the patch fixes the problem. Bisect log (from
> crypto/master) is attached.
> 
> Guenter

Well, from a quick read of the patch, it's probably because it uses
raw_cpu_ptr() instead of per_cpu_ptr() when allocating/freeing the buffers, so
they are unlikely to actually be allocated for all CPUs.

Also Sebastian, I don't understand the point of the spinlock.  What was wrong
with get_cpu()?  Note that taking the spinlock disables preemption too...  Also
now it's possible to spin for a long time on the spinlock, for no reason really.

- Eric
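
A sketch of the failure mode Eric describes (illustrative, not from the
thread): raw_cpu_ptr() always resolves to the instance of the CPU
executing the code, so inside a for_each_possible_cpu() loop it touches
the same single instance on every iteration, while per_cpu_ptr() selects
the instance of the iterated CPU:

	int i;

	for_each_possible_cpu(i) {
		/* buggy: ignores i, always this CPU's scratch struct */
		scratch = raw_cpu_ptr(&scomp_scratch);

		/* intended: the scratch struct belonging to CPU i */
		scratch = per_cpu_ptr(&scomp_scratch, i);
	}

With the buggy form, only the CPU that ran the allocation loop ends up
with src/dst buffers; any other CPU later takes its lock, finds NULL
pointers, and scatterwalk_map_and_copy() faults, which matches the
NULL-pointer oops above.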
Sebastian Andrzej Siewior April 12, 2019, 8:42 a.m. UTC | #3
On 2019-04-10 21:07:35 [-0700], Guenter Roeck wrote:
> Hi Sebastian,
Hi Guenter,

> Unfortunately, this patch causes random crashes.
…
> This is seen with an arm64 image running on qemu with machine xlnx-zcu102
> and two CPUs, and crypto test options enabled. It happens roughly every
> other boot. Reverting the patch fixes the problem. Bisect log (from
> crypto/master) is attached.

What exactly are you doing? I tested this with tcrypt on x86-64 before
posting. Let me check it again with the -next tree on qemu…

> Guenter

Sebastian
Sebastian Andrzej Siewior April 12, 2019, 8:54 a.m. UTC | #4
On 2019-04-10 21:39:34 [-0700], Eric Biggers wrote:
> Well, from a quick read of the patch, it's probably because it uses
> raw_cpu_ptr() instead of per_cpu_ptr() when allocating/freeing the buffers, so
> they are unlikely to actually be allocated for all CPUs.

Memory is allocated in crypto_scomp_alloc_scratches(), via
for_each_possible_cpu(), before the algorithm is first used. So all
CPUs should have it.

> Also Sebastian, I don't understand the point of the spinlock.  What was wrong
> with get_cpu()?  Note that taking the spinlock disables preemption too...  Also
> now it's possible to spin for a long time on the spinlock, for no reason really.

The long spin is only possible on a PREEMPT kernel, if the task is
migrated to another CPU between getting the pointer and acquiring the
lock. Should this really become a problem I could try to think of
something.
The spin_lock gives us lock annotation and a lock context. With
preempt_disable() it is just a BKL-like thing with no context. And this
basically kills PREEMPT_RT.

> - Eric

Sebastian
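
For reference, the two patterns being debated, condensed from the
removed and added lines of the diff below:

	/* before: exclusion by disabling preemption; get_cpu() is
	 * preempt_disable() plus smp_processor_id(). Invisible to
	 * lockdep, and a non-preemptible section, which hurts
	 * PREEMPT_RT. */
	const int cpu = get_cpu();
	scratch_src = *per_cpu_ptr(scomp_src_scratches, cpu);
	/* ... */
	put_cpu();

	/* after: a real lock object. Lockdep can track it, and on
	 * PREEMPT_RT spinlock_t becomes a sleeping lock instead of
	 * disabling preemption. */
	scratch = raw_cpu_ptr(&scomp_scratch);
	spin_lock(&scratch->lock);
	/* ... */
	spin_unlock(&scratch->lock);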
Guenter Roeck April 12, 2019, 1:43 p.m. UTC | #5
On 4/12/19 1:42 AM, Sebastian Andrzej Siewior wrote:
> On 2019-04-10 21:07:35 [-0700], Guenter Roeck wrote:
>> Hi Sebastian,
> Hi Guenter,
> 
>> Unfortunately, this patch causes random crashes.
> …
>> This is seen with an arm64 image running on qemu with machine xlnx-zcu102
>> and two CPUs, and crypto test options enabled. It happens roughly every
>> other boot. Reverting the patch fixes the problem. Bisect log (from
>> crypto/master) is attached.
> 
> What is it that you are doing? I had it tcrypt tested on x86-64 before
> posting. Let me check it again with the next tree on qemu…
> 

Nothing special. Booting arm64:defconfig+various debug and test options in qemu.
It does not seem to depend on specific boot options. Crash failure rate, based
on the last three days of testing, seems to be roughly 20%. The problem is only
seen if there is more than one CPU. I have seen it with 2, 4, and 6 CPUs active,
but it seems to be most likely to happen with 2 CPUs. I don't have enough data
yet to be sure, though.

The problem is only seen with arm64, at least in my tests.

Guenter
Guenter Roeck April 12, 2019, 1:45 p.m. UTC | #6
On 4/12/19 1:54 AM, Sebastian Andrzej Siewior wrote:
> On 2019-04-10 21:39:34 [-0700], Eric Biggers wrote:
>> Well, from a quick read of the patch, it's probably because it uses
>> raw_cpu_ptr() instead of per_cpu_ptr() when allocating/freeing the buffers, so
>> they are unlikely to actually be allocated for all CPUs.
> 
> Memory is allocated in crypto_scomp_alloc_scratches(), via
> for_each_possible_cpu(), before the algorithm is first used. So all
> CPUs should have it.
> 
>> Also Sebastian, I don't understand the point of the spinlock.  What was wrong
>> with get_cpu()?  Note that taking the spinlock disables preemption too...  Also
>> now it's possible to spin for a long time on the spinlock, for no reason really.
> 
> The long spin is only possible on a PREEMPT kernel, if the task is
> migrated to another CPU between getting the pointer and acquiring the
> lock. Should this really become a problem I could try to think of
> something.
> The spin_lock gives us lock annotation and a lock context. With
> preempt_disable() it is just a BKL-like thing with no context. And this
> basically kills PREEMPT_RT.
> 

arm64:defconfig:

CONFIG_PREEMPT=y
CONFIG_PREEMPT_COUNT=y
CONFIG_PREEMPT_RCU=y
CONFIG_PREEMPT_NOTIFIERS=y
# CONFIG_DEBUG_PREEMPT is not set

Hmm ... maybe I should add specific PREEMPT tests to my suite.

Guenter
Sebastian Andrzej Siewior April 12, 2019, 1:50 p.m. UTC | #7
On 2019-04-12 06:43:31 [-0700], Guenter Roeck wrote:
> On 4/12/19 1:42 AM, Sebastian Andrzej Siewior wrote:
> > On 2019-04-10 21:07:35 [-0700], Guenter Roeck wrote:
> > > Hi Sebastian,
> > Hi Guenter,
> > 
> > > Unfortunately, this patch causes random crashes.
> > …
> > > This is seen with an arm64 image running on qemu with machine xlnx-zcu102
> > > and two CPUs, and crypto test options enabled. It happens roughly every
> > > other boot. Reverting the patch fixes the problem. Bisect log (from
> > > crypto/master) is attached.
> > 
> > What is it that you are doing? I had it tcrypt tested on x86-64 before
> > posting. Let me check it again with the next tree on qemu…
> > 
> 
> Nothing special. Booting arm64:defconfig+various debug and test options in qemu.
> It does not seem to depend on specific boot options. Crash failure rate, based
> on the last three days of testing, seems to be roughly 20%. The problem is only
> seen if there is more than one CPU. I have seen it with 2, 4, and 6 CPUs active,
> but it seems to be most likely to happen with 2 CPUs. I don't have enough data
> yet to be sure, though.
> 
> The problem is only seen with arm64, at least in my tests.

I found your test thingy and I managed to run it here. I started 
	rootfs/arm64/run-qemu-arm64.sh

a few times (only with the xlnx-zcu102 targets) and after the third
iteration it exploded once.
So I have this, let me try to dig further.

> Guenter

Sebastian
Sebastian Andrzej Siewior April 12, 2019, 2:29 p.m. UTC | #8
On 2019-04-12 15:50:56 [+0200], To Guenter Roeck wrote:
> So I have this, let me try to dig further.

I'm such a moron. Patch is coming soon…

> > Guenter
> 
Sebastian
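
The follow-up patch is not part of this page. Given Eric's diagnosis
above, it presumably changes the allocation and free loops to use
per_cpu_ptr() with the loop index, so that every possible CPU really
gets its buffers. A sketch of that assumed change to
crypto_scomp_alloc_scratches():

	static int crypto_scomp_alloc_scratches(void)
	{
		struct scomp_scratch *scratch;
		int i;

		for_each_possible_cpu(i) {
			void *mem;

			/* per_cpu_ptr(..., i): CPU i's instance, not the
			 * instance of the CPU running this loop */
			scratch = per_cpu_ptr(&scomp_scratch, i);

			mem = vmalloc_node(SCOMP_SCRATCH_SIZE, cpu_to_node(i));
			if (!mem)
				goto error;
			scratch->src = mem;
			mem = vmalloc_node(SCOMP_SCRATCH_SIZE, cpu_to_node(i));
			if (!mem)
				goto error;
			scratch->dst = mem;
		}
		return 0;
	error:
		crypto_scomp_free_scratches();
		return -ENOMEM;
	}

crypto_scomp_free_scratches() would need the same raw_cpu_ptr() to
per_cpu_ptr() change.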
Patch

diff --git a/crypto/scompress.c b/crypto/scompress.c
index aea1a8e5d1954..da31f6fe1f833 100644
--- a/crypto/scompress.c
+++ b/crypto/scompress.c
@@ -29,9 +29,17 @@ 
 #include <crypto/internal/scompress.h>
 #include "internal.h"
 
+struct scomp_scratch {
+	spinlock_t	lock;
+	void		*src;
+	void		*dst;
+};
+
+static DEFINE_PER_CPU(struct scomp_scratch, scomp_scratch) = {
+	.lock = __SPIN_LOCK_UNLOCKED(scomp_scratch.lock),
+};
+
 static const struct crypto_type crypto_scomp_type;
-static void * __percpu *scomp_src_scratches;
-static void * __percpu *scomp_dst_scratches;
 static int scomp_scratch_users;
 static DEFINE_MUTEX(scomp_lock);
 
@@ -62,76 +70,53 @@  static void crypto_scomp_show(struct seq_file *m, struct crypto_alg *alg)
 	seq_puts(m, "type         : scomp\n");
 }
 
-static void crypto_scomp_free_scratches(void * __percpu *scratches)
+static void crypto_scomp_free_scratches(void)
 {
+	struct scomp_scratch *scratch;
 	int i;
 
-	if (!scratches)
-		return;
-
-	for_each_possible_cpu(i)
-		vfree(*per_cpu_ptr(scratches, i));
-
-	free_percpu(scratches);
-}
-
-static void * __percpu *crypto_scomp_alloc_scratches(void)
-{
-	void * __percpu *scratches;
-	int i;
-
-	scratches = alloc_percpu(void *);
-	if (!scratches)
-		return NULL;
-
 	for_each_possible_cpu(i) {
-		void *scratch;
+		scratch = raw_cpu_ptr(&scomp_scratch);
 
-		scratch = vmalloc_node(SCOMP_SCRATCH_SIZE, cpu_to_node(i));
-		if (!scratch)
+		vfree(scratch->src);
+		vfree(scratch->dst);
+		scratch->src = NULL;
+		scratch->dst = NULL;
+	}
+}
+
+static int crypto_scomp_alloc_scratches(void)
+{
+	struct scomp_scratch *scratch;
+	int i;
+
+	for_each_possible_cpu(i) {
+		void *mem;
+
+		scratch = raw_cpu_ptr(&scomp_scratch);
+
+		mem = vmalloc_node(SCOMP_SCRATCH_SIZE, cpu_to_node(i));
+		if (!mem)
 			goto error;
-		*per_cpu_ptr(scratches, i) = scratch;
-	}
-
-	return scratches;
-
-error:
-	crypto_scomp_free_scratches(scratches);
-	return NULL;
-}
-
-static void crypto_scomp_free_all_scratches(void)
-{
-	if (!--scomp_scratch_users) {
-		crypto_scomp_free_scratches(scomp_src_scratches);
-		crypto_scomp_free_scratches(scomp_dst_scratches);
-		scomp_src_scratches = NULL;
-		scomp_dst_scratches = NULL;
-	}
-}
-
-static int crypto_scomp_alloc_all_scratches(void)
-{
-	if (!scomp_scratch_users++) {
-		scomp_src_scratches = crypto_scomp_alloc_scratches();
-		if (!scomp_src_scratches)
-			return -ENOMEM;
-		scomp_dst_scratches = crypto_scomp_alloc_scratches();
-		if (!scomp_dst_scratches) {
-			crypto_scomp_free_scratches(scomp_src_scratches);
-			scomp_src_scratches = NULL;
-			return -ENOMEM;
-		}
+		scratch->src = mem;
+		mem = vmalloc_node(SCOMP_SCRATCH_SIZE, cpu_to_node(i));
+		if (!mem)
+			goto error;
+		scratch->dst = mem;
 	}
 	return 0;
+error:
+	crypto_scomp_free_scratches();
+	return -ENOMEM;
 }
 
 static int crypto_scomp_init_tfm(struct crypto_tfm *tfm)
 {
-	int ret;
+	int ret = 0;
 
 	mutex_lock(&scomp_lock);
-	ret = crypto_scomp_alloc_all_scratches();
+	if (!scomp_scratch_users++)
+		ret = crypto_scomp_alloc_scratches();
 	mutex_unlock(&scomp_lock);
 
 	return ret;
@@ -143,31 +128,28 @@  static int scomp_acomp_comp_decomp(struct acomp_req *req, int dir)
 	void **tfm_ctx = acomp_tfm_ctx(tfm);
 	struct crypto_scomp *scomp = *tfm_ctx;
 	void **ctx = acomp_request_ctx(req);
-	const int cpu = get_cpu();
-	u8 *scratch_src = *per_cpu_ptr(scomp_src_scratches, cpu);
-	u8 *scratch_dst = *per_cpu_ptr(scomp_dst_scratches, cpu);
+	struct scomp_scratch *scratch;
 	int ret;
 
-	if (!req->src || !req->slen || req->slen > SCOMP_SCRATCH_SIZE) {
-		ret = -EINVAL;
-		goto out;
-	}
+	if (!req->src || !req->slen || req->slen > SCOMP_SCRATCH_SIZE)
+		return -EINVAL;
 
-	if (req->dst && !req->dlen) {
-		ret = -EINVAL;
-		goto out;
-	}
+	if (req->dst && !req->dlen)
+		return -EINVAL;
 
 	if (!req->dlen || req->dlen > SCOMP_SCRATCH_SIZE)
 		req->dlen = SCOMP_SCRATCH_SIZE;
 
-	scatterwalk_map_and_copy(scratch_src, req->src, 0, req->slen, 0);
+	scratch = raw_cpu_ptr(&scomp_scratch);
+	spin_lock(&scratch->lock);
+
+	scatterwalk_map_and_copy(scratch->src, req->src, 0, req->slen, 0);
 	if (dir)
-		ret = crypto_scomp_compress(scomp, scratch_src, req->slen,
-					    scratch_dst, &req->dlen, *ctx);
+		ret = crypto_scomp_compress(scomp, scratch->src, req->slen,
+					    scratch->dst, &req->dlen, *ctx);
 	else
-		ret = crypto_scomp_decompress(scomp, scratch_src, req->slen,
-					      scratch_dst, &req->dlen, *ctx);
+		ret = crypto_scomp_decompress(scomp, scratch->src, req->slen,
+					      scratch->dst, &req->dlen, *ctx);
 	if (!ret) {
 		if (!req->dst) {
 			req->dst = sgl_alloc(req->dlen, GFP_ATOMIC, NULL);
@@ -176,11 +158,11 @@  static int scomp_acomp_comp_decomp(struct acomp_req *req, int dir)
 				goto out;
 			}
 		}
-		scatterwalk_map_and_copy(scratch_dst, req->dst, 0, req->dlen,
+		scatterwalk_map_and_copy(scratch->dst, req->dst, 0, req->dlen,
 					 1);
 	}
 out:
-	put_cpu();
+	spin_unlock(&scratch->lock);
 	return ret;
 }
 
@@ -201,7 +183,8 @@  static void crypto_exit_scomp_ops_async(struct crypto_tfm *tfm)
 	crypto_free_scomp(*ctx);
 
 	mutex_lock(&scomp_lock);
-	crypto_scomp_free_all_scratches();
+	if (!--scomp_scratch_users)
+		crypto_scomp_free_scratches();
 	mutex_unlock(&scomp_lock);
 }