
[v2,6/6] mm/zswap: directly use percpu mutex and buffer in load/store

Message ID 20231213-zswap-dstmem-v2-6-daa5d9ae41a7@bytedance.com (mailing list archive)
State New
Series mm/zswap: dstmem reuse optimizations and cleanups

Commit Message

Chengming Zhou Dec. 18, 2023, 8:22 a.m. UTC
Since the introduction of dstmem reuse in the load path, it has become
confusing that acomp_ctx->dstmem and acomp_ctx->mutex are now used for
purposes other than what their names suggest.

Yosry suggested removing these two fields from acomp_ctx, using
zswap_dstmem and zswap_mutex directly in both the load and store paths,
renaming them, and adding proper comments above their definitions to
note that they are generic percpu buffers for the load and store paths.

So this patch removes dstmem and mutex from acomp_ctx, renames
zswap_dstmem to zswap_buffer, and uses the percpu mutex and buffer
directly in the load and store paths. It also refactors out
__zswap_store() to cover only the compress and store steps, since I
found zswap_store() too long.
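
For reference, both paths end up using the same pattern; here is a
minimal sketch (using the names from the diff below, not the literal
patch code):

        int cpu = raw_smp_processor_id();
        struct mutex *mutex = per_cpu(zswap_mutex, cpu);
        u8 *buf;

        mutex_lock(mutex);
        buf = per_cpu(zswap_buffer, cpu);  /* PAGE_SIZE percpu buffer */
        /* store: compress the page into buf, then copy buf into the zpool */
        /* load: bounce the zpool object through buf when the mapping can't sleep */
        mutex_unlock(mutex);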

Suggested-by: Yosry Ahmed <yosryahmed@google.com>
Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
---
 mm/zswap.c | 193 +++++++++++++++++++++++++++++++++----------------------------
 1 file changed, 104 insertions(+), 89 deletions(-)

Comments

Yosry Ahmed Dec. 18, 2023, 9:37 a.m. UTC | #1
On Mon, Dec 18, 2023 at 12:22 AM Chengming Zhou
<zhouchengming@bytedance.com> wrote:
>
> Since the introduction of dstmem reuse in the load path, it has become
> confusing that acomp_ctx->dstmem and acomp_ctx->mutex are now used for
> purposes other than what their names suggest.
>
> Yosry suggested removing these two fields from acomp_ctx, using
> zswap_dstmem and zswap_mutex directly in both the load and store paths,
> renaming them, and adding proper comments above their definitions to
> note that they are generic percpu buffers for the load and store paths.
>
> So this patch removes dstmem and mutex from acomp_ctx, renames
> zswap_dstmem to zswap_buffer, and uses the percpu mutex and buffer
> directly in the load and store paths. It also refactors out
> __zswap_store() to cover only the compress and store steps, since I
> found zswap_store() too long.

I am not sure refactoring out __zswap_store() is useful, to be honest,
but I am not objecting to it; it mirrors __zswap_load() in a sense.
However, if you want to do so, please do it in a separate patch from
renaming the percpu buffers and mutex. This will make reviewing easier
(and make my Suggested-by correctly scoped).

Also, any reason why raw_smp_processor_id() is used here instead of
smp_processor_id()?

>
> Suggested-by: Yosry Ahmed <yosryahmed@google.com>
> Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
> ---
>  mm/zswap.c | 193 +++++++++++++++++++++++++++++++++----------------------------
>  1 file changed, 104 insertions(+), 89 deletions(-)
>
> diff --git a/mm/zswap.c b/mm/zswap.c
> index 2c349fd88904..b7449294ec3a 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -166,8 +166,6 @@ struct crypto_acomp_ctx {
>         struct crypto_acomp *acomp;
>         struct acomp_req *req;
>         struct crypto_wait wait;
> -       u8 *dstmem;
> -       struct mutex *mutex;
>  };
>
>  /*
> @@ -694,7 +692,7 @@ static void zswap_alloc_shrinker(struct zswap_pool *pool)
>  /*********************************
>  * per-cpu code
>  **********************************/
> -static DEFINE_PER_CPU(u8 *, zswap_dstmem);
> +static DEFINE_PER_CPU(u8 *, zswap_buffer);
>  /*
>   * If users dynamically change the zpool type and compressor at runtime, i.e.
>   * zswap is running, zswap can have more than one zpool on one cpu, but they
> @@ -702,39 +700,39 @@ static DEFINE_PER_CPU(u8 *, zswap_dstmem);
>   */
>  static DEFINE_PER_CPU(struct mutex *, zswap_mutex);
>
> -static int zswap_dstmem_prepare(unsigned int cpu)
> +static int zswap_buffer_prepare(unsigned int cpu)
>  {
>         struct mutex *mutex;
> -       u8 *dst;
> +       u8 *buf;
>
> -       dst = kmalloc_node(PAGE_SIZE, GFP_KERNEL, cpu_to_node(cpu));
> -       if (!dst)
> +       buf = kmalloc_node(PAGE_SIZE, GFP_KERNEL, cpu_to_node(cpu));
> +       if (!buf)
>                 return -ENOMEM;
>
>         mutex = kmalloc_node(sizeof(*mutex), GFP_KERNEL, cpu_to_node(cpu));
>         if (!mutex) {
> -               kfree(dst);
> +               kfree(buf);
>                 return -ENOMEM;
>         }
>
>         mutex_init(mutex);
> -       per_cpu(zswap_dstmem, cpu) = dst;
> +       per_cpu(zswap_buffer, cpu) = buf;
>         per_cpu(zswap_mutex, cpu) = mutex;
>         return 0;
>  }
>
> -static int zswap_dstmem_dead(unsigned int cpu)
> +static int zswap_buffer_dead(unsigned int cpu)
>  {
>         struct mutex *mutex;
> -       u8 *dst;
> +       u8 *buf;
>
>         mutex = per_cpu(zswap_mutex, cpu);
>         kfree(mutex);
>         per_cpu(zswap_mutex, cpu) = NULL;
>
> -       dst = per_cpu(zswap_dstmem, cpu);
> -       kfree(dst);
> -       per_cpu(zswap_dstmem, cpu) = NULL;
> +       buf = per_cpu(zswap_buffer, cpu);
> +       kfree(buf);
> +       per_cpu(zswap_buffer, cpu) = NULL;
>
>         return 0;
>  }
> @@ -772,9 +770,6 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
>         acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
>                                    crypto_req_done, &acomp_ctx->wait);
>
> -       acomp_ctx->mutex = per_cpu(zswap_mutex, cpu);
> -       acomp_ctx->dstmem = per_cpu(zswap_dstmem, cpu);
> -
>         return 0;
>  }
>
> @@ -1392,20 +1387,98 @@ static int zswap_enabled_param_set(const char *val,
>         return ret;
>  }
>
> +static int __zswap_store(struct zswap_entry *entry, struct page *page)
> +{
> +       struct scatterlist input, output;
> +       struct crypto_acomp_ctx *acomp_ctx;
> +       struct zpool *zpool;
> +       unsigned long handle;
> +       unsigned int dlen;
> +       u8 *buf, *dst;
> +       gfp_t gfp;
> +       int ret;
> +       int cpu;
> +       struct mutex *mutex;
> +
> +       cpu = raw_smp_processor_id();
> +       mutex = per_cpu(zswap_mutex, cpu);
> +       mutex_lock(mutex);
> +
> +       acomp_ctx = per_cpu_ptr(entry->pool->acomp_ctx, cpu);
> +       buf = per_cpu(zswap_buffer, cpu);
> +
> +       sg_init_table(&input, 1);
> +       sg_set_page(&input, page, PAGE_SIZE, 0);
> +       sg_init_one(&output, buf, PAGE_SIZE);
> +       acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, PAGE_SIZE);
> +       /*
> +        * it maybe looks a little bit silly that we send an asynchronous request,
> +        * then wait for its completion synchronously. This makes the process look
> +        * synchronous in fact.
> +        * Theoretically, acomp supports users send multiple acomp requests in one
> +        * acomp instance, then get those requests done simultaneously. but in this
> +        * case, zswap actually does store and load page by page, there is no
> +        * existing method to send the second page before the first page is done
> +        * in one thread doing zwap.
> +        * but in different threads running on different cpu, we have different
> +        * acomp instance, so multiple threads can do (de)compression in parallel.
> +        */
> +       ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait);
> +       dlen = acomp_ctx->req->dlen;
> +
> +       if (ret) {
> +               zswap_reject_compress_fail++;
> +               goto unlock;
> +       }
> +
> +       /* store */
> +       zpool = zswap_find_zpool(entry);
> +       gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
> +       if (zpool_malloc_support_movable(zpool))
> +               gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;
> +       ret = zpool_malloc(zpool, dlen, gfp, &handle);
> +       if (ret == -ENOSPC) {
> +               zswap_reject_compress_poor++;
> +               goto unlock;
> +       }
> +       if (ret) {
> +               zswap_reject_alloc_fail++;
> +               goto unlock;
> +       }
> +       dst = zpool_map_handle(zpool, handle, ZPOOL_MM_WO);
> +       memcpy(dst, buf, dlen);
> +       zpool_unmap_handle(zpool, handle);
> +       mutex_unlock(mutex);
> +
> +       entry->handle = handle;
> +       entry->length = dlen;
> +       return 0;
> +
> +unlock:
> +       mutex_unlock(mutex);
> +       return ret;
> +}
> +
>  static void __zswap_load(struct zswap_entry *entry, struct page *page)
>  {
>         struct zpool *zpool = zswap_find_zpool(entry);
>         struct scatterlist input, output;
>         struct crypto_acomp_ctx *acomp_ctx;
> -       u8 *src;
> +       u8 *src, *buf;
> +       int cpu;
> +       struct mutex *mutex;
> +
> +       cpu = raw_smp_processor_id();
> +       mutex = per_cpu(zswap_mutex, cpu);
> +       mutex_lock(mutex);
>
> -       acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
> -       mutex_lock(acomp_ctx->mutex);
> +       acomp_ctx = per_cpu_ptr(entry->pool->acomp_ctx, cpu);
>
>         src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO);
>         if (!zpool_can_sleep_mapped(zpool)) {
> -               memcpy(acomp_ctx->dstmem, src, entry->length);
> -               src = acomp_ctx->dstmem;
> +               buf = per_cpu(zswap_buffer, cpu);
> +               memcpy(buf, src, entry->length);
> +               src = buf;
>                 zpool_unmap_handle(zpool, entry->handle);
>         }
>
> @@ -1415,7 +1488,7 @@ static void __zswap_load(struct zswap_entry *entry, struct page *page)
>         acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, PAGE_SIZE);
>         BUG_ON(crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait));
>         BUG_ON(acomp_ctx->req->dlen != PAGE_SIZE);
> -       mutex_unlock(acomp_ctx->mutex);
> +       mutex_unlock(mutex);
>
>         if (zpool_can_sleep_mapped(zpool))
>                 zpool_unmap_handle(zpool, entry->handle);
> @@ -1539,18 +1612,11 @@ bool zswap_store(struct folio *folio)
>         struct page *page = &folio->page;
>         struct zswap_tree *tree = zswap_trees[type];
>         struct zswap_entry *entry, *dupentry;
> -       struct scatterlist input, output;
> -       struct crypto_acomp_ctx *acomp_ctx;
>         struct obj_cgroup *objcg = NULL;
>         struct mem_cgroup *memcg = NULL;
>         struct zswap_pool *pool;
> -       struct zpool *zpool;
> -       unsigned int dlen = PAGE_SIZE;
> -       unsigned long handle, value;
> -       char *buf;
> -       u8 *src, *dst;
> -       gfp_t gfp;
> -       int ret;
> +       u8 *src;
> +       unsigned long value;
>
>         VM_WARN_ON_ONCE(!folio_test_locked(folio));
>         VM_WARN_ON_ONCE(!folio_test_swapcache(folio));
> @@ -1635,60 +1701,11 @@ bool zswap_store(struct folio *folio)
>                 mem_cgroup_put(memcg);
>         }
>
> -       /* compress */
> -       acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
> -
> -       mutex_lock(acomp_ctx->mutex);
> -
> -       dst = acomp_ctx->dstmem;
> -       sg_init_table(&input, 1);
> -       sg_set_page(&input, page, PAGE_SIZE, 0);
> -
> -       sg_init_one(&output, dst, PAGE_SIZE);
> -       acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, dlen);
> -       /*
> -        * it maybe looks a little bit silly that we send an asynchronous request,
> -        * then wait for its completion synchronously. This makes the process look
> -        * synchronous in fact.
> -        * Theoretically, acomp supports users send multiple acomp requests in one
> -        * acomp instance, then get those requests done simultaneously. but in this
> -        * case, zswap actually does store and load page by page, there is no
> -        * existing method to send the second page before the first page is done
> -        * in one thread doing zwap.
> -        * but in different threads running on different cpu, we have different
> -        * acomp instance, so multiple threads can do (de)compression in parallel.
> -        */
> -       ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait);
> -       dlen = acomp_ctx->req->dlen;
> -
> -       if (ret) {
> -               zswap_reject_compress_fail++;
> -               goto put_dstmem;
> -       }
> -
> -       /* store */
> -       zpool = zswap_find_zpool(entry);
> -       gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
> -       if (zpool_malloc_support_movable(zpool))
> -               gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;
> -       ret = zpool_malloc(zpool, dlen, gfp, &handle);
> -       if (ret == -ENOSPC) {
> -               zswap_reject_compress_poor++;
> -               goto put_dstmem;
> -       }
> -       if (ret) {
> -               zswap_reject_alloc_fail++;
> -               goto put_dstmem;
> -       }
> -       buf = zpool_map_handle(zpool, handle, ZPOOL_MM_WO);
> -       memcpy(buf, dst, dlen);
> -       zpool_unmap_handle(zpool, handle);
> -       mutex_unlock(acomp_ctx->mutex);
> +       if (__zswap_store(entry, page))
> +               goto put_pool;
>
>         /* populate entry */
>         entry->swpentry = swp_entry(type, offset);
> -       entry->handle = handle;
> -       entry->length = dlen;
>
>  insert_entry:
>         entry->objcg = objcg;
> @@ -1725,8 +1742,6 @@ bool zswap_store(struct folio *folio)
>
>         return true;
>
> -put_dstmem:
> -       mutex_unlock(acomp_ctx->mutex);
>  put_pool:
>         zswap_pool_put(entry->pool);
>  freepage:
> @@ -1902,10 +1917,10 @@ static int zswap_setup(void)
>         }
>
>         ret = cpuhp_setup_state(CPUHP_MM_ZSWP_MEM_PREPARE, "mm/zswap:prepare",
> -                               zswap_dstmem_prepare, zswap_dstmem_dead);
> +                               zswap_buffer_prepare, zswap_buffer_dead);
>         if (ret) {
> -               pr_err("dstmem alloc failed\n");
> -               goto dstmem_fail;
> +               pr_err("buffer alloc failed\n");
> +               goto buffer_fail;
>         }
>
>         ret = cpuhp_setup_state_multi(CPUHP_MM_ZSWP_POOL_PREPARE,
> @@ -1940,7 +1955,7 @@ static int zswap_setup(void)
>                 zswap_pool_destroy(pool);
>  hp_fail:
>         cpuhp_remove_state(CPUHP_MM_ZSWP_MEM_PREPARE);
> -dstmem_fail:
> +buffer_fail:
>         kmem_cache_destroy(zswap_entry_cache);
>  cache_fail:
>         /* if built-in, we aren't unloaded on failure; don't allow use */
>
> --
> b4 0.10.1
Chengming Zhou Dec. 18, 2023, 11:26 a.m. UTC | #2
On 2023/12/18 17:37, Yosry Ahmed wrote:
> On Mon, Dec 18, 2023 at 12:22 AM Chengming Zhou
> <zhouchengming@bytedance.com> wrote:
>>
>> Since the introduction of dstmem reuse in the load path, it has become
>> confusing that acomp_ctx->dstmem and acomp_ctx->mutex are now used for
>> purposes other than what their names suggest.
>>
>> Yosry suggested removing these two fields from acomp_ctx, using
>> zswap_dstmem and zswap_mutex directly in both the load and store paths,
>> renaming them, and adding proper comments above their definitions to
>> note that they are generic percpu buffers for the load and store paths.
>>
>> So this patch removes dstmem and mutex from acomp_ctx, renames
>> zswap_dstmem to zswap_buffer, and uses the percpu mutex and buffer
>> directly in the load and store paths. It also refactors out
>> __zswap_store() to cover only the compress and store steps, since I
>> found zswap_store() too long.
> 
> I am not sure refactoring out __zswap_store() is useful, to be honest,
> but I am not objecting to it; it mirrors __zswap_load() in a sense.

Yes, it mirrors __zswap_load() and only includes the compress and store
steps. It also makes it easier for me to concentrate only on
__zswap_store/load() when renaming the percpu buffers and mutex. But if
anyone objects, I can drop it.

> However, if you want to do so, please do it in a separate patch from
> renaming the percpu buffers and mutex. This will make reviewing easier
> (and make my Suggested-by correctly scoped).

Right, will do.

> 
> Also, any reason why raw_smp_processor_id() is used here instead of
> smp_processor_id()?
> 

Here we don't need the CPU id to be stable, since we only need to pick
one CPU's buffer and mutex, and the mutex serializes access.

And as the comments below from <include/linux/smp.h> show, a WARN would
trigger if we used smp_processor_id() here without disabling preemption
or taking one of the other listed precautions.

 * The CPU id is stable when:
 *
 *  - IRQs are disabled;
 *  - preemption is disabled;
 *  - the task is CPU affine.

 * When CONFIG_DEBUG_PREEMPT; we verify these assumption and WARN
 * when smp_processor_id() is used when the CPU id is not stable.

Thanks!
Chengming Zhou Dec. 18, 2023, 11:54 a.m. UTC | #3
On 2023/12/18 17:37, Yosry Ahmed wrote:
> On Mon, Dec 18, 2023 at 12:22 AM Chengming Zhou
> <zhouchengming@bytedance.com> wrote:
>>
>> Since the introduction of dstmem reuse in the load path, it has become
>> confusing that acomp_ctx->dstmem and acomp_ctx->mutex are now used for
>> purposes other than what their names suggest.
>>
>> Yosry suggested removing these two fields from acomp_ctx, using
>> zswap_dstmem and zswap_mutex directly in both the load and store paths,
>> renaming them, and adding proper comments above their definitions to
>> note that they are generic percpu buffers for the load and store paths.
>>
>> So this patch removes dstmem and mutex from acomp_ctx, renames
>> zswap_dstmem to zswap_buffer, and uses the percpu mutex and buffer
>> directly in the load and store paths. It also refactors out
>> __zswap_store() to cover only the compress and store steps, since I
>> found zswap_store() too long.
> 
> I am not sure refactoring out __zswap_store() is useful, to be honest,
> but I am not objecting to it; it mirrors __zswap_load() in a sense.
> However, if you want to do so, please do it in a separate patch from
> renaming the percpu buffers and mutex. This will make reviewing easier
> (and make my Suggested-by correctly scoped).
> 

After thinking it over again, I have dropped the __zswap_store() part.
Right, it's not that useful.

Thanks.

Patch

diff --git a/mm/zswap.c b/mm/zswap.c
index 2c349fd88904..b7449294ec3a 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -166,8 +166,6 @@  struct crypto_acomp_ctx {
 	struct crypto_acomp *acomp;
 	struct acomp_req *req;
 	struct crypto_wait wait;
-	u8 *dstmem;
-	struct mutex *mutex;
 };
 
 /*
@@ -694,7 +692,7 @@  static void zswap_alloc_shrinker(struct zswap_pool *pool)
 /*********************************
 * per-cpu code
 **********************************/
-static DEFINE_PER_CPU(u8 *, zswap_dstmem);
+static DEFINE_PER_CPU(u8 *, zswap_buffer);
 /*
  * If users dynamically change the zpool type and compressor at runtime, i.e.
  * zswap is running, zswap can have more than one zpool on one cpu, but they
@@ -702,39 +700,39 @@  static DEFINE_PER_CPU(u8 *, zswap_dstmem);
  */
 static DEFINE_PER_CPU(struct mutex *, zswap_mutex);
 
-static int zswap_dstmem_prepare(unsigned int cpu)
+static int zswap_buffer_prepare(unsigned int cpu)
 {
 	struct mutex *mutex;
-	u8 *dst;
+	u8 *buf;
 
-	dst = kmalloc_node(PAGE_SIZE, GFP_KERNEL, cpu_to_node(cpu));
-	if (!dst)
+	buf = kmalloc_node(PAGE_SIZE, GFP_KERNEL, cpu_to_node(cpu));
+	if (!buf)
 		return -ENOMEM;
 
 	mutex = kmalloc_node(sizeof(*mutex), GFP_KERNEL, cpu_to_node(cpu));
 	if (!mutex) {
-		kfree(dst);
+		kfree(buf);
 		return -ENOMEM;
 	}
 
 	mutex_init(mutex);
-	per_cpu(zswap_dstmem, cpu) = dst;
+	per_cpu(zswap_buffer, cpu) = buf;
 	per_cpu(zswap_mutex, cpu) = mutex;
 	return 0;
 }
 
-static int zswap_dstmem_dead(unsigned int cpu)
+static int zswap_buffer_dead(unsigned int cpu)
 {
 	struct mutex *mutex;
-	u8 *dst;
+	u8 *buf;
 
 	mutex = per_cpu(zswap_mutex, cpu);
 	kfree(mutex);
 	per_cpu(zswap_mutex, cpu) = NULL;
 
-	dst = per_cpu(zswap_dstmem, cpu);
-	kfree(dst);
-	per_cpu(zswap_dstmem, cpu) = NULL;
+	buf = per_cpu(zswap_buffer, cpu);
+	kfree(buf);
+	per_cpu(zswap_buffer, cpu) = NULL;
 
 	return 0;
 }
@@ -772,9 +770,6 @@  static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
 	acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
 				   crypto_req_done, &acomp_ctx->wait);
 
-	acomp_ctx->mutex = per_cpu(zswap_mutex, cpu);
-	acomp_ctx->dstmem = per_cpu(zswap_dstmem, cpu);
-
 	return 0;
 }
 
@@ -1392,20 +1387,98 @@  static int zswap_enabled_param_set(const char *val,
 	return ret;
 }
 
+static int __zswap_store(struct zswap_entry *entry, struct page *page)
+{
+	struct scatterlist input, output;
+	struct crypto_acomp_ctx *acomp_ctx;
+	struct zpool *zpool;
+	unsigned long handle;
+	unsigned int dlen;
+	u8 *buf, *dst;
+	gfp_t gfp;
+	int ret;
+	int cpu;
+	struct mutex *mutex;
+
+	cpu = raw_smp_processor_id();
+	mutex = per_cpu(zswap_mutex, cpu);
+	mutex_lock(mutex);
+
+	acomp_ctx = per_cpu_ptr(entry->pool->acomp_ctx, cpu);
+	buf = per_cpu(zswap_buffer, cpu);
+
+	sg_init_table(&input, 1);
+	sg_set_page(&input, page, PAGE_SIZE, 0);
+	sg_init_one(&output, buf, PAGE_SIZE);
+	acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, PAGE_SIZE);
+	/*
+	 * it maybe looks a little bit silly that we send an asynchronous request,
+	 * then wait for its completion synchronously. This makes the process look
+	 * synchronous in fact.
+	 * Theoretically, acomp supports users send multiple acomp requests in one
+	 * acomp instance, then get those requests done simultaneously. but in this
+	 * case, zswap actually does store and load page by page, there is no
+	 * existing method to send the second page before the first page is done
+	 * in one thread doing zwap.
+	 * but in different threads running on different cpu, we have different
+	 * acomp instance, so multiple threads can do (de)compression in parallel.
+	 */
+	ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait);
+	dlen = acomp_ctx->req->dlen;
+
+	if (ret) {
+		zswap_reject_compress_fail++;
+		goto unlock;
+	}
+
+	/* store */
+	zpool = zswap_find_zpool(entry);
+	gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
+	if (zpool_malloc_support_movable(zpool))
+		gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;
+	ret = zpool_malloc(zpool, dlen, gfp, &handle);
+	if (ret == -ENOSPC) {
+		zswap_reject_compress_poor++;
+		goto unlock;
+	}
+	if (ret) {
+		zswap_reject_alloc_fail++;
+		goto unlock;
+	}
+	dst = zpool_map_handle(zpool, handle, ZPOOL_MM_WO);
+	memcpy(dst, buf, dlen);
+	zpool_unmap_handle(zpool, handle);
+	mutex_unlock(mutex);
+
+	entry->handle = handle;
+	entry->length = dlen;
+	return 0;
+
+unlock:
+	mutex_unlock(mutex);
+	return ret;
+}
+
 static void __zswap_load(struct zswap_entry *entry, struct page *page)
 {
 	struct zpool *zpool = zswap_find_zpool(entry);
 	struct scatterlist input, output;
 	struct crypto_acomp_ctx *acomp_ctx;
-	u8 *src;
+	u8 *src, *buf;
+	int cpu;
+	struct mutex *mutex;
+
+	cpu = raw_smp_processor_id();
+	mutex = per_cpu(zswap_mutex, cpu);
+	mutex_lock(mutex);
 
-	acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
-	mutex_lock(acomp_ctx->mutex);
+	acomp_ctx = per_cpu_ptr(entry->pool->acomp_ctx, cpu);
 
 	src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO);
 	if (!zpool_can_sleep_mapped(zpool)) {
-		memcpy(acomp_ctx->dstmem, src, entry->length);
-		src = acomp_ctx->dstmem;
+		buf = per_cpu(zswap_buffer, cpu);
+		memcpy(buf, src, entry->length);
+		src = buf;
 		zpool_unmap_handle(zpool, entry->handle);
 	}
 
@@ -1415,7 +1488,7 @@  static void __zswap_load(struct zswap_entry *entry, struct page *page)
 	acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, PAGE_SIZE);
 	BUG_ON(crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait));
 	BUG_ON(acomp_ctx->req->dlen != PAGE_SIZE);
-	mutex_unlock(acomp_ctx->mutex);
+	mutex_unlock(mutex);
 
 	if (zpool_can_sleep_mapped(zpool))
 		zpool_unmap_handle(zpool, entry->handle);
@@ -1539,18 +1612,11 @@  bool zswap_store(struct folio *folio)
 	struct page *page = &folio->page;
 	struct zswap_tree *tree = zswap_trees[type];
 	struct zswap_entry *entry, *dupentry;
-	struct scatterlist input, output;
-	struct crypto_acomp_ctx *acomp_ctx;
 	struct obj_cgroup *objcg = NULL;
 	struct mem_cgroup *memcg = NULL;
 	struct zswap_pool *pool;
-	struct zpool *zpool;
-	unsigned int dlen = PAGE_SIZE;
-	unsigned long handle, value;
-	char *buf;
-	u8 *src, *dst;
-	gfp_t gfp;
-	int ret;
+	u8 *src;
+	unsigned long value;
 
 	VM_WARN_ON_ONCE(!folio_test_locked(folio));
 	VM_WARN_ON_ONCE(!folio_test_swapcache(folio));
@@ -1635,60 +1701,11 @@  bool zswap_store(struct folio *folio)
 		mem_cgroup_put(memcg);
 	}
 
-	/* compress */
-	acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
-
-	mutex_lock(acomp_ctx->mutex);
-
-	dst = acomp_ctx->dstmem;
-	sg_init_table(&input, 1);
-	sg_set_page(&input, page, PAGE_SIZE, 0);
-
-	sg_init_one(&output, dst, PAGE_SIZE);
-	acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, dlen);
-	/*
-	 * it maybe looks a little bit silly that we send an asynchronous request,
-	 * then wait for its completion synchronously. This makes the process look
-	 * synchronous in fact.
-	 * Theoretically, acomp supports users send multiple acomp requests in one
-	 * acomp instance, then get those requests done simultaneously. but in this
-	 * case, zswap actually does store and load page by page, there is no
-	 * existing method to send the second page before the first page is done
-	 * in one thread doing zwap.
-	 * but in different threads running on different cpu, we have different
-	 * acomp instance, so multiple threads can do (de)compression in parallel.
-	 */
-	ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait);
-	dlen = acomp_ctx->req->dlen;
-
-	if (ret) {
-		zswap_reject_compress_fail++;
-		goto put_dstmem;
-	}
-
-	/* store */
-	zpool = zswap_find_zpool(entry);
-	gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
-	if (zpool_malloc_support_movable(zpool))
-		gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;
-	ret = zpool_malloc(zpool, dlen, gfp, &handle);
-	if (ret == -ENOSPC) {
-		zswap_reject_compress_poor++;
-		goto put_dstmem;
-	}
-	if (ret) {
-		zswap_reject_alloc_fail++;
-		goto put_dstmem;
-	}
-	buf = zpool_map_handle(zpool, handle, ZPOOL_MM_WO);
-	memcpy(buf, dst, dlen);
-	zpool_unmap_handle(zpool, handle);
-	mutex_unlock(acomp_ctx->mutex);
+	if (__zswap_store(entry, page))
+		goto put_pool;
 
 	/* populate entry */
 	entry->swpentry = swp_entry(type, offset);
-	entry->handle = handle;
-	entry->length = dlen;
 
 insert_entry:
 	entry->objcg = objcg;
@@ -1725,8 +1742,6 @@  bool zswap_store(struct folio *folio)
 
 	return true;
 
-put_dstmem:
-	mutex_unlock(acomp_ctx->mutex);
 put_pool:
 	zswap_pool_put(entry->pool);
 freepage:
@@ -1902,10 +1917,10 @@  static int zswap_setup(void)
 	}
 
 	ret = cpuhp_setup_state(CPUHP_MM_ZSWP_MEM_PREPARE, "mm/zswap:prepare",
-				zswap_dstmem_prepare, zswap_dstmem_dead);
+				zswap_buffer_prepare, zswap_buffer_dead);
 	if (ret) {
-		pr_err("dstmem alloc failed\n");
-		goto dstmem_fail;
+		pr_err("buffer alloc failed\n");
+		goto buffer_fail;
 	}
 
 	ret = cpuhp_setup_state_multi(CPUHP_MM_ZSWP_POOL_PREPARE,
@@ -1940,7 +1955,7 @@  static int zswap_setup(void)
 		zswap_pool_destroy(pool);
 hp_fail:
 	cpuhp_remove_state(CPUHP_MM_ZSWP_MEM_PREPARE);
-dstmem_fail:
+buffer_fail:
 	kmem_cache_destroy(zswap_entry_cache);
 cache_fail:
 	/* if built-in, we aren't unloaded on failure; don't allow use */