
[1/5] mm/zswap: reuse dstmem when decompress

Message ID 20231213-zswap-dstmem-v1-1-896763369d04@bytedance.com (mailing list archive)
State New
Series mm/zswap: dstmem reuse optimizations and cleanups

Commit Message

Chengming Zhou Dec. 13, 2023, 4:17 a.m. UTC
In the !zpool_can_sleep_mapped() case, such as zsmalloc, we need to first
copy the entry->handle memory to a temporary buffer, which is allocated
using kmalloc.

Obviously we can reuse the per-compressor dstmem to avoid allocating it
every time, since it's percpu memory and protected by a mutex.

Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
Reviewed-by: Nhat Pham <nphamcs@gmail.com>
---
 mm/zswap.c | 29 +++++++++--------------------
 1 file changed, 9 insertions(+), 20 deletions(-)

Comments

Yosry Ahmed Dec. 13, 2023, 11:24 p.m. UTC | #1
On Tue, Dec 12, 2023 at 8:18 PM Chengming Zhou
<zhouchengming@bytedance.com> wrote:
>
> In the !zpool_can_sleep_mapped() case such as zsmalloc, we need to first
> copy the entry->handle memory to a temporary memory, which is allocated
> using kmalloc.
>
> Obviously we can reuse the per-compressor dstmem to avoid allocating
> every time, since it's percpu-compressor and protected in mutex.
>
> Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
> Reviewed-by: Nhat Pham <nphamcs@gmail.com>
> ---
>  mm/zswap.c | 29 +++++++++--------------------
>  1 file changed, 9 insertions(+), 20 deletions(-)
>
> diff --git a/mm/zswap.c b/mm/zswap.c
> index 7ee54a3d8281..edb8b45ed5a1 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -1772,9 +1772,9 @@ bool zswap_load(struct folio *folio)
>         struct zswap_entry *entry;
>         struct scatterlist input, output;
>         struct crypto_acomp_ctx *acomp_ctx;
> -       u8 *src, *dst, *tmp;
> +       unsigned int dlen = PAGE_SIZE;
> +       u8 *src, *dst;
>         struct zpool *zpool;
> -       unsigned int dlen;
>         bool ret;
>
>         VM_WARN_ON_ONCE(!folio_test_locked(folio));
> @@ -1796,27 +1796,18 @@ bool zswap_load(struct folio *folio)
>                 goto stats;
>         }
>
> -       zpool = zswap_find_zpool(entry);
> -       if (!zpool_can_sleep_mapped(zpool)) {
> -               tmp = kmalloc(entry->length, GFP_KERNEL);
> -               if (!tmp) {
> -                       ret = false;
> -                       goto freeentry;
> -               }
> -       }
> -
>         /* decompress */
> -       dlen = PAGE_SIZE;
> -       src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO);
> +       acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
> +       mutex_lock(acomp_ctx->mutex);
>
> +       zpool = zswap_find_zpool(entry);
> +       src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO);
>         if (!zpool_can_sleep_mapped(zpool)) {
> -               memcpy(tmp, src, entry->length);
> -               src = tmp;
> +               memcpy(acomp_ctx->dstmem, src, entry->length);
> +               src = acomp_ctx->dstmem;

I don't like that we are now using acomp_ctx->dstmem and
acomp_ctx->mutex for purposes other than what the naming suggests.

How about removing these two fields from acomp_ctx, directly using
zswap_dstmem and zswap_mutex in both the load and store paths, renaming
them, and adding proper comments above their definitions saying that they
are for generic percpu buffering on the load and store paths?
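
Something like this minimal sketch, for example (the comment wording is
illustrative, untested):

```
/*
 * Generic percpu buffering for the zswap load and store paths: one
 * PAGE_SIZE scratch buffer per CPU, serialized by the matching percpu
 * mutex. Not owned by or exclusive to any acomp_ctx.
 */
static DEFINE_PER_CPU(u8 *, zswap_dstmem);
static DEFINE_PER_CPU(struct mutex *, zswap_mutex);
```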
Chengming Zhou Dec. 14, 2023, 1:29 p.m. UTC | #2
On 2023/12/14 07:24, Yosry Ahmed wrote:
> On Tue, Dec 12, 2023 at 8:18 PM Chengming Zhou
> <zhouchengming@bytedance.com> wrote:
>>
>> In the !zpool_can_sleep_mapped() case such as zsmalloc, we need to first
>> copy the entry->handle memory to a temporary memory, which is allocated
>> using kmalloc.
>>
>> Obviously we can reuse the per-compressor dstmem to avoid allocating
>> every time, since it's percpu-compressor and protected in mutex.
>>
>> Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
>> Reviewed-by: Nhat Pham <nphamcs@gmail.com>
>> ---
>>  mm/zswap.c | 29 +++++++++--------------------
>>  1 file changed, 9 insertions(+), 20 deletions(-)
>>
>> diff --git a/mm/zswap.c b/mm/zswap.c
>> index 7ee54a3d8281..edb8b45ed5a1 100644
>> --- a/mm/zswap.c
>> +++ b/mm/zswap.c
>> @@ -1772,9 +1772,9 @@ bool zswap_load(struct folio *folio)
>>         struct zswap_entry *entry;
>>         struct scatterlist input, output;
>>         struct crypto_acomp_ctx *acomp_ctx;
>> -       u8 *src, *dst, *tmp;
>> +       unsigned int dlen = PAGE_SIZE;
>> +       u8 *src, *dst;
>>         struct zpool *zpool;
>> -       unsigned int dlen;
>>         bool ret;
>>
>>         VM_WARN_ON_ONCE(!folio_test_locked(folio));
>> @@ -1796,27 +1796,18 @@ bool zswap_load(struct folio *folio)
>>                 goto stats;
>>         }
>>
>> -       zpool = zswap_find_zpool(entry);
>> -       if (!zpool_can_sleep_mapped(zpool)) {
>> -               tmp = kmalloc(entry->length, GFP_KERNEL);
>> -               if (!tmp) {
>> -                       ret = false;
>> -                       goto freeentry;
>> -               }
>> -       }
>> -
>>         /* decompress */
>> -       dlen = PAGE_SIZE;
>> -       src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO);
>> +       acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
>> +       mutex_lock(acomp_ctx->mutex);
>>
>> +       zpool = zswap_find_zpool(entry);
>> +       src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO);
>>         if (!zpool_can_sleep_mapped(zpool)) {
>> -               memcpy(tmp, src, entry->length);
>> -               src = tmp;
>> +               memcpy(acomp_ctx->dstmem, src, entry->length);
>> +               src = acomp_ctx->dstmem;
> 
> I don't like that we are now using acomp_ctx->dstmem and
> acomp_ctx->mutex now for purposes other than what the naming suggests.

The "mutex" name is coherent, "dstmem" depends on how we use IMHO.
Change to just "mem"? Or do you have a better name to replace?

> 
> How about removing these two fields from acomp_ctx, and directly using
> zswap_dstmem and zswap_mutex in both the load and store paths, rename
> them, and add proper comments above their definitions that they are
> for generic percpu buffering on the load and store paths?

Yes, they are a percpu buffer and lock, but they are used per acomp_ctx,
and the CPU may change in the middle, so it may be better to keep them.

Thanks!
Yosry Ahmed Dec. 14, 2023, 1:32 p.m. UTC | #3
On Thu, Dec 14, 2023 at 5:29 AM Chengming Zhou
<zhouchengming@bytedance.com> wrote:
>
> On 2023/12/14 07:24, Yosry Ahmed wrote:
> > On Tue, Dec 12, 2023 at 8:18 PM Chengming Zhou
> > <zhouchengming@bytedance.com> wrote:
> >>
> >> In the !zpool_can_sleep_mapped() case such as zsmalloc, we need to first
> >> copy the entry->handle memory to a temporary memory, which is allocated
> >> using kmalloc.
> >>
> >> Obviously we can reuse the per-compressor dstmem to avoid allocating
> >> every time, since it's percpu-compressor and protected in mutex.
> >>
> >> Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
> >> Reviewed-by: Nhat Pham <nphamcs@gmail.com>
> >> ---
> >>  mm/zswap.c | 29 +++++++++--------------------
> >>  1 file changed, 9 insertions(+), 20 deletions(-)
> >>
> >> diff --git a/mm/zswap.c b/mm/zswap.c
> >> index 7ee54a3d8281..edb8b45ed5a1 100644
> >> --- a/mm/zswap.c
> >> +++ b/mm/zswap.c
> >> @@ -1772,9 +1772,9 @@ bool zswap_load(struct folio *folio)
> >>         struct zswap_entry *entry;
> >>         struct scatterlist input, output;
> >>         struct crypto_acomp_ctx *acomp_ctx;
> >> -       u8 *src, *dst, *tmp;
> >> +       unsigned int dlen = PAGE_SIZE;
> >> +       u8 *src, *dst;
> >>         struct zpool *zpool;
> >> -       unsigned int dlen;
> >>         bool ret;
> >>
> >>         VM_WARN_ON_ONCE(!folio_test_locked(folio));
> >> @@ -1796,27 +1796,18 @@ bool zswap_load(struct folio *folio)
> >>                 goto stats;
> >>         }
> >>
> >> -       zpool = zswap_find_zpool(entry);
> >> -       if (!zpool_can_sleep_mapped(zpool)) {
> >> -               tmp = kmalloc(entry->length, GFP_KERNEL);
> >> -               if (!tmp) {
> >> -                       ret = false;
> >> -                       goto freeentry;
> >> -               }
> >> -       }
> >> -
> >>         /* decompress */
> >> -       dlen = PAGE_SIZE;
> >> -       src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO);
> >> +       acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
> >> +       mutex_lock(acomp_ctx->mutex);
> >>
> >> +       zpool = zswap_find_zpool(entry);
> >> +       src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO);
> >>         if (!zpool_can_sleep_mapped(zpool)) {
> >> -               memcpy(tmp, src, entry->length);
> >> -               src = tmp;
> >> +               memcpy(acomp_ctx->dstmem, src, entry->length);
> >> +               src = acomp_ctx->dstmem;
> >
> > I don't like that we are now using acomp_ctx->dstmem and
> > acomp_ctx->mutex now for purposes other than what the naming suggests.
>
> The "mutex" name is coherent, "dstmem" depends on how we use IMHO.
> Change to just "mem"? Or do you have a better name to replace?
>
> >
> > How about removing these two fields from acomp_ctx, and directly using
> > zswap_dstmem and zswap_mutex in both the load and store paths, rename
> > them, and add proper comments above their definitions that they are
> > for generic percpu buffering on the load and store paths?
>
> Yes, they are percpu memory and lock, but they are used by per acomp_ctx,
> and the cpu maybe changing in the middle, so maybe better to keep them.

I don't mean to remove them completely. Keep them as (for example)
zswap_mem and zswap_mutex global percpu variables, and don't have
pointers to them in acomp_ctx. Instead of using acomp_ctx->dstmem as we
do today, we would directly use the global zswap_mem (same for the mutex).

This makes it clear that the buffers are not owned or exclusively used
by the acomp_ctx. WDYT?
Chengming Zhou Dec. 14, 2023, 2:42 p.m. UTC | #4
On 2023/12/14 21:32, Yosry Ahmed wrote:
> On Thu, Dec 14, 2023 at 5:29 AM Chengming Zhou
> <zhouchengming@bytedance.com> wrote:
>>
>> On 2023/12/14 07:24, Yosry Ahmed wrote:
>>> On Tue, Dec 12, 2023 at 8:18 PM Chengming Zhou
>>> <zhouchengming@bytedance.com> wrote:
>>>>
>>>> In the !zpool_can_sleep_mapped() case such as zsmalloc, we need to first
>>>> copy the entry->handle memory to a temporary memory, which is allocated
>>>> using kmalloc.
>>>>
>>>> Obviously we can reuse the per-compressor dstmem to avoid allocating
>>>> every time, since it's percpu-compressor and protected in mutex.
>>>>
>>>> Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
>>>> Reviewed-by: Nhat Pham <nphamcs@gmail.com>
>>>> ---
>>>>  mm/zswap.c | 29 +++++++++--------------------
>>>>  1 file changed, 9 insertions(+), 20 deletions(-)
>>>>
>>>> diff --git a/mm/zswap.c b/mm/zswap.c
>>>> index 7ee54a3d8281..edb8b45ed5a1 100644
>>>> --- a/mm/zswap.c
>>>> +++ b/mm/zswap.c
>>>> @@ -1772,9 +1772,9 @@ bool zswap_load(struct folio *folio)
>>>>         struct zswap_entry *entry;
>>>>         struct scatterlist input, output;
>>>>         struct crypto_acomp_ctx *acomp_ctx;
>>>> -       u8 *src, *dst, *tmp;
>>>> +       unsigned int dlen = PAGE_SIZE;
>>>> +       u8 *src, *dst;
>>>>         struct zpool *zpool;
>>>> -       unsigned int dlen;
>>>>         bool ret;
>>>>
>>>>         VM_WARN_ON_ONCE(!folio_test_locked(folio));
>>>> @@ -1796,27 +1796,18 @@ bool zswap_load(struct folio *folio)
>>>>                 goto stats;
>>>>         }
>>>>
>>>> -       zpool = zswap_find_zpool(entry);
>>>> -       if (!zpool_can_sleep_mapped(zpool)) {
>>>> -               tmp = kmalloc(entry->length, GFP_KERNEL);
>>>> -               if (!tmp) {
>>>> -                       ret = false;
>>>> -                       goto freeentry;
>>>> -               }
>>>> -       }
>>>> -
>>>>         /* decompress */
>>>> -       dlen = PAGE_SIZE;
>>>> -       src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO);
>>>> +       acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
>>>> +       mutex_lock(acomp_ctx->mutex);
>>>>
>>>> +       zpool = zswap_find_zpool(entry);
>>>> +       src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO);
>>>>         if (!zpool_can_sleep_mapped(zpool)) {
>>>> -               memcpy(tmp, src, entry->length);
>>>> -               src = tmp;
>>>> +               memcpy(acomp_ctx->dstmem, src, entry->length);
>>>> +               src = acomp_ctx->dstmem;
>>>
>>> I don't like that we are now using acomp_ctx->dstmem and
>>> acomp_ctx->mutex now for purposes other than what the naming suggests.
>>
>> The "mutex" name is coherent, "dstmem" depends on how we use IMHO.
>> Change to just "mem"? Or do you have a better name to replace?
>>
>>>
>>> How about removing these two fields from acomp_ctx, and directly using
>>> zswap_dstmem and zswap_mutex in both the load and store paths, rename
>>> them, and add proper comments above their definitions that they are
>>> for generic percpu buffering on the load and store paths?
>>
>> Yes, they are percpu memory and lock, but they are used by per acomp_ctx,
>> and the cpu maybe changing in the middle, so maybe better to keep them.
> 
> I don't mean to remove completely. Keep them as (for example)
> zswap_mem and zswap_mutex global percpu variables, and not have
> pointers in acomp_ctx to them. Instead of using acomp_ctx->dstmem
> today, we directly use the global zswap_mem (same for the mutex).
> 
> This makes it clear that the buffers are not owned or exclusively used
> by the acomp_ctx. WDYT?

Does this look good to you?

```
int cpu = raw_smp_processor_id();
struct mutex *mutex;
u8 *dstmem;

/* Take the lock and buffer of the same cpu index, so the pair stays
 * consistent even if the task migrates after reading cpu. */
mutex = per_cpu(zswap_mutex, cpu);
mutex_lock(mutex);

dstmem = per_cpu(zswap_dstmem, cpu);
acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);

/* compress or decompress */
```

Another way I just thought of is to make acomp_ctx own its lock and buffer;
then we could delete the percpu zswap_mutex and zswap_dstmem entirely.
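
A rough sketch of what that could look like (untested; the field layout
is illustrative):

```
struct crypto_acomp_ctx {
	struct crypto_acomp *acomp;
	struct acomp_req *req;
	struct crypto_wait wait;
	u8 *buffer;		/* owned scratch buffer, allocated at CPU online */
	struct mutex mutex;	/* owned lock, embedded instead of a pointer */
};
```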

Thanks.
Chris Li Dec. 14, 2023, 5:59 p.m. UTC | #5
On Tue, Dec 12, 2023 at 8:18 PM Chengming Zhou
<zhouchengming@bytedance.com> wrote:
>
> In the !zpool_can_sleep_mapped() case such as zsmalloc, we need to first
> copy the entry->handle memory to a temporary memory, which is allocated
> using kmalloc.
>
> Obviously we can reuse the per-compressor dstmem to avoid allocating
> every time, since it's percpu-compressor and protected in mutex.

You are trading more memory for more speed.
Per-CPU data structures do not come free. They are expensive in terms of
memory on a big server with a lot of CPUs; think more than a few hundred.
For example, one PAGE_SIZE buffer per CPU on a 512-CPU machine with 4 KiB
pages is already 2 MiB. On such big servers, we might want to disable this
optimization to save a few MB of RAM, depending on what the optimization
gains us.
Do we have any benchmark suggesting how much CPU overhead or latency
this per-CPU page saves us, compared to using kmalloc?

Chris

>
> Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
> Reviewed-by: Nhat Pham <nphamcs@gmail.com>
> ---
>  mm/zswap.c | 29 +++++++++--------------------
>  1 file changed, 9 insertions(+), 20 deletions(-)
>
> diff --git a/mm/zswap.c b/mm/zswap.c
> index 7ee54a3d8281..edb8b45ed5a1 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -1772,9 +1772,9 @@ bool zswap_load(struct folio *folio)
>         struct zswap_entry *entry;
>         struct scatterlist input, output;
>         struct crypto_acomp_ctx *acomp_ctx;
> -       u8 *src, *dst, *tmp;
> +       unsigned int dlen = PAGE_SIZE;
> +       u8 *src, *dst;
>         struct zpool *zpool;
> -       unsigned int dlen;
>         bool ret;
>
>         VM_WARN_ON_ONCE(!folio_test_locked(folio));
> @@ -1796,27 +1796,18 @@ bool zswap_load(struct folio *folio)
>                 goto stats;
>         }
>
> -       zpool = zswap_find_zpool(entry);
> -       if (!zpool_can_sleep_mapped(zpool)) {
> -               tmp = kmalloc(entry->length, GFP_KERNEL);
> -               if (!tmp) {
> -                       ret = false;
> -                       goto freeentry;
> -               }
> -       }
> -
>         /* decompress */
> -       dlen = PAGE_SIZE;
> -       src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO);
> +       acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
> +       mutex_lock(acomp_ctx->mutex);
>
> +       zpool = zswap_find_zpool(entry);
> +       src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO);
>         if (!zpool_can_sleep_mapped(zpool)) {
> -               memcpy(tmp, src, entry->length);
> -               src = tmp;
> +               memcpy(acomp_ctx->dstmem, src, entry->length);
> +               src = acomp_ctx->dstmem;
>                 zpool_unmap_handle(zpool, entry->handle);
>         }
>
> -       acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
> -       mutex_lock(acomp_ctx->mutex);
>         sg_init_one(&input, src, entry->length);
>         sg_init_table(&output, 1);
>         sg_set_page(&output, page, PAGE_SIZE, 0);
> @@ -1827,15 +1818,13 @@ bool zswap_load(struct folio *folio)
>
>         if (zpool_can_sleep_mapped(zpool))
>                 zpool_unmap_handle(zpool, entry->handle);
> -       else
> -               kfree(tmp);
>
>         ret = true;
>  stats:
>         count_vm_event(ZSWPIN);
>         if (entry->objcg)
>                 count_objcg_event(entry->objcg, ZSWPIN);
> -freeentry:
> +
>         spin_lock(&tree->lock);
>         if (ret && zswap_exclusive_loads_enabled) {
>                 zswap_invalidate_entry(tree, entry);
>
> --
> b4 0.10.1
Yosry Ahmed Dec. 14, 2023, 6:24 p.m. UTC | #6
On Thu, Dec 14, 2023 at 6:42 AM Chengming Zhou
<zhouchengming@bytedance.com> wrote:
[..]
> >>>> -
> >>>>         /* decompress */
> >>>> -       dlen = PAGE_SIZE;
> >>>> -       src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO);
> >>>> +       acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
> >>>> +       mutex_lock(acomp_ctx->mutex);
> >>>>
> >>>> +       zpool = zswap_find_zpool(entry);
> >>>> +       src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO);
> >>>>         if (!zpool_can_sleep_mapped(zpool)) {
> >>>> -               memcpy(tmp, src, entry->length);
> >>>> -               src = tmp;
> >>>> +               memcpy(acomp_ctx->dstmem, src, entry->length);
> >>>> +               src = acomp_ctx->dstmem;
> >>>
> >>> I don't like that we are now using acomp_ctx->dstmem and
> >>> acomp_ctx->mutex now for purposes other than what the naming suggests.
> >>
> >> The "mutex" name is coherent, "dstmem" depends on how we use IMHO.
> >> Change to just "mem"? Or do you have a better name to replace?
> >>
> >>>
> >>> How about removing these two fields from acomp_ctx, and directly using
> >>> zswap_dstmem and zswap_mutex in both the load and store paths, rename
> >>> them, and add proper comments above their definitions that they are
> >>> for generic percpu buffering on the load and store paths?
> >>
> >> Yes, they are percpu memory and lock, but they are used by per acomp_ctx,
> >> and the cpu maybe changing in the middle, so maybe better to keep them.
> >
> > I don't mean to remove completely. Keep them as (for example)
> > zswap_mem and zswap_mutex global percpu variables, and not have
> > pointers in acomp_ctx to them. Instead of using acomp_ctx->dstmem
> > today, we directly use the global zswap_mem (same for the mutex).
> >
> > This makes it clear that the buffers are not owned or exclusively used
> > by the acomp_ctx. WDYT?
>
> Does this look good to you?
>
> ```
> int cpu = raw_smp_processor_id();
>
> mutex = per_cpu(zswap_mutex, cpu);
> mutex_lock(mutex);
>
> dstmem = per_cpu(zswap_dstmem, cpu);

Renaming it to zswap_buffer or zswap_mem would be better, I think. But
yeah, what I had in mind is having zswap_mutex and
zswap_[dstmem/mem/buffer] be a generic percpu lock and buffer that the
store and load paths use for different purposes, not something directly
linked to acomp_ctx.

> acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
>
> /* compress or decompress */
> ```
>
> Another way I just think of is to make acomp_ctx own its lock and buffer,
> and we could delete these percpu zswap_mutex and zswap_dstmem instead.

You mean have two separate set of percpu buffers for zswap load &
stores paths? This is probably unnecessary.
Yosry Ahmed Dec. 14, 2023, 6:26 p.m. UTC | #7
On Thu, Dec 14, 2023 at 9:59 AM Chris Li <chrisl@kernel.org> wrote:
>
> On Tue, Dec 12, 2023 at 8:18 PM Chengming Zhou
> <zhouchengming@bytedance.com> wrote:
> >
> > In the !zpool_can_sleep_mapped() case such as zsmalloc, we need to first
> > copy the entry->handle memory to a temporary memory, which is allocated
> > using kmalloc.
> >
> > Obviously we can reuse the per-compressor dstmem to avoid allocating
> > every time, since it's percpu-compressor and protected in mutex.
>
> You are trading more memory for faster speed.
> Per-cpu data structure does not come free. It is expensive in terms of
> memory on a big server with a lot of CPU. Think more than a few
> hundred CPU. On the big servers, we might want to disable this
> optimization to save a few MB RAM, depending on the gain of this
> optimization.
> Do we have any benchmark suggesting how much CPU overhead or latency
> this per-CPU page buys us, compared to using kmalloc?

IIUC we are not creating any new percpu data structures here. We are
reusing the existing percpu buffers that the store path compresses into.
Now we also use them in the load path, when we need a temporary buffer to
decompress into because the zpool backend does not support sleeping while
the memory is mapped.
Nhat Pham Dec. 14, 2023, 8:33 p.m. UTC | #8
On Thu, Dec 14, 2023 at 9:59 AM Chris Li <chrisl@kernel.org> wrote:
>
> On Tue, Dec 12, 2023 at 8:18 PM Chengming Zhou
> <zhouchengming@bytedance.com> wrote:
> >
> > In the !zpool_can_sleep_mapped() case such as zsmalloc, we need to first
> > copy the entry->handle memory to a temporary memory, which is allocated
> > using kmalloc.
> >
> > Obviously we can reuse the per-compressor dstmem to avoid allocating
> > every time, since it's percpu-compressor and protected in mutex.
>
> You are trading more memory for faster speed.
> Per-cpu data structure does not come free. It is expensive in terms of
> memory on a big server with a lot of CPU. Think more than a few
> hundred CPU. On the big servers, we might want to disable this
> optimization to save a few MB RAM, depending on the gain of this
> optimization.
> Do we have any benchmark suggesting how much CPU overhead or latency
> this per-CPU page buys us, compared to using kmalloc?

I think Chengming is reusing an existing per-CPU buffer for this
purpose. IIUC, it was previously only used for compression
(zswap_store); with this patch Chengming is leveraging it for
decompression (load and writeback) too. This sounds fine to me, because
both directions have to hold the mutex anyway, so the buffer is locked
out either way; might as well use it.

We are doing a bit more work inside the mutex section (the memcpy and the
handle (un)mapping), but that seems fine to me tbh.

>
> Chris
>
> >
> > Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
> > Reviewed-by: Nhat Pham <nphamcs@gmail.com>
> > ---
> >  mm/zswap.c | 29 +++++++++--------------------
> >  1 file changed, 9 insertions(+), 20 deletions(-)
> >
> > diff --git a/mm/zswap.c b/mm/zswap.c
> > index 7ee54a3d8281..edb8b45ed5a1 100644
> > --- a/mm/zswap.c
> > +++ b/mm/zswap.c
> > @@ -1772,9 +1772,9 @@ bool zswap_load(struct folio *folio)
> >         struct zswap_entry *entry;
> >         struct scatterlist input, output;
> >         struct crypto_acomp_ctx *acomp_ctx;
> > -       u8 *src, *dst, *tmp;
> > +       unsigned int dlen = PAGE_SIZE;
> > +       u8 *src, *dst;
> >         struct zpool *zpool;
> > -       unsigned int dlen;
> >         bool ret;
> >
> >         VM_WARN_ON_ONCE(!folio_test_locked(folio));
> > @@ -1796,27 +1796,18 @@ bool zswap_load(struct folio *folio)
> >                 goto stats;
> >         }
> >
> > -       zpool = zswap_find_zpool(entry);
> > -       if (!zpool_can_sleep_mapped(zpool)) {
> > -               tmp = kmalloc(entry->length, GFP_KERNEL);
> > -               if (!tmp) {
> > -                       ret = false;
> > -                       goto freeentry;
> > -               }
> > -       }
> > -
> >         /* decompress */
> > -       dlen = PAGE_SIZE;
> > -       src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO);
> > +       acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
> > +       mutex_lock(acomp_ctx->mutex);
> >
> > +       zpool = zswap_find_zpool(entry);
> > +       src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO);
> >         if (!zpool_can_sleep_mapped(zpool)) {
> > -               memcpy(tmp, src, entry->length);
> > -               src = tmp;
> > +               memcpy(acomp_ctx->dstmem, src, entry->length);
> > +               src = acomp_ctx->dstmem;
> >                 zpool_unmap_handle(zpool, entry->handle);
> >         }
> >
> > -       acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
> > -       mutex_lock(acomp_ctx->mutex);
> >         sg_init_one(&input, src, entry->length);
> >         sg_init_table(&output, 1);
> >         sg_set_page(&output, page, PAGE_SIZE, 0);
> > @@ -1827,15 +1818,13 @@ bool zswap_load(struct folio *folio)
> >
> >         if (zpool_can_sleep_mapped(zpool))
> >                 zpool_unmap_handle(zpool, entry->handle);
> > -       else
> > -               kfree(tmp);
> >
> >         ret = true;
> >  stats:
> >         count_vm_event(ZSWPIN);
> >         if (entry->objcg)
> >                 count_objcg_event(entry->objcg, ZSWPIN);
> > -freeentry:
> > +
> >         spin_lock(&tree->lock);
> >         if (ret && zswap_exclusive_loads_enabled) {
> >                 zswap_invalidate_entry(tree, entry);
> >
> > --
> > b4 0.10.1
Chris Li Dec. 14, 2023, 10:02 p.m. UTC | #9
Hi Yosry,

On Thu, Dec 14, 2023 at 10:27 AM Yosry Ahmed <yosryahmed@google.com> wrote:
>
> On Thu, Dec 14, 2023 at 9:59 AM Chris Li <chrisl@kernel.org> wrote:
> >
> > On Tue, Dec 12, 2023 at 8:18 PM Chengming Zhou
> > <zhouchengming@bytedance.com> wrote:
> > >
> > > In the !zpool_can_sleep_mapped() case such as zsmalloc, we need to first
> > > copy the entry->handle memory to a temporary memory, which is allocated
> > > using kmalloc.
> > >
> > > Obviously we can reuse the per-compressor dstmem to avoid allocating
> > > every time, since it's percpu-compressor and protected in mutex.
> >
> > You are trading more memory for faster speed.
> > Per-cpu data structure does not come free. It is expensive in terms of
> > memory on a big server with a lot of CPU. Think more than a few
> > hundred CPU. On the big servers, we might want to disable this
> > optimization to save a few MB RAM, depending on the gain of this
> > optimization.
> > Do we have any benchmark suggesting how much CPU overhead or latency
> > this per-CPU page buys us, compared to using kmalloc?
>
> IIUC we are not creating any new percpu data structures here. We are
> reusing existing percpu buffers used in the store path to compress
> into. Now we also use them in the load path if we need a temporary
> buffer to decompress into if the zpool backend does not support
> sleeping while the memory is mapped.

That sounds like a pure win then. Thanks for explaining it.

Hi Nhat,

> I think Chengming is re-using an existing per-CPU buffer for this
> purpose. IIUC, it was previously only used for compression
> (zswap_store) - Chengming is leveraging it for decompression (load and
> writeback) too with this patch. This sounds fine to me tbh, because
> both directions have to hold the mutex anyway, so that buffer is
> locked out - might as well use it.

Agree.

Acked-by: Chris Li <chrisl@kernel.org>

>
> We're doing a bit more work in the mutex section (memcpy and handle
> (un)mapping) - but seems fine to me tbh.

Thanks for the heads up.

Chris
Chengming Zhou Dec. 18, 2023, 8:06 a.m. UTC | #10
On 2023/12/15 02:24, Yosry Ahmed wrote:
> On Thu, Dec 14, 2023 at 6:42 AM Chengming Zhou
> <zhouchengming@bytedance.com> wrote:
> [..]
>>>>>> -
>>>>>>         /* decompress */
>>>>>> -       dlen = PAGE_SIZE;
>>>>>> -       src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO);
>>>>>> +       acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
>>>>>> +       mutex_lock(acomp_ctx->mutex);
>>>>>>
>>>>>> +       zpool = zswap_find_zpool(entry);
>>>>>> +       src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO);
>>>>>>         if (!zpool_can_sleep_mapped(zpool)) {
>>>>>> -               memcpy(tmp, src, entry->length);
>>>>>> -               src = tmp;
>>>>>> +               memcpy(acomp_ctx->dstmem, src, entry->length);
>>>>>> +               src = acomp_ctx->dstmem;
>>>>>
>>>>> I don't like that we are now using acomp_ctx->dstmem and
>>>>> acomp_ctx->mutex now for purposes other than what the naming suggests.
>>>>
>>>> The "mutex" name is coherent, "dstmem" depends on how we use IMHO.
>>>> Change to just "mem"? Or do you have a better name to replace?
>>>>
>>>>>
>>>>> How about removing these two fields from acomp_ctx, and directly using
>>>>> zswap_dstmem and zswap_mutex in both the load and store paths, rename
>>>>> them, and add proper comments above their definitions that they are
>>>>> for generic percpu buffering on the load and store paths?
>>>>
>>>> Yes, they are percpu memory and lock, but they are used by per acomp_ctx,
>>>> and the cpu maybe changing in the middle, so maybe better to keep them.
>>>
>>> I don't mean to remove completely. Keep them as (for example)
>>> zswap_mem and zswap_mutex global percpu variables, and not have
>>> pointers in acomp_ctx to them. Instead of using acomp_ctx->dstmem
>>> today, we directly use the global zswap_mem (same for the mutex).
>>>
>>> This makes it clear that the buffers are not owned or exclusively used
>>> by the acomp_ctx. WDYT?
>>
>> Does this look good to you?
>>
>> ```
>> int cpu = raw_smp_processor_id();
>>
>> mutex = per_cpu(zswap_mutex, cpu);
>> mutex_lock(mutex);
>>
>> dstmem = per_cpu(zswap_dstmem, cpu);
> 
> Renaming to zswap_buffer or zswap_mem would be better I think, but
> yeah what I had in mind is having zswap_mutex and
> zswap_[dstmem/mem/buffer] be generic percpu buffers that are used by
> store and load paths for different purposes, not directly linked to
> acomp_ctx.
> 

Ok, I'll append a patch to do the refactor & cleanup: remove mutex
and dstmem from acomp_ctx, and rename to zswap_buffer, then directly
use them on the load/store paths.
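
For example, the decompress path above would become something like this
rough, untested sketch:

```
int cpu = raw_smp_processor_id();
struct mutex *mutex = per_cpu(zswap_mutex, cpu);
u8 *buffer = per_cpu(zswap_buffer, cpu);

mutex_lock(mutex);
src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO);
if (!zpool_can_sleep_mapped(zpool)) {
	/* stage the compressed data in the percpu scratch buffer */
	memcpy(buffer, src, entry->length);
	src = buffer;
	zpool_unmap_handle(zpool, entry->handle);
}
```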

>> acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
>>
>> /* compress or decompress */
>> ```
>>
>> Another way I just think of is to make acomp_ctx own its lock and buffer,
>> and we could delete these percpu zswap_mutex and zswap_dstmem instead.
> 
> You mean have two separate set of percpu buffers for zswap load &
> stores paths? This is probably unnecessary.

Alright. Thanks.

Patch

diff --git a/mm/zswap.c b/mm/zswap.c
index 7ee54a3d8281..edb8b45ed5a1 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -1772,9 +1772,9 @@  bool zswap_load(struct folio *folio)
 	struct zswap_entry *entry;
 	struct scatterlist input, output;
 	struct crypto_acomp_ctx *acomp_ctx;
-	u8 *src, *dst, *tmp;
+	unsigned int dlen = PAGE_SIZE;
+	u8 *src, *dst;
 	struct zpool *zpool;
-	unsigned int dlen;
 	bool ret;
 
 	VM_WARN_ON_ONCE(!folio_test_locked(folio));
@@ -1796,27 +1796,18 @@  bool zswap_load(struct folio *folio)
 		goto stats;
 	}
 
-	zpool = zswap_find_zpool(entry);
-	if (!zpool_can_sleep_mapped(zpool)) {
-		tmp = kmalloc(entry->length, GFP_KERNEL);
-		if (!tmp) {
-			ret = false;
-			goto freeentry;
-		}
-	}
-
 	/* decompress */
-	dlen = PAGE_SIZE;
-	src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO);
+	acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
+	mutex_lock(acomp_ctx->mutex);
 
+	zpool = zswap_find_zpool(entry);
+	src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO);
 	if (!zpool_can_sleep_mapped(zpool)) {
-		memcpy(tmp, src, entry->length);
-		src = tmp;
+		memcpy(acomp_ctx->dstmem, src, entry->length);
+		src = acomp_ctx->dstmem;
 		zpool_unmap_handle(zpool, entry->handle);
 	}
 
-	acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
-	mutex_lock(acomp_ctx->mutex);
 	sg_init_one(&input, src, entry->length);
 	sg_init_table(&output, 1);
 	sg_set_page(&output, page, PAGE_SIZE, 0);
@@ -1827,15 +1818,13 @@  bool zswap_load(struct folio *folio)
 
 	if (zpool_can_sleep_mapped(zpool))
 		zpool_unmap_handle(zpool, entry->handle);
-	else
-		kfree(tmp);
 
 	ret = true;
 stats:
 	count_vm_event(ZSWPIN);
 	if (entry->objcg)
 		count_objcg_event(entry->objcg, ZSWPIN);
-freeentry:
+
 	spin_lock(&tree->lock);
 	if (ret && zswap_exclusive_loads_enabled) {
 		zswap_invalidate_entry(tree, entry);