diff mbox series

[v2] mm: vmscan: reclaim anon pages if there are swapcache pages

Message ID 20230822024901.2412520-1-liushixin2@huawei.com (mailing list archive)
State New
Headers show
Series [v2] mm: vmscan: reclaim anon pages if there are swapcache pages | expand

Commit Message

Liu Shixin Aug. 22, 2023, 2:49 a.m. UTC
When spaces of swap devices are exhausted, only file pages can be reclaimed.
But there are still some swapcache pages in anon lru list. This can lead
to a premature out-of-memory.

This problem can be fixed by checking number of swapcache pages in
can_reclaim_anon_pages(). For memcg v2, there are swapcache stat that can
be used directly. For memcg v1, use total_swapcache_pages() instead, which
may not accurate but can solve the problem.

Signed-off-by: Liu Shixin <liushixin2@huawei.com>
---
 include/linux/swap.h |  6 ++++++
 mm/memcontrol.c      |  8 ++++++++
 mm/vmscan.c          | 12 ++++++++----
 3 files changed, 22 insertions(+), 4 deletions(-)

Comments

Yosry Ahmed Aug. 22, 2023, 4:35 p.m. UTC | #1
On Mon, Aug 21, 2023 at 6:54 PM Liu Shixin <liushixin2@huawei.com> wrote:
>
> When spaces of swap devices are exhausted, only file pages can be reclaimed.
> But there are still some swapcache pages in anon lru list. This can lead
> to a premature out-of-memory.
>
> This problem can be fixed by checking number of swapcache pages in
> can_reclaim_anon_pages(). For memcg v2, there are swapcache stat that can
> be used directly. For memcg v1, use total_swapcache_pages() instead, which
> may not accurate but can solve the problem.

Interesting find. I wonder if we really don't have any handling of
this situation.

>
> Signed-off-by: Liu Shixin <liushixin2@huawei.com>
> ---
>  include/linux/swap.h |  6 ++++++
>  mm/memcontrol.c      |  8 ++++++++
>  mm/vmscan.c          | 12 ++++++++----
>  3 files changed, 22 insertions(+), 4 deletions(-)
>
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index 456546443f1f..0318e918bfa4 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -669,6 +669,7 @@ static inline void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_p
>  }
>
>  extern long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg);
> +extern long mem_cgroup_get_nr_swapcache_pages(struct mem_cgroup *memcg);
>  extern bool mem_cgroup_swap_full(struct folio *folio);
>  #else
>  static inline void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry)
> @@ -691,6 +692,11 @@ static inline long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
>         return get_nr_swap_pages();
>  }
>
> +static inline long mem_cgroup_get_nr_swapcache_pages(struct mem_cgroup *memcg)
> +{
> +       return total_swapcache_pages();
> +}
> +
>  static inline bool mem_cgroup_swap_full(struct folio *folio)
>  {
>         return vm_swap_full();
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index e8ca4bdcb03c..3e578f41023e 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -7567,6 +7567,14 @@ long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
>         return nr_swap_pages;
>  }
>
> +long mem_cgroup_get_nr_swapcache_pages(struct mem_cgroup *memcg)
> +{
> +       if (mem_cgroup_disabled() || do_memsw_account())
> +               return total_swapcache_pages();
> +
> +       return memcg_page_state(memcg, NR_SWAPCACHE);
> +}

Is there a reason why we cannot use NR_SWAPCACHE for cgroup v1? Isn't
that being maintained regardless of cgroup version? It is not exposed
in cgroup v1's memory.stat, but I don't think there is a reason we
can't do that -- if only to document that it is being used with cgroup
v1.


> +
>  bool mem_cgroup_swap_full(struct folio *folio)
>  {
>         struct mem_cgroup *memcg;
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 7c33c5b653ef..bcb6279cbae7 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -609,13 +609,17 @@ static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg,
>         if (memcg == NULL) {
>                 /*
>                  * For non-memcg reclaim, is there
> -                * space in any swap device?
> +                * space in any swap device or swapcache pages?
>                  */
> -               if (get_nr_swap_pages() > 0)
> +               if (get_nr_swap_pages() + total_swapcache_pages() > 0)
>                         return true;
>         } else {
> -               /* Is the memcg below its swap limit? */
> -               if (mem_cgroup_get_nr_swap_pages(memcg) > 0)
> +               /*
> +                * Is the memcg below its swap limit or is there swapcache
> +                * pages can be freed?
> +                */
> +               if (mem_cgroup_get_nr_swap_pages(memcg) +
> +                   mem_cgroup_get_nr_swapcache_pages(memcg) > 0)
>                         return true;
>         }

I wonder if it would be more efficient to set a bit in struct
scan_control if we only are out of swap spaces but have swap cache
pages, and only isolate anon pages that are in the swap cache, instead
of isolating random anon pages. We may end up isolating pages that are
not in the swap cache for a few iterations and wasting cycles.


>
> --
> 2.25.1
>
Liu Shixin Aug. 23, 2023, 2 a.m. UTC | #2
On 2023/8/23 0:35, Yosry Ahmed wrote:
> On Mon, Aug 21, 2023 at 6:54 PM Liu Shixin <liushixin2@huawei.com> wrote:
>> When spaces of swap devices are exhausted, only file pages can be reclaimed.
>> But there are still some swapcache pages in anon lru list. This can lead
>> to a premature out-of-memory.
>>
>> This problem can be fixed by checking number of swapcache pages in
>> can_reclaim_anon_pages(). For memcg v2, there are swapcache stat that can
>> be used directly. For memcg v1, use total_swapcache_pages() instead, which
>> may not accurate but can solve the problem.
> Interesting find. I wonder if we really don't have any handling of
> this situation.
I have alreadly test this problem and can confirm that it is a real problem.
With 9MB swap space and 10MB mem_cgroup limit,when allocate 15MB memory,
there is a probability that OOM occurs.
>
>> Signed-off-by: Liu Shixin <liushixin2@huawei.com>
>> ---
>>  include/linux/swap.h |  6 ++++++
>>  mm/memcontrol.c      |  8 ++++++++
>>  mm/vmscan.c          | 12 ++++++++----
>>  3 files changed, 22 insertions(+), 4 deletions(-)
>>
>> diff --git a/include/linux/swap.h b/include/linux/swap.h
>> index 456546443f1f..0318e918bfa4 100644
>> --- a/include/linux/swap.h
>> +++ b/include/linux/swap.h
>> @@ -669,6 +669,7 @@ static inline void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_p
>>  }
>>
>>  extern long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg);
>> +extern long mem_cgroup_get_nr_swapcache_pages(struct mem_cgroup *memcg);
>>  extern bool mem_cgroup_swap_full(struct folio *folio);
>>  #else
>>  static inline void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry)
>> @@ -691,6 +692,11 @@ static inline long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
>>         return get_nr_swap_pages();
>>  }
>>
>> +static inline long mem_cgroup_get_nr_swapcache_pages(struct mem_cgroup *memcg)
>> +{
>> +       return total_swapcache_pages();
>> +}
>> +
>>  static inline bool mem_cgroup_swap_full(struct folio *folio)
>>  {
>>         return vm_swap_full();
>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
>> index e8ca4bdcb03c..3e578f41023e 100644
>> --- a/mm/memcontrol.c
>> +++ b/mm/memcontrol.c
>> @@ -7567,6 +7567,14 @@ long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
>>         return nr_swap_pages;
>>  }
>>
>> +long mem_cgroup_get_nr_swapcache_pages(struct mem_cgroup *memcg)
>> +{
>> +       if (mem_cgroup_disabled() || do_memsw_account())
>> +               return total_swapcache_pages();
>> +
>> +       return memcg_page_state(memcg, NR_SWAPCACHE);
>> +}
> Is there a reason why we cannot use NR_SWAPCACHE for cgroup v1? Isn't
> that being maintained regardless of cgroup version? It is not exposed
> in cgroup v1's memory.stat, but I don't think there is a reason we
> can't do that -- if only to document that it is being used with cgroup
> v1.
Thanks for your advice, it is more appropriate to use NR_SWAPCACH.
>
>
>> +
>>  bool mem_cgroup_swap_full(struct folio *folio)
>>  {
>>         struct mem_cgroup *memcg;
>> diff --git a/mm/vmscan.c b/mm/vmscan.c
>> index 7c33c5b653ef..bcb6279cbae7 100644
>> --- a/mm/vmscan.c
>> +++ b/mm/vmscan.c
>> @@ -609,13 +609,17 @@ static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg,
>>         if (memcg == NULL) {
>>                 /*
>>                  * For non-memcg reclaim, is there
>> -                * space in any swap device?
>> +                * space in any swap device or swapcache pages?
>>                  */
>> -               if (get_nr_swap_pages() > 0)
>> +               if (get_nr_swap_pages() + total_swapcache_pages() > 0)
>>                         return true;
>>         } else {
>> -               /* Is the memcg below its swap limit? */
>> -               if (mem_cgroup_get_nr_swap_pages(memcg) > 0)
>> +               /*
>> +                * Is the memcg below its swap limit or is there swapcache
>> +                * pages can be freed?
>> +                */
>> +               if (mem_cgroup_get_nr_swap_pages(memcg) +
>> +                   mem_cgroup_get_nr_swapcache_pages(memcg) > 0)
>>                         return true;
>>         }
> I wonder if it would be more efficient to set a bit in struct
> scan_control if we only are out of swap spaces but have swap cache
> pages, and only isolate anon pages that are in the swap cache, instead
> of isolating random anon pages. We may end up isolating pages that are
> not in the swap cache for a few iterations and wasting cycles.
Good idea. Thanks.
>
>> --
>> 2.25.1
>>
> .
>
Michal Hocko Aug. 23, 2023, 1:12 p.m. UTC | #3
On Wed 23-08-23 10:00:58, Liu Shixin wrote:
> 
> 
> On 2023/8/23 0:35, Yosry Ahmed wrote:
> > On Mon, Aug 21, 2023 at 6:54 PM Liu Shixin <liushixin2@huawei.com> wrote:
> >> When spaces of swap devices are exhausted, only file pages can be reclaimed.
> >> But there are still some swapcache pages in anon lru list. This can lead
> >> to a premature out-of-memory.
> >>
> >> This problem can be fixed by checking number of swapcache pages in
> >> can_reclaim_anon_pages(). For memcg v2, there are swapcache stat that can
> >> be used directly. For memcg v1, use total_swapcache_pages() instead, which
> >> may not accurate but can solve the problem.
> > Interesting find. I wonder if we really don't have any handling of
> > this situation.
> I have alreadly test this problem and can confirm that it is a real problem.
> With 9MB swap space and 10MB mem_cgroup limit,when allocate 15MB memory,
> there is a probability that OOM occurs.

Could you be more specific about the test and the oom report?
Yosry Ahmed Aug. 23, 2023, 3:29 p.m. UTC | #4
On Wed, Aug 23, 2023 at 6:12 AM Michal Hocko <mhocko@suse.com> wrote:
>
> On Wed 23-08-23 10:00:58, Liu Shixin wrote:
> >
> >
> > On 2023/8/23 0:35, Yosry Ahmed wrote:
> > > On Mon, Aug 21, 2023 at 6:54 PM Liu Shixin <liushixin2@huawei.com> wrote:
> > >> When spaces of swap devices are exhausted, only file pages can be reclaimed.
> > >> But there are still some swapcache pages in anon lru list. This can lead
> > >> to a premature out-of-memory.
> > >>
> > >> This problem can be fixed by checking number of swapcache pages in
> > >> can_reclaim_anon_pages(). For memcg v2, there are swapcache stat that can
> > >> be used directly. For memcg v1, use total_swapcache_pages() instead, which
> > >> may not accurate but can solve the problem.
> > > Interesting find. I wonder if we really don't have any handling of
> > > this situation.
> > I have alreadly test this problem and can confirm that it is a real problem.
> > With 9MB swap space and 10MB mem_cgroup limit,when allocate 15MB memory,
> > there is a probability that OOM occurs.
>
> Could you be more specific about the test and the oom report?

I actually couldn't reproduce it using 9MB of zram and a cgroup with a
10MB limit trying to allocate 15MB of tmpfs, no matter how many
repetitions I do.

>
> --
> Michal Hocko
> SUSE Labs
Liu Shixin Aug. 24, 2023, 3:39 a.m. UTC | #5
On 2023/8/23 23:29, Yosry Ahmed wrote:
> On Wed, Aug 23, 2023 at 6:12 AM Michal Hocko <mhocko@suse.com> wrote:
>> On Wed 23-08-23 10:00:58, Liu Shixin wrote:
>>>
>>> On 2023/8/23 0:35, Yosry Ahmed wrote:
>>>> On Mon, Aug 21, 2023 at 6:54 PM Liu Shixin <liushixin2@huawei.com> wrote:
>>>>> When spaces of swap devices are exhausted, only file pages can be reclaimed.
>>>>> But there are still some swapcache pages in anon lru list. This can lead
>>>>> to a premature out-of-memory.
>>>>>
>>>>> This problem can be fixed by checking number of swapcache pages in
>>>>> can_reclaim_anon_pages(). For memcg v2, there are swapcache stat that can
>>>>> be used directly. For memcg v1, use total_swapcache_pages() instead, which
>>>>> may not accurate but can solve the problem.
>>>> Interesting find. I wonder if we really don't have any handling of
>>>> this situation.
>>> I have alreadly test this problem and can confirm that it is a real problem.
>>> With 9MB swap space and 10MB mem_cgroup limit,when allocate 15MB memory,
>>> there is a probability that OOM occurs.
>> Could you be more specific about the test and the oom report?
> I actually couldn't reproduce it using 9MB of zram and a cgroup with a
> 10MB limit trying to allocate 15MB of tmpfs, no matter how many
> repetitions I do.
The following is the printout of the testcase I used. In fact, the probability
of triggering this problem is very low. You can adjust the size of the swap
space to increase the probability of recurrence.

10148+0 records in
10148+0 records out
10391552 bytes (10 MB, 9.9 MiB) copied, 0.0390954 s, 266 MB/s
mkswap: /home/swapfile: insecure permissions 0644, 0600 suggested.
Setting up swapspace version 1, size = 9.9 MiB (10387456 bytes)
no label, UUID=9219cb2a-55d7-46b6-9dcd-bb491095225d
mkswap return is 0
swapon: /home/swapfile: insecure permissions 0644, 0600 suggested.
swapon return is 0
swapoff success
10148+0 records in
10148+0 records out
10391552 bytes (10 MB, 9.9 MiB) copied, 0.0389205 s, 267 MB/s
mkswap: /home/swapfile: insecure permissions 0644, 0600 suggested.
Setting up swapspace version 1, size = 9.9 MiB (10387456 bytes)
no label, UUID=614b967a-bd87-430d-b867-6e09a8b77b27
mkswap return is 0
swapon: /home/swapfile: insecure permissions 0644, 0600 suggested.
swapon return is 0
---- in do_test---
=========orig_mem is 10428416, orig_sw_mem is 17059840
SwapCached:         3428 kB
SwapTotal:         10144 kB
SwapFree:            240 kB
rss is 7572, swap is 0
check pass
memcg_swap.sh: line 79:  6596 Killed                  cgexec -g "memory:test" ./malloc 16777216
swapoff success
10148+0 records in
10148+0 records out
10391552 bytes (10 MB, 9.9 MiB) copied, 0.0404156 s, 257 MB/s
mkswap: /home/swapfile: insecure permissions 0644, 0600 suggested.
Setting up swapspace version 1, size = 9.9 MiB (10387456 bytes)
no label, UUID=a228e988-47c1-44d5-9ce1-cd7b66e97721
mkswap return is 0
swapon: /home/swapfile: insecure permissions 0644, 0600 suggested.
swapon return is 0
---- in do_test---
=========orig_mem is 10485760, orig_sw_mem is 16834560
SwapCached:         3944 kB
SwapTotal:         10144 kB
SwapFree:              0 kB
rss is 7112, swap is 0
check fail
memcg_swap.sh: line 79:  6633 Killed                  cgexec -g "memory:test" ./malloc 16777216

This is my testcase:

memcg_swap.sh:
#!/bin/bash

_mkswap()
{
    size=${1:-10}
    swapfile="/home/swapfile"

    # clear swap
    swapoff -a
    expect_size=$(free -b | grep 'Swap' | awk '{print $2}')
    if [ $expect_size -ne 0 ]; then
        echo "$expect_size"
        echo "swapoff fail"
        return 1
    fi
    echo "swapoff success"
    rm -rf $swapfile
    dd if=/dev/zero of=$swapfile bs=1k count=10148
    mkswap $swapfile
    echo "mkswap return is $?"
    swapon $swapfile
    echo "swapon return is $?"
}

echo "----in do_pre----"
cgdelete -r "memory:test"
cgcreate -g "memory:test"
_mkswap 10

while true
do
    _mkswap 10
    echo "---- in do_test---"
    cgcreate -g "memory:test"
    echo 1 > /sys/fs/cgroup/memory/test/memory.oom_control
    echo 10M > /sys/fs/cgroup/memory/test/memory.limit_in_bytes
    cgexec -g "memory:test" ./malloc 16777216 &
    pid=$!
    sleep 3
    orig_mem=$(cat /sys/fs/cgroup/memory/test/memory.usage_in_bytes)
    orig_sw_mem=$(cat /sys/fs/cgroup/memory/test/memory.memsw.usage_in_bytes)
    echo "=========orig_mem is $orig_mem, orig_sw_mem is $orig_sw_mem"
    cat /proc/meminfo | grep Swap
    swapfree=$(cat /proc/meminfo | grep SwapFree |  awk '{print $2}')
    swapcache=$(cat /proc/meminfo | grep SwapCached |  awk '{print $2}')
    if [ $swapfree -eq 0 ]; then
        echo "==========" >> /root/free.txt
        echo "swapfree is $swapfree" >> /root/free.txt
        echo "swapcache is $swacache" >> /root/free.txt
        echo "==========" >> /root/free.txt
    fi
    rss=$(cat /proc/$pid/smaps_rollup | sed -n 2p | awk '{print $2}')
    swap=$(cat /proc/$pid/smaps_rollup | sed -n 19p | awk '{print $2}')
    echo "rss is $rss, swap is $swap"
    echo "test data==================" >> /root/data.txt
    echo "rss is $rss" >> /root/data.txt
    echo "swap is $swap" >> /root/data.txt
    echo "test end===================" >> /root/data.txt
    if [ $orig_mem -le 10485760 -a \
             "$(cat /proc/$pid/status | grep State | awk '{print $2}')" == "R" ]; then
        echo "check pass"
    else
        echo "check fail"
        kill -9 $pid
        cgdelete -r "memory:test"
        break
    fi
    kill -9 $pid
    cgdelete -r "memory:test"
done

malloc.c:
#include <stdio.h>
#include <unistd.h>
#include <string.h>
#include <fcntl.h>
#include <stdlib.h>
#include <sys/mman.h>

int main(int argc, char **argv)
{
        char *p;

        p = malloc(atoi(argv[1]));
        memset(p, 1, atoi(argv[1]));
        return 0;
}
>
>> --
>> Michal Hocko
>> SUSE Labs
> .
>
Huang, Ying Aug. 24, 2023, 8:48 a.m. UTC | #6
Yosry Ahmed <yosryahmed@google.com> writes:

> On Mon, Aug 21, 2023 at 6:54 PM Liu Shixin <liushixin2@huawei.com> wrote:
>>
>> When spaces of swap devices are exhausted, only file pages can be reclaimed.
>> But there are still some swapcache pages in anon lru list. This can lead
>> to a premature out-of-memory.
>>
>> This problem can be fixed by checking number of swapcache pages in
>> can_reclaim_anon_pages(). For memcg v2, there are swapcache stat that can
>> be used directly. For memcg v1, use total_swapcache_pages() instead, which
>> may not accurate but can solve the problem.
>
> Interesting find. I wonder if we really don't have any handling of
> this situation.
>
>>
>> Signed-off-by: Liu Shixin <liushixin2@huawei.com>
>> ---
>>  include/linux/swap.h |  6 ++++++
>>  mm/memcontrol.c      |  8 ++++++++
>>  mm/vmscan.c          | 12 ++++++++----
>>  3 files changed, 22 insertions(+), 4 deletions(-)
>>
>> diff --git a/include/linux/swap.h b/include/linux/swap.h
>> index 456546443f1f..0318e918bfa4 100644
>> --- a/include/linux/swap.h
>> +++ b/include/linux/swap.h
>> @@ -669,6 +669,7 @@ static inline void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_p
>>  }
>>
>>  extern long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg);
>> +extern long mem_cgroup_get_nr_swapcache_pages(struct mem_cgroup *memcg);
>>  extern bool mem_cgroup_swap_full(struct folio *folio);
>>  #else
>>  static inline void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry)
>> @@ -691,6 +692,11 @@ static inline long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
>>         return get_nr_swap_pages();
>>  }
>>
>> +static inline long mem_cgroup_get_nr_swapcache_pages(struct mem_cgroup *memcg)
>> +{
>> +       return total_swapcache_pages();
>> +}
>> +
>>  static inline bool mem_cgroup_swap_full(struct folio *folio)
>>  {
>>         return vm_swap_full();
>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
>> index e8ca4bdcb03c..3e578f41023e 100644
>> --- a/mm/memcontrol.c
>> +++ b/mm/memcontrol.c
>> @@ -7567,6 +7567,14 @@ long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
>>         return nr_swap_pages;
>>  }
>>
>> +long mem_cgroup_get_nr_swapcache_pages(struct mem_cgroup *memcg)
>> +{
>> +       if (mem_cgroup_disabled() || do_memsw_account())
>> +               return total_swapcache_pages();
>> +
>> +       return memcg_page_state(memcg, NR_SWAPCACHE);
>> +}
>
> Is there a reason why we cannot use NR_SWAPCACHE for cgroup v1? Isn't
> that being maintained regardless of cgroup version? It is not exposed
> in cgroup v1's memory.stat, but I don't think there is a reason we
> can't do that -- if only to document that it is being used with cgroup
> v1.
>
>
>> +
>>  bool mem_cgroup_swap_full(struct folio *folio)
>>  {
>>         struct mem_cgroup *memcg;
>> diff --git a/mm/vmscan.c b/mm/vmscan.c
>> index 7c33c5b653ef..bcb6279cbae7 100644
>> --- a/mm/vmscan.c
>> +++ b/mm/vmscan.c
>> @@ -609,13 +609,17 @@ static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg,
>>         if (memcg == NULL) {
>>                 /*
>>                  * For non-memcg reclaim, is there
>> -                * space in any swap device?
>> +                * space in any swap device or swapcache pages?
>>                  */
>> -               if (get_nr_swap_pages() > 0)
>> +               if (get_nr_swap_pages() + total_swapcache_pages() > 0)
>>                         return true;
>>         } else {
>> -               /* Is the memcg below its swap limit? */
>> -               if (mem_cgroup_get_nr_swap_pages(memcg) > 0)
>> +               /*
>> +                * Is the memcg below its swap limit or is there swapcache
>> +                * pages can be freed?
>> +                */
>> +               if (mem_cgroup_get_nr_swap_pages(memcg) +
>> +                   mem_cgroup_get_nr_swapcache_pages(memcg) > 0)
>>                         return true;
>>         }
>
> I wonder if it would be more efficient to set a bit in struct
> scan_control if we only are out of swap spaces but have swap cache
> pages, and only isolate anon pages that are in the swap cache, instead
> of isolating random anon pages. We may end up isolating pages that are
> not in the swap cache for a few iterations and wasting cycles.

Scanning swap cache directly will make the code more complex.  IIUC, the
possibility for the swap device to be used up isn't high.  If so, I
prefer the simpler implementation as that in this series.

--
Best Regards,
Huang, Ying
Yosry Ahmed Aug. 24, 2023, 6:27 p.m. UTC | #7
On Wed, Aug 23, 2023 at 8:39 PM Liu Shixin <liushixin2@huawei.com> wrote:
>
>
>
> On 2023/8/23 23:29, Yosry Ahmed wrote:
> > On Wed, Aug 23, 2023 at 6:12 AM Michal Hocko <mhocko@suse.com> wrote:
> >> On Wed 23-08-23 10:00:58, Liu Shixin wrote:
> >>>
> >>> On 2023/8/23 0:35, Yosry Ahmed wrote:
> >>>> On Mon, Aug 21, 2023 at 6:54 PM Liu Shixin <liushixin2@huawei.com> wrote:
> >>>>> When spaces of swap devices are exhausted, only file pages can be reclaimed.
> >>>>> But there are still some swapcache pages in anon lru list. This can lead
> >>>>> to a premature out-of-memory.
> >>>>>
> >>>>> This problem can be fixed by checking number of swapcache pages in
> >>>>> can_reclaim_anon_pages(). For memcg v2, there are swapcache stat that can
> >>>>> be used directly. For memcg v1, use total_swapcache_pages() instead, which
> >>>>> may not accurate but can solve the problem.
> >>>> Interesting find. I wonder if we really don't have any handling of
> >>>> this situation.
> >>> I have alreadly test this problem and can confirm that it is a real problem.
> >>> With 9MB swap space and 10MB mem_cgroup limit,when allocate 15MB memory,
> >>> there is a probability that OOM occurs.
> >> Could you be more specific about the test and the oom report?
> > I actually couldn't reproduce it using 9MB of zram and a cgroup with a
> > 10MB limit trying to allocate 15MB of tmpfs, no matter how many
> > repetitions I do.
> The following is the printout of the testcase I used. In fact, the probability
> of triggering this problem is very low. You can adjust the size of the swap
> space to increase the probability of recurrence.
>

I was actually trying to reproduce this in the worst way possible:
- Tmpfs is really eager to remove pages from the swapcache once they
are swapped in and added to the page cache, unlike anon.
- With zram we skip the swapcache in the swap fault path for non-shared pages.

I tried again with anonymous pages on a disk swapfile and I can
reliably reproduce the problem with the attached script. It basically
sets up 6MB of disk swap, creates a cgroup with 10MB limit, and runs
an allocator program. The program allocates 13MB and writes to them
twice (to induce refaults), and repeats this 100 times.

Here is an OOM log:

alloc_anon invoked oom-killer: gfp_mask=0xcc0(GFP_KERNEL), order=0,
oom_score_adj=0
[ 1974.205755] CPU: 252 PID: 116849 Comm: alloc_anon Tainted: G
   O       6.5.0-smp-DEV #21
[ 1974.205758] Hardware name: Google, Inc.
Arcadia_IT_80/Arcadia_IT_80, BIOS 12.22.0 06/14/2023
[ 1974.205759] Call Trace:
[ 1974.205761]  <TASK>
[ 1974.205764]  dump_stack_lvl+0x5d/0x90
[ 1974.205770]  dump_stack+0x14/0x20
[ 1974.205772]  dump_header+0x52/0x230
[ 1974.205775]  oom_kill_process+0x103/0x200
[ 1974.205777]  out_of_memory+0x3ae/0x590
[ 1974.205779]  mem_cgroup_out_of_memory+0x110/0x150
[ 1974.205782]  try_charge_memcg+0x3f9/0x760
[ 1974.205784]  ? __folio_alloc+0x1e/0x40
[ 1974.205787]  charge_memcg+0x3d/0x180
[ 1974.205788]  __mem_cgroup_charge+0x2f/0x70
[ 1974.205790]  do_pte_missing+0x7e8/0xd10
[ 1974.205792]  handle_mm_fault+0x6a5/0xaa0
[ 1974.205795]  do_user_addr_fault+0x387/0x560
[ 1974.205798]  exc_page_fault+0x67/0x120
[ 1974.205799]  asm_exc_page_fault+0x2b/0x30
[ 1974.205802] RIP: 0033:0x40ee89
[ 1974.205804] Code: fe 7f 40 40 c5 fe 7f 40 60 48 83 c7 80 48 81 fa
00 01 00 00 76 2b 48 8d 90 80 00 00 00 48 83 e2 c0 c5 fd 7f 02 c5 fd
7f 40
[ 1974.205806] RSP: 002b:00007ffe9bf6a2d8 EFLAGS: 00010283
[ 1974.205808] RAX: 00000000007286c0 RBX: 00007ffe9bf6a4e8 RCX: 00000000007286c0
[ 1974.205809] RDX: 0000000001215fc0 RSI: 0000000000000001 RDI: 0000000001228640
[ 1974.205810] RBP: 00007ffe9bf6a310 R08: 0000000000b20951 R09: 0000000000000000
[ 1974.205811] R10: 00000000004a21e0 R11: 0000000000749000 R12: 00007ffe9bf6a4c8
[ 1974.205812] R13: 000000000049e6f0 R14: 0000000000000001 R15: 0000000000000001
[ 1974.205815]  </TASK>
[ 1974.205815] memory: usage 10240kB, limit 10240kB, failcnt 4480498
[ 1974.205817] swap: usage 6140kB, limit 9007199254740988kB, failcnt 0
[ 1974.205818] Memory cgroup stats for /a:
[ 1974.205895] anon 5222400
[ 1974.205896] file 0
[ 1974.205896] kernel 106496
[ 1974.205897] kernel_stack 0
[ 1974.205897] pagetables 45056
[ 1974.205898] sec_pagetables 0
[ 1974.205898] percpu 12384
[ 1974.205898] sock 0
[ 1974.205899] vmalloc 0
[ 1974.205899] shmem 0
[ 1974.205900] zswap 0
[ 1974.205900] zswapped 0
[ 1974.205900] file_mapped 0
[ 1974.205901] file_dirty 0
[ 1974.205901] file_writeback 122880
[ 1974.205902] swapcached 5156864
[ 1974.205902] anon_thp 2097152
[ 1974.205902] file_thp 0
[ 1974.205903] shmem_thp 0
[ 1974.205903] inactive_anon 9396224
[ 1974.205904] active_anon 925696
[ 1974.205904] inactive_file 0
[ 1974.205904] active_file 0
[ 1974.205905] unevictable 0
[ 1974.205905] slab_reclaimable 1824
[ 1974.205906] slab_unreclaimable 7728
[ 1974.205906] slab 9552
[ 1974.205907] workingset_refault_anon 5220628
[ 1974.205907] workingset_refault_file 0
[ 1974.205907] workingset_activate_anon 212458
[ 1974.205908] workingset_activate_file 0
[ 1974.205908] workingset_restore_anon 110673
[ 1974.205909] workingset_restore_file 0
[ 1974.205909] workingset_nodereclaim 0
[ 1974.205910] pgscan 15788241
[ 1974.205910] pgsteal 5222782
[ 1974.205910] pgscan_kswapd 0
[ 1974.205911] pgscan_direct 15786393
[ 1974.205911] pgscan_khugepaged 1848
[ 1974.205912] pgsteal_kswapd 0
[ 1974.205912] pgsteal_direct 5222044
[ 1974.205912] pgsteal_khugepaged 738
[ 1974.205913] pgfault 5298719
[ 1974.205913] pgmajfault 692459
[ 1974.205914] pgrefill 350803
[ 1974.205914] pgactivate 140230
[ 1974.205914] pgdeactivate 350803
[ 1974.205915] pglazyfree 0
[ 1974.205915] pglazyfreed 0
[ 1974.205916] zswpin 0
[ 1974.205916] zswpout 0
[ 1974.205916] thp_fault_alloc 125
[ 1974.205917] thp_collapse_alloc 2
[ 1974.205917] Tasks state (memory values in pages):
[ 1974.205918] [  pid  ]   uid  tgid total_vm      rss pgtables_bytes
swapents oom_score_adj name
[ 1974.205919] [ 116849]     0 116849     3059     1722    53248
1024             0 alloc_anon
[ 1974.205922] oom-kill:constraint=CONSTRAINT_MEMCG,nodemask=(null),cpuset=/,mems_allowed=0-1,oom_memcg=/a,task_memcg=/a,task=alloc_anon,pid=10
[ 1974.205929] Memory cgroup out of memory: Killed process 116849
(alloc_anon) total-vm:12236kB, anon-rss:6888kB, file-rss:0kB,
shmem-rss:0kB,0

It shows that we are using 10MB of memory and 6MB of swap, but that's
because the ~5MB of memory in the swapcache are doubling as both
memory and swap if I understand correctly.

I could not reproduce the problem with this patch (but perhaps I
didn't try hard enough).
Yosry Ahmed Aug. 24, 2023, 6:31 p.m. UTC | #8
On Thu, Aug 24, 2023 at 1:51 AM Huang, Ying <ying.huang@intel.com> wrote:
>
> Yosry Ahmed <yosryahmed@google.com> writes:
>
> > On Mon, Aug 21, 2023 at 6:54 PM Liu Shixin <liushixin2@huawei.com> wrote:
> >>
> >> When spaces of swap devices are exhausted, only file pages can be reclaimed.
> >> But there are still some swapcache pages in anon lru list. This can lead
> >> to a premature out-of-memory.
> >>
> >> This problem can be fixed by checking number of swapcache pages in
> >> can_reclaim_anon_pages(). For memcg v2, there are swapcache stat that can
> >> be used directly. For memcg v1, use total_swapcache_pages() instead, which
> >> may not accurate but can solve the problem.
> >
> > Interesting find. I wonder if we really don't have any handling of
> > this situation.
> >
> >>
> >> Signed-off-by: Liu Shixin <liushixin2@huawei.com>
> >> ---
> >>  include/linux/swap.h |  6 ++++++
> >>  mm/memcontrol.c      |  8 ++++++++
> >>  mm/vmscan.c          | 12 ++++++++----
> >>  3 files changed, 22 insertions(+), 4 deletions(-)
> >>
> >> diff --git a/include/linux/swap.h b/include/linux/swap.h
> >> index 456546443f1f..0318e918bfa4 100644
> >> --- a/include/linux/swap.h
> >> +++ b/include/linux/swap.h
> >> @@ -669,6 +669,7 @@ static inline void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_p
> >>  }
> >>
> >>  extern long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg);
> >> +extern long mem_cgroup_get_nr_swapcache_pages(struct mem_cgroup *memcg);
> >>  extern bool mem_cgroup_swap_full(struct folio *folio);
> >>  #else
> >>  static inline void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry)
> >> @@ -691,6 +692,11 @@ static inline long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
> >>         return get_nr_swap_pages();
> >>  }
> >>
> >> +static inline long mem_cgroup_get_nr_swapcache_pages(struct mem_cgroup *memcg)
> >> +{
> >> +       return total_swapcache_pages();
> >> +}
> >> +
> >>  static inline bool mem_cgroup_swap_full(struct folio *folio)
> >>  {
> >>         return vm_swap_full();
> >> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> >> index e8ca4bdcb03c..3e578f41023e 100644
> >> --- a/mm/memcontrol.c
> >> +++ b/mm/memcontrol.c
> >> @@ -7567,6 +7567,14 @@ long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
> >>         return nr_swap_pages;
> >>  }
> >>
> >> +long mem_cgroup_get_nr_swapcache_pages(struct mem_cgroup *memcg)
> >> +{
> >> +       if (mem_cgroup_disabled() || do_memsw_account())
> >> +               return total_swapcache_pages();
> >> +
> >> +       return memcg_page_state(memcg, NR_SWAPCACHE);
> >> +}
> >
> > Is there a reason why we cannot use NR_SWAPCACHE for cgroup v1? Isn't
> > that being maintained regardless of cgroup version? It is not exposed
> > in cgroup v1's memory.stat, but I don't think there is a reason we
> > can't do that -- if only to document that it is being used with cgroup
> > v1.
> >
> >
> >> +
> >>  bool mem_cgroup_swap_full(struct folio *folio)
> >>  {
> >>         struct mem_cgroup *memcg;
> >> diff --git a/mm/vmscan.c b/mm/vmscan.c
> >> index 7c33c5b653ef..bcb6279cbae7 100644
> >> --- a/mm/vmscan.c
> >> +++ b/mm/vmscan.c
> >> @@ -609,13 +609,17 @@ static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg,
> >>         if (memcg == NULL) {
> >>                 /*
> >>                  * For non-memcg reclaim, is there
> >> -                * space in any swap device?
> >> +                * space in any swap device or swapcache pages?
> >>                  */
> >> -               if (get_nr_swap_pages() > 0)
> >> +               if (get_nr_swap_pages() + total_swapcache_pages() > 0)
> >>                         return true;
> >>         } else {
> >> -               /* Is the memcg below its swap limit? */
> >> -               if (mem_cgroup_get_nr_swap_pages(memcg) > 0)
> >> +               /*
> >> +                * Is the memcg below its swap limit or is there swapcache
> >> +                * pages can be freed?
> >> +                */
> >> +               if (mem_cgroup_get_nr_swap_pages(memcg) +
> >> +                   mem_cgroup_get_nr_swapcache_pages(memcg) > 0)
> >>                         return true;
> >>         }
> >
> > I wonder if it would be more efficient to set a bit in struct
> > scan_control if we only are out of swap spaces but have swap cache
> > pages, and only isolate anon pages that are in the swap cache, instead
> > of isolating random anon pages. We may end up isolating pages that are
> > not in the swap cache for a few iterations and wasting cycles.
>
> Scanning swap cache directly will make the code more complex.  IIUC, the
> possibility for the swap device to be used up isn't high.  If so, I
> prefer the simpler implementation as that in this series.

I did not mean that, sorry if I wasn't clear. I meant to set a bit in
struct scan_control, and then in isolate_lru_folios() for anon lrus,
we can skip isolating folios that are not in the swapcache if that bit
is set.

My main concern was that if we have a few pages in the swapcache we
may end up wasting cycles scanning through a lot of other anonymous
pages until we reach them. If that's too much complexity that's
understandable.

>
> --
> Best Regards,
> Huang, Ying
Huang, Ying Aug. 25, 2023, 12:47 a.m. UTC | #9
Yosry Ahmed <yosryahmed@google.com> writes:

> On Thu, Aug 24, 2023 at 1:51 AM Huang, Ying <ying.huang@intel.com> wrote:
>>
>> Yosry Ahmed <yosryahmed@google.com> writes:
>>
>> > On Mon, Aug 21, 2023 at 6:54 PM Liu Shixin <liushixin2@huawei.com> wrote:
>> >>
>> >> When spaces of swap devices are exhausted, only file pages can be reclaimed.
>> >> But there are still some swapcache pages in anon lru list. This can lead
>> >> to a premature out-of-memory.
>> >>
>> >> This problem can be fixed by checking number of swapcache pages in
>> >> can_reclaim_anon_pages(). For memcg v2, there are swapcache stat that can
>> >> be used directly. For memcg v1, use total_swapcache_pages() instead, which
>> >> may not accurate but can solve the problem.
>> >
>> > Interesting find. I wonder if we really don't have any handling of
>> > this situation.
>> >
>> >>
>> >> Signed-off-by: Liu Shixin <liushixin2@huawei.com>
>> >> ---
>> >>  include/linux/swap.h |  6 ++++++
>> >>  mm/memcontrol.c      |  8 ++++++++
>> >>  mm/vmscan.c          | 12 ++++++++----
>> >>  3 files changed, 22 insertions(+), 4 deletions(-)
>> >>
>> >> diff --git a/include/linux/swap.h b/include/linux/swap.h
>> >> index 456546443f1f..0318e918bfa4 100644
>> >> --- a/include/linux/swap.h
>> >> +++ b/include/linux/swap.h
>> >> @@ -669,6 +669,7 @@ static inline void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_p
>> >>  }
>> >>
>> >>  extern long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg);
>> >> +extern long mem_cgroup_get_nr_swapcache_pages(struct mem_cgroup *memcg);
>> >>  extern bool mem_cgroup_swap_full(struct folio *folio);
>> >>  #else
>> >>  static inline void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry)
>> >> @@ -691,6 +692,11 @@ static inline long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
>> >>         return get_nr_swap_pages();
>> >>  }
>> >>
>> >> +static inline long mem_cgroup_get_nr_swapcache_pages(struct mem_cgroup *memcg)
>> >> +{
>> >> +       return total_swapcache_pages();
>> >> +}
>> >> +
>> >>  static inline bool mem_cgroup_swap_full(struct folio *folio)
>> >>  {
>> >>         return vm_swap_full();
>> >> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
>> >> index e8ca4bdcb03c..3e578f41023e 100644
>> >> --- a/mm/memcontrol.c
>> >> +++ b/mm/memcontrol.c
>> >> @@ -7567,6 +7567,14 @@ long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
>> >>         return nr_swap_pages;
>> >>  }
>> >>
>> >> +long mem_cgroup_get_nr_swapcache_pages(struct mem_cgroup *memcg)
>> >> +{
>> >> +       if (mem_cgroup_disabled() || do_memsw_account())
>> >> +               return total_swapcache_pages();
>> >> +
>> >> +       return memcg_page_state(memcg, NR_SWAPCACHE);
>> >> +}
>> >
>> > Is there a reason why we cannot use NR_SWAPCACHE for cgroup v1? Isn't
>> > that being maintained regardless of cgroup version? It is not exposed
>> > in cgroup v1's memory.stat, but I don't think there is a reason we
>> > can't do that -- if only to document that it is being used with cgroup
>> > v1.
>> >
>> >
>> >> +
>> >>  bool mem_cgroup_swap_full(struct folio *folio)
>> >>  {
>> >>         struct mem_cgroup *memcg;
>> >> diff --git a/mm/vmscan.c b/mm/vmscan.c
>> >> index 7c33c5b653ef..bcb6279cbae7 100644
>> >> --- a/mm/vmscan.c
>> >> +++ b/mm/vmscan.c
>> >> @@ -609,13 +609,17 @@ static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg,
>> >>         if (memcg == NULL) {
>> >>                 /*
>> >>                  * For non-memcg reclaim, is there
>> >> -                * space in any swap device?
>> >> +                * space in any swap device or swapcache pages?
>> >>                  */
>> >> -               if (get_nr_swap_pages() > 0)
>> >> +               if (get_nr_swap_pages() + total_swapcache_pages() > 0)
>> >>                         return true;
>> >>         } else {
>> >> -               /* Is the memcg below its swap limit? */
>> >> -               if (mem_cgroup_get_nr_swap_pages(memcg) > 0)
>> >> +               /*
>> >> +                * Is the memcg below its swap limit or is there swapcache
>> >> +                * pages can be freed?
>> >> +                */
>> >> +               if (mem_cgroup_get_nr_swap_pages(memcg) +
>> >> +                   mem_cgroup_get_nr_swapcache_pages(memcg) > 0)
>> >>                         return true;
>> >>         }
>> >
>> > I wonder if it would be more efficient to set a bit in struct
>> > scan_control if we only are out of swap spaces but have swap cache
>> > pages, and only isolate anon pages that are in the swap cache, instead
>> > of isolating random anon pages. We may end up isolating pages that are
>> > not in the swap cache for a few iterations and wasting cycles.
>>
>> Scanning swap cache directly will make the code more complex.  IIUC, the
>> possibility for the swap device to be used up isn't high.  If so, I
>> prefer the simpler implementation as that in this series.
>
> I did not mean that, sorry if I wasn't clear. I meant to set a bit in
> struct scan_control, and then in isolate_lru_folios() for anon lrus,
> we can skip isolating folios that are not in the swapcache if that bit
> is set.
>
> My main concern was that if we have a few pages in the swapcache we
> may end up wasting cycles scanning through a lot of other anonymous
> pages until we reach them. If that's too much complexity that's
> understandable.

Sorry, I misunderstood your idea.  This sounds reasonable to me.  We can
check swap space and swap cache in isolate_lru_folios(), either in
isolate_lru_folios() directly, or via a bit in scan_control.

--
Best Regards,
Huang, Ying
diff mbox series

Patch

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 456546443f1f..0318e918bfa4 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -669,6 +669,7 @@  static inline void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_p
 }
 
 extern long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg);
+extern long mem_cgroup_get_nr_swapcache_pages(struct mem_cgroup *memcg);
 extern bool mem_cgroup_swap_full(struct folio *folio);
 #else
 static inline void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry)
@@ -691,6 +692,11 @@  static inline long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
 	return get_nr_swap_pages();
 }
 
+static inline long mem_cgroup_get_nr_swapcache_pages(struct mem_cgroup *memcg)
+{
+	return total_swapcache_pages();
+}
+
 static inline bool mem_cgroup_swap_full(struct folio *folio)
 {
 	return vm_swap_full();
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e8ca4bdcb03c..3e578f41023e 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -7567,6 +7567,14 @@  long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
 	return nr_swap_pages;
 }
 
+long mem_cgroup_get_nr_swapcache_pages(struct mem_cgroup *memcg)
+{
+	if (mem_cgroup_disabled() || do_memsw_account())
+		return total_swapcache_pages();
+
+	return memcg_page_state(memcg, NR_SWAPCACHE);
+}
+
 bool mem_cgroup_swap_full(struct folio *folio)
 {
 	struct mem_cgroup *memcg;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 7c33c5b653ef..bcb6279cbae7 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -609,13 +609,17 @@  static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg,
 	if (memcg == NULL) {
 		/*
 		 * For non-memcg reclaim, is there
-		 * space in any swap device?
+		 * space in any swap device or swapcache pages?
 		 */
-		if (get_nr_swap_pages() > 0)
+		if (get_nr_swap_pages() + total_swapcache_pages() > 0)
 			return true;
 	} else {
-		/* Is the memcg below its swap limit? */
-		if (mem_cgroup_get_nr_swap_pages(memcg) > 0)
+		/*
+		 * Is the memcg below its swap limit or is there swapcache
+		 * pages can be freed?
+		 */
+		if (mem_cgroup_get_nr_swap_pages(memcg) +
+		    mem_cgroup_get_nr_swapcache_pages(memcg) > 0)
 			return true;
 	}