diff mbox series

mm: throttle and inc min_seq when both page types reach MIN_NR_GENS

Message ID 20241009074953.608591-1-zhaoyang.huang@unisoc.com (mailing list archive)
State New
Headers show
Series mm: throttle and inc min_seq when both page types reach MIN_NR_GENS | expand

Commit Message

zhaoyang.huang Oct. 9, 2024, 7:49 a.m. UTC
From: Zhaoyang Huang <zhaoyang.huang@unisoc.com>

The test case in [1] leads to a system hang, which is caused by a local
watchdog thread being starved for over 20s on a 5.5GB RAM ANDROID15 (v6.6)
system. This commit solves the issue by throttling the reclaimer and
increasing min_seq when both page types reach MIN_NR_GENS, a situation
which may otherwise introduce a livelock of switching types while holding
lruvec->lru_lock.

[1]
Launch the command below 8 times simultaneously; each instance allocates
1GB of virtual memory and accesses it from user space.
$ costmem -c1024000 -b12800 -o0 &

Signed-off-by: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
---
 mm/vmscan.c | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

Comments

Andrew Morton Oct. 9, 2024, 8:52 p.m. UTC | #1
On Wed, 9 Oct 2024 15:49:53 +0800 "zhaoyang.huang" <zhaoyang.huang@unisoc.com> wrote:

> From: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
> 
> The test case of [1] leads to system hang which caused by a local
> watchdog thread starved over 20s on a 5.5GB RAM ANDROID15(v6.6)
> system. This commit solve the issue by have the reclaimer be throttled
> and increase min_seq if both page types reach MIN_NR_GENS, which may
> introduce a livelock of switching type with holding lruvec->lru_lock.
> 
> [1]
> launch below script 8 times simutanously which allocates 1GB virtual
> memory and access it from user space by each thread.
> $ costmem -c1024000 -b12800 -o0 &
> 

That looks like a pretty simple testcase.  Do people know where to get
`costmem' from?

> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c

This is a somewhat serious issue, so I'll add the patch for some
testing, but I'll await feedback from MGLRU developers before
proceeding further, thanks.
Zhaoyang Huang Oct. 10, 2024, 1:28 a.m. UTC | #2
On Thu, Oct 10, 2024 at 4:52 AM Andrew Morton <akpm@linux-foundation.org> wrote:
>
> On Wed, 9 Oct 2024 15:49:53 +0800 "zhaoyang.huang" <zhaoyang.huang@unisoc.com> wrote:
>
> > From: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
> >
> > The test case of [1] leads to system hang which caused by a local
> > watchdog thread starved over 20s on a 5.5GB RAM ANDROID15(v6.6)
> > system. This commit solve the issue by have the reclaimer be throttled
> > and increase min_seq if both page types reach MIN_NR_GENS, which may
> > introduce a livelock of switching type with holding lruvec->lru_lock.
> >
> > [1]
> > launch below script 8 times simutanously which allocates 1GB virtual
> > memory and access it from user space by each thread.
> > $ costmem -c1024000 -b12800 -o0 &
> >
>
> That looks like a pretty simple testcase.  Do people know where to get
> `costmem' from?
Sorry, I have just become aware that this is an internal test tool
integrated into the SDK by our folks. Here is an old version of costmem
which I can share:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <errno.h>

/* Size of each allocated block, in KB; set by the -b option (default 64 KB). */
unsigned int block_size = 64;
/* Total amount of memory to consume, in KB; set by the -c option (default 1024 * 1024 KB). */
unsigned int cost_size = 1024 * 1024;
/* Value written to /proc/<pid>/oom_adj; set by the -o option, main() accepts -16..15. */
int oom_adj = 15;

/*
 * Print the command-line synopsis and one example invocation to stdout.
 */
static void usage(void)
{
	static const char *const help_lines[] = {
		"Usage:\n",
		"  costmem [-ccost_size(KB) -bblock_size(KB) -oOom_adj(-16 to 15)]\n",
		"  such as: costmem -c2048 -b128 -o15\n",
	};
	size_t idx;

	for (idx = 0; idx < sizeof(help_lines) / sizeof(help_lines[0]); idx++)
		fputs(help_lines[idx], stdout);
}

/*
 * Parse a complete base-10 integer from @s into @out.
 *
 * Returns 0 on success, or -1 when @s contains no digits, has trailing
 * characters after the number, or does not fit in a long.
 */
static int parse_long(const char *s, long *out)
{
	char *end;
	long v;

	errno = 0;
	v = strtol(s, &end, 10);
	if (end == s || *end != '\0' || errno == ERANGE)
		return -1;

	*out = v;
	return 0;
}

/*
 * Parse the -c, -b and -o options from the command line and store them in
 * the globals cost_size, block_size and oom_adj respectively.
 *
 * Unlike atoi(), which silently maps garbage to 0 and lets a negative size
 * wrap to a huge unsigned value, every option argument is validated here;
 * a malformed or out-of-range value prints a diagnostic to stderr and
 * terminates the program with EXIT_FAILURE.  Unknown options are skipped.
 *
 * @argc: argument count as passed to main()
 * @argv: argument vector as passed to main()
 */
void process_options(int argc, char **argv)
{
	int opt;
	long v;

	while ((opt = getopt(argc, argv, "c:b:o:")) != -1) {
		switch (opt) {
		case 'c':
		case 'b':
			/* Sizes must be non-negative and fit in unsigned int. */
			if (parse_long(optarg, &v) != 0 || v < 0 ||
			    (unsigned long)v > (unsigned int)-1) {
				fprintf(stderr, "costmem: invalid value '%s' for -%c\n",
					optarg, opt);
				exit(EXIT_FAILURE);
			}
			if (opt == 'c')
				cost_size = (unsigned int)v;
			else
				block_size = (unsigned int)v;
			break;
		case 'o':
			/* Reject values that would be truncated when stored in an int. */
			if (parse_long(optarg, &v) != 0 || (long)(int)v != v) {
				fprintf(stderr, "costmem: invalid value '%s' for -%c\n",
					optarg, opt);
				exit(EXIT_FAILURE);
			}
			oom_adj = (int)v;
			break;
		default:
			break;
		}
	}
}

/*
 * costmem entry point.
 *
 * Parses the options, writes the requested oom_adj value to
 * /proc/<pid>/oom_adj, then allocates cost_size KB of memory in chunks of
 * block_size KB.  Every chunk is mlock()ed and zero-filled so that it is
 * actually backed by memory.  Afterwards the process sleeps forever, keeping
 * the memory held until it is killed.
 *
 * Returns -EINVAL on bad usage or on any setup/allocation failure; it never
 * returns on success because of the final endless loop.
 */
int main(int argc, char *argv[])
{
	unsigned int i, max;
	unsigned long long total_kb;
	size_t block_bytes;
	char *memory;
	int rval = -EINVAL;
	char text[128] = {0};
	int fd;
	pid_t pid = getpid();

	/*
	 * Zero or one argument never starts a run (same as before), but always
	 * print the usage instead of exiting silently on a non-"help" argument.
	 */
	if (argc <= 2) {
		usage();
		return rval;
	}

	process_options(argc, argv);
	if (oom_adj < -16 || oom_adj > 15) {
		printf("Oom_adj must between -16 to 15\n");
		return rval;
	}

	/* block_size is the divisor below and the allocation size: 0 is invalid. */
	if (block_size == 0) {
		printf("block_size must be greater than 0\n");
		return rval;
	}

	/* Compute the block size in bytes in size_t to avoid unsigned int wrap. */
	if (block_size > (size_t)-1 / 1024) {
		printf("block_size is too large\n");
		return rval;
	}
	block_bytes = (size_t)block_size * 1024;

	snprintf(text, sizeof(text), "/proc/%d/oom_adj", (int)pid);

	fd = open(text, O_WRONLY);
	if (fd == -1) {
		perror("open");
		return rval;
	}

	snprintf(text, sizeof(text), "%d", oom_adj);
	/* A failed write is reported but does not stop the run. */
	if (write(fd, text, strlen(text)) == -1)
		perror("write");
	close(fd);

	printf("Cost mem %u KB, %u KB per Block, oom_adj %d\n", cost_size,
	       block_size, oom_adj);

	max = cost_size / block_size;

	for (i = 0; i < max; i++) {
		unsigned int done = i + 1;	/* number of blocks after this one */

		/* The block is deliberately never freed: holding it is the point. */
		memory = malloc(block_bytes);
		if (memory == NULL) {
			perror("malloc");
			return rval;
		}

		if (mlock(memory, block_bytes) == -1) {
			perror("mlock");
			return rval;
		}

		memset(memory, 0, block_bytes);

		printf("%lluKB,", (unsigned long long)block_size * done);
		if (9 == done % 10)
			printf("\n");
	}

	/* Report what was really allocated: max blocks, not max + 1. */
	total_kb = (unsigned long long)block_size * max;
	printf("Have malloc and mlock %llu KB mem\n", total_kb);
	printf("Have malloc and mlock %llu KB mem\n", total_kb);
	printf("Have malloc and mlock %llu KB mem\n", total_kb);

	i = 0;
	while (1) {
		sleep(20);
		i++;
		printf(".");
		if (9 == i % 10)
			printf("Please Ctrl+c to kill this APP\n");
		/* stdout may be line buffered; make each progress dot visible. */
		fflush(stdout);
	}
	return 0;	/* not reached */
}

>
> > --- a/mm/vmscan.c
> > +++ b/mm/vmscan.c
>
> This is a somewhat serious issue, so I'll add the patch for some
> testing, but I'll await feedback from MGLRU developers before
> proceeeding further, thanks.
IMHO, MGLRU currently lacks a mechanism like 'too_many_isolated';
should we add one in this way or in some other way?
>
Yu Zhao Oct. 10, 2024, 4:37 p.m. UTC | #3
On Wed, Oct 9, 2024 at 1:50 AM zhaoyang.huang <zhaoyang.huang@unisoc.com> wrote:
>
> From: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
>
> The test case of [1] leads to system hang which caused by a local
> watchdog thread starved over 20s on a 5.5GB RAM ANDROID15(v6.6)
> system. This commit solve the issue by have the reclaimer be throttled
> and increase min_seq if both page types reach MIN_NR_GENS, which may
> introduce a livelock of switching type with holding lruvec->lru_lock.
>
> [1]
> launch below script 8 times simutanously which allocates 1GB virtual
> memory and access it from user space by each thread.
> $ costmem -c1024000 -b12800 -o0 &
>
> Signed-off-by: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
> ---
>  mm/vmscan.c | 16 ++++++++++++++--
>  1 file changed, 14 insertions(+), 2 deletions(-)
>
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index cfa839284b92..83e450d0ce3c 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -4384,11 +4384,23 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
>         int remaining = MAX_LRU_BATCH;
>         struct lru_gen_folio *lrugen = &lruvec->lrugen;
>         struct mem_cgroup *memcg = lruvec_memcg(lruvec);
> +       struct pglist_data *pgdat = lruvec_pgdat(lruvec);
>
>         VM_WARN_ON_ONCE(!list_empty(list));
>
> -       if (get_nr_gens(lruvec, type) == MIN_NR_GENS)
> -               return 0;
> +       if (get_nr_gens(lruvec, type) == MIN_NR_GENS) {
> +               /*
> +                * throttle for a while and then increase the min_seq since
> +                * both page types reach the limit.
> +                */

Sorry but this isn't going to work because in try_to_inc_min_seq(), there is
   `while (min_seq[type] + MIN_NR_GENS <= lrugen->max_seq) {`
to prevent reclaimers from evicting hot memory -- they need to do aging first.

>
> +               if (get_nr_gens(lruvec, !type) == MIN_NR_GENS) {
> +                       spin_unlock_irq(&lruvec->lru_lock);
> +                       reclaim_throttle(pgdat, VMSCAN_THROTTLE_ISOLATED);
> +                       spin_lock_irq(&lruvec->lru_lock);
> +                       try_to_inc_min_seq(lruvec, get_swappiness(lruvec, sc));
> +               } else
> +                       return 0;
> +       }
>
>         gen = lru_gen_from_seq(lrugen->min_seq[type]);
>
> --
> 2.25.1
>
Zhaoyang Huang Oct. 11, 2024, 8:02 a.m. UTC | #4
On Fri, Oct 11, 2024 at 12:37 AM Yu Zhao <yuzhao@google.com> wrote:
>
> On Wed, Oct 9, 2024 at 1:50 AM zhaoyang.huang <zhaoyang.huang@unisoc.com> wrote:
> >
> > From: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
> >
> > The test case of [1] leads to system hang which caused by a local
> > watchdog thread starved over 20s on a 5.5GB RAM ANDROID15(v6.6)
> > system. This commit solve the issue by have the reclaimer be throttled
> > and increase min_seq if both page types reach MIN_NR_GENS, which may
> > introduce a livelock of switching type with holding lruvec->lru_lock.
> >
> > [1]
> > launch below script 8 times simutanously which allocates 1GB virtual
> > memory and access it from user space by each thread.
> > $ costmem -c1024000 -b12800 -o0 &
> >
> > Signed-off-by: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
> > ---
> >  mm/vmscan.c | 16 ++++++++++++++--
> >  1 file changed, 14 insertions(+), 2 deletions(-)
> >
> > diff --git a/mm/vmscan.c b/mm/vmscan.c
> > index cfa839284b92..83e450d0ce3c 100644
> > --- a/mm/vmscan.c
> > +++ b/mm/vmscan.c
> > @@ -4384,11 +4384,23 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
> >         int remaining = MAX_LRU_BATCH;
> >         struct lru_gen_folio *lrugen = &lruvec->lrugen;
> >         struct mem_cgroup *memcg = lruvec_memcg(lruvec);
> > +       struct pglist_data *pgdat = lruvec_pgdat(lruvec);
> >
> >         VM_WARN_ON_ONCE(!list_empty(list));
> >
> > -       if (get_nr_gens(lruvec, type) == MIN_NR_GENS)
> > -               return 0;
> > +       if (get_nr_gens(lruvec, type) == MIN_NR_GENS) {
> > +               /*
> > +                * throttle for a while and then increase the min_seq since
> > +                * both page types reach the limit.
> > +                */
>
> Sorry but this isn't going to work because in try_to_inc_min_seq(), there is
>    `while (min_seq[type] + MIN_NR_GENS <= lrugen->max_seq) {`
> to prevent reclaimers from evicting hot memory -- they need to do aging first.
Thanks for the heads up. My assumption was that a running reclaimer
will do the aging, and that the throttled reclaimers will increase
min_seq when they are scheduled back and then move on. Or could we just
drop the lock and throttle for a while, to avoid a livelock on
'type = !type' while holding the lock?

>
> >
> > +               if (get_nr_gens(lruvec, !type) == MIN_NR_GENS) {
> > +                       spin_unlock_irq(&lruvec->lru_lock);
> > +                       reclaim_throttle(pgdat, VMSCAN_THROTTLE_ISOLATED);
> > +                       spin_lock_irq(&lruvec->lru_lock);
> > +                       try_to_inc_min_seq(lruvec, get_swappiness(lruvec, sc));
> > +               } else
> > +                       return 0;
> > +       }
> >
> >         gen = lru_gen_from_seq(lrugen->min_seq[type]);
> >
> > --
> > 2.25.1
> >
Zhaoyang Huang Oct. 12, 2024, 1:49 a.m. UTC | #5
On Fri, Oct 11, 2024 at 4:02 PM Zhaoyang Huang <huangzhaoyang@gmail.com> wrote:
>
> On Fri, Oct 11, 2024 at 12:37 AM Yu Zhao <yuzhao@google.com> wrote:
> >
> > On Wed, Oct 9, 2024 at 1:50 AM zhaoyang.huang <zhaoyang.huang@unisoc.com> wrote:
> > >
> > > From: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
> > >
> > > The test case of [1] leads to system hang which caused by a local
> > > watchdog thread starved over 20s on a 5.5GB RAM ANDROID15(v6.6)
> > > system. This commit solve the issue by have the reclaimer be throttled
> > > and increase min_seq if both page types reach MIN_NR_GENS, which may
> > > introduce a livelock of switching type with holding lruvec->lru_lock.
> > >
> > > [1]
> > > launch below script 8 times simutanously which allocates 1GB virtual
> > > memory and access it from user space by each thread.
> > > $ costmem -c1024000 -b12800 -o0 &
> > >
> > > Signed-off-by: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
> > > ---
> > >  mm/vmscan.c | 16 ++++++++++++++--
> > >  1 file changed, 14 insertions(+), 2 deletions(-)
> > >
> > > diff --git a/mm/vmscan.c b/mm/vmscan.c
> > > index cfa839284b92..83e450d0ce3c 100644
> > > --- a/mm/vmscan.c
> > > +++ b/mm/vmscan.c
> > > @@ -4384,11 +4384,23 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
> > >         int remaining = MAX_LRU_BATCH;
> > >         struct lru_gen_folio *lrugen = &lruvec->lrugen;
> > >         struct mem_cgroup *memcg = lruvec_memcg(lruvec);
> > > +       struct pglist_data *pgdat = lruvec_pgdat(lruvec);
> > >
> > >         VM_WARN_ON_ONCE(!list_empty(list));
> > >
> > > -       if (get_nr_gens(lruvec, type) == MIN_NR_GENS)
> > > -               return 0;
> > > +       if (get_nr_gens(lruvec, type) == MIN_NR_GENS) {
> > > +               /*
> > > +                * throttle for a while and then increase the min_seq since
> > > +                * both page types reach the limit.
> > > +                */
> >
> > Sorry but this isn't going to work because in try_to_inc_min_seq(), there is
> >    `while (min_seq[type] + MIN_NR_GENS <= lrugen->max_seq) {`
> > to prevent reclaimers from evicting hot memory -- they need to do aging first.
> Thanks for heads up. What I thought was assuming there is a running
> reclaimer will do the aging and the throttled reclaimers increase the
> min_seq when scheduled back and move on. Or could we just drop the
> lock and throttle for a while to avoid a livelock on 'type = !type'
> with holding the lock?
Please find below the lru_lock contention information [2] which we
got from a syzkaller test. I wonder whether patch [1], which introduces
throttling of direct reclaimers based on the number of isolated folios,
is worth discussing.

[1]
https://lore.kernel.org/all/20240716094348.2451312-1-zhaoyang.huang@unisoc.com/

[2]
[  295.163779][T8447@C5] preemptoff_warn: C5 T:<8447>syz.2.17
D:40.429ms F:295.123341s E:6.660 ms
[  295.165000][T8447@C5] preemptoff_warn: C5 enabled preempt at:
[  295.165000][T8447@C5] _raw_spin_unlock_irq+0x2c/0x5c
[  295.165000][T8447@C5] evict_folios+0x2504/0x3050
[  295.165000][T8447@C5] try_to_shrink_lruvec+0x40c/0x594
[  295.165000][T8447@C5] shrink_one+0x174/0x4cc
[  295.165000][T8447@C5] shrink_node+0x1c50/0x2088
[  295.165000][T8447@C5] do_try_to_free_pages+0x560/0xef8
[  295.165000][T8447@C5] try_to_free_pages+0x4e8/0xaf0
[  295.165000][T8447@C5] __alloc_pages_slowpath+0x92c/0x1c78
[  295.165000][T8447@C5] __alloc_pages+0x404/0x48c
[  295.166277][T298@C0] C0 T:<298>logd.writer D:42.389ms F:295.123885s
[  295.166337][T298@C0] C0 enabled IRQ at:
[  295.166337][T298@C0] _raw_spin_unlock_irq+0x20/0x5c
[  295.166337][T298@C0] evict_folios+0x2504/0x3050
[  295.166337][T298@C0] shrink_one+0x174/0x4cc
[  295.166337][T298@C0] shrink_node+0x1c50/0x2088
[  295.166337][T298@C0] do_try_to_free_pages+0x560/0xef8
[  295.166337][T298@C0] try_to_free_pages+0x4e8/0xaf0
[  295.166337][T298@C0] __alloc_pages_slowpath+0x92c/0x1c78
[  295.166337][T298@C0] __alloc_pages+0x404/0x48c
[  295.166337][T298@C0] erofs_allocpage+0x90/0xb0
[  295.167317][T298@C0] preemptoff_warn: C0 T:<298>logd.writer
D:43.424ms F:295.123888s
[  295.168484][T8210@C7] C7 T:<8210>syz-executor D:32.816ms F:295.135666s
[  295.168507][T8210@C7] C7 enabled IRQ at:
[  295.168507][T8210@C7] _raw_spin_unlock_irq+0x20/0x5c
[  295.168507][T8210@C7] evict_folios+0x2504/0x3050
[  295.168507][T8210@C7] shrink_one+0x174/0x4cc
[  295.168507][T8210@C7] shrink_node+0x1c50/0x2088
[  295.168507][T8210@C7] do_try_to_free_pages+0x560/0xef8
[  295.168507][T8210@C7] try_to_free_pages+0x4e8/0xaf0
[  295.168507][T8210@C7] __alloc_pages_slowpath+0x92c/0x1c78
[  295.168507][T8210@C7] __alloc_pages+0x404/0x48c
[  295.168507][T8210@C7] __get_free_pages+0x24/0x3c
[  295.168625][T8210@C7] preemptoff_warn: C7 T:<8210>syz-executor
D:32.956ms F:295.135666s
[  295.168645][T8210@C7] preemptoff_warn: C7 enabled preempt at:
[  295.168645][T8210@C7] _raw_spin_unlock_irq+0x2c/0x5c
[  295.168645][T8210@C7] evict_folios+0x2504/0x3050
[  295.168645][T8210@C7] try_to_shrink_lruvec+0x40c/0x594
[  295.168645][T8210@C7] shrink_one+0x174/0x4cc
[  295.168645][T8210@C7] shrink_node+0x1c50/0x2088
[  295.168645][T8210@C7] do_try_to_free_pages+0x560/0xef8
[  295.168645][T8210@C7] try_to_free_pages+0x4e8/0xaf0
[  295.168645][T8210@C7] __alloc_pages_slowpath+0x92c/0x1c78
[  295.168645][T8210@C7] __alloc_pages+0x404/0x48c
[  295.178291][T8441@C2] C2 T:<8441>syz.3.18 D:42.290ms F:295.135998s
[  295.178356][T8441@C2] C2 enabled IRQ at:
[  295.178356][T8441@C2] _raw_spin_unlock_irq+0x20/0x5c
[  295.178356][T8441@C2] evict_folios+0x2504/0x3050
[  295.178356][T8441@C2] shrink_one+0x174/0x4cc
[  295.178356][T8441@C2] shrink_node+0x1c50/0x2088
[  295.178356][T8441@C2] do_try_to_free_pages+0x560/0xef8
[  295.178356][T8441@C2] try_to_free_pages+0x4e8/0xaf0
[  295.178356][T8441@C2] __alloc_pages_slowpath+0x92c/0x1c78
[  295.178356][T8441@C2] __alloc_pages+0x404/0x48c
[  295.178356][T8441@C2] bpf_ringbuf_alloc+0x22c/0x434
[  295.179135][T8441@C2] preemptoff_warn: C2 T:<8441>syz.3.18
D:43.128ms F:295.136000s

>
> >
> > >
> > > +               if (get_nr_gens(lruvec, !type) == MIN_NR_GENS) {
> > > +                       spin_unlock_irq(&lruvec->lru_lock);
> > > +                       reclaim_throttle(pgdat, VMSCAN_THROTTLE_ISOLATED);
> > > +                       spin_lock_irq(&lruvec->lru_lock);
> > > +                       try_to_inc_min_seq(lruvec, get_swappiness(lruvec, sc));
> > > +               } else
> > > +                       return 0;
> > > +       }
> > >
> > >         gen = lru_gen_from_seq(lrugen->min_seq[type]);
> > >
> > > --
> > > 2.25.1
> > >
diff mbox series

Patch

diff --git a/mm/vmscan.c b/mm/vmscan.c
index cfa839284b92..83e450d0ce3c 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -4384,11 +4384,23 @@  static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
 	int remaining = MAX_LRU_BATCH;
 	struct lru_gen_folio *lrugen = &lruvec->lrugen;
 	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
 
 	VM_WARN_ON_ONCE(!list_empty(list));
 
-	if (get_nr_gens(lruvec, type) == MIN_NR_GENS)
-		return 0;
+	if (get_nr_gens(lruvec, type) == MIN_NR_GENS) {
+		/*
+		 * throttle for a while and then increase the min_seq since
+		 * both page types reach the limit.
+		 */
+		if (get_nr_gens(lruvec, !type) == MIN_NR_GENS) {
+			spin_unlock_irq(&lruvec->lru_lock);
+			reclaim_throttle(pgdat, VMSCAN_THROTTLE_ISOLATED);
+			spin_lock_irq(&lruvec->lru_lock);
+			try_to_inc_min_seq(lruvec, get_swappiness(lruvec, sc));
+		} else
+			return 0;
+	}
 
 	gen = lru_gen_from_seq(lrugen->min_seq[type]);