diff mbox series

[3/8] mm/vmscan: Throttle reclaim when no progress is being made

Message ID 20211022144651.19914-4-mgorman@techsingularity.net (mailing list archive)
State New, archived
Headers show
Series Remove dependency on congestion_wait in mm/ | expand

Commit Message

Mel Gorman Oct. 22, 2021, 2:46 p.m. UTC
Memcg reclaim throttles on congestion if no reclaim progress is made.
This makes little sense, it might be due to writeback or a host of
other factors.

For !memcg reclaim, it's messy. Direct reclaim primarily is throttled
in the page allocator if it is failing to make progress. Kswapd
throttles if too many pages are under writeback and marked for
immediate reclaim.

This patch explicitly throttles if reclaim is failing to make progress.

[vbabka@suse.cz: Remove redundant code]
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/mmzone.h        |  1 +
 include/trace/events/vmscan.h |  4 +++-
 mm/memcontrol.c               | 10 +---------
 mm/vmscan.c                   | 28 ++++++++++++++++++++++++++++
 4 files changed, 33 insertions(+), 10 deletions(-)

Comments

Darrick J. Wong Nov. 24, 2021, 1:19 a.m. UTC | #1
On Fri, Oct 22, 2021 at 03:46:46PM +0100, Mel Gorman wrote:
> Memcg reclaim throttles on congestion if no reclaim progress is made.
> This makes little sense, it might be due to writeback or a host of
> other factors.
> 
> For !memcg reclaim, it's messy. Direct reclaim primarily is throttled
> in the page allocator if it is failing to make progress. Kswapd
> throttles if too many pages are under writeback and marked for
> immediate reclaim.
> 
> This patch explicitly throttles if reclaim is failing to make progress.

Hi Mel,

Ever since Christoph broke swapfiles, I've been carrying around a little
fstest in my dev tree[1] that tries to exercise paging things in and out
of a swapfile.  Sadly I've been trapped in about three dozen customer
escalations for over a month, which means I haven't been able to do much
upstream in weeks.  Like submit this test upstream. :(

Now that I've finally gotten around to trying out a 5.16-rc2 build, I
notice that the runtime of this test has gone from ~5s to 2 hours.
Among other things that it does, the test sets up a cgroup with a memory
controller limiting the memory usage to 25MB, then runs a program that
tries to dirty 50MB of memory.  There's 2GB of memory in the VM, so
we're not running reclaim globally, but the cgroup gets throttled very
severely.

AFAICT the system is mostly idle, but it's difficult to tell because ps
and top also get stuck waiting for this cgroup for whatever reason.  My
uninformed speculation is that usemem_and_swapoff takes a page fault
while dirtying the 50MB memory buffer, prepares to pull a page in from
swap, tries to evict another page to stay under the memcg limit, but
that decides that it's making no progress and calls
reclaim_throttle(..., VMSCAN_THROTTLE_NOPROGRESS).

The sleep is uninterruptible, so I can't even kill -9 fstests to shut it
down.  Eventually we either finish the test or (for the mlock part) the
OOM killer actually kills the process, but this takes a very long time.

Any thoughts?  For now I can just hack around this by skipping
reclaim_throttle if cgroup_reclaim() == true, but that's probably not
the correct fix. :)

--D

[1] https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfstests-dev.git/commit/?h=test-swapfile-io&id=0d0ad843cea366d0ab0a7d8d984e5cd1deba5b43

> 
> [vbabka@suse.cz: Remove redundant code]
> Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
> Acked-by: Vlastimil Babka <vbabka@suse.cz>
> ---
>  include/linux/mmzone.h        |  1 +
>  include/trace/events/vmscan.h |  4 +++-
>  mm/memcontrol.c               | 10 +---------
>  mm/vmscan.c                   | 28 ++++++++++++++++++++++++++++
>  4 files changed, 33 insertions(+), 10 deletions(-)
> 
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 9ccd8d95291b..00e305cfb3ec 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -276,6 +276,7 @@ enum lru_list {
>  enum vmscan_throttle_state {
>  	VMSCAN_THROTTLE_WRITEBACK,
>  	VMSCAN_THROTTLE_ISOLATED,
> +	VMSCAN_THROTTLE_NOPROGRESS,
>  	NR_VMSCAN_THROTTLE,
>  };
>  
> diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
> index d4905bd9e9c4..f25a6149d3ba 100644
> --- a/include/trace/events/vmscan.h
> +++ b/include/trace/events/vmscan.h
> @@ -29,11 +29,13 @@
>  
>  #define _VMSCAN_THROTTLE_WRITEBACK	(1 << VMSCAN_THROTTLE_WRITEBACK)
>  #define _VMSCAN_THROTTLE_ISOLATED	(1 << VMSCAN_THROTTLE_ISOLATED)
> +#define _VMSCAN_THROTTLE_NOPROGRESS	(1 << VMSCAN_THROTTLE_NOPROGRESS)
>  
>  #define show_throttle_flags(flags)						\
>  	(flags) ? __print_flags(flags, "|",					\
>  		{_VMSCAN_THROTTLE_WRITEBACK,	"VMSCAN_THROTTLE_WRITEBACK"},	\
> -		{_VMSCAN_THROTTLE_ISOLATED,	"VMSCAN_THROTTLE_ISOLATED"}	\
> +		{_VMSCAN_THROTTLE_ISOLATED,	"VMSCAN_THROTTLE_ISOLATED"},	\
> +		{_VMSCAN_THROTTLE_NOPROGRESS,	"VMSCAN_THROTTLE_NOPROGRESS"}	\
>  		) : "VMSCAN_THROTTLE_NONE"
>  
>  
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 6da5020a8656..8b33152c9b85 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -3465,19 +3465,11 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
>  
>  	/* try to free all pages in this cgroup */
>  	while (nr_retries && page_counter_read(&memcg->memory)) {
> -		int progress;
> -
>  		if (signal_pending(current))
>  			return -EINTR;
>  
> -		progress = try_to_free_mem_cgroup_pages(memcg, 1,
> -							GFP_KERNEL, true);
> -		if (!progress) {
> +		if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true))
>  			nr_retries--;
> -			/* maybe some writeback is necessary */
> -			congestion_wait(BLK_RW_ASYNC, HZ/10);
> -		}
> -
>  	}
>  
>  	return 0;
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 1e54e636b927..0450f6867d61 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -3323,6 +3323,33 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
>  	return zone_watermark_ok_safe(zone, 0, watermark, sc->reclaim_idx);
>  }
>  
> +static void consider_reclaim_throttle(pg_data_t *pgdat, struct scan_control *sc)
> +{
> +	/* If reclaim is making progress, wake any throttled tasks. */
> +	if (sc->nr_reclaimed) {
> +		wait_queue_head_t *wqh;
> +
> +		wqh = &pgdat->reclaim_wait[VMSCAN_THROTTLE_NOPROGRESS];
> +		if (waitqueue_active(wqh))
> +			wake_up(wqh);
> +
> +		return;
> +	}
> +
> +	/*
> +	 * Do not throttle kswapd on NOPROGRESS as it will throttle on
> +	 * VMSCAN_THROTTLE_WRITEBACK if there are too many pages under
> +	 * writeback and marked for immediate reclaim at the tail of
> +	 * the LRU.
> +	 */
> +	if (current_is_kswapd())
> +		return;
> +
> +	/* Throttle if making no progress at high prioities. */
> +	if (sc->priority < DEF_PRIORITY - 2)
> +		reclaim_throttle(pgdat, VMSCAN_THROTTLE_NOPROGRESS, HZ/10);
> +}
> +
>  /*
>   * This is the direct reclaim path, for page-allocating processes.  We only
>   * try to reclaim pages from zones which will satisfy the caller's allocation
> @@ -3407,6 +3434,7 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
>  			continue;
>  		last_pgdat = zone->zone_pgdat;
>  		shrink_node(zone->zone_pgdat, sc);
> +		consider_reclaim_throttle(zone->zone_pgdat, sc);
>  	}
>  
>  	/*
> -- 
> 2.31.1
>
Darrick J. Wong Nov. 24, 2021, 1:49 a.m. UTC | #2
On Tue, Nov 23, 2021 at 05:19:12PM -0800, Darrick J. Wong wrote:
> On Fri, Oct 22, 2021 at 03:46:46PM +0100, Mel Gorman wrote:
> > Memcg reclaim throttles on congestion if no reclaim progress is made.
> > This makes little sense, it might be due to writeback or a host of
> > other factors.
> > 
> > For !memcg reclaim, it's messy. Direct reclaim primarily is throttled
> > in the page allocator if it is failing to make progress. Kswapd
> > throttles if too many pages are under writeback and marked for
> > immediate reclaim.
> > 
> > This patch explicitly throttles if reclaim is failing to make progress.
> 
> Hi Mel,
> 
> Ever since Christoph broke swapfiles, I've been carrying around a little
> fstest in my dev tree[1] that tries to exercise paging things in and out
> of a swapfile.  Sadly I've been trapped in about three dozen customer
> escalations for over a month, which means I haven't been able to do much
> upstream in weeks.  Like submit this test upstream. :(
> 
> Now that I've finally gotten around to trying out a 5.16-rc2 build, I
> notice that the runtime of this test has gone from ~5s to 2 hours.
> Among other things that it does, the test sets up a cgroup with a memory
> controller limiting the memory usage to 25MB, then runs a program that
> tries to dirty 50MB of memory.  There's 2GB of memory in the VM, so
> we're not running reclaim globally, but the cgroup gets throttled very
> severely.
> 
> AFAICT the system is mostly idle, but it's difficult to tell because ps
> and top also get stuck waiting for this cgroup for whatever reason.  My
> uninformed speculation is that usemem_and_swapoff takes a page fault
> while dirtying the 50MB memory buffer, prepares to pull a page in from
> swap, tries to evict another page to stay under the memcg limit, but
> that decides that it's making no progress and calls
> reclaim_throttle(..., VMSCAN_THROTTLE_NOPROGRESS).
> 
> The sleep is uninterruptible, so I can't even kill -9 fstests to shut it
> down.  Eventually we either finish the test or (for the mlock part) the
> OOM killer actually kills the process, but this takes a very long time.
> 
> Any thoughts?  For now I can just hack around this by skipping
> reclaim_throttle if cgroup_reclaim() == true, but that's probably not
> the correct fix. :)

Update: after adding timing information to usemem_and_swapoff, it looks
like dirtying the 50MB buffer takes ~22s (up from 0.06s on 5.15).  The
mlock call stalls for ~280s until the OOM killer kills it (up from
nearly instantaneous on 5.15), and the swapon/swapoff variant takes
20 minutes to hours depending on the run.

--D

> --D
> 
> [1] https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfstests-dev.git/commit/?h=test-swapfile-io&id=0d0ad843cea366d0ab0a7d8d984e5cd1deba5b43
> 
> > 
> > [vbabka@suse.cz: Remove redundant code]
> > Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
> > Acked-by: Vlastimil Babka <vbabka@suse.cz>
> > ---
> >  include/linux/mmzone.h        |  1 +
> >  include/trace/events/vmscan.h |  4 +++-
> >  mm/memcontrol.c               | 10 +---------
> >  mm/vmscan.c                   | 28 ++++++++++++++++++++++++++++
> >  4 files changed, 33 insertions(+), 10 deletions(-)
> > 
> > diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> > index 9ccd8d95291b..00e305cfb3ec 100644
> > --- a/include/linux/mmzone.h
> > +++ b/include/linux/mmzone.h
> > @@ -276,6 +276,7 @@ enum lru_list {
> >  enum vmscan_throttle_state {
> >  	VMSCAN_THROTTLE_WRITEBACK,
> >  	VMSCAN_THROTTLE_ISOLATED,
> > +	VMSCAN_THROTTLE_NOPROGRESS,
> >  	NR_VMSCAN_THROTTLE,
> >  };
> >  
> > diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
> > index d4905bd9e9c4..f25a6149d3ba 100644
> > --- a/include/trace/events/vmscan.h
> > +++ b/include/trace/events/vmscan.h
> > @@ -29,11 +29,13 @@
> >  
> >  #define _VMSCAN_THROTTLE_WRITEBACK	(1 << VMSCAN_THROTTLE_WRITEBACK)
> >  #define _VMSCAN_THROTTLE_ISOLATED	(1 << VMSCAN_THROTTLE_ISOLATED)
> > +#define _VMSCAN_THROTTLE_NOPROGRESS	(1 << VMSCAN_THROTTLE_NOPROGRESS)
> >  
> >  #define show_throttle_flags(flags)						\
> >  	(flags) ? __print_flags(flags, "|",					\
> >  		{_VMSCAN_THROTTLE_WRITEBACK,	"VMSCAN_THROTTLE_WRITEBACK"},	\
> > -		{_VMSCAN_THROTTLE_ISOLATED,	"VMSCAN_THROTTLE_ISOLATED"}	\
> > +		{_VMSCAN_THROTTLE_ISOLATED,	"VMSCAN_THROTTLE_ISOLATED"},	\
> > +		{_VMSCAN_THROTTLE_NOPROGRESS,	"VMSCAN_THROTTLE_NOPROGRESS"}	\
> >  		) : "VMSCAN_THROTTLE_NONE"
> >  
> >  
> > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > index 6da5020a8656..8b33152c9b85 100644
> > --- a/mm/memcontrol.c
> > +++ b/mm/memcontrol.c
> > @@ -3465,19 +3465,11 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
> >  
> >  	/* try to free all pages in this cgroup */
> >  	while (nr_retries && page_counter_read(&memcg->memory)) {
> > -		int progress;
> > -
> >  		if (signal_pending(current))
> >  			return -EINTR;
> >  
> > -		progress = try_to_free_mem_cgroup_pages(memcg, 1,
> > -							GFP_KERNEL, true);
> > -		if (!progress) {
> > +		if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true))
> >  			nr_retries--;
> > -			/* maybe some writeback is necessary */
> > -			congestion_wait(BLK_RW_ASYNC, HZ/10);
> > -		}
> > -
> >  	}
> >  
> >  	return 0;
> > diff --git a/mm/vmscan.c b/mm/vmscan.c
> > index 1e54e636b927..0450f6867d61 100644
> > --- a/mm/vmscan.c
> > +++ b/mm/vmscan.c
> > @@ -3323,6 +3323,33 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
> >  	return zone_watermark_ok_safe(zone, 0, watermark, sc->reclaim_idx);
> >  }
> >  
> > +static void consider_reclaim_throttle(pg_data_t *pgdat, struct scan_control *sc)
> > +{
> > +	/* If reclaim is making progress, wake any throttled tasks. */
> > +	if (sc->nr_reclaimed) {
> > +		wait_queue_head_t *wqh;
> > +
> > +		wqh = &pgdat->reclaim_wait[VMSCAN_THROTTLE_NOPROGRESS];
> > +		if (waitqueue_active(wqh))
> > +			wake_up(wqh);
> > +
> > +		return;
> > +	}
> > +
> > +	/*
> > +	 * Do not throttle kswapd on NOPROGRESS as it will throttle on
> > +	 * VMSCAN_THROTTLE_WRITEBACK if there are too many pages under
> > +	 * writeback and marked for immediate reclaim at the tail of
> > +	 * the LRU.
> > +	 */
> > +	if (current_is_kswapd())
> > +		return;
> > +
> > +	/* Throttle if making no progress at high prioities. */
> > +	if (sc->priority < DEF_PRIORITY - 2)
> > +		reclaim_throttle(pgdat, VMSCAN_THROTTLE_NOPROGRESS, HZ/10);
> > +}
> > +
> >  /*
> >   * This is the direct reclaim path, for page-allocating processes.  We only
> >   * try to reclaim pages from zones which will satisfy the caller's allocation
> > @@ -3407,6 +3434,7 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
> >  			continue;
> >  		last_pgdat = zone->zone_pgdat;
> >  		shrink_node(zone->zone_pgdat, sc);
> > +		consider_reclaim_throttle(zone->zone_pgdat, sc);
> >  	}
> >  
> >  	/*
> > -- 
> > 2.31.1
> >
Mel Gorman Nov. 24, 2021, 10:32 a.m. UTC | #3
On Tue, Nov 23, 2021 at 05:19:12PM -0800, Darrick J. Wong wrote:
> On Fri, Oct 22, 2021 at 03:46:46PM +0100, Mel Gorman wrote:
> > Memcg reclaim throttles on congestion if no reclaim progress is made.
> > This makes little sense, it might be due to writeback or a host of
> > other factors.
> > 
> > For !memcg reclaim, it's messy. Direct reclaim primarily is throttled
> > in the page allocator if it is failing to make progress. Kswapd
> > throttles if too many pages are under writeback and marked for
> > immediate reclaim.
> > 
> > This patch explicitly throttles if reclaim is failing to make progress.
> 
> Hi Mel,
> 
> Ever since Christoph broke swapfiles, I've been carrying around a little
> fstest in my dev tree[1] that tries to exercise paging things in and out
> of a swapfile.  Sadly I've been trapped in about three dozen customer
> escalations for over a month, which means I haven't been able to do much
> upstream in weeks.  Like submit this test upstream. :(
> 
> Now that I've finally gotten around to trying out a 5.16-rc2 build, I
> notice that the runtime of this test has gone from ~5s to 2 hours.
> Among other things that it does, the test sets up a cgroup with a memory
> controller limiting the memory usage to 25MB, then runs a program that
> tries to dirty 50MB of memory.  There's 2GB of memory in the VM, so
> we're not running reclaim globally, but the cgroup gets throttled very
> severely.
> 

Ok, so this test cannot make progress until some of the cgroup pages get
cleaned. What is the expectation for the test? Should it OOM or do you
expect it to have spin-like behaviour until some writeback completes?
I'm guessing you'd prefer it to spin and right now it's sleeping far
too much.

> AFAICT the system is mostly idle, but it's difficult to tell because ps
> and top also get stuck waiting for this cgroup for whatever reason. 

But this is surprising because I expect that ps and top are not running
within the cgroup. Was /proc/PID/stack readable? 

> My
> uninformed speculation is that usemem_and_swapoff takes a page fault
> while dirtying the 50MB memory buffer, prepares to pull a page in from
> swap, tries to evict another page to stay under the memcg limit, but
> that decides that it's making no progress and calls
> reclaim_throttle(..., VMSCAN_THROTTLE_NOPROGRESS).
> 
> The sleep is uninterruptible, so I can't even kill -9 fstests to shut it
> down.  Eventually we either finish the test or (for the mlock part) the
> OOM killer actually kills the process, but this takes a very long time.
> 

The sleep can be interruptible.

> Any thoughts?  For now I can just hack around this by skipping
> reclaim_throttle if cgroup_reclaim() == true, but that's probably not
> the correct fix. :)
> 

No, it wouldn't be but a possibility is throttling for only 1 jiffy if
reclaiming within a memcg and the zone is balanced overall.

The interruptible part should just be the patch below. I need to poke at
the cgroup limit part a bit

diff --git a/mm/vmscan.c b/mm/vmscan.c
index fb9584641ac7..07db03883062 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1068,7 +1068,7 @@ void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason)
 		break;
 	}
 
-	prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
+	prepare_to_wait(wqh, &wait, TASK_INTERRUPTIBLE);
 	ret = schedule_timeout(timeout);
 	finish_wait(wqh, &wait);
Vlastimil Babka Nov. 24, 2021, 10:43 a.m. UTC | #4
On 11/24/21 11:32, Mel Gorman wrote:
> On Tue, Nov 23, 2021 at 05:19:12PM -0800, Darrick J. Wong wrote:
>> On Fri, Oct 22, 2021 at 03:46:46PM +0100, Mel Gorman wrote:
>> > Memcg reclaim throttles on congestion if no reclaim progress is made.
>> > This makes little sense, it might be due to writeback or a host of
>> > other factors.
>> > 
>> > For !memcg reclaim, it's messy. Direct reclaim primarily is throttled
>> > in the page allocator if it is failing to make progress. Kswapd
>> > throttles if too many pages are under writeback and marked for
>> > immediate reclaim.
>> > 
>> > This patch explicitly throttles if reclaim is failing to make progress.
>> 
>> Hi Mel,
>> 
>> Ever since Christoph broke swapfiles, I've been carrying around a little
>> fstest in my dev tree[1] that tries to exercise paging things in and out
>> of a swapfile.  Sadly I've been trapped in about three dozen customer
>> escalations for over a month, which means I haven't been able to do much
>> upstream in weeks.  Like submit this test upstream. :(
>> 
>> Now that I've finally gotten around to trying out a 5.16-rc2 build, I
>> notice that the runtime of this test has gone from ~5s to 2 hours.
>> Among other things that it does, the test sets up a cgroup with a memory
>> controller limiting the memory usage to 25MB, then runs a program that
>> tries to dirty 50MB of memory.  There's 2GB of memory in the VM, so
>> we're not running reclaim globally, but the cgroup gets throttled very
>> severely.
>> 
> 
> Ok, so this test cannot make progress until some of the cgroup pages get
> cleaned. What is the expectation for the test? Should it OOM or do you
> expect it to have spin-like behaviour until some writeback completes?
> I'm guessing you'd prefer it to spin and right now it's sleeping far
> too much.
> 
>> AFAICT the system is mostly idle, but it's difficult to tell because ps
>> and top also get stuck waiting for this cgroup for whatever reason. 
> 
> But this is surprising because I expect that ps and top are not running
> within the cgroup. Was /proc/PID/stack readable? 
> 
>> My
>> uninformed speculation is that usemem_and_swapoff takes a page fault
>> while dirtying the 50MB memory buffer, prepares to pull a page in from
>> swap, tries to evict another page to stay under the memcg limit, but
>> that decides that it's making no progress and calls
>> reclaim_throttle(..., VMSCAN_THROTTLE_NOPROGRESS).
>> 
>> The sleep is uninterruptible, so I can't even kill -9 fstests to shut it
>> down.  Eventually we either finish the test or (for the mlock part) the
>> OOM killer actually kills the process, but this takes a very long time.
>> 
> 
> The sleep can be interruptible.
> 
>> Any thoughts?  For now I can just hack around this by skipping
>> reclaim_throttle if cgroup_reclaim() == true, but that's probably not
>> the correct fix. :)
>> 
> 
> No, it wouldn't be but a possibility is throttling for only 1 jiffy if
> reclaiming within a memcg and the zone is balanced overall.
> 
> The interruptible part should just be the patch below. I need to poke at
> the cgroup limit part a bit

As the throttle timeout is short anyway, will the TASK_UNINTERRUPTIBLE vs
TASK_INTERRUPTIBLE make a difference for the ability to kill? AFAIU
typically this inability to kill is because of a loop that doesn't check for
fatal_signal_pending().

> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index fb9584641ac7..07db03883062 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -1068,7 +1068,7 @@ void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason)
>  		break;
>  	}
>  
> -	prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
> +	prepare_to_wait(wqh, &wait, TASK_INTERRUPTIBLE);
>  	ret = schedule_timeout(timeout);
>  	finish_wait(wqh, &wait);
>  
>
Mel Gorman Nov. 24, 2021, 10:53 a.m. UTC | #5
On Wed, Nov 24, 2021 at 11:43:05AM +0100, Vlastimil Babka wrote:
> >> Any thoughts?  For now I can just hack around this by skipping
> >> reclaim_throttle if cgroup_reclaim() == true, but that's probably not
> >> the correct fix. :)
> >> 
> > 
> > No, it wouldn't be but a possibility is throttling for only 1 jiffy if
> > reclaiming within a memcg and the zone is balanced overall.
> > 
> > The interruptible part should just be the patch below. I need to poke at
> > the cgroup limit part a bit
> 
> As the throttle timeout is short anyway, will the TASK_UNINTERRUPTIBLE vs
> TASK_INTERRUPTIBLE make a difference for the ability to kill? AFAIU
> typically this inability to kill is because of a loop that doesn't check for
> fatal_signal_pending().
> 

Yep, and the fatal_signal_pending() is lacking within reclaim in general
but I'm undecided on how much that should change in the context of reclaim
throttling but at minimum, I don't want the signal delivery to be masked
or delayed.
Mel Gorman Nov. 24, 2021, 2:35 p.m. UTC | #6
On Tue, Nov 23, 2021 at 05:49:14PM -0800, Darrick J. Wong wrote:
> > Ever since Christoph broke swapfiles, I've been carrying around a little
> > fstest in my dev tree[1] that tries to exercise paging things in and out
> > of a swapfile.  Sadly I've been trapped in about three dozen customer
> > escalations for over a month, which means I haven't been able to do much
> > upstream in weeks.  Like submit this test upstream. :(
> > 
> > Now that I've finally gotten around to trying out a 5.16-rc2 build, I
> > notice that the runtime of this test has gone from ~5s to 2 hours.
> > Among other things that it does, the test sets up a cgroup with a memory
> > controller limiting the memory usage to 25MB, then runs a program that
> > tries to dirty 50MB of memory.  There's 2GB of memory in the VM, so
> > we're not running reclaim globally, but the cgroup gets throttled very
> > severely.
> > 
> > AFAICT the system is mostly idle, but it's difficult to tell because ps
> > and top also get stuck waiting for this cgroup for whatever reason.  My
> > uninformed spculation is that usemem_and_swapoff takes a page fault
> > while dirtying the 50MB memory buffer, prepares to pull a page in from
> > swap, tries to evict another page to stay under the memcg limit, but
> > that decides that it's making no progress and calls
> > reclaim_throttle(..., VMSCAN_THROTTLE_NOPROGRESS).
> > 
> > The sleep is uninterruptible, so I can't even kill -9 fstests to shut it
> > down.  Eventually we either finish the test or (for the mlock part) the
> > OOM killer actually kills the process, but this takes a very long time.
> > 
> > Any thoughts?  For now I can just hack around this by skipping
> > reclaim_throttle if cgroup_reclaim() == true, but that's probably not
> > the correct fix. :)
> 
> Update: after adding timing information to usemem_and_swapoff, it looks
> like dirtying the 50MB buffer takes ~22s (up from 0.06s on 5.15).  The
> mlock call stalls for ~280s until the OOM killer kills it (up from
> nearly instantaneous on 5.15), and the swapon/swapoff variant takes
> 20 minutes to hours depending on the run.
> 

Can you try the patch below please? I think I'm running the test
correctly and it finishes for me in 16 seconds with this applied

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 07db03883062..d9166e94eb95 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1057,7 +1057,17 @@ void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason)
 
 		break;
 	case VMSCAN_THROTTLE_NOPROGRESS:
-		timeout = HZ/2;
+		timeout = 1;
+
+		/*
+		 * If kswapd is disabled, reschedule if necessary but do not
+		 * throttle as the system is likely near OOM.
+		 */
+		if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) {
+			cond_resched();
+			return;
+		}
+
 		break;
 	case VMSCAN_THROTTLE_ISOLATED:
 		timeout = HZ/50;
@@ -3395,7 +3405,7 @@ static void consider_reclaim_throttle(pg_data_t *pgdat, struct scan_control *sc)
 		return;
 
 	/* Throttle if making no progress at high prioities. */
-	if (sc->priority < DEF_PRIORITY - 2)
+	if (sc->priority < DEF_PRIORITY - 2 && !sc->nr_reclaimed)
 		reclaim_throttle(pgdat, VMSCAN_THROTTLE_NOPROGRESS);
 }
 
@@ -3415,6 +3425,7 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
 	unsigned long nr_soft_scanned;
 	gfp_t orig_mask;
 	pg_data_t *last_pgdat = NULL;
+	pg_data_t *first_pgdat = NULL;
 
 	/*
 	 * If the number of buffer_heads in the machine exceeds the maximum
@@ -3478,14 +3489,18 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
 			/* need some check for avoid more shrink_zone() */
 		}
 
+		if (!first_pgdat)
+			first_pgdat = zone->zone_pgdat;
+
 		/* See comment about same check for global reclaim above */
 		if (zone->zone_pgdat == last_pgdat)
 			continue;
 		last_pgdat = zone->zone_pgdat;
 		shrink_node(zone->zone_pgdat, sc);
-		consider_reclaim_throttle(zone->zone_pgdat, sc);
 	}
 
+	consider_reclaim_throttle(first_pgdat, sc);
+
 	/*
 	 * Restore to original mask to avoid the impact on the caller if we
 	 * promoted it to __GFP_HIGHMEM.
Mike Galbraith Nov. 24, 2021, 5:24 p.m. UTC | #7
On Wed, 2021-11-24 at 10:32 +0000, Mel Gorman wrote:
> On Tue, Nov 23, 2021 at 05:19:12PM -0800, Darrick J. Wong wrote:
>
> > AFAICT the system is mostly idle, but it's difficult to tell because ps
> > and top also get stuck waiting for this cgroup for whatever reason.
>
> But this is surprising because I expect that ps and top are not running
> within the cgroup. Was /proc/PID/stack readable?

Probably this.

crash> ps | grep UN
   4418   4417   4  ffff8881cae66e40  UN   0.0    7620    980  memcg_test_1 <== the bad guy
   4419   4417   6  ffff8881cae62f40  UN   0.0    7620    980  memcg_test_1
   4420   4417   5  ffff8881cae65e80  UN   0.0    7620    980  memcg_test_1
   4421   4417   7  ffff8881cae63f00  UN   0.0    7620    980  memcg_test_1
   4422   4417   4  ffff8881cae60000  UN   0.0    7620    980  memcg_test_1
   4423   4417   3  ffff888128985e80  UN   0.0    7620    980  memcg_test_1
   4424   4417   7  ffff888117f79f80  UN   0.0    7620    980  memcg_test_1
   4425   4417   2  ffff888117f7af40  UN   0.0    7620    980  memcg_test_1
   4428   2791   6  ffff8881a8253f00  UN   0.0   38868   3568  ps
   4429   2808   4  ffff888100c90000  UN   0.0   38868   3600  ps
crash> bt -sx 4429
PID: 4429   TASK: ffff888100c90000  CPU: 4   COMMAND: "ps"
 #0 [ffff8881af1c3ce0] __schedule+0x285 at ffffffff817ae6c5
 #1 [ffff8881af1c3d68] schedule+0x3a at ffffffff817aed4a
 #2 [ffff8881af1c3d78] rwsem_down_read_slowpath+0x197 at ffffffff817b11a7
 #3 [ffff8881af1c3e08] down_read_killable+0x5c at ffffffff817b142c
 #4 [ffff8881af1c3e18] down_read_killable+0x5c at ffffffff817b142c
 #5 [ffff8881af1c3e28] __access_remote_vm+0x3f at ffffffff8120131f
 #6 [ffff8881af1c3e90] proc_pid_cmdline_read+0x148 at ffffffff812fc9a8
 #7 [ffff8881af1c3ee8] vfs_read+0x92 at ffffffff8126a302
 #8 [ffff8881af1c3f00] ksys_read+0x7d at ffffffff8126a72d
 #9 [ffff8881af1c3f38] do_syscall_64+0x37 at ffffffff817a3f57
#10 [ffff8881af1c3f50] entry_SYSCALL_64_after_hwframe+0x44 at ffffffff8180007c
    RIP: 00007f4b50fe8b5e  RSP: 00007ffdd7f6fe38  RFLAGS: 00000246
    RAX: ffffffffffffffda  RBX: 00007f4b5186a010  RCX: 00007f4b50fe8b5e
    RDX: 0000000000020000  RSI: 00007f4b5186a010  RDI: 0000000000000006
    RBP: 0000000000020000   R8: 0000000000000007   R9: 00000000ffffffff
    R10: 0000000000000000  R11: 0000000000000246  R12: 00007f4b5186a010
    R13: 0000000000000000  R14: 0000000000000006  R15: 0000000000000000
    ORIG_RAX: 0000000000000000  CS: 0033  SS: 002b
crash> mm_struct -x ffff8881021b4800
struct mm_struct {
  {
    mmap = 0xffff8881ccfe6a80,
    mm_rb = {
      rb_node = 0xffff8881ccfe61a0
    },
...
    mmap_lock = {
      count = {
        counter = 0x3
      },
      owner = {
        counter = 0xffff8881cae66e40
...
crash> bt 0xffff8881cae66e40
PID: 4418   TASK: ffff8881cae66e40  CPU: 4   COMMAND: "memcg_test_1"
 #0 [ffff888154097a88] __schedule at ffffffff817ae6c5
 #1 [ffff888154097b10] schedule at ffffffff817aed4a
 #2 [ffff888154097b20] schedule_timeout at ffffffff817b311f
 #3 [ffff888154097b90] reclaim_throttle at ffffffff811d802b
 #4 [ffff888154097bf0] do_try_to_free_pages at ffffffff811da206
 #5 [ffff888154097c40] try_to_free_mem_cgroup_pages at ffffffff811db522
 #6 [ffff888154097cd0] try_charge_memcg at ffffffff81256440
 #7 [ffff888154097d60] obj_cgroup_charge_pages at ffffffff81256c97
 #8 [ffff888154097d88] obj_cgroup_charge at ffffffff8125898c
 #9 [ffff888154097da8] kmem_cache_alloc at ffffffff81242099
#10 [ffff888154097de0] vm_area_alloc at ffffffff8106c87a
#11 [ffff888154097df0] mmap_region at ffffffff812082b2
#12 [ffff888154097e58] do_mmap at ffffffff81208922
#13 [ffff888154097eb0] vm_mmap_pgoff at ffffffff811e259f
#14 [ffff888154097f38] do_syscall_64 at ffffffff817a3f57
#15 [ffff888154097f50] entry_SYSCALL_64_after_hwframe at
ffffffff8180007c
    RIP: 00007f211c36b743  RSP: 00007ffeaac1bd58  RFLAGS: 00000246
    RAX: ffffffffffffffda  RBX: 0000000000000000  RCX: 00007f211c36b743
    RDX: 0000000000000003  RSI: 0000000000001000  RDI: 0000000000000000
    RBP: 0000000000000000   R8: 0000000000000000   R9: 0000000000000000
    R10: 0000000000002022  R11: 0000000000000246  R12: 0000000000000003
    R13: 0000000000001000  R14: 0000000000002022  R15: 0000000000000000
    ORIG_RAX: 0000000000000009  CS: 0033  SS: 002b
crash>
Darrick J. Wong Nov. 24, 2021, 6:02 p.m. UTC | #8
On Wed, Nov 24, 2021 at 02:35:59PM +0000, Mel Gorman wrote:
> On Tue, Nov 23, 2021 at 05:49:14PM -0800, Darrick J. Wong wrote:
> > > Ever since Christoph broke swapfiles, I've been carrying around a little
> > > fstest in my dev tree[1] that tries to exercise paging things in and out
> > > of a swapfile.  Sadly I've been trapped in about three dozen customer
> > > escalations for over a month, which means I haven't been able to do much
> > > upstream in weeks.  Like submit this test upstream. :(
> > > 
> > > Now that I've finally gotten around to trying out a 5.16-rc2 build, I
> > > notice that the runtime of this test has gone from ~5s to 2 hours.
> > > Among other things that it does, the test sets up a cgroup with a memory
> > > controller limiting the memory usage to 25MB, then runs a program that
> > > tries to dirty 50MB of memory.  There's 2GB of memory in the VM, so
> > > we're not running reclaim globally, but the cgroup gets throttled very
> > > severely.
> > > 
> > > AFAICT the system is mostly idle, but it's difficult to tell because ps
> > > and top also get stuck waiting for this cgroup for whatever reason.  My
> > > uninformed speculation is that usemem_and_swapoff takes a page fault
> > > while dirtying the 50MB memory buffer, prepares to pull a page in from
> > > swap, tries to evict another page to stay under the memcg limit, but
> > > that decides that it's making no progress and calls
> > > reclaim_throttle(..., VMSCAN_THROTTLE_NOPROGRESS).
> > > 
> > > The sleep is uninterruptible, so I can't even kill -9 fstests to shut it
> > > down.  Eventually we either finish the test or (for the mlock part) the
> > > OOM killer actually kills the process, but this takes a very long time.
> > > 
> > > Any thoughts?  For now I can just hack around this by skipping
> > > reclaim_throttle if cgroup_reclaim() == true, but that's probably not
> > > the correct fix. :)
> > 
> > Update: after adding timing information to usemem_and_swapoff, it looks
> > like dirtying the 50MB buffer takes ~22s (up from 0.06s on 5.15).  The
> > mlock call stalls for ~280s until the OOM killer kills it (up from
> > nearly instantaneous on 5.15), and the swapon/swapoff variant takes
> > 20 minutes to hours depending on the run.
> > 
> 
> Can you try the patch below please? I think I'm running the test
> correctly and it finishes for me in 16 seconds with this applied

20 seconds here, but this /does/ fix the problem.  Thank you!

Tested-by: Darrick J. Wong <djwong@kernel.org>

--D

> 
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 07db03883062..d9166e94eb95 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -1057,7 +1057,17 @@ void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason)
>  
>  		break;
>  	case VMSCAN_THROTTLE_NOPROGRESS:
> -		timeout = HZ/2;
> +		timeout = 1;
> +
> +		/*
> +		 * If kswapd is disabled, reschedule if necessary but do not
> +		 * throttle as the system is likely near OOM.
> +		 */
> +		if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) {
> +			cond_resched();
> +			return;
> +		}
> +
>  		break;
>  	case VMSCAN_THROTTLE_ISOLATED:
>  		timeout = HZ/50;
> @@ -3395,7 +3405,7 @@ static void consider_reclaim_throttle(pg_data_t *pgdat, struct scan_control *sc)
>  		return;
>  
> >  	/* Throttle if making no progress at high priorities. */
> -	if (sc->priority < DEF_PRIORITY - 2)
> +	if (sc->priority < DEF_PRIORITY - 2 && !sc->nr_reclaimed)
>  		reclaim_throttle(pgdat, VMSCAN_THROTTLE_NOPROGRESS);
>  }
>  
> @@ -3415,6 +3425,7 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
>  	unsigned long nr_soft_scanned;
>  	gfp_t orig_mask;
>  	pg_data_t *last_pgdat = NULL;
> +	pg_data_t *first_pgdat = NULL;
>  
>  	/*
>  	 * If the number of buffer_heads in the machine exceeds the maximum
> @@ -3478,14 +3489,18 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
>  			/* need some check for avoid more shrink_zone() */
>  		}
>  
> +		if (!first_pgdat)
> +			first_pgdat = zone->zone_pgdat;
> +
>  		/* See comment about same check for global reclaim above */
>  		if (zone->zone_pgdat == last_pgdat)
>  			continue;
>  		last_pgdat = zone->zone_pgdat;
>  		shrink_node(zone->zone_pgdat, sc);
> -		consider_reclaim_throttle(zone->zone_pgdat, sc);
>  	}
>  
> +	consider_reclaim_throttle(first_pgdat, sc);
> +
>  	/*
>  	 * Restore to original mask to avoid the impact on the caller if we
>  	 * promoted it to __GFP_HIGHMEM.
diff mbox series

Patch

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 9ccd8d95291b..00e305cfb3ec 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -276,6 +276,7 @@  enum lru_list {
 enum vmscan_throttle_state {
 	VMSCAN_THROTTLE_WRITEBACK,
 	VMSCAN_THROTTLE_ISOLATED,
+	VMSCAN_THROTTLE_NOPROGRESS,
 	NR_VMSCAN_THROTTLE,
 };
 
diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
index d4905bd9e9c4..f25a6149d3ba 100644
--- a/include/trace/events/vmscan.h
+++ b/include/trace/events/vmscan.h
@@ -29,11 +29,13 @@ 
 
 #define _VMSCAN_THROTTLE_WRITEBACK	(1 << VMSCAN_THROTTLE_WRITEBACK)
 #define _VMSCAN_THROTTLE_ISOLATED	(1 << VMSCAN_THROTTLE_ISOLATED)
+#define _VMSCAN_THROTTLE_NOPROGRESS	(1 << VMSCAN_THROTTLE_NOPROGRESS)
 
 #define show_throttle_flags(flags)						\
 	(flags) ? __print_flags(flags, "|",					\
 		{_VMSCAN_THROTTLE_WRITEBACK,	"VMSCAN_THROTTLE_WRITEBACK"},	\
-		{_VMSCAN_THROTTLE_ISOLATED,	"VMSCAN_THROTTLE_ISOLATED"}	\
+		{_VMSCAN_THROTTLE_ISOLATED,	"VMSCAN_THROTTLE_ISOLATED"},	\
+		{_VMSCAN_THROTTLE_NOPROGRESS,	"VMSCAN_THROTTLE_NOPROGRESS"}	\
 		) : "VMSCAN_THROTTLE_NONE"
 
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 6da5020a8656..8b33152c9b85 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3465,19 +3465,11 @@  static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
 
 	/* try to free all pages in this cgroup */
 	while (nr_retries && page_counter_read(&memcg->memory)) {
-		int progress;
-
 		if (signal_pending(current))
 			return -EINTR;
 
-		progress = try_to_free_mem_cgroup_pages(memcg, 1,
-							GFP_KERNEL, true);
-		if (!progress) {
+		if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true))
 			nr_retries--;
-			/* maybe some writeback is necessary */
-			congestion_wait(BLK_RW_ASYNC, HZ/10);
-		}
-
 	}
 
 	return 0;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 1e54e636b927..0450f6867d61 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3323,6 +3323,33 @@  static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
 	return zone_watermark_ok_safe(zone, 0, watermark, sc->reclaim_idx);
 }
 
+static void consider_reclaim_throttle(pg_data_t *pgdat, struct scan_control *sc)
+{
+	/* If reclaim is making progress, wake any throttled tasks. */
+	if (sc->nr_reclaimed) {
+		wait_queue_head_t *wqh;
+
+		wqh = &pgdat->reclaim_wait[VMSCAN_THROTTLE_NOPROGRESS];
+		if (waitqueue_active(wqh))
+			wake_up(wqh);
+
+		return;
+	}
+
+	/*
+	 * Do not throttle kswapd on NOPROGRESS as it will throttle on
+	 * VMSCAN_THROTTLE_WRITEBACK if there are too many pages under
+	 * writeback and marked for immediate reclaim at the tail of
+	 * the LRU.
+	 */
+	if (current_is_kswapd())
+		return;
+
+	/* Throttle if making no progress at high prioities. */
+	if (sc->priority < DEF_PRIORITY - 2)
+		reclaim_throttle(pgdat, VMSCAN_THROTTLE_NOPROGRESS, HZ/10);
+}
+
 /*
  * This is the direct reclaim path, for page-allocating processes.  We only
  * try to reclaim pages from zones which will satisfy the caller's allocation
@@ -3407,6 +3434,7 @@  static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
 			continue;
 		last_pgdat = zone->zone_pgdat;
 		shrink_node(zone->zone_pgdat, sc);
+		consider_reclaim_throttle(zone->zone_pgdat, sc);
 	}
 
 	/*