diff mbox series

[5/8] mm: vmscan: replace shrink_node() loop with a retry jump

Message ID 20191022144803.302233-6-hannes@cmpxchg.org (mailing list archive)
State New, archived
Headers show
Series : mm: vmscan: cgroup-related cleanups | expand

Commit Message

Johannes Weiner Oct. 22, 2019, 2:48 p.m. UTC
Most of the function body is inside a loop, which imposes an
additional indentation and scoping level that makes the code a bit
hard to follow and modify.

The looping only happens in case of reclaim-compaction, which isn't
the common case. So rather than adding yet another function level to
the reclaim path and have every reclaim invocation go through a level
that only exists for one specific cornercase, use a retry goto.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
---
 mm/vmscan.c | 231 ++++++++++++++++++++++++++--------------------------
 1 file changed, 115 insertions(+), 116 deletions(-)

Comments

Roman Gushchin Oct. 22, 2019, 7:56 p.m. UTC | #1
On Tue, Oct 22, 2019 at 10:48:00AM -0400, Johannes Weiner wrote:
> Most of the function body is inside a loop, which imposes an
> additional indentation and scoping level that makes the code a bit
> hard to follow and modify.
> 
> The looping only happens in case of reclaim-compaction, which isn't
> the common case. So rather than adding yet another function level to
> the reclaim path and have every reclaim invocation go through a level
> that only exists for one specific cornercase, use a retry goto.
> 
> Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
> ---
>  mm/vmscan.c | 231 ++++++++++++++++++++++++++--------------------------
>  1 file changed, 115 insertions(+), 116 deletions(-)
> 
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 302dad112f75..235d1fc72311 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -2729,144 +2729,143 @@ static bool pgdat_memcg_congested(pg_data_t *pgdat, struct mem_cgroup *memcg)
>  static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
>  {
>  	struct reclaim_state *reclaim_state = current->reclaim_state;
> +	struct mem_cgroup *root = sc->target_mem_cgroup;
>  	unsigned long nr_reclaimed, nr_scanned;
>  	bool reclaimable = false;
> +	struct mem_cgroup *memcg;
> +again:
> +	memset(&sc->nr, 0, sizeof(sc->nr));
>  
> -	do {
> -		struct mem_cgroup *root = sc->target_mem_cgroup;
> -		struct mem_cgroup *memcg;
> -
> -		memset(&sc->nr, 0, sizeof(sc->nr));
> -
> -		nr_reclaimed = sc->nr_reclaimed;
> -		nr_scanned = sc->nr_scanned;
> +	nr_reclaimed = sc->nr_reclaimed;
> +	nr_scanned = sc->nr_scanned;
>  
> -		memcg = mem_cgroup_iter(root, NULL, NULL);
> -		do {
> -			unsigned long reclaimed;
> -			unsigned long scanned;
> +	memcg = mem_cgroup_iter(root, NULL, NULL);
> +	do {
> +		unsigned long reclaimed;
> +		unsigned long scanned;
>  
> -			switch (mem_cgroup_protected(root, memcg)) {
> -			case MEMCG_PROT_MIN:
> -				/*
> -				 * Hard protection.
> -				 * If there is no reclaimable memory, OOM.
> -				 */
> +		switch (mem_cgroup_protected(root, memcg)) {
> +		case MEMCG_PROT_MIN:
> +			/*
> +			 * Hard protection.
> +			 * If there is no reclaimable memory, OOM.
> +			 */
> +			continue;
> +		case MEMCG_PROT_LOW:
> +			/*
> +			 * Soft protection.
> +			 * Respect the protection only as long as
> +			 * there is an unprotected supply
> +			 * of reclaimable memory from other cgroups.
> +			 */
> +			if (!sc->memcg_low_reclaim) {
> +				sc->memcg_low_skipped = 1;
>  				continue;
> -			case MEMCG_PROT_LOW:
> -				/*
> -				 * Soft protection.
> -				 * Respect the protection only as long as
> -				 * there is an unprotected supply
> -				 * of reclaimable memory from other cgroups.
> -				 */
> -				if (!sc->memcg_low_reclaim) {
> -					sc->memcg_low_skipped = 1;
> -					continue;
> -				}
> -				memcg_memory_event(memcg, MEMCG_LOW);
> -				break;
> -			case MEMCG_PROT_NONE:
> -				/*
> -				 * All protection thresholds breached. We may
> -				 * still choose to vary the scan pressure
> -				 * applied based on by how much the cgroup in
> -				 * question has exceeded its protection
> -				 * thresholds (see get_scan_count).
> -				 */
> -				break;
>  			}
> +			memcg_memory_event(memcg, MEMCG_LOW);
> +			break;
> +		case MEMCG_PROT_NONE:
> +			/*
> +			 * All protection thresholds breached. We may
> +			 * still choose to vary the scan pressure
> +			 * applied based on by how much the cgroup in
> +			 * question has exceeded its protection
> +			 * thresholds (see get_scan_count).
> +			 */
> +			break;
> +		}
>  
> -			reclaimed = sc->nr_reclaimed;
> -			scanned = sc->nr_scanned;
> -			shrink_node_memcg(pgdat, memcg, sc);
> -
> -			shrink_slab(sc->gfp_mask, pgdat->node_id, memcg,
> -					sc->priority);
> -
> -			/* Record the group's reclaim efficiency */
> -			vmpressure(sc->gfp_mask, memcg, false,
> -				   sc->nr_scanned - scanned,
> -				   sc->nr_reclaimed - reclaimed);
> -
> -		} while ((memcg = mem_cgroup_iter(root, memcg, NULL)));
> +		reclaimed = sc->nr_reclaimed;
> +		scanned = sc->nr_scanned;
> +		shrink_node_memcg(pgdat, memcg, sc);
>  
> -		if (reclaim_state) {
> -			sc->nr_reclaimed += reclaim_state->reclaimed_slab;
> -			reclaim_state->reclaimed_slab = 0;
> -		}
> +		shrink_slab(sc->gfp_mask, pgdat->node_id, memcg,
> +			    sc->priority);
>  
> -		/* Record the subtree's reclaim efficiency */
> -		vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
> -			   sc->nr_scanned - nr_scanned,
> -			   sc->nr_reclaimed - nr_reclaimed);
> +		/* Record the group's reclaim efficiency */
> +		vmpressure(sc->gfp_mask, memcg, false,
> +			   sc->nr_scanned - scanned,
> +			   sc->nr_reclaimed - reclaimed);

It doesn't look as a trivial change. I'd add some comments to the commit message
why it's safe to do.

Thanks!

>  
> -		if (sc->nr_reclaimed - nr_reclaimed)
> -			reclaimable = true;
> +	} while ((memcg = mem_cgroup_iter(root, memcg, NULL)));
>  
> -		if (current_is_kswapd()) {
> -			/*
> -			 * If reclaim is isolating dirty pages under writeback,
> -			 * it implies that the long-lived page allocation rate
> -			 * is exceeding the page laundering rate. Either the
> -			 * global limits are not being effective at throttling
> -			 * processes due to the page distribution throughout
> -			 * zones or there is heavy usage of a slow backing
> -			 * device. The only option is to throttle from reclaim
> -			 * context which is not ideal as there is no guarantee
> -			 * the dirtying process is throttled in the same way
> -			 * balance_dirty_pages() manages.
> -			 *
> -			 * Once a node is flagged PGDAT_WRITEBACK, kswapd will
> -			 * count the number of pages under pages flagged for
> -			 * immediate reclaim and stall if any are encountered
> -			 * in the nr_immediate check below.
> -			 */
> -			if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken)
> -				set_bit(PGDAT_WRITEBACK, &pgdat->flags);
> +	if (reclaim_state) {
> +		sc->nr_reclaimed += reclaim_state->reclaimed_slab;
> +		reclaim_state->reclaimed_slab = 0;
> +	}
>  
> -			/*
> -			 * Tag a node as congested if all the dirty pages
> -			 * scanned were backed by a congested BDI and
> -			 * wait_iff_congested will stall.
> -			 */
> -			if (sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
> -				set_bit(PGDAT_CONGESTED, &pgdat->flags);
> +	/* Record the subtree's reclaim efficiency */
> +	vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
> +		   sc->nr_scanned - nr_scanned,
> +		   sc->nr_reclaimed - nr_reclaimed);
>  
> -			/* Allow kswapd to start writing pages during reclaim.*/
> -			if (sc->nr.unqueued_dirty == sc->nr.file_taken)
> -				set_bit(PGDAT_DIRTY, &pgdat->flags);
> +	if (sc->nr_reclaimed - nr_reclaimed)
> +		reclaimable = true;
>  
> -			/*
> -			 * If kswapd scans pages marked marked for immediate
> -			 * reclaim and under writeback (nr_immediate), it
> -			 * implies that pages are cycling through the LRU
> -			 * faster than they are written so also forcibly stall.
> -			 */
> -			if (sc->nr.immediate)
> -				congestion_wait(BLK_RW_ASYNC, HZ/10);
> -		}

Don't you want to separate the block below into a separate function?
It can probably make the big loop shorter and easier to follow.

Thanks!

> +	if (current_is_kswapd()) {
> +		/*
> +		 * If reclaim is isolating dirty pages under writeback,
> +		 * it implies that the long-lived page allocation rate
> +		 * is exceeding the page laundering rate. Either the
> +		 * global limits are not being effective at throttling
> +		 * processes due to the page distribution throughout
> +		 * zones or there is heavy usage of a slow backing
> +		 * device. The only option is to throttle from reclaim
> +		 * context which is not ideal as there is no guarantee
> +		 * the dirtying process is throttled in the same way
> +		 * balance_dirty_pages() manages.
> +		 *
> +		 * Once a node is flagged PGDAT_WRITEBACK, kswapd will
> +		 * count the number of pages under pages flagged for
> +		 * immediate reclaim and stall if any are encountered
> +		 * in the nr_immediate check below.
> +		 */
> +		if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken)
> +			set_bit(PGDAT_WRITEBACK, &pgdat->flags);
>  
>  		/*
> -		 * Legacy memcg will stall in page writeback so avoid forcibly
> -		 * stalling in wait_iff_congested().
> +		 * Tag a node as congested if all the dirty pages
> +		 * scanned were backed by a congested BDI and
> +		 * wait_iff_congested will stall.
>  		 */
> -		if (cgroup_reclaim(sc) && writeback_throttling_sane(sc) &&
> -		    sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
> -			set_memcg_congestion(pgdat, root, true);
> +		if (sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
> +			set_bit(PGDAT_CONGESTED, &pgdat->flags);
> +
> +		/* Allow kswapd to start writing pages during reclaim.*/
> +		if (sc->nr.unqueued_dirty == sc->nr.file_taken)
> +			set_bit(PGDAT_DIRTY, &pgdat->flags);
>  
>  		/*
> -		 * Stall direct reclaim for IO completions if underlying BDIs
> -		 * and node is congested. Allow kswapd to continue until it
> -		 * starts encountering unqueued dirty pages or cycling through
> -		 * the LRU too quickly.
> +		 * If kswapd scans pages marked marked for immediate
> +		 * reclaim and under writeback (nr_immediate), it
> +		 * implies that pages are cycling through the LRU
> +		 * faster than they are written so also forcibly stall.
>  		 */
> -		if (!sc->hibernation_mode && !current_is_kswapd() &&
> -		   current_may_throttle() && pgdat_memcg_congested(pgdat, root))
> -			wait_iff_congested(BLK_RW_ASYNC, HZ/10);
> +		if (sc->nr.immediate)
> +			congestion_wait(BLK_RW_ASYNC, HZ/10);
> +	}
> +
> +	/*
> +	 * Legacy memcg will stall in page writeback so avoid forcibly
> +	 * stalling in wait_iff_congested().
> +	 */
> +	if (cgroup_reclaim(sc) && writeback_throttling_sane(sc) &&
> +	    sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
> +		set_memcg_congestion(pgdat, root, true);
> +
> +	/*
> +	 * Stall direct reclaim for IO completions if underlying BDIs
> +	 * and node is congested. Allow kswapd to continue until it
> +	 * starts encountering unqueued dirty pages or cycling through
> +	 * the LRU too quickly.
> +	 */
> +	if (!sc->hibernation_mode && !current_is_kswapd() &&
> +	    current_may_throttle() && pgdat_memcg_congested(pgdat, root))
> +		wait_iff_congested(BLK_RW_ASYNC, HZ/10);
>  
> -	} while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
> -					 sc));
> +	if (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
> +				    sc))
> +		goto again;
>  
>  	/*
>  	 * Kswapd gives up on balancing particular nodes after too
> -- 
> 2.23.0
> 
>
Johannes Weiner Oct. 22, 2019, 9:42 p.m. UTC | #2
On Tue, Oct 22, 2019 at 07:56:33PM +0000, Roman Gushchin wrote:
> On Tue, Oct 22, 2019 at 10:48:00AM -0400, Johannes Weiner wrote:
> > -			/* Record the group's reclaim efficiency */
> > -			vmpressure(sc->gfp_mask, memcg, false,
> > -				   sc->nr_scanned - scanned,
> > -				   sc->nr_reclaimed - reclaimed);
> > -
> > -		} while ((memcg = mem_cgroup_iter(root, memcg, NULL)));
> > +		reclaimed = sc->nr_reclaimed;
> > +		scanned = sc->nr_scanned;
> > +		shrink_node_memcg(pgdat, memcg, sc);
> >  
> > -		if (reclaim_state) {
> > -			sc->nr_reclaimed += reclaim_state->reclaimed_slab;
> > -			reclaim_state->reclaimed_slab = 0;
> > -		}
> > +		shrink_slab(sc->gfp_mask, pgdat->node_id, memcg,
> > +			    sc->priority);
> >  
> > -		/* Record the subtree's reclaim efficiency */
> > -		vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
> > -			   sc->nr_scanned - nr_scanned,
> > -			   sc->nr_reclaimed - nr_reclaimed);
> > +		/* Record the group's reclaim efficiency */
> > +		vmpressure(sc->gfp_mask, memcg, false,
> > +			   sc->nr_scanned - scanned,
> > +			   sc->nr_reclaimed - reclaimed);
> 
> It doesn't look as a trivial change. I'd add some comments to the commit message
> why it's safe to do.

It's an equivalent change - it's just really misleading because the
+++ lines are not the counter-part of the --- lines here!

There are two vmpressure calls in this function: one against the
individual cgroups, and one against the tree. The diff puts them
adjacent here, but the counter-part for the --- lines is here:

> > +	/* Record the subtree's reclaim efficiency */
> > +	vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
> > +		   sc->nr_scanned - nr_scanned,
> > +		   sc->nr_reclaimed - nr_reclaimed);

And the counter-part to the +++ lines is further up (beginning of the
quoted diff).
Roman Gushchin Oct. 22, 2019, 10:46 p.m. UTC | #3
On Tue, Oct 22, 2019 at 05:42:49PM -0400, Johannes Weiner wrote:
> On Tue, Oct 22, 2019 at 07:56:33PM +0000, Roman Gushchin wrote:
> > On Tue, Oct 22, 2019 at 10:48:00AM -0400, Johannes Weiner wrote:
> > > -			/* Record the group's reclaim efficiency */
> > > -			vmpressure(sc->gfp_mask, memcg, false,
> > > -				   sc->nr_scanned - scanned,
> > > -				   sc->nr_reclaimed - reclaimed);
> > > -
> > > -		} while ((memcg = mem_cgroup_iter(root, memcg, NULL)));
> > > +		reclaimed = sc->nr_reclaimed;
> > > +		scanned = sc->nr_scanned;
> > > +		shrink_node_memcg(pgdat, memcg, sc);
> > >  
> > > -		if (reclaim_state) {
> > > -			sc->nr_reclaimed += reclaim_state->reclaimed_slab;
> > > -			reclaim_state->reclaimed_slab = 0;
> > > -		}
> > > +		shrink_slab(sc->gfp_mask, pgdat->node_id, memcg,
> > > +			    sc->priority);
> > >  
> > > -		/* Record the subtree's reclaim efficiency */
> > > -		vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
> > > -			   sc->nr_scanned - nr_scanned,
> > > -			   sc->nr_reclaimed - nr_reclaimed);
> > > +		/* Record the group's reclaim efficiency */
> > > +		vmpressure(sc->gfp_mask, memcg, false,
> > > +			   sc->nr_scanned - scanned,
> > > +			   sc->nr_reclaimed - reclaimed);
> > 
> > It doesn't look as a trivial change. I'd add some comments to the commit message
> > why it's safe to do.
> 
> It's an equivalent change - it's just really misleading because the
> +++ lines are not the counter-part of the --- lines here!
> 
> There are two vmpressure calls in this function: one against the
> individual cgroups, and one against the tree. The diff puts them
> adjacent here, but the counter-part for the --- lines is here:
> 
> > > +	/* Record the subtree's reclaim efficiency */
> > > +	vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
> > > +		   sc->nr_scanned - nr_scanned,
> > > +		   sc->nr_reclaimed - nr_reclaimed);
> 
> And the counter-part to the +++ lines is further up (beginning of the
> quoted diff).
> 

Ah, ok, got it. You were right in the foreword, indentation change
diffs are hard to read.

Thanks for the explanation!

Reviewed-by: Roman Gushchin <guro@fb.com>
Michal Hocko Oct. 23, 2019, 2:18 p.m. UTC | #4
On Tue 22-10-19 10:48:00, Johannes Weiner wrote:
> Most of the function body is inside a loop, which imposes an
> additional indentation and scoping level that makes the code a bit
> hard to follow and modify.

I do agree!

> The looping only happens in case of reclaim-compaction, which isn't
> the common case. So rather than adding yet another function level to
> the reclaim path and have every reclaim invocation go through a level
> that only exists for one specific cornercase, use a retry goto.

I would just keep the core logic in its own function and do the loop
around it rather than a goto retry. This is certainly a matter of taste
but I like a loop with an explicit condition much more than a if with
goto.

> Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
> ---
>  mm/vmscan.c | 231 ++++++++++++++++++++++++++--------------------------
>  1 file changed, 115 insertions(+), 116 deletions(-)
> 
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 302dad112f75..235d1fc72311 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -2729,144 +2729,143 @@ static bool pgdat_memcg_congested(pg_data_t *pgdat, struct mem_cgroup *memcg)
>  static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
>  {
>  	struct reclaim_state *reclaim_state = current->reclaim_state;
> +	struct mem_cgroup *root = sc->target_mem_cgroup;
>  	unsigned long nr_reclaimed, nr_scanned;
>  	bool reclaimable = false;
> +	struct mem_cgroup *memcg;
> +again:
> +	memset(&sc->nr, 0, sizeof(sc->nr));
>  
> -	do {
> -		struct mem_cgroup *root = sc->target_mem_cgroup;
> -		struct mem_cgroup *memcg;
> -
> -		memset(&sc->nr, 0, sizeof(sc->nr));
> -
> -		nr_reclaimed = sc->nr_reclaimed;
> -		nr_scanned = sc->nr_scanned;
> +	nr_reclaimed = sc->nr_reclaimed;
> +	nr_scanned = sc->nr_scanned;
>  
> -		memcg = mem_cgroup_iter(root, NULL, NULL);
> -		do {
> -			unsigned long reclaimed;
> -			unsigned long scanned;
> +	memcg = mem_cgroup_iter(root, NULL, NULL);
> +	do {
> +		unsigned long reclaimed;
> +		unsigned long scanned;
>  
> -			switch (mem_cgroup_protected(root, memcg)) {
> -			case MEMCG_PROT_MIN:
> -				/*
> -				 * Hard protection.
> -				 * If there is no reclaimable memory, OOM.
> -				 */
> +		switch (mem_cgroup_protected(root, memcg)) {
> +		case MEMCG_PROT_MIN:
> +			/*
> +			 * Hard protection.
> +			 * If there is no reclaimable memory, OOM.
> +			 */
> +			continue;
> +		case MEMCG_PROT_LOW:
> +			/*
> +			 * Soft protection.
> +			 * Respect the protection only as long as
> +			 * there is an unprotected supply
> +			 * of reclaimable memory from other cgroups.
> +			 */
> +			if (!sc->memcg_low_reclaim) {
> +				sc->memcg_low_skipped = 1;
>  				continue;
> -			case MEMCG_PROT_LOW:
> -				/*
> -				 * Soft protection.
> -				 * Respect the protection only as long as
> -				 * there is an unprotected supply
> -				 * of reclaimable memory from other cgroups.
> -				 */
> -				if (!sc->memcg_low_reclaim) {
> -					sc->memcg_low_skipped = 1;
> -					continue;
> -				}
> -				memcg_memory_event(memcg, MEMCG_LOW);
> -				break;
> -			case MEMCG_PROT_NONE:
> -				/*
> -				 * All protection thresholds breached. We may
> -				 * still choose to vary the scan pressure
> -				 * applied based on by how much the cgroup in
> -				 * question has exceeded its protection
> -				 * thresholds (see get_scan_count).
> -				 */
> -				break;
>  			}
> +			memcg_memory_event(memcg, MEMCG_LOW);
> +			break;
> +		case MEMCG_PROT_NONE:
> +			/*
> +			 * All protection thresholds breached. We may
> +			 * still choose to vary the scan pressure
> +			 * applied based on by how much the cgroup in
> +			 * question has exceeded its protection
> +			 * thresholds (see get_scan_count).
> +			 */
> +			break;
> +		}
>  
> -			reclaimed = sc->nr_reclaimed;
> -			scanned = sc->nr_scanned;
> -			shrink_node_memcg(pgdat, memcg, sc);
> -
> -			shrink_slab(sc->gfp_mask, pgdat->node_id, memcg,
> -					sc->priority);
> -
> -			/* Record the group's reclaim efficiency */
> -			vmpressure(sc->gfp_mask, memcg, false,
> -				   sc->nr_scanned - scanned,
> -				   sc->nr_reclaimed - reclaimed);
> -
> -		} while ((memcg = mem_cgroup_iter(root, memcg, NULL)));
> +		reclaimed = sc->nr_reclaimed;
> +		scanned = sc->nr_scanned;
> +		shrink_node_memcg(pgdat, memcg, sc);
>  
> -		if (reclaim_state) {
> -			sc->nr_reclaimed += reclaim_state->reclaimed_slab;
> -			reclaim_state->reclaimed_slab = 0;
> -		}
> +		shrink_slab(sc->gfp_mask, pgdat->node_id, memcg,
> +			    sc->priority);
>  
> -		/* Record the subtree's reclaim efficiency */
> -		vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
> -			   sc->nr_scanned - nr_scanned,
> -			   sc->nr_reclaimed - nr_reclaimed);
> +		/* Record the group's reclaim efficiency */
> +		vmpressure(sc->gfp_mask, memcg, false,
> +			   sc->nr_scanned - scanned,
> +			   sc->nr_reclaimed - reclaimed);
>  
> -		if (sc->nr_reclaimed - nr_reclaimed)
> -			reclaimable = true;
> +	} while ((memcg = mem_cgroup_iter(root, memcg, NULL)));
>  
> -		if (current_is_kswapd()) {
> -			/*
> -			 * If reclaim is isolating dirty pages under writeback,
> -			 * it implies that the long-lived page allocation rate
> -			 * is exceeding the page laundering rate. Either the
> -			 * global limits are not being effective at throttling
> -			 * processes due to the page distribution throughout
> -			 * zones or there is heavy usage of a slow backing
> -			 * device. The only option is to throttle from reclaim
> -			 * context which is not ideal as there is no guarantee
> -			 * the dirtying process is throttled in the same way
> -			 * balance_dirty_pages() manages.
> -			 *
> -			 * Once a node is flagged PGDAT_WRITEBACK, kswapd will
> -			 * count the number of pages under pages flagged for
> -			 * immediate reclaim and stall if any are encountered
> -			 * in the nr_immediate check below.
> -			 */
> -			if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken)
> -				set_bit(PGDAT_WRITEBACK, &pgdat->flags);
> +	if (reclaim_state) {
> +		sc->nr_reclaimed += reclaim_state->reclaimed_slab;
> +		reclaim_state->reclaimed_slab = 0;
> +	}
>  
> -			/*
> -			 * Tag a node as congested if all the dirty pages
> -			 * scanned were backed by a congested BDI and
> -			 * wait_iff_congested will stall.
> -			 */
> -			if (sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
> -				set_bit(PGDAT_CONGESTED, &pgdat->flags);
> +	/* Record the subtree's reclaim efficiency */
> +	vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
> +		   sc->nr_scanned - nr_scanned,
> +		   sc->nr_reclaimed - nr_reclaimed);
>  
> -			/* Allow kswapd to start writing pages during reclaim.*/
> -			if (sc->nr.unqueued_dirty == sc->nr.file_taken)
> -				set_bit(PGDAT_DIRTY, &pgdat->flags);
> +	if (sc->nr_reclaimed - nr_reclaimed)
> +		reclaimable = true;
>  
> -			/*
> -			 * If kswapd scans pages marked marked for immediate
> -			 * reclaim and under writeback (nr_immediate), it
> -			 * implies that pages are cycling through the LRU
> -			 * faster than they are written so also forcibly stall.
> -			 */
> -			if (sc->nr.immediate)
> -				congestion_wait(BLK_RW_ASYNC, HZ/10);
> -		}
> +	if (current_is_kswapd()) {
> +		/*
> +		 * If reclaim is isolating dirty pages under writeback,
> +		 * it implies that the long-lived page allocation rate
> +		 * is exceeding the page laundering rate. Either the
> +		 * global limits are not being effective at throttling
> +		 * processes due to the page distribution throughout
> +		 * zones or there is heavy usage of a slow backing
> +		 * device. The only option is to throttle from reclaim
> +		 * context which is not ideal as there is no guarantee
> +		 * the dirtying process is throttled in the same way
> +		 * balance_dirty_pages() manages.
> +		 *
> +		 * Once a node is flagged PGDAT_WRITEBACK, kswapd will
> +		 * count the number of pages under pages flagged for
> +		 * immediate reclaim and stall if any are encountered
> +		 * in the nr_immediate check below.
> +		 */
> +		if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken)
> +			set_bit(PGDAT_WRITEBACK, &pgdat->flags);
>  
>  		/*
> -		 * Legacy memcg will stall in page writeback so avoid forcibly
> -		 * stalling in wait_iff_congested().
> +		 * Tag a node as congested if all the dirty pages
> +		 * scanned were backed by a congested BDI and
> +		 * wait_iff_congested will stall.
>  		 */
> -		if (cgroup_reclaim(sc) && writeback_throttling_sane(sc) &&
> -		    sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
> -			set_memcg_congestion(pgdat, root, true);
> +		if (sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
> +			set_bit(PGDAT_CONGESTED, &pgdat->flags);
> +
> +		/* Allow kswapd to start writing pages during reclaim.*/
> +		if (sc->nr.unqueued_dirty == sc->nr.file_taken)
> +			set_bit(PGDAT_DIRTY, &pgdat->flags);
>  
>  		/*
> -		 * Stall direct reclaim for IO completions if underlying BDIs
> -		 * and node is congested. Allow kswapd to continue until it
> -		 * starts encountering unqueued dirty pages or cycling through
> -		 * the LRU too quickly.
> +		 * If kswapd scans pages marked marked for immediate
> +		 * reclaim and under writeback (nr_immediate), it
> +		 * implies that pages are cycling through the LRU
> +		 * faster than they are written so also forcibly stall.
>  		 */
> -		if (!sc->hibernation_mode && !current_is_kswapd() &&
> -		   current_may_throttle() && pgdat_memcg_congested(pgdat, root))
> -			wait_iff_congested(BLK_RW_ASYNC, HZ/10);
> +		if (sc->nr.immediate)
> +			congestion_wait(BLK_RW_ASYNC, HZ/10);
> +	}
> +
> +	/*
> +	 * Legacy memcg will stall in page writeback so avoid forcibly
> +	 * stalling in wait_iff_congested().
> +	 */
> +	if (cgroup_reclaim(sc) && writeback_throttling_sane(sc) &&
> +	    sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
> +		set_memcg_congestion(pgdat, root, true);
> +
> +	/*
> +	 * Stall direct reclaim for IO completions if underlying BDIs
> +	 * and node is congested. Allow kswapd to continue until it
> +	 * starts encountering unqueued dirty pages or cycling through
> +	 * the LRU too quickly.
> +	 */
> +	if (!sc->hibernation_mode && !current_is_kswapd() &&
> +	    current_may_throttle() && pgdat_memcg_congested(pgdat, root))
> +		wait_iff_congested(BLK_RW_ASYNC, HZ/10);
>  
> -	} while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
> -					 sc));
> +	if (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
> +				    sc))
> +		goto again;
>  
>  	/*
>  	 * Kswapd gives up on balancing particular nodes after too
> -- 
> 2.23.0
Johannes Weiner Oct. 25, 2019, 1:44 p.m. UTC | #5
On Wed, Oct 23, 2019 at 04:18:57PM +0200, Michal Hocko wrote:
> On Tue 22-10-19 10:48:00, Johannes Weiner wrote:
> > Most of the function body is inside a loop, which imposes an
> > additional indentation and scoping level that makes the code a bit
> > hard to follow and modify.
> 
> I do agree!
> 
> > The looping only happens in case of reclaim-compaction, which isn't
> > the common case. So rather than adding yet another function level to
> > the reclaim path and have every reclaim invocation go through a level
> > that only exists for one specific cornercase, use a retry goto.
> 
> I would just keep the core logic in its own function and do the loop
> around it rather than a goto retry. This is certainly a matter of taste
> but I like a loop with an explicit condition much more than a if with
> goto.

Yeah, as the changelog says, I'm intentionally putting the looping
construct into the "cold path" of the code flow: we only loops in a
very specific cornercase, and having the whole body in a loop, or
creating another function nesting level for it suggests otherwise.

A goto seems like the perfect tool to have a retry for one particular
caller without muddying the code flow for the common call stack.

Matter of taste, I guess.
diff mbox series

Patch

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 302dad112f75..235d1fc72311 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2729,144 +2729,143 @@  static bool pgdat_memcg_congested(pg_data_t *pgdat, struct mem_cgroup *memcg)
 static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
 {
 	struct reclaim_state *reclaim_state = current->reclaim_state;
+	struct mem_cgroup *root = sc->target_mem_cgroup;
 	unsigned long nr_reclaimed, nr_scanned;
 	bool reclaimable = false;
+	struct mem_cgroup *memcg;
+again:
+	memset(&sc->nr, 0, sizeof(sc->nr));
 
-	do {
-		struct mem_cgroup *root = sc->target_mem_cgroup;
-		struct mem_cgroup *memcg;
-
-		memset(&sc->nr, 0, sizeof(sc->nr));
-
-		nr_reclaimed = sc->nr_reclaimed;
-		nr_scanned = sc->nr_scanned;
+	nr_reclaimed = sc->nr_reclaimed;
+	nr_scanned = sc->nr_scanned;
 
-		memcg = mem_cgroup_iter(root, NULL, NULL);
-		do {
-			unsigned long reclaimed;
-			unsigned long scanned;
+	memcg = mem_cgroup_iter(root, NULL, NULL);
+	do {
+		unsigned long reclaimed;
+		unsigned long scanned;
 
-			switch (mem_cgroup_protected(root, memcg)) {
-			case MEMCG_PROT_MIN:
-				/*
-				 * Hard protection.
-				 * If there is no reclaimable memory, OOM.
-				 */
+		switch (mem_cgroup_protected(root, memcg)) {
+		case MEMCG_PROT_MIN:
+			/*
+			 * Hard protection.
+			 * If there is no reclaimable memory, OOM.
+			 */
+			continue;
+		case MEMCG_PROT_LOW:
+			/*
+			 * Soft protection.
+			 * Respect the protection only as long as
+			 * there is an unprotected supply
+			 * of reclaimable memory from other cgroups.
+			 */
+			if (!sc->memcg_low_reclaim) {
+				sc->memcg_low_skipped = 1;
 				continue;
-			case MEMCG_PROT_LOW:
-				/*
-				 * Soft protection.
-				 * Respect the protection only as long as
-				 * there is an unprotected supply
-				 * of reclaimable memory from other cgroups.
-				 */
-				if (!sc->memcg_low_reclaim) {
-					sc->memcg_low_skipped = 1;
-					continue;
-				}
-				memcg_memory_event(memcg, MEMCG_LOW);
-				break;
-			case MEMCG_PROT_NONE:
-				/*
-				 * All protection thresholds breached. We may
-				 * still choose to vary the scan pressure
-				 * applied based on by how much the cgroup in
-				 * question has exceeded its protection
-				 * thresholds (see get_scan_count).
-				 */
-				break;
 			}
+			memcg_memory_event(memcg, MEMCG_LOW);
+			break;
+		case MEMCG_PROT_NONE:
+			/*
+			 * All protection thresholds breached. We may
+			 * still choose to vary the scan pressure
+			 * applied based on by how much the cgroup in
+			 * question has exceeded its protection
+			 * thresholds (see get_scan_count).
+			 */
+			break;
+		}
 
-			reclaimed = sc->nr_reclaimed;
-			scanned = sc->nr_scanned;
-			shrink_node_memcg(pgdat, memcg, sc);
-
-			shrink_slab(sc->gfp_mask, pgdat->node_id, memcg,
-					sc->priority);
-
-			/* Record the group's reclaim efficiency */
-			vmpressure(sc->gfp_mask, memcg, false,
-				   sc->nr_scanned - scanned,
-				   sc->nr_reclaimed - reclaimed);
-
-		} while ((memcg = mem_cgroup_iter(root, memcg, NULL)));
+		reclaimed = sc->nr_reclaimed;
+		scanned = sc->nr_scanned;
+		shrink_node_memcg(pgdat, memcg, sc);
 
-		if (reclaim_state) {
-			sc->nr_reclaimed += reclaim_state->reclaimed_slab;
-			reclaim_state->reclaimed_slab = 0;
-		}
+		shrink_slab(sc->gfp_mask, pgdat->node_id, memcg,
+			    sc->priority);
 
-		/* Record the subtree's reclaim efficiency */
-		vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
-			   sc->nr_scanned - nr_scanned,
-			   sc->nr_reclaimed - nr_reclaimed);
+		/* Record the group's reclaim efficiency */
+		vmpressure(sc->gfp_mask, memcg, false,
+			   sc->nr_scanned - scanned,
+			   sc->nr_reclaimed - reclaimed);
 
-		if (sc->nr_reclaimed - nr_reclaimed)
-			reclaimable = true;
+	} while ((memcg = mem_cgroup_iter(root, memcg, NULL)));
 
-		if (current_is_kswapd()) {
-			/*
-			 * If reclaim is isolating dirty pages under writeback,
-			 * it implies that the long-lived page allocation rate
-			 * is exceeding the page laundering rate. Either the
-			 * global limits are not being effective at throttling
-			 * processes due to the page distribution throughout
-			 * zones or there is heavy usage of a slow backing
-			 * device. The only option is to throttle from reclaim
-			 * context which is not ideal as there is no guarantee
-			 * the dirtying process is throttled in the same way
-			 * balance_dirty_pages() manages.
-			 *
-			 * Once a node is flagged PGDAT_WRITEBACK, kswapd will
-			 * count the number of pages under pages flagged for
-			 * immediate reclaim and stall if any are encountered
-			 * in the nr_immediate check below.
-			 */
-			if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken)
-				set_bit(PGDAT_WRITEBACK, &pgdat->flags);
+	if (reclaim_state) {
+		sc->nr_reclaimed += reclaim_state->reclaimed_slab;
+		reclaim_state->reclaimed_slab = 0;
+	}
 
-			/*
-			 * Tag a node as congested if all the dirty pages
-			 * scanned were backed by a congested BDI and
-			 * wait_iff_congested will stall.
-			 */
-			if (sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
-				set_bit(PGDAT_CONGESTED, &pgdat->flags);
+	/* Record the subtree's reclaim efficiency */
+	vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
+		   sc->nr_scanned - nr_scanned,
+		   sc->nr_reclaimed - nr_reclaimed);
 
-			/* Allow kswapd to start writing pages during reclaim.*/
-			if (sc->nr.unqueued_dirty == sc->nr.file_taken)
-				set_bit(PGDAT_DIRTY, &pgdat->flags);
+	if (sc->nr_reclaimed - nr_reclaimed)
+		reclaimable = true;
 
-			/*
-			 * If kswapd scans pages marked marked for immediate
-			 * reclaim and under writeback (nr_immediate), it
-			 * implies that pages are cycling through the LRU
-			 * faster than they are written so also forcibly stall.
-			 */
-			if (sc->nr.immediate)
-				congestion_wait(BLK_RW_ASYNC, HZ/10);
-		}
+	if (current_is_kswapd()) {
+		/*
+		 * If reclaim is isolating dirty pages under writeback,
+		 * it implies that the long-lived page allocation rate
+		 * is exceeding the page laundering rate. Either the
+		 * global limits are not being effective at throttling
+		 * processes due to the page distribution throughout
+		 * zones or there is heavy usage of a slow backing
+		 * device. The only option is to throttle from reclaim
+		 * context which is not ideal as there is no guarantee
+		 * the dirtying process is throttled in the same way
+		 * balance_dirty_pages() manages.
+		 *
+		 * Once a node is flagged PGDAT_WRITEBACK, kswapd will
+		 * count the number of pages under pages flagged for
+		 * immediate reclaim and stall if any are encountered
+		 * in the nr_immediate check below.
+		 */
+		if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken)
+			set_bit(PGDAT_WRITEBACK, &pgdat->flags);
 
 		/*
-		 * Legacy memcg will stall in page writeback so avoid forcibly
-		 * stalling in wait_iff_congested().
+		 * Tag a node as congested if all the dirty pages
+		 * scanned were backed by a congested BDI and
+		 * wait_iff_congested will stall.
 		 */
-		if (cgroup_reclaim(sc) && writeback_throttling_sane(sc) &&
-		    sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
-			set_memcg_congestion(pgdat, root, true);
+		if (sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
+			set_bit(PGDAT_CONGESTED, &pgdat->flags);
+
+		/* Allow kswapd to start writing pages during reclaim.*/
+		if (sc->nr.unqueued_dirty == sc->nr.file_taken)
+			set_bit(PGDAT_DIRTY, &pgdat->flags);
 
 		/*
-		 * Stall direct reclaim for IO completions if underlying BDIs
-		 * and node is congested. Allow kswapd to continue until it
-		 * starts encountering unqueued dirty pages or cycling through
-		 * the LRU too quickly.
+		 * If kswapd scans pages marked marked for immediate
+		 * reclaim and under writeback (nr_immediate), it
+		 * implies that pages are cycling through the LRU
+		 * faster than they are written so also forcibly stall.
 		 */
-		if (!sc->hibernation_mode && !current_is_kswapd() &&
-		   current_may_throttle() && pgdat_memcg_congested(pgdat, root))
-			wait_iff_congested(BLK_RW_ASYNC, HZ/10);
+		if (sc->nr.immediate)
+			congestion_wait(BLK_RW_ASYNC, HZ/10);
+	}
+
+	/*
+	 * Legacy memcg will stall in page writeback so avoid forcibly
+	 * stalling in wait_iff_congested().
+	 */
+	if (cgroup_reclaim(sc) && writeback_throttling_sane(sc) &&
+	    sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
+		set_memcg_congestion(pgdat, root, true);
+
+	/*
+	 * Stall direct reclaim for IO completions if underlying BDIs
+	 * and node is congested. Allow kswapd to continue until it
+	 * starts encountering unqueued dirty pages or cycling through
+	 * the LRU too quickly.
+	 */
+	if (!sc->hibernation_mode && !current_is_kswapd() &&
+	    current_may_throttle() && pgdat_memcg_congested(pgdat, root))
+		wait_iff_congested(BLK_RW_ASYNC, HZ/10);
 
-	} while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
-					 sc));
+	if (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
+				    sc))
+		goto again;
 
 	/*
 	 * Kswapd gives up on balancing particular nodes after too