[RFC,20/26] mm: vmscan: use compaction_suitable() check in kswapd

Message ID 20230418191313.268131-21-hannes@cmpxchg.org (mailing list archive)
State New
Series mm: reliable huge page allocator

Commit Message

Johannes Weiner April 18, 2023, 7:13 p.m. UTC
Kswapd currently bails on higher-order allocations with an open-coded
check for whether it's reclaimed the compaction gap.

compaction_suitable() is the customary interface to coordinate reclaim
with compaction.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
---
 mm/vmscan.c | 67 ++++++++++++++++++-----------------------------------
 1 file changed, 23 insertions(+), 44 deletions(-)

Comments

Huang, Ying April 25, 2023, 3:12 a.m. UTC | #1
Johannes Weiner <hannes@cmpxchg.org> writes:

> Kswapd currently bails on higher-order allocations with an open-coded
> check for whether it's reclaimed the compaction gap.
>
> compaction_suitable() is the customary interface to coordinate reclaim
> with compaction.
>
> Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
> ---
>  mm/vmscan.c | 67 ++++++++++++++++++-----------------------------------
>  1 file changed, 23 insertions(+), 44 deletions(-)
>
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index ee8c8ca2e7b5..723705b9e4d9 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -6872,12 +6872,18 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx)
>  		if (!managed_zone(zone))
>  			continue;
>  
> +		/* Allocation can succeed in any zone, done */
>  		if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING)
>  			mark = wmark_pages(zone, WMARK_PROMO);
>  		else
>  			mark = high_wmark_pages(zone);
>  		if (zone_watermark_ok_safe(zone, order, mark, highest_zoneidx))
>  			return true;
> +
> +		/* Allocation can't succeed, but enough order-0 to compact */
> +		if (compaction_suitable(zone, order,
> +					highest_zoneidx) == COMPACT_CONTINUE)
> +			return true;

Should we check the following first?

        order > 0 && zone_watermark_ok_safe(zone, 0, mark, highest_zoneidx)

Best Regards,
Huang, Ying

>  	}
>  
>  	/*
> @@ -6968,16 +6974,6 @@ static bool kswapd_shrink_node(pg_data_t *pgdat,
>  	 */
>  	shrink_node(pgdat, sc);
>  
> -	/*
> -	 * Fragmentation may mean that the system cannot be rebalanced for
> -	 * high-order allocations. If twice the allocation size has been
> -	 * reclaimed then recheck watermarks only at order-0 to prevent
> -	 * excessive reclaim. Assume that a process requested a high-order
> -	 * can direct reclaim/compact.
> -	 */
> -	if (sc->order && sc->nr_reclaimed >= compact_gap(sc->order))
> -		sc->order = 0;
> -
>  	return sc->nr_scanned >= sc->nr_to_reclaim;
>  }
>  
> @@ -7018,15 +7014,13 @@ clear_reclaim_active(pg_data_t *pgdat, int highest_zoneidx)
>   * that are eligible for use by the caller until at least one zone is
>   * balanced.
>   *
> - * Returns the order kswapd finished reclaiming at.
> - *
>   * kswapd scans the zones in the highmem->normal->dma direction.  It skips
>   * zones which have free_pages > high_wmark_pages(zone), but once a zone is
>   * found to have free_pages <= high_wmark_pages(zone), any page in that zone
>   * or lower is eligible for reclaim until at least one usable zone is
>   * balanced.
>   */
> -static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
> +static void balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
>  {
>  	int i;
>  	unsigned long nr_soft_reclaimed;
> @@ -7226,14 +7220,6 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
>  	__fs_reclaim_release(_THIS_IP_);
>  	psi_memstall_leave(&pflags);
>  	set_task_reclaim_state(current, NULL);
> -
> -	/*
> -	 * Return the order kswapd stopped reclaiming at as
> -	 * prepare_kswapd_sleep() takes it into account. If another caller
> -	 * entered the allocator slow path while kswapd was awake, order will
> -	 * remain at the higher level.
> -	 */
> -	return sc.order;
>  }
>  
>  /*
> @@ -7251,7 +7237,7 @@ static enum zone_type kswapd_highest_zoneidx(pg_data_t *pgdat,
>  	return curr_idx == MAX_NR_ZONES ? prev_highest_zoneidx : curr_idx;
>  }
>  
> -static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order,
> +static void kswapd_try_to_sleep(pg_data_t *pgdat, int order,
>  				unsigned int highest_zoneidx)
>  {
>  	long remaining = 0;
> @@ -7269,7 +7255,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
>  	 * eligible zone balanced that it's also unlikely that compaction will
>  	 * succeed.
>  	 */
> -	if (prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) {
> +	if (prepare_kswapd_sleep(pgdat, order, highest_zoneidx)) {
>  		/*
>  		 * Compaction records what page blocks it recently failed to
>  		 * isolate pages from and skips them in the future scanning.
> @@ -7282,7 +7268,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
>  		 * We have freed the memory, now we should compact it to make
>  		 * allocation of the requested order possible.
>  		 */
> -		wakeup_kcompactd(pgdat, alloc_order, highest_zoneidx);
> +		wakeup_kcompactd(pgdat, order, highest_zoneidx);
>  
>  		remaining = schedule_timeout(HZ/10);
>  
> @@ -7296,8 +7282,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
>  					kswapd_highest_zoneidx(pgdat,
>  							highest_zoneidx));
>  
> -			if (READ_ONCE(pgdat->kswapd_order) < reclaim_order)
> -				WRITE_ONCE(pgdat->kswapd_order, reclaim_order);
> +			if (READ_ONCE(pgdat->kswapd_order) < order)
> +				WRITE_ONCE(pgdat->kswapd_order, order);
>  		}
>  
>  		finish_wait(&pgdat->kswapd_wait, &wait);
> @@ -7308,8 +7294,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
>  	 * After a short sleep, check if it was a premature sleep. If not, then
>  	 * go fully to sleep until explicitly woken up.
>  	 */
> -	if (!remaining &&
> -	    prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) {
> +	if (!remaining && prepare_kswapd_sleep(pgdat, order, highest_zoneidx)) {
>  		trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
>  
>  		/*
> @@ -7350,8 +7335,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
>   */
>  static int kswapd(void *p)
>  {
> -	unsigned int alloc_order, reclaim_order;
> -	unsigned int highest_zoneidx = MAX_NR_ZONES - 1;
> +	unsigned int order, highest_zoneidx;
>  	pg_data_t *pgdat = (pg_data_t *)p;
>  	struct task_struct *tsk = current;
>  	const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
> @@ -7374,22 +7358,20 @@ static int kswapd(void *p)
>  	tsk->flags |= PF_MEMALLOC | PF_KSWAPD;
>  	set_freezable();
>  
> -	WRITE_ONCE(pgdat->kswapd_order, 0);
> +	order = 0;
> +	highest_zoneidx = MAX_NR_ZONES - 1;
> +	WRITE_ONCE(pgdat->kswapd_order, order);
>  	WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);
> +
>  	atomic_set(&pgdat->nr_writeback_throttled, 0);
> +
>  	for ( ; ; ) {
>  		bool ret;
>  
> -		alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order);
> -		highest_zoneidx = kswapd_highest_zoneidx(pgdat,
> -							highest_zoneidx);
> -
> -kswapd_try_sleep:
> -		kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order,
> -					highest_zoneidx);
> +		kswapd_try_to_sleep(pgdat, order, highest_zoneidx);
>  
>  		/* Read the new order and highest_zoneidx */
> -		alloc_order = READ_ONCE(pgdat->kswapd_order);
> +		order = READ_ONCE(pgdat->kswapd_order);
>  		highest_zoneidx = kswapd_highest_zoneidx(pgdat,
>  							highest_zoneidx);
>  		WRITE_ONCE(pgdat->kswapd_order, 0);
> @@ -7415,11 +7397,8 @@ static int kswapd(void *p)
>  		 * request (alloc_order).
>  		 */
>  		trace_mm_vmscan_kswapd_wake(pgdat->node_id, highest_zoneidx,
> -						alloc_order);
> -		reclaim_order = balance_pgdat(pgdat, alloc_order,
> -						highest_zoneidx);
> -		if (reclaim_order < alloc_order)
> -			goto kswapd_try_sleep;
> +					    order);
> +		balance_pgdat(pgdat, order, highest_zoneidx);
>  	}
>  
>  	tsk->flags &= ~(PF_MEMALLOC | PF_KSWAPD);
Johannes Weiner April 25, 2023, 2:26 p.m. UTC | #2
On Tue, Apr 25, 2023 at 11:12:28AM +0800, Huang, Ying wrote:
> Johannes Weiner <hannes@cmpxchg.org> writes:
> 
> > Kswapd currently bails on higher-order allocations with an open-coded
> > check for whether it's reclaimed the compaction gap.
> >
> > compaction_suitable() is the customary interface to coordinate reclaim
> > with compaction.
> >
> > Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
> > ---
> >  mm/vmscan.c | 67 ++++++++++++++++++-----------------------------------
> >  1 file changed, 23 insertions(+), 44 deletions(-)
> >
> > diff --git a/mm/vmscan.c b/mm/vmscan.c
> > index ee8c8ca2e7b5..723705b9e4d9 100644
> > --- a/mm/vmscan.c
> > +++ b/mm/vmscan.c
> > @@ -6872,12 +6872,18 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx)
> >  		if (!managed_zone(zone))
> >  			continue;
> >  
> > +		/* Allocation can succeed in any zone, done */
> >  		if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING)
> >  			mark = wmark_pages(zone, WMARK_PROMO);
> >  		else
> >  			mark = high_wmark_pages(zone);
> >  		if (zone_watermark_ok_safe(zone, order, mark, highest_zoneidx))
> >  			return true;
> > +
> > +		/* Allocation can't succeed, but enough order-0 to compact */
> > +		if (compaction_suitable(zone, order,
> > +					highest_zoneidx) == COMPACT_CONTINUE)
> > +			return true;
> 
> Should we check the following first?
> 
>         order > 0 && zone_watermark_ok_safe(zone, 0, mark, highest_zoneidx)

That's what compaction_suitable() does. It checks whether there are
enough migration targets for compaction (COMPACT_CONTINUE) or whether
reclaim needs to do some more work (COMPACT_SKIPPED).
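
For reference, the check it boils down to is roughly this (simplified
sketch, not the verbatim kernel code):

	/*
	 * Enough order-0 free pages above the watermark plus the
	 * compaction gap to serve as migration targets?
	 */
	watermark = (order > PAGE_ALLOC_COSTLY_ORDER) ?
			low_wmark_pages(zone) : min_wmark_pages(zone);
	watermark += compact_gap(order);
	if (!__zone_watermark_ok(zone, 0, watermark, highest_zoneidx,
				 ALLOC_CMA,
				 zone_page_state(zone, NR_FREE_PAGES)))
		return COMPACT_SKIPPED;

	return COMPACT_CONTINUE;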
Huang, Ying April 26, 2023, 1:30 a.m. UTC | #3
Johannes Weiner <hannes@cmpxchg.org> writes:

> On Tue, Apr 25, 2023 at 11:12:28AM +0800, Huang, Ying wrote:
>> Johannes Weiner <hannes@cmpxchg.org> writes:
>> 
>> > Kswapd currently bails on higher-order allocations with an open-coded
>> > check for whether it's reclaimed the compaction gap.
>> >
>> > compaction_suitable() is the customary interface to coordinate reclaim
>> > with compaction.
>> >
>> > Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
>> > ---
>> >  mm/vmscan.c | 67 ++++++++++++++++++-----------------------------------
>> >  1 file changed, 23 insertions(+), 44 deletions(-)
>> >
>> > diff --git a/mm/vmscan.c b/mm/vmscan.c
>> > index ee8c8ca2e7b5..723705b9e4d9 100644
>> > --- a/mm/vmscan.c
>> > +++ b/mm/vmscan.c
>> > @@ -6872,12 +6872,18 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx)
>> >  		if (!managed_zone(zone))
>> >  			continue;
>> >  
>> > +		/* Allocation can succeed in any zone, done */
>> >  		if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING)
>> >  			mark = wmark_pages(zone, WMARK_PROMO);
>> >  		else
>> >  			mark = high_wmark_pages(zone);
>> >  		if (zone_watermark_ok_safe(zone, order, mark, highest_zoneidx))
>> >  			return true;
>> > +
>> > +		/* Allocation can't succeed, but enough order-0 to compact */
>> > +		if (compaction_suitable(zone, order,
>> > +					highest_zoneidx) == COMPACT_CONTINUE)
>> > +			return true;
>> 
>> Should we check the following first?
>> 
>>         order > 0 && zone_watermark_ok_safe(zone, 0, mark, highest_zoneidx)
>
> That's what compaction_suitable() does. It checks whether there are
> enough migration targets for compaction (COMPACT_CONTINUE) or whether
> reclaim needs to do some more work (COMPACT_SKIPPED).

Yes.  And I found that the watermark used in compaction_suitable() is
low_wmark_pages() or min_wmark_pages(), which doesn't match the
watermark here.  Or did I miss something?

Best Regards,
Huang, Ying
Johannes Weiner April 26, 2023, 3:22 p.m. UTC | #4
On Wed, Apr 26, 2023 at 09:30:23AM +0800, Huang, Ying wrote:
> Johannes Weiner <hannes@cmpxchg.org> writes:
> 
> > On Tue, Apr 25, 2023 at 11:12:28AM +0800, Huang, Ying wrote:
> >> Johannes Weiner <hannes@cmpxchg.org> writes:
> >> 
> >> > Kswapd currently bails on higher-order allocations with an open-coded
> >> > check for whether it's reclaimed the compaction gap.
> >> >
> >> > compaction_suitable() is the customary interface to coordinate reclaim
> >> > with compaction.
> >> >
> >> > Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
> >> > ---
> >> >  mm/vmscan.c | 67 ++++++++++++++++++-----------------------------------
> >> >  1 file changed, 23 insertions(+), 44 deletions(-)
> >> >
> >> > diff --git a/mm/vmscan.c b/mm/vmscan.c
> >> > index ee8c8ca2e7b5..723705b9e4d9 100644
> >> > --- a/mm/vmscan.c
> >> > +++ b/mm/vmscan.c
> >> > @@ -6872,12 +6872,18 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx)
> >> >  		if (!managed_zone(zone))
> >> >  			continue;
> >> >  
> >> > +		/* Allocation can succeed in any zone, done */
> >> >  		if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING)
> >> >  			mark = wmark_pages(zone, WMARK_PROMO);
> >> >  		else
> >> >  			mark = high_wmark_pages(zone);
> >> >  		if (zone_watermark_ok_safe(zone, order, mark, highest_zoneidx))
> >> >  			return true;
> >> > +
> >> > +		/* Allocation can't succeed, but enough order-0 to compact */
> >> > +		if (compaction_suitable(zone, order,
> >> > +					highest_zoneidx) == COMPACT_CONTINUE)
> >> > +			return true;
> >> 
> >> Should we check the following first?
> >> 
> >>         order > 0 && zone_watermark_ok_safe(zone, 0, mark, highest_zoneidx)
> >
> > That's what compaction_suitable() does. It checks whether there are
> > enough migration targets for compaction (COMPACT_CONTINUE) or whether
> > reclaim needs to do some more work (COMPACT_SKIPPED).
> 
> Yes.  And I found that the watermark used in compaction_suitable() is
> low_wmark_pages() or min_wmark_pages(), which doesn't match the
> watermark here.  Or did I miss something?

Ahh, you're right, kswapd will bail prematurely. Compaction cannot
reliably meet the high watermark with a low watermark scratch space.

I'll add the order check before the suitable test, for clarity, and so
that order-0 requests don't check the same thing twice.

For the watermark, I'd make it an arg to compaction_suitable() and use
whatever the reclaimer targets (high for kswapd, min for direct).
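
Something like this in pgdat_balanced(), where the extra watermark
argument to compaction_suitable() is the new parameter described above
(untested sketch):

	/* Allocation can succeed in any zone, done */
	if (zone_watermark_ok_safe(zone, order, mark, highest_zoneidx))
		return true;

	/* Can't meet the watermark, but enough order-0 space to compact */
	if (order > 0 &&
	    compaction_suitable(zone, order, mark,
				highest_zoneidx) == COMPACT_CONTINUE)
		return true;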

However, there is a minor snag. compaction_suitable() currently has
its own smarts regarding the watermark:

	/*
	 * Watermarks for order-0 must be met for compaction to be able to
	 * isolate free pages for migration targets. This means that the
	 * watermark and alloc_flags have to match, or be more pessimistic than
	 * the check in __isolate_free_page(). We don't use the direct
	 * compactor's alloc_flags, as they are not relevant for freepage
	 * isolation. We however do use the direct compactor's highest_zoneidx
	 * to skip over zones where lowmem reserves would prevent allocation
	 * even if compaction succeeds.
	 * For costly orders, we require low watermark instead of min for
	 * compaction to proceed to increase its chances.
	 * ALLOC_CMA is used, as pages in CMA pageblocks are considered
	 * suitable migration targets
	 */
	watermark = (order > PAGE_ALLOC_COSTLY_ORDER) ?
				low_wmark_pages(zone) : min_wmark_pages(zone);

Historically it has always checked low instead of min. Then Vlastimil
changed it to min for non-costly orders here:

commit 8348faf91f56371d4bada6fc5915e19580a15ffe
Author: Vlastimil Babka <vbabka@suse.cz>
Date:   Fri Oct 7 16:58:00 2016 -0700

    mm, compaction: require only min watermarks for non-costly orders
    
    The __compaction_suitable() function checks the low watermark plus a
    compact_gap() gap to decide if there's enough free memory to perform
    compaction.  Then __isolate_free_page uses low watermark check to decide
    if particular free page can be isolated.  In the latter case, using low
    watermark is needlessly pessimistic, as the free page isolations are
    only temporary.  For __compaction_suitable() the higher watermark makes
    sense for high-order allocations where more freepages increase the
    chance of success, and we can typically fail with some order-0 fallback
    when the system is struggling to reach that watermark.  But for
    low-order allocation, forming the page should not be that hard.  So
    using low watermark here might just prevent compaction from even trying,
    and eventually lead to OOM killer even if we are above min watermarks.
    
    So after this patch, we use min watermark for non-costly orders in
    __compaction_suitable(), and for all orders in __isolate_free_page().

Lowering to min wasn't an issue for non-costly, but AFAICS there was
no explicit testing for whether min would work for costly orders too.

I'd propose trying it with min even for costly and see what happens.

If it does regress, a better place to boost scratch space for costly
orders might be compact_gap(), so I'd move it there.
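
E.g. something along these lines (purely illustrative, the amount of
extra headroom for costly orders is a guess and would need testing):

	static inline unsigned long compact_gap(unsigned int order)
	{
		unsigned long gap = 2UL << order;

		/*
		 * Extra order-0 headroom for costly orders instead of
		 * requiring the low watermark in compaction_suitable().
		 */
		if (order > PAGE_ALLOC_COSTLY_ORDER)
			gap <<= 1;

		return gap;
	}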

Does that sound reasonable?
Huang, Ying April 27, 2023, 5:41 a.m. UTC | #5
Johannes Weiner <hannes@cmpxchg.org> writes:

> On Wed, Apr 26, 2023 at 09:30:23AM +0800, Huang, Ying wrote:
>> Johannes Weiner <hannes@cmpxchg.org> writes:
>> 
>> > On Tue, Apr 25, 2023 at 11:12:28AM +0800, Huang, Ying wrote:
>> >> Johannes Weiner <hannes@cmpxchg.org> writes:
>> >> 
>> >> > Kswapd currently bails on higher-order allocations with an open-coded
>> >> > check for whether it's reclaimed the compaction gap.
>> >> >
>> >> > compaction_suitable() is the customary interface to coordinate reclaim
>> >> > with compaction.
>> >> >
>> >> > Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
>> >> > ---
>> >> >  mm/vmscan.c | 67 ++++++++++++++++++-----------------------------------
>> >> >  1 file changed, 23 insertions(+), 44 deletions(-)
>> >> >
>> >> > diff --git a/mm/vmscan.c b/mm/vmscan.c
>> >> > index ee8c8ca2e7b5..723705b9e4d9 100644
>> >> > --- a/mm/vmscan.c
>> >> > +++ b/mm/vmscan.c
>> >> > @@ -6872,12 +6872,18 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx)
>> >> >  		if (!managed_zone(zone))
>> >> >  			continue;
>> >> >  
>> >> > +		/* Allocation can succeed in any zone, done */
>> >> >  		if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING)
>> >> >  			mark = wmark_pages(zone, WMARK_PROMO);
>> >> >  		else
>> >> >  			mark = high_wmark_pages(zone);
>> >> >  		if (zone_watermark_ok_safe(zone, order, mark, highest_zoneidx))
>> >> >  			return true;
>> >> > +
>> >> > +		/* Allocation can't succeed, but enough order-0 to compact */
>> >> > +		if (compaction_suitable(zone, order,
>> >> > +					highest_zoneidx) == COMPACT_CONTINUE)
>> >> > +			return true;
>> >> 
>> >> Should we check the following first?
>> >> 
>> >>         order > 0 && zone_watermark_ok_safe(zone, 0, mark, highest_zoneidx)
>> >
>> > That's what compaction_suitable() does. It checks whether there are
>> > enough migration targets for compaction (COMPACT_CONTINUE) or whether
>> > reclaim needs to do some more work (COMPACT_SKIPPED).
>> 
>> Yes.  And I found that the watermark used in compaction_suitable() is
>> low_wmark_pages() or min_wmark_pages(), which doesn't match the
>> watermark here.  Or did I miss something?
>
> Ahh, you're right, kswapd will bail prematurely. Compaction cannot
> reliably meet the high watermark with a low watermark scratch space.
>
> I'll add the order check before the suitable test, for clarity, and so
> that order-0 requests don't check the same thing twice.
>
> For the watermark, I'd make it an arg to compaction_suitable() and use
> whatever the reclaimer targets (high for kswapd, min for direct).
>
> However, there is a minor snag. compaction_suitable() currently has
> its own smarts regarding the watermark:
>
> 	/*
> 	 * Watermarks for order-0 must be met for compaction to be able to
> 	 * isolate free pages for migration targets. This means that the
> 	 * watermark and alloc_flags have to match, or be more pessimistic than
> 	 * the check in __isolate_free_page(). We don't use the direct
> 	 * compactor's alloc_flags, as they are not relevant for freepage
> 	 * isolation. We however do use the direct compactor's highest_zoneidx
> 	 * to skip over zones where lowmem reserves would prevent allocation
> 	 * even if compaction succeeds.
> 	 * For costly orders, we require low watermark instead of min for
> 	 * compaction to proceed to increase its chances.
> 	 * ALLOC_CMA is used, as pages in CMA pageblocks are considered
> 	 * suitable migration targets
> 	 */
> 	watermark = (order > PAGE_ALLOC_COSTLY_ORDER) ?
> 				low_wmark_pages(zone) : min_wmark_pages(zone);
>
> Historically it has always checked low instead of min. Then Vlastimil
> changed it to min for non-costly orders here:
>
> commit 8348faf91f56371d4bada6fc5915e19580a15ffe
> Author: Vlastimil Babka <vbabka@suse.cz>
> Date:   Fri Oct 7 16:58:00 2016 -0700
>
>     mm, compaction: require only min watermarks for non-costly orders
>     
>     The __compaction_suitable() function checks the low watermark plus a
>     compact_gap() gap to decide if there's enough free memory to perform
>     compaction.  Then __isolate_free_page uses low watermark check to decide
>     if particular free page can be isolated.  In the latter case, using low
>     watermark is needlessly pessimistic, as the free page isolations are
>     only temporary.  For __compaction_suitable() the higher watermark makes
>     sense for high-order allocations where more freepages increase the
>     chance of success, and we can typically fail with some order-0 fallback
>     when the system is struggling to reach that watermark.  But for
>     low-order allocation, forming the page should not be that hard.  So
>     using low watermark here might just prevent compaction from even trying,
>     and eventually lead to OOM killer even if we are above min watermarks.
>     
>     So after this patch, we use min watermark for non-costly orders in
>     __compaction_suitable(), and for all orders in __isolate_free_page().
>
> Lowering to min wasn't an issue for non-costly, but AFAICS there was
> no explicit testing for whether min would work for costly orders too.
>
> I'd propose trying it with min even for costly and see what happens.
>
> If it does regress, a better place to boost scratch space for costly
> orders might be compact_gap(), so I'd move it there.
>
> Does that sound reasonable?

Sounds good to me, thanks!

Best Regards,
Huang, Ying

Patch

diff --git a/mm/vmscan.c b/mm/vmscan.c
index ee8c8ca2e7b5..723705b9e4d9 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -6872,12 +6872,18 @@  static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx)
 		if (!managed_zone(zone))
 			continue;
 
+		/* Allocation can succeed in any zone, done */
 		if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING)
 			mark = wmark_pages(zone, WMARK_PROMO);
 		else
 			mark = high_wmark_pages(zone);
 		if (zone_watermark_ok_safe(zone, order, mark, highest_zoneidx))
 			return true;
+
+		/* Allocation can't succeed, but enough order-0 to compact */
+		if (compaction_suitable(zone, order,
+					highest_zoneidx) == COMPACT_CONTINUE)
+			return true;
 	}
 
 	/*
@@ -6968,16 +6974,6 @@  static bool kswapd_shrink_node(pg_data_t *pgdat,
 	 */
 	shrink_node(pgdat, sc);
 
-	/*
-	 * Fragmentation may mean that the system cannot be rebalanced for
-	 * high-order allocations. If twice the allocation size has been
-	 * reclaimed then recheck watermarks only at order-0 to prevent
-	 * excessive reclaim. Assume that a process requested a high-order
-	 * can direct reclaim/compact.
-	 */
-	if (sc->order && sc->nr_reclaimed >= compact_gap(sc->order))
-		sc->order = 0;
-
 	return sc->nr_scanned >= sc->nr_to_reclaim;
 }
 
@@ -7018,15 +7014,13 @@  clear_reclaim_active(pg_data_t *pgdat, int highest_zoneidx)
  * that are eligible for use by the caller until at least one zone is
  * balanced.
  *
- * Returns the order kswapd finished reclaiming at.
- *
  * kswapd scans the zones in the highmem->normal->dma direction.  It skips
  * zones which have free_pages > high_wmark_pages(zone), but once a zone is
  * found to have free_pages <= high_wmark_pages(zone), any page in that zone
  * or lower is eligible for reclaim until at least one usable zone is
  * balanced.
  */
-static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
+static void balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
 {
 	int i;
 	unsigned long nr_soft_reclaimed;
@@ -7226,14 +7220,6 @@  static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
 	__fs_reclaim_release(_THIS_IP_);
 	psi_memstall_leave(&pflags);
 	set_task_reclaim_state(current, NULL);
-
-	/*
-	 * Return the order kswapd stopped reclaiming at as
-	 * prepare_kswapd_sleep() takes it into account. If another caller
-	 * entered the allocator slow path while kswapd was awake, order will
-	 * remain at the higher level.
-	 */
-	return sc.order;
 }
 
 /*
@@ -7251,7 +7237,7 @@  static enum zone_type kswapd_highest_zoneidx(pg_data_t *pgdat,
 	return curr_idx == MAX_NR_ZONES ? prev_highest_zoneidx : curr_idx;
 }
 
-static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order,
+static void kswapd_try_to_sleep(pg_data_t *pgdat, int order,
 				unsigned int highest_zoneidx)
 {
 	long remaining = 0;
@@ -7269,7 +7255,7 @@  static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
 	 * eligible zone balanced that it's also unlikely that compaction will
 	 * succeed.
 	 */
-	if (prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) {
+	if (prepare_kswapd_sleep(pgdat, order, highest_zoneidx)) {
 		/*
 		 * Compaction records what page blocks it recently failed to
 		 * isolate pages from and skips them in the future scanning.
@@ -7282,7 +7268,7 @@  static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
 		 * We have freed the memory, now we should compact it to make
 		 * allocation of the requested order possible.
 		 */
-		wakeup_kcompactd(pgdat, alloc_order, highest_zoneidx);
+		wakeup_kcompactd(pgdat, order, highest_zoneidx);
 
 		remaining = schedule_timeout(HZ/10);
 
@@ -7296,8 +7282,8 @@  static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
 					kswapd_highest_zoneidx(pgdat,
 							highest_zoneidx));
 
-			if (READ_ONCE(pgdat->kswapd_order) < reclaim_order)
-				WRITE_ONCE(pgdat->kswapd_order, reclaim_order);
+			if (READ_ONCE(pgdat->kswapd_order) < order)
+				WRITE_ONCE(pgdat->kswapd_order, order);
 		}
 
 		finish_wait(&pgdat->kswapd_wait, &wait);
@@ -7308,8 +7294,7 @@  static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
 	 * After a short sleep, check if it was a premature sleep. If not, then
 	 * go fully to sleep until explicitly woken up.
 	 */
-	if (!remaining &&
-	    prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) {
+	if (!remaining && prepare_kswapd_sleep(pgdat, order, highest_zoneidx)) {
 		trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
 
 		/*
@@ -7350,8 +7335,7 @@  static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
  */
 static int kswapd(void *p)
 {
-	unsigned int alloc_order, reclaim_order;
-	unsigned int highest_zoneidx = MAX_NR_ZONES - 1;
+	unsigned int order, highest_zoneidx;
 	pg_data_t *pgdat = (pg_data_t *)p;
 	struct task_struct *tsk = current;
 	const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
@@ -7374,22 +7358,20 @@  static int kswapd(void *p)
 	tsk->flags |= PF_MEMALLOC | PF_KSWAPD;
 	set_freezable();
 
-	WRITE_ONCE(pgdat->kswapd_order, 0);
+	order = 0;
+	highest_zoneidx = MAX_NR_ZONES - 1;
+	WRITE_ONCE(pgdat->kswapd_order, order);
 	WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);
+
 	atomic_set(&pgdat->nr_writeback_throttled, 0);
+
 	for ( ; ; ) {
 		bool ret;
 
-		alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order);
-		highest_zoneidx = kswapd_highest_zoneidx(pgdat,
-							highest_zoneidx);
-
-kswapd_try_sleep:
-		kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order,
-					highest_zoneidx);
+		kswapd_try_to_sleep(pgdat, order, highest_zoneidx);
 
 		/* Read the new order and highest_zoneidx */
-		alloc_order = READ_ONCE(pgdat->kswapd_order);
+		order = READ_ONCE(pgdat->kswapd_order);
 		highest_zoneidx = kswapd_highest_zoneidx(pgdat,
 							highest_zoneidx);
 		WRITE_ONCE(pgdat->kswapd_order, 0);
@@ -7415,11 +7397,8 @@  static int kswapd(void *p)
 		 * request (alloc_order).
 		 */
 		trace_mm_vmscan_kswapd_wake(pgdat->node_id, highest_zoneidx,
-						alloc_order);
-		reclaim_order = balance_pgdat(pgdat, alloc_order,
-						highest_zoneidx);
-		if (reclaim_order < alloc_order)
-			goto kswapd_try_sleep;
+					    order);
+		balance_pgdat(pgdat, order, highest_zoneidx);
 	}
 
 	tsk->flags &= ~(PF_MEMALLOC | PF_KSWAPD);