
[1/2] mm: multi-gen LRU: retry folios written back while isolated

Message ID 20221116013808.3995280-1-yuzhao@google.com (mailing list archive)
State New
Series [1/2] mm: multi-gen LRU: retry folios written back while isolated

Commit Message

Yu Zhao Nov. 16, 2022, 1:38 a.m. UTC
The page reclaim isolates a batch of folios from the tail of one of
the LRU lists and works on those folios one by one. For a suitable
swap-backed folio, if the swap device is async, it queues that folio
for writeback. After the page reclaim finishes an entire batch, it
puts the folios it queued for writeback back at the head of the
original LRU list.

In the meantime, the page writeback flushes the queued folios, also in
batches. Its batching logic is independent of that of the page
reclaim. For each of the folios it writes back, the page writeback
calls folio_rotate_reclaimable() which tries to rotate a folio to the
tail.

folio_rotate_reclaimable() only works for a folio after the page
reclaim has put it back. If an async swap device is fast enough, the
page writeback can finish with that folio while the page reclaim is
still working on the rest of the batch containing it. In this case,
that folio will remain at the head, and the page reclaim will not retry
it until it works its way back up to the head of the list.
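
Roughly, folio_rotate_reclaimable() looks like the simplified sketch
below (not the exact code in mm/swap.c; the real function batches the
move under a per-CPU lock). The folio_test_lru() check is what an
isolated folio fails:

    void folio_rotate_reclaimable(struct folio *folio)
    {
        /* An isolated folio fails folio_test_lru() and is skipped. */
        if (folio_test_locked(folio) || folio_test_dirty(folio) ||
            folio_test_unevictable(folio) || !folio_test_lru(folio))
            return;

        /* Queue the folio to be moved to the tail of its LRU list. */
        ...
    }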

This patch adds a retry to evict_folios(). After evict_folios() has
finished an entire batch and before it puts back folios it cannot free
immediately, it retries those that may have missed the rotation.

Before this patch, ~60% of folios swapped to an Intel Optane device
missed folio_rotate_reclaimable(). After this patch, ~99% of missed folios
were reclaimed upon retry.

This problem affects relatively slow async swap devices like the Samsung
980 Pro much less, and it does not affect sync swap devices like zram or
zswap at all.
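
In outline, the change looks like the sketch below (simplified;
folio_missed_rotation() is a hypothetical helper standing in for the
actual checks -- see the diff for the real code):

    retry:
        reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false);
        sc->nr_reclaimed += reclaimed;

        list_for_each_entry_safe_reverse(folio, next, &list, lru) {
            /*
             * Hypothetical helper: true for clean, unreferenced folios
             * that are no longer under writeback.
             */
            if (!skip_retry && folio_missed_rotation(folio)) {
                /* retry folios that may have missed folio_rotate_reclaimable() */
                list_move(&folio->lru, &clean);
                sc->nr_scanned -= folio_nr_pages(folio);
                continue;
            }
            /* everything else is put back on the LRU as before */
        }

        /* ... put back what is left on "list", free what was reclaimed ... */

        if (!list_empty(&clean)) {
            list_splice_init(&clean, &list);
            skip_retry = true;      /* each folio is retried at most once */
            goto retry;
        }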

Fixes: ac35a4902374 ("mm: multi-gen LRU: minimal implementation")
Signed-off-by: Yu Zhao <yuzhao@google.com>
---
 mm/vmscan.c | 48 +++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 37 insertions(+), 11 deletions(-)

Comments

Yin, Fengwei Nov. 16, 2022, 3:07 a.m. UTC | #1
On 11/16/2022 9:38 AM, Yu Zhao wrote:
> The page reclaim isolates a batch of folios from the tail of one of
> the LRU lists and works on those folios one by one. For a suitable
> swap-backed folio, if the swap device is async, it queues that folio
> for writeback. After the page reclaim finishes an entire batch, it
> puts back the folios it queued for writeback to the head of the
> original LRU list.
> 
> In the meantime, the page writeback flushes the queued folios also by
> batches. Its batching logic is independent from that of the page
> reclaim. For each of the folios it writes back, the page writeback
> calls folio_rotate_reclaimable() which tries to rotate a folio to the
> tail.
> 
> folio_rotate_reclaimable() only works for a folio after the page
> reclaim has put it back. If an async swap device is fast enough, the
> page writeback can finish with that folio while the page reclaim is
> still working on the rest of the batch containing it. In this case,
> that folio will remain at the head and the page reclaim will not retry
> it before reaching there.
> 
> This patch adds a retry to evict_folios(). After evict_folios() has
> finished an entire batch and before it puts back folios it cannot free
> immediately, it retries those that may have missed the rotation.
> 
> Before this patch, ~60% of folios swapped to an Intel Optane missed
> folio_rotate_reclaimable(). After this patch, ~99% of missed folios
> were reclaimed upon retry.
> 
> This problem affects relatively slow async swap devices like Samsung
> 980 Pro much less and does not affect sync swap devices like zram or
> zswap at all.
> 
> Fixes: ac35a4902374 ("mm: multi-gen LRU: minimal implementation")
> Signed-off-by: Yu Zhao <yuzhao@google.com>
> ---
>  mm/vmscan.c | 48 +++++++++++++++++++++++++++++++++++++-----------
>  1 file changed, 37 insertions(+), 11 deletions(-)
> 
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 04d8b88e5216..dc6ebafa0a37 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -4971,10 +4971,13 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
>  	int scanned;
>  	int reclaimed;
>  	LIST_HEAD(list);
> +	LIST_HEAD(clean);
>  	struct folio *folio;
> +	struct folio *next;
>  	enum vm_event_item item;
>  	struct reclaim_stat stat;
>  	struct lru_gen_mm_walk *walk;
> +	bool skip_retry = false;
>  	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
>  	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
>  
> @@ -4991,20 +4994,37 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
>  
>  	if (list_empty(&list))
>  		return scanned;
> -
> +retry:
>  	reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false);
> +	sc->nr_reclaimed += reclaimed;
>  
> -	list_for_each_entry(folio, &list, lru) {
> -		/* restore LRU_REFS_FLAGS cleared by isolate_folio() */
> -		if (folio_test_workingset(folio))
> -			folio_set_referenced(folio);
> +	list_for_each_entry_safe_reverse(folio, next, &list, lru) {
> +		if (!folio_evictable(folio)) {
> +			list_del(&folio->lru);
> +			folio_putback_lru(folio);
> +			continue;
> +		}
Dumb question:
My understanding: unevictable folios were filtered out in sort_folios().
So is this because a folio could become unevictable during the retry? Thanks.


Regards
Yin, Fengwei

>  
> -		/* don't add rejected pages to the oldest generation */
>  		if (folio_test_reclaim(folio) &&
> -		    (folio_test_dirty(folio) || folio_test_writeback(folio)))
> -			folio_clear_active(folio);
> -		else
> -			folio_set_active(folio);
> +		    (folio_test_dirty(folio) || folio_test_writeback(folio))) {
> +			/* restore LRU_REFS_FLAGS cleared by isolate_folio() */
> +			if (folio_test_workingset(folio))
> +				folio_set_referenced(folio);
> +			continue;
> +		}
> +
> +		if (skip_retry || folio_test_active(folio) || folio_test_referenced(folio) ||
> +		    folio_mapped(folio) || folio_test_locked(folio) ||
> +		    folio_test_dirty(folio) || folio_test_writeback(folio)) {
> +			/* don't add rejected folios to the oldest generation */
> +			set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS,
> +				      BIT(PG_active));
> +			continue;
> +		}
> +
> +		/* retry folios that may have missed folio_rotate_reclaimable() */
> +		list_move(&folio->lru, &clean);
> +		sc->nr_scanned -= folio_nr_pages(folio);
>  	}
>  
>  	spin_lock_irq(&lruvec->lru_lock);
> @@ -5026,7 +5046,13 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
>  	mem_cgroup_uncharge_list(&list);
>  	free_unref_page_list(&list);
>  
> -	sc->nr_reclaimed += reclaimed;
> +	INIT_LIST_HEAD(&list);
> +	list_splice_init(&clean, &list);
> +
> +	if (!list_empty(&list)) {
> +		skip_retry = true;
> +		goto retry;
> +	}
>  
>  	if (need_swapping && type == LRU_GEN_ANON)
>  		*need_swapping = true;
Yu Zhao Nov. 16, 2022, 3:55 a.m. UTC | #2
On Tue, Nov 15, 2022 at 8:07 PM Yin, Fengwei <fengwei.yin@intel.com> wrote:
>
> On 11/16/2022 9:38 AM, Yu Zhao wrote:
> > The page reclaim isolates a batch of folios from the tail of one of
> > the LRU lists and works on those folios one by one. For a suitable
> > swap-backed folio, if the swap device is async, it queues that folio
> > for writeback. After the page reclaim finishes an entire batch, it
> > puts back the folios it queued for writeback to the head of the
> > original LRU list.
> >
> > In the meantime, the page writeback flushes the queued folios also by
> > batches. Its batching logic is independent from that of the page
> > reclaim. For each of the folios it writes back, the page writeback
> > calls folio_rotate_reclaimable() which tries to rotate a folio to the
> > tail.
> >
> > folio_rotate_reclaimable() only works for a folio after the page
> > reclaim has put it back. If an async swap device is fast enough, the
> > page writeback can finish with that folio while the page reclaim is
> > still working on the rest of the batch containing it. In this case,
> > that folio will remain at the head and the page reclaim will not retry
> > it before reaching there.
> >
> > This patch adds a retry to evict_folios(). After evict_folios() has
> > finished an entire batch and before it puts back folios it cannot free
> > immediately, it retries those that may have missed the rotation.
> >
> > Before this patch, ~60% of folios swapped to an Intel Optane missed
> > folio_rotate_reclaimable(). After this patch, ~99% of missed folios
> > were reclaimed upon retry.
> >
> > This problem affects relatively slow async swap devices like Samsung
> > 980 Pro much less and does not affect sync swap devices like zram or
> > zswap at all.
> >
> > Fixes: ac35a4902374 ("mm: multi-gen LRU: minimal implementation")
> > Signed-off-by: Yu Zhao <yuzhao@google.com>
> > ---
> >  mm/vmscan.c | 48 +++++++++++++++++++++++++++++++++++++-----------
> >  1 file changed, 37 insertions(+), 11 deletions(-)
> >
> > diff --git a/mm/vmscan.c b/mm/vmscan.c
> > index 04d8b88e5216..dc6ebafa0a37 100644
> > --- a/mm/vmscan.c
> > +++ b/mm/vmscan.c
> > @@ -4971,10 +4971,13 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
> >       int scanned;
> >       int reclaimed;
> >       LIST_HEAD(list);
> > +     LIST_HEAD(clean);
> >       struct folio *folio;
> > +     struct folio *next;
> >       enum vm_event_item item;
> >       struct reclaim_stat stat;
> >       struct lru_gen_mm_walk *walk;
> > +     bool skip_retry = false;
> >       struct mem_cgroup *memcg = lruvec_memcg(lruvec);
> >       struct pglist_data *pgdat = lruvec_pgdat(lruvec);
> >
> > @@ -4991,20 +4994,37 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
> >
> >       if (list_empty(&list))
> >               return scanned;
> > -
> > +retry:
> >       reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false);
> > +     sc->nr_reclaimed += reclaimed;
> >
> > -     list_for_each_entry(folio, &list, lru) {
> > -             /* restore LRU_REFS_FLAGS cleared by isolate_folio() */
> > -             if (folio_test_workingset(folio))
> > -                     folio_set_referenced(folio);
> > +     list_for_each_entry_safe_reverse(folio, next, &list, lru) {
> > +             if (!folio_evictable(folio)) {
> > +                     list_del(&folio->lru);
> > +                     folio_putback_lru(folio);
> > +                     continue;
> > +             }
> Dumb question:
> My understanding: unevictable folios were filtered out in sort_folios().
> So is this because a folio could become unevictable during the retry? Thanks.

If a folio is mlocked, it's unevictable. A different thread can call
mlock_folio() anytime, so we can see unevictable folios in
sort_folios(), here before and after retry, and later in
move_folios_to_lru(). In all these cases, we move mlocked folios to
the (imaginary) unevictable LRU and __mlock_page() bails out early.
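
FWIW, folio_evictable() roughly reduces to the following check (a
paraphrase of mm/internal.h, not an exact quote):

    static inline bool folio_evictable(struct folio *folio)
    {
        bool ret;

        /* Prevent address_space of inode and swap cache from being freed */
        rcu_read_lock();
        ret = !mapping_unevictable(folio_mapping(folio)) &&
              !folio_test_unevictable(folio);
        rcu_read_unlock();
        return ret;
    }

So a folio whose flags or mapping have become unevictable between the
two passes fails this check and is put back via folio_putback_lru()
instead of being retried.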
Andrew Morton Nov. 16, 2022, 10:59 p.m. UTC | #3
On Tue, 15 Nov 2022 18:38:07 -0700 Yu Zhao <yuzhao@google.com> wrote:

> The page reclaim isolates a batch of folios from the tail of one of
> the LRU lists and works on those folios one by one. For a suitable
> swap-backed folio, if the swap device is async, it queues that folio
> for writeback. After the page reclaim finishes an entire batch, it
> puts back the folios it queued for writeback to the head of the
> original LRU list.
> 
> In the meantime, the page writeback flushes the queued folios also by
> batches. Its batching logic is independent from that of the page
> reclaim. For each of the folios it writes back, the page writeback
> calls folio_rotate_reclaimable() which tries to rotate a folio to the
> tail.
> 
> folio_rotate_reclaimable() only works for a folio after the page
> reclaim has put it back. If an async swap device is fast enough, the
> page writeback can finish with that folio while the page reclaim is
> still working on the rest of the batch containing it. In this case,
> that folio will remain at the head and the page reclaim will not retry
> it before reaching there.
> 
> This patch adds a retry to evict_folios(). After evict_folios() has
> finished an entire batch and before it puts back folios it cannot free
> immediately, it retries those that may have missed the rotation.
> 
> Before this patch, ~60% of folios swapped to an Intel Optane missed
> folio_rotate_reclaimable(). After this patch, ~99% of missed folios
> were reclaimed upon retry.
> 
> This problem affects relatively slow async swap devices like Samsung
> 980 Pro much less and does not affect sync swap devices like zram or
> zswap at all.

As I understand it, this approach has an implicit assumption that by
the time evict_folios() has completed its first pass, write IOs will
have completed and the resulting folios are available for processing on
evict_folios()'s second pass, yes?

If so, it all kinda works by luck of timing.  If the swap device is
even slower, the number of folios which are unavailable on the second
pass will increase?

Can we make this more deterministic?  For example change evict_folios()
to recognize this situation and to then do folio_rotate_reclaimable()'s
work for it?  Or if that isn't practical, do something else?

(Is folio_rotate_reclaimable() actually useful?  That concept must be
20 years old.  What breaks if we just delete it and leave the pages
wherever they are?)
Yu Zhao Nov. 17, 2022, 12:12 a.m. UTC | #4
On Wed, Nov 16, 2022 at 3:59 PM Andrew Morton <akpm@linux-foundation.org> wrote:
>
> On Tue, 15 Nov 2022 18:38:07 -0700 Yu Zhao <yuzhao@google.com> wrote:
>
> > The page reclaim isolates a batch of folios from the tail of one of
> > the LRU lists and works on those folios one by one. For a suitable
> > swap-backed folio, if the swap device is async, it queues that folio
> > for writeback. After the page reclaim finishes an entire batch, it
> > puts back the folios it queued for writeback to the head of the
> > original LRU list.
> >
> > In the meantime, the page writeback flushes the queued folios also by
> > batches. Its batching logic is independent from that of the page
> > reclaim. For each of the folios it writes back, the page writeback
> > calls folio_rotate_reclaimable() which tries to rotate a folio to the
> > tail.
> >
> > folio_rotate_reclaimable() only works for a folio after the page
> > reclaim has put it back. If an async swap device is fast enough, the
> > page writeback can finish with that folio while the page reclaim is
> > still working on the rest of the batch containing it. In this case,
> > that folio will remain at the head and the page reclaim will not retry
> > it before reaching there.
> >
> > This patch adds a retry to evict_folios(). After evict_folios() has
> > finished an entire batch and before it puts back folios it cannot free
> > immediately, it retries those that may have missed the rotation.
> >
> > Before this patch, ~60% of folios swapped to an Intel Optane missed
> > folio_rotate_reclaimable(). After this patch, ~99% of missed folios
> > were reclaimed upon retry.
> >
> > This problem affects relatively slow async swap devices like Samsung
> > 980 Pro much less and does not affect sync swap devices like zram or
> > zswap at all.
>
> As I understand it, this approach has an implicit assumption that by
> the time evict_folios() has completed its first pass, write IOs will
> have completed and the resulting folios are available for processing on
> evict_folios()'s second pass, yes?

Correct.

> If so, it all kinda works by luck of timing.

Yes, it's betting on luck. But it's a very good bet because the race
window on the second pass is probably 100 times smaller.

The race window on the first pass is the while() loop in
shrink_folio_list(), and it has a lot to work on. The race window on
the second pass is a simple list_for_each_entry_safe_reverse() loop.
This small race window is closed immediately after we put the folios
that are still under writeback back on the LRU list. Then we call
shrink_folio_list() again for the retry.

> If the swap device is
> even slower, the number of folios which are unavailable on the second
> pass will increase?

Correct.

> Can we make this more deterministic?  For example change evict_folios()
> to recognize this situation and to then do folio_rotate_reclaimable()'s
> work for it?  Or if that isn't practical, do something else?

There are multiple options, none of them is a better tradeoff:

  1) the page reclaim telling the page writeback exactly when to flush.
    pro: more reliable
    con: the page reclaim doesn't know better

  2) adding a synchronization mechanism between the two
    pro: more reliable
    con: a lot more complexity

  3) unlock folios and submit bio after they are put back on LRU (my
second choice)
    pro: more reliable
    con: more complexity (within mm)

> (Is folio_rotate_reclaimable() actually useful?  That concept must be
> 20 years old.  What breaks if we just delete it and leave the pages
> wherever they are?)

Most people use zram (with rw_page) or zswap nowadays, and they don't
need folio_rotate_reclaimable(). But we still need that function to
support swapping to SSD. (Optane is discontinued.)
Minchan Kim Nov. 17, 2022, 7:46 a.m. UTC | #5
On Tue, Nov 15, 2022 at 06:38:07PM -0700, Yu Zhao wrote:
> The page reclaim isolates a batch of folios from the tail of one of
> the LRU lists and works on those folios one by one. For a suitable
> swap-backed folio, if the swap device is async, it queues that folio
> for writeback. After the page reclaim finishes an entire batch, it
> puts back the folios it queued for writeback to the head of the
> original LRU list.
> 
> In the meantime, the page writeback flushes the queued folios also by
> batches. Its batching logic is independent from that of the page
> reclaim. For each of the folios it writes back, the page writeback
> calls folio_rotate_reclaimable() which tries to rotate a folio to the
> tail.
> 
> folio_rotate_reclaimable() only works for a folio after the page
> reclaim has put it back. If an async swap device is fast enough, the
> page writeback can finish with that folio while the page reclaim is
> still working on the rest of the batch containing it. In this case,
> that folio will remain at the head and the page reclaim will not retry
> it before reaching there.
> 
> This patch adds a retry to evict_folios(). After evict_folios() has
> finished an entire batch and before it puts back folios it cannot free
> immediately, it retries those that may have missed the rotation.

Can we make something like this?

    shrink_folio_list(struct list_head *folio_list, struct list_head *folio_wb_list, )
        pageout
            goto keep
        ..
        ..

        keep:
            if (folio_test_writeback(folio) &&
                    folio_test_reclaim(folio))
                list_add(&folio->lru, &ret_writeback_folio);
    
    move_folios_to_lru(&folio_list, &folio_wb_list);
        struct folio *wb_folio = lru_to_folio(folio_wb_list);

        /*
         * If writeback is already done, move the page into tail.
         * Otherwise, put the page into head and folio_rotate_reclaimable
         * will move it to the tail when the writeback is done
         */
        if (!folio_test_writeback(wb_folio) &&
                    folio_test_reclaim(wb_folio))
            lruvec_add_folio_tail(lruvec, wb_folio);
Yu Zhao Nov. 17, 2022, 10:22 p.m. UTC | #6
On Thu, Nov 17, 2022 at 12:47 AM Minchan Kim <minchan@kernel.org> wrote:
>
> On Tue, Nov 15, 2022 at 06:38:07PM -0700, Yu Zhao wrote:
> > The page reclaim isolates a batch of folios from the tail of one of
> > the LRU lists and works on those folios one by one. For a suitable
> > swap-backed folio, if the swap device is async, it queues that folio
> > for writeback. After the page reclaim finishes an entire batch, it
> > puts back the folios it queued for writeback to the head of the
> > original LRU list.
> >
> > In the meantime, the page writeback flushes the queued folios also by
> > batches. Its batching logic is independent from that of the page
> > reclaim. For each of the folios it writes back, the page writeback
> > calls folio_rotate_reclaimable() which tries to rotate a folio to the
> > tail.
> >
> > folio_rotate_reclaimable() only works for a folio after the page
> > reclaim has put it back. If an async swap device is fast enough, the
> > page writeback can finish with that folio while the page reclaim is
> > still working on the rest of the batch containing it. In this case,
> > that folio will remain at the head and the page reclaim will not retry
> > it before reaching there.
> >
> > This patch adds a retry to evict_folios(). After evict_folios() has
> > finished an entire batch and before it puts back folios it cannot free
> > immediately, it retries those that may have missed the rotation.
>
> Can we make something like this?

This works for both the active/inactive LRU and MGLRU.

But it's not my preferred way because of these two subtle differences:
1. Folios eligible for retry take an unnecessary round trip below --
they are first added to the LRU list and then removed from there for
retry. For high speed swap devices, the LRU lock contention is already
quite high (>10% in CPU profile under heavy memory pressure). So I'm
hoping we can avoid this round trip.
2. The number of retries of a folio on folio_wb_list is unlimited,
whereas this patch limits the retry to one. So in theory, we can spin
on a bunch of folios that keep failing.

The most ideal solution would be to have the one-off retry logic in
shrink_folio_list(). But right now, that function is very cluttered. I
plan to refactor it (low priority at the moment), and probably after
that, we can add a generic retry for both the active/inactive LRU and
MGLRU. I'll raise its priority if you strongly prefer this. Please
feel free to let me know.

Thanks.


>     shrink_folio_list(struct list_head *folio_list, struct list_head *folio_wb_list, )
>         pageout
>             goto keep
>         ..
>         ..
>
>         keep:
>             if (folio_test_writeback(folio) &&
>                     folio_test_reclaim(folio))
>                 list_add(&folio->lru, &ret_writeback_folio);
>
>     move_folios_to_lru(&folio_list, &folio_wb_list);
>         struct folio *wb_folio = lru_to_folio(folio_wb_list);
>
>         /*
>          * If writeback is already done, move the page into tail.
>          * Otherwise, put the page into head and folio_rotate_reclaimable
>          * will move it to the tail when the writeback is done
>          */
>         if (!folio_test_writeback(wb_folio) &&
>                     folio_test_reclaim(wb_folio))
>             lruvec_add_folio_tail(lruvec, wb_folio);
Minchan Kim Nov. 18, 2022, 1:26 a.m. UTC | #7
On Thu, Nov 17, 2022 at 03:22:42PM -0700, Yu Zhao wrote:
> On Thu, Nov 17, 2022 at 12:47 AM Minchan Kim <minchan@kernel.org> wrote:
> >
> > On Tue, Nov 15, 2022 at 06:38:07PM -0700, Yu Zhao wrote:
> > > The page reclaim isolates a batch of folios from the tail of one of
> > > the LRU lists and works on those folios one by one. For a suitable
> > > swap-backed folio, if the swap device is async, it queues that folio
> > > for writeback. After the page reclaim finishes an entire batch, it
> > > puts back the folios it queued for writeback to the head of the
> > > original LRU list.
> > >
> > > In the meantime, the page writeback flushes the queued folios also by
> > > batches. Its batching logic is independent from that of the page
> > > reclaim. For each of the folios it writes back, the page writeback
> > > calls folio_rotate_reclaimable() which tries to rotate a folio to the
> > > tail.
> > >
> > > folio_rotate_reclaimable() only works for a folio after the page
> > > reclaim has put it back. If an async swap device is fast enough, the
> > > page writeback can finish with that folio while the page reclaim is
> > > still working on the rest of the batch containing it. In this case,
> > > that folio will remain at the head and the page reclaim will not retry
> > > it before reaching there.
> > >
> > > This patch adds a retry to evict_folios(). After evict_folios() has
> > > finished an entire batch and before it puts back folios it cannot free
> > > immediately, it retries those that may have missed the rotation.
> >
> > Can we make something like this?
> 
> This works for both the active/inactive LRU and MGLRU.

I hope we can fix both together.

> 
> But it's not my prefered way because of these two subtle differences:
> 1. Folios eligible for retry take an unnecessary round trip below --
> they are first added to the LRU list and then removed from there for
> retry. For high speed swap devices, the LRU lock contention is already
> quite high (>10% in CPU profile under heavy memory pressure). So I'm
> hoping we can avoid this round trip.
> 2. The number of retries of a folio on folio_wb_list is unlimited,
> whereas this patch limits the retry to one. So in theory, we can spin
> on a bunch of folios that keep failing.
> 
> The most ideal solution would be to have the one-off retry logic in
> shrink_folio_list(). But right now, that function is very cluttered. I
> plan to refactor it (low priority at the moment), and probably after
> that, we can add a generic retry for both the active/inactive LRU and
> MGLRU. I'll raise its priority if you strongly prefer this. Please
> feel free to let me know.

Well, my preference for an *ideal solution* is for writeback completion to
drop the page immediately, without rotating it on the LRU. IIRC, the
concerns when I tried that were softirq latency and the locking needed in
that context.
Yu Zhao Nov. 18, 2022, 1:40 a.m. UTC | #8
On Thu, Nov 17, 2022 at 6:26 PM Minchan Kim <minchan@kernel.org> wrote:
>
> On Thu, Nov 17, 2022 at 03:22:42PM -0700, Yu Zhao wrote:
> > On Thu, Nov 17, 2022 at 12:47 AM Minchan Kim <minchan@kernel.org> wrote:
> > >
> > > On Tue, Nov 15, 2022 at 06:38:07PM -0700, Yu Zhao wrote:
> > > > The page reclaim isolates a batch of folios from the tail of one of
> > > > the LRU lists and works on those folios one by one. For a suitable
> > > > swap-backed folio, if the swap device is async, it queues that folio
> > > > for writeback. After the page reclaim finishes an entire batch, it
> > > > puts back the folios it queued for writeback to the head of the
> > > > original LRU list.
> > > >
> > > > In the meantime, the page writeback flushes the queued folios also by
> > > > batches. Its batching logic is independent from that of the page
> > > > reclaim. For each of the folios it writes back, the page writeback
> > > > calls folio_rotate_reclaimable() which tries to rotate a folio to the
> > > > tail.
> > > >
> > > > folio_rotate_reclaimable() only works for a folio after the page
> > > > reclaim has put it back. If an async swap device is fast enough, the
> > > > page writeback can finish with that folio while the page reclaim is
> > > > still working on the rest of the batch containing it. In this case,
> > > > that folio will remain at the head and the page reclaim will not retry
> > > > it before reaching there.
> > > >
> > > > This patch adds a retry to evict_folios(). After evict_folios() has
> > > > finished an entire batch and before it puts back folios it cannot free
> > > > immediately, it retries those that may have missed the rotation.
> > >
> > > Can we make something like this?
> >
> > This works for both the active/inactive LRU and MGLRU.
>
> I hope we fix both altogether.
>
> >
> > But it's not my prefered way because of these two subtle differences:
> > 1. Folios eligible for retry take an unnecessary round trip below --
> > they are first added to the LRU list and then removed from there for
> > retry. For high speed swap devices, the LRU lock contention is already
> > quite high (>10% in CPU profile under heavy memory pressure). So I'm
> > hoping we can avoid this round trip.
> > 2. The number of retries of a folio on folio_wb_list is unlimited,
> > whereas this patch limits the retry to one. So in theory, we can spin
> > on a bunch of folios that keep failing.
> >
> > The most ideal solution would be to have the one-off retry logic in
> > shrink_folio_list(). But right now, that function is very cluttered. I
> > plan to refactor it (low priority at the moment), and probably after
> > that, we can add a generic retry for both the active/inactive LRU and
> > MGLRU. I'll raise its priority if you strongly prefer this. Please
> > feel free to let me know.
>
> Well, my preference for *ideal solution* is writeback completion drops
> page immediately without LRU rotating. IIRC, concern was softirq latency
> and locking relevant in the context at that time when I tried it.

Are we good for now or are there other ideas we want to try while we are at it?
Minchan Kim Nov. 18, 2022, 9:25 p.m. UTC | #9
On Thu, Nov 17, 2022 at 06:40:05PM -0700, Yu Zhao wrote:
> On Thu, Nov 17, 2022 at 6:26 PM Minchan Kim <minchan@kernel.org> wrote:
> >
> > On Thu, Nov 17, 2022 at 03:22:42PM -0700, Yu Zhao wrote:
> > > On Thu, Nov 17, 2022 at 12:47 AM Minchan Kim <minchan@kernel.org> wrote:
> > > >
> > > > On Tue, Nov 15, 2022 at 06:38:07PM -0700, Yu Zhao wrote:
> > > > > The page reclaim isolates a batch of folios from the tail of one of
> > > > > the LRU lists and works on those folios one by one. For a suitable
> > > > > swap-backed folio, if the swap device is async, it queues that folio
> > > > > for writeback. After the page reclaim finishes an entire batch, it
> > > > > puts back the folios it queued for writeback to the head of the
> > > > > original LRU list.
> > > > >
> > > > > In the meantime, the page writeback flushes the queued folios also by
> > > > > batches. Its batching logic is independent from that of the page
> > > > > reclaim. For each of the folios it writes back, the page writeback
> > > > > calls folio_rotate_reclaimable() which tries to rotate a folio to the
> > > > > tail.
> > > > >
> > > > > folio_rotate_reclaimable() only works for a folio after the page
> > > > > reclaim has put it back. If an async swap device is fast enough, the
> > > > > page writeback can finish with that folio while the page reclaim is
> > > > > still working on the rest of the batch containing it. In this case,
> > > > > that folio will remain at the head and the page reclaim will not retry
> > > > > it before reaching there.
> > > > >
> > > > > This patch adds a retry to evict_folios(). After evict_folios() has
> > > > > finished an entire batch and before it puts back folios it cannot free
> > > > > immediately, it retries those that may have missed the rotation.
> > > >
> > > > Can we make something like this?
> > >
> > > This works for both the active/inactive LRU and MGLRU.
> >
> > I hope we fix both altogether.
> >
> > >
> > > But it's not my prefered way because of these two subtle differences:
> > > 1. Folios eligible for retry take an unnecessary round trip below --
> > > they are first added to the LRU list and then removed from there for
> > > retry. For high speed swap devices, the LRU lock contention is already
> > > quite high (>10% in CPU profile under heavy memory pressure). So I'm
> > > hoping we can avoid this round trip.
> > > 2. The number of retries of a folio on folio_wb_list is unlimited,
> > > whereas this patch limits the retry to one. So in theory, we can spin
> > > on a bunch of folios that keep failing.
> > >
> > > The most ideal solution would be to have the one-off retry logic in
> > > shrink_folio_list(). But right now, that function is very cluttered. I
> > > plan to refactor it (low priority at the moment), and probably after
> > > that, we can add a generic retry for both the active/inactive LRU and
> > > MGLRU. I'll raise its priority if you strongly prefer this. Please
> > > feel free to let me know.
> >
> > Well, my preference for *ideal solution* is writeback completion drops
> > page immediately without LRU rotating. IIRC, concern was softirq latency
> > and locking relevant in the context at that time when I tried it.
> 
> Are we good for now or are there other ideas we want to try while we are at it?
> 

Good for now with which solution in mind? The retry logic you
suggested? I personally don't like a solution that relies on timing.

If you are concerned about the unnecessary round trip, it shouldn't
happen frequently, since your assumption is that the swap device is
fast enough that the second loop would see their writeback done?

Anyway, I am strongly push my preference. Feel free to go with way
you want if the solution can fix both LRU schemes.
Yu Zhao Nov. 18, 2022, 9:51 p.m. UTC | #10
On Fri, Nov 18, 2022 at 2:25 PM Minchan Kim <minchan@kernel.org> wrote:
>
> On Thu, Nov 17, 2022 at 06:40:05PM -0700, Yu Zhao wrote:
> > On Thu, Nov 17, 2022 at 6:26 PM Minchan Kim <minchan@kernel.org> wrote:
> > >
> > > On Thu, Nov 17, 2022 at 03:22:42PM -0700, Yu Zhao wrote:
> > > > On Thu, Nov 17, 2022 at 12:47 AM Minchan Kim <minchan@kernel.org> wrote:
> > > > >
> > > > > On Tue, Nov 15, 2022 at 06:38:07PM -0700, Yu Zhao wrote:
> > > > > > The page reclaim isolates a batch of folios from the tail of one of
> > > > > > the LRU lists and works on those folios one by one. For a suitable
> > > > > > swap-backed folio, if the swap device is async, it queues that folio
> > > > > > for writeback. After the page reclaim finishes an entire batch, it
> > > > > > puts back the folios it queued for writeback to the head of the
> > > > > > original LRU list.
> > > > > >
> > > > > > In the meantime, the page writeback flushes the queued folios also by
> > > > > > batches. Its batching logic is independent from that of the page
> > > > > > reclaim. For each of the folios it writes back, the page writeback
> > > > > > calls folio_rotate_reclaimable() which tries to rotate a folio to the
> > > > > > tail.
> > > > > >
> > > > > > folio_rotate_reclaimable() only works for a folio after the page
> > > > > > reclaim has put it back. If an async swap device is fast enough, the
> > > > > > page writeback can finish with that folio while the page reclaim is
> > > > > > still working on the rest of the batch containing it. In this case,
> > > > > > that folio will remain at the head and the page reclaim will not retry
> > > > > > it before reaching there.
> > > > > >
> > > > > > This patch adds a retry to evict_folios(). After evict_folios() has
> > > > > > finished an entire batch and before it puts back folios it cannot free
> > > > > > immediately, it retries those that may have missed the rotation.
> > > > >
> > > > > Can we make something like this?
> > > >
> > > > This works for both the active/inactive LRU and MGLRU.
> > >
> > > I hope we fix both altogether.
> > >
> > > >
> > > > But it's not my prefered way because of these two subtle differences:
> > > > 1. Folios eligible for retry take an unnecessary round trip below --
> > > > they are first added to the LRU list and then removed from there for
> > > > retry. For high speed swap devices, the LRU lock contention is already
> > > > quite high (>10% in CPU profile under heavy memory pressure). So I'm
> > > > hoping we can avoid this round trip.
> > > > 2. The number of retries of a folio on folio_wb_list is unlimited,
> > > > whereas this patch limits the retry to one. So in theory, we can spin
> > > > on a bunch of folios that keep failing.
> > > >
> > > > The most ideal solution would be to have the one-off retry logic in
> > > > shrink_folio_list(). But right now, that function is very cluttered. I
> > > > plan to refactor it (low priority at the moment), and probably after
> > > > that, we can add a generic retry for both the active/inactive LRU and
> > > > MGLRU. I'll raise its priority if you strongly prefer this. Please
> > > > feel free to let me know.
> > >
> > > Well, my preference for *ideal solution* is writeback completion drops
> > > page immediately without LRU rotating. IIRC, concern was softirq latency
> > > and locking relevant in the context at that time when I tried it.
> >
> > Are we good for now or are there other ideas we want to try while we are at it?
> >
>
> good for now with what solution you are thinking? The retry logic you
> suggested? I personally don't like the solution relies on the timing.
>
> If you are concerning about unnecessary round trip, it shouldn't
> happen frequency since your assumption is swap device is so fast
> so second loop would see their wb done?

No, the round trip that hits the LRU lock in the process.

For folios already written back and ready to be freed, they'd have to go
from being isolated to the tail of the LRU list and then to being isolated
again. This requires an extra hit on the LRU lock, which is highly
contended with fast swap devices under heavy memory pressure.

> Anyway, I am strongly push my preference. Feel free to go with way
> you want if the solution can fix both LRU schemes.

There is another concern I listed previously:

> > > > 2. The number of retries of a folio on folio_wb_list is unlimited,
> > > > whereas this patch limits the retry to one. So in theory, we can spin
> > > > on a bunch of folios that keep failing.

If this can happen, it'd be really hard to track it down. Any thoughts on this?

I share your desire to fix both. But I don't think we can just dismiss
the two points I listed above. They are reasonable, aren't they?
Minchan Kim Nov. 18, 2022, 10:33 p.m. UTC | #11
On Fri, Nov 18, 2022 at 02:51:01PM -0700, Yu Zhao wrote:
> On Fri, Nov 18, 2022 at 2:25 PM Minchan Kim <minchan@kernel.org> wrote:
> >
> > On Thu, Nov 17, 2022 at 06:40:05PM -0700, Yu Zhao wrote:
> > > On Thu, Nov 17, 2022 at 6:26 PM Minchan Kim <minchan@kernel.org> wrote:
> > > >
> > > > On Thu, Nov 17, 2022 at 03:22:42PM -0700, Yu Zhao wrote:
> > > > > On Thu, Nov 17, 2022 at 12:47 AM Minchan Kim <minchan@kernel.org> wrote:
> > > > > >
> > > > > > On Tue, Nov 15, 2022 at 06:38:07PM -0700, Yu Zhao wrote:
> > > > > > > The page reclaim isolates a batch of folios from the tail of one of
> > > > > > > the LRU lists and works on those folios one by one. For a suitable
> > > > > > > swap-backed folio, if the swap device is async, it queues that folio
> > > > > > > for writeback. After the page reclaim finishes an entire batch, it
> > > > > > > puts back the folios it queued for writeback to the head of the
> > > > > > > original LRU list.
> > > > > > >
> > > > > > > In the meantime, the page writeback flushes the queued folios also by
> > > > > > > batches. Its batching logic is independent from that of the page
> > > > > > > reclaim. For each of the folios it writes back, the page writeback
> > > > > > > calls folio_rotate_reclaimable() which tries to rotate a folio to the
> > > > > > > tail.
> > > > > > >
> > > > > > > folio_rotate_reclaimable() only works for a folio after the page
> > > > > > > reclaim has put it back. If an async swap device is fast enough, the
> > > > > > > page writeback can finish with that folio while the page reclaim is
> > > > > > > still working on the rest of the batch containing it. In this case,
> > > > > > > that folio will remain at the head and the page reclaim will not retry
> > > > > > > it before reaching there.
> > > > > > >
> > > > > > > This patch adds a retry to evict_folios(). After evict_folios() has
> > > > > > > finished an entire batch and before it puts back folios it cannot free
> > > > > > > immediately, it retries those that may have missed the rotation.
> > > > > >
> > > > > > Can we make something like this?
> > > > >
> > > > > This works for both the active/inactive LRU and MGLRU.
> > > >
> > > > I hope we fix both altogether.
> > > >
> > > > >
> > > > > But it's not my prefered way because of these two subtle differences:
> > > > > 1. Folios eligible for retry take an unnecessary round trip below --
> > > > > they are first added to the LRU list and then removed from there for
> > > > > retry. For high speed swap devices, the LRU lock contention is already
> > > > > quite high (>10% in CPU profile under heavy memory pressure). So I'm
> > > > > hoping we can avoid this round trip.
> > > > > 2. The number of retries of a folio on folio_wb_list is unlimited,
> > > > > whereas this patch limits the retry to one. So in theory, we can spin
> > > > > on a bunch of folios that keep failing.
> > > > >
> > > > > The most ideal solution would be to have the one-off retry logic in
> > > > > shrink_folio_list(). But right now, that function is very cluttered. I
> > > > > plan to refactor it (low priority at the moment), and probably after
> > > > > that, we can add a generic retry for both the active/inactive LRU and
> > > > > MGLRU. I'll raise its priority if you strongly prefer this. Please
> > > > > feel free to let me know.
> > > >
> > > > Well, my preference for *ideal solution* is writeback completion drops
> > > > page immediately without LRU rotating. IIRC, concern was softirq latency
> > > > and locking relevant in the context at that time when I tried it.
> > >
> > > Are we good for now or are there other ideas we want to try while we are at it?
> > >
> >
> > good for now with what solution you are thinking? The retry logic you
> > suggested? I personally don't like the solution relies on the timing.
> >
> > If you are concerning about unnecessary round trip, it shouldn't
> > happen frequency since your assumption is swap device is so fast
> > so second loop would see their wb done?
> 
> No, the round trip that hits the LRU lock in the process.

I see what you meant.

> 
> For folios written and ready to be freed, they'll have to go from
> being isolated to the tail of LRU list and then to getting isolated
> again. This requires an extra hit on the LRU lock, which is highly
> contended for fast swap devices under heavy memory pressure.
> 
> > Anyway, I am strongly push my preference. Feel free to go with way

Oh, sorry for the typo: "not strongly push my preference"

> > you want if the solution can fix both LRU schemes.
> 
> There is another concern I listed previously:
> 
> > > > > 2. The number of retries of a folio on folio_wb_list is unlimited,
> > > > > whereas this patch limits the retry to one. So in theory, we can spin
> > > > > on a bunch of folios that keep failing.
> 
> If this can happen, it'd be really hard to track it down. Any thoughts on this?

Could you elaborate why folio_wb_list can keep spinning?

My concern is how we can make sure the timing bet holds for most
workloads on heterogeneous cores with DVFS frequency control.

> 
> I share your desire to fix both. But I don't think we can just dismiss
> the two points I listed above. They are reasonable, aren't they?
>
Yu Zhao Nov. 18, 2022, 11:21 p.m. UTC | #12
On Fri, Nov 18, 2022 at 3:33 PM Minchan Kim <minchan@kernel.org> wrote:
>
> On Fri, Nov 18, 2022 at 02:51:01PM -0700, Yu Zhao wrote:
> > On Fri, Nov 18, 2022 at 2:25 PM Minchan Kim <minchan@kernel.org> wrote:
> > >
> > > On Thu, Nov 17, 2022 at 06:40:05PM -0700, Yu Zhao wrote:
> > > > On Thu, Nov 17, 2022 at 6:26 PM Minchan Kim <minchan@kernel.org> wrote:
> > > > >
> > > > > On Thu, Nov 17, 2022 at 03:22:42PM -0700, Yu Zhao wrote:
> > > > > > On Thu, Nov 17, 2022 at 12:47 AM Minchan Kim <minchan@kernel.org> wrote:
> > > > > > >
> > > > > > > On Tue, Nov 15, 2022 at 06:38:07PM -0700, Yu Zhao wrote:
> > > > > > > > The page reclaim isolates a batch of folios from the tail of one of
> > > > > > > > the LRU lists and works on those folios one by one. For a suitable
> > > > > > > > swap-backed folio, if the swap device is async, it queues that folio
> > > > > > > > for writeback. After the page reclaim finishes an entire batch, it
> > > > > > > > puts back the folios it queued for writeback to the head of the
> > > > > > > > original LRU list.
> > > > > > > >
> > > > > > > > In the meantime, the page writeback flushes the queued folios also by
> > > > > > > > batches. Its batching logic is independent from that of the page
> > > > > > > > reclaim. For each of the folios it writes back, the page writeback
> > > > > > > > calls folio_rotate_reclaimable() which tries to rotate a folio to the
> > > > > > > > tail.
> > > > > > > >
> > > > > > > > folio_rotate_reclaimable() only works for a folio after the page
> > > > > > > > reclaim has put it back. If an async swap device is fast enough, the
> > > > > > > > page writeback can finish with that folio while the page reclaim is
> > > > > > > > still working on the rest of the batch containing it. In this case,
> > > > > > > > that folio will remain at the head and the page reclaim will not retry
> > > > > > > > it before reaching there.
> > > > > > > >
> > > > > > > > This patch adds a retry to evict_folios(). After evict_folios() has
> > > > > > > > finished an entire batch and before it puts back folios it cannot free
> > > > > > > > immediately, it retries those that may have missed the rotation.
> > > > > > >
> > > > > > > Can we make something like this?
> > > > > >
> > > > > > This works for both the active/inactive LRU and MGLRU.
> > > > >
> > > > > I hope we fix both altogether.
> > > > >
> > > > > >
> > > > > > But it's not my prefered way because of these two subtle differences:
> > > > > > 1. Folios eligible for retry take an unnecessary round trip below --
> > > > > > they are first added to the LRU list and then removed from there for
> > > > > > retry. For high speed swap devices, the LRU lock contention is already
> > > > > > quite high (>10% in CPU profile under heavy memory pressure). So I'm
> > > > > > hoping we can avoid this round trip.
> > > > > > 2. The number of retries of a folio on folio_wb_list is unlimited,
> > > > > > whereas this patch limits the retry to one. So in theory, we can spin
> > > > > > on a bunch of folios that keep failing.
> > > > > >
> > > > > > The most ideal solution would be to have the one-off retry logic in
> > > > > > shrink_folio_list(). But right now, that function is very cluttered. I
> > > > > > plan to refactor it (low priority at the moment), and probably after
> > > > > > that, we can add a generic retry for both the active/inactive LRU and
> > > > > > MGLRU. I'll raise its priority if you strongly prefer this. Please
> > > > > > feel free to let me know.
> > > > >
> > > > > Well, my preference for *ideal solution* is writeback completion drops
> > > > > page immediately without LRU rotating. IIRC, concern was softirq latency
> > > > > and locking relevant in the context at that time when I tried it.
> > > >
> > > > Are we good for now or are there other ideas we want to try while we are at it?
> > > >
> > >
> > > good for now with what solution you are thinking? The retry logic you
> > > suggested? I personally don't like the solution relies on the timing.
> > >
> > > If you are concerning about unnecessary round trip, it shouldn't
> > > happen frequency since your assumption is swap device is so fast
> > > so second loop would see their wb done?
> >
> > No, the round trip that hits the LRU lock in the process.
>
> I see what you meant.
>
> >
> > For folios written and ready to be freed, they'll have to go from
> > being isolated to the tail of LRU list and then to getting isolated
> > again. This requires an extra hit on the LRU lock, which is highly
> > contended for fast swap devices under heavy memory pressure.
> >
> > > Anyway, I am strongly push my preference. Feel free to go with way
>
> Oh, sorry for the typo: "not strongly push my preference"
>
> > > you want if the solution can fix both LRU schemes.
> >
> > There is another concern I listed previously:
> >
> > > > > > 2. The number of retries of a folio on folio_wb_list is unlimited,
> > > > > > whereas this patch limits the retry to one. So in theory, we can spin
> > > > > > on a bunch of folios that keep failing.
> >
> > If this can happen, it'd be really hard to track it down. Any thoughts on this?
>
> Could you elaborate why folio_wb_list can keep spinning?

Currently the page reclaim never puts a folio that it failed to evict back
at the tail, because that folio can fail a second time, a third time, etc.

If we want to add a retry logic, we need to make sure it's a limited
retry for each failed folio. Once its limit is reached, we'll have to
put that folio back to the head so that we won't risk spinning on it
indefinitely.

The snippet below introduces an *unlimited* retry, and to make it work,
we would need to exclude all conditions that could cause wb_folio to
fail when it goes through shrink_folio_list() again. Otherwise, it can
repeat the same process and deadlock. It's difficult to enumerate
those conditions exhaustively, because they depend on other layers of
the kernel.

        /*
         * If writeback is already done, move the page into tail.
         * Otherwise, put the page into head and folio_rotate_reclaimable
         * will move it to the tail when the writeback is done
         */
        if (!folio_test_writeback(wb_folio) &&
                    folio_test_reclaim(wb_folio))
            lruvec_add_folio_tail(lruvec, wb_folio);

> My concern is how we can make sure the timing bet is good for most
> workloads on heterogeneous/dvfs frequency core control env.

I agree. It's not an easy problem to solve.

Patch

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 04d8b88e5216..dc6ebafa0a37 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -4971,10 +4971,13 @@  static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
 	int scanned;
 	int reclaimed;
 	LIST_HEAD(list);
+	LIST_HEAD(clean);
 	struct folio *folio;
+	struct folio *next;
 	enum vm_event_item item;
 	struct reclaim_stat stat;
 	struct lru_gen_mm_walk *walk;
+	bool skip_retry = false;
 	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
 	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
 
@@ -4991,20 +4994,37 @@  static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
 
 	if (list_empty(&list))
 		return scanned;
-
+retry:
 	reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false);
+	sc->nr_reclaimed += reclaimed;
 
-	list_for_each_entry(folio, &list, lru) {
-		/* restore LRU_REFS_FLAGS cleared by isolate_folio() */
-		if (folio_test_workingset(folio))
-			folio_set_referenced(folio);
+	list_for_each_entry_safe_reverse(folio, next, &list, lru) {
+		if (!folio_evictable(folio)) {
+			list_del(&folio->lru);
+			folio_putback_lru(folio);
+			continue;
+		}
 
-		/* don't add rejected pages to the oldest generation */
 		if (folio_test_reclaim(folio) &&
-		    (folio_test_dirty(folio) || folio_test_writeback(folio)))
-			folio_clear_active(folio);
-		else
-			folio_set_active(folio);
+		    (folio_test_dirty(folio) || folio_test_writeback(folio))) {
+			/* restore LRU_REFS_FLAGS cleared by isolate_folio() */
+			if (folio_test_workingset(folio))
+				folio_set_referenced(folio);
+			continue;
+		}
+
+		if (skip_retry || folio_test_active(folio) || folio_test_referenced(folio) ||
+		    folio_mapped(folio) || folio_test_locked(folio) ||
+		    folio_test_dirty(folio) || folio_test_writeback(folio)) {
+			/* don't add rejected folios to the oldest generation */
+			set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS,
+				      BIT(PG_active));
+			continue;
+		}
+
+		/* retry folios that may have missed folio_rotate_reclaimable() */
+		list_move(&folio->lru, &clean);
+		sc->nr_scanned -= folio_nr_pages(folio);
 	}
 
 	spin_lock_irq(&lruvec->lru_lock);
@@ -5026,7 +5046,13 @@  static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
 	mem_cgroup_uncharge_list(&list);
 	free_unref_page_list(&list);
 
-	sc->nr_reclaimed += reclaimed;
+	INIT_LIST_HEAD(&list);
+	list_splice_init(&clean, &list);
+
+	if (!list_empty(&list)) {
+		skip_retry = true;
+		goto retry;
+	}
 
 	if (need_swapping && type == LRU_GEN_ANON)
 		*need_swapping = true;