Message ID | 20241209083618.2889145-2-chenridong@huaweicloud.com (mailing list archive)
---|---
State | New
Series | mm: vmascan: retry folios written back while isolated for traditional LRU
On Mon, Dec 9, 2024 at 4:46 PM Chen Ridong <chenridong@huaweicloud.com> wrote: > > From: Chen Ridong <chenridong@huawei.com> > > The commit 359a5e1416ca ("mm: multi-gen LRU: retry folios written back > while isolated") only fixed the issue for mglru. However, this issue > also exists in the traditional active/inactive LRU. This issue will be > worse if THP is split, which makes the list longer and needs longer time > to finish a batch of folios reclaim. > > This issue should be fixed in the same way for the traditional LRU. > Therefore, the common logic was extracted to the 'find_folios_written_back' > function firstly, which is then reused in the 'shrink_inactive_list' > function. Finally, retry reclaiming those folios that may have missed the > rotation for traditional LRU. let's drop the cover-letter and refine the changelog. > > Signed-off-by: Chen Ridong <chenridong@huawei.com> > --- > include/linux/mmzone.h | 3 +- > mm/vmscan.c | 108 +++++++++++++++++++++++++++++------------ > 2 files changed, 77 insertions(+), 34 deletions(-) > > diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h > index b36124145a16..47c6e8c43dcd 100644 > --- a/include/linux/mmzone.h > +++ b/include/linux/mmzone.h > @@ -391,6 +391,7 @@ struct page_vma_mapped_walk; > > #define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF) > #define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF) > +#define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset)) > > #ifdef CONFIG_LRU_GEN > > @@ -406,8 +407,6 @@ enum { > NR_LRU_GEN_CAPS > }; > > -#define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset)) > - > #define MIN_LRU_BATCH BITS_PER_LONG > #define MAX_LRU_BATCH (MIN_LRU_BATCH * 64) > > diff --git a/mm/vmscan.c b/mm/vmscan.c > index 76378bc257e3..1f0d194f8b2f 100644 > --- a/mm/vmscan.c > +++ b/mm/vmscan.c > @@ -283,6 +283,48 @@ static void set_task_reclaim_state(struct task_struct *task, > task->reclaim_state = rs; > } > > +/** > + * find_folios_written_back - Find and move the written back folios to a new list. > + * @list: filios list > + * @clean: the written back folios list > + * @skip: whether skip to move the written back folios to clean list. > + */ > +static inline void find_folios_written_back(struct list_head *list, > + struct list_head *clean, bool skip) > +{ > + struct folio *folio; > + struct folio *next; > + > + list_for_each_entry_safe_reverse(folio, next, list, lru) { > + if (!folio_evictable(folio)) { > + list_del(&folio->lru); > + folio_putback_lru(folio); > + continue; > + } > + > + if (folio_test_reclaim(folio) && > + (folio_test_dirty(folio) || folio_test_writeback(folio))) { > + /* restore LRU_REFS_FLAGS cleared by isolate_folio() */ > + if (lru_gen_enabled() && folio_test_workingset(folio)) > + folio_set_referenced(folio); > + continue; > + } > + > + if (skip || folio_test_active(folio) || folio_test_referenced(folio) || > + folio_mapped(folio) || folio_test_locked(folio) || > + folio_test_dirty(folio) || folio_test_writeback(folio)) { > + /* don't add rejected folios to the oldest generation */ > + if (lru_gen_enabled()) > + set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, > + BIT(PG_active)); > + continue; > + } > + > + /* retry folios that may have missed folio_rotate_reclaimable() */ > + list_move(&folio->lru, clean); > + } > +} > + > /* > * flush_reclaim_state(): add pages reclaimed outside of LRU-based reclaim to > * scan_control->nr_reclaimed. 
> @@ -1907,6 +1949,25 @@ static int current_may_throttle(void) > return !(current->flags & PF_LOCAL_THROTTLE); > } > > +static inline void acc_reclaimed_stat(struct reclaim_stat *stat, > + struct reclaim_stat *curr) > +{ > + int i; > + > + stat->nr_dirty += curr->nr_dirty; > + stat->nr_unqueued_dirty += curr->nr_unqueued_dirty; > + stat->nr_congested += curr->nr_congested; > + stat->nr_writeback += curr->nr_writeback; > + stat->nr_immediate += curr->nr_immediate; > + stat->nr_pageout += curr->nr_pageout; > + stat->nr_ref_keep += curr->nr_ref_keep; > + stat->nr_unmap_fail += curr->nr_unmap_fail; > + stat->nr_lazyfree_fail += curr->nr_lazyfree_fail; > + stat->nr_demoted += curr->nr_demoted; > + for (i = 0; i < ANON_AND_FILE; i++) > + stat->nr_activate[i] = curr->nr_activate[i]; > +} you had no this before, what's the purpose of this? > + > /* > * shrink_inactive_list() is a helper for shrink_node(). It returns the number > * of reclaimed pages > @@ -1916,14 +1977,16 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan, > enum lru_list lru) > { > LIST_HEAD(folio_list); > + LIST_HEAD(clean_list); > unsigned long nr_scanned; > unsigned int nr_reclaimed = 0; > unsigned long nr_taken; > - struct reclaim_stat stat; > + struct reclaim_stat stat, curr; > bool file = is_file_lru(lru); > enum vm_event_item item; > struct pglist_data *pgdat = lruvec_pgdat(lruvec); > bool stalled = false; > + bool skip_retry = false; > > while (unlikely(too_many_isolated(pgdat, file, sc))) { > if (stalled) > @@ -1957,10 +2020,20 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan, > if (nr_taken == 0) > return 0; > > - nr_reclaimed = shrink_folio_list(&folio_list, pgdat, sc, &stat, false); > + memset(&stat, 0, sizeof(stat)); > +retry: > + nr_reclaimed += shrink_folio_list(&folio_list, pgdat, sc, &curr, false); > + find_folios_written_back(&folio_list, &clean_list, skip_retry); > + acc_reclaimed_stat(&stat, &curr); > > spin_lock_irq(&lruvec->lru_lock); > move_folios_to_lru(lruvec, &folio_list); > + if (!list_empty(&clean_list)) { > + list_splice_init(&clean_list, &folio_list); > + skip_retry = true; > + spin_unlock_irq(&lruvec->lru_lock); > + goto retry; > + } > > __mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(), > stat.nr_demoted); > @@ -4567,8 +4640,6 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap > int reclaimed; > LIST_HEAD(list); > LIST_HEAD(clean); > - struct folio *folio; > - struct folio *next; > enum vm_event_item item; > struct reclaim_stat stat; > struct lru_gen_mm_walk *walk; > @@ -4597,34 +4668,7 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap > scanned, reclaimed, &stat, sc->priority, > type ? 
LRU_INACTIVE_FILE : LRU_INACTIVE_ANON); > > - list_for_each_entry_safe_reverse(folio, next, &list, lru) { > - if (!folio_evictable(folio)) { > - list_del(&folio->lru); > - folio_putback_lru(folio); > - continue; > - } > - > - if (folio_test_reclaim(folio) && > - (folio_test_dirty(folio) || folio_test_writeback(folio))) { > - /* restore LRU_REFS_FLAGS cleared by isolate_folio() */ > - if (folio_test_workingset(folio)) > - folio_set_referenced(folio); > - continue; > - } > - > - if (skip_retry || folio_test_active(folio) || folio_test_referenced(folio) || > - folio_mapped(folio) || folio_test_locked(folio) || > - folio_test_dirty(folio) || folio_test_writeback(folio)) { > - /* don't add rejected folios to the oldest generation */ > - set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, > - BIT(PG_active)); > - continue; > - } > - > - /* retry folios that may have missed folio_rotate_reclaimable() */ > - list_move(&folio->lru, &clean); > - } > - > + find_folios_written_back(&list, &clean, skip_retry); > spin_lock_irq(&lruvec->lru_lock); > > move_folios_to_lru(lruvec, &list); > -- > 2.34.1 > Thanks Barry >
On 2024/12/10 12:54, Barry Song wrote: > On Mon, Dec 9, 2024 at 4:46 PM Chen Ridong <chenridong@huaweicloud.com> wrote: >> >> From: Chen Ridong <chenridong@huawei.com> >> >> The commit 359a5e1416ca ("mm: multi-gen LRU: retry folios written back >> while isolated") only fixed the issue for mglru. However, this issue >> also exists in the traditional active/inactive LRU. This issue will be >> worse if THP is split, which makes the list longer and needs longer time >> to finish a batch of folios reclaim. >> >> This issue should be fixed in the same way for the traditional LRU. >> Therefore, the common logic was extracted to the 'find_folios_written_back' >> function firstly, which is then reused in the 'shrink_inactive_list' >> function. Finally, retry reclaiming those folios that may have missed the >> rotation for traditional LRU. > > let's drop the cover-letter and refine the changelog. > Will update. >> >> Signed-off-by: Chen Ridong <chenridong@huawei.com> >> --- >> include/linux/mmzone.h | 3 +- >> mm/vmscan.c | 108 +++++++++++++++++++++++++++++------------ >> 2 files changed, 77 insertions(+), 34 deletions(-) >> >> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h >> index b36124145a16..47c6e8c43dcd 100644 >> --- a/include/linux/mmzone.h >> +++ b/include/linux/mmzone.h >> @@ -391,6 +391,7 @@ struct page_vma_mapped_walk; >> >> #define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF) >> #define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF) >> +#define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset)) >> >> #ifdef CONFIG_LRU_GEN >> >> @@ -406,8 +407,6 @@ enum { >> NR_LRU_GEN_CAPS >> }; >> >> -#define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset)) >> - >> #define MIN_LRU_BATCH BITS_PER_LONG >> #define MAX_LRU_BATCH (MIN_LRU_BATCH * 64) >> >> diff --git a/mm/vmscan.c b/mm/vmscan.c >> index 76378bc257e3..1f0d194f8b2f 100644 >> --- a/mm/vmscan.c >> +++ b/mm/vmscan.c >> @@ -283,6 +283,48 @@ static void set_task_reclaim_state(struct task_struct *task, >> task->reclaim_state = rs; >> } >> >> +/** >> + * find_folios_written_back - Find and move the written back folios to a new list. >> + * @list: filios list >> + * @clean: the written back folios list >> + * @skip: whether skip to move the written back folios to clean list. 
>> + */ >> +static inline void find_folios_written_back(struct list_head *list, >> + struct list_head *clean, bool skip) >> +{ >> + struct folio *folio; >> + struct folio *next; >> + >> + list_for_each_entry_safe_reverse(folio, next, list, lru) { >> + if (!folio_evictable(folio)) { >> + list_del(&folio->lru); >> + folio_putback_lru(folio); >> + continue; >> + } >> + >> + if (folio_test_reclaim(folio) && >> + (folio_test_dirty(folio) || folio_test_writeback(folio))) { >> + /* restore LRU_REFS_FLAGS cleared by isolate_folio() */ >> + if (lru_gen_enabled() && folio_test_workingset(folio)) >> + folio_set_referenced(folio); >> + continue; >> + } >> + >> + if (skip || folio_test_active(folio) || folio_test_referenced(folio) || >> + folio_mapped(folio) || folio_test_locked(folio) || >> + folio_test_dirty(folio) || folio_test_writeback(folio)) { >> + /* don't add rejected folios to the oldest generation */ >> + if (lru_gen_enabled()) >> + set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, >> + BIT(PG_active)); >> + continue; >> + } >> + >> + /* retry folios that may have missed folio_rotate_reclaimable() */ >> + list_move(&folio->lru, clean); >> + } >> +} >> + >> /* >> * flush_reclaim_state(): add pages reclaimed outside of LRU-based reclaim to >> * scan_control->nr_reclaimed. >> @@ -1907,6 +1949,25 @@ static int current_may_throttle(void) >> return !(current->flags & PF_LOCAL_THROTTLE); >> } >> >> +static inline void acc_reclaimed_stat(struct reclaim_stat *stat, >> + struct reclaim_stat *curr) >> +{ >> + int i; >> + >> + stat->nr_dirty += curr->nr_dirty; >> + stat->nr_unqueued_dirty += curr->nr_unqueued_dirty; >> + stat->nr_congested += curr->nr_congested; >> + stat->nr_writeback += curr->nr_writeback; >> + stat->nr_immediate += curr->nr_immediate; >> + stat->nr_pageout += curr->nr_pageout; >> + stat->nr_ref_keep += curr->nr_ref_keep; >> + stat->nr_unmap_fail += curr->nr_unmap_fail; >> + stat->nr_lazyfree_fail += curr->nr_lazyfree_fail; >> + stat->nr_demoted += curr->nr_demoted; >> + for (i = 0; i < ANON_AND_FILE; i++) >> + stat->nr_activate[i] = curr->nr_activate[i]; >> +} > > you had no this before, what's the purpose of this? > We may call shrink_folio_list twice, and the 'stat curr' will reset in the shrink_folio_list function. We should accumulate the stats as a whole, which will then be used to calculate the cost and return it to the caller. Thanks, Ridong >> + >> /* >> * shrink_inactive_list() is a helper for shrink_node(). 
It returns the number >> * of reclaimed pages >> @@ -1916,14 +1977,16 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan, >> enum lru_list lru) >> { >> LIST_HEAD(folio_list); >> + LIST_HEAD(clean_list); >> unsigned long nr_scanned; >> unsigned int nr_reclaimed = 0; >> unsigned long nr_taken; >> - struct reclaim_stat stat; >> + struct reclaim_stat stat, curr; >> bool file = is_file_lru(lru); >> enum vm_event_item item; >> struct pglist_data *pgdat = lruvec_pgdat(lruvec); >> bool stalled = false; >> + bool skip_retry = false; >> >> while (unlikely(too_many_isolated(pgdat, file, sc))) { >> if (stalled) >> @@ -1957,10 +2020,20 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan, >> if (nr_taken == 0) >> return 0; >> >> - nr_reclaimed = shrink_folio_list(&folio_list, pgdat, sc, &stat, false); >> + memset(&stat, 0, sizeof(stat)); >> +retry: >> + nr_reclaimed += shrink_folio_list(&folio_list, pgdat, sc, &curr, false); >> + find_folios_written_back(&folio_list, &clean_list, skip_retry); >> + acc_reclaimed_stat(&stat, &curr); >> >> spin_lock_irq(&lruvec->lru_lock); >> move_folios_to_lru(lruvec, &folio_list); >> + if (!list_empty(&clean_list)) { >> + list_splice_init(&clean_list, &folio_list); >> + skip_retry = true; >> + spin_unlock_irq(&lruvec->lru_lock); >> + goto retry; >> + } >> >> __mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(), >> stat.nr_demoted); >> @@ -4567,8 +4640,6 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap >> int reclaimed; >> LIST_HEAD(list); >> LIST_HEAD(clean); >> - struct folio *folio; >> - struct folio *next; >> enum vm_event_item item; >> struct reclaim_stat stat; >> struct lru_gen_mm_walk *walk; >> @@ -4597,34 +4668,7 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap >> scanned, reclaimed, &stat, sc->priority, >> type ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON); >> >> - list_for_each_entry_safe_reverse(folio, next, &list, lru) { >> - if (!folio_evictable(folio)) { >> - list_del(&folio->lru); >> - folio_putback_lru(folio); >> - continue; >> - } >> - >> - if (folio_test_reclaim(folio) && >> - (folio_test_dirty(folio) || folio_test_writeback(folio))) { >> - /* restore LRU_REFS_FLAGS cleared by isolate_folio() */ >> - if (folio_test_workingset(folio)) >> - folio_set_referenced(folio); >> - continue; >> - } >> - >> - if (skip_retry || folio_test_active(folio) || folio_test_referenced(folio) || >> - folio_mapped(folio) || folio_test_locked(folio) || >> - folio_test_dirty(folio) || folio_test_writeback(folio)) { >> - /* don't add rejected folios to the oldest generation */ >> - set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, >> - BIT(PG_active)); >> - continue; >> - } >> - >> - /* retry folios that may have missed folio_rotate_reclaimable() */ >> - list_move(&folio->lru, &clean); >> - } >> - >> + find_folios_written_back(&list, &clean, skip_retry); >> spin_lock_irq(&lruvec->lru_lock); >> >> move_folios_to_lru(lruvec, &list); >> -- >> 2.34.1 >> > > Thanks > Barry > >>
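A minimal, self-contained toy (plain C with invented names, not the kernel code) of the situation Ridong describes: because the callee zeroes its output structure on every call, a caller that invokes it twice must keep a separate accumulator, or the first pass's counts are lost.

#include <stdio.h>
#include <string.h>

/* Toy stand-in for struct reclaim_stat; two fields are enough to show the point. */
struct stats {
	unsigned long nr_dirty;
	unsigned long nr_writeback;
};

/* Mimics a callee that, like shrink_folio_list(), zeroes *out on entry. */
static void one_pass(struct stats *out, unsigned long dirty, unsigned long wb)
{
	memset(out, 0, sizeof(*out));
	out->nr_dirty = dirty;
	out->nr_writeback = wb;
}

int main(void)
{
	struct stats total = { 0, 0 }, curr;

	/* First pass, then one retry: fold each pass into the running total. */
	one_pass(&curr, 3, 1);
	total.nr_dirty += curr.nr_dirty;
	total.nr_writeback += curr.nr_writeback;

	one_pass(&curr, 2, 0);          /* the retry overwrites curr again */
	total.nr_dirty += curr.nr_dirty;
	total.nr_writeback += curr.nr_writeback;

	/* Prints dirty=5 writeback=1; curr alone would report only the last pass. */
	printf("dirty=%lu writeback=%lu\n", total.nr_dirty, total.nr_writeback);
	return 0;
}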
On Tue, Dec 10, 2024 at 2:41 PM chenridong <chenridong@huawei.com> wrote: > > > > On 2024/12/10 12:54, Barry Song wrote: > > On Mon, Dec 9, 2024 at 4:46 PM Chen Ridong <chenridong@huaweicloud.com> wrote: > >> > >> From: Chen Ridong <chenridong@huawei.com> > >> > >> The commit 359a5e1416ca ("mm: multi-gen LRU: retry folios written back > >> while isolated") only fixed the issue for mglru. However, this issue > >> also exists in the traditional active/inactive LRU. This issue will be > >> worse if THP is split, which makes the list longer and needs longer time > >> to finish a batch of folios reclaim. > >> > >> This issue should be fixed in the same way for the traditional LRU. > >> Therefore, the common logic was extracted to the 'find_folios_written_back' > >> function firstly, which is then reused in the 'shrink_inactive_list' > >> function. Finally, retry reclaiming those folios that may have missed the > >> rotation for traditional LRU. > > > > let's drop the cover-letter and refine the changelog. > > > Will update. > > >> > >> Signed-off-by: Chen Ridong <chenridong@huawei.com> > >> --- > >> include/linux/mmzone.h | 3 +- > >> mm/vmscan.c | 108 +++++++++++++++++++++++++++++------------ > >> 2 files changed, 77 insertions(+), 34 deletions(-) > >> > >> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h > >> index b36124145a16..47c6e8c43dcd 100644 > >> --- a/include/linux/mmzone.h > >> +++ b/include/linux/mmzone.h > >> @@ -391,6 +391,7 @@ struct page_vma_mapped_walk; > >> > >> #define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF) > >> #define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF) > >> +#define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset)) > >> > >> #ifdef CONFIG_LRU_GEN > >> > >> @@ -406,8 +407,6 @@ enum { > >> NR_LRU_GEN_CAPS > >> }; > >> > >> -#define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset)) > >> - > >> #define MIN_LRU_BATCH BITS_PER_LONG > >> #define MAX_LRU_BATCH (MIN_LRU_BATCH * 64) > >> > >> diff --git a/mm/vmscan.c b/mm/vmscan.c > >> index 76378bc257e3..1f0d194f8b2f 100644 > >> --- a/mm/vmscan.c > >> +++ b/mm/vmscan.c > >> @@ -283,6 +283,48 @@ static void set_task_reclaim_state(struct task_struct *task, > >> task->reclaim_state = rs; > >> } > >> > >> +/** > >> + * find_folios_written_back - Find and move the written back folios to a new list. > >> + * @list: filios list > >> + * @clean: the written back folios list > >> + * @skip: whether skip to move the written back folios to clean list. 
> >> + */ > >> +static inline void find_folios_written_back(struct list_head *list, > >> + struct list_head *clean, bool skip) > >> +{ > >> + struct folio *folio; > >> + struct folio *next; > >> + > >> + list_for_each_entry_safe_reverse(folio, next, list, lru) { > >> + if (!folio_evictable(folio)) { > >> + list_del(&folio->lru); > >> + folio_putback_lru(folio); > >> + continue; > >> + } > >> + > >> + if (folio_test_reclaim(folio) && > >> + (folio_test_dirty(folio) || folio_test_writeback(folio))) { > >> + /* restore LRU_REFS_FLAGS cleared by isolate_folio() */ > >> + if (lru_gen_enabled() && folio_test_workingset(folio)) > >> + folio_set_referenced(folio); > >> + continue; > >> + } > >> + > >> + if (skip || folio_test_active(folio) || folio_test_referenced(folio) || > >> + folio_mapped(folio) || folio_test_locked(folio) || > >> + folio_test_dirty(folio) || folio_test_writeback(folio)) { > >> + /* don't add rejected folios to the oldest generation */ > >> + if (lru_gen_enabled()) > >> + set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, > >> + BIT(PG_active)); > >> + continue; > >> + } > >> + > >> + /* retry folios that may have missed folio_rotate_reclaimable() */ > >> + list_move(&folio->lru, clean); > >> + } > >> +} > >> + > >> /* > >> * flush_reclaim_state(): add pages reclaimed outside of LRU-based reclaim to > >> * scan_control->nr_reclaimed. > >> @@ -1907,6 +1949,25 @@ static int current_may_throttle(void) > >> return !(current->flags & PF_LOCAL_THROTTLE); > >> } > >> > >> +static inline void acc_reclaimed_stat(struct reclaim_stat *stat, > >> + struct reclaim_stat *curr) > >> +{ > >> + int i; > >> + > >> + stat->nr_dirty += curr->nr_dirty; > >> + stat->nr_unqueued_dirty += curr->nr_unqueued_dirty; > >> + stat->nr_congested += curr->nr_congested; > >> + stat->nr_writeback += curr->nr_writeback; > >> + stat->nr_immediate += curr->nr_immediate; > >> + stat->nr_pageout += curr->nr_pageout; > >> + stat->nr_ref_keep += curr->nr_ref_keep; > >> + stat->nr_unmap_fail += curr->nr_unmap_fail; > >> + stat->nr_lazyfree_fail += curr->nr_lazyfree_fail; > >> + stat->nr_demoted += curr->nr_demoted; > >> + for (i = 0; i < ANON_AND_FILE; i++) > >> + stat->nr_activate[i] = curr->nr_activate[i]; > >> +} > > > > you had no this before, what's the purpose of this? > > > > We may call shrink_folio_list twice, and the 'stat curr' will reset in > the shrink_folio_list function. We should accumulate the stats as a > whole, which will then be used to calculate the cost and return it to > the caller. Does mglru have the same issue? If so, we may need to send a patch to fix mglru's stat accounting as well. By the way, the code is rather messy—could it be implemented as shown below instead? 
diff --git a/mm/vmscan.c b/mm/vmscan.c index 1f0d194f8b2f..40d2ddde21f5 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1094,7 +1094,6 @@ static unsigned int shrink_folio_list(struct list_head *folio_list, struct swap_iocb *plug = NULL; folio_batch_init(&free_folios); - memset(stat, 0, sizeof(*stat)); cond_resched(); do_demote_pass = can_demote(pgdat->node_id, sc); @@ -1949,25 +1948,6 @@ static int current_may_throttle(void) return !(current->flags & PF_LOCAL_THROTTLE); } -static inline void acc_reclaimed_stat(struct reclaim_stat *stat, - struct reclaim_stat *curr) -{ - int i; - - stat->nr_dirty += curr->nr_dirty; - stat->nr_unqueued_dirty += curr->nr_unqueued_dirty; - stat->nr_congested += curr->nr_congested; - stat->nr_writeback += curr->nr_writeback; - stat->nr_immediate += curr->nr_immediate; - stat->nr_pageout += curr->nr_pageout; - stat->nr_ref_keep += curr->nr_ref_keep; - stat->nr_unmap_fail += curr->nr_unmap_fail; - stat->nr_lazyfree_fail += curr->nr_lazyfree_fail; - stat->nr_demoted += curr->nr_demoted; - for (i = 0; i < ANON_AND_FILE; i++) - stat->nr_activate[i] = curr->nr_activate[i]; -} - /* * shrink_inactive_list() is a helper for shrink_node(). It returns the number * of reclaimed pages @@ -1981,7 +1961,7 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan, unsigned long nr_scanned; unsigned int nr_reclaimed = 0; unsigned long nr_taken; - struct reclaim_stat stat, curr; + struct reclaim_stat stat; bool file = is_file_lru(lru); enum vm_event_item item; struct pglist_data *pgdat = lruvec_pgdat(lruvec); @@ -2022,9 +2002,8 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan, memset(&stat, 0, sizeof(stat)); retry: - nr_reclaimed += shrink_folio_list(&folio_list, pgdat, sc, &curr, false); + nr_reclaimed += shrink_folio_list(&folio_list, pgdat, sc, &stat, false); find_folios_written_back(&folio_list, &clean_list, skip_retry); - acc_reclaimed_stat(&stat, &curr); spin_lock_irq(&lruvec->lru_lock); move_folios_to_lru(lruvec, &folio_list); > > Thanks, > Ridong > > >> + > >> /* > >> * shrink_inactive_list() is a helper for shrink_node(). It returns the number > >> * of reclaimed pages > >> @@ -1916,14 +1977,16 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan, > >> enum lru_list lru) > >> { > >> LIST_HEAD(folio_list); > >> + LIST_HEAD(clean_list); > >> unsigned long nr_scanned; > >> unsigned int nr_reclaimed = 0; > >> unsigned long nr_taken; > >> - struct reclaim_stat stat; > >> + struct reclaim_stat stat, curr; > >> bool file = is_file_lru(lru); > >> enum vm_event_item item; > >> struct pglist_data *pgdat = lruvec_pgdat(lruvec); > >> bool stalled = false; > >> + bool skip_retry = false; > >> > >> while (unlikely(too_many_isolated(pgdat, file, sc))) { > >> if (stalled) > >> @@ -1957,10 +2020,20 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan, > >> if (nr_taken == 0) > >> return 0; > >> > >> - nr_reclaimed = shrink_folio_list(&folio_list, pgdat, sc, &stat, false); > >> + memset(&stat, 0, sizeof(stat)); > >> +retry: > >> + nr_reclaimed += shrink_folio_list(&folio_list, pgdat, sc, &curr, false); > >> + find_folios_written_back(&folio_list, &clean_list, skip_retry); > >> + acc_reclaimed_stat(&stat, &curr); > >> > >> spin_lock_irq(&lruvec->lru_lock); > >> move_folios_to_lru(lruvec, &folio_list); > >> + if (!list_empty(&clean_list)) { > >> + list_splice_init(&clean_list, &folio_list); > >> + skip_retry = true; > >> + spin_unlock_irq(&lruvec->lru_lock); > >> + goto retry; This is rather confusing. 
We're still jumping to retry even though skip_retry=true is set. Can we find a clearer approach for this? It was somewhat acceptable before we introduced the extracted function find_folios_written_back(). However, it has become harder to follow now that skip_retry is passed across functions. I find renaming skip_retry to is_retry more intuitive. The logic is that since we are already retrying, find_folios_written_back() shouldn’t move folios to the clean list again. The intended semantics are: we have retris, don’t retry again. > >> + } > >> > >> __mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(), > >> stat.nr_demoted); > >> @@ -4567,8 +4640,6 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap > >> int reclaimed; > >> LIST_HEAD(list); > >> LIST_HEAD(clean); > >> - struct folio *folio; > >> - struct folio *next; > >> enum vm_event_item item; > >> struct reclaim_stat stat; > >> struct lru_gen_mm_walk *walk; > >> @@ -4597,34 +4668,7 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap > >> scanned, reclaimed, &stat, sc->priority, > >> type ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON); > >> > >> - list_for_each_entry_safe_reverse(folio, next, &list, lru) { > >> - if (!folio_evictable(folio)) { > >> - list_del(&folio->lru); > >> - folio_putback_lru(folio); > >> - continue; > >> - } > >> - > >> - if (folio_test_reclaim(folio) && > >> - (folio_test_dirty(folio) || folio_test_writeback(folio))) { > >> - /* restore LRU_REFS_FLAGS cleared by isolate_folio() */ > >> - if (folio_test_workingset(folio)) > >> - folio_set_referenced(folio); > >> - continue; > >> - } > >> - > >> - if (skip_retry || folio_test_active(folio) || folio_test_referenced(folio) || > >> - folio_mapped(folio) || folio_test_locked(folio) || > >> - folio_test_dirty(folio) || folio_test_writeback(folio)) { > >> - /* don't add rejected folios to the oldest generation */ > >> - set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, > >> - BIT(PG_active)); > >> - continue; > >> - } > >> - > >> - /* retry folios that may have missed folio_rotate_reclaimable() */ > >> - list_move(&folio->lru, &clean); > >> - } > >> - > >> + find_folios_written_back(&list, &clean, skip_retry); > >> spin_lock_irq(&lruvec->lru_lock); > >> > >> move_folios_to_lru(lruvec, &list); > >> -- > >> 2.34.1 > >> > > Thanks Barry
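A toy sketch of the one-shot retry semantics Barry is asking for (plain C with invented names, not the kernel code): the flag records that the current pass already is the retry, so the selection helper stops queueing candidates for yet another round.

#include <stdbool.h>
#include <stdio.h>

/* Pretend two folios finished writeback while isolated on the first pass;
 * on the retry pass, never queue anything for a further retry. */
static int find_retry_candidates(bool is_retry)
{
	return is_retry ? 0 : 2;
}

int main(void)
{
	bool is_retry = false;
	int candidates;

retry:
	candidates = find_retry_candidates(is_retry);
	printf("%s pass: %d candidate(s)\n", is_retry ? "retry" : "first", candidates);

	if (candidates) {
		is_retry = true;        /* we are retrying now; never retry again */
		goto retry;
	}
	return 0;
}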
On 2024/12/10 16:24, Barry Song wrote: > On Tue, Dec 10, 2024 at 2:41 PM chenridong <chenridong@huawei.com> wrote: >> >> >> >> On 2024/12/10 12:54, Barry Song wrote: >>> On Mon, Dec 9, 2024 at 4:46 PM Chen Ridong <chenridong@huaweicloud.com> wrote: >>>> >>>> From: Chen Ridong <chenridong@huawei.com> >>>> >>>> The commit 359a5e1416ca ("mm: multi-gen LRU: retry folios written back >>>> while isolated") only fixed the issue for mglru. However, this issue >>>> also exists in the traditional active/inactive LRU. This issue will be >>>> worse if THP is split, which makes the list longer and needs longer time >>>> to finish a batch of folios reclaim. >>>> >>>> This issue should be fixed in the same way for the traditional LRU. >>>> Therefore, the common logic was extracted to the 'find_folios_written_back' >>>> function firstly, which is then reused in the 'shrink_inactive_list' >>>> function. Finally, retry reclaiming those folios that may have missed the >>>> rotation for traditional LRU. >>> >>> let's drop the cover-letter and refine the changelog. >>> >> Will update. >> >>>> >>>> Signed-off-by: Chen Ridong <chenridong@huawei.com> >>>> --- >>>> include/linux/mmzone.h | 3 +- >>>> mm/vmscan.c | 108 +++++++++++++++++++++++++++++------------ >>>> 2 files changed, 77 insertions(+), 34 deletions(-) >>>> >>>> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h >>>> index b36124145a16..47c6e8c43dcd 100644 >>>> --- a/include/linux/mmzone.h >>>> +++ b/include/linux/mmzone.h >>>> @@ -391,6 +391,7 @@ struct page_vma_mapped_walk; >>>> >>>> #define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF) >>>> #define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF) >>>> +#define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset)) >>>> >>>> #ifdef CONFIG_LRU_GEN >>>> >>>> @@ -406,8 +407,6 @@ enum { >>>> NR_LRU_GEN_CAPS >>>> }; >>>> >>>> -#define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset)) >>>> - >>>> #define MIN_LRU_BATCH BITS_PER_LONG >>>> #define MAX_LRU_BATCH (MIN_LRU_BATCH * 64) >>>> >>>> diff --git a/mm/vmscan.c b/mm/vmscan.c >>>> index 76378bc257e3..1f0d194f8b2f 100644 >>>> --- a/mm/vmscan.c >>>> +++ b/mm/vmscan.c >>>> @@ -283,6 +283,48 @@ static void set_task_reclaim_state(struct task_struct *task, >>>> task->reclaim_state = rs; >>>> } >>>> >>>> +/** >>>> + * find_folios_written_back - Find and move the written back folios to a new list. >>>> + * @list: filios list >>>> + * @clean: the written back folios list >>>> + * @skip: whether skip to move the written back folios to clean list. 
>>>> + */ >>>> +static inline void find_folios_written_back(struct list_head *list, >>>> + struct list_head *clean, bool skip) >>>> +{ >>>> + struct folio *folio; >>>> + struct folio *next; >>>> + >>>> + list_for_each_entry_safe_reverse(folio, next, list, lru) { >>>> + if (!folio_evictable(folio)) { >>>> + list_del(&folio->lru); >>>> + folio_putback_lru(folio); >>>> + continue; >>>> + } >>>> + >>>> + if (folio_test_reclaim(folio) && >>>> + (folio_test_dirty(folio) || folio_test_writeback(folio))) { >>>> + /* restore LRU_REFS_FLAGS cleared by isolate_folio() */ >>>> + if (lru_gen_enabled() && folio_test_workingset(folio)) >>>> + folio_set_referenced(folio); >>>> + continue; >>>> + } >>>> + >>>> + if (skip || folio_test_active(folio) || folio_test_referenced(folio) || >>>> + folio_mapped(folio) || folio_test_locked(folio) || >>>> + folio_test_dirty(folio) || folio_test_writeback(folio)) { >>>> + /* don't add rejected folios to the oldest generation */ >>>> + if (lru_gen_enabled()) >>>> + set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, >>>> + BIT(PG_active)); >>>> + continue; >>>> + } >>>> + >>>> + /* retry folios that may have missed folio_rotate_reclaimable() */ >>>> + list_move(&folio->lru, clean); >>>> + } >>>> +} >>>> + >>>> /* >>>> * flush_reclaim_state(): add pages reclaimed outside of LRU-based reclaim to >>>> * scan_control->nr_reclaimed. >>>> @@ -1907,6 +1949,25 @@ static int current_may_throttle(void) >>>> return !(current->flags & PF_LOCAL_THROTTLE); >>>> } >>>> >>>> +static inline void acc_reclaimed_stat(struct reclaim_stat *stat, >>>> + struct reclaim_stat *curr) >>>> +{ >>>> + int i; >>>> + >>>> + stat->nr_dirty += curr->nr_dirty; >>>> + stat->nr_unqueued_dirty += curr->nr_unqueued_dirty; >>>> + stat->nr_congested += curr->nr_congested; >>>> + stat->nr_writeback += curr->nr_writeback; >>>> + stat->nr_immediate += curr->nr_immediate; >>>> + stat->nr_pageout += curr->nr_pageout; >>>> + stat->nr_ref_keep += curr->nr_ref_keep; >>>> + stat->nr_unmap_fail += curr->nr_unmap_fail; >>>> + stat->nr_lazyfree_fail += curr->nr_lazyfree_fail; >>>> + stat->nr_demoted += curr->nr_demoted; >>>> + for (i = 0; i < ANON_AND_FILE; i++) >>>> + stat->nr_activate[i] = curr->nr_activate[i]; >>>> +} >>> >>> you had no this before, what's the purpose of this? >>> >> >> We may call shrink_folio_list twice, and the 'stat curr' will reset in >> the shrink_folio_list function. We should accumulate the stats as a >> whole, which will then be used to calculate the cost and return it to >> the caller. > > Does mglru have the same issue? If so, we may need to send a patch to > fix mglru's stat accounting as well. By the way, the code is rather > messy—could it be implemented as shown below instead? > I have checked the code (in the evict_folios function) again, and it appears that 'reclaimed' should correspond to sc->nr_reclaimed, which accumulates the results twice. Should I address this issue with a separate patch? 
if (!cgroup_reclaim(sc)) __count_vm_events(item, reclaimed); __count_memcg_events(memcg, item, reclaimed); __count_vm_events(PGSTEAL_ANON + type, reclaimed); > diff --git a/mm/vmscan.c b/mm/vmscan.c > index 1f0d194f8b2f..40d2ddde21f5 100644 > --- a/mm/vmscan.c > +++ b/mm/vmscan.c > @@ -1094,7 +1094,6 @@ static unsigned int shrink_folio_list(struct > list_head *folio_list, > struct swap_iocb *plug = NULL; > > folio_batch_init(&free_folios); > - memset(stat, 0, sizeof(*stat)); > cond_resched(); > do_demote_pass = can_demote(pgdat->node_id, sc); > > @@ -1949,25 +1948,6 @@ static int current_may_throttle(void) > return !(current->flags & PF_LOCAL_THROTTLE); > } > > -static inline void acc_reclaimed_stat(struct reclaim_stat *stat, > - struct reclaim_stat *curr) > -{ > - int i; > - > - stat->nr_dirty += curr->nr_dirty; > - stat->nr_unqueued_dirty += curr->nr_unqueued_dirty; > - stat->nr_congested += curr->nr_congested; > - stat->nr_writeback += curr->nr_writeback; > - stat->nr_immediate += curr->nr_immediate; > - stat->nr_pageout += curr->nr_pageout; > - stat->nr_ref_keep += curr->nr_ref_keep; > - stat->nr_unmap_fail += curr->nr_unmap_fail; > - stat->nr_lazyfree_fail += curr->nr_lazyfree_fail; > - stat->nr_demoted += curr->nr_demoted; > - for (i = 0; i < ANON_AND_FILE; i++) > - stat->nr_activate[i] = curr->nr_activate[i]; > -} > - > /* > * shrink_inactive_list() is a helper for shrink_node(). It returns the number > * of reclaimed pages > @@ -1981,7 +1961,7 @@ static unsigned long > shrink_inactive_list(unsigned long nr_to_scan, > unsigned long nr_scanned; > unsigned int nr_reclaimed = 0; > unsigned long nr_taken; > - struct reclaim_stat stat, curr; > + struct reclaim_stat stat; > bool file = is_file_lru(lru); > enum vm_event_item item; > struct pglist_data *pgdat = lruvec_pgdat(lruvec); > @@ -2022,9 +2002,8 @@ static unsigned long > shrink_inactive_list(unsigned long nr_to_scan, > > memset(&stat, 0, sizeof(stat)); > retry: > - nr_reclaimed += shrink_folio_list(&folio_list, pgdat, sc, &curr, false); > + nr_reclaimed += shrink_folio_list(&folio_list, pgdat, sc, &stat, false); > find_folios_written_back(&folio_list, &clean_list, skip_retry); > - acc_reclaimed_stat(&stat, &curr); > > spin_lock_irq(&lruvec->lru_lock); > move_folios_to_lru(lruvec, &folio_list); > This seems much better. But we have extras works to do: 1. 
In the shrink_folio_list function: --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1089,12 +1089,12 @@ static unsigned int shrink_folio_list(struct list_head *folio_list, LIST_HEAD(ret_folios); LIST_HEAD(demote_folios); unsigned int nr_reclaimed = 0; - unsigned int pgactivate = 0; + unsigned int pgactivate = stat->nr_activate[0] + stat->nr_activate[1]; + unsigned int nr_demote = 0; bool do_demote_pass; struct swap_iocb *plug = NULL; folio_batch_init(&free_folios); - memset(stat, 0, sizeof(*stat)); cond_resched(); do_demote_pass = can_demote(pgdat->node_id, sc); @@ -1558,7 +1558,8 @@ static unsigned int shrink_folio_list(struct list_head *folio_list, /* Migrate folios selected for demotion */ stat->nr_demoted = demote_folio_list(&demote_folios, pgdat); - nr_reclaimed += stat->nr_demoted; + stat->nr_demoted += nr_demote; + nr_reclaimed += nr_demote; /* Folios that could not be demoted are still in @demote_folios */ if (!list_empty(&demote_folios)) { /* Folios which weren't demoted go back on @folio_list */ @@ -1586,7 +1587,7 @@ static unsigned int shrink_folio_list(struct list_head *folio_list, } } - pgactivate = stat->nr_activate[0] + stat->nr_activate[1]; + pgactivate = stat->nr_activate[0] + stat->nr_activate[1] - pgactivate; mem_cgroup_uncharge_folios(&free_folios); try_to_unmap_flush(); 2. Outsize of the shrink_folio_list function, The callers should memset the stat. If you think this will be better, I will update like this. >> >> Thanks, >> Ridong >> >>>> + >>>> /* >>>> * shrink_inactive_list() is a helper for shrink_node(). It returns the number >>>> * of reclaimed pages >>>> @@ -1916,14 +1977,16 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan, >>>> enum lru_list lru) >>>> { >>>> LIST_HEAD(folio_list); >>>> + LIST_HEAD(clean_list); >>>> unsigned long nr_scanned; >>>> unsigned int nr_reclaimed = 0; >>>> unsigned long nr_taken; >>>> - struct reclaim_stat stat; >>>> + struct reclaim_stat stat, curr; >>>> bool file = is_file_lru(lru); >>>> enum vm_event_item item; >>>> struct pglist_data *pgdat = lruvec_pgdat(lruvec); >>>> bool stalled = false; >>>> + bool skip_retry = false; >>>> >>>> while (unlikely(too_many_isolated(pgdat, file, sc))) { >>>> if (stalled) >>>> @@ -1957,10 +2020,20 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan, >>>> if (nr_taken == 0) >>>> return 0; >>>> >>>> - nr_reclaimed = shrink_folio_list(&folio_list, pgdat, sc, &stat, false); >>>> + memset(&stat, 0, sizeof(stat)); >>>> +retry: >>>> + nr_reclaimed += shrink_folio_list(&folio_list, pgdat, sc, &curr, false); >>>> + find_folios_written_back(&folio_list, &clean_list, skip_retry); >>>> + acc_reclaimed_stat(&stat, &curr); >>>> >>>> spin_lock_irq(&lruvec->lru_lock); >>>> move_folios_to_lru(lruvec, &folio_list); >>>> + if (!list_empty(&clean_list)) { >>>> + list_splice_init(&clean_list, &folio_list); >>>> + skip_retry = true; >>>> + spin_unlock_irq(&lruvec->lru_lock); >>>> + goto retry; > > This is rather confusing. We're still jumping to retry even though > skip_retry=true is set. Can we find a clearer approach for this? > > It was somewhat acceptable before we introduced the extracted > function find_folios_written_back(). However, it has become > harder to follow now that skip_retry is passed across functions. > > I find renaming skip_retry to is_retry more intuitive. The logic > is that since we are already retrying, find_folios_written_back() > shouldn’t move folios to the clean list again. The intended semantics > are: we have retris, don’t retry again. > Reasonable. Will update. 
Thanks, Ridong > >>>> + } >>>> >>>> __mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(), >>>> stat.nr_demoted); >>>> @@ -4567,8 +4640,6 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap >>>> int reclaimed; >>>> LIST_HEAD(list); >>>> LIST_HEAD(clean); >>>> - struct folio *folio; >>>> - struct folio *next; >>>> enum vm_event_item item; >>>> struct reclaim_stat stat; >>>> struct lru_gen_mm_walk *walk; >>>> @@ -4597,34 +4668,7 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap >>>> scanned, reclaimed, &stat, sc->priority, >>>> type ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON); >>>> >>>> - list_for_each_entry_safe_reverse(folio, next, &list, lru) { >>>> - if (!folio_evictable(folio)) { >>>> - list_del(&folio->lru); >>>> - folio_putback_lru(folio); >>>> - continue; >>>> - } >>>> - >>>> - if (folio_test_reclaim(folio) && >>>> - (folio_test_dirty(folio) || folio_test_writeback(folio))) { >>>> - /* restore LRU_REFS_FLAGS cleared by isolate_folio() */ >>>> - if (folio_test_workingset(folio)) >>>> - folio_set_referenced(folio); >>>> - continue; >>>> - } >>>> - >>>> - if (skip_retry || folio_test_active(folio) || folio_test_referenced(folio) || >>>> - folio_mapped(folio) || folio_test_locked(folio) || >>>> - folio_test_dirty(folio) || folio_test_writeback(folio)) { >>>> - /* don't add rejected folios to the oldest generation */ >>>> - set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, >>>> - BIT(PG_active)); >>>> - continue; >>>> - } >>>> - >>>> - /* retry folios that may have missed folio_rotate_reclaimable() */ >>>> - list_move(&folio->lru, &clean); >>>> - } >>>> - >>>> + find_folios_written_back(&list, &clean, skip_retry); >>>> spin_lock_irq(&lruvec->lru_lock); >>>> >>>> move_folios_to_lru(lruvec, &list); >>>> -- >>>> 2.34.1 >>>> >>> > > Thanks > Barry
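A toy illustration (plain C, invented numbers) of the pgactivate bookkeeping Ridong sketches above: once the counters in the shared structure become cumulative across passes, the per-pass value has to be recovered as a difference.

#include <stdio.h>

int main(void)
{
	/* Cumulative counters carried across passes instead of being reset. */
	unsigned long nr_activate[2] = { 4, 6 };

	/* Snapshot the total before this pass runs... */
	unsigned long before = nr_activate[0] + nr_activate[1];

	/* ...the pass activates three more folios of each type... */
	nr_activate[0] += 3;
	nr_activate[1] += 3;

	/* ...so the per-pass figure is the difference, 6 here. */
	unsigned long pgactivate = nr_activate[0] + nr_activate[1] - before;

	printf("pgactivate this pass: %lu\n", pgactivate);
	return 0;
}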
On Wed, Dec 11, 2024 at 1:11 AM chenridong <chenridong@huawei.com> wrote: > > > > On 2024/12/10 16:24, Barry Song wrote: > > On Tue, Dec 10, 2024 at 2:41 PM chenridong <chenridong@huawei.com> wrote: > >> > >> > >> > >> On 2024/12/10 12:54, Barry Song wrote: > >>> On Mon, Dec 9, 2024 at 4:46 PM Chen Ridong <chenridong@huaweicloud.com> wrote: > >>>> > >>>> From: Chen Ridong <chenridong@huawei.com> > >>>> > >>>> The commit 359a5e1416ca ("mm: multi-gen LRU: retry folios written back > >>>> while isolated") only fixed the issue for mglru. However, this issue > >>>> also exists in the traditional active/inactive LRU. This issue will be > >>>> worse if THP is split, which makes the list longer and needs longer time > >>>> to finish a batch of folios reclaim. > >>>> > >>>> This issue should be fixed in the same way for the traditional LRU. > >>>> Therefore, the common logic was extracted to the 'find_folios_written_back' > >>>> function firstly, which is then reused in the 'shrink_inactive_list' > >>>> function. Finally, retry reclaiming those folios that may have missed the > >>>> rotation for traditional LRU. > >>> > >>> let's drop the cover-letter and refine the changelog. > >>> > >> Will update. > >> > >>>> > >>>> Signed-off-by: Chen Ridong <chenridong@huawei.com> > >>>> --- > >>>> include/linux/mmzone.h | 3 +- > >>>> mm/vmscan.c | 108 +++++++++++++++++++++++++++++------------ > >>>> 2 files changed, 77 insertions(+), 34 deletions(-) > >>>> > >>>> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h > >>>> index b36124145a16..47c6e8c43dcd 100644 > >>>> --- a/include/linux/mmzone.h > >>>> +++ b/include/linux/mmzone.h > >>>> @@ -391,6 +391,7 @@ struct page_vma_mapped_walk; > >>>> > >>>> #define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF) > >>>> #define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF) > >>>> +#define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset)) > >>>> > >>>> #ifdef CONFIG_LRU_GEN > >>>> > >>>> @@ -406,8 +407,6 @@ enum { > >>>> NR_LRU_GEN_CAPS > >>>> }; > >>>> > >>>> -#define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset)) > >>>> - > >>>> #define MIN_LRU_BATCH BITS_PER_LONG > >>>> #define MAX_LRU_BATCH (MIN_LRU_BATCH * 64) > >>>> > >>>> diff --git a/mm/vmscan.c b/mm/vmscan.c > >>>> index 76378bc257e3..1f0d194f8b2f 100644 > >>>> --- a/mm/vmscan.c > >>>> +++ b/mm/vmscan.c > >>>> @@ -283,6 +283,48 @@ static void set_task_reclaim_state(struct task_struct *task, > >>>> task->reclaim_state = rs; > >>>> } > >>>> > >>>> +/** > >>>> + * find_folios_written_back - Find and move the written back folios to a new list. > >>>> + * @list: filios list > >>>> + * @clean: the written back folios list > >>>> + * @skip: whether skip to move the written back folios to clean list. 
> >>>> + */ > >>>> +static inline void find_folios_written_back(struct list_head *list, > >>>> + struct list_head *clean, bool skip) > >>>> +{ > >>>> + struct folio *folio; > >>>> + struct folio *next; > >>>> + > >>>> + list_for_each_entry_safe_reverse(folio, next, list, lru) { > >>>> + if (!folio_evictable(folio)) { > >>>> + list_del(&folio->lru); > >>>> + folio_putback_lru(folio); > >>>> + continue; > >>>> + } > >>>> + > >>>> + if (folio_test_reclaim(folio) && > >>>> + (folio_test_dirty(folio) || folio_test_writeback(folio))) { > >>>> + /* restore LRU_REFS_FLAGS cleared by isolate_folio() */ > >>>> + if (lru_gen_enabled() && folio_test_workingset(folio)) > >>>> + folio_set_referenced(folio); > >>>> + continue; > >>>> + } > >>>> + > >>>> + if (skip || folio_test_active(folio) || folio_test_referenced(folio) || > >>>> + folio_mapped(folio) || folio_test_locked(folio) || > >>>> + folio_test_dirty(folio) || folio_test_writeback(folio)) { > >>>> + /* don't add rejected folios to the oldest generation */ > >>>> + if (lru_gen_enabled()) > >>>> + set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, > >>>> + BIT(PG_active)); > >>>> + continue; > >>>> + } > >>>> + > >>>> + /* retry folios that may have missed folio_rotate_reclaimable() */ > >>>> + list_move(&folio->lru, clean); > >>>> + } > >>>> +} > >>>> + > >>>> /* > >>>> * flush_reclaim_state(): add pages reclaimed outside of LRU-based reclaim to > >>>> * scan_control->nr_reclaimed. > >>>> @@ -1907,6 +1949,25 @@ static int current_may_throttle(void) > >>>> return !(current->flags & PF_LOCAL_THROTTLE); > >>>> } > >>>> > >>>> +static inline void acc_reclaimed_stat(struct reclaim_stat *stat, > >>>> + struct reclaim_stat *curr) > >>>> +{ > >>>> + int i; > >>>> + > >>>> + stat->nr_dirty += curr->nr_dirty; > >>>> + stat->nr_unqueued_dirty += curr->nr_unqueued_dirty; > >>>> + stat->nr_congested += curr->nr_congested; > >>>> + stat->nr_writeback += curr->nr_writeback; > >>>> + stat->nr_immediate += curr->nr_immediate; > >>>> + stat->nr_pageout += curr->nr_pageout; > >>>> + stat->nr_ref_keep += curr->nr_ref_keep; > >>>> + stat->nr_unmap_fail += curr->nr_unmap_fail; > >>>> + stat->nr_lazyfree_fail += curr->nr_lazyfree_fail; > >>>> + stat->nr_demoted += curr->nr_demoted; > >>>> + for (i = 0; i < ANON_AND_FILE; i++) > >>>> + stat->nr_activate[i] = curr->nr_activate[i]; > >>>> +} > >>> > >>> you had no this before, what's the purpose of this? > >>> > >> > >> We may call shrink_folio_list twice, and the 'stat curr' will reset in > >> the shrink_folio_list function. We should accumulate the stats as a > >> whole, which will then be used to calculate the cost and return it to > >> the caller. > > > > Does mglru have the same issue? If so, we may need to send a patch to > > fix mglru's stat accounting as well. By the way, the code is rather > > messy—could it be implemented as shown below instead? > > > > I have checked the code (in the evict_folios function) again, and it > appears that 'reclaimed' should correspond to sc->nr_reclaimed, which > accumulates the results twice. Should I address this issue with a > separate patch? I don't think there is any problem. reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false); sc->nr.unqueued_dirty += stat.nr_unqueued_dirty; sc->nr_reclaimed += reclaimed; reclaimed is always the number of pages we have reclaimed in shrink_folio_list() no matter if it is retry or not. 
> > if (!cgroup_reclaim(sc)) > __count_vm_events(item, reclaimed); > __count_memcg_events(memcg, item, reclaimed); > __count_vm_events(PGSTEAL_ANON + type, reclaimed); > > > > diff --git a/mm/vmscan.c b/mm/vmscan.c > > index 1f0d194f8b2f..40d2ddde21f5 100644 > > --- a/mm/vmscan.c > > +++ b/mm/vmscan.c > > @@ -1094,7 +1094,6 @@ static unsigned int shrink_folio_list(struct > > list_head *folio_list, > > struct swap_iocb *plug = NULL; > > > > folio_batch_init(&free_folios); > > - memset(stat, 0, sizeof(*stat)); > > cond_resched(); > > do_demote_pass = can_demote(pgdat->node_id, sc); > > > > @@ -1949,25 +1948,6 @@ static int current_may_throttle(void) > > return !(current->flags & PF_LOCAL_THROTTLE); > > } > > > > -static inline void acc_reclaimed_stat(struct reclaim_stat *stat, > > - struct reclaim_stat *curr) > > -{ > > - int i; > > - > > - stat->nr_dirty += curr->nr_dirty; > > - stat->nr_unqueued_dirty += curr->nr_unqueued_dirty; > > - stat->nr_congested += curr->nr_congested; > > - stat->nr_writeback += curr->nr_writeback; > > - stat->nr_immediate += curr->nr_immediate; > > - stat->nr_pageout += curr->nr_pageout; > > - stat->nr_ref_keep += curr->nr_ref_keep; > > - stat->nr_unmap_fail += curr->nr_unmap_fail; > > - stat->nr_lazyfree_fail += curr->nr_lazyfree_fail; > > - stat->nr_demoted += curr->nr_demoted; > > - for (i = 0; i < ANON_AND_FILE; i++) > > - stat->nr_activate[i] = curr->nr_activate[i]; > > -} > > - > > /* > > * shrink_inactive_list() is a helper for shrink_node(). It returns the number > > * of reclaimed pages > > @@ -1981,7 +1961,7 @@ static unsigned long > > shrink_inactive_list(unsigned long nr_to_scan, > > unsigned long nr_scanned; > > unsigned int nr_reclaimed = 0; > > unsigned long nr_taken; > > - struct reclaim_stat stat, curr; > > + struct reclaim_stat stat; > > bool file = is_file_lru(lru); > > enum vm_event_item item; > > struct pglist_data *pgdat = lruvec_pgdat(lruvec); > > @@ -2022,9 +2002,8 @@ static unsigned long > > shrink_inactive_list(unsigned long nr_to_scan, > > > > memset(&stat, 0, sizeof(stat)); > > retry: > > - nr_reclaimed += shrink_folio_list(&folio_list, pgdat, sc, &curr, false); > > + nr_reclaimed += shrink_folio_list(&folio_list, pgdat, sc, &stat, false); > > find_folios_written_back(&folio_list, &clean_list, skip_retry); > > - acc_reclaimed_stat(&stat, &curr); > > > > spin_lock_irq(&lruvec->lru_lock); > > move_folios_to_lru(lruvec, &folio_list); > > > > This seems much better. But we have extras works to do: > > 1. 
In the shrink_folio_list function: > --- a/mm/vmscan.c > +++ b/mm/vmscan.c > @@ -1089,12 +1089,12 @@ static unsigned int shrink_folio_list(struct > list_head *folio_list, > LIST_HEAD(ret_folios); > LIST_HEAD(demote_folios); > unsigned int nr_reclaimed = 0; > - unsigned int pgactivate = 0; > + unsigned int pgactivate = stat->nr_activate[0] + > stat->nr_activate[1]; > + unsigned int nr_demote = 0; > bool do_demote_pass; > struct swap_iocb *plug = NULL; > > folio_batch_init(&free_folios); > - memset(stat, 0, sizeof(*stat)); > cond_resched(); > do_demote_pass = can_demote(pgdat->node_id, sc); > > @@ -1558,7 +1558,8 @@ static unsigned int shrink_folio_list(struct > list_head *folio_list, > > /* Migrate folios selected for demotion */ > stat->nr_demoted = demote_folio_list(&demote_folios, pgdat); > - nr_reclaimed += stat->nr_demoted; > + stat->nr_demoted += nr_demote; > + nr_reclaimed += nr_demote; > /* Folios that could not be demoted are still in @demote_folios */ > if (!list_empty(&demote_folios)) { > /* Folios which weren't demoted go back on @folio_list */ > @@ -1586,7 +1587,7 @@ static unsigned int shrink_folio_list(struct > list_head *folio_list, > } > } > > - pgactivate = stat->nr_activate[0] + stat->nr_activate[1]; > + pgactivate = stat->nr_activate[0] + stat->nr_activate[1] - > pgactivate; > > mem_cgroup_uncharge_folios(&free_folios); > try_to_unmap_flush(); > > > 2. Outsize of the shrink_folio_list function, The callers should memset > the stat. > > If you think this will be better, I will update like this. no. Please goto retry after you have collected all you need from `stat` just as mglru is doing, drop the "curr" and acc_reclaimed_stat(). sc->nr.unqueued_dirty += stat.nr_unqueued_dirty; sc->nr_reclaimed += reclaimed; move_folios_to_lru() has helped moving all uninterested folios back to lruvec before you retry. There is no duplicated counting. > > >> > >> Thanks, > >> Ridong > >> > >>>> + > >>>> /* > >>>> * shrink_inactive_list() is a helper for shrink_node(). It returns the number > >>>> * of reclaimed pages > >>>> @@ -1916,14 +1977,16 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan, > >>>> enum lru_list lru) > >>>> { > >>>> LIST_HEAD(folio_list); > >>>> + LIST_HEAD(clean_list); > >>>> unsigned long nr_scanned; > >>>> unsigned int nr_reclaimed = 0; > >>>> unsigned long nr_taken; > >>>> - struct reclaim_stat stat; > >>>> + struct reclaim_stat stat, curr; > >>>> bool file = is_file_lru(lru); > >>>> enum vm_event_item item; > >>>> struct pglist_data *pgdat = lruvec_pgdat(lruvec); > >>>> bool stalled = false; > >>>> + bool skip_retry = false; > >>>> > >>>> while (unlikely(too_many_isolated(pgdat, file, sc))) { > >>>> if (stalled) > >>>> @@ -1957,10 +2020,20 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan, > >>>> if (nr_taken == 0) > >>>> return 0; > >>>> > >>>> - nr_reclaimed = shrink_folio_list(&folio_list, pgdat, sc, &stat, false); > >>>> + memset(&stat, 0, sizeof(stat)); > >>>> +retry: > >>>> + nr_reclaimed += shrink_folio_list(&folio_list, pgdat, sc, &curr, false); > >>>> + find_folios_written_back(&folio_list, &clean_list, skip_retry); > >>>> + acc_reclaimed_stat(&stat, &curr); > >>>> > >>>> spin_lock_irq(&lruvec->lru_lock); > >>>> move_folios_to_lru(lruvec, &folio_list); > >>>> + if (!list_empty(&clean_list)) { > >>>> + list_splice_init(&clean_list, &folio_list); > >>>> + skip_retry = true; > >>>> + spin_unlock_irq(&lruvec->lru_lock); > >>>> + goto retry; > > > > This is rather confusing. 
We're still jumping to retry even though > > skip_retry=true is set. Can we find a clearer approach for this? > > > > It was somewhat acceptable before we introduced the extracted > > function find_folios_written_back(). However, it has become > > harder to follow now that skip_retry is passed across functions. > > > > I find renaming skip_retry to is_retry more intuitive. The logic > > is that since we are already retrying, find_folios_written_back() > > shouldn’t move folios to the clean list again. The intended semantics > > are: we have retris, don’t retry again. > > > > Reasonable. Will update. > > Thanks, > Ridong > > > > >>>> + } > >>>> > >>>> __mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(), > >>>> stat.nr_demoted); > >>>> @@ -4567,8 +4640,6 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap > >>>> int reclaimed; > >>>> LIST_HEAD(list); > >>>> LIST_HEAD(clean); > >>>> - struct folio *folio; > >>>> - struct folio *next; > >>>> enum vm_event_item item; > >>>> struct reclaim_stat stat; > >>>> struct lru_gen_mm_walk *walk; > >>>> @@ -4597,34 +4668,7 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap > >>>> scanned, reclaimed, &stat, sc->priority, > >>>> type ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON); > >>>> > >>>> - list_for_each_entry_safe_reverse(folio, next, &list, lru) { > >>>> - if (!folio_evictable(folio)) { > >>>> - list_del(&folio->lru); > >>>> - folio_putback_lru(folio); > >>>> - continue; > >>>> - } > >>>> - > >>>> - if (folio_test_reclaim(folio) && > >>>> - (folio_test_dirty(folio) || folio_test_writeback(folio))) { > >>>> - /* restore LRU_REFS_FLAGS cleared by isolate_folio() */ > >>>> - if (folio_test_workingset(folio)) > >>>> - folio_set_referenced(folio); > >>>> - continue; > >>>> - } > >>>> - > >>>> - if (skip_retry || folio_test_active(folio) || folio_test_referenced(folio) || > >>>> - folio_mapped(folio) || folio_test_locked(folio) || > >>>> - folio_test_dirty(folio) || folio_test_writeback(folio)) { > >>>> - /* don't add rejected folios to the oldest generation */ > >>>> - set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, > >>>> - BIT(PG_active)); > >>>> - continue; > >>>> - } > >>>> - > >>>> - /* retry folios that may have missed folio_rotate_reclaimable() */ > >>>> - list_move(&folio->lru, &clean); > >>>> - } > >>>> - > >>>> + find_folios_written_back(&list, &clean, skip_retry); > >>>> spin_lock_irq(&lruvec->lru_lock); > >>>> > >>>> move_folios_to_lru(lruvec, &list); > >>>> -- > >>>> 2.34.1 > >>>> > >>> > > > > Thanks > > Barry
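A toy sketch (plain C with invented names, not the kernel code) of the flow Barry suggests: consume each pass's per-pass stats right away, folding only what the caller needs into its own running counters, then jump back and retry; the per-pass structure can safely be rewritten on the next pass, and no separate accumulator is needed.

#include <stdio.h>
#include <string.h>

struct pass_stats {
	unsigned long nr_unqueued_dirty;
};

/* Mimics a callee that zeroes and refills its per-pass stats on every call,
 * returning the number of items it handled. */
static unsigned long one_pass(struct pass_stats *stat, unsigned long dirty,
			      unsigned long handled)
{
	memset(stat, 0, sizeof(*stat));
	stat->nr_unqueued_dirty = dirty;
	return handled;
}

int main(void)
{
	struct pass_stats stat;
	unsigned long total_handled = 0, total_unqueued_dirty = 0;
	int pass = 0;

retry:
	/* Take everything the caller needs from this pass before the next one. */
	total_handled += one_pass(&stat, pass ? 1 : 4, pass ? 2 : 5);
	total_unqueued_dirty += stat.nr_unqueued_dirty;

	if (pass++ == 0)
		goto retry;     /* retry once; stat is rewritten, the totals survive */

	/* Prints handled=7 unqueued_dirty=5 across both passes. */
	printf("handled=%lu unqueued_dirty=%lu\n", total_handled, total_unqueued_dirty);
	return 0;
}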
On 2024/12/13 7:17, Barry Song wrote: > On Wed, Dec 11, 2024 at 1:11 AM chenridong <chenridong@huawei.com> wrote: >> >> >> >> On 2024/12/10 16:24, Barry Song wrote: >>> On Tue, Dec 10, 2024 at 2:41 PM chenridong <chenridong@huawei.com> wrote: >>>> >>>> >>>> >>>> On 2024/12/10 12:54, Barry Song wrote: >>>>> On Mon, Dec 9, 2024 at 4:46 PM Chen Ridong <chenridong@huaweicloud.com> wrote: >>>>>> >>>>>> From: Chen Ridong <chenridong@huawei.com> >>>>>> >>>>>> The commit 359a5e1416ca ("mm: multi-gen LRU: retry folios written back >>>>>> while isolated") only fixed the issue for mglru. However, this issue >>>>>> also exists in the traditional active/inactive LRU. This issue will be >>>>>> worse if THP is split, which makes the list longer and needs longer time >>>>>> to finish a batch of folios reclaim. >>>>>> >>>>>> This issue should be fixed in the same way for the traditional LRU. >>>>>> Therefore, the common logic was extracted to the 'find_folios_written_back' >>>>>> function firstly, which is then reused in the 'shrink_inactive_list' >>>>>> function. Finally, retry reclaiming those folios that may have missed the >>>>>> rotation for traditional LRU. >>>>> >>>>> let's drop the cover-letter and refine the changelog. >>>>> >>>> Will update. >>>> >>>>>> >>>>>> Signed-off-by: Chen Ridong <chenridong@huawei.com> >>>>>> --- >>>>>> include/linux/mmzone.h | 3 +- >>>>>> mm/vmscan.c | 108 +++++++++++++++++++++++++++++------------ >>>>>> 2 files changed, 77 insertions(+), 34 deletions(-) >>>>>> >>>>>> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h >>>>>> index b36124145a16..47c6e8c43dcd 100644 >>>>>> --- a/include/linux/mmzone.h >>>>>> +++ b/include/linux/mmzone.h >>>>>> @@ -391,6 +391,7 @@ struct page_vma_mapped_walk; >>>>>> >>>>>> #define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF) >>>>>> #define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF) >>>>>> +#define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset)) >>>>>> >>>>>> #ifdef CONFIG_LRU_GEN >>>>>> >>>>>> @@ -406,8 +407,6 @@ enum { >>>>>> NR_LRU_GEN_CAPS >>>>>> }; >>>>>> >>>>>> -#define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset)) >>>>>> - >>>>>> #define MIN_LRU_BATCH BITS_PER_LONG >>>>>> #define MAX_LRU_BATCH (MIN_LRU_BATCH * 64) >>>>>> >>>>>> diff --git a/mm/vmscan.c b/mm/vmscan.c >>>>>> index 76378bc257e3..1f0d194f8b2f 100644 >>>>>> --- a/mm/vmscan.c >>>>>> +++ b/mm/vmscan.c >>>>>> @@ -283,6 +283,48 @@ static void set_task_reclaim_state(struct task_struct *task, >>>>>> task->reclaim_state = rs; >>>>>> } >>>>>> >>>>>> +/** >>>>>> + * find_folios_written_back - Find and move the written back folios to a new list. >>>>>> + * @list: filios list >>>>>> + * @clean: the written back folios list >>>>>> + * @skip: whether skip to move the written back folios to clean list. 
>>>>>> + */ >>>>>> +static inline void find_folios_written_back(struct list_head *list, >>>>>> + struct list_head *clean, bool skip) >>>>>> +{ >>>>>> + struct folio *folio; >>>>>> + struct folio *next; >>>>>> + >>>>>> + list_for_each_entry_safe_reverse(folio, next, list, lru) { >>>>>> + if (!folio_evictable(folio)) { >>>>>> + list_del(&folio->lru); >>>>>> + folio_putback_lru(folio); >>>>>> + continue; >>>>>> + } >>>>>> + >>>>>> + if (folio_test_reclaim(folio) && >>>>>> + (folio_test_dirty(folio) || folio_test_writeback(folio))) { >>>>>> + /* restore LRU_REFS_FLAGS cleared by isolate_folio() */ >>>>>> + if (lru_gen_enabled() && folio_test_workingset(folio)) >>>>>> + folio_set_referenced(folio); >>>>>> + continue; >>>>>> + } >>>>>> + >>>>>> + if (skip || folio_test_active(folio) || folio_test_referenced(folio) || >>>>>> + folio_mapped(folio) || folio_test_locked(folio) || >>>>>> + folio_test_dirty(folio) || folio_test_writeback(folio)) { >>>>>> + /* don't add rejected folios to the oldest generation */ >>>>>> + if (lru_gen_enabled()) >>>>>> + set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, >>>>>> + BIT(PG_active)); >>>>>> + continue; >>>>>> + } >>>>>> + >>>>>> + /* retry folios that may have missed folio_rotate_reclaimable() */ >>>>>> + list_move(&folio->lru, clean); >>>>>> + } >>>>>> +} >>>>>> + >>>>>> /* >>>>>> * flush_reclaim_state(): add pages reclaimed outside of LRU-based reclaim to >>>>>> * scan_control->nr_reclaimed. >>>>>> @@ -1907,6 +1949,25 @@ static int current_may_throttle(void) >>>>>> return !(current->flags & PF_LOCAL_THROTTLE); >>>>>> } >>>>>> >>>>>> +static inline void acc_reclaimed_stat(struct reclaim_stat *stat, >>>>>> + struct reclaim_stat *curr) >>>>>> +{ >>>>>> + int i; >>>>>> + >>>>>> + stat->nr_dirty += curr->nr_dirty; >>>>>> + stat->nr_unqueued_dirty += curr->nr_unqueued_dirty; >>>>>> + stat->nr_congested += curr->nr_congested; >>>>>> + stat->nr_writeback += curr->nr_writeback; >>>>>> + stat->nr_immediate += curr->nr_immediate; >>>>>> + stat->nr_pageout += curr->nr_pageout; >>>>>> + stat->nr_ref_keep += curr->nr_ref_keep; >>>>>> + stat->nr_unmap_fail += curr->nr_unmap_fail; >>>>>> + stat->nr_lazyfree_fail += curr->nr_lazyfree_fail; >>>>>> + stat->nr_demoted += curr->nr_demoted; >>>>>> + for (i = 0; i < ANON_AND_FILE; i++) >>>>>> + stat->nr_activate[i] = curr->nr_activate[i]; >>>>>> +} >>>>> >>>>> you had no this before, what's the purpose of this? >>>>> >>>> >>>> We may call shrink_folio_list twice, and the 'stat curr' will reset in >>>> the shrink_folio_list function. We should accumulate the stats as a >>>> whole, which will then be used to calculate the cost and return it to >>>> the caller. >>> >>> Does mglru have the same issue? If so, we may need to send a patch to >>> fix mglru's stat accounting as well. By the way, the code is rather >>> messy—could it be implemented as shown below instead? >>> >> >> I have checked the code (in the evict_folios function) again, and it >> appears that 'reclaimed' should correspond to sc->nr_reclaimed, which >> accumulates the results twice. Should I address this issue with a >> separate patch? > > I don't think there is any problem. > > reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false); > sc->nr.unqueued_dirty += stat.nr_unqueued_dirty; > sc->nr_reclaimed += reclaimed; > > reclaimed is always the number of pages we have reclaimed in > shrink_folio_list() no matter if it is retry or not. > Thank you, I made a mistake. 
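The point that each call's return value covers only that pass can be made concrete with a small worked example (the numbers are invented for illustration):

	pass 1: shrink_folio_list() returns 30, stat.nr_unqueued_dirty == 5
	        sc->nr_reclaimed += 30 -> 30,  sc->nr.unqueued_dirty += 5 -> 5
	pass 2: the retried clean list frees 10 more, stat.nr_unqueued_dirty == 0
	        sc->nr_reclaimed += 10 -> 40,  sc->nr.unqueued_dirty += 0 -> 5

Each pass reports only its own work, so summing the per-pass values never counts a folio twice.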
>> >> if (!cgroup_reclaim(sc)) >> __count_vm_events(item, reclaimed); >> __count_memcg_events(memcg, item, reclaimed); >> __count_vm_events(PGSTEAL_ANON + type, reclaimed); >> >> >>> diff --git a/mm/vmscan.c b/mm/vmscan.c >>> index 1f0d194f8b2f..40d2ddde21f5 100644 >>> --- a/mm/vmscan.c >>> +++ b/mm/vmscan.c >>> @@ -1094,7 +1094,6 @@ static unsigned int shrink_folio_list(struct >>> list_head *folio_list, >>> struct swap_iocb *plug = NULL; >>> >>> folio_batch_init(&free_folios); >>> - memset(stat, 0, sizeof(*stat)); >>> cond_resched(); >>> do_demote_pass = can_demote(pgdat->node_id, sc); >>> >>> @@ -1949,25 +1948,6 @@ static int current_may_throttle(void) >>> return !(current->flags & PF_LOCAL_THROTTLE); >>> } >>> >>> -static inline void acc_reclaimed_stat(struct reclaim_stat *stat, >>> - struct reclaim_stat *curr) >>> -{ >>> - int i; >>> - >>> - stat->nr_dirty += curr->nr_dirty; >>> - stat->nr_unqueued_dirty += curr->nr_unqueued_dirty; >>> - stat->nr_congested += curr->nr_congested; >>> - stat->nr_writeback += curr->nr_writeback; >>> - stat->nr_immediate += curr->nr_immediate; >>> - stat->nr_pageout += curr->nr_pageout; >>> - stat->nr_ref_keep += curr->nr_ref_keep; >>> - stat->nr_unmap_fail += curr->nr_unmap_fail; >>> - stat->nr_lazyfree_fail += curr->nr_lazyfree_fail; >>> - stat->nr_demoted += curr->nr_demoted; >>> - for (i = 0; i < ANON_AND_FILE; i++) >>> - stat->nr_activate[i] = curr->nr_activate[i]; >>> -} >>> - >>> /* >>> * shrink_inactive_list() is a helper for shrink_node(). It returns the number >>> * of reclaimed pages >>> @@ -1981,7 +1961,7 @@ static unsigned long >>> shrink_inactive_list(unsigned long nr_to_scan, >>> unsigned long nr_scanned; >>> unsigned int nr_reclaimed = 0; >>> unsigned long nr_taken; >>> - struct reclaim_stat stat, curr; >>> + struct reclaim_stat stat; >>> bool file = is_file_lru(lru); >>> enum vm_event_item item; >>> struct pglist_data *pgdat = lruvec_pgdat(lruvec); >>> @@ -2022,9 +2002,8 @@ static unsigned long >>> shrink_inactive_list(unsigned long nr_to_scan, >>> >>> memset(&stat, 0, sizeof(stat)); >>> retry: >>> - nr_reclaimed += shrink_folio_list(&folio_list, pgdat, sc, &curr, false); >>> + nr_reclaimed += shrink_folio_list(&folio_list, pgdat, sc, &stat, false); >>> find_folios_written_back(&folio_list, &clean_list, skip_retry); >>> - acc_reclaimed_stat(&stat, &curr); >>> >>> spin_lock_irq(&lruvec->lru_lock); >>> move_folios_to_lru(lruvec, &folio_list); >>> >> >> This seems much better. But we have extras works to do: >> >> 1. 
In the shrink_folio_list function: >> --- a/mm/vmscan.c >> +++ b/mm/vmscan.c >> @@ -1089,12 +1089,12 @@ static unsigned int shrink_folio_list(struct >> list_head *folio_list, >> LIST_HEAD(ret_folios); >> LIST_HEAD(demote_folios); >> unsigned int nr_reclaimed = 0; >> - unsigned int pgactivate = 0; >> + unsigned int pgactivate = stat->nr_activate[0] + >> stat->nr_activate[1]; >> + unsigned int nr_demote = 0; >> bool do_demote_pass; >> struct swap_iocb *plug = NULL; >> >> folio_batch_init(&free_folios); >> - memset(stat, 0, sizeof(*stat)); >> cond_resched(); >> do_demote_pass = can_demote(pgdat->node_id, sc); >> >> @@ -1558,7 +1558,8 @@ static unsigned int shrink_folio_list(struct >> list_head *folio_list, >> >> /* Migrate folios selected for demotion */ >> stat->nr_demoted = demote_folio_list(&demote_folios, pgdat); >> - nr_reclaimed += stat->nr_demoted; >> + stat->nr_demoted += nr_demote; >> + nr_reclaimed += nr_demote; >> /* Folios that could not be demoted are still in @demote_folios */ >> if (!list_empty(&demote_folios)) { >> /* Folios which weren't demoted go back on @folio_list */ >> @@ -1586,7 +1587,7 @@ static unsigned int shrink_folio_list(struct >> list_head *folio_list, >> } >> } >> >> - pgactivate = stat->nr_activate[0] + stat->nr_activate[1]; >> + pgactivate = stat->nr_activate[0] + stat->nr_activate[1] - >> pgactivate; >> >> mem_cgroup_uncharge_folios(&free_folios); >> try_to_unmap_flush(); >> >> >> 2. Outsize of the shrink_folio_list function, The callers should memset >> the stat. >> >> If you think this will be better, I will update like this. > > no. Please goto retry after you have collected all you need from > `stat` just as mglru > is doing, drop the "curr" and acc_reclaimed_stat(). > > sc->nr.unqueued_dirty += stat.nr_unqueued_dirty; > sc->nr_reclaimed += reclaimed; > > move_folios_to_lru() has helped moving all uninterested folios back to lruvec > before you retry. There is no duplicated counting. > See, thank you. Thanks, Ridong >> >>>> >>>> Thanks, >>>> Ridong >>>> >>>>>> + >>>>>> /* >>>>>> * shrink_inactive_list() is a helper for shrink_node(). 
It returns the number >>>>>> * of reclaimed pages >>>>>> @@ -1916,14 +1977,16 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan, >>>>>> enum lru_list lru) >>>>>> { >>>>>> LIST_HEAD(folio_list); >>>>>> + LIST_HEAD(clean_list); >>>>>> unsigned long nr_scanned; >>>>>> unsigned int nr_reclaimed = 0; >>>>>> unsigned long nr_taken; >>>>>> - struct reclaim_stat stat; >>>>>> + struct reclaim_stat stat, curr; >>>>>> bool file = is_file_lru(lru); >>>>>> enum vm_event_item item; >>>>>> struct pglist_data *pgdat = lruvec_pgdat(lruvec); >>>>>> bool stalled = false; >>>>>> + bool skip_retry = false; >>>>>> >>>>>> while (unlikely(too_many_isolated(pgdat, file, sc))) { >>>>>> if (stalled) >>>>>> @@ -1957,10 +2020,20 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan, >>>>>> if (nr_taken == 0) >>>>>> return 0; >>>>>> >>>>>> - nr_reclaimed = shrink_folio_list(&folio_list, pgdat, sc, &stat, false); >>>>>> + memset(&stat, 0, sizeof(stat)); >>>>>> +retry: >>>>>> + nr_reclaimed += shrink_folio_list(&folio_list, pgdat, sc, &curr, false); >>>>>> + find_folios_written_back(&folio_list, &clean_list, skip_retry); >>>>>> + acc_reclaimed_stat(&stat, &curr); >>>>>> >>>>>> spin_lock_irq(&lruvec->lru_lock); >>>>>> move_folios_to_lru(lruvec, &folio_list); >>>>>> + if (!list_empty(&clean_list)) { >>>>>> + list_splice_init(&clean_list, &folio_list); >>>>>> + skip_retry = true; >>>>>> + spin_unlock_irq(&lruvec->lru_lock); >>>>>> + goto retry; >>> >>> This is rather confusing. We're still jumping to retry even though >>> skip_retry=true is set. Can we find a clearer approach for this? >>> >>> It was somewhat acceptable before we introduced the extracted >>> function find_folios_written_back(). However, it has become >>> harder to follow now that skip_retry is passed across functions. >>> >>> I find renaming skip_retry to is_retry more intuitive. The logic >>> is that since we are already retrying, find_folios_written_back() >>> shouldn’t move folios to the clean list again. The intended semantics >>> are: we have retris, don’t retry again. >>> >> >> Reasonable. Will update. >> >> Thanks, >> Ridong >> >>> >>>>>> + } >>>>>> >>>>>> __mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(), >>>>>> stat.nr_demoted); >>>>>> @@ -4567,8 +4640,6 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap >>>>>> int reclaimed; >>>>>> LIST_HEAD(list); >>>>>> LIST_HEAD(clean); >>>>>> - struct folio *folio; >>>>>> - struct folio *next; >>>>>> enum vm_event_item item; >>>>>> struct reclaim_stat stat; >>>>>> struct lru_gen_mm_walk *walk; >>>>>> @@ -4597,34 +4668,7 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap >>>>>> scanned, reclaimed, &stat, sc->priority, >>>>>> type ? 
LRU_INACTIVE_FILE : LRU_INACTIVE_ANON); >>>>>> >>>>>> - list_for_each_entry_safe_reverse(folio, next, &list, lru) { >>>>>> - if (!folio_evictable(folio)) { >>>>>> - list_del(&folio->lru); >>>>>> - folio_putback_lru(folio); >>>>>> - continue; >>>>>> - } >>>>>> - >>>>>> - if (folio_test_reclaim(folio) && >>>>>> - (folio_test_dirty(folio) || folio_test_writeback(folio))) { >>>>>> - /* restore LRU_REFS_FLAGS cleared by isolate_folio() */ >>>>>> - if (folio_test_workingset(folio)) >>>>>> - folio_set_referenced(folio); >>>>>> - continue; >>>>>> - } >>>>>> - >>>>>> - if (skip_retry || folio_test_active(folio) || folio_test_referenced(folio) || >>>>>> - folio_mapped(folio) || folio_test_locked(folio) || >>>>>> - folio_test_dirty(folio) || folio_test_writeback(folio)) { >>>>>> - /* don't add rejected folios to the oldest generation */ >>>>>> - set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, >>>>>> - BIT(PG_active)); >>>>>> - continue; >>>>>> - } >>>>>> - >>>>>> - /* retry folios that may have missed folio_rotate_reclaimable() */ >>>>>> - list_move(&folio->lru, &clean); >>>>>> - } >>>>>> - >>>>>> + find_folios_written_back(&list, &clean, skip_retry); >>>>>> spin_lock_irq(&lruvec->lru_lock); >>>>>> >>>>>> move_folios_to_lru(lruvec, &list); >>>>>> -- >>>>>> 2.34.1 >>>>>> >>>>> >>> >>> Thanks >>> Barry
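Putting the two suggestions together, the shrink_inactive_list() retry would look roughly like the sketch below (an illustration of the suggested direction, not the actual v2; exactly which counters have to be folded in before the goto is left open):

	bool is_retry = false;
retry:
	/* stat is zeroed inside shrink_folio_list(), so it covers this pass only */
	nr_reclaimed += shrink_folio_list(&folio_list, pgdat, sc, &stat, false);

	/* consume everything the caller needs from this pass before retrying */
	sc->nr.unqueued_dirty += stat.nr_unqueued_dirty;
	/* ... other stat fields and vmstat updates likewise ... */

	find_folios_written_back(&folio_list, &clean_list, is_retry);
	/* then take lru_lock, move_folios_to_lru(), splice clean_list back and
	 * goto retry as in the hunk quoted above */

This mirrors how evict_folios() already consumes stat between passes, so neither curr nor acc_reclaimed_stat() is needed.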
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index b36124145a16..47c6e8c43dcd 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -391,6 +391,7 @@ struct page_vma_mapped_walk; #define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF) #define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF) +#define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset)) #ifdef CONFIG_LRU_GEN @@ -406,8 +407,6 @@ enum { NR_LRU_GEN_CAPS }; -#define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset)) - #define MIN_LRU_BATCH BITS_PER_LONG #define MAX_LRU_BATCH (MIN_LRU_BATCH * 64) diff --git a/mm/vmscan.c b/mm/vmscan.c index 76378bc257e3..1f0d194f8b2f 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -283,6 +283,48 @@ static void set_task_reclaim_state(struct task_struct *task, task->reclaim_state = rs; } +/** + * find_folios_written_back - Find and move the written back folios to a new list. + * @list: filios list + * @clean: the written back folios list + * @skip: whether skip to move the written back folios to clean list. + */ +static inline void find_folios_written_back(struct list_head *list, + struct list_head *clean, bool skip) +{ + struct folio *folio; + struct folio *next; + + list_for_each_entry_safe_reverse(folio, next, list, lru) { + if (!folio_evictable(folio)) { + list_del(&folio->lru); + folio_putback_lru(folio); + continue; + } + + if (folio_test_reclaim(folio) && + (folio_test_dirty(folio) || folio_test_writeback(folio))) { + /* restore LRU_REFS_FLAGS cleared by isolate_folio() */ + if (lru_gen_enabled() && folio_test_workingset(folio)) + folio_set_referenced(folio); + continue; + } + + if (skip || folio_test_active(folio) || folio_test_referenced(folio) || + folio_mapped(folio) || folio_test_locked(folio) || + folio_test_dirty(folio) || folio_test_writeback(folio)) { + /* don't add rejected folios to the oldest generation */ + if (lru_gen_enabled()) + set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, + BIT(PG_active)); + continue; + } + + /* retry folios that may have missed folio_rotate_reclaimable() */ + list_move(&folio->lru, clean); + } +} + /* * flush_reclaim_state(): add pages reclaimed outside of LRU-based reclaim to * scan_control->nr_reclaimed. @@ -1907,6 +1949,25 @@ static int current_may_throttle(void) return !(current->flags & PF_LOCAL_THROTTLE); } +static inline void acc_reclaimed_stat(struct reclaim_stat *stat, + struct reclaim_stat *curr) +{ + int i; + + stat->nr_dirty += curr->nr_dirty; + stat->nr_unqueued_dirty += curr->nr_unqueued_dirty; + stat->nr_congested += curr->nr_congested; + stat->nr_writeback += curr->nr_writeback; + stat->nr_immediate += curr->nr_immediate; + stat->nr_pageout += curr->nr_pageout; + stat->nr_ref_keep += curr->nr_ref_keep; + stat->nr_unmap_fail += curr->nr_unmap_fail; + stat->nr_lazyfree_fail += curr->nr_lazyfree_fail; + stat->nr_demoted += curr->nr_demoted; + for (i = 0; i < ANON_AND_FILE; i++) + stat->nr_activate[i] = curr->nr_activate[i]; +} + /* * shrink_inactive_list() is a helper for shrink_node(). 
It returns the number * of reclaimed pages @@ -1916,14 +1977,16 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan, enum lru_list lru) { LIST_HEAD(folio_list); + LIST_HEAD(clean_list); unsigned long nr_scanned; unsigned int nr_reclaimed = 0; unsigned long nr_taken; - struct reclaim_stat stat; + struct reclaim_stat stat, curr; bool file = is_file_lru(lru); enum vm_event_item item; struct pglist_data *pgdat = lruvec_pgdat(lruvec); bool stalled = false; + bool skip_retry = false; while (unlikely(too_many_isolated(pgdat, file, sc))) { if (stalled) @@ -1957,10 +2020,20 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan, if (nr_taken == 0) return 0; - nr_reclaimed = shrink_folio_list(&folio_list, pgdat, sc, &stat, false); + memset(&stat, 0, sizeof(stat)); +retry: + nr_reclaimed += shrink_folio_list(&folio_list, pgdat, sc, &curr, false); + find_folios_written_back(&folio_list, &clean_list, skip_retry); + acc_reclaimed_stat(&stat, &curr); spin_lock_irq(&lruvec->lru_lock); move_folios_to_lru(lruvec, &folio_list); + if (!list_empty(&clean_list)) { + list_splice_init(&clean_list, &folio_list); + skip_retry = true; + spin_unlock_irq(&lruvec->lru_lock); + goto retry; + } __mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(), stat.nr_demoted); @@ -4567,8 +4640,6 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap int reclaimed; LIST_HEAD(list); LIST_HEAD(clean); - struct folio *folio; - struct folio *next; enum vm_event_item item; struct reclaim_stat stat; struct lru_gen_mm_walk *walk; @@ -4597,34 +4668,7 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap scanned, reclaimed, &stat, sc->priority, type ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON); - list_for_each_entry_safe_reverse(folio, next, &list, lru) { - if (!folio_evictable(folio)) { - list_del(&folio->lru); - folio_putback_lru(folio); - continue; - } - - if (folio_test_reclaim(folio) && - (folio_test_dirty(folio) || folio_test_writeback(folio))) { - /* restore LRU_REFS_FLAGS cleared by isolate_folio() */ - if (folio_test_workingset(folio)) - folio_set_referenced(folio); - continue; - } - - if (skip_retry || folio_test_active(folio) || folio_test_referenced(folio) || - folio_mapped(folio) || folio_test_locked(folio) || - folio_test_dirty(folio) || folio_test_writeback(folio)) { - /* don't add rejected folios to the oldest generation */ - set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, - BIT(PG_active)); - continue; - } - - /* retry folios that may have missed folio_rotate_reclaimable() */ - list_move(&folio->lru, &clean); - } - + find_folios_written_back(&list, &clean, skip_retry); spin_lock_irq(&lruvec->lru_lock); move_folios_to_lru(lruvec, &list);
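One more note on the mmzone.h hunk: find_folios_written_back() is built for both reclaim paths, and even though its set_mask_bits() call is guarded by lru_gen_enabled(), it still has to compile when CONFIG_LRU_GEN is not set, which is presumably why LRU_REFS_FLAGS moves out of the #ifdef CONFIG_LRU_GEN block. A minimal illustration (the guard is copied from the helper above):

	/* compiled in both configurations; lru_gen_enabled() is simply a
	 * constant false when CONFIG_LRU_GEN=n, but LRU_REFS_FLAGS must
	 * still be defined for this to build */
	if (lru_gen_enabled())
		set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS,
			      BIT(PG_active));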