[RFC] mm: madvise: pageout: ignore references rather than clearing young

Message ID 20240223041550.77157-1-21cnbao@gmail.com (mailing list archive)
State New
Series [RFC] mm: madvise: pageout: ignore references rather than clearing young

Commit Message

Barry Song Feb. 23, 2024, 4:15 a.m. UTC
From: Barry Song <v-songbaohua@oppo.com>

While doing MADV_PAGEOUT, the current code clears the PTE young bit
so that vmscan won't see the young flag, allowing the reclamation
of madvised folios to go ahead.
It seems we can achieve the same by directly ignoring references,
which lets us remove the TLB flush in madvise and the rmap overhead
in vmscan.

Regarding the side effect: in the original code, if a parallel
thread accesses the madvised memory while another thread is doing
madvise, folios get a chance to be re-activated by vmscan. With
this patch, they will still be reclaimed. But accessing memory
while simultaneously doing PAGEOUT on it is quite silly, much like
a DoS, so we probably don't need to care.

A microbenchmark as below has shown a 6% reduction in the latency
of MADV_PAGEOUT,

 #include <sys/mman.h>

 #define PGSIZE 4096
 #define SIZE (512UL * 1024 * 1024)

 int main(void)
 {
 	int i;
 	volatile long *p = mmap(NULL, SIZE, PROT_READ | PROT_WRITE,
 			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

 	/* touch one long per page so every page is faulted in */
 	for (i = 0; i < SIZE / sizeof(long); i += PGSIZE / sizeof(long))
 		p[i] = 0x11;

 	madvise((void *)p, SIZE, MADV_PAGEOUT);
 	return 0;
 }
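
For example, built and timed as below (the file name is arbitrary and no
special compiler flags are needed):

 	gcc pageout.c -o a.out
 	time ./a.out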

w/o patch                    w/ patch
root@10:~# time ./a.out      root@10:~# time ./a.out
real	0m49.634s            real   0m46.334s
user	0m0.637s             user   0m0.648s
sys	0m47.434s            sys    0m44.265s

Signed-off-by: Barry Song <v-songbaohua@oppo.com>
---
 mm/damon/paddr.c |  2 +-
 mm/internal.h    |  2 +-
 mm/madvise.c     |  8 ++++----
 mm/vmscan.c      | 12 +++++++-----
 4 files changed, 13 insertions(+), 11 deletions(-)

Comments

Minchan Kim Feb. 23, 2024, 10:09 p.m. UTC | #1
Hi Barry,

On Fri, Feb 23, 2024 at 05:15:50PM +1300, Barry Song wrote:
> From: Barry Song <v-songbaohua@oppo.com>
> 
> While doing MADV_PAGEOUT, the current code clears the PTE young bit
> so that vmscan won't see the young flag, allowing the reclamation
> of madvised folios to go ahead.

Isn't it good to accelerate reclaiming? vmscan checks whether the
page was accessed recently via the young bit in the pte and, if it
was, it doesn't reclaim the page. Since we have cleared the young bit
in the pte in madvise_pageout, vmscan is likely to reclaim the page
since it wouldn't see the referenced_ptes from folio_check_references.

Could you clarify if I'm missing something here?


> It seems we can achieve the same by directly ignoring references,
> which lets us remove the TLB flush in madvise and the rmap overhead
> in vmscan.
> 
> Regarding the side effect: in the original code, if a parallel
> thread accesses the madvised memory while another thread is doing
> madvise, folios get a chance to be re-activated by vmscan. With
> this patch, they will still be reclaimed. But accessing memory
> while simultaneously doing PAGEOUT on it is quite silly, much like
> a DoS, so we probably don't need to care.
> 
> A microbenchmark as below has shown a 6% reduction in the latency
> of MADV_PAGEOUT,
> 
>  #include <sys/mman.h>
> 
>  #define PGSIZE 4096
>  #define SIZE (512UL * 1024 * 1024)
> 
>  int main(void)
>  {
>  	int i;
>  	volatile long *p = mmap(NULL, SIZE, PROT_READ | PROT_WRITE,
>  			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
> 
>  	/* touch one long per page so every page is faulted in */
>  	for (i = 0; i < SIZE / sizeof(long); i += PGSIZE / sizeof(long))
>  		p[i] = 0x11;
> 
>  	madvise((void *)p, SIZE, MADV_PAGEOUT);
>  	return 0;
>  }
> 
> w/o patch                    w/ patch
> root@10:~# time ./a.out      root@10:~# time ./a.out
> real	0m49.634s            real   0m46.334s
> user	0m0.637s             user   0m0.648s
> sys	0m47.434s            sys    0m44.265s
> 
> Signed-off-by: Barry Song <v-songbaohua@oppo.com>
> ---
>  mm/damon/paddr.c |  2 +-
>  mm/internal.h    |  2 +-
>  mm/madvise.c     |  8 ++++----
>  mm/vmscan.c      | 12 +++++++-----
>  4 files changed, 13 insertions(+), 11 deletions(-)
> 
> diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c
> index 081e2a325778..5e6dc312072c 100644
> --- a/mm/damon/paddr.c
> +++ b/mm/damon/paddr.c
> @@ -249,7 +249,7 @@ static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s)
>  put_folio:
>  		folio_put(folio);
>  	}
> -	applied = reclaim_pages(&folio_list);
> +	applied = reclaim_pages(&folio_list, false);
>  	cond_resched();
>  	return applied * PAGE_SIZE;
>  }
> diff --git a/mm/internal.h b/mm/internal.h
> index 93e229112045..36c11ea41f47 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -868,7 +868,7 @@ extern unsigned long  __must_check vm_mmap_pgoff(struct file *, unsigned long,
>          unsigned long, unsigned long);
>  
>  extern void set_pageblock_order(void);
> -unsigned long reclaim_pages(struct list_head *folio_list);
> +unsigned long reclaim_pages(struct list_head *folio_list, bool ignore_references);
>  unsigned int reclaim_clean_pages_from_list(struct zone *zone,
>  					    struct list_head *folio_list);
>  /* The ALLOC_WMARK bits are used as an index to zone->watermark */
> diff --git a/mm/madvise.c b/mm/madvise.c
> index abde3edb04f0..44a498c94158 100644
> --- a/mm/madvise.c
> +++ b/mm/madvise.c
> @@ -386,7 +386,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
>  			return 0;
>  		}
>  
> -		if (pmd_young(orig_pmd)) {
> +		if (!pageout && pmd_young(orig_pmd)) {
>  			pmdp_invalidate(vma, addr, pmd);
>  			orig_pmd = pmd_mkold(orig_pmd);
>  
> @@ -410,7 +410,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
>  huge_unlock:
>  		spin_unlock(ptl);
>  		if (pageout)
> -			reclaim_pages(&folio_list);
> +			reclaim_pages(&folio_list, true);
>  		return 0;
>  	}
>  
> @@ -490,7 +490,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
>  
>  		VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
>  
> -		if (pte_young(ptent)) {
> +		if (!pageout && pte_young(ptent)) {
>  			ptent = ptep_get_and_clear_full(mm, addr, pte,
>  							tlb->fullmm);
>  			ptent = pte_mkold(ptent);
> @@ -524,7 +524,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
>  		pte_unmap_unlock(start_pte, ptl);
>  	}
>  	if (pageout)
> -		reclaim_pages(&folio_list);
> +		reclaim_pages(&folio_list, true);
>  	cond_resched();
>  
>  	return 0;
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 402c290fbf5a..ba2f37f46a73 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -2102,7 +2102,8 @@ static void shrink_active_list(unsigned long nr_to_scan,
>  }
>  
>  static unsigned int reclaim_folio_list(struct list_head *folio_list,
> -				      struct pglist_data *pgdat)
> +				      struct pglist_data *pgdat,
> +				      bool ignore_references)
>  {
>  	struct reclaim_stat dummy_stat;
>  	unsigned int nr_reclaimed;
> @@ -2115,7 +2116,7 @@ static unsigned int reclaim_folio_list(struct list_head *folio_list,
>  		.no_demotion = 1,
>  	};
>  
> -	nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &dummy_stat, false);
> +	nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &dummy_stat, ignore_references);
>  	while (!list_empty(folio_list)) {
>  		folio = lru_to_folio(folio_list);
>  		list_del(&folio->lru);
> @@ -2125,7 +2126,7 @@ static unsigned int reclaim_folio_list(struct list_head *folio_list,
>  	return nr_reclaimed;
>  }
>  
> -unsigned long reclaim_pages(struct list_head *folio_list)
> +unsigned long reclaim_pages(struct list_head *folio_list, bool ignore_references)
>  {
>  	int nid;
>  	unsigned int nr_reclaimed = 0;
> @@ -2147,11 +2148,12 @@ unsigned long reclaim_pages(struct list_head *folio_list)
>  			continue;
>  		}
>  
> -		nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid));
> +		nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid),
> +						   ignore_references);
>  		nid = folio_nid(lru_to_folio(folio_list));
>  	} while (!list_empty(folio_list));
>  
> -	nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid));
> +	nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid), ignore_references);
>  
>  	memalloc_noreclaim_restore(noreclaim_flag);
>  
> -- 
> 2.34.1
>
Barry Song Feb. 23, 2024, 10:20 p.m. UTC | #2
On Sat, Feb 24, 2024 at 11:09 AM Minchan Kim <minchan@kernel.org> wrote:
>
> Hi Barry,
>
> On Fri, Feb 23, 2024 at 05:15:50PM +1300, Barry Song wrote:
> > From: Barry Song <v-songbaohua@oppo.com>
> >
> > While doing MADV_PAGEOUT, the current code clears the PTE young bit
> > so that vmscan won't see the young flag, allowing the reclamation
> > of madvised folios to go ahead.
>
> Isn't it good to accelerate reclaiming? vmscan checks whether the
> page was accessed recently via the young bit in the pte and, if it
> was, it doesn't reclaim the page. Since we have cleared the young bit
> in the pte in madvise_pageout, vmscan is likely to reclaim the page
> since it wouldn't see the referenced_ptes from folio_check_references.

right, but the proposal is asking vmscan to skip folio_check_references
if this is a PAGEOUT. so we remove both the pte young clearing and the
rmap walk of folio_check_references.

>
> Could you clarify if I'm missing something here?

guess you missed that we are skipping folio_check_references now.
we remove both, thus making MADV_PAGEOUT 6% faster.
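
For reference, shrink_folio_list() already takes an ignore_references
argument (the "false" in the reclaim_folio_list() hunk above), and the
consumer side looks roughly like the sketch below. This is condensed from
mm/vmscan.c, so treat the exact control flow as an approximation:

	/* inside the per-folio loop of shrink_folio_list(), sketched */
	enum folio_references references = FOLIOREF_RECLAIM;

	if (!ignore_references)
		references = folio_check_references(folio, sc);

	switch (references) {
	case FOLIOREF_ACTIVATE:		/* recently referenced: keep it */
		goto activate_locked;
	case FOLIOREF_KEEP:
		goto keep_locked;
	case FOLIOREF_RECLAIM:
	case FOLIOREF_RECLAIM_CLEAN:
		break;			/* proceed with the reclaim */
	}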

>
>
> > It seems we can achieve the same by directly ignoring references,
> > which lets us remove the TLB flush in madvise and the rmap overhead
> > in vmscan.
> >
> > Regarding the side effect: in the original code, if a parallel
> > thread accesses the madvised memory while another thread is doing
> > madvise, folios get a chance to be re-activated by vmscan. With
> > this patch, they will still be reclaimed. But accessing memory
> > while simultaneously doing PAGEOUT on it is quite silly, much like
> > a DoS, so we probably don't need to care.
> >
> > A microbenchmark as below has shown a 6% reduction in the latency
> > of MADV_PAGEOUT,
> >
> >  #include <sys/mman.h>
> >
> >  #define PGSIZE 4096
> >  #define SIZE (512UL * 1024 * 1024)
> >
> >  int main(void)
> >  {
> >  	int i;
> >  	volatile long *p = mmap(NULL, SIZE, PROT_READ | PROT_WRITE,
> >  			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
> >
> >  	/* touch one long per page so every page is faulted in */
> >  	for (i = 0; i < SIZE / sizeof(long); i += PGSIZE / sizeof(long))
> >  		p[i] = 0x11;
> >
> >  	madvise((void *)p, SIZE, MADV_PAGEOUT);
> >  	return 0;
> >  }
> >
> > w/o patch                    w/ patch
> > root@10:~# time ./a.out      root@10:~# time ./a.out
> > real  0m49.634s            real   0m46.334s
> > user  0m0.637s             user   0m0.648s
> > sys   0m47.434s            sys    0m44.265s
> >
> > Signed-off-by: Barry Song <v-songbaohua@oppo.com>
> > ---
> >  mm/damon/paddr.c |  2 +-
> >  mm/internal.h    |  2 +-
> >  mm/madvise.c     |  8 ++++----
> >  mm/vmscan.c      | 12 +++++++-----
> >  4 files changed, 13 insertions(+), 11 deletions(-)
> >
> > diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c
> > index 081e2a325778..5e6dc312072c 100644
> > --- a/mm/damon/paddr.c
> > +++ b/mm/damon/paddr.c
> > @@ -249,7 +249,7 @@ static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s)
> >  put_folio:
> >               folio_put(folio);
> >       }
> > -     applied = reclaim_pages(&folio_list);
> > +     applied = reclaim_pages(&folio_list, false);
> >       cond_resched();
> >       return applied * PAGE_SIZE;
> >  }
> > diff --git a/mm/internal.h b/mm/internal.h
> > index 93e229112045..36c11ea41f47 100644
> > --- a/mm/internal.h
> > +++ b/mm/internal.h
> > @@ -868,7 +868,7 @@ extern unsigned long  __must_check vm_mmap_pgoff(struct file *, unsigned long,
> >          unsigned long, unsigned long);
> >
> >  extern void set_pageblock_order(void);
> > -unsigned long reclaim_pages(struct list_head *folio_list);
> > +unsigned long reclaim_pages(struct list_head *folio_list, bool ignore_references);
> >  unsigned int reclaim_clean_pages_from_list(struct zone *zone,
> >                                           struct list_head *folio_list);
> >  /* The ALLOC_WMARK bits are used as an index to zone->watermark */
> > diff --git a/mm/madvise.c b/mm/madvise.c
> > index abde3edb04f0..44a498c94158 100644
> > --- a/mm/madvise.c
> > +++ b/mm/madvise.c
> > @@ -386,7 +386,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
> >                       return 0;
> >               }
> >
> > -             if (pmd_young(orig_pmd)) {
> > +             if (!pageout && pmd_young(orig_pmd)) {
> >                       pmdp_invalidate(vma, addr, pmd);
> >                       orig_pmd = pmd_mkold(orig_pmd);
> >
> > @@ -410,7 +410,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
> >  huge_unlock:
> >               spin_unlock(ptl);
> >               if (pageout)
> > -                     reclaim_pages(&folio_list);
> > +                     reclaim_pages(&folio_list, true);
> >               return 0;
> >       }
> >
> > @@ -490,7 +490,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
> >
> >               VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
> >
> > -             if (pte_young(ptent)) {
> > +             if (!pageout && pte_young(ptent)) {
> >                       ptent = ptep_get_and_clear_full(mm, addr, pte,
> >                                                       tlb->fullmm);
> >                       ptent = pte_mkold(ptent);
> > @@ -524,7 +524,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
> >               pte_unmap_unlock(start_pte, ptl);
> >       }
> >       if (pageout)
> > -             reclaim_pages(&folio_list);
> > +             reclaim_pages(&folio_list, true);
> >       cond_resched();
> >
> >       return 0;
> > diff --git a/mm/vmscan.c b/mm/vmscan.c
> > index 402c290fbf5a..ba2f37f46a73 100644
> > --- a/mm/vmscan.c
> > +++ b/mm/vmscan.c
> > @@ -2102,7 +2102,8 @@ static void shrink_active_list(unsigned long nr_to_scan,
> >  }
> >
> >  static unsigned int reclaim_folio_list(struct list_head *folio_list,
> > -                                   struct pglist_data *pgdat)
> > +                                   struct pglist_data *pgdat,
> > +                                   bool ignore_references)
> >  {
> >       struct reclaim_stat dummy_stat;
> >       unsigned int nr_reclaimed;
> > @@ -2115,7 +2116,7 @@ static unsigned int reclaim_folio_list(struct list_head *folio_list,
> >               .no_demotion = 1,
> >       };
> >
> > -     nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &dummy_stat, false);
> > +     nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &dummy_stat, ignore_references);
> >       while (!list_empty(folio_list)) {
> >               folio = lru_to_folio(folio_list);
> >               list_del(&folio->lru);
> > @@ -2125,7 +2126,7 @@ static unsigned int reclaim_folio_list(struct list_head *folio_list,
> >       return nr_reclaimed;
> >  }
> >
> > -unsigned long reclaim_pages(struct list_head *folio_list)
> > +unsigned long reclaim_pages(struct list_head *folio_list, bool ignore_references)
> >  {
> >       int nid;
> >       unsigned int nr_reclaimed = 0;
> > @@ -2147,11 +2148,12 @@ unsigned long reclaim_pages(struct list_head *folio_list)
> >                       continue;
> >               }
> >
> > -             nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid));
> > +             nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid),
> > +                                                ignore_references);
> >               nid = folio_nid(lru_to_folio(folio_list));
> >       } while (!list_empty(folio_list));
> >
> > -     nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid));
> > +     nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid), ignore_references);
> >
> >       memalloc_noreclaim_restore(noreclaim_flag);
> >
> > --
> > 2.34.1
> >

Thanks
Barry
Minchan Kim Feb. 23, 2024, 11:24 p.m. UTC | #3
On Sat, Feb 24, 2024 at 11:20:36AM +1300, Barry Song wrote:
> On Sat, Feb 24, 2024 at 11:09 AM Minchan Kim <minchan@kernel.org> wrote:
> >
> > Hi Barry,
> >
> > On Fri, Feb 23, 2024 at 05:15:50PM +1300, Barry Song wrote:
> > > From: Barry Song <v-songbaohua@oppo.com>
> > >
> > > While doing MADV_PAGEOUT, the current code clears the PTE young bit
> > > so that vmscan won't see the young flag, allowing the reclamation
> > > of madvised folios to go ahead.
> >
> > Isn't it good to accelerate reclaiming? vmscan checks whether the
> > page was accessed recently via the young bit in the pte and, if it
> > was, it doesn't reclaim the page. Since we have cleared the young bit
> > in the pte in madvise_pageout, vmscan is likely to reclaim the page
> > since it wouldn't see the referenced_ptes from folio_check_references.
> 
> right, but the proposal is asking vmscan to skip folio_check_references
> if this is a PAGEOUT. so we remove both the pte young clearing and the
> rmap walk of folio_check_references.
> 
> >
> > Could you clarify if I'm missing something here?
> 
> guess you missed that we are skipping folio_check_references now.
> we remove both, thus making MADV_PAGEOUT 6% faster.

This makes sense to me.

My only concern was a race with mlock during the reclaim, but that race was
already there for normal page reclaiming. Thus, mlock would already handle it.

Thanks.
Barry Song Feb. 24, 2024, 4:37 a.m. UTC | #4
On Sat, Feb 24, 2024 at 7:24 AM Minchan Kim <minchan@kernel.org> wrote:
>
> On Sat, Feb 24, 2024 at 11:20:36AM +1300, Barry Song wrote:
> > On Sat, Feb 24, 2024 at 11:09 AM Minchan Kim <minchan@kernel.org> wrote:
> > >
> > > Hi Barry,
> > >
> > > On Fri, Feb 23, 2024 at 05:15:50PM +1300, Barry Song wrote:
> > > > From: Barry Song <v-songbaohua@oppo.com>
> > > >
> > > > While doing MADV_PAGEOUT, the current code clears the PTE young bit
> > > > so that vmscan won't see the young flag, allowing the reclamation
> > > > of madvised folios to go ahead.
> > >
> > > Isn't it good to accelerate reclaiming? vmscan checks whether the
> > > page was accessed recently via the young bit in the pte and, if it
> > > was, it doesn't reclaim the page. Since we have cleared the young bit
> > > in the pte in madvise_pageout, vmscan is likely to reclaim the page
> > > since it wouldn't see the referenced_ptes from folio_check_references.
> >
> > right, but the proposal is asking vmscan to skip folio_check_references
> > if this is a PAGEOUT. so we remove both the pte young clearing and the
> > rmap walk of folio_check_references.
> >
> > >
> > > Could you clarify if I'm missing something here?
> >
> > guess you missed that we are skipping folio_check_references now.
> > we remove both, thus making MADV_PAGEOUT 6% faster.
>
> This makes sense to me.
>
> My only concern was a race with mlock during the reclaim, but that race was
> already there for normal page reclaiming. Thus, mlock would already handle it.

yes. in try_to_unmap_one(), a folio in an mlock()'d vma is not reclaimed:

	while (page_vma_mapped_walk(&pvmw)) {
		/* Unexpected PMD-mapped THP? */
		VM_BUG_ON_FOLIO(!pvmw.pte, folio);

		/*
		 * If the folio is in an mlock()d vma, we must not swap it out.
		 */
		if (!(flags & TTU_IGNORE_MLOCK) &&
		    (vma->vm_flags & VM_LOCKED)) {
			/* Restore the mlock which got missed */
			if (!folio_test_large(folio))
				mlock_vma_folio(folio, vma);
			page_vma_mapped_walk_done(&pvmw);
			ret = false;
			break;
		}
		...
	}

BTW,
Hi SeongJae,
I am not quite sure if damon also needs this, so I have kept damon as is by
setting ignore_references = false.  MADV_PAGEOUT is an explicit hint that
users no longer need the memory and want it reclaimed; I don't know if the
same is true for damon. If you have some comments, please chime in.

>
> Thanks.

Thanks
Barry
SeongJae Park Feb. 24, 2024, 7:02 p.m. UTC | #5
On Fri, 23 Feb 2024 17:15:50 +1300 Barry Song <21cnbao@gmail.com> wrote:

> From: Barry Song <v-songbaohua@oppo.com>
> 
> While doing MADV_PAGEOUT, the current code clears the PTE young bit
> so that vmscan won't see the young flag, allowing the reclamation
> of madvised folios to go ahead.
> It seems we can achieve the same by directly ignoring references,
> which lets us remove the TLB flush in madvise and the rmap overhead
> in vmscan.
> 
> Regarding the side effect: in the original code, if a parallel
> thread accesses the madvised memory while another thread is doing
> madvise, folios get a chance to be re-activated by vmscan. With
> this patch, they will still be reclaimed. But accessing memory
> while simultaneously doing PAGEOUT on it is quite silly, much like
> a DoS, so we probably don't need to care.

I think we might need to take care of that case, since users may rely on just a
best-effort estimation like DAMON for the target pages.  In such cases, the
page-granularity re-check of the access could be helpful.  So I'm concerned this
could be a visible behavioral change for some valid use cases.

> 
> A microbenchmark as below has shown a 6% reduction in the latency
> of MADV_PAGEOUT,

I assume some users may use MADV_PAGEOUT for proactive reclamation of
memory.  In that use case, I think the latency of MADV_PAGEOUT might not be
that important.

Hence I think the cons of the behavioral change might outweigh the pros of the
latency improvement for such a best-effort proactive reclamation use case.  Hope
to hear and learn from others' opinions.

> 
>  #include <sys/mman.h>
> 
>  #define PGSIZE 4096
>  #define SIZE (512UL * 1024 * 1024)
> 
>  int main(void)
>  {
>  	int i;
>  	volatile long *p = mmap(NULL, SIZE, PROT_READ | PROT_WRITE,
>  			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
> 
>  	/* touch one long per page so every page is faulted in */
>  	for (i = 0; i < SIZE / sizeof(long); i += PGSIZE / sizeof(long))
>  		p[i] = 0x11;
> 
>  	madvise((void *)p, SIZE, MADV_PAGEOUT);
>  	return 0;
>  }
> 
> w/o patch                    w/ patch
> root@10:~# time ./a.out      root@10:~# time ./a.out
> real	0m49.634s            real   0m46.334s
> user	0m0.637s             user   0m0.648s
> sys	0m47.434s            sys    0m44.265s
> 
> Signed-off-by: Barry Song <v-songbaohua@oppo.com>


Thanks,
SJ

[...]
SeongJae Park Feb. 24, 2024, 7:07 p.m. UTC | #6
Hi Barry,

On Sat, 24 Feb 2024 12:37:59 +0800 Barry Song <21cnbao@gmail.com> wrote:

[...]
> 
> BTW,
> Hi SeongJae,
> I am not quite sure if damon also needs this, so I have kept damon as is by
> setting ignore_references = false.  MADV_PAGEOUT is an explicit hint that
> users no longer need the memory and want it reclaimed; I don't know if the
> same is true for damon. If you have some comments, please chime in.

Thank you for calling my name :)

For DAMON's usecase, the document simply says the behavior would be the same as
MADV_PAGEOUT, so if we conclude to change MADV_PAGEOUT, I think the same change
should be made for DAMON's usecase, or the DAMON document should be updated.


Thanks,
SJ

> 
> >
> > Thanks.
> 
> Thanks
> Barry
Barry Song Feb. 24, 2024, 7:50 p.m. UTC | #7
On Sun, Feb 25, 2024 at 3:02 AM SeongJae Park <sj@kernel.org> wrote:
>
> On Fri, 23 Feb 2024 17:15:50 +1300 Barry Song <21cnbao@gmail.com> wrote:
>
> > From: Barry Song <v-songbaohua@oppo.com>
> >
> > While doing MADV_PAGEOUT, the current code clears the PTE young bit
> > so that vmscan won't see the young flag, allowing the reclamation
> > of madvised folios to go ahead.
> > It seems we can achieve the same by directly ignoring references,
> > which lets us remove the TLB flush in madvise and the rmap overhead
> > in vmscan.
> >
> > Regarding the side effect: in the original code, if a parallel
> > thread accesses the madvised memory while another thread is doing
> > madvise, folios get a chance to be re-activated by vmscan. With
> > this patch, they will still be reclaimed. But accessing memory
> > while simultaneously doing PAGEOUT on it is quite silly, much like
> > a DoS, so we probably don't need to care.
>
> I think we might need to take care of that case, since users may rely on just a
> best-effort estimation like DAMON for the target pages.  In such cases, the
> page-granularity re-check of the access could be helpful.  So I'm concerned this
> could be a visible behavioral change for some valid use cases.

Hi SeongJae,

If you read the code of MADV_PAGEOUT, you will find it is not best-effort.
It clears pte young and, immediately after the ptes are cleared, reads the
ptes back and checks whether they are young; if not, it reclaims the folio.
So the purpose of clearing PTE young is to help the young check in
folio_check_references() return false. The gap between clearing the ptes
and re-checking them is quite small, at the microseconds level.
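
To make the sequence concrete, the pre-patch flow per pte was roughly the
below, condensed from madvise_cold_or_pageout_pte_range() (error handling
and the large-folio path are elided, so treat it as a sketch):

	/* old behaviour: clear young so the later re-check sees "old" */
	if (pte_young(ptent)) {
		ptent = ptep_get_and_clear_full(mm, addr, pte,
						tlb->fullmm);
		ptent = pte_mkold(ptent);
		set_pte_at(mm, addr, pte, ptent);
		tlb_remove_tlb_entry(tlb, pte, addr);	/* the TLB flush cost */
	}
	list_add(&folio->lru, &folio_list);
	...
	/* microseconds later, vmscan re-reads the ptes via the rmap */
	reclaim_pages(&folio_list);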

>
> >
> > A microbenchmark as below has shown a 6% reduction in the latency
> > of MADV_PAGEOUT,
>
> I assume some users may use MADV_PAGEOUT for proactive reclamation of
> memory.  In that use case, I think the latency of MADV_PAGEOUT might not be
> that important.
>
> Hence I think the cons of the behavioral change might outweigh the pros of the
> latency improvement for such a best-effort proactive reclamation use case.  Hope
> to hear and learn from others' opinions.

I don't see a behavioral change for MADV_PAGEOUT, as just the ping-pong is
removed. The only chance is that, in that very small time gap, somebody
accesses the cleared ptes and makes them young again; considering how small
this gap is, I don't think it is worth caring about. Thus, I don't see pros
of the re-check for the MADV_PAGEOUT case, while we improve the efficiency
of MADV_PAGEOUT and save power on Android phones.

>
> >
> >  #include <sys/mman.h>
> >
> >  #define PGSIZE 4096
> >  #define SIZE (512UL * 1024 * 1024)
> >
> >  int main(void)
> >  {
> >  	int i;
> >  	volatile long *p = mmap(NULL, SIZE, PROT_READ | PROT_WRITE,
> >  			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
> >
> >  	/* touch one long per page so every page is faulted in */
> >  	for (i = 0; i < SIZE / sizeof(long); i += PGSIZE / sizeof(long))
> >  		p[i] = 0x11;
> >
> >  	madvise((void *)p, SIZE, MADV_PAGEOUT);
> >  	return 0;
> >  }
> >
> > w/o patch                    w/ patch
> > root@10:~# time ./a.out      root@10:~# time ./a.out
> > real  0m49.634s            real   0m46.334s
> > user  0m0.637s             user   0m0.648s
> > sys   0m47.434s            sys    0m44.265s
> >
> > Signed-off-by: Barry Song <v-songbaohua@oppo.com>
>
>

Thanks
Barry
Barry Song Feb. 24, 2024, 8:01 p.m. UTC | #8
On Sun, Feb 25, 2024 at 3:07 AM SeongJae Park <sj@kernel.org> wrote:
>
> Hi Barry,
>
> On Sat, 24 Feb 2024 12:37:59 +0800 Barry Song <21cnbao@gmail.com> wrote:
>
> [...]
> >
> > BTW,
> > Hi SeongJae,
> > I am not quite sure if damon also needs this, so I have kept damon as is by
> > setting ignore_references = false.  MADV_PAGEOUT is an explicit hint that
> > users no longer need the memory and want it reclaimed; I don't know if the
> > same is true for damon. If you have some comments, please chime in.
>
> Thank you for calling my name :)
>
> For DAMON's usecase, the document simply says the behavior would be the same as
> MADV_PAGEOUT, so if we conclude to change MADV_PAGEOUT, I think the same change
> should be made for DAMON's usecase, or the DAMON document should be updated.

Hi SeongJae,

I don't find similar pte young clearing in damon_pa_pageout(), so I guess
damon's behaviour is actually different from MADV_PAGEOUT, which does the
pte-clearing. damon is probably best-effort, but MADV_PAGEOUT isn't:

static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s)
{
	unsigned long addr, applied;
	LIST_HEAD(folio_list);

	for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) {
		struct folio *folio = damon_get_folio(PHYS_PFN(addr));
		....

		if (damos_pa_filter_out(s, folio))
			goto put_folio;

		folio_clear_referenced(folio);
		folio_test_clear_young(folio);
		if (!folio_isolate_lru(folio))
			goto put_folio;
		if (folio_test_unevictable(folio))
			folio_putback_lru(folio);
		else
			list_add(&folio->lru, &folio_list);
put_folio:
		folio_put(folio);
	}
	applied = reclaim_pages(&folio_list);
	cond_resched();
	return applied * PAGE_SIZE;
}

am I missing something?

>
>
> Thanks,
> SJ
>

Thanks
Barry
SeongJae Park Feb. 24, 2024, 8:02 p.m. UTC | #9
On Sun, 25 Feb 2024 03:50:48 +0800 Barry Song <21cnbao@gmail.com> wrote:

> On Sun, Feb 25, 2024 at 3:02 AM SeongJae Park <sj@kernel.org> wrote:
> >
> > On Fri, 23 Feb 2024 17:15:50 +1300 Barry Song <21cnbao@gmail.com> wrote:
> >
> > > From: Barry Song <v-songbaohua@oppo.com>
> > >
> > > While doing MADV_PAGEOUT, the current code clears the PTE young bit
> > > so that vmscan won't see the young flag, allowing the reclamation
> > > of madvised folios to go ahead.
> > > It seems we can achieve the same by directly ignoring references,
> > > which lets us remove the TLB flush in madvise and the rmap overhead
> > > in vmscan.
> > >
> > > Regarding the side effect: in the original code, if a parallel
> > > thread accesses the madvised memory while another thread is doing
> > > madvise, folios get a chance to be re-activated by vmscan. With
> > > this patch, they will still be reclaimed. But accessing memory
> > > while simultaneously doing PAGEOUT on it is quite silly, much like
> > > a DoS, so we probably don't need to care.
> >
> > I think we might need to take care of that case, since users may rely on just a
> > best-effort estimation like DAMON for the target pages.  In such cases, the
> > page-granularity re-check of the access could be helpful.  So I'm concerned this
> > could be a visible behavioral change for some valid use cases.
> 
> Hi SeongJae,
> 
> If you read the code of MADV_PAGEOUT, you will find it is not best-effort.

I'm not talking about MADV_PAGEOUT itself, but about the logic of the user of
MADV_PAGEOUT, which is being used for finding the pages to reclaim.

> It clears pte young and, immediately after the ptes are cleared, reads the
> ptes back and checks whether they are young; if not, it reclaims the folio.
> So the purpose of clearing PTE young is to help the young check in
> folio_check_references() return false. The gap between clearing the ptes
> and re-checking them is quite small, at the microseconds level.
> 
> >
> > >
> > > A microbenchmark as below has shown a 6% reduction in the latency
> > > of MADV_PAGEOUT,
> >
> > I assume some users may use MADV_PAGEOUT for proactive reclamation of
> > memory.  In that use case, I think the latency of MADV_PAGEOUT might not be
> > that important.
> >
> > Hence I think the cons of the behavioral change might outweigh the pros of the
> > latency improvement for such a best-effort proactive reclamation use case.  Hope
> > to hear and learn from others' opinions.
> 
> I don't see a behavioral change for MADV_PAGEOUT, as just the ping-pong is
> removed. The only chance is that, in that very small time gap, somebody
> accesses the cleared ptes and makes them young again; considering how small
> this gap is, I don't think it is worth caring about. Thus, I don't see pros
> of the re-check for the MADV_PAGEOUT case, while we improve the efficiency
> of MADV_PAGEOUT and save power on Android phones.

Ok, I agree the time gap is small enough and the benefit could be significant
for such a use case.  Thank you for enlightening me with the nice examples and
the numbers :)


Thanks,
SJ

[...]
SeongJae Park Feb. 24, 2024, 8:12 p.m. UTC | #10
On Sat, 24 Feb 2024 11:07:23 -0800 SeongJae Park <sj@kernel.org> wrote:

> Hi Barry,
> 
> On Sat, 24 Feb 2024 12:37:59 +0800 Barry Song <21cnbao@gmail.com> wrote:
> 
> [...]
> > 
> > BTW,
> > Hi SeongJae,
> > I am not quite sure if damon also needs this, so I have kept damon as is by
> > setting ignore_references = false.  MADV_PAGEOUT is an explicit hint that
> > users no longer need the memory and want it reclaimed; I don't know if the
> > same is true for damon. If you have some comments, please chime in.
> 
> Thank you for calling my name :)
> 
> For DAMON's usecase, the document simply says the behavior would be the same as
> MADV_PAGEOUT, so if we conclude to change MADV_PAGEOUT, I think the same change
> should be made for DAMON's usecase, or the DAMON document should be updated.

Thanks to Barry's nice explanation in my other reply to the patch, I now think
the change is modest, and therefore I'd prefer the first way: changing DAMON's
usecase and keeping the document as is.


Thanks,
SJ

> 
> 
> Thanks,
> SJ
> 
> > 
> > >
> > > Thanks.
> > 
> > Thanks
> > Barry
Barry Song Feb. 24, 2024, 8:33 p.m. UTC | #11
On Sun, Feb 25, 2024 at 4:12 AM SeongJae Park <sj@kernel.org> wrote:
>
> On Sat, 24 Feb 2024 11:07:23 -0800 SeongJae Park <sj@kernel.org> wrote:
>
> > Hi Barry,
> >
> > On Sat, 24 Feb 2024 12:37:59 +0800 Barry Song <21cnbao@gmail.com> wrote:
> >
> > [...]
> > >
> > > BTW,
> > > Hi SeongJae,
> > > I am not quite sure if damon also needs this, so I have kept damon as is by
> > > setting ignore_references = false.  MADV_PAGEOUT is an explicit hint that
> > > users no longer need the memory and want it reclaimed; I don't know if the
> > > same is true for damon. If you have some comments, please chime in.
> >
> > Thank you for calling my name :)
> >
> > For DAMON's usecase, the document simply says the behavior would be the same as
> > MADV_PAGEOUT, so if we conclude to change MADV_PAGEOUT, I think the same change
> > should be made for DAMON's usecase, or the DAMON document should be updated.
>
> Thanks to Barry's nice explanation in my other reply to the patch, I now think
> the change is modest, and therefore I'd prefer the first way: changing DAMON's
> usecase and keeping the document as is.

Hi SeongJae,

thanks! I actually blindly voted for keeping DAMON's behaviour while slightly
updating the document, as I set ignore_references to false for the DAMON case
in the RFC :-)

--- a/mm/damon/paddr.c
+++ b/mm/damon/paddr.c
@@ -249,7 +249,7 @@ static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s)
 put_folio:
 		folio_put(folio);
 	}
-	applied = reclaim_pages(&folio_list);
+	applied = reclaim_pages(&folio_list, false);
 	cond_resched();
 	return applied * PAGE_SIZE;
 }

MADV_PAGEOUT comes from userspace, issued by a specific process to tell the
kernel to reclaim its own memory (it actually focuses on non-shared memory,
as it skips folios with mapcount > 1). The range is a virtual address range,
and the app knows it doesn't want to access that range in the foreseeable
future; the effect is limited to the app itself, not global.

In the DAMON case, it seems the range is a physical address range. If the PA
is mapped by more than one process, it seems safer to double-check in the
kernel, as it might affect multiple processes?

Please correct me if I am wrong.
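
(For reference, the mapcount-based skip mentioned above is roughly the
check below. This is a condensed sketch; the exact helper and its placement
in madvise_cold_or_pageout_pte_range() vary by kernel version:)

	/* sketch: MADV_PAGEOUT skips folios mapped by more than one process */
	if (folio_mapcount(folio) != 1)
		continue;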

>
>
> Thanks,
> SJ
>
> >
> >
> > Thanks,
> > SJ

Thanks
 Barry
SeongJae Park Feb. 24, 2024, 8:54 p.m. UTC | #12
On Sun, 25 Feb 2024 04:01:40 +0800 Barry Song <21cnbao@gmail.com> wrote:

> On Sun, Feb 25, 2024 at 3:07 AM SeongJae Park <sj@kernel.org> wrote:
> >
> > Hi Barry,
> >
> > On Sat, 24 Feb 2024 12:37:59 +0800 Barry Song <21cnbao@gmail.com> wrote:
> >
> > [...]
> > >
> > > BTW,
> > > Hi SeongJae,
> > > I am not quite sure if damon also needs this, so I have kept damon as is by
> > > setting ignore_references = false.  MADV_PAGEOUT is an explicit hint that
> > > users no longer need the memory and want it reclaimed; I don't know if the
> > > same is true for damon. If you have some comments, please chime in.
> >
> > Thank you for calling my name :)
> >
> > For DAMON's usecase, the document simply says the behavior would be the same as
> > MADV_PAGEOUT, so if we conclude to change MADV_PAGEOUT, I think the same change
> > should be made for DAMON's usecase, or the DAMON document should be updated.
> 
> Hi SeongJae,
> 
> I don't find similar pte young clearing in damon_pa_pageout(), so I guess
> damon's behaviour is actually different from MADV_PAGEOUT, which does the
> pte-clearing. damon is probably best-effort, but MADV_PAGEOUT isn't:
> 
> static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s)
> {
> 	unsigned long addr, applied;
> 	LIST_HEAD(folio_list);
> 
> 	for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) {
> 		struct folio *folio = damon_get_folio(PHYS_PFN(addr));
> 		....
> 
> 		if (damos_pa_filter_out(s, folio))
> 			goto put_folio;
> 
> 		folio_clear_referenced(folio);
> 		folio_test_clear_young(folio);
> 		if (!folio_isolate_lru(folio))
> 			goto put_folio;
> 		if (folio_test_unevictable(folio))
> 			folio_putback_lru(folio);
> 		else
> 			list_add(&folio->lru, &folio_list);
> put_folio:
> 		folio_put(folio);
> 	}
> 	applied = reclaim_pages(&folio_list);
> 	cond_resched();
> 	return applied * PAGE_SIZE;
> }
> 
> am I missing something?

Thank you for checking this again.  You're right.

Technically speaking, DAMON's usage of MADV_PAGEOUT is in vaddr.c.  paddr.c
uses not MADV_PAGEOUT but reclaim_pages().  The usage of reclaim_pages() from
paddr is different from that of MADV_PAGEOUT, since paddr doesn't clear PTEs.
I was confused by the difference between vaddr and paddr.  I actually wanted
to document the difference but haven't had time for that yet.  Thank you for
reminding me of this.

So, your change to MADV_PAGEOUT will affect vaddr, and I think that's ok.
Your change to reclaim_pages() could affect paddr, depending on the
additional parameter's value.  I now think it would be better to have no
effect there.  That is, let's keep the change for paddr.c in your patch as is.
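
In other words, after this patch the two callers end up as below (taken
from the diff above; paddr keeps the conservative re-check while madvise,
and thus DAMON's vaddr scheme, skips it):

	/* mm/damon/paddr.c: physical address scheme, keep re-checking */
	applied = reclaim_pages(&folio_list, false);

	/* mm/madvise.c (MADV_PAGEOUT, also used by DAMON vaddr): skip it */
	reclaim_pages(&folio_list, true);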


Thanks,
SJ

> 
> >
> >
> > Thanks,
> > SJ
> >
> 
> Thanks
> Barry
SeongJae Park Feb. 24, 2024, 9:02 p.m. UTC | #13
On Sun, 25 Feb 2024 04:33:25 +0800 Barry Song <21cnbao@gmail.com> wrote:

> On Sun, Feb 25, 2024 at 4:12 AM SeongJae Park <sj@kernel.org> wrote:
> >
> > On Sat, 24 Feb 2024 11:07:23 -0800 SeongJae Park <sj@kernel.org> wrote:
> >
> > > Hi Barry,
> > >
> > > On Sat, 24 Feb 2024 12:37:59 +0800 Barry Song <21cnbao@gmail.com> wrote:
> > >
> > > [...]
> > > >
> > > > BTW,
> > > > Hi SeongJae,
> > > > I am not quite sure if damon also needs this, so I have kept damon as is by
> > > > setting ignore_references = false.  MADV_PAGEOUT is an explicit hint that
> > > > users no longer need the memory and want it reclaimed; I don't know if the
> > > > same is true for damon. If you have some comments, please chime in.
> > >
> > > Thank you for calling my name :)
> > >
> > > For DAMON's usecase, the document simply says the behavior would be the same as
> > > MADV_PAGEOUT, so if we conclude to change MADV_PAGEOUT, I think the same change
> > > should be made for DAMON's usecase, or the DAMON document should be updated.
> >
> > Thanks to Barry's nice explanation in my other reply to the patch, I now think
> > the change is modest, and therefore I'd prefer the first way: changing DAMON's
> > usecase and keeping the document as is.
> 
> Hi SeongJae,
> 
> thanks! I actually blindly voted for keeping DAMON's behaviour while slightly
> updating the document, as I set ignore_references to false for the DAMON case
> in the RFC :-)
> 
> --- a/mm/damon/paddr.c
> +++ b/mm/damon/paddr.c
> @@ -249,7 +249,7 @@ static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s)
>  put_folio:
>  		folio_put(folio);
>  	}
> -	applied = reclaim_pages(&folio_list);
> +	applied = reclaim_pages(&folio_list, false);
>  	cond_resched();
>  	return applied * PAGE_SIZE;
>  }
> 
> MADV_PAGEOUT comes from userspace, issued by a specific process to tell the
> kernel to reclaim its own memory (it actually focuses on non-shared memory,
> as it skips folios with mapcount > 1). The range is a virtual address range,
> and the app knows it doesn't want to access that range in the foreseeable
> future; the effect is limited to the app itself, not global.
> 
> In the DAMON case, it seems the range is a physical address range. If the PA
> is mapped by more than one process, it seems safer to double-check in the
> kernel, as it might affect multiple processes?
> 
> Please correct me if I am wrong.

You're correct.  Please consider the below, from my previous reply[1], as my
opinion:

    let's keep the change for paddr.c in your patch as is.

[1] https://lore.kernel.org/r/20240224205453.47096-1-sj@kernel.org


Thanks,
SJ

> 
> >
> >
> > Thanks,
> > SJ
> >
> > >
> > >
> > > Thanks,
> > > SJ
> 
> Thanks
>  Barry
Barry Song Feb. 24, 2024, 9:54 p.m. UTC | #14
On Sun, Feb 25, 2024 at 9:54 AM SeongJae Park <sj@kernel.org> wrote:
>
> On Sun, 25 Feb 2024 04:01:40 +0800 Barry Song <21cnbao@gmail.com> wrote:
>
> > On Sun, Feb 25, 2024 at 3:07 AM SeongJae Park <sj@kernel.org> wrote:
> > >
> > > Hi Barry,
> > >
> > > On Sat, 24 Feb 2024 12:37:59 +0800 Barry Song <21cnbao@gmail.com> wrote:
> > >
> > > [...]
> > > >
> > > > BTW,
> > > > Hi SeongJae,
> > > > I am not quite sure if damon also needs this, so I have kept damon as is by
> > > > setting ignore_references = false.  MADV_PAGEOUT is an explicit hint that
> > > > users no longer need the memory and want it reclaimed; I don't know if the
> > > > same is true for damon. If you have some comments, please chime in.
> > >
> > > Thank you for calling my name :)
> > >
> > > For DAMON's usecase, the document simply says the behavior would be the same as
> > > MADV_PAGEOUT, so if we conclude to change MADV_PAGEOUT, I think the same change
> > > should be made for DAMON's usecase, or the DAMON document should be updated.
> >
> > Hi SeongJae,
> >
> > I don't find similar pte young clearing in damon_pa_pageout(), so I guess
> > damon's behaviour is actually different from MADV_PAGEOUT, which does the
> > pte-clearing. damon is probably best-effort, but MADV_PAGEOUT isn't:
> >
> > static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s)
> > {
> > 	unsigned long addr, applied;
> > 	LIST_HEAD(folio_list);
> >
> > 	for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) {
> > 		struct folio *folio = damon_get_folio(PHYS_PFN(addr));
> > 		....
> >
> > 		if (damos_pa_filter_out(s, folio))
> > 			goto put_folio;
> >
> > 		folio_clear_referenced(folio);
> > 		folio_test_clear_young(folio);
> > 		if (!folio_isolate_lru(folio))
> > 			goto put_folio;
> > 		if (folio_test_unevictable(folio))
> > 			folio_putback_lru(folio);
> > 		else
> > 			list_add(&folio->lru, &folio_list);
> > put_folio:
> > 		folio_put(folio);
> > 	}
> > 	applied = reclaim_pages(&folio_list);
> > 	cond_resched();
> > 	return applied * PAGE_SIZE;
> > }
> >
> > am I missing something?
>
> Thank you for checking this again.  You're right.
>
> Technically speaking, DAMON's usage of MADV_PAGEOUT is in vaddr.c.  paddr.c
> uses not MADV_PAGEOUT but reclaim_pages().  The usage of reclaim_pages() from
> paddr is different from that of MADV_PAGEOUT, since paddr doesn't clear PTEs.
> I was confused by the difference between vaddr and paddr.  I actually wanted
> to document the difference but haven't had time for that yet.  Thank you for
> reminding me of this.

Hi SeongJae,

thanks! I bravely had a go at fixing damon's doc[1], as it seems the fix is
needed anyway, regardless of whether we take my patch to optimize MADV_PAGEOUT.

[1] https://lore.kernel.org/linux-mm/20240224215023.5271-1-21cnbao@gmail.com/

>
> So, your change to MADV_PAGEOUT will affect vaddr, and I think that's ok.
> Your change to reclaim_pages() could affect paddr, depending on the
> additional parameter's value.  I now think it would be better to have no
> effect there.  That is, let's keep the change for paddr.c in your patch as is.

thanks! it seems everything is quite clear now.

>
>
> Thanks,
> SJ
>
> >
> > >
> > >
> > > Thanks,
> > > SJ
> > >
> >

Thanks
Barry
Patch

diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c
index 081e2a325778..5e6dc312072c 100644
--- a/mm/damon/paddr.c
+++ b/mm/damon/paddr.c
@@ -249,7 +249,7 @@  static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s)
 put_folio:
 		folio_put(folio);
 	}
-	applied = reclaim_pages(&folio_list);
+	applied = reclaim_pages(&folio_list, false);
 	cond_resched();
 	return applied * PAGE_SIZE;
 }
diff --git a/mm/internal.h b/mm/internal.h
index 93e229112045..36c11ea41f47 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -868,7 +868,7 @@  extern unsigned long  __must_check vm_mmap_pgoff(struct file *, unsigned long,
         unsigned long, unsigned long);
 
 extern void set_pageblock_order(void);
-unsigned long reclaim_pages(struct list_head *folio_list);
+unsigned long reclaim_pages(struct list_head *folio_list, bool ignore_references);
 unsigned int reclaim_clean_pages_from_list(struct zone *zone,
 					    struct list_head *folio_list);
 /* The ALLOC_WMARK bits are used as an index to zone->watermark */
diff --git a/mm/madvise.c b/mm/madvise.c
index abde3edb04f0..44a498c94158 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -386,7 +386,7 @@  static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
 			return 0;
 		}
 
-		if (pmd_young(orig_pmd)) {
+		if (!pageout && pmd_young(orig_pmd)) {
 			pmdp_invalidate(vma, addr, pmd);
 			orig_pmd = pmd_mkold(orig_pmd);
 
@@ -410,7 +410,7 @@  static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
 huge_unlock:
 		spin_unlock(ptl);
 		if (pageout)
-			reclaim_pages(&folio_list);
+			reclaim_pages(&folio_list, true);
 		return 0;
 	}
 
@@ -490,7 +490,7 @@  static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
 
 		VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
 
-		if (pte_young(ptent)) {
+		if (!pageout && pte_young(ptent)) {
 			ptent = ptep_get_and_clear_full(mm, addr, pte,
 							tlb->fullmm);
 			ptent = pte_mkold(ptent);
@@ -524,7 +524,7 @@  static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
 		pte_unmap_unlock(start_pte, ptl);
 	}
 	if (pageout)
-		reclaim_pages(&folio_list);
+		reclaim_pages(&folio_list, true);
 	cond_resched();
 
 	return 0;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 402c290fbf5a..ba2f37f46a73 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2102,7 +2102,8 @@  static void shrink_active_list(unsigned long nr_to_scan,
 }
 
 static unsigned int reclaim_folio_list(struct list_head *folio_list,
-				      struct pglist_data *pgdat)
+				      struct pglist_data *pgdat,
+				      bool ignore_references)
 {
 	struct reclaim_stat dummy_stat;
 	unsigned int nr_reclaimed;
@@ -2115,7 +2116,7 @@  static unsigned int reclaim_folio_list(struct list_head *folio_list,
 		.no_demotion = 1,
 	};
 
-	nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &dummy_stat, false);
+	nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &dummy_stat, ignore_references);
 	while (!list_empty(folio_list)) {
 		folio = lru_to_folio(folio_list);
 		list_del(&folio->lru);
@@ -2125,7 +2126,7 @@  static unsigned int reclaim_folio_list(struct list_head *folio_list,
 	return nr_reclaimed;
 }
 
-unsigned long reclaim_pages(struct list_head *folio_list)
+unsigned long reclaim_pages(struct list_head *folio_list, bool ignore_references)
 {
 	int nid;
 	unsigned int nr_reclaimed = 0;
@@ -2147,11 +2148,12 @@  unsigned long reclaim_pages(struct list_head *folio_list)
 			continue;
 		}
 
-		nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid));
+		nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid),
+						   ignore_references);
 		nid = folio_nid(lru_to_folio(folio_list));
 	} while (!list_empty(folio_list));
 
-	nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid));
+	nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid), ignore_references);
 
 	memalloc_noreclaim_restore(noreclaim_flag);