diff mbox series

[v3,4/4] mm: Avoid splitting pmd for lazyfree pmd-mapped THP in try_to_unmap

Message ID 20250115033808.40641-5-21cnbao@gmail.com (mailing list archive)
State New
Headers show
Series mm: batched unmap lazyfree large folios during reclamation | expand

Commit Message

Barry Song Jan. 15, 2025, 3:38 a.m. UTC
From: Barry Song <v-songbaohua@oppo.com>

The try_to_unmap_one() function currently handles PMD-mapped THPs
inefficiently. It first splits the PMD into PTEs, copies the dirty
state from the PMD to the PTEs, iterates over the PTEs to locate
the dirty state, and then marks the THP as swap-backed. This process
involves unnecessary PMD splitting and redundant iteration. Instead,
this functionality can be efficiently managed in
__discard_anon_folio_pmd_locked(), avoiding the extra steps and
improving performance.

The following microbenchmark redirties folios after invoking MADV_FREE,
then measures the time taken to perform memory reclamation (i.e., to mark
those redirtied folios as swap-backed again).

 #include <stdio.h>
 #include <sys/mman.h>
 #include <string.h>
 #include <time.h>

 #define SIZE 128*1024*1024  // 128 MB

 int main(int argc, char *argv[])
 {
 	while(1) {
 		/* Anonymous private mapping; reclaimed and remapped each round. */
 		volatile int *p = mmap(0, SIZE, PROT_READ | PROT_WRITE,
 				MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

 		/* Bail out with a diagnostic instead of faulting on (void *)-1. */
 		if (p == MAP_FAILED) {
 			perror("mmap");
 			return 1;
 		}

 		memset((void *)p, 1, SIZE);
 		madvise((void *)p, SIZE, MADV_FREE);
 		/* redirty after MADV_FREE */
 		memset((void *)p, 1, SIZE);

 		clock_t start_time = clock();
 		madvise((void *)p, SIZE, MADV_PAGEOUT);
 		clock_t end_time = clock();

 		double elapsed_time = (double)(end_time - start_time) / CLOCKS_PER_SEC;
 		printf("Time taken by reclamation: %f seconds\n", elapsed_time);

 		munmap((void *)p, SIZE);
 	}
 	return 0;
 }

Testing results are as below,
w/o patch:
~ # ./a.out
Time taken by reclamation: 0.007300 seconds
Time taken by reclamation: 0.007226 seconds
Time taken by reclamation: 0.007295 seconds
Time taken by reclamation: 0.007731 seconds
Time taken by reclamation: 0.007134 seconds
Time taken by reclamation: 0.007285 seconds
Time taken by reclamation: 0.007720 seconds
Time taken by reclamation: 0.007128 seconds
Time taken by reclamation: 0.007710 seconds
Time taken by reclamation: 0.007712 seconds
Time taken by reclamation: 0.007236 seconds
Time taken by reclamation: 0.007690 seconds
Time taken by reclamation: 0.007174 seconds
Time taken by reclamation: 0.007670 seconds
Time taken by reclamation: 0.007169 seconds
Time taken by reclamation: 0.007305 seconds
Time taken by reclamation: 0.007432 seconds
Time taken by reclamation: 0.007158 seconds
Time taken by reclamation: 0.007133 seconds
…

w/ patch

~ # ./a.out
Time taken by reclamation: 0.002124 seconds
Time taken by reclamation: 0.002116 seconds
Time taken by reclamation: 0.002150 seconds
Time taken by reclamation: 0.002261 seconds
Time taken by reclamation: 0.002137 seconds
Time taken by reclamation: 0.002173 seconds
Time taken by reclamation: 0.002063 seconds
Time taken by reclamation: 0.002088 seconds
Time taken by reclamation: 0.002169 seconds
Time taken by reclamation: 0.002124 seconds
Time taken by reclamation: 0.002111 seconds
Time taken by reclamation: 0.002224 seconds
Time taken by reclamation: 0.002297 seconds
Time taken by reclamation: 0.002260 seconds
Time taken by reclamation: 0.002246 seconds
Time taken by reclamation: 0.002272 seconds
Time taken by reclamation: 0.002277 seconds
Time taken by reclamation: 0.002462 seconds
…

This patch significantly speeds up try_to_unmap_one() by allowing it
to skip redirtied THPs without splitting the PMD.

Suggested-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Suggested-by: Lance Yang <ioworker0@gmail.com>
Signed-off-by: Barry Song <v-songbaohua@oppo.com>
---
 mm/huge_memory.c | 24 +++++++++++++++++-------
 mm/rmap.c        | 13 ++++++++++---
 2 files changed, 27 insertions(+), 10 deletions(-)

Comments

Lance Yang Jan. 15, 2025, 5:01 a.m. UTC | #1
On Wed, Jan 15, 2025 at 11:38 AM Barry Song <21cnbao@gmail.com> wrote:
>
> From: Barry Song <v-songbaohua@oppo.com>
>
> The try_to_unmap_one() function currently handles PMD-mapped THPs
> inefficiently. It first splits the PMD into PTEs, copies the dirty
> state from the PMD to the PTEs, iterates over the PTEs to locate
> the dirty state, and then marks the THP as swap-backed. This process
> involves unnecessary PMD splitting and redundant iteration. Instead,
> this functionality can be efficiently managed in
> __discard_anon_folio_pmd_locked(), avoiding the extra steps and
> improving performance.
>
> The following microbenchmark redirties folios after invoking MADV_FREE,
> then measures the time taken to perform memory reclamation (actually
> set those folios swapbacked again) on the redirtied folios.
>
>  #include <stdio.h>
>  #include <sys/mman.h>
>  #include <string.h>
>  #include <time.h>
>
>  #define SIZE 128*1024*1024  // 128 MB
>
>  int main(int argc, char *argv[])
>  {
>         while(1) {
>                 volatile int *p = mmap(0, SIZE, PROT_READ | PROT_WRITE,
>                                 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
>
>                 memset((void *)p, 1, SIZE);
>                 madvise((void *)p, SIZE, MADV_FREE);
>                 /* redirty after MADV_FREE */
>                 memset((void *)p, 1, SIZE);
>
>                 clock_t start_time = clock();
>                 madvise((void *)p, SIZE, MADV_PAGEOUT);
>                 clock_t end_time = clock();
>
>                 double elapsed_time = (double)(end_time - start_time) / CLOCKS_PER_SEC;
>                 printf("Time taken by reclamation: %f seconds\n", elapsed_time);
>
>                 munmap((void *)p, SIZE);
>         }
>         return 0;
>  }
>
> Testing results are as below,
> w/o patch:
> ~ # ./a.out
> Time taken by reclamation: 0.007300 seconds
> Time taken by reclamation: 0.007226 seconds
> Time taken by reclamation: 0.007295 seconds
> Time taken by reclamation: 0.007731 seconds
> Time taken by reclamation: 0.007134 seconds
> Time taken by reclamation: 0.007285 seconds
> Time taken by reclamation: 0.007720 seconds
> Time taken by reclamation: 0.007128 seconds
> Time taken by reclamation: 0.007710 seconds
> Time taken by reclamation: 0.007712 seconds
> Time taken by reclamation: 0.007236 seconds
> Time taken by reclamation: 0.007690 seconds
> Time taken by reclamation: 0.007174 seconds
> Time taken by reclamation: 0.007670 seconds
> Time taken by reclamation: 0.007169 seconds
> Time taken by reclamation: 0.007305 seconds
> Time taken by reclamation: 0.007432 seconds
> Time taken by reclamation: 0.007158 seconds
> Time taken by reclamation: 0.007133 seconds
> …
>
> w/ patch
>
> ~ # ./a.out
> Time taken by reclamation: 0.002124 seconds
> Time taken by reclamation: 0.002116 seconds
> Time taken by reclamation: 0.002150 seconds
> Time taken by reclamation: 0.002261 seconds
> Time taken by reclamation: 0.002137 seconds
> Time taken by reclamation: 0.002173 seconds
> Time taken by reclamation: 0.002063 seconds
> Time taken by reclamation: 0.002088 seconds
> Time taken by reclamation: 0.002169 seconds
> Time taken by reclamation: 0.002124 seconds
> Time taken by reclamation: 0.002111 seconds
> Time taken by reclamation: 0.002224 seconds
> Time taken by reclamation: 0.002297 seconds
> Time taken by reclamation: 0.002260 seconds
> Time taken by reclamation: 0.002246 seconds
> Time taken by reclamation: 0.002272 seconds
> Time taken by reclamation: 0.002277 seconds
> Time taken by reclamation: 0.002462 seconds
> …
>
> This patch significantly speeds up try_to_unmap_one() by allowing it
> to skip redirtied THPs without splitting the PMD.
>
> Suggested-by: Baolin Wang <baolin.wang@linux.alibaba.com>
> Suggested-by: Lance Yang <ioworker0@gmail.com>
> Signed-off-by: Barry Song <v-songbaohua@oppo.com>
> ---
>  mm/huge_memory.c | 24 +++++++++++++++++-------
>  mm/rmap.c        | 13 ++++++++++---
>  2 files changed, 27 insertions(+), 10 deletions(-)
>
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 3d3ebdc002d5..47cc8c3f8f80 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -3070,8 +3070,12 @@ static bool __discard_anon_folio_pmd_locked(struct vm_area_struct *vma,
>         int ref_count, map_count;
>         pmd_t orig_pmd = *pmdp;
>
> -       if (folio_test_dirty(folio) || pmd_dirty(orig_pmd))
> +       if (pmd_dirty(orig_pmd))
> +               folio_set_dirty(folio);
> +       if (folio_test_dirty(folio) && !(vma->vm_flags & VM_DROPPABLE)) {
> +               folio_set_swapbacked(folio);
>                 return false;
> +       }

If either the PMD or the folio is dirty, should we just return false right away,
regardless of VM_DROPPABLE? There’s no need to proceed further in that
case, IMHO ;)

Thanks,
Lance

>
>         orig_pmd = pmdp_huge_clear_flush(vma, addr, pmdp);
>
> @@ -3098,8 +3102,15 @@ static bool __discard_anon_folio_pmd_locked(struct vm_area_struct *vma,
>          *
>          * The only folio refs must be one from isolation plus the rmap(s).
>          */
> -       if (folio_test_dirty(folio) || pmd_dirty(orig_pmd) ||
> -           ref_count != map_count + 1) {
> +       if (pmd_dirty(orig_pmd))
> +               folio_set_dirty(folio);
> +       if (folio_test_dirty(folio) && !(vma->vm_flags & VM_DROPPABLE)) {
> +               folio_set_swapbacked(folio);
> +               set_pmd_at(mm, addr, pmdp, orig_pmd);
> +               return false;
> +       }
> +
> +       if (ref_count != map_count + 1) {
>                 set_pmd_at(mm, addr, pmdp, orig_pmd);
>                 return false;
>         }
> @@ -3119,12 +3130,11 @@ bool unmap_huge_pmd_locked(struct vm_area_struct *vma, unsigned long addr,
>  {
>         VM_WARN_ON_FOLIO(!folio_test_pmd_mappable(folio), folio);
>         VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
> +       VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
> +       VM_WARN_ON_FOLIO(folio_test_swapbacked(folio), folio);
>         VM_WARN_ON_ONCE(!IS_ALIGNED(addr, HPAGE_PMD_SIZE));
>
> -       if (folio_test_anon(folio) && !folio_test_swapbacked(folio))
> -               return __discard_anon_folio_pmd_locked(vma, addr, pmdp, folio);
> -
> -       return false;
> +       return __discard_anon_folio_pmd_locked(vma, addr, pmdp, folio);
>  }
>
>  static void remap_page(struct folio *folio, unsigned long nr, int flags)
> diff --git a/mm/rmap.c b/mm/rmap.c
> index be1978d2712d..a859c399ec7c 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -1724,9 +1724,16 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
>                 }
>
>                 if (!pvmw.pte) {
> -                       if (unmap_huge_pmd_locked(vma, pvmw.address, pvmw.pmd,
> -                                                 folio))
> -                               goto walk_done;
> +                       if (folio_test_anon(folio) && !folio_test_swapbacked(folio)) {
> +                               if (unmap_huge_pmd_locked(vma, pvmw.address, pvmw.pmd, folio))
> +                                       goto walk_done;
> +                               /*
> +                                * unmap_huge_pmd_locked has either already marked
> +                                * the folio as swap-backed or decided to retain it
> +                                * due to GUP or speculative references.
> +                                */
> +                               goto walk_abort;
> +                       }
>
>                         if (flags & TTU_SPLIT_HUGE_PMD) {
>                                 /*
> --
> 2.39.3 (Apple Git-146)
>
Barry Song Jan. 15, 2025, 5:09 a.m. UTC | #2
On Wed, Jan 15, 2025 at 6:01 PM Lance Yang <ioworker0@gmail.com> wrote:
>
> On Wed, Jan 15, 2025 at 11:38 AM Barry Song <21cnbao@gmail.com> wrote:
> >
> > From: Barry Song <v-songbaohua@oppo.com>
> >
> > The try_to_unmap_one() function currently handles PMD-mapped THPs
> > inefficiently. It first splits the PMD into PTEs, copies the dirty
> > state from the PMD to the PTEs, iterates over the PTEs to locate
> > the dirty state, and then marks the THP as swap-backed. This process
> > involves unnecessary PMD splitting and redundant iteration. Instead,
> > this functionality can be efficiently managed in
> > __discard_anon_folio_pmd_locked(), avoiding the extra steps and
> > improving performance.
> >
> > The following microbenchmark redirties folios after invoking MADV_FREE,
> > then measures the time taken to perform memory reclamation (actually
> > set those folios swapbacked again) on the redirtied folios.
> >
> >  #include <stdio.h>
> >  #include <sys/mman.h>
> >  #include <string.h>
> >  #include <time.h>
> >
> >  #define SIZE 128*1024*1024  // 128 MB
> >
> >  int main(int argc, char *argv[])
> >  {
> >         while(1) {
> >                 volatile int *p = mmap(0, SIZE, PROT_READ | PROT_WRITE,
> >                                 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
> >
> >                 memset((void *)p, 1, SIZE);
> >                 madvise((void *)p, SIZE, MADV_FREE);
> >                 /* redirty after MADV_FREE */
> >                 memset((void *)p, 1, SIZE);
> >
> >                 clock_t start_time = clock();
> >                 madvise((void *)p, SIZE, MADV_PAGEOUT);
> >                 clock_t end_time = clock();
> >
> >                 double elapsed_time = (double)(end_time - start_time) / CLOCKS_PER_SEC;
> >                 printf("Time taken by reclamation: %f seconds\n", elapsed_time);
> >
> >                 munmap((void *)p, SIZE);
> >         }
> >         return 0;
> >  }
> >
> > Testing results are as below,
> > w/o patch:
> > ~ # ./a.out
> > Time taken by reclamation: 0.007300 seconds
> > Time taken by reclamation: 0.007226 seconds
> > Time taken by reclamation: 0.007295 seconds
> > Time taken by reclamation: 0.007731 seconds
> > Time taken by reclamation: 0.007134 seconds
> > Time taken by reclamation: 0.007285 seconds
> > Time taken by reclamation: 0.007720 seconds
> > Time taken by reclamation: 0.007128 seconds
> > Time taken by reclamation: 0.007710 seconds
> > Time taken by reclamation: 0.007712 seconds
> > Time taken by reclamation: 0.007236 seconds
> > Time taken by reclamation: 0.007690 seconds
> > Time taken by reclamation: 0.007174 seconds
> > Time taken by reclamation: 0.007670 seconds
> > Time taken by reclamation: 0.007169 seconds
> > Time taken by reclamation: 0.007305 seconds
> > Time taken by reclamation: 0.007432 seconds
> > Time taken by reclamation: 0.007158 seconds
> > Time taken by reclamation: 0.007133 seconds
> > …
> >
> > w/ patch
> >
> > ~ # ./a.out
> > Time taken by reclamation: 0.002124 seconds
> > Time taken by reclamation: 0.002116 seconds
> > Time taken by reclamation: 0.002150 seconds
> > Time taken by reclamation: 0.002261 seconds
> > Time taken by reclamation: 0.002137 seconds
> > Time taken by reclamation: 0.002173 seconds
> > Time taken by reclamation: 0.002063 seconds
> > Time taken by reclamation: 0.002088 seconds
> > Time taken by reclamation: 0.002169 seconds
> > Time taken by reclamation: 0.002124 seconds
> > Time taken by reclamation: 0.002111 seconds
> > Time taken by reclamation: 0.002224 seconds
> > Time taken by reclamation: 0.002297 seconds
> > Time taken by reclamation: 0.002260 seconds
> > Time taken by reclamation: 0.002246 seconds
> > Time taken by reclamation: 0.002272 seconds
> > Time taken by reclamation: 0.002277 seconds
> > Time taken by reclamation: 0.002462 seconds
> > …
> >
> > This patch significantly speeds up try_to_unmap_one() by allowing it
> > to skip redirtied THPs without splitting the PMD.
> >
> > Suggested-by: Baolin Wang <baolin.wang@linux.alibaba.com>
> > Suggested-by: Lance Yang <ioworker0@gmail.com>
> > Signed-off-by: Barry Song <v-songbaohua@oppo.com>
> > ---
> >  mm/huge_memory.c | 24 +++++++++++++++++-------
> >  mm/rmap.c        | 13 ++++++++++---
> >  2 files changed, 27 insertions(+), 10 deletions(-)
> >
> > diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> > index 3d3ebdc002d5..47cc8c3f8f80 100644
> > --- a/mm/huge_memory.c
> > +++ b/mm/huge_memory.c
> > @@ -3070,8 +3070,12 @@ static bool __discard_anon_folio_pmd_locked(struct vm_area_struct *vma,
> >         int ref_count, map_count;
> >         pmd_t orig_pmd = *pmdp;
> >
> > -       if (folio_test_dirty(folio) || pmd_dirty(orig_pmd))
> > +       if (pmd_dirty(orig_pmd))
> > +               folio_set_dirty(folio);
> > +       if (folio_test_dirty(folio) && !(vma->vm_flags & VM_DROPPABLE)) {
> > +               folio_set_swapbacked(folio);
> >                 return false;
> > +       }
>
> If either the PMD or the folio is dirty, should we just return false right away,
> regardless of VM_DROPPABLE? There’s no need to proceed further in that
> case, IMHO ;)

I don't quite understand you, but we need to proceed to clear pmd entry.
if vm_droppable is true, even if the folio is dirty, we still drop the folio.

>
> Thanks,
> Lance
>
> >
> >         orig_pmd = pmdp_huge_clear_flush(vma, addr, pmdp);
> >
> > @@ -3098,8 +3102,15 @@ static bool __discard_anon_folio_pmd_locked(struct vm_area_struct *vma,
> >          *
> >          * The only folio refs must be one from isolation plus the rmap(s).
> >          */
> > -       if (folio_test_dirty(folio) || pmd_dirty(orig_pmd) ||
> > -           ref_count != map_count + 1) {
> > +       if (pmd_dirty(orig_pmd))
> > +               folio_set_dirty(folio);
> > +       if (folio_test_dirty(folio) && !(vma->vm_flags & VM_DROPPABLE)) {
> > +               folio_set_swapbacked(folio);
> > +               set_pmd_at(mm, addr, pmdp, orig_pmd);
> > +               return false;
> > +       }
> > +
> > +       if (ref_count != map_count + 1) {
> >                 set_pmd_at(mm, addr, pmdp, orig_pmd);
> >                 return false;
> >         }
> > @@ -3119,12 +3130,11 @@ bool unmap_huge_pmd_locked(struct vm_area_struct *vma, unsigned long addr,
> >  {
> >         VM_WARN_ON_FOLIO(!folio_test_pmd_mappable(folio), folio);
> >         VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
> > +       VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
> > +       VM_WARN_ON_FOLIO(folio_test_swapbacked(folio), folio);
> >         VM_WARN_ON_ONCE(!IS_ALIGNED(addr, HPAGE_PMD_SIZE));
> >
> > -       if (folio_test_anon(folio) && !folio_test_swapbacked(folio))
> > -               return __discard_anon_folio_pmd_locked(vma, addr, pmdp, folio);
> > -
> > -       return false;
> > +       return __discard_anon_folio_pmd_locked(vma, addr, pmdp, folio);
> >  }
> >
> >  static void remap_page(struct folio *folio, unsigned long nr, int flags)
> > diff --git a/mm/rmap.c b/mm/rmap.c
> > index be1978d2712d..a859c399ec7c 100644
> > --- a/mm/rmap.c
> > +++ b/mm/rmap.c
> > @@ -1724,9 +1724,16 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
> >                 }
> >
> >                 if (!pvmw.pte) {
> > -                       if (unmap_huge_pmd_locked(vma, pvmw.address, pvmw.pmd,
> > -                                                 folio))
> > -                               goto walk_done;
> > +                       if (folio_test_anon(folio) && !folio_test_swapbacked(folio)) {
> > +                               if (unmap_huge_pmd_locked(vma, pvmw.address, pvmw.pmd, folio))
> > +                                       goto walk_done;
> > +                               /*
> > +                                * unmap_huge_pmd_locked has either already marked
> > +                                * the folio as swap-backed or decided to retain it
> > +                                * due to GUP or speculative references.
> > +                                */
> > +                               goto walk_abort;
> > +                       }
> >
> >                         if (flags & TTU_SPLIT_HUGE_PMD) {
> >                                 /*
> > --
> > 2.39.3 (Apple Git-146)
> >
Lance Yang Jan. 15, 2025, 5:41 a.m. UTC | #3
On Wed, Jan 15, 2025 at 1:09 PM Barry Song <21cnbao@gmail.com> wrote:
>
> On Wed, Jan 15, 2025 at 6:01 PM Lance Yang <ioworker0@gmail.com> wrote:
> >
> > On Wed, Jan 15, 2025 at 11:38 AM Barry Song <21cnbao@gmail.com> wrote:
> > >
> > > From: Barry Song <v-songbaohua@oppo.com>
> > >
> > > The try_to_unmap_one() function currently handles PMD-mapped THPs
> > > inefficiently. It first splits the PMD into PTEs, copies the dirty
> > > state from the PMD to the PTEs, iterates over the PTEs to locate
> > > the dirty state, and then marks the THP as swap-backed. This process
> > > involves unnecessary PMD splitting and redundant iteration. Instead,
> > > this functionality can be efficiently managed in
> > > __discard_anon_folio_pmd_locked(), avoiding the extra steps and
> > > improving performance.
> > >
> > > The following microbenchmark redirties folios after invoking MADV_FREE,
> > > then measures the time taken to perform memory reclamation (actually
> > > set those folios swapbacked again) on the redirtied folios.
> > >
> > >  #include <stdio.h>
> > >  #include <sys/mman.h>
> > >  #include <string.h>
> > >  #include <time.h>
> > >
> > >  #define SIZE 128*1024*1024  // 128 MB
> > >
> > >  int main(int argc, char *argv[])
> > >  {
> > >         while(1) {
> > >                 volatile int *p = mmap(0, SIZE, PROT_READ | PROT_WRITE,
> > >                                 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
> > >
> > >                 memset((void *)p, 1, SIZE);
> > >                 madvise((void *)p, SIZE, MADV_FREE);
> > >                 /* redirty after MADV_FREE */
> > >                 memset((void *)p, 1, SIZE);
> > >
> > >                 clock_t start_time = clock();
> > >                 madvise((void *)p, SIZE, MADV_PAGEOUT);
> > >                 clock_t end_time = clock();
> > >
> > >                 double elapsed_time = (double)(end_time - start_time) / CLOCKS_PER_SEC;
> > >                 printf("Time taken by reclamation: %f seconds\n", elapsed_time);
> > >
> > >                 munmap((void *)p, SIZE);
> > >         }
> > >         return 0;
> > >  }
> > >
> > > Testing results are as below,
> > > w/o patch:
> > > ~ # ./a.out
> > > Time taken by reclamation: 0.007300 seconds
> > > Time taken by reclamation: 0.007226 seconds
> > > Time taken by reclamation: 0.007295 seconds
> > > Time taken by reclamation: 0.007731 seconds
> > > Time taken by reclamation: 0.007134 seconds
> > > Time taken by reclamation: 0.007285 seconds
> > > Time taken by reclamation: 0.007720 seconds
> > > Time taken by reclamation: 0.007128 seconds
> > > Time taken by reclamation: 0.007710 seconds
> > > Time taken by reclamation: 0.007712 seconds
> > > Time taken by reclamation: 0.007236 seconds
> > > Time taken by reclamation: 0.007690 seconds
> > > Time taken by reclamation: 0.007174 seconds
> > > Time taken by reclamation: 0.007670 seconds
> > > Time taken by reclamation: 0.007169 seconds
> > > Time taken by reclamation: 0.007305 seconds
> > > Time taken by reclamation: 0.007432 seconds
> > > Time taken by reclamation: 0.007158 seconds
> > > Time taken by reclamation: 0.007133 seconds
> > > …
> > >
> > > w/ patch
> > >
> > > ~ # ./a.out
> > > Time taken by reclamation: 0.002124 seconds
> > > Time taken by reclamation: 0.002116 seconds
> > > Time taken by reclamation: 0.002150 seconds
> > > Time taken by reclamation: 0.002261 seconds
> > > Time taken by reclamation: 0.002137 seconds
> > > Time taken by reclamation: 0.002173 seconds
> > > Time taken by reclamation: 0.002063 seconds
> > > Time taken by reclamation: 0.002088 seconds
> > > Time taken by reclamation: 0.002169 seconds
> > > Time taken by reclamation: 0.002124 seconds
> > > Time taken by reclamation: 0.002111 seconds
> > > Time taken by reclamation: 0.002224 seconds
> > > Time taken by reclamation: 0.002297 seconds
> > > Time taken by reclamation: 0.002260 seconds
> > > Time taken by reclamation: 0.002246 seconds
> > > Time taken by reclamation: 0.002272 seconds
> > > Time taken by reclamation: 0.002277 seconds
> > > Time taken by reclamation: 0.002462 seconds
> > > …
> > >
> > > This patch significantly speeds up try_to_unmap_one() by allowing it
> > > to skip redirtied THPs without splitting the PMD.
> > >
> > > Suggested-by: Baolin Wang <baolin.wang@linux.alibaba.com>
> > > Suggested-by: Lance Yang <ioworker0@gmail.com>
> > > Signed-off-by: Barry Song <v-songbaohua@oppo.com>
> > > ---
> > >  mm/huge_memory.c | 24 +++++++++++++++++-------
> > >  mm/rmap.c        | 13 ++++++++++---
> > >  2 files changed, 27 insertions(+), 10 deletions(-)
> > >
> > > diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> > > index 3d3ebdc002d5..47cc8c3f8f80 100644
> > > --- a/mm/huge_memory.c
> > > +++ b/mm/huge_memory.c
> > > @@ -3070,8 +3070,12 @@ static bool __discard_anon_folio_pmd_locked(struct vm_area_struct *vma,
> > >         int ref_count, map_count;
> > >         pmd_t orig_pmd = *pmdp;
> > >
> > > -       if (folio_test_dirty(folio) || pmd_dirty(orig_pmd))
> > > +       if (pmd_dirty(orig_pmd))
> > > +               folio_set_dirty(folio);
> > > +       if (folio_test_dirty(folio) && !(vma->vm_flags & VM_DROPPABLE)) {
> > > +               folio_set_swapbacked(folio);
> > >                 return false;
> > > +       }
> >
> > If either the PMD or the folio is dirty, should we just return false right away,
> > regardless of VM_DROPPABLE? There’s no need to proceed further in that
> > case, IMHO ;)
>
> I don't quite understand you, but we need to proceed to clear pmd entry.
> if vm_droppable is true, even if the folio is dirty, we still drop the folio.

Ah, you're right, and I completely got it wrong ;(

Thanks,
Lance

>
> >
> > Thanks,
> > Lance
> >
> > >
> > >         orig_pmd = pmdp_huge_clear_flush(vma, addr, pmdp);
> > >
> > > @@ -3098,8 +3102,15 @@ static bool __discard_anon_folio_pmd_locked(struct vm_area_struct *vma,
> > >          *
> > >          * The only folio refs must be one from isolation plus the rmap(s).
> > >          */
> > > -       if (folio_test_dirty(folio) || pmd_dirty(orig_pmd) ||
> > > -           ref_count != map_count + 1) {
> > > +       if (pmd_dirty(orig_pmd))
> > > +               folio_set_dirty(folio);
> > > +       if (folio_test_dirty(folio) && !(vma->vm_flags & VM_DROPPABLE)) {
> > > +               folio_set_swapbacked(folio);
> > > +               set_pmd_at(mm, addr, pmdp, orig_pmd);
> > > +               return false;
> > > +       }
> > > +
> > > +       if (ref_count != map_count + 1) {
> > >                 set_pmd_at(mm, addr, pmdp, orig_pmd);
> > >                 return false;
> > >         }
> > > @@ -3119,12 +3130,11 @@ bool unmap_huge_pmd_locked(struct vm_area_struct *vma, unsigned long addr,
> > >  {
> > >         VM_WARN_ON_FOLIO(!folio_test_pmd_mappable(folio), folio);
> > >         VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
> > > +       VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
> > > +       VM_WARN_ON_FOLIO(folio_test_swapbacked(folio), folio);
> > >         VM_WARN_ON_ONCE(!IS_ALIGNED(addr, HPAGE_PMD_SIZE));
> > >
> > > -       if (folio_test_anon(folio) && !folio_test_swapbacked(folio))
> > > -               return __discard_anon_folio_pmd_locked(vma, addr, pmdp, folio);
> > > -
> > > -       return false;
> > > +       return __discard_anon_folio_pmd_locked(vma, addr, pmdp, folio);
> > >  }
> > >
> > >  static void remap_page(struct folio *folio, unsigned long nr, int flags)
> > > diff --git a/mm/rmap.c b/mm/rmap.c
> > > index be1978d2712d..a859c399ec7c 100644
> > > --- a/mm/rmap.c
> > > +++ b/mm/rmap.c
> > > @@ -1724,9 +1724,16 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
> > >                 }
> > >
> > >                 if (!pvmw.pte) {
> > > -                       if (unmap_huge_pmd_locked(vma, pvmw.address, pvmw.pmd,
> > > -                                                 folio))
> > > -                               goto walk_done;
> > > +                       if (folio_test_anon(folio) && !folio_test_swapbacked(folio)) {
> > > +                               if (unmap_huge_pmd_locked(vma, pvmw.address, pvmw.pmd, folio))
> > > +                                       goto walk_done;
> > > +                               /*
> > > +                                * unmap_huge_pmd_locked has either already marked
> > > +                                * the folio as swap-backed or decided to retain it
> > > +                                * due to GUP or speculative references.
> > > +                                */
> > > +                               goto walk_abort;
> > > +                       }
> > >
> > >                         if (flags & TTU_SPLIT_HUGE_PMD) {
> > >                                 /*
> > > --
> > > 2.39.3 (Apple Git-146)
> > >
Lance Yang Jan. 15, 2025, 6:26 a.m. UTC | #4
On Wed, Jan 15, 2025 at 1:09 PM Barry Song <21cnbao@gmail.com> wrote:
>
> On Wed, Jan 15, 2025 at 6:01 PM Lance Yang <ioworker0@gmail.com> wrote:
> >
> > On Wed, Jan 15, 2025 at 11:38 AM Barry Song <21cnbao@gmail.com> wrote:
> > >
> > > From: Barry Song <v-songbaohua@oppo.com>
> > >
> > > The try_to_unmap_one() function currently handles PMD-mapped THPs
> > > inefficiently. It first splits the PMD into PTEs, copies the dirty
> > > state from the PMD to the PTEs, iterates over the PTEs to locate
> > > the dirty state, and then marks the THP as swap-backed. This process
> > > involves unnecessary PMD splitting and redundant iteration. Instead,
> > > this functionality can be efficiently managed in
> > > __discard_anon_folio_pmd_locked(), avoiding the extra steps and
> > > improving performance.
> > >
> > > The following microbenchmark redirties folios after invoking MADV_FREE,
> > > then measures the time taken to perform memory reclamation (actually
> > > set those folios swapbacked again) on the redirtied folios.
> > >
> > >  #include <stdio.h>
> > >  #include <sys/mman.h>
> > >  #include <string.h>
> > >  #include <time.h>
> > >
> > >  #define SIZE 128*1024*1024  // 128 MB
> > >
> > >  int main(int argc, char *argv[])
> > >  {
> > >         while(1) {
> > >                 volatile int *p = mmap(0, SIZE, PROT_READ | PROT_WRITE,
> > >                                 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
> > >
> > >                 memset((void *)p, 1, SIZE);
> > >                 madvise((void *)p, SIZE, MADV_FREE);
> > >                 /* redirty after MADV_FREE */
> > >                 memset((void *)p, 1, SIZE);
> > >
> > >                 clock_t start_time = clock();
> > >                 madvise((void *)p, SIZE, MADV_PAGEOUT);
> > >                 clock_t end_time = clock();
> > >
> > >                 double elapsed_time = (double)(end_time - start_time) / CLOCKS_PER_SEC;
> > >                 printf("Time taken by reclamation: %f seconds\n", elapsed_time);
> > >
> > >                 munmap((void *)p, SIZE);
> > >         }
> > >         return 0;
> > >  }
> > >
> > > Testing results are as below,
> > > w/o patch:
> > > ~ # ./a.out
> > > Time taken by reclamation: 0.007300 seconds
> > > Time taken by reclamation: 0.007226 seconds
> > > Time taken by reclamation: 0.007295 seconds
> > > Time taken by reclamation: 0.007731 seconds
> > > Time taken by reclamation: 0.007134 seconds
> > > Time taken by reclamation: 0.007285 seconds
> > > Time taken by reclamation: 0.007720 seconds
> > > Time taken by reclamation: 0.007128 seconds
> > > Time taken by reclamation: 0.007710 seconds
> > > Time taken by reclamation: 0.007712 seconds
> > > Time taken by reclamation: 0.007236 seconds
> > > Time taken by reclamation: 0.007690 seconds
> > > Time taken by reclamation: 0.007174 seconds
> > > Time taken by reclamation: 0.007670 seconds
> > > Time taken by reclamation: 0.007169 seconds
> > > Time taken by reclamation: 0.007305 seconds
> > > Time taken by reclamation: 0.007432 seconds
> > > Time taken by reclamation: 0.007158 seconds
> > > Time taken by reclamation: 0.007133 seconds
> > > …
> > >
> > > w/ patch
> > >
> > > ~ # ./a.out
> > > Time taken by reclamation: 0.002124 seconds
> > > Time taken by reclamation: 0.002116 seconds
> > > Time taken by reclamation: 0.002150 seconds
> > > Time taken by reclamation: 0.002261 seconds
> > > Time taken by reclamation: 0.002137 seconds
> > > Time taken by reclamation: 0.002173 seconds
> > > Time taken by reclamation: 0.002063 seconds
> > > Time taken by reclamation: 0.002088 seconds
> > > Time taken by reclamation: 0.002169 seconds
> > > Time taken by reclamation: 0.002124 seconds
> > > Time taken by reclamation: 0.002111 seconds
> > > Time taken by reclamation: 0.002224 seconds
> > > Time taken by reclamation: 0.002297 seconds
> > > Time taken by reclamation: 0.002260 seconds
> > > Time taken by reclamation: 0.002246 seconds
> > > Time taken by reclamation: 0.002272 seconds
> > > Time taken by reclamation: 0.002277 seconds
> > > Time taken by reclamation: 0.002462 seconds
> > > …
> > >
> > > This patch significantly speeds up try_to_unmap_one() by allowing it
> > > to skip redirtied THPs without splitting the PMD.
> > >
> > > Suggested-by: Baolin Wang <baolin.wang@linux.alibaba.com>
> > > Suggested-by: Lance Yang <ioworker0@gmail.com>
> > > Signed-off-by: Barry Song <v-songbaohua@oppo.com>
> > > ---
> > >  mm/huge_memory.c | 24 +++++++++++++++++-------
> > >  mm/rmap.c        | 13 ++++++++++---
> > >  2 files changed, 27 insertions(+), 10 deletions(-)
> > >
> > > diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> > > index 3d3ebdc002d5..47cc8c3f8f80 100644
> > > --- a/mm/huge_memory.c
> > > +++ b/mm/huge_memory.c
> > > @@ -3070,8 +3070,12 @@ static bool __discard_anon_folio_pmd_locked(struct vm_area_struct *vma,
> > >         int ref_count, map_count;
> > >         pmd_t orig_pmd = *pmdp;
> > >
> > > -       if (folio_test_dirty(folio) || pmd_dirty(orig_pmd))
> > > +       if (pmd_dirty(orig_pmd))
> > > +               folio_set_dirty(folio);
> > > +       if (folio_test_dirty(folio) && !(vma->vm_flags & VM_DROPPABLE)) {
> > > +               folio_set_swapbacked(folio);
> > >                 return false;
> > > +       }
> >
> > If either the PMD or the folio is dirty, should we just return false right away,
> > regardless of VM_DROPPABLE? There’s no need to proceed further in that
> > case, IMHO ;)
>
> I don't quite understand you, but we need to proceed to clear pmd entry.
> if vm_droppable is true, even if the folio is dirty, we still drop the folio.

Hey Barry,

One thing I still don’t quite understand is as follows:

One of the semantics of VM_DROPPABLE is that, under memory pressure,
the kernel can drop the pages. Similarly, for MADV_FREE, one of its
semantics is that the kernel can free the pages when memory pressure
occurs, but only if there is no subsequent write (i.e., the PMD is clean).

So, if VM_DROPPABLE is true, we still drop the folio even if it's dirty. This
seems to conflict with the semantics of MADV_FREE, which requires the
folio or PMD to be clean before being dropped. wdyt?

Thanks,
Lance




>
> >
> > Thanks,
> > Lance
> >
> > >
> > >         orig_pmd = pmdp_huge_clear_flush(vma, addr, pmdp);
> > >
> > > @@ -3098,8 +3102,15 @@ static bool __discard_anon_folio_pmd_locked(struct vm_area_struct *vma,
> > >          *
> > >          * The only folio refs must be one from isolation plus the rmap(s).
> > >          */
> > > -       if (folio_test_dirty(folio) || pmd_dirty(orig_pmd) ||
> > > -           ref_count != map_count + 1) {
> > > +       if (pmd_dirty(orig_pmd))
> > > +               folio_set_dirty(folio);
> > > +       if (folio_test_dirty(folio) && !(vma->vm_flags & VM_DROPPABLE)) {
> > > +               folio_set_swapbacked(folio);
> > > +               set_pmd_at(mm, addr, pmdp, orig_pmd);
> > > +               return false;
> > > +       }
> > > +
> > > +       if (ref_count != map_count + 1) {
> > >                 set_pmd_at(mm, addr, pmdp, orig_pmd);
> > >                 return false;
> > >         }
> > > @@ -3119,12 +3130,11 @@ bool unmap_huge_pmd_locked(struct vm_area_struct *vma, unsigned long addr,
> > >  {
> > >         VM_WARN_ON_FOLIO(!folio_test_pmd_mappable(folio), folio);
> > >         VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
> > > +       VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
> > > +       VM_WARN_ON_FOLIO(folio_test_swapbacked(folio), folio);
> > >         VM_WARN_ON_ONCE(!IS_ALIGNED(addr, HPAGE_PMD_SIZE));
> > >
> > > -       if (folio_test_anon(folio) && !folio_test_swapbacked(folio))
> > > -               return __discard_anon_folio_pmd_locked(vma, addr, pmdp, folio);
> > > -
> > > -       return false;
> > > +       return __discard_anon_folio_pmd_locked(vma, addr, pmdp, folio);
> > >  }
> > >
> > >  static void remap_page(struct folio *folio, unsigned long nr, int flags)
> > > diff --git a/mm/rmap.c b/mm/rmap.c
> > > index be1978d2712d..a859c399ec7c 100644
> > > --- a/mm/rmap.c
> > > +++ b/mm/rmap.c
> > > @@ -1724,9 +1724,16 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
> > >                 }
> > >
> > >                 if (!pvmw.pte) {
> > > -                       if (unmap_huge_pmd_locked(vma, pvmw.address, pvmw.pmd,
> > > -                                                 folio))
> > > -                               goto walk_done;
> > > +                       if (folio_test_anon(folio) && !folio_test_swapbacked(folio)) {
> > > +                               if (unmap_huge_pmd_locked(vma, pvmw.address, pvmw.pmd, folio))
> > > +                                       goto walk_done;
> > > +                               /*
> > > +                                * unmap_huge_pmd_locked has either already marked
> > > +                                * the folio as swap-backed or decided to retain it
> > > +                                * due to GUP or speculative references.
> > > +                                */
> > > +                               goto walk_abort;
> > > +                       }
> > >
> > >                         if (flags & TTU_SPLIT_HUGE_PMD) {
> > >                                 /*
> > > --
> > > 2.39.3 (Apple Git-146)
> > >
Barry Song Jan. 15, 2025, 6:42 a.m. UTC | #5
On Wed, Jan 15, 2025 at 7:26 PM Lance Yang <ioworker0@gmail.com> wrote:
>
> On Wed, Jan 15, 2025 at 1:09 PM Barry Song <21cnbao@gmail.com> wrote:
> >
> > On Wed, Jan 15, 2025 at 6:01 PM Lance Yang <ioworker0@gmail.com> wrote:
> > >
> > > On Wed, Jan 15, 2025 at 11:38 AM Barry Song <21cnbao@gmail.com> wrote:
> > > >
> > > > From: Barry Song <v-songbaohua@oppo.com>
> > > >
> > > > The try_to_unmap_one() function currently handles PMD-mapped THPs
> > > > inefficiently. It first splits the PMD into PTEs, copies the dirty
> > > > state from the PMD to the PTEs, iterates over the PTEs to locate
> > > > the dirty state, and then marks the THP as swap-backed. This process
> > > > involves unnecessary PMD splitting and redundant iteration. Instead,
> > > > this functionality can be efficiently managed in
> > > > __discard_anon_folio_pmd_locked(), avoiding the extra steps and
> > > > improving performance.
> > > >
> > > > The following microbenchmark redirties folios after invoking MADV_FREE,
> > > > then measures the time taken to perform memory reclamation (actually
> > > > set those folios swapbacked again) on the redirtied folios.
> > > >
> > > >  #include <stdio.h>
> > > >  #include <sys/mman.h>
> > > >  #include <string.h>
> > > >  #include <time.h>
> > > >
> > > >  #define SIZE 128*1024*1024  // 128 MB
> > > >
> > > >  int main(int argc, char *argv[])
> > > >  {
> > > >         while(1) {
> > > >                 volatile int *p = mmap(0, SIZE, PROT_READ | PROT_WRITE,
> > > >                                 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
> > > >
> > > >                 memset((void *)p, 1, SIZE);
> > > >                 madvise((void *)p, SIZE, MADV_FREE);
> > > >                 /* redirty after MADV_FREE */
> > > >                 memset((void *)p, 1, SIZE);
> > > >
> > > >                 clock_t start_time = clock();
> > > >                 madvise((void *)p, SIZE, MADV_PAGEOUT);
> > > >                 clock_t end_time = clock();
> > > >
> > > >                 double elapsed_time = (double)(end_time - start_time) / CLOCKS_PER_SEC;
> > > >                 printf("Time taken by reclamation: %f seconds\n", elapsed_time);
> > > >
> > > >                 munmap((void *)p, SIZE);
> > > >         }
> > > >         return 0;
> > > >  }
> > > >
> > > > Testing results are as below,
> > > > w/o patch:
> > > > ~ # ./a.out
> > > > Time taken by reclamation: 0.007300 seconds
> > > > Time taken by reclamation: 0.007226 seconds
> > > > Time taken by reclamation: 0.007295 seconds
> > > > Time taken by reclamation: 0.007731 seconds
> > > > Time taken by reclamation: 0.007134 seconds
> > > > Time taken by reclamation: 0.007285 seconds
> > > > Time taken by reclamation: 0.007720 seconds
> > > > Time taken by reclamation: 0.007128 seconds
> > > > Time taken by reclamation: 0.007710 seconds
> > > > Time taken by reclamation: 0.007712 seconds
> > > > Time taken by reclamation: 0.007236 seconds
> > > > Time taken by reclamation: 0.007690 seconds
> > > > Time taken by reclamation: 0.007174 seconds
> > > > Time taken by reclamation: 0.007670 seconds
> > > > Time taken by reclamation: 0.007169 seconds
> > > > Time taken by reclamation: 0.007305 seconds
> > > > Time taken by reclamation: 0.007432 seconds
> > > > Time taken by reclamation: 0.007158 seconds
> > > > Time taken by reclamation: 0.007133 seconds
> > > > …
> > > >
> > > > w/ patch
> > > >
> > > > ~ # ./a.out
> > > > Time taken by reclamation: 0.002124 seconds
> > > > Time taken by reclamation: 0.002116 seconds
> > > > Time taken by reclamation: 0.002150 seconds
> > > > Time taken by reclamation: 0.002261 seconds
> > > > Time taken by reclamation: 0.002137 seconds
> > > > Time taken by reclamation: 0.002173 seconds
> > > > Time taken by reclamation: 0.002063 seconds
> > > > Time taken by reclamation: 0.002088 seconds
> > > > Time taken by reclamation: 0.002169 seconds
> > > > Time taken by reclamation: 0.002124 seconds
> > > > Time taken by reclamation: 0.002111 seconds
> > > > Time taken by reclamation: 0.002224 seconds
> > > > Time taken by reclamation: 0.002297 seconds
> > > > Time taken by reclamation: 0.002260 seconds
> > > > Time taken by reclamation: 0.002246 seconds
> > > > Time taken by reclamation: 0.002272 seconds
> > > > Time taken by reclamation: 0.002277 seconds
> > > > Time taken by reclamation: 0.002462 seconds
> > > > …
> > > >
> > > > This patch significantly speeds up try_to_unmap_one() by allowing it
> > > > to skip redirtied THPs without splitting the PMD.
> > > >
> > > > Suggested-by: Baolin Wang <baolin.wang@linux.alibaba.com>
> > > > Suggested-by: Lance Yang <ioworker0@gmail.com>
> > > > Signed-off-by: Barry Song <v-songbaohua@oppo.com>
> > > > ---
> > > >  mm/huge_memory.c | 24 +++++++++++++++++-------
> > > >  mm/rmap.c        | 13 ++++++++++---
> > > >  2 files changed, 27 insertions(+), 10 deletions(-)
> > > >
> > > > diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> > > > index 3d3ebdc002d5..47cc8c3f8f80 100644
> > > > --- a/mm/huge_memory.c
> > > > +++ b/mm/huge_memory.c
> > > > @@ -3070,8 +3070,12 @@ static bool __discard_anon_folio_pmd_locked(struct vm_area_struct *vma,
> > > >         int ref_count, map_count;
> > > >         pmd_t orig_pmd = *pmdp;
> > > >
> > > > -       if (folio_test_dirty(folio) || pmd_dirty(orig_pmd))
> > > > +       if (pmd_dirty(orig_pmd))
> > > > +               folio_set_dirty(folio);
> > > > +       if (folio_test_dirty(folio) && !(vma->vm_flags & VM_DROPPABLE)) {
> > > > +               folio_set_swapbacked(folio);
> > > >                 return false;
> > > > +       }
> > >
> > > If either the PMD or the folio is dirty, should we just return false right away,
> > > regardless of VM_DROPPABLE? There’s no need to proceed further in that
> > > case, IMHO ;)
> >
> > I don't quite understand you, but we need to proceed to clear pmd entry.
> > if vm_droppable is true, even if the folio is dirty, we still drop the folio.
>
> Hey barry,
>
> One thing I still don’t quite understand is as follows:
>
> One of the semantics of VM_DROPPABLE is that, under memory pressure,
> the kernel can drop the pages. Similarly, for MADV_FREE, one of its
> semantics is that the kernel can free the pages when memory pressure
> occurs, but only if there is no subsequent write (i.e., the PMD is clean).
>
> So, if VM_DROPPABLE is true, we still drop the folio even if it's dirty. This
> seems to conflict with the semantics of MADV_FREE, which requires the
> folio or PMD to be clean before being dropped. wdyt?

I think we can simply revisit the patch where VM_DROPPABLE was introduced:
commit 9651fcedf7b92d3f7
"mm: add MAP_DROPPABLE for designating always lazily freeable mappings"

>
> Thanks,
> Lance
>
>
>
>
> >
> > >
> > > Thanks,
> > > Lance
> > >
> > > >
> > > >         orig_pmd = pmdp_huge_clear_flush(vma, addr, pmdp);
> > > >
> > > > @@ -3098,8 +3102,15 @@ static bool __discard_anon_folio_pmd_locked(struct vm_area_struct *vma,
> > > >          *
> > > >          * The only folio refs must be one from isolation plus the rmap(s).
> > > >          */
> > > > -       if (folio_test_dirty(folio) || pmd_dirty(orig_pmd) ||
> > > > -           ref_count != map_count + 1) {
> > > > +       if (pmd_dirty(orig_pmd))
> > > > +               folio_set_dirty(folio);
> > > > +       if (folio_test_dirty(folio) && !(vma->vm_flags & VM_DROPPABLE)) {
> > > > +               folio_set_swapbacked(folio);
> > > > +               set_pmd_at(mm, addr, pmdp, orig_pmd);
> > > > +               return false;
> > > > +       }
> > > > +
> > > > +       if (ref_count != map_count + 1) {
> > > >                 set_pmd_at(mm, addr, pmdp, orig_pmd);
> > > >                 return false;
> > > >         }
> > > > @@ -3119,12 +3130,11 @@ bool unmap_huge_pmd_locked(struct vm_area_struct *vma, unsigned long addr,
> > > >  {
> > > >         VM_WARN_ON_FOLIO(!folio_test_pmd_mappable(folio), folio);
> > > >         VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
> > > > +       VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
> > > > +       VM_WARN_ON_FOLIO(folio_test_swapbacked(folio), folio);
> > > >         VM_WARN_ON_ONCE(!IS_ALIGNED(addr, HPAGE_PMD_SIZE));
> > > >
> > > > -       if (folio_test_anon(folio) && !folio_test_swapbacked(folio))
> > > > -               return __discard_anon_folio_pmd_locked(vma, addr, pmdp, folio);
> > > > -
> > > > -       return false;
> > > > +       return __discard_anon_folio_pmd_locked(vma, addr, pmdp, folio);
> > > >  }
> > > >
> > > >  static void remap_page(struct folio *folio, unsigned long nr, int flags)
> > > > diff --git a/mm/rmap.c b/mm/rmap.c
> > > > index be1978d2712d..a859c399ec7c 100644
> > > > --- a/mm/rmap.c
> > > > +++ b/mm/rmap.c
> > > > @@ -1724,9 +1724,16 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
> > > >                 }
> > > >
> > > >                 if (!pvmw.pte) {
> > > > -                       if (unmap_huge_pmd_locked(vma, pvmw.address, pvmw.pmd,
> > > > -                                                 folio))
> > > > -                               goto walk_done;
> > > > +                       if (folio_test_anon(folio) && !folio_test_swapbacked(folio)) {
> > > > +                               if (unmap_huge_pmd_locked(vma, pvmw.address, pvmw.pmd, folio))
> > > > +                                       goto walk_done;
> > > > +                               /*
> > > > +                                * unmap_huge_pmd_locked has either already marked
> > > > +                                * the folio as swap-backed or decided to retain it
> > > > +                                * due to GUP or speculative references.
> > > > +                                */
> > > > +                               goto walk_abort;
> > > > +                       }
> > > >
> > > >                         if (flags & TTU_SPLIT_HUGE_PMD) {
> > > >                                 /*
> > > > --
> > > > 2.39.3 (Apple Git-146)
> > > >

Thanks
Barry
Lance Yang Jan. 15, 2025, 7:01 a.m. UTC | #6
On Wed, Jan 15, 2025 at 2:42 PM Barry Song <21cnbao@gmail.com> wrote:
>
> On Wed, Jan 15, 2025 at 7:26 PM Lance Yang <ioworker0@gmail.com> wrote:
> >
> > On Wed, Jan 15, 2025 at 1:09 PM Barry Song <21cnbao@gmail.com> wrote:
> > >
> > > On Wed, Jan 15, 2025 at 6:01 PM Lance Yang <ioworker0@gmail.com> wrote:
> > > >
> > > > On Wed, Jan 15, 2025 at 11:38 AM Barry Song <21cnbao@gmail.com> wrote:
> > > > >
> > > > > From: Barry Song <v-songbaohua@oppo.com>
> > > > >
> > > > > The try_to_unmap_one() function currently handles PMD-mapped THPs
> > > > > inefficiently. It first splits the PMD into PTEs, copies the dirty
> > > > > state from the PMD to the PTEs, iterates over the PTEs to locate
> > > > > the dirty state, and then marks the THP as swap-backed. This process
> > > > > involves unnecessary PMD splitting and redundant iteration. Instead,
> > > > > this functionality can be efficiently managed in
> > > > > __discard_anon_folio_pmd_locked(), avoiding the extra steps and
> > > > > improving performance.
> > > > >
> > > > > The following microbenchmark redirties folios after invoking MADV_FREE,
> > > > > then measures the time taken to perform memory reclamation (actually
> > > > > set those folios swapbacked again) on the redirtied folios.
> > > > >
> > > > >  #include <stdio.h>
> > > > >  #include <sys/mman.h>
> > > > >  #include <string.h>
> > > > >  #include <time.h>
> > > > >
> > > > >  #define SIZE 128*1024*1024  // 128 MB
> > > > >
> > > > >  int main(int argc, char *argv[])
> > > > >  {
> > > > >         while(1) {
> > > > >                 volatile int *p = mmap(0, SIZE, PROT_READ | PROT_WRITE,
> > > > >                                 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
> > > > >
> > > > >                 memset((void *)p, 1, SIZE);
> > > > >                 madvise((void *)p, SIZE, MADV_FREE);
> > > > >                 /* redirty after MADV_FREE */
> > > > >                 memset((void *)p, 1, SIZE);
> > > > >
> > > > >                 clock_t start_time = clock();
> > > > >                 madvise((void *)p, SIZE, MADV_PAGEOUT);
> > > > >                 clock_t end_time = clock();
> > > > >
> > > > >                 double elapsed_time = (double)(end_time - start_time) / CLOCKS_PER_SEC;
> > > > >                 printf("Time taken by reclamation: %f seconds\n", elapsed_time);
> > > > >
> > > > >                 munmap((void *)p, SIZE);
> > > > >         }
> > > > >         return 0;
> > > > >  }
> > > > >
> > > > > Testing results are as below,
> > > > > w/o patch:
> > > > > ~ # ./a.out
> > > > > Time taken by reclamation: 0.007300 seconds
> > > > > Time taken by reclamation: 0.007226 seconds
> > > > > Time taken by reclamation: 0.007295 seconds
> > > > > Time taken by reclamation: 0.007731 seconds
> > > > > Time taken by reclamation: 0.007134 seconds
> > > > > Time taken by reclamation: 0.007285 seconds
> > > > > Time taken by reclamation: 0.007720 seconds
> > > > > Time taken by reclamation: 0.007128 seconds
> > > > > Time taken by reclamation: 0.007710 seconds
> > > > > Time taken by reclamation: 0.007712 seconds
> > > > > Time taken by reclamation: 0.007236 seconds
> > > > > Time taken by reclamation: 0.007690 seconds
> > > > > Time taken by reclamation: 0.007174 seconds
> > > > > Time taken by reclamation: 0.007670 seconds
> > > > > Time taken by reclamation: 0.007169 seconds
> > > > > Time taken by reclamation: 0.007305 seconds
> > > > > Time taken by reclamation: 0.007432 seconds
> > > > > Time taken by reclamation: 0.007158 seconds
> > > > > Time taken by reclamation: 0.007133 seconds
> > > > > …
> > > > >
> > > > > w/ patch
> > > > >
> > > > > ~ # ./a.out
> > > > > Time taken by reclamation: 0.002124 seconds
> > > > > Time taken by reclamation: 0.002116 seconds
> > > > > Time taken by reclamation: 0.002150 seconds
> > > > > Time taken by reclamation: 0.002261 seconds
> > > > > Time taken by reclamation: 0.002137 seconds
> > > > > Time taken by reclamation: 0.002173 seconds
> > > > > Time taken by reclamation: 0.002063 seconds
> > > > > Time taken by reclamation: 0.002088 seconds
> > > > > Time taken by reclamation: 0.002169 seconds
> > > > > Time taken by reclamation: 0.002124 seconds
> > > > > Time taken by reclamation: 0.002111 seconds
> > > > > Time taken by reclamation: 0.002224 seconds
> > > > > Time taken by reclamation: 0.002297 seconds
> > > > > Time taken by reclamation: 0.002260 seconds
> > > > > Time taken by reclamation: 0.002246 seconds
> > > > > Time taken by reclamation: 0.002272 seconds
> > > > > Time taken by reclamation: 0.002277 seconds
> > > > > Time taken by reclamation: 0.002462 seconds
> > > > > …
> > > > >
> > > > > This patch significantly speeds up try_to_unmap_one() by allowing it
> > > > > to skip redirtied THPs without splitting the PMD.
> > > > >
> > > > > Suggested-by: Baolin Wang <baolin.wang@linux.alibaba.com>
> > > > > Suggested-by: Lance Yang <ioworker0@gmail.com>
> > > > > Signed-off-by: Barry Song <v-songbaohua@oppo.com>
> > > > > ---
> > > > >  mm/huge_memory.c | 24 +++++++++++++++++-------
> > > > >  mm/rmap.c        | 13 ++++++++++---
> > > > >  2 files changed, 27 insertions(+), 10 deletions(-)
> > > > >
> > > > > diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> > > > > index 3d3ebdc002d5..47cc8c3f8f80 100644
> > > > > --- a/mm/huge_memory.c
> > > > > +++ b/mm/huge_memory.c
> > > > > @@ -3070,8 +3070,12 @@ static bool __discard_anon_folio_pmd_locked(struct vm_area_struct *vma,
> > > > >         int ref_count, map_count;
> > > > >         pmd_t orig_pmd = *pmdp;
> > > > >
> > > > > -       if (folio_test_dirty(folio) || pmd_dirty(orig_pmd))
> > > > > +       if (pmd_dirty(orig_pmd))
> > > > > +               folio_set_dirty(folio);
> > > > > +       if (folio_test_dirty(folio) && !(vma->vm_flags & VM_DROPPABLE)) {
> > > > > +               folio_set_swapbacked(folio);
> > > > >                 return false;
> > > > > +       }
> > > >
> > > > If either the PMD or the folio is dirty, should we just return false right away,
> > > > regardless of VM_DROPPABLE? There’s no need to proceed further in that
> > > > case, IMHO ;)
> > >
> > > I don't quite understand you, but we need to proceed to clear pmd entry.
> > > if vm_droppable is true, even if the folio is dirty, we still drop the folio.
> >
> > Hey barry,
> >
> > One thing I still don’t quite understand is as follows:
> >
> > One of the semantics of VM_DROPPABLE is that, under memory pressure,
> > the kernel can drop the pages. Similarly, for MADV_FREE, one of its
> > semantics is that the kernel can free the pages when memory pressure
> > occurs, but only if there is no subsequent write (i.e., the PMD is clean).
> >
> > So, if VM_DROPPABLE is true, we still drop the folio even if it's dirty. This
> > seems to conflict with the semantics of MADV_FREE, which requires the
> > folio or PMD to be clean before being dropped. wdyt?
>
> I think we can simply revisit the patch where VM_DROPPABLE was introduced:
> commit 9651fcedf7b92d3f7
> "mm: add MAP_DROPPABLE for designating always lazily freeable mappings"

Ah, I see ~

As the patch says: "Unlike MADV_FREE mappings, VM_DROPPABLE ones
can be dropped even if they've been dirtied".

Thanks a lot for the lesson!
Lance

>
> >
> > Thanks,
> > Lance
> >
> >
> >
> >
> > >
> > > >
> > > > Thanks,
> > > > Lance
> > > >
> > > > >
> > > > >         orig_pmd = pmdp_huge_clear_flush(vma, addr, pmdp);
> > > > >
> > > > > @@ -3098,8 +3102,15 @@ static bool __discard_anon_folio_pmd_locked(struct vm_area_struct *vma,
> > > > >          *
> > > > >          * The only folio refs must be one from isolation plus the rmap(s).
> > > > >          */
> > > > > -       if (folio_test_dirty(folio) || pmd_dirty(orig_pmd) ||
> > > > > -           ref_count != map_count + 1) {
> > > > > +       if (pmd_dirty(orig_pmd))
> > > > > +               folio_set_dirty(folio);
> > > > > +       if (folio_test_dirty(folio) && !(vma->vm_flags & VM_DROPPABLE)) {
> > > > > +               folio_set_swapbacked(folio);
> > > > > +               set_pmd_at(mm, addr, pmdp, orig_pmd);
> > > > > +               return false;
> > > > > +       }
> > > > > +
> > > > > +       if (ref_count != map_count + 1) {
> > > > >                 set_pmd_at(mm, addr, pmdp, orig_pmd);
> > > > >                 return false;
> > > > >         }
> > > > > @@ -3119,12 +3130,11 @@ bool unmap_huge_pmd_locked(struct vm_area_struct *vma, unsigned long addr,
> > > > >  {
> > > > >         VM_WARN_ON_FOLIO(!folio_test_pmd_mappable(folio), folio);
> > > > >         VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
> > > > > +       VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
> > > > > +       VM_WARN_ON_FOLIO(folio_test_swapbacked(folio), folio);
> > > > >         VM_WARN_ON_ONCE(!IS_ALIGNED(addr, HPAGE_PMD_SIZE));
> > > > >
> > > > > -       if (folio_test_anon(folio) && !folio_test_swapbacked(folio))
> > > > > -               return __discard_anon_folio_pmd_locked(vma, addr, pmdp, folio);
> > > > > -
> > > > > -       return false;
> > > > > +       return __discard_anon_folio_pmd_locked(vma, addr, pmdp, folio);
> > > > >  }
> > > > >
> > > > >  static void remap_page(struct folio *folio, unsigned long nr, int flags)
> > > > > diff --git a/mm/rmap.c b/mm/rmap.c
> > > > > index be1978d2712d..a859c399ec7c 100644
> > > > > --- a/mm/rmap.c
> > > > > +++ b/mm/rmap.c
> > > > > @@ -1724,9 +1724,16 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
> > > > >                 }
> > > > >
> > > > >                 if (!pvmw.pte) {
> > > > > -                       if (unmap_huge_pmd_locked(vma, pvmw.address, pvmw.pmd,
> > > > > -                                                 folio))
> > > > > -                               goto walk_done;
> > > > > +                       if (folio_test_anon(folio) && !folio_test_swapbacked(folio)) {
> > > > > +                               if (unmap_huge_pmd_locked(vma, pvmw.address, pvmw.pmd, folio))
> > > > > +                                       goto walk_done;
> > > > > +                               /*
> > > > > +                                * unmap_huge_pmd_locked has either already marked
> > > > > +                                * the folio as swap-backed or decided to retain it
> > > > > +                                * due to GUP or speculative references.
> > > > > +                                */
> > > > > +                               goto walk_abort;
> > > > > +                       }
> > > > >
> > > > >                         if (flags & TTU_SPLIT_HUGE_PMD) {
> > > > >                                 /*
> > > > > --
> > > > > 2.39.3 (Apple Git-146)
> > > > >
>
> Thanks
> Barry
diff mbox series

Patch

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 3d3ebdc002d5..47cc8c3f8f80 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -3070,8 +3070,12 @@  static bool __discard_anon_folio_pmd_locked(struct vm_area_struct *vma,
 	int ref_count, map_count;
 	pmd_t orig_pmd = *pmdp;
 
-	if (folio_test_dirty(folio) || pmd_dirty(orig_pmd))
+	if (pmd_dirty(orig_pmd))
+		folio_set_dirty(folio);
+	if (folio_test_dirty(folio) && !(vma->vm_flags & VM_DROPPABLE)) {
+		folio_set_swapbacked(folio);
 		return false;
+	}
 
 	orig_pmd = pmdp_huge_clear_flush(vma, addr, pmdp);
 
@@ -3098,8 +3102,15 @@  static bool __discard_anon_folio_pmd_locked(struct vm_area_struct *vma,
 	 *
 	 * The only folio refs must be one from isolation plus the rmap(s).
 	 */
-	if (folio_test_dirty(folio) || pmd_dirty(orig_pmd) ||
-	    ref_count != map_count + 1) {
+	if (pmd_dirty(orig_pmd))
+		folio_set_dirty(folio);
+	if (folio_test_dirty(folio) && !(vma->vm_flags & VM_DROPPABLE)) {
+		folio_set_swapbacked(folio);
+		set_pmd_at(mm, addr, pmdp, orig_pmd);
+		return false;
+	}
+
+	if (ref_count != map_count + 1) {
 		set_pmd_at(mm, addr, pmdp, orig_pmd);
 		return false;
 	}
@@ -3119,12 +3130,11 @@  bool unmap_huge_pmd_locked(struct vm_area_struct *vma, unsigned long addr,
 {
 	VM_WARN_ON_FOLIO(!folio_test_pmd_mappable(folio), folio);
 	VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
+	VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
+	VM_WARN_ON_FOLIO(folio_test_swapbacked(folio), folio);
 	VM_WARN_ON_ONCE(!IS_ALIGNED(addr, HPAGE_PMD_SIZE));
 
-	if (folio_test_anon(folio) && !folio_test_swapbacked(folio))
-		return __discard_anon_folio_pmd_locked(vma, addr, pmdp, folio);
-
-	return false;
+	return __discard_anon_folio_pmd_locked(vma, addr, pmdp, folio);
 }
 
 static void remap_page(struct folio *folio, unsigned long nr, int flags)
diff --git a/mm/rmap.c b/mm/rmap.c
index be1978d2712d..a859c399ec7c 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1724,9 +1724,16 @@  static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 		}
 
 		if (!pvmw.pte) {
-			if (unmap_huge_pmd_locked(vma, pvmw.address, pvmw.pmd,
-						  folio))
-				goto walk_done;
+			if (folio_test_anon(folio) && !folio_test_swapbacked(folio)) {
+				if (unmap_huge_pmd_locked(vma, pvmw.address, pvmw.pmd, folio))
+					goto walk_done;
+				/*
+				 * unmap_huge_pmd_locked has either already marked
+				 * the folio as swap-backed or decided to retain it
+				 * due to GUP or speculative references.
+				 */
+				goto walk_abort;
+			}
 
 			if (flags & TTU_SPLIT_HUGE_PMD) {
 				/*