[RFC,6/6] mm: madvise: don't split mTHP for MADV_PAGEOUT

Message ID 20240118111036.72641-7-21cnbao@gmail.com (mailing list archive)
State New
Series mm: support large folios swap-in

Commit Message

Barry Song Jan. 18, 2024, 11:10 a.m. UTC
From: Chuanhua Han <hanchuanhua@oppo.com>

MADV_PAGEOUT and MADV_FREE are common cases in Android. Ryan's patchset has
added support for swapping large folios out as a whole in the vmscan path.
This patch extends the feature to madvise.

If the madvised range covers a whole large folio, we don't split it.
Otherwise, we still need to split it.

This patch doesn't depend on ARM64's CONT-PTE; instead, it defines a helper
named pte_range_cont_mapped() to check whether all PTEs are contiguously
mapped to a large folio.
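
For example, with one 64KiB mTHP backing an anonymous mapping, the two cases
look roughly like this from userspace (an illustrative sketch only: it
assumes 64KiB mTHP is enabled and that the range ends up backed by a single
aligned large folio):

	#include <stddef.h>
	#include <sys/mman.h>

	int main(void)
	{
		size_t sz = 64 * 1024;
		size_t i;
		char *buf = mmap(NULL, sz, PROT_READ | PROT_WRITE,
				 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

		if (buf == MAP_FAILED)
			return 1;

		for (i = 0; i < sz; i += 4096)
			buf[i] = 1;	/* fault in; may allocate one large folio */

		/* covers the whole large folio: paged out without splitting */
		madvise(buf, sz, MADV_PAGEOUT);

		/* a partial range, e.g. madvise(buf, sz / 2, MADV_PAGEOUT),
		 * would still split the folio first */
		return 0;
	}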

Signed-off-by: Chuanhua Han <hanchuanhua@oppo.com>
Co-developed-by: Barry Song <v-songbaohua@oppo.com>
Signed-off-by: Barry Song <v-songbaohua@oppo.com>
---
 include/asm-generic/tlb.h | 10 +++++++
 include/linux/pgtable.h   | 60 +++++++++++++++++++++++++++++++++++++++
 mm/madvise.c              | 48 +++++++++++++++++++++++++++++++
 3 files changed, 118 insertions(+)

Comments

Chris Li Jan. 29, 2024, 2:15 a.m. UTC | #1
On Thu, Jan 18, 2024 at 3:12 AM Barry Song <21cnbao@gmail.com> wrote:
>
> From: Chuanhua Han <hanchuanhua@oppo.com>
>
> MADV_PAGEOUT and MADV_FREE are common cases in Android. Ryan's patchset has
> added support for swapping large folios out as a whole in the vmscan path.
> This patch extends the feature to madvise.
>
> If the madvised range covers a whole large folio, we don't split it.
> Otherwise, we still need to split it.
>
> This patch doesn't depend on ARM64's CONT-PTE; instead, it defines a helper
> named pte_range_cont_mapped() to check whether all PTEs are contiguously
> mapped to a large folio.
>
> Signed-off-by: Chuanhua Han <hanchuanhua@oppo.com>
> Co-developed-by: Barry Song <v-songbaohua@oppo.com>
> Signed-off-by: Barry Song <v-songbaohua@oppo.com>
> ---
>  include/asm-generic/tlb.h | 10 +++++++
>  include/linux/pgtable.h   | 60 +++++++++++++++++++++++++++++++++++++++
>  mm/madvise.c              | 48 +++++++++++++++++++++++++++++++
>  3 files changed, 118 insertions(+)
>
> diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
> index 129a3a759976..f894e22da5d6 100644
> --- a/include/asm-generic/tlb.h
> +++ b/include/asm-generic/tlb.h
> @@ -608,6 +608,16 @@ static inline void tlb_flush_p4d_range(struct mmu_gather *tlb,
>                 __tlb_remove_tlb_entry(tlb, ptep, address);     \
>         } while (0)
>
> +#define tlb_remove_nr_tlb_entry(tlb, ptep, address, nr)                        \
> +       do {                                                            \
> +               int i;                                                  \
> +               tlb_flush_pte_range(tlb, address,                       \
> +                               PAGE_SIZE * nr);                        \
> +               for (i = 0; i < nr; i++)                                \
> +                       __tlb_remove_tlb_entry(tlb, ptep + i,           \
> +                                       address + i * PAGE_SIZE);       \
> +       } while (0)
> +
>  #define tlb_remove_huge_tlb_entry(h, tlb, ptep, address)       \
>         do {                                                    \
>                 unsigned long _sz = huge_page_size(h);          \
> diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
> index 37fe83b0c358..da0c1cf447e3 100644
> --- a/include/linux/pgtable.h
> +++ b/include/linux/pgtable.h
> @@ -320,6 +320,42 @@ static inline pgd_t pgdp_get(pgd_t *pgdp)
>  }
>  #endif
>
> +#ifndef pte_range_cont_mapped
> +static inline bool pte_range_cont_mapped(unsigned long start_pfn,
> +                                        pte_t *start_pte,
> +                                        unsigned long start_addr,
> +                                        int nr)
> +{
> +       int i;
> +       pte_t pte_val;
> +
> +       for (i = 0; i < nr; i++) {
> +               pte_val = ptep_get(start_pte + i);
> +
> +               if (pte_none(pte_val))
> +                       return false;

Hmm, shouldn't the following pte_pfn == start_pfn + i check already cover
the pte_none case?

I think pte_none means the PTE can't have a valid pfn, so this check
can be skipped?
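
i.e. something like the below should be enough, I think (assuming a none
PTE can never report a pfn inside the folio's pfn range - that part is my
assumption):

	/* sketch: the pfn comparison alone also rejects none PTEs */
	for (i = 0; i < nr; i++)
		if (pte_pfn(ptep_get(start_pte + i)) != start_pfn + i)
			return false;

	return true;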

> +
> +               if (pte_pfn(pte_val) != (start_pfn + i))
> +                       return false;
> +       }
> +
> +       return true;
> +}
> +#endif
> +
> +#ifndef pte_range_young
> +static inline bool pte_range_young(pte_t *start_pte, int nr)
> +{
> +       int i;
> +
> +       for (i = 0; i < nr; i++)
> +               if (pte_young(ptep_get(start_pte + i)))
> +                       return true;
> +
> +       return false;
> +}
> +#endif
> +
>  #ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
>  static inline int ptep_test_and_clear_young(struct vm_area_struct *vma,
>                                             unsigned long address,
> @@ -580,6 +616,23 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm,
>  }
>  #endif
>
> +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_RANGE_FULL
> +static inline pte_t ptep_get_and_clear_range_full(struct mm_struct *mm,
> +                                                 unsigned long start_addr,
> +                                                 pte_t *start_pte,
> +                                                 int nr, int full)
> +{
> +       int i;
> +       pte_t pte;
> +
> +       pte = ptep_get_and_clear_full(mm, start_addr, start_pte, full);
> +
> +       for (i = 1; i < nr; i++)
> +               ptep_get_and_clear_full(mm, start_addr + i * PAGE_SIZE,
> +                                       start_pte + i, full);
> +
> +       return pte;
> +}
>
>  /*
>   * If two threads concurrently fault at the same page, the thread that
> @@ -995,6 +1048,13 @@ static inline void arch_swap_restore(swp_entry_t entry, struct folio *folio)
>  })
>  #endif
>
> +#ifndef pte_nr_addr_end
> +#define pte_nr_addr_end(addr, size, end)                               \
> +({     unsigned long __boundary = ((addr) + size) & (~(size - 1));     \
> +       (__boundary - 1 < (end) - 1)? __boundary: (end);                \
> +})
> +#endif
> +
>  /*
>   * When walking page tables, we usually want to skip any p?d_none entries;
>   * and any p?d_bad entries - reporting the error before resetting to none.
> diff --git a/mm/madvise.c b/mm/madvise.c
> index 912155a94ed5..262460ac4b2e 100644
> --- a/mm/madvise.c
> +++ b/mm/madvise.c
> @@ -452,6 +452,54 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
>                 if (folio_test_large(folio)) {
>                         int err;
>
> +                       if (!folio_test_pmd_mappable(folio)) {

This section of code is indented too far to the right.
You can do:

if (folio_test_pmd_mappable(folio))
         goto split;

to make the code flatter.

> +                               int nr_pages = folio_nr_pages(folio);
> +                               unsigned long folio_size = PAGE_SIZE * nr_pages;
> +                               unsigned long start_addr = ALIGN_DOWN(addr, nr_pages * PAGE_SIZE);;
> +                               unsigned long start_pfn = page_to_pfn(folio_page(folio, 0));
> +                               pte_t *start_pte = pte - (addr - start_addr) / PAGE_SIZE;
> +                               unsigned long next = pte_nr_addr_end(addr, folio_size, end);
> +
> +                               if (!pte_range_cont_mapped(start_pfn, start_pte, start_addr, nr_pages))
> +                                       goto split;
> +
> +                               if (next - addr != folio_size) {

Nitpick: a one-line statement does not need braces.

> +                                       goto split;
> +                               } else {

When the previous if statement already does "goto split", there is no need
for the else. You can save one level of indentation.



> +                                       /* Do not interfere with other mappings of this page */
> +                                       if (folio_estimated_sharers(folio) != 1)
> +                                               goto skip;
> +
> +                                       VM_BUG_ON(addr != start_addr || pte != start_pte);
> +
> +                                       if (pte_range_young(start_pte, nr_pages)) {
> +                                               ptent = ptep_get_and_clear_range_full(mm, start_addr, start_pte,
> +                                                                                     nr_pages, tlb->fullmm);
> +                                               ptent = pte_mkold(ptent);
> +
> +                                               set_ptes(mm, start_addr, start_pte, ptent, nr_pages);
> +                                               tlb_remove_nr_tlb_entry(tlb, start_pte, start_addr, nr_pages);
> +                                       }
> +
> +                                       folio_clear_referenced(folio);
> +                                       folio_test_clear_young(folio);
> +                                       if (pageout) {
> +                                               if (folio_isolate_lru(folio)) {
> +                                                       if (folio_test_unevictable(folio))
> +                                                               folio_putback_lru(folio);
> +                                                       else
> +                                                               list_add(&folio->lru, &folio_list);
> +                                               }
> +                                       } else
> +                                               folio_deactivate(folio);

I notice this section is very similar to the earlier statements inside
the same function.
"if (pmd_trans_huge(*pmd)) {"

Wondering if there is some way to unify the two somehow.

Also, notice that if you test the else condition first,

if (!pageout) {
    folio_deactivate(folio);
    goto skip;
}

You can save one level of indentation.
Not your fault; I notice the section inside (pmd_trans_huge(*pmd))
does exactly the same thing.

Chris


> +                               }
> +skip:
> +                               pte += (next - PAGE_SIZE - (addr & PAGE_MASK))/PAGE_SIZE;
> +                               addr = next - PAGE_SIZE;
> +                               continue;
> +
> +                       }
> +split:
>                         if (folio_estimated_sharers(folio) != 1)
>                                 break;
>                         if (pageout_anon_only_filter && !folio_test_anon(folio))
> --
> 2.34.1
>
>
Barry Song Feb. 26, 2024, 6:39 a.m. UTC | #2
On Mon, Jan 29, 2024 at 3:15 PM Chris Li <chrisl@kernel.org> wrote:
>
> On Thu, Jan 18, 2024 at 3:12 AM Barry Song <21cnbao@gmail.com> wrote:
> >
> > From: Chuanhua Han <hanchuanhua@oppo.com>
> >
> > MADV_PAGEOUT and MADV_FREE are common cases in Android. Ryan's patchset has
> > added support for swapping large folios out as a whole in the vmscan path.
> > This patch extends the feature to madvise.
> >
> > If the madvised range covers a whole large folio, we don't split it.
> > Otherwise, we still need to split it.
> >
> > This patch doesn't depend on ARM64's CONT-PTE; instead, it defines a helper
> > named pte_range_cont_mapped() to check whether all PTEs are contiguously
> > mapped to a large folio.
> >
> > Signed-off-by: Chuanhua Han <hanchuanhua@oppo.com>
> > Co-developed-by: Barry Song <v-songbaohua@oppo.com>
> > Signed-off-by: Barry Song <v-songbaohua@oppo.com>
> > ---
> >  include/asm-generic/tlb.h | 10 +++++++
> >  include/linux/pgtable.h   | 60 +++++++++++++++++++++++++++++++++++++++
> >  mm/madvise.c              | 48 +++++++++++++++++++++++++++++++
> >  3 files changed, 118 insertions(+)
> >
> > diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
> > index 129a3a759976..f894e22da5d6 100644
> > --- a/include/asm-generic/tlb.h
> > +++ b/include/asm-generic/tlb.h
> > @@ -608,6 +608,16 @@ static inline void tlb_flush_p4d_range(struct mmu_gather *tlb,
> >                 __tlb_remove_tlb_entry(tlb, ptep, address);     \
> >         } while (0)
> >
> > +#define tlb_remove_nr_tlb_entry(tlb, ptep, address, nr)                        \
> > +       do {                                                            \
> > +               int i;                                                  \
> > +               tlb_flush_pte_range(tlb, address,                       \
> > +                               PAGE_SIZE * nr);                        \
> > +               for (i = 0; i < nr; i++)                                \
> > +                       __tlb_remove_tlb_entry(tlb, ptep + i,           \
> > +                                       address + i * PAGE_SIZE);       \
> > +       } while (0)
> > +
> >  #define tlb_remove_huge_tlb_entry(h, tlb, ptep, address)       \
> >         do {                                                    \
> >                 unsigned long _sz = huge_page_size(h);          \
> > diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
> > index 37fe83b0c358..da0c1cf447e3 100644
> > --- a/include/linux/pgtable.h
> > +++ b/include/linux/pgtable.h
> > @@ -320,6 +320,42 @@ static inline pgd_t pgdp_get(pgd_t *pgdp)
> >  }
> >  #endif
> >
> > +#ifndef pte_range_cont_mapped
> > +static inline bool pte_range_cont_mapped(unsigned long start_pfn,
> > +                                        pte_t *start_pte,
> > +                                        unsigned long start_addr,
> > +                                        int nr)
> > +{
> > +       int i;
> > +       pte_t pte_val;
> > +
> > +       for (i = 0; i < nr; i++) {
> > +               pte_val = ptep_get(start_pte + i);
> > +
> > +               if (pte_none(pte_val))
> > +                       return false;
>
> Hmm, shouldn't the following pte_pfn == start_pfn + i check already cover
> the pte_none case?
>
> I think pte_none means the PTE can't have a valid pfn, so this check
> can be skipped?

Yes, the pte_pfn == start_pfn + i check should have covered the pte_none
case, but leaving pte_none there seems to make the code more readable.
I guess we need to check pte_present() too: there is a small chance that
swp_offset can equal pte_pfn after some shifting, in case a PTE within
the large folio range has become a swap entry?

I am still thinking about whether we have some cheaper way to check if a
folio is still entirely mapped, maybe something like
if (list_empty(&folio->_deferred_list))?
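
Something like this rough sketch, maybe (untested, and treating
pte_present() as the right guard here is my assumption):

	for (i = 0; i < nr; i++) {
		pte_t pte_val = ptep_get(start_pte + i);

		/* covers none PTEs and swap entries, whose bits might
		 * otherwise alias a valid pfn */
		if (!pte_present(pte_val))
			return false;

		if (pte_pfn(pte_val) != (start_pfn + i))
			return false;
	}

	return true;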

>
> > +
> > +               if (pte_pfn(pte_val) != (start_pfn + i))
> > +                       return false;
> > +       }
> > +
> > +       return true;
> > +}
> > +#endif
> > +
> > +#ifndef pte_range_young
> > +static inline bool pte_range_young(pte_t *start_pte, int nr)
> > +{
> > +       int i;
> > +
> > +       for (i = 0; i < nr; i++)
> > +               if (pte_young(ptep_get(start_pte + i)))
> > +                       return true;
> > +
> > +       return false;
> > +}
> > +#endif
> > +
> >  #ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
> >  static inline int ptep_test_and_clear_young(struct vm_area_struct *vma,
> >                                             unsigned long address,
> > @@ -580,6 +616,23 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm,
> >  }
> >  #endif
> >
> > +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_RANGE_FULL
> > +static inline pte_t ptep_get_and_clear_range_full(struct mm_struct *mm,
> > +                                                 unsigned long start_addr,
> > +                                                 pte_t *start_pte,
> > +                                                 int nr, int full)
> > +{
> > +       int i;
> > +       pte_t pte;
> > +
> > +       pte = ptep_get_and_clear_full(mm, start_addr, start_pte, full);
> > +
> > +       for (i = 1; i < nr; i++)
> > +               ptep_get_and_clear_full(mm, start_addr + i * PAGE_SIZE,
> > +                                       start_pte + i, full);
> > +
> > +       return pte;
> > +}
> >
> >  /*
> >   * If two threads concurrently fault at the same page, the thread that
> > @@ -995,6 +1048,13 @@ static inline void arch_swap_restore(swp_entry_t entry, struct folio *folio)
> >  })
> >  #endif
> >
> > +#ifndef pte_nr_addr_end
> > +#define pte_nr_addr_end(addr, size, end)                               \
> > +({     unsigned long __boundary = ((addr) + size) & (~(size - 1));     \
> > +       (__boundary - 1 < (end) - 1)? __boundary: (end);                \
> > +})
> > +#endif
> > +
> >  /*
> >   * When walking page tables, we usually want to skip any p?d_none entries;
> >   * and any p?d_bad entries - reporting the error before resetting to none.
> > diff --git a/mm/madvise.c b/mm/madvise.c
> > index 912155a94ed5..262460ac4b2e 100644
> > --- a/mm/madvise.c
> > +++ b/mm/madvise.c
> > @@ -452,6 +452,54 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
> >                 if (folio_test_large(folio)) {
> >                         int err;
> >
> > +                       if (!folio_test_pmd_mappable(folio)) {
>
> This section of code is indented too far to the right.
> You can do:
>
> if (folio_test_pmd_mappable(folio))
>          goto split;
>
> to make the code flatter.

I guess we don't need "if (!folio_test_pmd_mappable(folio))" at all,
as the pmd case has already been handled at the beginning of
madvise_cold_or_pageout_pte_range().

>
> > +                               int nr_pages = folio_nr_pages(folio);
> > +                               unsigned long folio_size = PAGE_SIZE * nr_pages;
> > +                               unsigned long start_addr = ALIGN_DOWN(addr, nr_pages * PAGE_SIZE);;
> > +                               unsigned long start_pfn = page_to_pfn(folio_page(folio, 0));
> > +                               pte_t *start_pte = pte - (addr - start_addr) / PAGE_SIZE;
> > +                               unsigned long next = pte_nr_addr_end(addr, folio_size, end);
> > +
> > +                               if (!pte_range_cont_mapped(start_pfn, start_pte, start_addr, nr_pages))
> > +                                       goto split;
> > +
> > +                               if (next - addr != folio_size) {
>
> Nitpick: a one-line statement does not need braces.
>
> > +                                       goto split;
> > +                               } else {
>
> When the previous if statement already does "goto split", there is no need
> for the else. You can save one level of indentation.

right!

>
>
>
> > +                                       /* Do not interfere with other mappings of this page */
> > +                                       if (folio_estimated_sharers(folio) != 1)
> > +                                               goto skip;
> > +
> > +                                       VM_BUG_ON(addr != start_addr || pte != start_pte);
> > +
> > +                                       if (pte_range_young(start_pte, nr_pages)) {
> > +                                               ptent = ptep_get_and_clear_range_full(mm, start_addr, start_pte,
> > +                                                                                     nr_pages, tlb->fullmm);
> > +                                               ptent = pte_mkold(ptent);
> > +
> > +                                               set_ptes(mm, start_addr, start_pte, ptent, nr_pages);
> > +                                               tlb_remove_nr_tlb_entry(tlb, start_pte, start_addr, nr_pages);
> > +                                       }
> > +
> > +                                       folio_clear_referenced(folio);
> > +                                       folio_test_clear_young(folio);
> > +                                       if (pageout) {
> > +                                               if (folio_isolate_lru(folio)) {
> > +                                                       if (folio_test_unevictable(folio))
> > +                                                               folio_putback_lru(folio);
> > +                                                       else
> > +                                                               list_add(&folio->lru, &folio_list);
> > +                                               }
> > +                                       } else
> > +                                               folio_deactivate(folio);
>
> I notice this section is very similar to the earlier statements inside
> the same function.
> "if (pmd_trans_huge(*pmd)) {"
>
> Wondering if there is some way to unify the two somehow.

We have duplicated the code three times - pmd, pte-mapped large folio, and
normal folio. I am quite sure we can extract a common function.

>
> Also, notice that if you test the else condition first,
>
> if (!pageout) {
>     folio_deactivate(folio);
>     goto skip;
> }
>
> You can save one level of indentation.
> Not your fault; I notice the section inside (pmd_trans_huge(*pmd))
> does exactly the same thing.
>

We can address this issue once we have a common function.

> Chris
>
>
> > +                               }
> > +skip:
> > +                               pte += (next - PAGE_SIZE - (addr & PAGE_MASK))/PAGE_SIZE;
> > +                               addr = next - PAGE_SIZE;
> > +                               continue;
> > +
> > +                       }
> > +split:
> >                         if (folio_estimated_sharers(folio) != 1)
> >                                 break;
> >                         if (pageout_anon_only_filter && !folio_test_anon(folio))
> > --
> > 2.34.1
> >
> >

Thanks
Barry
Ryan Roberts Feb. 27, 2024, 12:22 p.m. UTC | #3
Hi Barry,

I've scanned through this patch as part of trying to understand the races you
have reported (It's going to take me a while to fully understand it all :) ). In
the meantime I have a few comments on this patch...

On 18/01/2024 11:10, Barry Song wrote:
> From: Chuanhua Han <hanchuanhua@oppo.com>
> 
> MADV_PAGEOUT and MADV_FREE are common cases in Android. Ryan's patchset has
> added support for swapping large folios out as a whole in the vmscan path.
> This patch extends the feature to madvise.
>
> If the madvised range covers a whole large folio, we don't split it.
> Otherwise, we still need to split it.
>
> This patch doesn't depend on ARM64's CONT-PTE; instead, it defines a helper
> named pte_range_cont_mapped() to check whether all PTEs are contiguously
> mapped to a large folio.
> 
> Signed-off-by: Chuanhua Han <hanchuanhua@oppo.com>
> Co-developed-by: Barry Song <v-songbaohua@oppo.com>
> Signed-off-by: Barry Song <v-songbaohua@oppo.com>
> ---
>  include/asm-generic/tlb.h | 10 +++++++
>  include/linux/pgtable.h   | 60 +++++++++++++++++++++++++++++++++++++++
>  mm/madvise.c              | 48 +++++++++++++++++++++++++++++++
>  3 files changed, 118 insertions(+)
> 
> diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
> index 129a3a759976..f894e22da5d6 100644
> --- a/include/asm-generic/tlb.h
> +++ b/include/asm-generic/tlb.h
> @@ -608,6 +608,16 @@ static inline void tlb_flush_p4d_range(struct mmu_gather *tlb,
>  		__tlb_remove_tlb_entry(tlb, ptep, address);	\
>  	} while (0)
>  
> +#define tlb_remove_nr_tlb_entry(tlb, ptep, address, nr)			\
> +	do {                                                    	\
> +		int i;							\
> +		tlb_flush_pte_range(tlb, address,			\
> +				PAGE_SIZE * nr);			\
> +		for (i = 0; i < nr; i++)				\
> +			__tlb_remove_tlb_entry(tlb, ptep + i,		\
> +					address + i * PAGE_SIZE);	\
> +	} while (0)

David has recently added tlb_remove_tlb_entries() which does the same thing.
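
i.e. the call site could become something like the below (argument order
per my reading of that patch, so please double-check):

	/* flush the whole range and remove all nr TLB entries in one go */
	tlb_remove_tlb_entries(tlb, start_pte, nr_pages, start_addr);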

> +
>  #define tlb_remove_huge_tlb_entry(h, tlb, ptep, address)	\
>  	do {							\
>  		unsigned long _sz = huge_page_size(h);		\
> diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
> index 37fe83b0c358..da0c1cf447e3 100644
> --- a/include/linux/pgtable.h
> +++ b/include/linux/pgtable.h
> @@ -320,6 +320,42 @@ static inline pgd_t pgdp_get(pgd_t *pgdp)
>  }
>  #endif
>  
> +#ifndef pte_range_cont_mapped
> +static inline bool pte_range_cont_mapped(unsigned long start_pfn,
> +					 pte_t *start_pte,
> +					 unsigned long start_addr,
> +					 int nr)
> +{
> +	int i;
> +	pte_t pte_val;
> +
> +	for (i = 0; i < nr; i++) {
> +		pte_val = ptep_get(start_pte + i);
> +
> +		if (pte_none(pte_val))
> +			return false;
> +
> +		if (pte_pfn(pte_val) != (start_pfn + i))
> +			return false;
> +	}
> +
> +	return true;
> +}
> +#endif

David has recently added folio_pte_batch() which does a similar thing (as
discussed in other context).
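
Roughly like the below, from memory (the exact signature and flags are an
assumption, so please check against David's series):

	int max_nr = (end - addr) >> PAGE_SHIFT;
	fpb_t flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
	int nr = folio_pte_batch(folio, addr, pte, ptent, max_nr,
				 flags, NULL);

	if (nr != folio_nr_pages(folio))
		goto split;	/* not fully and contiguously mapped here */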

> +
> +#ifndef pte_range_young
> +static inline bool pte_range_young(pte_t *start_pte, int nr)
> +{
> +	int i;
> +
> +	for (i = 0; i < nr; i++)
> +		if (pte_young(ptep_get(start_pte + i)))
> +			return true;
> +
> +	return false;
> +}
> +#endif

I wonder if this should come from folio_pte_batch()?

> +
>  #ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
>  static inline int ptep_test_and_clear_young(struct vm_area_struct *vma,
>  					    unsigned long address,
> @@ -580,6 +616,23 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm,
>  }
>  #endif
>  
> +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_RANGE_FULL
> +static inline pte_t ptep_get_and_clear_range_full(struct mm_struct *mm,
> +						  unsigned long start_addr,
> +						  pte_t *start_pte,
> +						  int nr, int full)
> +{
> +	int i;
> +	pte_t pte;
> +
> +	pte = ptep_get_and_clear_full(mm, start_addr, start_pte, full);
> +
> +	for (i = 1; i < nr; i++)
> +		ptep_get_and_clear_full(mm, start_addr + i * PAGE_SIZE,
> +					start_pte + i, full);
> +
> +	return pte;
> +}

David has recently added get_and_clear_full_ptes(). Your version isn't gathering
access/dirty, which may be ok for your case, but not ok in general.
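
So the batched clear could look something like this (signature from
memory, treat it as an assumption):

	/* returns the first PTE, with young/dirty collected from all nr
	 * entries */
	ptent = get_and_clear_full_ptes(mm, start_addr, start_pte,
					nr_pages, tlb->fullmm);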

>  
>  /*
>   * If two threads concurrently fault at the same page, the thread that
> @@ -995,6 +1048,13 @@ static inline void arch_swap_restore(swp_entry_t entry, struct folio *folio)
>  })
>  #endif
>  
> +#ifndef pte_nr_addr_end
> +#define pte_nr_addr_end(addr, size, end)				\
> +({	unsigned long __boundary = ((addr) + size) & (~(size - 1));	\
> +	(__boundary - 1 < (end) - 1)? __boundary: (end);		\
> +})
> +#endif
> +
>  /*
>   * When walking page tables, we usually want to skip any p?d_none entries;
>   * and any p?d_bad entries - reporting the error before resetting to none.
> diff --git a/mm/madvise.c b/mm/madvise.c
> index 912155a94ed5..262460ac4b2e 100644
> --- a/mm/madvise.c
> +++ b/mm/madvise.c
> @@ -452,6 +452,54 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
>  		if (folio_test_large(folio)) {
>  			int err;
>  
> +			if (!folio_test_pmd_mappable(folio)) {
> +				int nr_pages = folio_nr_pages(folio);
> +				unsigned long folio_size = PAGE_SIZE * nr_pages;
> +				unsigned long start_addr = ALIGN_DOWN(addr, nr_pages * PAGE_SIZE);;

I doubt it is correct to align down here. Couldn't you be going outside the
bounds that the user supplied?

nit: you've defined folio_size, why not use it here?
nit: double semi-colon.

> +				unsigned long start_pfn = page_to_pfn(folio_page(folio, 0));
> +				pte_t *start_pte = pte - (addr - start_addr) / PAGE_SIZE;

I think start_pte could be off the start of the pgtable and into random memory
in some corner cases (and outside the protection of the PTL)? You're assuming
that the folio is fully and contiguously mapped and correctly aligned. mremap
(and other things) could break that assumption.

> +				unsigned long next = pte_nr_addr_end(addr, folio_size, end);
> +
> +				if (!pte_range_cont_mapped(start_pfn, start_pte, start_addr, nr_pages))
> +					goto split;
> +
> +				if (next - addr != folio_size) {
> +					goto split;
> +				} else {
> +					/* Do not interfere with other mappings of this page */
> +					if (folio_estimated_sharers(folio) != 1)
> +						goto skip;
> +
> +					VM_BUG_ON(addr != start_addr || pte != start_pte);
> +
> +					if (pte_range_young(start_pte, nr_pages)) {
> +						ptent = ptep_get_and_clear_range_full(mm, start_addr, start_pte,
> +										      nr_pages, tlb->fullmm);
> +						ptent = pte_mkold(ptent);
> +
> +						set_ptes(mm, start_addr, start_pte, ptent, nr_pages);
> +						tlb_remove_nr_tlb_entry(tlb, start_pte, start_addr, nr_pages);
> +					}
> +
> +					folio_clear_referenced(folio);
> +					folio_test_clear_young(folio);
> +					if (pageout) {
> +						if (folio_isolate_lru(folio)) {
> +							if (folio_test_unevictable(folio))
> +								folio_putback_lru(folio);
> +							else
> +								list_add(&folio->lru, &folio_list);
> +						}
> +					} else
> +						folio_deactivate(folio);
> +				}
> +skip:
> +				pte += (next - PAGE_SIZE - (addr & PAGE_MASK))/PAGE_SIZE;
> +				addr = next - PAGE_SIZE;
> +				continue;
> +
> +			}
> +split:
>  			if (folio_estimated_sharers(folio) != 1)
>  				break;
>  			if (pageout_anon_only_filter && !folio_test_anon(folio))
Ryan Roberts Feb. 27, 2024, 2:40 p.m. UTC | #4
On 18/01/2024 11:10, Barry Song wrote:
> From: Chuanhua Han <hanchuanhua@oppo.com>
> 
> MADV_PAGEOUT and MADV_FREE are common cases in Android. Ryan's patchset has
> added support for swapping large folios out as a whole in the vmscan path.
> This patch extends the feature to madvise.
>
> If the madvised range covers a whole large folio, we don't split it.
> Otherwise, we still need to split it.
>
> This patch doesn't depend on ARM64's CONT-PTE; instead, it defines a helper
> named pte_range_cont_mapped() to check whether all PTEs are contiguously
> mapped to a large folio.

I'm going to rework this patch and integrate it into my series if that's ok with
you?
Barry Song Feb. 27, 2024, 6:57 p.m. UTC | #5
On Wed, Feb 28, 2024 at 3:40 AM Ryan Roberts <ryan.roberts@arm.com> wrote:
>
> On 18/01/2024 11:10, Barry Song wrote:
> > From: Chuanhua Han <hanchuanhua@oppo.com>
> >
> > MADV_PAGEOUT and MADV_FREE are common cases in Android. Ryan's patchset has
> > added support for swapping large folios out as a whole in the vmscan path.
> > This patch extends the feature to madvise.
> >
> > If the madvised range covers a whole large folio, we don't split it.
> > Otherwise, we still need to split it.
> >
> > This patch doesn't depend on ARM64's CONT-PTE; instead, it defines a helper
> > named pte_range_cont_mapped() to check whether all PTEs are contiguously
> > mapped to a large folio.
>
> I'm going to rework this patch and integrate it into my series if that's ok with
> you?

This is perfect. Please integrate it into your swap-out series, which is the
perfect place for this MADV_PAGEOUT.

Thanks
Barry
Barry Song Feb. 27, 2024, 10:39 p.m. UTC | #6
On Wed, Feb 28, 2024 at 1:22 AM Ryan Roberts <ryan.roberts@arm.com> wrote:
>
> Hi Barry,
>
> I've scanned through this patch as part of trying to understand the races you
> have reported (It's going to take me a while to fully understand it all :) ). In
> the meantime I have a few comments on this patch...
>
> On 18/01/2024 11:10, Barry Song wrote:
> > From: Chuanhua Han <hanchuanhua@oppo.com>
> >
> > MADV_PAGEOUT and MADV_FREE are common cases in Android. Ryan's patchset has
> > added support for swapping large folios out as a whole in the vmscan path.
> > This patch extends the feature to madvise.
> >
> > If the madvised range covers a whole large folio, we don't split it.
> > Otherwise, we still need to split it.
> >
> > This patch doesn't depend on ARM64's CONT-PTE; instead, it defines a helper
> > named pte_range_cont_mapped() to check whether all PTEs are contiguously
> > mapped to a large folio.
> >
> > Signed-off-by: Chuanhua Han <hanchuanhua@oppo.com>
> > Co-developed-by: Barry Song <v-songbaohua@oppo.com>
> > Signed-off-by: Barry Song <v-songbaohua@oppo.com>
> > ---
> >  include/asm-generic/tlb.h | 10 +++++++
> >  include/linux/pgtable.h   | 60 +++++++++++++++++++++++++++++++++++++++
> >  mm/madvise.c              | 48 +++++++++++++++++++++++++++++++
> >  3 files changed, 118 insertions(+)
> >
> > diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
> > index 129a3a759976..f894e22da5d6 100644
> > --- a/include/asm-generic/tlb.h
> > +++ b/include/asm-generic/tlb.h
> > @@ -608,6 +608,16 @@ static inline void tlb_flush_p4d_range(struct mmu_gather *tlb,
> >               __tlb_remove_tlb_entry(tlb, ptep, address);     \
> >       } while (0)
> >
> > +#define tlb_remove_nr_tlb_entry(tlb, ptep, address, nr)                      \
> > +     do {                                                            \
> > +             int i;                                                  \
> > +             tlb_flush_pte_range(tlb, address,                       \
> > +                             PAGE_SIZE * nr);                        \
> > +             for (i = 0; i < nr; i++)                                \
> > +                     __tlb_remove_tlb_entry(tlb, ptep + i,           \
> > +                                     address + i * PAGE_SIZE);       \
> > +     } while (0)
>
> David has recently added tlb_remove_tlb_entries() which does the same thing.

Cool. When we sent the patchset, we were not depending on other work.
Nice to know David's work can help this case.

>
> > +
> >  #define tlb_remove_huge_tlb_entry(h, tlb, ptep, address)     \
> >       do {                                                    \
> >               unsigned long _sz = huge_page_size(h);          \
> > diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
> > index 37fe83b0c358..da0c1cf447e3 100644
> > --- a/include/linux/pgtable.h
> > +++ b/include/linux/pgtable.h
> > @@ -320,6 +320,42 @@ static inline pgd_t pgdp_get(pgd_t *pgdp)
> >  }
> >  #endif
> >
> > +#ifndef pte_range_cont_mapped
> > +static inline bool pte_range_cont_mapped(unsigned long start_pfn,
> > +                                      pte_t *start_pte,
> > +                                      unsigned long start_addr,
> > +                                      int nr)
> > +{
> > +     int i;
> > +     pte_t pte_val;
> > +
> > +     for (i = 0; i < nr; i++) {
> > +             pte_val = ptep_get(start_pte + i);
> > +
> > +             if (pte_none(pte_val))
> > +                     return false;
> > +
> > +             if (pte_pfn(pte_val) != (start_pfn + i))
> > +                     return false;
> > +     }
> > +
> > +     return true;
> > +}
> > +#endif
>
> David has recently added folio_pte_batch() which does a similar thing (as
> discussed in other context).

Yes.

>
> > +
> > +#ifndef pte_range_young
> > +static inline bool pte_range_young(pte_t *start_pte, int nr)
> > +{
> > +     int i;
> > +
> > +     for (i = 0; i < nr; i++)
> > +             if (pte_young(ptep_get(start_pte + i)))
> > +                     return true;
> > +
> > +     return false;
> > +}
> > +#endif
>
> I wonder if this should come from folio_pte_batch()?

Not quite sure folio_pte_batch() can return young, but I guess you
already have a batched function to check if a large folio is young?

>
> > +
> >  #ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
> >  static inline int ptep_test_and_clear_young(struct vm_area_struct *vma,
> >                                           unsigned long address,
> > @@ -580,6 +616,23 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm,
> >  }
> >  #endif
> >
> > +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_RANGE_FULL
> > +static inline pte_t ptep_get_and_clear_range_full(struct mm_struct *mm,
> > +                                               unsigned long start_addr,
> > +                                               pte_t *start_pte,
> > +                                               int nr, int full)
> > +{
> > +     int i;
> > +     pte_t pte;
> > +
> > +     pte = ptep_get_and_clear_full(mm, start_addr, start_pte, full);
> > +
> > +     for (i = 1; i < nr; i++)
> > +             ptep_get_and_clear_full(mm, start_addr + i * PAGE_SIZE,
> > +                                     start_pte + i, full);
> > +
> > +     return pte;
> > +}
>
> David has recently added get_and_clear_full_ptes(). Your version isn't gathering
> access/dirty, which may be ok for your case, but not ok in general.

OK, glad to know we can use get_and_clear_full_ptes().

>
> >
> >  /*
> >   * If two threads concurrently fault at the same page, the thread that
> > @@ -995,6 +1048,13 @@ static inline void arch_swap_restore(swp_entry_t entry, struct folio *folio)
> >  })
> >  #endif
> >
> > +#ifndef pte_nr_addr_end
> > +#define pte_nr_addr_end(addr, size, end)                             \
> > +({   unsigned long __boundary = ((addr) + size) & (~(size - 1));     \
> > +     (__boundary - 1 < (end) - 1)? __boundary: (end);                \
> > +})
> > +#endif
> > +
> >  /*
> >   * When walking page tables, we usually want to skip any p?d_none entries;
> >   * and any p?d_bad entries - reporting the error before resetting to none.
> > diff --git a/mm/madvise.c b/mm/madvise.c
> > index 912155a94ed5..262460ac4b2e 100644
> > --- a/mm/madvise.c
> > +++ b/mm/madvise.c
> > @@ -452,6 +452,54 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
> >               if (folio_test_large(folio)) {
> >                       int err;
> >
> > +                     if (!folio_test_pmd_mappable(folio)) {
> > +                             int nr_pages = folio_nr_pages(folio);
> > +                             unsigned long folio_size = PAGE_SIZE * nr_pages;
> > +                             unsigned long start_addr = ALIGN_DOWN(addr, nr_pages * PAGE_SIZE);;
>
> I doubt it is correct to align down here. Couldn't you be going outside the
> bounds that the user supplied?

Yes, it can. This is ugly and suspicious but does not cause problems
if the large folio's virtual address is aligned; it is wrong if the
virtual address is not aligned, as explained below.

>
> nit: you've defined folio_size, why not use it here?
> nit: double semi-colon.
>
> > +                             unsigned long start_pfn = page_to_pfn(folio_page(folio, 0));
> > +                             pte_t *start_pte = pte - (addr - start_addr) / PAGE_SIZE;
>
> I think start_pte could be off the start of the pgtable and into random memory
> in some corner cases (and outside the protection of the PTL)? You're assuming
> that the folio is fully and contiguously mapped and correctly aligned. mremap
> (and other things) could break that assumption.

Actually, we don't run under the assumption that the folio is fully and
contiguously mapped, but the code does assume a large folio's virtual
address is aligned to nr_pages * PAGE_SIZE.

OTOH, we have "if (next - addr != folio_size)" to split folios if users
just want to partially reclaim a large folio, but I do agree we should
move "if (next - addr != folio_size)" before pte_range_cont_mapped().

As long as the virtual address is aligned, pte_range_cont_mapped() won't
cause a problem for the code even before "if (next - addr != folio_size)"
(though it is ugly and suspicious), as it is still under the protection of
the PTL, since we don't cross a PMD for a pte-mapped large folio.

But you are right: we have cases like mremap which can remap an aligned
large folio to an unaligned address. I actually placed a tracepoint in
the kernel and ran it on lots of phones, and didn't find this case
happening, so I feel mremap here is really rare. Is it possible to split
large folios instead, and avoid the complexity, if we are remapping to an
unaligned address?

And the code is completely wrong if the large folio is unaligned; we have
to remove that assumption if this can really happen, so we shouldn't
ALIGN_DOWN here.
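
One possible sketch without the alignment assumption (names taken from the
existing loop; we would still have to guard against the range crossing a
PMD or VMA boundary):

	/* derive the folio's first VA from the PTE's position in the folio
	 * instead of assuming nr_pages * PAGE_SIZE alignment */
	unsigned long idx = pte_pfn(ptent) - folio_pfn(folio);
	unsigned long start_addr = addr - idx * PAGE_SIZE;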

>
> > +                             unsigned long next = pte_nr_addr_end(addr, folio_size, end);
> > +
> > +                             if (!pte_range_cont_mapped(start_pfn, start_pte, start_addr, nr_pages))
> > +                                     goto split;
> > +
> > +                             if (next - addr != folio_size) {
> > +                                     goto split;
> > +                             } else {
> > +                                     /* Do not interfere with other mappings of this page */
> > +                                     if (folio_estimated_sharers(folio) != 1)
> > +                                             goto skip;
> > +
> > +                                     VM_BUG_ON(addr != start_addr || pte != start_pte);
> > +
> > +                                     if (pte_range_young(start_pte, nr_pages)) {
> > +                                             ptent = ptep_get_and_clear_range_full(mm, start_addr, start_pte,
> > +                                                                                   nr_pages, tlb->fullmm);
> > +                                             ptent = pte_mkold(ptent);
> > +
> > +                                             set_ptes(mm, start_addr, start_pte, ptent, nr_pages);
> > +                                             tlb_remove_nr_tlb_entry(tlb, start_pte, start_addr, nr_pages);
> > +                                     }
> > +
> > +                                     folio_clear_referenced(folio);
> > +                                     folio_test_clear_young(folio);
> > +                                     if (pageout) {
> > +                                             if (folio_isolate_lru(folio)) {
> > +                                                     if (folio_test_unevictable(folio))
> > +                                                             folio_putback_lru(folio);
> > +                                                     else
> > +                                                             list_add(&folio->lru, &folio_list);
> > +                                             }
> > +                                     } else
> > +                                             folio_deactivate(folio);
> > +                             }
> > +skip:
> > +                             pte += (next - PAGE_SIZE - (addr & PAGE_MASK))/PAGE_SIZE;
> > +                             addr = next - PAGE_SIZE;
> > +                             continue;
> > +
> > +                     }
> > +split:
> >                       if (folio_estimated_sharers(folio) != 1)
> >                               break;
> >                       if (pageout_anon_only_filter && !folio_test_anon(folio))

Thanks
Barry
Barry Song Feb. 28, 2024, 3:49 a.m. UTC | #7
>> I'm going to rework this patch and integrate it into my series if that's ok with
>> you?
> 
> This is perfect. Please integrate it into your swap-out series, which is the
> perfect place for this MADV_PAGEOUT.

BTW, Ryan, while you integrate this into your swap-out series, can you also
add the patch below, which addresses one of Chris's comments?

From: Barry Song <v-songbaohua@oppo.com>
Date: Tue, 27 Feb 2024 22:03:59 +1300
Subject: [PATCH] mm: madvise: extract common function
 folio_deactivate_or_add_to_reclaim_list

In madvise_cold_or_pageout_pte_range(), the pmd-mapped and pte-mapped
normal folio paths duplicate the same code right now, and we might have
more users, such as pte-mapped large folios. It is better to extract a
common function.

Cc: Chris Li <chrisl@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: SeongJae Park <sj@kernel.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Barry Song <v-songbaohua@oppo.com>
---
 mm/madvise.c | 52 ++++++++++++++++++++--------------------------------
 1 file changed, 20 insertions(+), 32 deletions(-)

diff --git a/mm/madvise.c b/mm/madvise.c
index 44a498c94158..1812457144ea 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -321,6 +321,24 @@ static inline bool can_do_file_pageout(struct vm_area_struct *vma)
 	       file_permission(vma->vm_file, MAY_WRITE) == 0;
 }
 
+static inline void folio_deactivate_or_add_to_reclaim_list(struct folio *folio, bool pageout,
+				struct list_head *folio_list)
+{
+	folio_clear_referenced(folio);
+	folio_test_clear_young(folio);
+
+	if (folio_test_active(folio))
+		folio_set_workingset(folio);
+	if (!pageout)
+		return folio_deactivate(folio);
+	if (folio_isolate_lru(folio)) {
+		if (folio_test_unevictable(folio))
+			folio_putback_lru(folio);
+		else
+			list_add(&folio->lru, folio_list);
+	}
+}
+
 static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
 				unsigned long addr, unsigned long end,
 				struct mm_walk *walk)
@@ -394,19 +412,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
 			tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
 		}
 
-		folio_clear_referenced(folio);
-		folio_test_clear_young(folio);
-		if (folio_test_active(folio))
-			folio_set_workingset(folio);
-		if (pageout) {
-			if (folio_isolate_lru(folio)) {
-				if (folio_test_unevictable(folio))
-					folio_putback_lru(folio);
-				else
-					list_add(&folio->lru, &folio_list);
-			}
-		} else
-			folio_deactivate(folio);
+		folio_deactivate_or_add_to_reclaim_list(folio, pageout, &folio_list);
 huge_unlock:
 		spin_unlock(ptl);
 		if (pageout)
@@ -498,25 +504,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
 			tlb_remove_tlb_entry(tlb, pte, addr);
 		}
 
-		/*
-		 * We are deactivating a folio for accelerating reclaiming.
-		 * VM couldn't reclaim the folio unless we clear PG_young.
-		 * As a side effect, it makes confuse idle-page tracking
-		 * because they will miss recent referenced history.
-		 */
-		folio_clear_referenced(folio);
-		folio_test_clear_young(folio);
-		if (folio_test_active(folio))
-			folio_set_workingset(folio);
-		if (pageout) {
-			if (folio_isolate_lru(folio)) {
-				if (folio_test_unevictable(folio))
-					folio_putback_lru(folio);
-				else
-					list_add(&folio->lru, &folio_list);
-			}
-		} else
-			folio_deactivate(folio);
+		folio_deactivate_or_add_to_reclaim_list(folio, pageout, &folio_list);
 	}
 
 	if (start_pte) {
Patch

diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index 129a3a759976..f894e22da5d6 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -608,6 +608,16 @@  static inline void tlb_flush_p4d_range(struct mmu_gather *tlb,
 		__tlb_remove_tlb_entry(tlb, ptep, address);	\
 	} while (0)
 
+#define tlb_remove_nr_tlb_entry(tlb, ptep, address, nr)			\
+	do {                                                    	\
+		int i;							\
+		tlb_flush_pte_range(tlb, address,			\
+				PAGE_SIZE * nr);			\
+		for (i = 0; i < nr; i++)				\
+			__tlb_remove_tlb_entry(tlb, ptep + i,		\
+					address + i * PAGE_SIZE);	\
+	} while (0)
+
 #define tlb_remove_huge_tlb_entry(h, tlb, ptep, address)	\
 	do {							\
 		unsigned long _sz = huge_page_size(h);		\
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 37fe83b0c358..da0c1cf447e3 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -320,6 +320,42 @@  static inline pgd_t pgdp_get(pgd_t *pgdp)
 }
 #endif
 
+#ifndef pte_range_cont_mapped
+static inline bool pte_range_cont_mapped(unsigned long start_pfn,
+					 pte_t *start_pte,
+					 unsigned long start_addr,
+					 int nr)
+{
+	int i;
+	pte_t pte_val;
+
+	for (i = 0; i < nr; i++) {
+		pte_val = ptep_get(start_pte + i);
+
+		if (pte_none(pte_val))
+			return false;
+
+		if (pte_pfn(pte_val) != (start_pfn + i))
+			return false;
+	}
+
+	return true;
+}
+#endif
+
+#ifndef pte_range_young
+static inline bool pte_range_young(pte_t *start_pte, int nr)
+{
+	int i;
+
+	for (i = 0; i < nr; i++)
+		if (pte_young(ptep_get(start_pte + i)))
+			return true;
+
+	return false;
+}
+#endif
+
 #ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
 static inline int ptep_test_and_clear_young(struct vm_area_struct *vma,
 					    unsigned long address,
@@ -580,6 +616,23 @@  static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm,
 }
 #endif
 
+#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_RANGE_FULL
+static inline pte_t ptep_get_and_clear_range_full(struct mm_struct *mm,
+						  unsigned long start_addr,
+						  pte_t *start_pte,
+						  int nr, int full)
+{
+	int i;
+	pte_t pte;
+
+	pte = ptep_get_and_clear_full(mm, start_addr, start_pte, full);
+
+	for (i = 1; i < nr; i++)
+		ptep_get_and_clear_full(mm, start_addr + i * PAGE_SIZE,
+					start_pte + i, full);
+
+	return pte;
+}
 
 /*
  * If two threads concurrently fault at the same page, the thread that
@@ -995,6 +1048,13 @@  static inline void arch_swap_restore(swp_entry_t entry, struct folio *folio)
 })
 #endif
 
+#ifndef pte_nr_addr_end
+#define pte_nr_addr_end(addr, size, end)				\
+({	unsigned long __boundary = ((addr) + size) & (~(size - 1));	\
+	(__boundary - 1 < (end) - 1)? __boundary: (end);		\
+})
+#endif
+
 /*
  * When walking page tables, we usually want to skip any p?d_none entries;
  * and any p?d_bad entries - reporting the error before resetting to none.
diff --git a/mm/madvise.c b/mm/madvise.c
index 912155a94ed5..262460ac4b2e 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -452,6 +452,54 @@  static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
 		if (folio_test_large(folio)) {
 			int err;
 
+			if (!folio_test_pmd_mappable(folio)) {
+				int nr_pages = folio_nr_pages(folio);
+				unsigned long folio_size = PAGE_SIZE * nr_pages;
+				unsigned long start_addr = ALIGN_DOWN(addr, nr_pages * PAGE_SIZE);;
+				unsigned long start_pfn = page_to_pfn(folio_page(folio, 0));
+				pte_t *start_pte = pte - (addr - start_addr) / PAGE_SIZE;
+				unsigned long next = pte_nr_addr_end(addr, folio_size, end);
+
+				if (!pte_range_cont_mapped(start_pfn, start_pte, start_addr, nr_pages))
+					goto split;
+
+				if (next - addr != folio_size) {
+					goto split;
+				} else {
+					/* Do not interfere with other mappings of this page */
+					if (folio_estimated_sharers(folio) != 1)
+						goto skip;
+
+					VM_BUG_ON(addr != start_addr || pte != start_pte);
+
+					if (pte_range_young(start_pte, nr_pages)) {
+						ptent = ptep_get_and_clear_range_full(mm, start_addr, start_pte,
+										      nr_pages, tlb->fullmm);
+						ptent = pte_mkold(ptent);
+
+						set_ptes(mm, start_addr, start_pte, ptent, nr_pages);
+						tlb_remove_nr_tlb_entry(tlb, start_pte, start_addr, nr_pages);
+					}
+
+					folio_clear_referenced(folio);
+					folio_test_clear_young(folio);
+					if (pageout) {
+						if (folio_isolate_lru(folio)) {
+							if (folio_test_unevictable(folio))
+								folio_putback_lru(folio);
+							else
+								list_add(&folio->lru, &folio_list);
+						}
+					} else
+						folio_deactivate(folio);
+				}
+skip:
+				pte += (next - PAGE_SIZE - (addr & PAGE_MASK))/PAGE_SIZE;
+				addr = next - PAGE_SIZE;
+				continue;
+
+			}
+split:
 			if (folio_estimated_sharers(folio) != 1)
 				break;
 			if (pageout_anon_only_filter && !folio_test_anon(folio))