diff mbox series

[1/1] mm/vmscan: avoid split PMD-mapped THP during shrink_folio_list()

Message ID 20240417141111.77855-1-ioworker0@gmail.com (mailing list archive)
State New
Headers show
Series [1/1] mm/vmscan: avoid split PMD-mapped THP during shrink_folio_list() | expand

Commit Message

Lance Yang April 17, 2024, 2:11 p.m. UTC
When the user no longer requires the pages, they would use madvise(madv_free)
to mark the pages as lazy free. IMO, they would not typically rewrite to the
given range.

At present, a PMD-mapped THP marked as lazyfree during shrink_folio_list()
is unconditionally split, which may be unnecessary. If the THP is exclusively
mapped and clean, and the PMD associated with it is also clean, then we can
attempt to remove the PMD mapping from it. This change will improve the
efficiency of memory reclamation in this case.

On an Intel i5 CPU, reclaiming 1GiB of PMD-mapped THPs using
mem_cgroup_force_empty() results in the following runtimes in seconds
(shorter is better):

--------------------------------------------
|     Old       |      New       |  Change  |
--------------------------------------------
|   0.683426    |    0.049197    |  -92.80% |
--------------------------------------------

Signed-off-by: Lance Yang <ioworker0@gmail.com>
---
 include/linux/huge_mm.h |  1 +
 include/linux/rmap.h    |  1 +
 mm/huge_memory.c        |  2 +-
 mm/rmap.c               | 81 +++++++++++++++++++++++++++++++++++++++++
 mm/vmscan.c             |  7 ++++
 5 files changed, 91 insertions(+), 1 deletion(-)

Comments

David Hildenbrand April 17, 2024, 3:02 p.m. UTC | #1
On 17.04.24 16:11, Lance Yang wrote:
> When the user no longer requires the pages, they would use madvise(madv_free)
> to mark the pages as lazy free. IMO, they would not typically rewrite to the
> given range.
> 
> At present, a PMD-mapped THP marked as lazyfree during shrink_folio_list()
> is unconditionally split, which may be unnecessary. If the THP is exclusively
> mapped and clean, and the PMD associated with it is also clean, then we can
> attempt to remove the PMD mapping from it. This change will improve the
> efficiency of memory reclamation in this case.
> 
> On an Intel i5 CPU, reclaiming 1GiB of PMD-mapped THPs using
> mem_cgroup_force_empty() results in the following runtimes in seconds
> (shorter is better):
> 
> --------------------------------------------
> |     Old       |      New       |  Change  |
> --------------------------------------------
> |   0.683426    |    0.049197    |  -92.80% |
> --------------------------------------------
> 
> Signed-off-by: Lance Yang <ioworker0@gmail.com>
> ---
>   include/linux/huge_mm.h |  1 +
>   include/linux/rmap.h    |  1 +
>   mm/huge_memory.c        |  2 +-
>   mm/rmap.c               | 81 +++++++++++++++++++++++++++++++++++++++++
>   mm/vmscan.c             |  7 ++++
>   5 files changed, 91 insertions(+), 1 deletion(-)
> 
> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> index 7cd07b83a3d0..02a71c05f68a 100644
> --- a/include/linux/huge_mm.h
> +++ b/include/linux/huge_mm.h
> @@ -36,6 +36,7 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
>   int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
>   		    pmd_t *pmd, unsigned long addr, pgprot_t newprot,
>   		    unsigned long cp_flags);
> +inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd);
>   
>   vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write);
>   vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write);
> diff --git a/include/linux/rmap.h b/include/linux/rmap.h
> index 0f906dc6d280..8c2f45713351 100644
> --- a/include/linux/rmap.h
> +++ b/include/linux/rmap.h
> @@ -100,6 +100,7 @@ enum ttu_flags {
>   					 * do a final flush if necessary */
>   	TTU_RMAP_LOCKED		= 0x80,	/* do not grab rmap lock:
>   					 * caller holds it */
> +	TTU_LAZYFREE_THP	= 0x100, /* avoid split PMD-mapped THP */
>   };
>   
>   #ifdef CONFIG_MMU
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 58f2c4745d80..309fba9624c2 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -1801,7 +1801,7 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
>   	return ret;
>   }
>   
> -static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
> +inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
>   {
>   	pgtable_t pgtable;
>   
> diff --git a/mm/rmap.c b/mm/rmap.c
> index 2608c40dffad..4994f9e402d4 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -77,6 +77,7 @@
>   #include <linux/mm_inline.h>
>   
>   #include <asm/tlbflush.h>
> +#include <asm/tlb.h>
>   
>   #define CREATE_TRACE_POINTS
>   #include <trace/events/tlb.h>
> @@ -1606,6 +1607,80 @@ void folio_remove_rmap_pmd(struct folio *folio, struct page *page,
>   #endif
>   }
>   
> +static bool __try_to_unmap_lazyfree_thp(struct vm_area_struct *vma,
> +					     unsigned long address,
> +					     struct folio *folio)
> +{
> +	spinlock_t *ptl;
> +	pmd_t *pmdp, orig_pmd;
> +	struct mmu_notifier_range range;
> +	struct mmu_gather tlb;
> +	struct mm_struct *mm = vma->vm_mm;
> +	struct page *page;
> +	bool ret = false;
> +
> +	VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
> +	VM_WARN_ON_FOLIO(folio_test_swapbacked(folio), folio);
> +	VM_WARN_ON_FOLIO(!folio_test_pmd_mappable(folio), folio);
> +	VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
> +
> +	/*
> +	 * If we encounter a PMD-mapped THP that marked as lazyfree, we
> +	 * will try to unmap it without splitting.
> +	 *
> +	 * The folio exclusively mapped should only have two refs:
> +	 * one from the isolation and one from the rmap.
> +	 */
> +	if (folio_entire_mapcount(folio) != 1 || folio_test_dirty(folio) ||
> +	    folio_ref_count(folio) != 2)

folio_mapcount() == 1 is a bit nicer. Bit I assume you can drop that 
completely and only check the refcount?

> +		return false;
> +
> +	pmdp = mm_find_pmd(mm, address);
> +	if (unlikely(!pmdp))
> +		return false;
> +	if (pmd_dirty(*pmdp))
> +		return false;
> +
> +	tlb_gather_mmu(&tlb, mm);
> +	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
> +				address & HPAGE_PMD_MASK,
> +				(address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE);
> +	mmu_notifier_invalidate_range_start(&range);
> +
> +	ptl = pmd_lock(mm, pmdp);
> +	orig_pmd = *pmdp;
> +	if (unlikely(!pmd_present(orig_pmd) || !pmd_trans_huge(orig_pmd)))
> +		goto out;
> +
> +	page = pmd_page(orig_pmd);
> +	if (unlikely(page_folio(page) != folio))
> +		goto out;
> +
> +	orig_pmd = pmdp_huge_get_and_clear(mm, address, pmdp);
> +	tlb_remove_pmd_tlb_entry(&tlb, pmdp, address);

Until this point, the page could have been pinned (including GUP-fast) 
and we might be in trouble if we drop it.
Matthew Wilcox April 17, 2024, 3:08 p.m. UTC | #2
On Wed, Apr 17, 2024 at 10:11:11PM +0800, Lance Yang wrote:
> When the user no longer requires the pages, they would use madvise(madv_free)
> to mark the pages as lazy free. IMO, they would not typically rewrite to the
> given range.
> 
> At present, a PMD-mapped THP marked as lazyfree during shrink_folio_list()
> is unconditionally split, which may be unnecessary. If the THP is exclusively
> mapped and clean, and the PMD associated with it is also clean, then we can
> attempt to remove the PMD mapping from it. This change will improve the
> efficiency of memory reclamation in this case.
> 
> On an Intel i5 CPU, reclaiming 1GiB of PMD-mapped THPs using
> mem_cgroup_force_empty() results in the following runtimes in seconds
> (shorter is better):
> 
> --------------------------------------------
> |     Old       |      New       |  Change  |
> --------------------------------------------
> |   0.683426    |    0.049197    |  -92.80% |
> --------------------------------------------
> 
> Signed-off-by: Lance Yang <ioworker0@gmail.com>
> ---
>  include/linux/huge_mm.h |  1 +
>  include/linux/rmap.h    |  1 +
>  mm/huge_memory.c        |  2 +-
>  mm/rmap.c               | 81 +++++++++++++++++++++++++++++++++++++++++
>  mm/vmscan.c             |  7 ++++
>  5 files changed, 91 insertions(+), 1 deletion(-)

I'm confused why we need all this extra code.  If we remove a folio
from the pagecache, we can just call truncate_inode_folio() and
unmap_mapping_folio() takes care of all the necessary unmappings.
Why can't you call unmap_mapping_folio() here?
Lance Yang April 18, 2024, 6:40 a.m. UTC | #3
Hey David,

Thanks for taking time to review.

On Wed, Apr 17, 2024 at 11:02 PM David Hildenbrand <david@redhat.com> wrote:
>
> On 17.04.24 16:11, Lance Yang wrote:
> > When the user no longer requires the pages, they would use madvise(madv_free)
> > to mark the pages as lazy free. IMO, they would not typically rewrite to the
> > given range.
> >
> > At present, a PMD-mapped THP marked as lazyfree during shrink_folio_list()
> > is unconditionally split, which may be unnecessary. If the THP is exclusively
> > mapped and clean, and the PMD associated with it is also clean, then we can
> > attempt to remove the PMD mapping from it. This change will improve the
> > efficiency of memory reclamation in this case.
> >
> > On an Intel i5 CPU, reclaiming 1GiB of PMD-mapped THPs using
> > mem_cgroup_force_empty() results in the following runtimes in seconds
> > (shorter is better):
> >
> > --------------------------------------------
> > |     Old       |      New       |  Change  |
> > --------------------------------------------
> > |   0.683426    |    0.049197    |  -92.80% |
> > --------------------------------------------
> >
> > Signed-off-by: Lance Yang <ioworker0@gmail.com>
> > ---
> >   include/linux/huge_mm.h |  1 +
> >   include/linux/rmap.h    |  1 +
> >   mm/huge_memory.c        |  2 +-
> >   mm/rmap.c               | 81 +++++++++++++++++++++++++++++++++++++++++
> >   mm/vmscan.c             |  7 ++++
> >   5 files changed, 91 insertions(+), 1 deletion(-)
> >
> > diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> > index 7cd07b83a3d0..02a71c05f68a 100644
> > --- a/include/linux/huge_mm.h
> > +++ b/include/linux/huge_mm.h
> > @@ -36,6 +36,7 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
> >   int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
> >                   pmd_t *pmd, unsigned long addr, pgprot_t newprot,
> >                   unsigned long cp_flags);
> > +inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd);
> >
> >   vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write);
> >   vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write);
> > diff --git a/include/linux/rmap.h b/include/linux/rmap.h
> > index 0f906dc6d280..8c2f45713351 100644
> > --- a/include/linux/rmap.h
> > +++ b/include/linux/rmap.h
> > @@ -100,6 +100,7 @@ enum ttu_flags {
> >                                        * do a final flush if necessary */
> >       TTU_RMAP_LOCKED         = 0x80, /* do not grab rmap lock:
> >                                        * caller holds it */
> > +     TTU_LAZYFREE_THP        = 0x100, /* avoid split PMD-mapped THP */
> >   };
> >
> >   #ifdef CONFIG_MMU
> > diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> > index 58f2c4745d80..309fba9624c2 100644
> > --- a/mm/huge_memory.c
> > +++ b/mm/huge_memory.c
> > @@ -1801,7 +1801,7 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
> >       return ret;
> >   }
> >
> > -static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
> > +inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
> >   {
> >       pgtable_t pgtable;
> >
> > diff --git a/mm/rmap.c b/mm/rmap.c
> > index 2608c40dffad..4994f9e402d4 100644
> > --- a/mm/rmap.c
> > +++ b/mm/rmap.c
> > @@ -77,6 +77,7 @@
> >   #include <linux/mm_inline.h>
> >
> >   #include <asm/tlbflush.h>
> > +#include <asm/tlb.h>
> >
> >   #define CREATE_TRACE_POINTS
> >   #include <trace/events/tlb.h>
> > @@ -1606,6 +1607,80 @@ void folio_remove_rmap_pmd(struct folio *folio, struct page *page,
> >   #endif
> >   }
> >
> > +static bool __try_to_unmap_lazyfree_thp(struct vm_area_struct *vma,
> > +                                          unsigned long address,
> > +                                          struct folio *folio)
> > +{
> > +     spinlock_t *ptl;
> > +     pmd_t *pmdp, orig_pmd;
> > +     struct mmu_notifier_range range;
> > +     struct mmu_gather tlb;
> > +     struct mm_struct *mm = vma->vm_mm;
> > +     struct page *page;
> > +     bool ret = false;
> > +
> > +     VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
> > +     VM_WARN_ON_FOLIO(folio_test_swapbacked(folio), folio);
> > +     VM_WARN_ON_FOLIO(!folio_test_pmd_mappable(folio), folio);
> > +     VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
> > +
> > +     /*
> > +      * If we encounter a PMD-mapped THP that marked as lazyfree, we
> > +      * will try to unmap it without splitting.
> > +      *
> > +      * The folio exclusively mapped should only have two refs:
> > +      * one from the isolation and one from the rmap.
> > +      */
> > +     if (folio_entire_mapcount(folio) != 1 || folio_test_dirty(folio) ||
> > +         folio_ref_count(folio) != 2)
>
> folio_mapcount() == 1 is a bit nicer. Bit I assume you can drop that
> completely and only check the refcount?

Thanks for your suggestion!

 + if (folio_test_dirty(folio) || folio_ref_count(folio) != 2)

I'm not sure if it's safe without checking the folio_mapcount.

>
> > +             return false;
> > +
> > +     pmdp = mm_find_pmd(mm, address);
> > +     if (unlikely(!pmdp))
> > +             return false;
> > +     if (pmd_dirty(*pmdp))
> > +             return false;
> > +
> > +     tlb_gather_mmu(&tlb, mm);
> > +     mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
> > +                             address & HPAGE_PMD_MASK,
> > +                             (address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE);
> > +     mmu_notifier_invalidate_range_start(&range);
> > +
> > +     ptl = pmd_lock(mm, pmdp);
> > +     orig_pmd = *pmdp;
> > +     if (unlikely(!pmd_present(orig_pmd) || !pmd_trans_huge(orig_pmd)))
> > +             goto out;
> > +
> > +     page = pmd_page(orig_pmd);
> > +     if (unlikely(page_folio(page) != folio))
> > +             goto out;
> > +
> > +     orig_pmd = pmdp_huge_get_and_clear(mm, address, pmdp);
> > +     tlb_remove_pmd_tlb_entry(&tlb, pmdp, address);
>
> Until this point, the page could have been pinned (including GUP-fast)
> and we might be in trouble if we drop it.

Thanks for pointing that out!

+ if (pmd_dirty(orig_pmd) || folio_maybe_dma_pinned(folio) ||
folio_ref_count(folio) != 2) {
+     set_pmd_at(mm, address, pmdp, orig_pmd);
+ } else {

Could I check the folio->_pincount using folio_maybe_dma_pinned() and
then re-check the refcount here? Or should I just re-check the refcount?

IIUC, this folio has been already unlinked from the PMD and the process
cannot get an additional pin on this folio.

Thanks again for the review!
Lance

>
> --
> Cheers,
>
> David / dhildenb
>
Lance Yang April 20, 2024, 4:59 a.m. UTC | #4
Hey Matthew,

Thanks for taking time to review!

On Wed, Apr 17, 2024 at 11:09 PM Matthew Wilcox <willy@infradead.org> wrote:
>
> On Wed, Apr 17, 2024 at 10:11:11PM +0800, Lance Yang wrote:
> > When the user no longer requires the pages, they would use madvise(madv_free)
> > to mark the pages as lazy free. IMO, they would not typically rewrite to the
> > given range.
> >
> > At present, a PMD-mapped THP marked as lazyfree during shrink_folio_list()
> > is unconditionally split, which may be unnecessary. If the THP is exclusively
> > mapped and clean, and the PMD associated with it is also clean, then we can
> > attempt to remove the PMD mapping from it. This change will improve the
> > efficiency of memory reclamation in this case.
> >
> > On an Intel i5 CPU, reclaiming 1GiB of PMD-mapped THPs using
> > mem_cgroup_force_empty() results in the following runtimes in seconds
> > (shorter is better):
> >
> > --------------------------------------------
> > |     Old       |      New       |  Change  |
> > --------------------------------------------
> > |   0.683426    |    0.049197    |  -92.80% |
> > --------------------------------------------
> >
> > Signed-off-by: Lance Yang <ioworker0@gmail.com>
> > ---
> >  include/linux/huge_mm.h |  1 +
> >  include/linux/rmap.h    |  1 +
> >  mm/huge_memory.c        |  2 +-
> >  mm/rmap.c               | 81 +++++++++++++++++++++++++++++++++++++++++
> >  mm/vmscan.c             |  7 ++++
> >  5 files changed, 91 insertions(+), 1 deletion(-)
>
> I'm confused why we need all this extra code.  If we remove a folio

Thanks for pointing that out!

I've added a lot of extra code to rmap.c, and we don't need it
for file pages - sorry. I'll reconsider where to place this code.

> from the pagecache, we can just call truncate_inode_folio() and
> unmap_mapping_folio() takes care of all the necessary unmappings.
> Why can't you call unmap_mapping_folio() here?

Thanks for your suggestion.

But this change only avoids the splitting of *anon* large folios
(PMD-mapped THPs) that are marked as lazyfree during
shrink_folio_list().

IIUC, in some cases, we cannot unmap the THP marked as lazyfree
here, such as when it's not exclusively mapped, dirty, pinned, etc.
In such situations, we still need to return to try_to_unmap_one(), and
then call split_huge_pmd_address() to split it.

Thanks again for the review.
Lance
Lance Yang April 20, 2024, 3:04 p.m. UTC | #5
On Sat, Apr 20, 2024 at 12:59 PM Lance Yang <ioworker0@gmail.com> wrote:
>
> Hey Matthew,
>
> Thanks for taking time to review!
>
> On Wed, Apr 17, 2024 at 11:09 PM Matthew Wilcox <willy@infradead.org> wrote:
> >
> > On Wed, Apr 17, 2024 at 10:11:11PM +0800, Lance Yang wrote:
> > > When the user no longer requires the pages, they would use madvise(madv_free)
> > > to mark the pages as lazy free. IMO, they would not typically rewrite to the
> > > given range.
> > >
> > > At present, a PMD-mapped THP marked as lazyfree during shrink_folio_list()
> > > is unconditionally split, which may be unnecessary. If the THP is exclusively
> > > mapped and clean, and the PMD associated with it is also clean, then we can
> > > attempt to remove the PMD mapping from it. This change will improve the
> > > efficiency of memory reclamation in this case.
> > >
> > > On an Intel i5 CPU, reclaiming 1GiB of PMD-mapped THPs using
> > > mem_cgroup_force_empty() results in the following runtimes in seconds
> > > (shorter is better):
> > >
> > > --------------------------------------------
> > > |     Old       |      New       |  Change  |
> > > --------------------------------------------
> > > |   0.683426    |    0.049197    |  -92.80% |
> > > --------------------------------------------
> > >
> > > Signed-off-by: Lance Yang <ioworker0@gmail.com>
> > > ---
> > >  include/linux/huge_mm.h |  1 +
> > >  include/linux/rmap.h    |  1 +
> > >  mm/huge_memory.c        |  2 +-
> > >  mm/rmap.c               | 81 +++++++++++++++++++++++++++++++++++++++++
> > >  mm/vmscan.c             |  7 ++++
> > >  5 files changed, 91 insertions(+), 1 deletion(-)
> >
> > I'm confused why we need all this extra code.  If we remove a folio
>
> Thanks for pointing that out!
>
> I've added a lot of extra code to rmap.c, and we don't need it
> for file pages - sorry. I'll reconsider where to place this code.
>
> > from the pagecache, we can just call truncate_inode_folio() and
> > unmap_mapping_folio() takes care of all the necessary unmappings.
> > Why can't you call unmap_mapping_folio() here?
>
> Thanks for your suggestion.
>
> But this change only avoids the splitting of *anon* large folios
> (PMD-mapped THPs) that are marked as lazyfree during
> shrink_folio_list().
>
> IIUC, in some cases, we cannot unmap the THP marked as lazyfree
> here, such as when it's not exclusively mapped, dirty, pinned, etc.

I’d like to make a correction.

IMO, we can unmap the THP that is not exclusively mapped, but
ensuring folio_ref_count() equals folio_mapcount() +1.

Thanks,
Lance

> In such situations, we still need to return to try_to_unmap_one(), and
> then call split_huge_pmd_address() to split it.
>
> Thanks again for the review.
> Lance
David Hildenbrand April 20, 2024, 4:31 p.m. UTC | #6
On 20.04.24 17:04, Lance Yang wrote:
> On Sat, Apr 20, 2024 at 12:59 PM Lance Yang <ioworker0@gmail.com> wrote:
>>
>> Hey Matthew,
>>
>> Thanks for taking time to review!
>>
>> On Wed, Apr 17, 2024 at 11:09 PM Matthew Wilcox <willy@infradead.org> wrote:
>>>
>>> On Wed, Apr 17, 2024 at 10:11:11PM +0800, Lance Yang wrote:
>>>> When the user no longer requires the pages, they would use madvise(madv_free)
>>>> to mark the pages as lazy free. IMO, they would not typically rewrite to the
>>>> given range.
>>>>
>>>> At present, a PMD-mapped THP marked as lazyfree during shrink_folio_list()
>>>> is unconditionally split, which may be unnecessary. If the THP is exclusively
>>>> mapped and clean, and the PMD associated with it is also clean, then we can
>>>> attempt to remove the PMD mapping from it. This change will improve the
>>>> efficiency of memory reclamation in this case.
>>>>
>>>> On an Intel i5 CPU, reclaiming 1GiB of PMD-mapped THPs using
>>>> mem_cgroup_force_empty() results in the following runtimes in seconds
>>>> (shorter is better):
>>>>
>>>> --------------------------------------------
>>>> |     Old       |      New       |  Change  |
>>>> --------------------------------------------
>>>> |   0.683426    |    0.049197    |  -92.80% |
>>>> --------------------------------------------
>>>>
>>>> Signed-off-by: Lance Yang <ioworker0@gmail.com>
>>>> ---
>>>>   include/linux/huge_mm.h |  1 +
>>>>   include/linux/rmap.h    |  1 +
>>>>   mm/huge_memory.c        |  2 +-
>>>>   mm/rmap.c               | 81 +++++++++++++++++++++++++++++++++++++++++
>>>>   mm/vmscan.c             |  7 ++++
>>>>   5 files changed, 91 insertions(+), 1 deletion(-)
>>>
>>> I'm confused why we need all this extra code.  If we remove a folio
>>
>> Thanks for pointing that out!
>>
>> I've added a lot of extra code to rmap.c, and we don't need it
>> for file pages - sorry. I'll reconsider where to place this code.
>>
>>> from the pagecache, we can just call truncate_inode_folio() and
>>> unmap_mapping_folio() takes care of all the necessary unmappings.
>>> Why can't you call unmap_mapping_folio() here?
>>
>> Thanks for your suggestion.
>>
>> But this change only avoids the splitting of *anon* large folios
>> (PMD-mapped THPs) that are marked as lazyfree during
>> shrink_folio_list().
>>
>> IIUC, in some cases, we cannot unmap the THP marked as lazyfree
>> here, such as when it's not exclusively mapped, dirty, pinned, etc.
> 
> I’d like to make a correction.
> 
> IMO, we can unmap the THP that is not exclusively mapped, but
> ensuring folio_ref_count() equals folio_mapcount() +1.

You must follow the exact same logic as in try_to_unmap_one() I guess.

That is, unmap the page, syncing against concurrent GUP-fast. Then, 
check mapcount vs. refcount. If there are unexpected references, remap 
the page (set_pte_at).
diff mbox series

Patch

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 7cd07b83a3d0..02a71c05f68a 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -36,6 +36,7 @@  bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
 int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 		    pmd_t *pmd, unsigned long addr, pgprot_t newprot,
 		    unsigned long cp_flags);
+inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd);
 
 vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write);
 vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write);
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 0f906dc6d280..8c2f45713351 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -100,6 +100,7 @@  enum ttu_flags {
 					 * do a final flush if necessary */
 	TTU_RMAP_LOCKED		= 0x80,	/* do not grab rmap lock:
 					 * caller holds it */
+	TTU_LAZYFREE_THP	= 0x100, /* avoid split PMD-mapped THP */
 };
 
 #ifdef CONFIG_MMU
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 58f2c4745d80..309fba9624c2 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1801,7 +1801,7 @@  bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 	return ret;
 }
 
-static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
+inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
 {
 	pgtable_t pgtable;
 
diff --git a/mm/rmap.c b/mm/rmap.c
index 2608c40dffad..4994f9e402d4 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -77,6 +77,7 @@ 
 #include <linux/mm_inline.h>
 
 #include <asm/tlbflush.h>
+#include <asm/tlb.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/tlb.h>
@@ -1606,6 +1607,80 @@  void folio_remove_rmap_pmd(struct folio *folio, struct page *page,
 #endif
 }
 
+static bool __try_to_unmap_lazyfree_thp(struct vm_area_struct *vma,
+					     unsigned long address,
+					     struct folio *folio)
+{
+	spinlock_t *ptl;
+	pmd_t *pmdp, orig_pmd;
+	struct mmu_notifier_range range;
+	struct mmu_gather tlb;
+	struct mm_struct *mm = vma->vm_mm;
+	struct page *page;
+	bool ret = false;
+
+	VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
+	VM_WARN_ON_FOLIO(folio_test_swapbacked(folio), folio);
+	VM_WARN_ON_FOLIO(!folio_test_pmd_mappable(folio), folio);
+	VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
+
+	/*
+	 * If we encounter a PMD-mapped THP that marked as lazyfree, we
+	 * will try to unmap it without splitting.
+	 *
+	 * The folio exclusively mapped should only have two refs:
+	 * one from the isolation and one from the rmap.
+	 */
+	if (folio_entire_mapcount(folio) != 1 || folio_test_dirty(folio) ||
+	    folio_ref_count(folio) != 2)
+		return false;
+
+	pmdp = mm_find_pmd(mm, address);
+	if (unlikely(!pmdp))
+		return false;
+	if (pmd_dirty(*pmdp))
+		return false;
+
+	tlb_gather_mmu(&tlb, mm);
+	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
+				address & HPAGE_PMD_MASK,
+				(address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE);
+	mmu_notifier_invalidate_range_start(&range);
+
+	ptl = pmd_lock(mm, pmdp);
+	orig_pmd = *pmdp;
+	if (unlikely(!pmd_present(orig_pmd) || !pmd_trans_huge(orig_pmd)))
+		goto out;
+
+	page = pmd_page(orig_pmd);
+	if (unlikely(page_folio(page) != folio))
+		goto out;
+
+	orig_pmd = pmdp_huge_get_and_clear(mm, address, pmdp);
+	tlb_remove_pmd_tlb_entry(&tlb, pmdp, address);
+	/*
+	 * There is a race between the first check of the dirty bit
+	 * for the PMD and the TLB entry flush. If the PMD is re-dirty
+	 * at this point, we will return to try_to_unmap_one() to call
+	 * split_huge_pmd_address() to split it.
+	 */
+	if (pmd_dirty(orig_pmd))
+		set_pmd_at(mm, address, pmdp, orig_pmd);
+	else {
+		folio_remove_rmap_pmd(folio, page, vma);
+		zap_deposited_table(mm, pmdp);
+		add_mm_counter(mm, MM_ANONPAGES, -HPAGE_PMD_NR);
+		folio_put(folio);
+		ret = true;
+	}
+
+out:
+	spin_unlock(ptl);
+	mmu_notifier_invalidate_range_end(&range);
+
+	return ret;
+}
+
 /*
  * @arg: enum ttu_flags will be passed to this argument
  */
@@ -1631,6 +1706,12 @@  static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 	if (flags & TTU_SYNC)
 		pvmw.flags = PVMW_SYNC;
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	if (flags & TTU_LAZYFREE_THP)
+		if (__try_to_unmap_lazyfree_thp(vma, address, folio))
+			return true;
+#endif
+
 	if (flags & TTU_SPLIT_HUGE_PMD)
 		split_huge_pmd_address(vma, address, false, folio);
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 49bd94423961..2358b1cff8bf 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1277,6 +1277,13 @@  static unsigned int shrink_folio_list(struct list_head *folio_list,
 
 			if (folio_test_pmd_mappable(folio))
 				flags |= TTU_SPLIT_HUGE_PMD;
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+			if (folio_test_anon(folio) && !was_swapbacked &&
+			    flags & TTU_SPLIT_HUGE_PMD)
+				flags |= TTU_LAZYFREE_THP;
+#endif
+
 			/*
 			 * Without TTU_SYNC, try_to_unmap will only begin to
 			 * hold PTL from the first present PTE within a large