diff mbox series

[5/7] khugepaged: Allow to collapse PTE-mapped compound pages

Message ID 20200327170601.18563-6-kirill.shutemov@linux.intel.com
State New, archived
Headers show
Series thp/khugepaged improvements and CoW semantics | expand

Commit Message

Kirill A. Shutemov March 27, 2020, 5:05 p.m. UTC
We can collapse PTE-mapped compound pages. We only need to avoid
handling them more than once: lock/unlock the page only once if it's present
in the PMD range multiple times, as it is handled at the compound level. The
same goes for LRU isolation and putback.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
---
 mm/khugepaged.c | 41 +++++++++++++++++++++++++++++++----------
 1 file changed, 31 insertions(+), 10 deletions(-)

Comments

Yang Shi March 27, 2020, 6:53 p.m. UTC | #1
On Fri, Mar 27, 2020 at 10:06 AM Kirill A. Shutemov
<kirill@shutemov.name> wrote:
>
> We can collapse PTE-mapped compound pages. We only need to avoid
> handling them more than once: lock/unlock page only once if it's present
> in the PMD range multiple times as it handled on compound level. The
> same goes for LRU isolation and putpack.
>
> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> ---
>  mm/khugepaged.c | 41 +++++++++++++++++++++++++++++++----------
>  1 file changed, 31 insertions(+), 10 deletions(-)
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index b47edfe57f7b..c8c2c463095c 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -515,6 +515,17 @@ void __khugepaged_exit(struct mm_struct *mm)
>
>  static void release_pte_page(struct page *page)
>  {
> +       /*
> +        * We need to unlock and put compound page on LRU only once.
> +        * The rest of the pages have to be locked and not on LRU here.
> +        */
> +       VM_BUG_ON_PAGE(!PageCompound(page) &&
> +                       (!PageLocked(page) && PageLRU(page)), page);
> +
> +       if (!PageLocked(page))
> +               return;
> +
> +       page = compound_head(page);
>         dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page));

We need to count in the number of base pages. The same counter is
modified by vmscan in base page units. We also need to modify the inc path.

>         unlock_page(page);
>         putback_lru_page(page);
> @@ -537,6 +548,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>         pte_t *_pte;
>         int none_or_zero = 0, result = 0, referenced = 0;
>         bool writable = false;
> +       LIST_HEAD(compound_pagelist);
>
>         for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
>              _pte++, address += PAGE_SIZE) {
> @@ -561,13 +573,23 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>                         goto out;
>                 }
>
> -               /* TODO: teach khugepaged to collapse THP mapped with pte */
> +               VM_BUG_ON_PAGE(!PageAnon(page), page);
> +
>                 if (PageCompound(page)) {
> -                       result = SCAN_PAGE_COMPOUND;
> -                       goto out;
> -               }
> +                       struct page *p;
> +                       page = compound_head(page);
>
> -               VM_BUG_ON_PAGE(!PageAnon(page), page);
> +                       /*
> +                        * Check if we have dealt with the compount page

s/compount/compound

> +                        * already
> +                        */
> +                       list_for_each_entry(p, &compound_pagelist, lru) {
> +                               if (page ==  p)
> +                                       break;
> +                       }
> +                       if (page ==  p)
> +                               continue;

I don't quite understand why we need the above check. My understanding
is that when we scan the ptes, once the first PTE-mapped subpage is found,
the THP would be added into compound_pagelist; later loop iterations
would then find the same THP on the list and just break the loop. Did
I miss anything?

> +               }
>
>                 /*
>                  * We can do it before isolate_lru_page because the
> @@ -640,6 +662,9 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>                     page_is_young(page) || PageReferenced(page) ||
>                     mmu_notifier_test_young(vma->vm_mm, address))
>                         referenced++;
> +
> +               if (PageCompound(page))
> +                       list_add_tail(&page->lru, &compound_pagelist);
>         }
>         if (likely(writable)) {
>                 if (likely(referenced)) {
> @@ -1185,11 +1210,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
>                         goto out_unmap;
>                 }
>
> -               /* TODO: teach khugepaged to collapse THP mapped with pte */
> -               if (PageCompound(page)) {
> -                       result = SCAN_PAGE_COMPOUND;
> -                       goto out_unmap;
> -               }
> +               page = compound_head(page);
>
>                 /*
>                  * Record which node the original page is from and save this
> --
> 2.26.0
>
>
Zi Yan March 27, 2020, 6:55 p.m. UTC | #2
On 27 Mar 2020, at 13:05, Kirill A. Shutemov wrote:

> We can collapse PTE-mapped compound pages. We only need to avoid
> handling them more than once: lock/unlock page only once if it's present
> in the PMD range multiple times as it handled on compound level. The
> same goes for LRU isolation and putpack.

s/putpack/putback/

>
> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> ---
>  mm/khugepaged.c | 41 +++++++++++++++++++++++++++++++----------
>  1 file changed, 31 insertions(+), 10 deletions(-)
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index b47edfe57f7b..c8c2c463095c 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -515,6 +515,17 @@ void __khugepaged_exit(struct mm_struct *mm)
>
>  static void release_pte_page(struct page *page)
>  {
> +	/*
> +	 * We need to unlock and put compound page on LRU only once.
> +	 * The rest of the pages have to be locked and not on LRU here.
> +	 */
> +	VM_BUG_ON_PAGE(!PageCompound(page) &&
> +			(!PageLocked(page) && PageLRU(page)), page);
It only checks base pages.

Add

VM_BUG_ON_PAGE(PageCompound(page) && PageLocked(page) && PageLRU(page), page);

to check for compound pages.

> +
> +	if (!PageLocked(page))
> +		return;
> +
> +	page = compound_head(page);
>  	dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page));
>  	unlock_page(page);
>  	putback_lru_page(page);
> @@ -537,6 +548,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>  	pte_t *_pte;
>  	int none_or_zero = 0, result = 0, referenced = 0;
>  	bool writable = false;
> +	LIST_HEAD(compound_pagelist);
>
>  	for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
>  	     _pte++, address += PAGE_SIZE) {
> @@ -561,13 +573,23 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>  			goto out;
>  		}
>
> -		/* TODO: teach khugepaged to collapse THP mapped with pte */
> +		VM_BUG_ON_PAGE(!PageAnon(page), page);
> +
>  		if (PageCompound(page)) {
> -			result = SCAN_PAGE_COMPOUND;
> -			goto out;
> -		}
> +			struct page *p;
> +			page = compound_head(page);
>
> -		VM_BUG_ON_PAGE(!PageAnon(page), page);
> +			/*
> +			 * Check if we have dealt with the compount page

s/compount/compound/

> +			 * already
> +			 */
> +			list_for_each_entry(p, &compound_pagelist, lru) {
> +				if (page ==  p)
> +					break;
> +			}
> +			if (page ==  p)
> +				continue;
> +		}
>
>  		/*
>  		 * We can do it before isolate_lru_page because the
> @@ -640,6 +662,9 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>  		    page_is_young(page) || PageReferenced(page) ||
>  		    mmu_notifier_test_young(vma->vm_mm, address))
>  			referenced++;
> +
> +		if (PageCompound(page))
> +			list_add_tail(&page->lru, &compound_pagelist);
>  	}
>  	if (likely(writable)) {
>  		if (likely(referenced)) {

Do we need a list here? There should be at most one compound page we will see here, right?

If a compound page is seen here, can we bail out the loop early? I guess not,
because we can have a partially mapped compound page at the beginning or the end of a VMA, right?

> @@ -1185,11 +1210,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
>  			goto out_unmap;
>  		}
>
> -		/* TODO: teach khugepaged to collapse THP mapped with pte */
> -		if (PageCompound(page)) {
> -			result = SCAN_PAGE_COMPOUND;
> -			goto out_unmap;
> -		}
> +		page = compound_head(page);
>
>  		/*
>  		 * Record which node the original page is from and save this
> -- 
> 2.26.0


—
Best Regards,
Yan Zi
Yang Shi March 27, 2020, 8:45 p.m. UTC | #3
On Fri, Mar 27, 2020 at 10:06 AM Kirill A. Shutemov
<kirill@shutemov.name> wrote:
>
> We can collapse PTE-mapped compound pages. We only need to avoid
> handling them more than once: lock/unlock page only once if it's present
> in the PMD range multiple times as it handled on compound level. The
> same goes for LRU isolation and putpack.
>
> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> ---
>  mm/khugepaged.c | 41 +++++++++++++++++++++++++++++++----------
>  1 file changed, 31 insertions(+), 10 deletions(-)
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index b47edfe57f7b..c8c2c463095c 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -515,6 +515,17 @@ void __khugepaged_exit(struct mm_struct *mm)
>
>  static void release_pte_page(struct page *page)
>  {
> +       /*
> +        * We need to unlock and put compound page on LRU only once.
> +        * The rest of the pages have to be locked and not on LRU here.
> +        */
> +       VM_BUG_ON_PAGE(!PageCompound(page) &&
> +                       (!PageLocked(page) && PageLRU(page)), page);
> +
> +       if (!PageLocked(page))
> +               return;
> +
> +       page = compound_head(page);
>         dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page));
>         unlock_page(page);
>         putback_lru_page(page);

BTW, wouldn't this unlock the whole THP and put it back to LRU? Then
we may copy the following PTE-mapped pages with the page unlocked and on
the LRU. I don't see a critical problem, just that the pages might be on and
off the LRU due to others, e.g. vmscan, compaction, migration, etc. But no one
could take the page away since try_to_unmap() would fail, so it is not very
productive.


> @@ -537,6 +548,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>         pte_t *_pte;
>         int none_or_zero = 0, result = 0, referenced = 0;
>         bool writable = false;
> +       LIST_HEAD(compound_pagelist);
>
>         for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
>              _pte++, address += PAGE_SIZE) {
> @@ -561,13 +573,23 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>                         goto out;
>                 }
>
> -               /* TODO: teach khugepaged to collapse THP mapped with pte */
> +               VM_BUG_ON_PAGE(!PageAnon(page), page);
> +
>                 if (PageCompound(page)) {
> -                       result = SCAN_PAGE_COMPOUND;
> -                       goto out;
> -               }
> +                       struct page *p;
> +                       page = compound_head(page);
>
> -               VM_BUG_ON_PAGE(!PageAnon(page), page);
> +                       /*
> +                        * Check if we have dealt with the compount page
> +                        * already
> +                        */
> +                       list_for_each_entry(p, &compound_pagelist, lru) {
> +                               if (page ==  p)
> +                                       break;
> +                       }
> +                       if (page ==  p)
> +                               continue;
> +               }
>
>                 /*
>                  * We can do it before isolate_lru_page because the
> @@ -640,6 +662,9 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>                     page_is_young(page) || PageReferenced(page) ||
>                     mmu_notifier_test_young(vma->vm_mm, address))
>                         referenced++;
> +
> +               if (PageCompound(page))
> +                       list_add_tail(&page->lru, &compound_pagelist);
>         }
>         if (likely(writable)) {
>                 if (likely(referenced)) {
> @@ -1185,11 +1210,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
>                         goto out_unmap;
>                 }
>
> -               /* TODO: teach khugepaged to collapse THP mapped with pte */
> -               if (PageCompound(page)) {
> -                       result = SCAN_PAGE_COMPOUND;
> -                       goto out_unmap;
> -               }
> +               page = compound_head(page);
>
>                 /*
>                  * Record which node the original page is from and save this
> --
> 2.26.0
>
>
Kirill A. Shutemov March 28, 2020, 12:34 a.m. UTC | #4
On Fri, Mar 27, 2020 at 11:53:57AM -0700, Yang Shi wrote:
> On Fri, Mar 27, 2020 at 10:06 AM Kirill A. Shutemov
> <kirill@shutemov.name> wrote:
> >
> > We can collapse PTE-mapped compound pages. We only need to avoid
> > handling them more than once: lock/unlock page only once if it's present
> > in the PMD range multiple times as it handled on compound level. The
> > same goes for LRU isolation and putpack.
> >
> > Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> > ---
> >  mm/khugepaged.c | 41 +++++++++++++++++++++++++++++++----------
> >  1 file changed, 31 insertions(+), 10 deletions(-)
> >
> > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > index b47edfe57f7b..c8c2c463095c 100644
> > --- a/mm/khugepaged.c
> > +++ b/mm/khugepaged.c
> > @@ -515,6 +515,17 @@ void __khugepaged_exit(struct mm_struct *mm)
> >
> >  static void release_pte_page(struct page *page)
> >  {
> > +       /*
> > +        * We need to unlock and put compound page on LRU only once.
> > +        * The rest of the pages have to be locked and not on LRU here.
> > +        */
> > +       VM_BUG_ON_PAGE(!PageCompound(page) &&
> > +                       (!PageLocked(page) && PageLRU(page)), page);
> > +
> > +       if (!PageLocked(page))
> > +               return;
> > +
> > +       page = compound_head(page);
> >         dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page));
> 
> We need count in the number of base pages. The same counter is
> modified by vmscan in base page unit.

Is it though? Where?

> Also need modify the inc path.

Done already.

> >         unlock_page(page);
> >         putback_lru_page(page);
> > @@ -537,6 +548,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> >         pte_t *_pte;
> >         int none_or_zero = 0, result = 0, referenced = 0;
> >         bool writable = false;
> > +       LIST_HEAD(compound_pagelist);
> >
> >         for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
> >              _pte++, address += PAGE_SIZE) {
> > @@ -561,13 +573,23 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> >                         goto out;
> >                 }
> >
> > -               /* TODO: teach khugepaged to collapse THP mapped with pte */
> > +               VM_BUG_ON_PAGE(!PageAnon(page), page);
> > +
> >                 if (PageCompound(page)) {
> > -                       result = SCAN_PAGE_COMPOUND;
> > -                       goto out;
> > -               }
> > +                       struct page *p;
> > +                       page = compound_head(page);
> >
> > -               VM_BUG_ON_PAGE(!PageAnon(page), page);
> > +                       /*
> > +                        * Check if we have dealt with the compount page
> 
> s/compount/compound

Thanks.

> > +                        * already
> > +                        */
> > +                       list_for_each_entry(p, &compound_pagelist, lru) {
> > +                               if (page ==  p)
> > +                                       break;
> > +                       }
> > +                       if (page ==  p)
> > +                               continue;
> 
> I don't quite understand why we need the above check. My understanding
> is when we scan the ptes, once the first PTE mapped subpage is found,
> then the THP would be added into compound_pagelist, then the later
> loop would find the same THP on the list then just break the loop. Did
> I miss anything?

We skip the iteration and look at the next pte. We've already isolated the
page. Nothing to do here.

> > +               }
> >
> >                 /*
> >                  * We can do it before isolate_lru_page because the
> > @@ -640,6 +662,9 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> >                     page_is_young(page) || PageReferenced(page) ||
> >                     mmu_notifier_test_young(vma->vm_mm, address))
> >                         referenced++;
> > +
> > +               if (PageCompound(page))
> > +                       list_add_tail(&page->lru, &compound_pagelist);
> >         }
> >         if (likely(writable)) {
> >                 if (likely(referenced)) {
> > @@ -1185,11 +1210,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
> >                         goto out_unmap;
> >                 }
> >
> > -               /* TODO: teach khugepaged to collapse THP mapped with pte */
> > -               if (PageCompound(page)) {
> > -                       result = SCAN_PAGE_COMPOUND;
> > -                       goto out_unmap;
> > -               }
> > +               page = compound_head(page);
> >
> >                 /*
> >                  * Record which node the original page is from and save this
> > --
> > 2.26.0
> >
> >
Kirill A. Shutemov March 28, 2020, 12:39 a.m. UTC | #5
On Fri, Mar 27, 2020 at 02:55:06PM -0400, Zi Yan wrote:
> On 27 Mar 2020, at 13:05, Kirill A. Shutemov wrote:
> 
> > We can collapse PTE-mapped compound pages. We only need to avoid
> > handling them more than once: lock/unlock page only once if it's present
> > in the PMD range multiple times as it handled on compound level. The
> > same goes for LRU isolation and putpack.
> 
> s/putpack/putback/
> 
> >
> > Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> > ---
> >  mm/khugepaged.c | 41 +++++++++++++++++++++++++++++++----------
> >  1 file changed, 31 insertions(+), 10 deletions(-)
> >
> > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > index b47edfe57f7b..c8c2c463095c 100644
> > --- a/mm/khugepaged.c
> > +++ b/mm/khugepaged.c
> > @@ -515,6 +515,17 @@ void __khugepaged_exit(struct mm_struct *mm)
> >
> >  static void release_pte_page(struct page *page)
> >  {
> > +	/*
> > +	 * We need to unlock and put compound page on LRU only once.
> > +	 * The rest of the pages have to be locked and not on LRU here.
> > +	 */
> > +	VM_BUG_ON_PAGE(!PageCompound(page) &&
> > +			(!PageLocked(page) && PageLRU(page)), page);
> It only checks base pages.

That's the point.

> Add
> 
> VM_BUG_ON_PAGE(PageCompound(page) && PageLocked(page) && PageLRU(page), page);
> 
> to check for compound pages.

The compound page may be locked here if the function is called for the first
time for the page, and not locked after that (because we've unlocked it when we
saw it the first time). The same goes for LRU.

> > +
> > +	if (!PageLocked(page))
> > +		return;
> > +
> > +	page = compound_head(page);
> >  	dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page));
> >  	unlock_page(page);
> >  	putback_lru_page(page);
> > @@ -537,6 +548,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> >  	pte_t *_pte;
> >  	int none_or_zero = 0, result = 0, referenced = 0;
> >  	bool writable = false;
> > +	LIST_HEAD(compound_pagelist);
> >
> >  	for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
> >  	     _pte++, address += PAGE_SIZE) {
> > @@ -561,13 +573,23 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> >  			goto out;
> >  		}
> >
> > -		/* TODO: teach khugepaged to collapse THP mapped with pte */
> > +		VM_BUG_ON_PAGE(!PageAnon(page), page);
> > +
> >  		if (PageCompound(page)) {
> > -			result = SCAN_PAGE_COMPOUND;
> > -			goto out;
> > -		}
> > +			struct page *p;
> > +			page = compound_head(page);
> >
> > -		VM_BUG_ON_PAGE(!PageAnon(page), page);
> > +			/*
> > +			 * Check if we have dealt with the compount page
> 
> s/compount/compound/
> 

Thanks.

> > +			 * already
> > +			 */
> > +			list_for_each_entry(p, &compound_pagelist, lru) {
> > +				if (page ==  p)
> > +					break;
> > +			}
> > +			if (page ==  p)
> > +				continue;
> > +		}
> >
> >  		/*
> >  		 * We can do it before isolate_lru_page because the
> > @@ -640,6 +662,9 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> >  		    page_is_young(page) || PageReferenced(page) ||
> >  		    mmu_notifier_test_young(vma->vm_mm, address))
> >  			referenced++;
> > +
> > +		if (PageCompound(page))
> > +			list_add_tail(&page->lru, &compound_pagelist);
> >  	}
> >  	if (likely(writable)) {
> >  		if (likely(referenced)) {
> 
> Do we need a list here? There should be at most one compound page we will see here, right?

Um? It's outside the pte loop. We get here once per PMD range.

'page' argument to trace_mm_collapse_huge_page_isolate() is misleading:
it's just the last page handled in the loop.

> 
> If a compound page is seen here, can we bail out the loop early? I guess not,
> because we can a partially mapped compound page at the beginning or the end of a VMA, right?
> 
> > @@ -1185,11 +1210,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
> >  			goto out_unmap;
> >  		}
> >
> > -		/* TODO: teach khugepaged to collapse THP mapped with pte */
> > -		if (PageCompound(page)) {
> > -			result = SCAN_PAGE_COMPOUND;
> > -			goto out_unmap;
> > -		}
> > +		page = compound_head(page);
> >
> >  		/*
> >  		 * Record which node the original page is from and save this
> > -- 
> > 2.26.0
> 
> 
> —
> Best Regards,
> Yan Zi
Kirill A. Shutemov March 28, 2020, 12:40 a.m. UTC | #6
On Fri, Mar 27, 2020 at 01:45:55PM -0700, Yang Shi wrote:
> On Fri, Mar 27, 2020 at 10:06 AM Kirill A. Shutemov
> <kirill@shutemov.name> wrote:
> >
> > We can collapse PTE-mapped compound pages. We only need to avoid
> > handling them more than once: lock/unlock page only once if it's present
> > in the PMD range multiple times as it handled on compound level. The
> > same goes for LRU isolation and putpack.
> >
> > Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> > ---
> >  mm/khugepaged.c | 41 +++++++++++++++++++++++++++++++----------
> >  1 file changed, 31 insertions(+), 10 deletions(-)
> >
> > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > index b47edfe57f7b..c8c2c463095c 100644
> > --- a/mm/khugepaged.c
> > +++ b/mm/khugepaged.c
> > @@ -515,6 +515,17 @@ void __khugepaged_exit(struct mm_struct *mm)
> >
> >  static void release_pte_page(struct page *page)
> >  {
> > +       /*
> > +        * We need to unlock and put compound page on LRU only once.
> > +        * The rest of the pages have to be locked and not on LRU here.
> > +        */
> > +       VM_BUG_ON_PAGE(!PageCompound(page) &&
> > +                       (!PageLocked(page) && PageLRU(page)), page);
> > +
> > +       if (!PageLocked(page))
> > +               return;
> > +
> > +       page = compound_head(page);
> >         dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page));
> >         unlock_page(page);
> >         putback_lru_page(page);
> 
> BTW, wouldn't this unlock the whole THP and put it back to LRU?

It is the intention.

> Then we may copy the following PTE mapped pages with page unlocked and
> on LRU. I don't see critical problem, just the pages might be on and off
> LRU by others, i.e. vmscan, compaction, migration, etc. But no one could
> take the page away since try_to_unmap() would fail, but not very
> productive.
> 
> 
> > @@ -537,6 +548,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> >         pte_t *_pte;
> >         int none_or_zero = 0, result = 0, referenced = 0;
> >         bool writable = false;
> > +       LIST_HEAD(compound_pagelist);
> >
> >         for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
> >              _pte++, address += PAGE_SIZE) {
> > @@ -561,13 +573,23 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> >                         goto out;
> >                 }
> >
> > -               /* TODO: teach khugepaged to collapse THP mapped with pte */
> > +               VM_BUG_ON_PAGE(!PageAnon(page), page);
> > +
> >                 if (PageCompound(page)) {
> > -                       result = SCAN_PAGE_COMPOUND;
> > -                       goto out;
> > -               }
> > +                       struct page *p;
> > +                       page = compound_head(page);
> >
> > -               VM_BUG_ON_PAGE(!PageAnon(page), page);
> > +                       /*
> > +                        * Check if we have dealt with the compount page
> > +                        * already
> > +                        */
> > +                       list_for_each_entry(p, &compound_pagelist, lru) {
> > +                               if (page ==  p)
> > +                                       break;
> > +                       }
> > +                       if (page ==  p)
> > +                               continue;
> > +               }
> >
> >                 /*
> >                  * We can do it before isolate_lru_page because the
> > @@ -640,6 +662,9 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> >                     page_is_young(page) || PageReferenced(page) ||
> >                     mmu_notifier_test_young(vma->vm_mm, address))
> >                         referenced++;
> > +
> > +               if (PageCompound(page))
> > +                       list_add_tail(&page->lru, &compound_pagelist);
> >         }
> >         if (likely(writable)) {
> >                 if (likely(referenced)) {
> > @@ -1185,11 +1210,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
> >                         goto out_unmap;
> >                 }
> >
> > -               /* TODO: teach khugepaged to collapse THP mapped with pte */
> > -               if (PageCompound(page)) {
> > -                       result = SCAN_PAGE_COMPOUND;
> > -                       goto out_unmap;
> > -               }
> > +               page = compound_head(page);
> >
> >                 /*
> >                  * Record which node the original page is from and save this
> > --
> > 2.26.0
> >
> >
Yang Shi March 28, 2020, 1:09 a.m. UTC | #7
On Fri, Mar 27, 2020 at 5:34 PM Kirill A. Shutemov <kirill@shutemov.name> wrote:
>
> On Fri, Mar 27, 2020 at 11:53:57AM -0700, Yang Shi wrote:
> > On Fri, Mar 27, 2020 at 10:06 AM Kirill A. Shutemov
> > <kirill@shutemov.name> wrote:
> > >
> > > We can collapse PTE-mapped compound pages. We only need to avoid
> > > handling them more than once: lock/unlock page only once if it's present
> > > in the PMD range multiple times as it handled on compound level. The
> > > same goes for LRU isolation and putpack.
> > >
> > > Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> > > ---
> > >  mm/khugepaged.c | 41 +++++++++++++++++++++++++++++++----------
> > >  1 file changed, 31 insertions(+), 10 deletions(-)
> > >
> > > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > > index b47edfe57f7b..c8c2c463095c 100644
> > > --- a/mm/khugepaged.c
> > > +++ b/mm/khugepaged.c
> > > @@ -515,6 +515,17 @@ void __khugepaged_exit(struct mm_struct *mm)
> > >
> > >  static void release_pte_page(struct page *page)
> > >  {
> > > +       /*
> > > +        * We need to unlock and put compound page on LRU only once.
> > > +        * The rest of the pages have to be locked and not on LRU here.
> > > +        */
> > > +       VM_BUG_ON_PAGE(!PageCompound(page) &&
> > > +                       (!PageLocked(page) && PageLRU(page)), page);
> > > +
> > > +       if (!PageLocked(page))
> > > +               return;
> > > +
> > > +       page = compound_head(page);
> > >         dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page));
> >
> > We need count in the number of base pages. The same counter is
> > modified by vmscan in base page unit.
>
> Is it though? Where?

__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken) in
vmscan.c, here nr_taken is nr_compound(page), so if it is THP the
number would be 512.

So in both inc and dec path of collapse PTE mapped THP, we should mod
nr_compound(page) too.

>
> > Also need modify the inc path.
>
> Done already.
>
> > >         unlock_page(page);
> > >         putback_lru_page(page);
> > > @@ -537,6 +548,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> > >         pte_t *_pte;
> > >         int none_or_zero = 0, result = 0, referenced = 0;
> > >         bool writable = false;
> > > +       LIST_HEAD(compound_pagelist);
> > >
> > >         for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
> > >              _pte++, address += PAGE_SIZE) {
> > > @@ -561,13 +573,23 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> > >                         goto out;
> > >                 }
> > >
> > > -               /* TODO: teach khugepaged to collapse THP mapped with pte */
> > > +               VM_BUG_ON_PAGE(!PageAnon(page), page);
> > > +
> > >                 if (PageCompound(page)) {
> > > -                       result = SCAN_PAGE_COMPOUND;
> > > -                       goto out;
> > > -               }
> > > +                       struct page *p;
> > > +                       page = compound_head(page);
> > >
> > > -               VM_BUG_ON_PAGE(!PageAnon(page), page);
> > > +                       /*
> > > +                        * Check if we have dealt with the compount page
> >
> > s/compount/compound
>
> Thanks.
>
> > > +                        * already
> > > +                        */
> > > +                       list_for_each_entry(p, &compound_pagelist, lru) {
> > > +                               if (page ==  p)
> > > +                                       break;
> > > +                       }
> > > +                       if (page ==  p)
> > > +                               continue;
> >
> > I don't quite understand why we need the above check. My understanding
> > is when we scan the ptes, once the first PTE mapped subpage is found,
> > then the THP would be added into compound_pagelist, then the later
> > loop would find the same THP on the list then just break the loop. Did
> > I miss anything?
>
> We skip the iteration and look at the next pte. We've already isolated the
> page. Nothing to do here.

I got your point. Thanks.

>
> > > +               }
> > >
> > >                 /*
> > >                  * We can do it before isolate_lru_page because the
> > > @@ -640,6 +662,9 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> > >                     page_is_young(page) || PageReferenced(page) ||
> > >                     mmu_notifier_test_young(vma->vm_mm, address))
> > >                         referenced++;
> > > +
> > > +               if (PageCompound(page))
> > > +                       list_add_tail(&page->lru, &compound_pagelist);
> > >         }
> > >         if (likely(writable)) {
> > >                 if (likely(referenced)) {
> > > @@ -1185,11 +1210,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
> > >                         goto out_unmap;
> > >                 }
> > >
> > > -               /* TODO: teach khugepaged to collapse THP mapped with pte */
> > > -               if (PageCompound(page)) {
> > > -                       result = SCAN_PAGE_COMPOUND;
> > > -                       goto out_unmap;
> > > -               }
> > > +               page = compound_head(page);
> > >
> > >                 /*
> > >                  * Record which node the original page is from and save this
> > > --
> > > 2.26.0
> > >
> > >
>
> --
>  Kirill A. Shutemov
Yang Shi March 28, 2020, 1:12 a.m. UTC | #8
On Fri, Mar 27, 2020 at 5:40 PM Kirill A. Shutemov <kirill@shutemov.name> wrote:
>
> On Fri, Mar 27, 2020 at 01:45:55PM -0700, Yang Shi wrote:
> > On Fri, Mar 27, 2020 at 10:06 AM Kirill A. Shutemov
> > <kirill@shutemov.name> wrote:
> > >
> > > We can collapse PTE-mapped compound pages. We only need to avoid
> > > handling them more than once: lock/unlock page only once if it's present
> > > in the PMD range multiple times as it handled on compound level. The
> > > same goes for LRU isolation and putpack.
> > >
> > > Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> > > ---
> > >  mm/khugepaged.c | 41 +++++++++++++++++++++++++++++++----------
> > >  1 file changed, 31 insertions(+), 10 deletions(-)
> > >
> > > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > > index b47edfe57f7b..c8c2c463095c 100644
> > > --- a/mm/khugepaged.c
> > > +++ b/mm/khugepaged.c
> > > @@ -515,6 +515,17 @@ void __khugepaged_exit(struct mm_struct *mm)
> > >
> > >  static void release_pte_page(struct page *page)
> > >  {
> > > +       /*
> > > +        * We need to unlock and put compound page on LRU only once.
> > > +        * The rest of the pages have to be locked and not on LRU here.
> > > +        */
> > > +       VM_BUG_ON_PAGE(!PageCompound(page) &&
> > > +                       (!PageLocked(page) && PageLRU(page)), page);
> > > +
> > > +       if (!PageLocked(page))
> > > +               return;
> > > +
> > > +       page = compound_head(page);
> > >         dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page));
> > >         unlock_page(page);
> > >         putback_lru_page(page);
> >
> > BTW, wouldn't this unlock the whole THP and put it back to LRU?
>
> It is the intention.

Yes, understood. Considering the below case:

Subpages 0, 1, 2, 3 are PTE mapped. Once subpage 0 is copied
release_pte_page() would be called then the whole THP would be
unlocked and putback to lru, then the loop would iterate to subpage 1,
2 and 3, but the page is not locked and on lru already. Is it
intentional?

>
> > Then we may copy the following PTE mapped pages with page unlocked and
> > on LRU. I don't see critical problem, just the pages might be on and off
> > LRU by others, i.e. vmscan, compaction, migration, etc. But no one could
> > take the page away since try_to_unmap() would fail, but not very
> > productive.
> >
> >
> > > @@ -537,6 +548,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> > >         pte_t *_pte;
> > >         int none_or_zero = 0, result = 0, referenced = 0;
> > >         bool writable = false;
> > > +       LIST_HEAD(compound_pagelist);
> > >
> > >         for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
> > >              _pte++, address += PAGE_SIZE) {
> > > @@ -561,13 +573,23 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> > >                         goto out;
> > >                 }
> > >
> > > -               /* TODO: teach khugepaged to collapse THP mapped with pte */
> > > +               VM_BUG_ON_PAGE(!PageAnon(page), page);
> > > +
> > >                 if (PageCompound(page)) {
> > > -                       result = SCAN_PAGE_COMPOUND;
> > > -                       goto out;
> > > -               }
> > > +                       struct page *p;
> > > +                       page = compound_head(page);
> > >
> > > -               VM_BUG_ON_PAGE(!PageAnon(page), page);
> > > +                       /*
> > > +                        * Check if we have dealt with the compount page
> > > +                        * already
> > > +                        */
> > > +                       list_for_each_entry(p, &compound_pagelist, lru) {
> > > +                               if (page ==  p)
> > > +                                       break;
> > > +                       }
> > > +                       if (page ==  p)
> > > +                               continue;
> > > +               }
> > >
> > >                 /*
> > >                  * We can do it before isolate_lru_page because the
> > > @@ -640,6 +662,9 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> > >                     page_is_young(page) || PageReferenced(page) ||
> > >                     mmu_notifier_test_young(vma->vm_mm, address))
> > >                         referenced++;
> > > +
> > > +               if (PageCompound(page))
> > > +                       list_add_tail(&page->lru, &compound_pagelist);
> > >         }
> > >         if (likely(writable)) {
> > >                 if (likely(referenced)) {
> > > @@ -1185,11 +1210,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
> > >                         goto out_unmap;
> > >                 }
> > >
> > > -               /* TODO: teach khugepaged to collapse THP mapped with pte */
> > > -               if (PageCompound(page)) {
> > > -                       result = SCAN_PAGE_COMPOUND;
> > > -                       goto out_unmap;
> > > -               }
> > > +               page = compound_head(page);
> > >
> > >                 /*
> > >                  * Record which node the original page is from and save this
> > > --
> > > 2.26.0
> > >
> > >
>
> --
>  Kirill A. Shutemov
Zi Yan March 28, 2020, 1:17 a.m. UTC | #9
On 27 Mar 2020, at 20:39, Kirill A. Shutemov wrote:

> External email: Use caution opening links or attachments
>
>
> On Fri, Mar 27, 2020 at 02:55:06PM -0400, Zi Yan wrote:
>> On 27 Mar 2020, at 13:05, Kirill A. Shutemov wrote:
>>
>>> We can collapse PTE-mapped compound pages. We only need to avoid
>>> handling them more than once: lock/unlock page only once if it's present
>>> in the PMD range multiple times as it handled on compound level. The
>>> same goes for LRU isolation and putpack.
>>
>> s/putpack/putback/
>>
>>>
>>> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
>>> ---
>>>  mm/khugepaged.c | 41 +++++++++++++++++++++++++++++++----------
>>>  1 file changed, 31 insertions(+), 10 deletions(-)
>>>
>>> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
>>> index b47edfe57f7b..c8c2c463095c 100644
>>> --- a/mm/khugepaged.c
>>> +++ b/mm/khugepaged.c
>>> @@ -515,6 +515,17 @@ void __khugepaged_exit(struct mm_struct *mm)
>>>
>>>  static void release_pte_page(struct page *page)
>>>  {
>>> +   /*
>>> +    * We need to unlock and put compound page on LRU only once.
>>> +    * The rest of the pages have to be locked and not on LRU here.
>>> +    */
>>> +   VM_BUG_ON_PAGE(!PageCompound(page) &&
>>> +                   (!PageLocked(page) && PageLRU(page)), page);
>> It only checks base pages.
>
> That's the point.
>
>> Add
>>
>> VM_BUG_ON_PAGE(PageCompound(page) && PageLocked(page) && PageLRU(page), page);
>>
>> to check for compound pages.
>
> The compound page may be locked here if the function called for the first
> time for the page and not locked after that (becouse we've unlocked it we
> saw it the first time). The same with LRU.
>

For the first time, the compound page is locked and not on LRU, so this VM_BUG_ON passes.
For the second time and so on, the compound page is unlocked and on the LRU,
so this VM_BUG_ON still passes.

For base page, VM_BUG_ON passes.

Any other unexpected situation (a compound page is locked and on LRU) triggers the VM_BUG_ON,
but your VM_BUG_ON will not detect this situation, right?



>>> +
>>> +   if (!PageLocked(page))
>>> +           return;
>>> +
>>> +   page = compound_head(page);
>>>     dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page));
>>>     unlock_page(page);
>>>     putback_lru_page(page);
>>> @@ -537,6 +548,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>>>     pte_t *_pte;
>>>     int none_or_zero = 0, result = 0, referenced = 0;
>>>     bool writable = false;
>>> +   LIST_HEAD(compound_pagelist);
>>>
>>>     for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
>>>          _pte++, address += PAGE_SIZE) {
>>> @@ -561,13 +573,23 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>>>                     goto out;
>>>             }
>>>
>>> -           /* TODO: teach khugepaged to collapse THP mapped with pte */
>>> +           VM_BUG_ON_PAGE(!PageAnon(page), page);
>>> +
>>>             if (PageCompound(page)) {
>>> -                   result = SCAN_PAGE_COMPOUND;
>>> -                   goto out;
>>> -           }
>>> +                   struct page *p;
>>> +                   page = compound_head(page);
>>>
>>> -           VM_BUG_ON_PAGE(!PageAnon(page), page);
>>> +                   /*
>>> +                    * Check if we have dealt with the compount page
>>
>> s/compount/compound/
>>
>
> Thanks.
>
>>> +                    * already
>>> +                    */
>>> +                   list_for_each_entry(p, &compound_pagelist, lru) {
>>> +                           if (page ==  p)
>>> +                                   break;
>>> +                   }
>>> +                   if (page ==  p)
>>> +                           continue;
>>> +           }
>>>
>>>             /*
>>>              * We can do it before isolate_lru_page because the
>>> @@ -640,6 +662,9 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>>>                 page_is_young(page) || PageReferenced(page) ||
>>>                 mmu_notifier_test_young(vma->vm_mm, address))
>>>                     referenced++;
>>> +
>>> +           if (PageCompound(page))
>>> +                   list_add_tail(&page->lru, &compound_pagelist);
>>>     }
>>>     if (likely(writable)) {
>>>             if (likely(referenced)) {
>>
>> Do we need a list here? There should be at most one compound page we will see here, right?
>
> Um? It's outside the pte loop. We get here once per PMD range.
>
> 'page' argument to trace_mm_collapse_huge_page_isolate() is misleading:
> it's just the last page handled in the loop.
>

Throughout the pte loop, we should only see at most one compound page, right?

If that is the case, we do not need a list to store that single compound page
but can use a struct page pointer that is initialized to NULL and later assigned
to the discovered compound page, right? I am not saying I want to change the code
in this way, but just try to make sure I understand the code correctly.

Thanks.

—
Best Regards,
Yan Zi
Kirill A. Shutemov March 28, 2020, 12:27 p.m. UTC | #10
On Fri, Mar 27, 2020 at 06:09:38PM -0700, Yang Shi wrote:
> On Fri, Mar 27, 2020 at 5:34 PM Kirill A. Shutemov <kirill@shutemov.name> wrote:
> >
> > On Fri, Mar 27, 2020 at 11:53:57AM -0700, Yang Shi wrote:
> > > On Fri, Mar 27, 2020 at 10:06 AM Kirill A. Shutemov
> > > <kirill@shutemov.name> wrote:
> > > >
> > > > We can collapse PTE-mapped compound pages. We only need to avoid
> > > > handling them more than once: lock/unlock page only once if it's present
> > > > in the PMD range multiple times as it handled on compound level. The
> > > > same goes for LRU isolation and putpack.
> > > >
> > > > Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> > > > ---
> > > >  mm/khugepaged.c | 41 +++++++++++++++++++++++++++++++----------
> > > >  1 file changed, 31 insertions(+), 10 deletions(-)
> > > >
> > > > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > > > index b47edfe57f7b..c8c2c463095c 100644
> > > > --- a/mm/khugepaged.c
> > > > +++ b/mm/khugepaged.c
> > > > @@ -515,6 +515,17 @@ void __khugepaged_exit(struct mm_struct *mm)
> > > >
> > > >  static void release_pte_page(struct page *page)
> > > >  {
> > > > +       /*
> > > > +        * We need to unlock and put compound page on LRU only once.
> > > > +        * The rest of the pages have to be locked and not on LRU here.
> > > > +        */
> > > > +       VM_BUG_ON_PAGE(!PageCompound(page) &&
> > > > +                       (!PageLocked(page) && PageLRU(page)), page);
> > > > +
> > > > +       if (!PageLocked(page))
> > > > +               return;
> > > > +
> > > > +       page = compound_head(page);
> > > >         dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page));
> > >
> > > We need count in the number of base pages. The same counter is
> > > modified by vmscan in base page unit.
> >
> > Is it though? Where?
> 
> __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken) in
> vmscan.c, here nr_taken is nr_compound(page), so if it is THP the
> number would be 512.

Could you point a particular codepath?

> So in both inc and dec path of collapse PTE mapped THP, we should mod
> nr_compound(page) too.

I disagree. Compound page is represented by single entry on LRU, so it has
to be counted once.
Kirill A. Shutemov March 28, 2020, 12:33 p.m. UTC | #11
On Fri, Mar 27, 2020 at 09:17:00PM -0400, Zi Yan wrote:
> > The compound page may be locked here if the function called for the first
> > time for the page and not locked after that (becouse we've unlocked it we
> > saw it the first time). The same with LRU.
> >
> 
> For the first time, the compound page is locked and not on LRU, so this VM_BUG_ON passes.
> For the second time and so on, the compound page is unlocked and on the LRU,
> so this VM_BUG_ON still passes.
> 
> For base page, VM_BUG_ON passes.
> 
> Other unexpected situation (a compound page is locked and on LRU) triggers the VM_BU_ON,
> but your VM_BUG_ON will not detect this situation, right?

Right. I will rework this code. I've just realized it is racy: after
unlock and putback on LRU the page can be locked by somebody else and this
code can unlock it, which is completely broken.

I'll pass down compound_pagelist to release_pte_pages() and handle the
situation there.

> >>>     if (likely(writable)) {
> >>>             if (likely(referenced)) {
> >>
> >> Do we need a list here? There should be at most one compound page we will see here, right?
> >
> > Um? It's outside the pte loop. We get here once per PMD range.
> >
> > 'page' argument to trace_mm_collapse_huge_page_isolate() is misleading:
> > it's just the last page handled in the loop.
> >
> 
> Throughout the pte loop, we should only see at most one compound page, right?

No. mremap(2) opens a possibility for HPAGE_PMD_NR compound pages for
single PMD range.
Yang Shi March 30, 2020, 6:38 p.m. UTC | #12
On Sat, Mar 28, 2020 at 5:27 AM Kirill A. Shutemov <kirill@shutemov.name> wrote:
>
> On Fri, Mar 27, 2020 at 06:09:38PM -0700, Yang Shi wrote:
> > On Fri, Mar 27, 2020 at 5:34 PM Kirill A. Shutemov <kirill@shutemov.name> wrote:
> > >
> > > On Fri, Mar 27, 2020 at 11:53:57AM -0700, Yang Shi wrote:
> > > > On Fri, Mar 27, 2020 at 10:06 AM Kirill A. Shutemov
> > > > <kirill@shutemov.name> wrote:
> > > > >
> > > > > We can collapse PTE-mapped compound pages. We only need to avoid
> > > > > handling them more than once: lock/unlock page only once if it's present
> > > > > in the PMD range multiple times as it handled on compound level. The
> > > > > same goes for LRU isolation and putpack.
> > > > >
> > > > > Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> > > > > ---
> > > > >  mm/khugepaged.c | 41 +++++++++++++++++++++++++++++++----------
> > > > >  1 file changed, 31 insertions(+), 10 deletions(-)
> > > > >
> > > > > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > > > > index b47edfe57f7b..c8c2c463095c 100644
> > > > > --- a/mm/khugepaged.c
> > > > > +++ b/mm/khugepaged.c
> > > > > @@ -515,6 +515,17 @@ void __khugepaged_exit(struct mm_struct *mm)
> > > > >
> > > > >  static void release_pte_page(struct page *page)
> > > > >  {
> > > > > +       /*
> > > > > +        * We need to unlock and put compound page on LRU only once.
> > > > > +        * The rest of the pages have to be locked and not on LRU here.
> > > > > +        */
> > > > > +       VM_BUG_ON_PAGE(!PageCompound(page) &&
> > > > > +                       (!PageLocked(page) && PageLRU(page)), page);
> > > > > +
> > > > > +       if (!PageLocked(page))
> > > > > +               return;
> > > > > +
> > > > > +       page = compound_head(page);
> > > > >         dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page));
> > > >
> > > > We need count in the number of base pages. The same counter is
> > > > modified by vmscan in base page unit.
> > >
> > > Is it though? Where?
> >
> > __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken) in
> > vmscan.c, here nr_taken is nr_compound(page), so if it is THP the
> > number would be 512.
>
> Could you point a particular codepath?

shrink_inactive_list ->
        nr_taken = isolate_lru_pages()
        __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);

Then in isolate_lru_pages() :
        nr_pages = compound_nr(page);
        ...
        switch (__isolate_lru_page(page, mode)) {
                case 0:
                        nr_taken += nr_pages;

>
> > So in both inc and dec path of collapse PTE mapped THP, we should mod
> > nr_compound(page) too.
>
> I disagree. Compound page is represented by single entry on LRU, so it has
> to be counted once.

It was not a problem without THP swap. But with THP swap we saw
pgsteal_{kswapd|direct} may be greater than pgscan_{kswapd|direct} if
we count THP as one page.

Please refer to the below commit:

commit 98879b3b9edc1604f2d1a6686576ef4d08ed3310
Author: Yang Shi <yang.shi@linux.alibaba.com>
Date:   Thu Jul 11 20:59:30 2019 -0700

    mm: vmscan: correct some vmscan counters for THP swapout

    Commit bd4c82c22c36 ("mm, THP, swap: delay splitting THP after swapped
    out"), THP can be swapped out in a whole.  But, nr_reclaimed and some
    other vm counters still get inc'ed by one even though a whole THP (512
    pages) gets swapped out.

    This doesn't make too much sense to memory reclaim.

    For example, direct reclaim may just need reclaim SWAP_CLUSTER_MAX
    pages, reclaiming one THP could fulfill it.  But, if nr_reclaimed is not
    increased correctly, direct reclaim may just waste time to reclaim more
    pages, SWAP_CLUSTER_MAX * 512 pages in worst case.

    And, it may cause pgsteal_{kswapd|direct} is greater than
    pgscan_{kswapd|direct}, like the below:

    pgsteal_kswapd 122933
    pgsteal_direct 26600225
    pgscan_kswapd 174153
    pgscan_direct 14678312

    nr_reclaimed and nr_scanned must be fixed in parallel otherwise it would
    break some page reclaim logic, e.g.

    vmpressure: this looks at the scanned/reclaimed ratio so it won't change
    semantics as long as scanned & reclaimed are fixed in parallel.

    compaction/reclaim: compaction wants a certain number of physical pages
    freed up before going back to compacting.

    kswapd priority raising: kswapd raises priority if we scan fewer pages
    than the reclaim target (which itself is obviously expressed in order-0
    pages).  As a result, kswapd can falsely raise its aggressiveness even
    when it's making great progress.

    Other than nr_scanned and nr_reclaimed, some other counters, e.g.
    pgactivate, nr_skipped, nr_ref_keep and nr_unmap_fail need to be fixed too
    since they are user visible via cgroup, /proc/vmstat or trace points,
    otherwise they would be underreported.

    When isolating pages from LRUs, nr_taken has been accounted in base page,
    but nr_scanned and nr_skipped are still accounted in THP.  It doesn't make
    too much sense too since this may cause trace point underreport the
    numbers as well.

    So accounting those counters in base page instead of accounting THP as one
    page.

    nr_dirty, nr_unqueued_dirty, nr_congested and nr_writeback are used by
    file cache, so they are not impacted by THP swap.

    This change may result in lower steal/scan ratio in some cases since THP
    may get split during page reclaim, then a part of tail pages get reclaimed
    instead of the whole 512 pages, but nr_scanned is accounted by 512,
    particularly for direct reclaim.  But, this should be not a significant
    issue.


So, since we count THP in base page units in the vmscan path, the same
counter should be updated in base page units in other paths as well
IMHO.


>
> --
>  Kirill A. Shutemov
Yang Shi March 30, 2020, 6:41 p.m. UTC | #13
On Sat, Mar 28, 2020 at 5:33 AM Kirill A. Shutemov <kirill@shutemov.name> wrote:
>
> On Fri, Mar 27, 2020 at 09:17:00PM -0400, Zi Yan wrote:
> > > The compound page may be locked here if the function called for the first
> > > time for the page and not locked after that (becouse we've unlocked it we
> > > saw it the first time). The same with LRU.
> > >
> >
> > For the first time, the compound page is locked and not on LRU, so this VM_BUG_ON passes.
> > For the second time and so on, the compound page is unlocked and on the LRU,
> > so this VM_BUG_ON still passes.
> >
> > For base page, VM_BUG_ON passes.
> >
> > Other unexpected situation (a compound page is locked and on LRU) triggers the VM_BU_ON,
> > but your VM_BUG_ON will not detect this situation, right?
>
> Right. I will rework this code. I've just realized it is racy: after
> unlock and putback on LRU the page can be locked by somebody else and this
> code can unlock it which completely borken.

I'm wondering if we shall skip putting the page back to LRU. Since the
page is about to be freed soon, so as I mentioned in the other patch
it sounds not very productive to put it back to LRU again.

>
> I'll pass down compound_pagelist to release_pte_pages() and handle the
> situation there.
>
> > >>>     if (likely(writable)) {
> > >>>             if (likely(referenced)) {
> > >>
> > >> Do we need a list here? There should be at most one compound page we will see here, right?
> > >
> > > Um? It's outside the pte loop. We get here once per PMD range.
> > >
> > > 'page' argument to trace_mm_collapse_huge_page_isolate() is misleading:
> > > it's just the last page handled in the loop.
> > >
> >
> > Throughout the pte loop, we should only see at most one compound page, right?
>
> No. mremap(2) opens a possibility for HPAGE_PMD_NR compound pages for
> single PMD range.
>
>
> --
>  Kirill A. Shutemov
>
Yang Shi March 30, 2020, 6:50 p.m. UTC | #14
On Sat, Mar 28, 2020 at 5:33 AM Kirill A. Shutemov <kirill@shutemov.name> wrote:
>
> On Fri, Mar 27, 2020 at 09:17:00PM -0400, Zi Yan wrote:
> > > The compound page may be locked here if the function called for the first
> > > time for the page and not locked after that (becouse we've unlocked it we
> > > saw it the first time). The same with LRU.
> > >
> >
> > For the first time, the compound page is locked and not on LRU, so this VM_BUG_ON passes.
> > For the second time and so on, the compound page is unlocked and on the LRU,
> > so this VM_BUG_ON still passes.
> >
> > For base page, VM_BUG_ON passes.
> >
> > Other unexpected situation (a compound page is locked and on LRU) triggers the VM_BU_ON,
> > but your VM_BUG_ON will not detect this situation, right?
>
> Right. I will rework this code. I've just realized it is racy: after
> unlock and putback on LRU the page can be locked by somebody else and this
> code can unlock it which completely borken.
>
> I'll pass down compound_pagelist to release_pte_pages() and handle the
> situation there.
>
> > >>>     if (likely(writable)) {
> > >>>             if (likely(referenced)) {
> > >>
> > >> Do we need a list here? There should be at most one compound page we will see here, right?
> > >
> > > Um? It's outside the pte loop. We get here once per PMD range.
> > >
> > > 'page' argument to trace_mm_collapse_huge_page_isolate() is misleading:
> > > it's just the last page handled in the loop.
> > >
> >
> > Throughout the pte loop, we should only see at most one compound page, right?
>
> No. mremap(2) opens a possibility for HPAGE_PMD_NR compound pages for
> single PMD range.

Do you mean every PTE in the PMD is mapped by a sub page from different THPs?

>
>
> --
>  Kirill A. Shutemov
>
Kirill A. Shutemov March 31, 2020, 2:08 p.m. UTC | #15
On Mon, Mar 30, 2020 at 11:50:41AM -0700, Yang Shi wrote:
> On Sat, Mar 28, 2020 at 5:33 AM Kirill A. Shutemov <kirill@shutemov.name> wrote:
> >
> > On Fri, Mar 27, 2020 at 09:17:00PM -0400, Zi Yan wrote:
> > > > The compound page may be locked here if the function called for the first
> > > > time for the page and not locked after that (becouse we've unlocked it we
> > > > saw it the first time). The same with LRU.
> > > >
> > >
> > > For the first time, the compound page is locked and not on LRU, so this VM_BUG_ON passes.
> > > For the second time and so on, the compound page is unlocked and on the LRU,
> > > so this VM_BUG_ON still passes.
> > >
> > > For base page, VM_BUG_ON passes.
> > >
> > > Other unexpected situation (a compound page is locked and on LRU) triggers the VM_BU_ON,
> > > but your VM_BUG_ON will not detect this situation, right?
> >
> > Right. I will rework this code. I've just realized it is racy: after
> > unlock and putback on LRU the page can be locked by somebody else and this
> > code can unlock it which completely borken.
> >
> > I'll pass down compound_pagelist to release_pte_pages() and handle the
> > situation there.
> >
> > > >>>     if (likely(writable)) {
> > > >>>             if (likely(referenced)) {
> > > >>
> > > >> Do we need a list here? There should be at most one compound page we will see here, right?
> > > >
> > > > Um? It's outside the pte loop. We get here once per PMD range.
> > > >
> > > > 'page' argument to trace_mm_collapse_huge_page_isolate() is misleading:
> > > > it's just the last page handled in the loop.
> > > >
> > >
> > > Throughout the pte loop, we should only see at most one compound page, right?
> >
> > No. mremap(2) opens a possibility for HPAGE_PMD_NR compound pages for
> > single PMD range.
> 
> Do you mean every PTE in the PMD is mapped by a sub page from different THPs?

Yes.

Well, it was harder to achieve than I expected, but below is a test case
I've come up with. It maps 512 head pages within a single PMD range.

diff --git a/tools/testing/selftests/vm/khugepaged.c b/tools/testing/selftests/vm/khugepaged.c
index 3a98d5b2d6d8..9ae119234a39 100644
--- a/tools/testing/selftests/vm/khugepaged.c
+++ b/tools/testing/selftests/vm/khugepaged.c
@@ -703,6 +703,63 @@ static void collapse_full_of_compound(void)
 	munmap(p, hpage_pmd_size);
 }
 
+static void collapse_compound_extreme(void)
+{
+	void *p;
+	int i;
+
+	p = alloc_mapping();
+	for (i = 0; i < hpage_pmd_nr; i++) {
+		printf("\rConstruct PTE page table full of different PTE-mapped compound pages %3d/%d...",
+				i + 1, hpage_pmd_nr);
+
+		madvise(BASE_ADDR, hpage_pmd_size, MADV_HUGEPAGE);
+		fill_memory(BASE_ADDR, 0, hpage_pmd_size);
+		if (!check_huge(BASE_ADDR)) {
+			printf("Failed to allocate huge page\n");
+			exit(EXIT_FAILURE);
+		}
+		madvise(BASE_ADDR, hpage_pmd_size, MADV_NOHUGEPAGE);
+
+		p = mremap(BASE_ADDR - i * page_size,
+				i * page_size + hpage_pmd_size,
+				(i + 1) * page_size,
+				MREMAP_MAYMOVE | MREMAP_FIXED,
+				BASE_ADDR + 2 * hpage_pmd_size);
+		if (p == MAP_FAILED) {
+			perror("mremap+unmap");
+			exit(EXIT_FAILURE);
+		}
+
+		p = mremap(BASE_ADDR + 2 * hpage_pmd_size,
+				(i + 1) * page_size,
+				(i + 1) * page_size + hpage_pmd_size,
+				MREMAP_MAYMOVE | MREMAP_FIXED,
+				BASE_ADDR - (i + 1) * page_size);
+		if (p == MAP_FAILED) {
+			perror("mremap+alloc");
+			exit(EXIT_FAILURE);
+		}
+	}
+
+	munmap(BASE_ADDR, hpage_pmd_size);
+	fill_memory(p, 0, hpage_pmd_size);
+	if (!check_huge(p))
+		success("OK");
+	else
+		fail("Fail");
+
+	if (wait_for_scan("Collapse PTE table full of different compound pages", p))
+		fail("Timeout");
+	else if (check_huge(p))
+		success("OK");
+	else
+		fail("Fail");
+
+	validate_memory(p, 0, hpage_pmd_size);
+	munmap(p, hpage_pmd_size);
+}
+
 static void collapse_fork(void)
 {
 	int wstatus;
@@ -916,6 +973,7 @@ int main(void)
 	collapse_max_ptes_swap();
 	collapse_signle_pte_entry_compound();
 	collapse_full_of_compound();
+	collapse_compound_extreme();
 	collapse_fork();
 	collapse_fork_compound();
 	collapse_max_ptes_shared();
Yang Shi April 1, 2020, 7:45 p.m. UTC | #16
On Tue, Mar 31, 2020 at 7:08 AM Kirill A. Shutemov <kirill@shutemov.name> wrote:
>
> On Mon, Mar 30, 2020 at 11:50:41AM -0700, Yang Shi wrote:
> > On Sat, Mar 28, 2020 at 5:33 AM Kirill A. Shutemov <kirill@shutemov.name> wrote:
> > >
> > > On Fri, Mar 27, 2020 at 09:17:00PM -0400, Zi Yan wrote:
> > > > > The compound page may be locked here if the function called for the first
> > > > > time for the page and not locked after that (becouse we've unlocked it we
> > > > > saw it the first time). The same with LRU.
> > > > >
> > > >
> > > > For the first time, the compound page is locked and not on LRU, so this VM_BUG_ON passes.
> > > > For the second time and so on, the compound page is unlocked and on the LRU,
> > > > so this VM_BUG_ON still passes.
> > > >
> > > > For base page, VM_BUG_ON passes.
> > > >
> > > > Other unexpected situation (a compound page is locked and on LRU) triggers the VM_BU_ON,
> > > > but your VM_BUG_ON will not detect this situation, right?
> > >
> > > Right. I will rework this code. I've just realized it is racy: after
> > > unlock and putback on LRU the page can be locked by somebody else and this
> > > code can unlock it which completely borken.
> > >
> > > I'll pass down compound_pagelist to release_pte_pages() and handle the
> > > situation there.
> > >
> > > > >>>     if (likely(writable)) {
> > > > >>>             if (likely(referenced)) {
> > > > >>
> > > > >> Do we need a list here? There should be at most one compound page we will see here, right?
> > > > >
> > > > > Um? It's outside the pte loop. We get here once per PMD range.
> > > > >
> > > > > 'page' argument to trace_mm_collapse_huge_page_isolate() is misleading:
> > > > > it's just the last page handled in the loop.
> > > > >
> > > >
> > > > Throughout the pte loop, we should only see at most one compound page, right?
> > >
> > > No. mremap(2) opens a possibility for HPAGE_PMD_NR compound pages for
> > > single PMD range.
> >
> > Do you mean every PTE in the PMD is mapped by a sub page from different THPs?
>
> Yes.
>
> Well, it was harder to archive than I expected, but below is a test case,
> I've come up with. It maps 512 head pages within single PMD range.

Thanks, this is very helpful.

>
> diff --git a/tools/testing/selftests/vm/khugepaged.c b/tools/testing/selftests/vm/khugepaged.c
> index 3a98d5b2d6d8..9ae119234a39 100644
> --- a/tools/testing/selftests/vm/khugepaged.c
> +++ b/tools/testing/selftests/vm/khugepaged.c
> @@ -703,6 +703,63 @@ static void collapse_full_of_compound(void)
>         munmap(p, hpage_pmd_size);
>  }
>
> +static void collapse_compound_extreme(void)
> +{
> +       void *p;
> +       int i;
> +
> +       p = alloc_mapping();
> +       for (i = 0; i < hpage_pmd_nr; i++) {
> +               printf("\rConstruct PTE page table full of different PTE-mapped compound pages %3d/%d...",
> +                               i + 1, hpage_pmd_nr);
> +
> +               madvise(BASE_ADDR, hpage_pmd_size, MADV_HUGEPAGE);
> +               fill_memory(BASE_ADDR, 0, hpage_pmd_size);
> +               if (!check_huge(BASE_ADDR)) {
> +                       printf("Failed to allocate huge page\n");
> +                       exit(EXIT_FAILURE);
> +               }
> +               madvise(BASE_ADDR, hpage_pmd_size, MADV_NOHUGEPAGE);
> +
> +               p = mremap(BASE_ADDR - i * page_size,
> +                               i * page_size + hpage_pmd_size,
> +                               (i + 1) * page_size,
> +                               MREMAP_MAYMOVE | MREMAP_FIXED,
> +                               BASE_ADDR + 2 * hpage_pmd_size);
> +               if (p == MAP_FAILED) {
> +                       perror("mremap+unmap");
> +                       exit(EXIT_FAILURE);
> +               }
> +
> +               p = mremap(BASE_ADDR + 2 * hpage_pmd_size,
> +                               (i + 1) * page_size,
> +                               (i + 1) * page_size + hpage_pmd_size,
> +                               MREMAP_MAYMOVE | MREMAP_FIXED,
> +                               BASE_ADDR - (i + 1) * page_size);
> +               if (p == MAP_FAILED) {
> +                       perror("mremap+alloc");
> +                       exit(EXIT_FAILURE);
> +               }
> +       }
> +
> +       munmap(BASE_ADDR, hpage_pmd_size);
> +       fill_memory(p, 0, hpage_pmd_size);
> +       if (!check_huge(p))
> +               success("OK");
> +       else
> +               fail("Fail");
> +
> +       if (wait_for_scan("Collapse PTE table full of different compound pages", p))
> +               fail("Timeout");
> +       else if (check_huge(p))
> +               success("OK");
> +       else
> +               fail("Fail");
> +
> +       validate_memory(p, 0, hpage_pmd_size);
> +       munmap(p, hpage_pmd_size);
> +}
> +
>  static void collapse_fork(void)
>  {
>         int wstatus;
> @@ -916,6 +973,7 @@ int main(void)
>         collapse_max_ptes_swap();
>         collapse_signle_pte_entry_compound();
>         collapse_full_of_compound();
> +       collapse_compound_extreme();
>         collapse_fork();
>         collapse_fork_compound();
>         collapse_max_ptes_shared();
> --
>  Kirill A. Shutemov
diff mbox series

Patch

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index b47edfe57f7b..c8c2c463095c 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -515,6 +515,17 @@  void __khugepaged_exit(struct mm_struct *mm)
 
 static void release_pte_page(struct page *page)
 {
+	/*
+	 * We need to unlock and put compound page on LRU only once.
+	 * The rest of the pages have to be locked and not on LRU here.
+	 */
+	VM_BUG_ON_PAGE(!PageCompound(page) &&
+			(!PageLocked(page) && PageLRU(page)), page);
+
+	if (!PageLocked(page))
+		return;
+
+	page = compound_head(page);
 	dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page));
 	unlock_page(page);
 	putback_lru_page(page);
@@ -537,6 +548,7 @@  static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 	pte_t *_pte;
 	int none_or_zero = 0, result = 0, referenced = 0;
 	bool writable = false;
+	LIST_HEAD(compound_pagelist);
 
 	for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
 	     _pte++, address += PAGE_SIZE) {
@@ -561,13 +573,23 @@  static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 			goto out;
 		}
 
-		/* TODO: teach khugepaged to collapse THP mapped with pte */
+		VM_BUG_ON_PAGE(!PageAnon(page), page);
+
 		if (PageCompound(page)) {
-			result = SCAN_PAGE_COMPOUND;
-			goto out;
-		}
+			struct page *p;
+			page = compound_head(page);
 
-		VM_BUG_ON_PAGE(!PageAnon(page), page);
+			/*
+	 * Check if we have dealt with the compound page
+			 * already
+			 */
+			list_for_each_entry(p, &compound_pagelist, lru) {
+				if (page ==  p)
+					break;
+			}
+			if (page ==  p)
+				continue;
+		}
 
 		/*
 		 * We can do it before isolate_lru_page because the
@@ -640,6 +662,9 @@  static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 		    page_is_young(page) || PageReferenced(page) ||
 		    mmu_notifier_test_young(vma->vm_mm, address))
 			referenced++;
+
+		if (PageCompound(page))
+			list_add_tail(&page->lru, &compound_pagelist);
 	}
 	if (likely(writable)) {
 		if (likely(referenced)) {
@@ -1185,11 +1210,7 @@  static int khugepaged_scan_pmd(struct mm_struct *mm,
 			goto out_unmap;
 		}
 
-		/* TODO: teach khugepaged to collapse THP mapped with pte */
-		if (PageCompound(page)) {
-			result = SCAN_PAGE_COMPOUND;
-			goto out_unmap;
-		}
+		page = compound_head(page);
 
 		/*
 		 * Record which node the original page is from and save this