
[PATCHv2,5/8] khugepaged: Allow to callapse a page shared across fork

Message ID 20200403112928.19742-6-kirill.shutemov@linux.intel.com (mailing list archive)
State New, archived
Series thp/khugepaged improvements and CoW semantics

Commit Message

Kirill A. Shutemov April 3, 2020, 11:29 a.m. UTC
The page can be included into collapse as long as it doesn't have extra
pins (from GUP or otherwise).

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
---
 mm/khugepaged.c | 25 ++++++++++++++-----------
 1 file changed, 14 insertions(+), 11 deletions(-)
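
For a base page shared across fork(), the arithmetic behind the new check
works out as follows (a sketch of the accounting; it assumes an anonymous
page that is not in the swap cache and has no other references):

	/* Parent and child both map the page (write-protected after fork): */
	total_mapcount(page) == 2	/* one PTE in each process */
	PageSwapCache(page)  == 0
	page_count(page)     == 2	/* one reference per mapping */
	/* 2 + 0 == 2: no external pins, collapse is allowed. */

	/* With a GUP pin taken on the page: */
	page_count(page)     == 3	/* extra reference held by GUP */
	/* 2 + 0 != 3: SCAN_PAGE_COUNT, collapse is refused. */

The old check, page_count(page) != 1 + PageSwapCache(page), would have
refused the shared page outright (2 != 1); relaxing that is the point of
this patch.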

Comments

Ralph Campbell April 6, 2020, 8:15 p.m. UTC | #1
On 4/3/20 4:29 AM, Kirill A. Shutemov wrote:
> The page can be included into collapse as long as it doesn't have extra
> pins (from GUP or otherwise).
> 
> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> ---
>   mm/khugepaged.c | 25 ++++++++++++++-----------
>   1 file changed, 14 insertions(+), 11 deletions(-)
> 
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 57ff287caf6b..1e7e6543ebca 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -581,11 +581,18 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>   		}
>   
>   		/*
> -		 * cannot use mapcount: can't collapse if there's a gup pin.
> -		 * The page must only be referenced by the scanned process
> -		 * and page swap cache.
> +		 * Check if the page has any GUP (or other external) pins.
> +		 *
> +		 * The page table that maps the page has been already unlinked
> +		 * from the page table tree and this process cannot get
> +		 * additinal pin on the page.

s/additinal/an additional

> +		 *
> +		 * New pins can come later if the page is shared across fork,
> +		 * but not for the this process. It is fine. The other process
> +		 * cannot write to the page, only trigger CoW.
>   		 */
> -		if (page_count(page) != 1 + PageSwapCache(page)) {
> +		if (total_mapcount(page) + PageSwapCache(page) !=
> +				page_count(page)) {
>   			unlock_page(page);
>   			result = SCAN_PAGE_COUNT;
>   			goto out;
> @@ -672,7 +679,6 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
>   		} else {
>   			src_page = pte_page(pteval);
>   			copy_user_highpage(page, src_page, address, vma);
> -			VM_BUG_ON_PAGE(page_mapcount(src_page) != 1, src_page);
>   			release_pte_page(src_page);
>   			/*
>   			 * ptl mostly unnecessary, but preempt has to
> @@ -1206,12 +1212,9 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
>   			goto out_unmap;
>   		}
>   
> -		/*
> -		 * cannot use mapcount: can't collapse if there's a gup pin.
> -		 * The page must only be referenced by the scanned process
> -		 * and page swap cache.
> -		 */
> -		if (page_count(page) != 1 + PageSwapCache(page)) {
> +		/* Check if the page has any GUP (or other external) pins */
> +		if (total_mapcount(page) + PageSwapCache(page) !=
> +				page_count(page)) {
>   			result = SCAN_PAGE_COUNT;
>   			goto out_unmap;
>   		}
>
Yang Shi April 6, 2020, 8:50 p.m. UTC | #2
On 4/3/20 4:29 AM, Kirill A. Shutemov wrote:
> The page can be included into collapse as long as it doesn't have extra
> pins (from GUP or otherwise).
>
> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> ---
>   mm/khugepaged.c | 25 ++++++++++++++-----------
>   1 file changed, 14 insertions(+), 11 deletions(-)
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 57ff287caf6b..1e7e6543ebca 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -581,11 +581,18 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>   		}
>   
>   		/*
> -		 * cannot use mapcount: can't collapse if there's a gup pin.
> -		 * The page must only be referenced by the scanned process
> -		 * and page swap cache.
> +		 * Check if the page has any GUP (or other external) pins.
> +		 *
> +		 * The page table that maps the page has been already unlinked
> +		 * from the page table tree and this process cannot get
> +		 * additinal pin on the page.
> +		 *
> +		 * New pins can come later if the page is shared across fork,
> +		 * but not for the this process. It is fine. The other process
> +		 * cannot write to the page, only trigger CoW.
>   		 */
> -		if (page_count(page) != 1 + PageSwapCache(page)) {
> +		if (total_mapcount(page) + PageSwapCache(page) !=
> +				page_count(page)) {

This check looks fine for a base page, but what if the page is a 
PTE-mapped THP? The following patch makes this possible.

If it is a PTE-mapped THP and the page is in the swap cache, the refcount 
would be 512 + the number of PTE-mapped pages.

Shall we do the below change in the following patch?

extra_pins = PageSwapCache(page) ? compound_nr(page) - 1 : 0;
if (total_mapcount(page) + PageSwapCache(page) != page_count(page) -
		extra_pins) {
...
}
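
To make the failure mode concrete, here is the accounting for a THP in the
swap cache with N PTE-mapped subpages (a sketch following the numbers above;
it assumes HPAGE_PMD_NR == 512 and no other pins):

	total_mapcount(page) == N
	PageSwapCache(page)  == 1
	page_count(page)     == 512 + N	/* swap cache holds one ref per subpage */

	/* Current check: N + 1 != 512 + N, so collapse is always refused.  */
	/* With extra_pins = compound_nr(page) - 1 == 511:
	 *   N + 1 == (512 + N) - 511, so collapse is allowed as intended.  */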

>   			unlock_page(page);
>   			result = SCAN_PAGE_COUNT;
>   			goto out;
> @@ -672,7 +679,6 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
>   		} else {
>   			src_page = pte_page(pteval);
>   			copy_user_highpage(page, src_page, address, vma);
> -			VM_BUG_ON_PAGE(page_mapcount(src_page) != 1, src_page);
>   			release_pte_page(src_page);
>   			/*
>   			 * ptl mostly unnecessary, but preempt has to
> @@ -1206,12 +1212,9 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
>   			goto out_unmap;
>   		}
>   
> -		/*
> -		 * cannot use mapcount: can't collapse if there's a gup pin.
> -		 * The page must only be referenced by the scanned process
> -		 * and page swap cache.
> -		 */
> -		if (page_count(page) != 1 + PageSwapCache(page)) {
> +		/* Check if the page has any GUP (or other external) pins */
> +		if (total_mapcount(page) + PageSwapCache(page) !=
> +				page_count(page)) {
>   			result = SCAN_PAGE_COUNT;
>   			goto out_unmap;
>   		}
John Hubbard April 6, 2020, 9:30 p.m. UTC | #3
On 4/3/20 4:29 AM, Kirill A. Shutemov wrote:
> The page can be included into collapse as long as it doesn't have extra
> pins (from GUP or otherwise).

Hi Kirill,

s/callapse/collapse/ in the Subject line.

The commit message should mention that you're also removing a
VM_BUG_ON_PAGE().

> 
> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> ---
>   mm/khugepaged.c | 25 ++++++++++++++-----------
>   1 file changed, 14 insertions(+), 11 deletions(-)
> 
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 57ff287caf6b..1e7e6543ebca 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -581,11 +581,18 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>   		}
>   
>   		/*
> -		 * cannot use mapcount: can't collapse if there's a gup pin.
> -		 * The page must only be referenced by the scanned process
> -		 * and page swap cache.
> +		 * Check if the page has any GUP (or other external) pins.
> +		 *
> +		 * The page table that maps the page has been already unlinked
> +		 * from the page table tree and this process cannot get
> +		 * additinal pin on the page.


I'd recommend this wording instead, for the last two lines:

		 * from the page table tree. Therefore, this page will not
		 * normally receive any additional pins.


> +		 *
> +		 * New pins can come later if the page is shared across fork,
> +		 * but not for the this process. It is fine. The other process
> +		 * cannot write to the page, only trigger CoW.
>   		 */
> -		if (page_count(page) != 1 + PageSwapCache(page)) {
> +		if (total_mapcount(page) + PageSwapCache(page) !=
> +				page_count(page)) {


I think it's time to put that logic ( "does this page have any extra references")
into a small function. It's already duplicated once below. And the documentation is
duplicated as well.
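
One possible shape for such a helper (a hypothetical sketch; the name is
made up, and the compound-page swap cache accounting raised elsewhere in
this thread would still need folding in):

	/*
	 * Hypothetical helper: returns true when the only references to
	 * the page are its mappings plus its swap cache entry, i.e. there
	 * are no GUP or other external pins.
	 */
	static bool is_refcount_suitable(struct page *page)
	{
		return total_mapcount(page) + PageSwapCache(page) ==
			page_count(page);
	}

Both call sites would then reduce to:
if (!is_refcount_suitable(page)) { result = SCAN_PAGE_COUNT; ... }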

I took a quick peek at this patch because, after adding pin_user_pages*() APIs earlier
to complement get_user_pages*(), I had a moment of doubt here: what if I'd done it in
a way that required additional logic here? Fortunately, that's not the case: all
pin_user_pages() calls on huge pages take a "primary/real" refcount, in addition
to scribbling into the compound_pincount_ptr() area. whew. :)



>   			unlock_page(page);
>   			result = SCAN_PAGE_COUNT;
>   			goto out;
> @@ -672,7 +679,6 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
>   		} else {
>   			src_page = pte_page(pteval);
>   			copy_user_highpage(page, src_page, address, vma);
> -			VM_BUG_ON_PAGE(page_mapcount(src_page) != 1, src_page);
>   			release_pte_page(src_page);
>   			/*
>   			 * ptl mostly unnecessary, but preempt has to
> @@ -1206,12 +1212,9 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
>   			goto out_unmap;
>   		}
>   
> -		/*
> -		 * cannot use mapcount: can't collapse if there's a gup pin.
> -		 * The page must only be referenced by the scanned process
> -		 * and page swap cache.
> -		 */
> -		if (page_count(page) != 1 + PageSwapCache(page)) {
> +		/* Check if the page has any GUP (or other external) pins */
> +		if (total_mapcount(page) + PageSwapCache(page) !=
> +				page_count(page)) {
>   			result = SCAN_PAGE_COUNT;
>   			goto out_unmap;
>   		}
> 



thanks,
Kirill A. Shutemov April 8, 2020, 1:10 p.m. UTC | #4
On Mon, Apr 06, 2020 at 01:50:56PM -0700, Yang Shi wrote:
> 
> 
> On 4/3/20 4:29 AM, Kirill A. Shutemov wrote:
> > The page can be included into collapse as long as it doesn't have extra
> > pins (from GUP or otherwise).
> > 
> > Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> > ---
> >   mm/khugepaged.c | 25 ++++++++++++++-----------
> >   1 file changed, 14 insertions(+), 11 deletions(-)
> > 
> > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > index 57ff287caf6b..1e7e6543ebca 100644
> > --- a/mm/khugepaged.c
> > +++ b/mm/khugepaged.c
> > @@ -581,11 +581,18 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> >   		}
> >   		/*
> > -		 * cannot use mapcount: can't collapse if there's a gup pin.
> > -		 * The page must only be referenced by the scanned process
> > -		 * and page swap cache.
> > +		 * Check if the page has any GUP (or other external) pins.
> > +		 *
> > +		 * The page table that maps the page has been already unlinked
> > +		 * from the page table tree and this process cannot get
> > +		 * additinal pin on the page.
> > +		 *
> > +		 * New pins can come later if the page is shared across fork,
> > +		 * but not for the this process. It is fine. The other process
> > +		 * cannot write to the page, only trigger CoW.
> >   		 */
> > -		if (page_count(page) != 1 + PageSwapCache(page)) {
> > +		if (total_mapcount(page) + PageSwapCache(page) !=
> > +				page_count(page)) {
> 
> This check looks fine for a base page, but what if the page is a PTE-mapped
> THP? The following patch makes this possible.
> 
> If it is a PTE-mapped THP and the page is in the swap cache, the refcount
> would be 512 + the number of PTE-mapped pages.
> 
> Shall we do the below change in the following patch?
> 
> extra_pins = PageSwapCache(page) ? compound_nr(page) - 1 : 0;
> if (total_mapcount(page) + PageSwapCache(page) != page_count(page) -
> extra_pins) {
> ...

Looks like you're right.

It would be nice to have a test case to demonstrate the issue.

Is there any way to trigger moving the page to swap cache? I don't see it
immediately.
Yang Shi April 8, 2020, 6:51 p.m. UTC | #5
On 4/8/20 6:10 AM, Kirill A. Shutemov wrote:
> On Mon, Apr 06, 2020 at 01:50:56PM -0700, Yang Shi wrote:
>>
>> On 4/3/20 4:29 AM, Kirill A. Shutemov wrote:
>>> The page can be included into collapse as long as it doesn't have extra
>>> pins (from GUP or otherwise).
>>>
>>> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
>>> ---
>>>    mm/khugepaged.c | 25 ++++++++++++++-----------
>>>    1 file changed, 14 insertions(+), 11 deletions(-)
>>>
>>> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
>>> index 57ff287caf6b..1e7e6543ebca 100644
>>> --- a/mm/khugepaged.c
>>> +++ b/mm/khugepaged.c
>>> @@ -581,11 +581,18 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>>>    		}
>>>    		/*
>>> -		 * cannot use mapcount: can't collapse if there's a gup pin.
>>> -		 * The page must only be referenced by the scanned process
>>> -		 * and page swap cache.
>>> +		 * Check if the page has any GUP (or other external) pins.
>>> +		 *
>>> +		 * The page table that maps the page has been already unlinked
>>> +		 * from the page table tree and this process cannot get
>>> +		 * additinal pin on the page.
>>> +		 *
>>> +		 * New pins can come later if the page is shared across fork,
>>> +		 * but not for the this process. It is fine. The other process
>>> +		 * cannot write to the page, only trigger CoW.
>>>    		 */
>>> -		if (page_count(page) != 1 + PageSwapCache(page)) {
>>> +		if (total_mapcount(page) + PageSwapCache(page) !=
>>> +				page_count(page)) {
>> This check looks fine for a base page, but what if the page is a PTE-mapped
>> THP? The following patch makes this possible.
>>
>> If it is a PTE-mapped THP and the page is in the swap cache, the refcount
>> would be 512 + the number of PTE-mapped pages.
>>
>> Shall we do the below change in the following patch?
>>
>> extra_pins = PageSwapCache(page) ? compound_nr(page) - 1 : 0;
>> if (total_mapcount(page) + PageSwapCache(page) != page_count(page) -
>> extra_pins) {
>> ...
> Looks like you're right.
>
> It would be nice to have a test case to demonstrate the issue.
>
> Is there any way to trigger moving the page to swap cache? I don't see it
> immediately.

It doesn't sound easy to trigger since it totally depends on timing; I'm 
wondering if we may have to use MADV_PAGEOUT. Something like the below, off 
the top of my head, may trigger this:


    CPU A                       CPU B                        CPU C
In parent:
MADV_HUGEPAGE
page fault to fill with THP
fork
                                                             In child:
                                                             MADV_NOHUGEPAGE
                                                             MADV_DONTNEED (split pmd)
                                                             MADV_PAGEOUT
                                                                 -> add_to_swap
                            khugepaged scans parent and
                            tries to collapse the
                            PTE-mapped THP
                                                                 -> try_to_unmap

When doing MADV_DONTNEED we need to make sure the head page is unmapped, 
since MADV_PAGEOUT calls page_mapcount(page) to skip shared mappings.
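
A hedged userspace sketch of the parent/child steps in the table above (it
assumes MADV_PAGEOUT from Linux 5.4+ headers; the khugepaged side cannot be
forced from userspace, so hitting the window is still a matter of timing,
and the follow-up below notes this particular sequence turns out not to
trigger the race; error handling omitted):

	#include <string.h>
	#include <sys/mman.h>
	#include <unistd.h>

	#define SZ_2M	(2UL << 20)

	int main(void)
	{
		char *buf = mmap(NULL, SZ_2M, PROT_READ | PROT_WRITE,
				 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

		madvise(buf, SZ_2M, MADV_HUGEPAGE);
		memset(buf, 1, SZ_2M);		/* page fault to fill with THP */

		if (fork() == 0) {
			/* Child: split the pmd, then push the pages at swap. */
			madvise(buf, SZ_2M, MADV_NOHUGEPAGE);
			madvise(buf, 4096, MADV_DONTNEED);  /* unmap head page, split pmd */
			madvise(buf, SZ_2M, MADV_PAGEOUT);  /* add_to_swap ... try_to_unmap */
			_exit(0);
		}

		/* Parent keeps the range mapped so khugepaged may scan it
		 * and try to collapse the now PTE-mapped THP. */
		pause();
		return 0;
	}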

>
Yang Shi April 10, 2020, 12:03 a.m. UTC | #6
On 4/8/20 11:51 AM, Yang Shi wrote:
>
>
> On 4/8/20 6:10 AM, Kirill A. Shutemov wrote:
>> On Mon, Apr 06, 2020 at 01:50:56PM -0700, Yang Shi wrote:
>>>
>>> On 4/3/20 4:29 AM, Kirill A. Shutemov wrote:
>>>> The page can be included into collapse as long as it doesn't have 
>>>> extra
>>>> pins (from GUP or otherwise).
>>>>
>>>> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
>>>> ---
>>>>    mm/khugepaged.c | 25 ++++++++++++++-----------
>>>>    1 file changed, 14 insertions(+), 11 deletions(-)
>>>>
>>>> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
>>>> index 57ff287caf6b..1e7e6543ebca 100644
>>>> --- a/mm/khugepaged.c
>>>> +++ b/mm/khugepaged.c
>>>> @@ -581,11 +581,18 @@ static int 
>>>> __collapse_huge_page_isolate(struct vm_area_struct *vma,
>>>>            }
>>>>            /*
>>>> -         * cannot use mapcount: can't collapse if there's a gup pin.
>>>> -         * The page must only be referenced by the scanned process
>>>> -         * and page swap cache.
>>>> +         * Check if the page has any GUP (or other external) pins.
>>>> +         *
>>>> +         * The page table that maps the page has been already 
>>>> unlinked
>>>> +         * from the page table tree and this process cannot get
>>>> +         * additinal pin on the page.
>>>> +         *
>>>> +         * New pins can come later if the page is shared across fork,
>>>> +         * but not for the this process. It is fine. The other 
>>>> process
>>>> +         * cannot write to the page, only trigger CoW.
>>>>             */
>>>> -        if (page_count(page) != 1 + PageSwapCache(page)) {
>>>> +        if (total_mapcount(page) + PageSwapCache(page) !=
>>>> +                page_count(page)) {
>>> This check looks fine for a base page, but what if the page is a 
>>> PTE-mapped THP? The following patch makes this possible.
>>>
>>> If it is a PTE-mapped THP and the page is in the swap cache, the refcount 
>>> would be 512 + the number of PTE-mapped pages.
>>>
>>> Shall we do the below change in the following patch?
>>>
>>> extra_pins = PageSwapCache(page) ? compound_nr(page) - 1 : 0;
>>> if (total_mapcount(page) + PageSwapCache(page) != page_count(page) -
>>> extra_pins) {
>>> ...
>> Looks like you're right.
>>
>> It would be nice to have a test case to demonstrate the issue.
>>
>> Is there any way to trigger moving the page to swap cache? I don't 
>> see it
>> immediately.
>
> It doesn't sound easy to trigger since it totally depends on timing; I'm 
> wondering if we may have to use MADV_PAGEOUT. Something like the below, off 
> the top of my head, may trigger this:
>
>
>     CPU A                       CPU B                        CPU C
> In parent:
> MADV_HUGEPAGE
> page fault to fill with THP
> fork
>                                                              In child:
>                                                              MADV_NOHUGEPAGE
>                                                              MADV_DONTNEED (split pmd)
>                                                              MADV_PAGEOUT
>                                                                  -> add_to_swap
>                             khugepaged scans parent and
>                             tries to collapse the
>                             PTE-mapped THP
>                                                                  -> try_to_unmap
>
> When doing MADV_DONTNEED we need to make sure the head page is unmapped, 
> since MADV_PAGEOUT calls page_mapcount(page) to skip shared mappings.

This can't trigger it, since MADV_PAGEOUT would isolate the page from the 
LRU, so khugepaged_scan_pmd() can't reach the refcount check at all.

If try_to_unmap() or pageout() fails, the page may stay in the swap cache, 
but I haven't thought of an easy way to make that happen.

>
>>
>
Kirill A. Shutemov April 10, 2020, 3:55 p.m. UTC | #7
On Mon, Apr 06, 2020 at 02:30:07PM -0700, John Hubbard wrote:
> On 4/3/20 4:29 AM, Kirill A. Shutemov wrote:
> > The page can be included into collapse as long as it doesn't have extra
> > pins (from GUP or otherwise).
> 
> Hi Kirill,
> 
> s/callapse/collapse/ in the Subject line.
> 
> The commit message should mention that you're also removing a
> VM_BUG_ON_PAGE().
> 
> > 
> > Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> > ---
> >   mm/khugepaged.c | 25 ++++++++++++++-----------
> >   1 file changed, 14 insertions(+), 11 deletions(-)
> > 
> > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > index 57ff287caf6b..1e7e6543ebca 100644
> > --- a/mm/khugepaged.c
> > +++ b/mm/khugepaged.c
> > @@ -581,11 +581,18 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> >   		}
> >   		/*
> > -		 * cannot use mapcount: can't collapse if there's a gup pin.
> > -		 * The page must only be referenced by the scanned process
> > -		 * and page swap cache.
> > +		 * Check if the page has any GUP (or other external) pins.
> > +		 *
> > +		 * The page table that maps the page has been already unlinked
> > +		 * from the page table tree and this process cannot get
> > +		 * additinal pin on the page.
> 
> 
> I'd recommend this wording instead, for the last two lines:
> 
> 		 * from the page table tree. Therefore, this page will not
> 		 * normally receive any additional pins.

I guess I'm not being clear enough.

The point is that the page cannot get any new pins from this process. It
can get a new pin from another process after the check. But that is fine:
if the page is mapped multiple times, it has to be write-protected (CoW
after fork()) and we can rely on the page's content not changing under
us.

Does it make sense? Wording suggestions are welcome.

> > +		 *
> > +		 * New pins can come later if the page is shared across fork,
> > +		 * but not for the this process. It is fine. The other process
> > +		 * cannot write to the page, only trigger CoW.
> >   		 */
> > -		if (page_count(page) != 1 + PageSwapCache(page)) {
> > +		if (total_mapcount(page) + PageSwapCache(page) !=
> > +				page_count(page)) {
> 
> 
> I think it's time to put that logic ( "does this page have any extra references")
> into a small function. It's already duplicated once below. And the documentation is
> duplicated as well.

Fair enough.

But the comments have to stay where they are, because the context is
different. The first check is speculative, done before the page table is
unlinked from the page table tree, and it is inherently racy. The second
one is not.
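
For illustration, a sketch of how the helper suggested above could keep the
context-specific comments at each call site (the helper name is
hypothetical):

	/* In khugepaged_scan_pmd(): speculative check. The page table is
	 * still linked, so mapcount/refcount can change under us; a false
	 * positive only costs a wasted collapse attempt. */
	if (!is_refcount_suitable(page)) {
		result = SCAN_PAGE_COUNT;
		goto out_unmap;
	}

	/* In __collapse_huge_page_isolate(): authoritative check. The page
	 * table has been unlinked, so this process cannot take new pins. */
	if (!is_refcount_suitable(page)) {
		unlock_page(page);
		result = SCAN_PAGE_COUNT;
		goto out;
	}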

> I took a quick peek at this patch because, after adding pin_user_pages*() APIs earlier
> to complement get_user_pages*(), I had a moment of doubt here: what if I'd done it in
> a way that required additional logic here? Fortunately, that's not the case: all
> pin_user_pages() calls on huge pages take a "primary/real" refcount, in addition
> to scribbling into the compound_pincount_ptr() area. whew. :)
> 
> 
> 
> >   			unlock_page(page);
> >   			result = SCAN_PAGE_COUNT;
> >   			goto out;
> > @@ -672,7 +679,6 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
> >   		} else {
> >   			src_page = pte_page(pteval);
> >   			copy_user_highpage(page, src_page, address, vma);
> > -			VM_BUG_ON_PAGE(page_mapcount(src_page) != 1, src_page);
> >   			release_pte_page(src_page);
> >   			/*
> >   			 * ptl mostly unnecessary, but preempt has to
> > @@ -1206,12 +1212,9 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
> >   			goto out_unmap;
> >   		}
> > -		/*
> > -		 * cannot use mapcount: can't collapse if there's a gup pin.
> > -		 * The page must only be referenced by the scanned process
> > -		 * and page swap cache.
> > -		 */
> > -		if (page_count(page) != 1 + PageSwapCache(page)) {
> > +		/* Check if the page has any GUP (or other external) pins */
> > +		if (total_mapcount(page) + PageSwapCache(page) !=
> > +				page_count(page)) {
> >   			result = SCAN_PAGE_COUNT;
> >   			goto out_unmap;
> >   		}
> > 
> 
> 
> 
> thanks,
> -- 
> John Hubbard
> NVIDIA
Kirill A. Shutemov April 10, 2020, 3:56 p.m. UTC | #8
On Thu, Apr 09, 2020 at 05:03:00PM -0700, Yang Shi wrote:
> 
> 
> On 4/8/20 11:51 AM, Yang Shi wrote:
> > 
> > 
> > On 4/8/20 6:10 AM, Kirill A. Shutemov wrote:
> > > On Mon, Apr 06, 2020 at 01:50:56PM -0700, Yang Shi wrote:
> > > > 
> > > > On 4/3/20 4:29 AM, Kirill A. Shutemov wrote:
> > > > > The page can be included into collapse as long as it doesn't
> > > > > have extra
> > > > > pins (from GUP or otherwise).
> > > > > 
> > > > > Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> > > > > ---
> > > > >    mm/khugepaged.c | 25 ++++++++++++++-----------
> > > > >    1 file changed, 14 insertions(+), 11 deletions(-)
> > > > > 
> > > > > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > > > > index 57ff287caf6b..1e7e6543ebca 100644
> > > > > --- a/mm/khugepaged.c
> > > > > +++ b/mm/khugepaged.c
> > > > > @@ -581,11 +581,18 @@ static int
> > > > > __collapse_huge_page_isolate(struct vm_area_struct *vma,
> > > > >            }
> > > > >            /*
> > > > > -         * cannot use mapcount: can't collapse if there's a gup pin.
> > > > > -         * The page must only be referenced by the scanned process
> > > > > -         * and page swap cache.
> > > > > +         * Check if the page has any GUP (or other external) pins.
> > > > > +         *
> > > > > +         * The page table that maps the page has been
> > > > > already unlinked
> > > > > +         * from the page table tree and this process cannot get
> > > > > +         * additinal pin on the page.
> > > > > +         *
> > > > > +         * New pins can come later if the page is shared across fork,
> > > > > +         * but not for the this process. It is fine. The
> > > > > other process
> > > > > +         * cannot write to the page, only trigger CoW.
> > > > >             */
> > > > > -        if (page_count(page) != 1 + PageSwapCache(page)) {
> > > > > +        if (total_mapcount(page) + PageSwapCache(page) !=
> > > > > +                page_count(page)) {
> > > > This check looks fine for a base page, but what if the page is a
> > > > PTE-mapped THP? The following patch makes this possible.
> > > > 
> > > > If it is a PTE-mapped THP and the page is in the swap cache, the
> > > > refcount would be 512 + the number of PTE-mapped pages.
> > > > 
> > > > Shall we do the below change in the following patch?
> > > > 
> > > > extra_pins = PageSwapCache(page) ? compound_nr(page) - 1 : 0;
> > > > if (total_mapcount(page) + PageSwapCache(page) != page_count(page) -
> > > > extra_pins) {
> > > > ...
> > > Looks like you're right.
> > > 
> > > It would be nice to have a test case to demonstrate the issue.
> > > 
> > > Is there any way to trigger moving the page to swap cache? I don't
> > > see it
> > > immediately.
> > 
> > It doesn't sound easy to trigger since it totally depends on timing; I'm
> > wondering if we may have to use MADV_PAGEOUT. Something like the below, off
> > the top of my head, may trigger this:
> > 
> > 
> >     CPU A                       CPU B                        CPU C
> > In parent:
> > MADV_HUGEPAGE
> > page fault to fill with THP
> > fork
> >                                                              In child:
> >                                                              MADV_NOHUGEPAGE
> >                                                              MADV_DONTNEED (split pmd)
> >                                                              MADV_PAGEOUT
> >                                                                  -> add_to_swap
> >                             khugepaged scans parent and
> >                             tries to collapse the
> >                             PTE-mapped THP
> >                                                                  -> try_to_unmap
> > 
> > When doing MADV_DONTNEED we need to make sure the head page is unmapped,
> > since MADV_PAGEOUT calls page_mapcount(page) to skip shared mappings.
> 
> This can't trigger it, since MADV_PAGEOUT would isolate the page from the LRU,
> so khugepaged_scan_pmd() can't reach the refcount check at all.
> 
> If try_to_unmap() or pageout() fails, the page may stay in the swap cache, but
> I haven't thought of an easy way to make that happen.

Okay, I'll fix it without a test case.
John Hubbard April 10, 2020, 8:59 p.m. UTC | #9
On 4/10/20 8:55 AM, Kirill A. Shutemov wrote:
...
>>> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
>>> index 57ff287caf6b..1e7e6543ebca 100644
>>> --- a/mm/khugepaged.c
>>> +++ b/mm/khugepaged.c
>>> @@ -581,11 +581,18 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>>>    		}
>>>    		/*
>>> -		 * cannot use mapcount: can't collapse if there's a gup pin.
>>> -		 * The page must only be referenced by the scanned process
>>> -		 * and page swap cache.
>>> +		 * Check if the page has any GUP (or other external) pins.
>>> +		 *
>>> +		 * The page table that maps the page has been already unlinked
>>> +		 * from the page table tree and this process cannot get
>>> +		 * additinal pin on the page.
>>
>>
>> I'd recommend this wording instead, for the last two lines:
>>
>> 		 * from the page table tree. Therefore, this page will not
>> 		 * normally receive any additional pins.
> 
> I guess I'm not being clear enough.
> 
> The point is that the page cannot get any new pins from this process. It
> can get a new pin from another process after the check. But that is fine:
> if the page is mapped multiple times, it has to be write-protected (CoW
> after fork()) and we can rely on the page's content not changing under
> us.
> 
> Does it make sense? Wording suggestions are welcome.


I think I understood what you were saying. The problem is that it was ignoring
a couple of points, especially in an RDMA situation: 1) the page can be
pinned by various drivers, on behalf of other processes, even if the original
process is being torn down, and 2) it doesn't really matter which process pins
a page--the end result is that it's pinned.

So that's why I changed the comment to be much milder: "this page will not
normally receive any additional pins". "Normally" means "in a non-RDMA
setup, for example".

Or am I missing some other point here?

> 
>>> +		 *
>>> +		 * New pins can come later if the page is shared across fork,
>>> +		 * but not for the this process. It is fine. The other process
>>> +		 * cannot write to the page, only trigger CoW.
>>>    		 */
>>> -		if (page_count(page) != 1 + PageSwapCache(page)) {
>>> +		if (total_mapcount(page) + PageSwapCache(page) !=
>>> +				page_count(page)) {
>>
>>
>> I think it's time to put that logic ( "does this page have any extra references")
>> into a small function. It's already duplicated once below. And the documentation is
>> duplicated as well.
> 
> Fair enough.
> 
> But comments have to stay where they are. Because the context is
> different. The first time we check speculatively, before the page table is
> unlinked from the page table tree and this check is inherintly racy.
> Unlike the second one.


Right. Let's take another look at them after you point out to me why my response above
is all wrong... :)

> 
>> I took a quick peek at this patch because, after adding pin_user_pages*() APIs earlier
>> to complement get_user_pages*(), I had a moment of doubt here: what if I'd done it in
>> a way that required additional logic here? Fortunately, that's not the case: all
>> pin_user_pages() calls on huge pages take a "primary/real" refcount, in addition
>> to scribbling into the compound_pincount_ptr() area. whew. :)
>>
>>
>>
>>>    			unlock_page(page);
>>>    			result = SCAN_PAGE_COUNT;
>>>    			goto out;
>>> @@ -672,7 +679,6 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
>>>    		} else {
>>>    			src_page = pte_page(pteval);
>>>    			copy_user_highpage(page, src_page, address, vma);
>>> -			VM_BUG_ON_PAGE(page_mapcount(src_page) != 1, src_page);
>>>    			release_pte_page(src_page);
>>>    			/*
>>>    			 * ptl mostly unnecessary, but preempt has to
>>> @@ -1206,12 +1212,9 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
>>>    			goto out_unmap;
>>>    		}
>>> -		/*
>>> -		 * cannot use mapcount: can't collapse if there's a gup pin.
>>> -		 * The page must only be referenced by the scanned process
>>> -		 * and page swap cache.
>>> -		 */
>>> -		if (page_count(page) != 1 + PageSwapCache(page)) {
>>> +		/* Check if the page has any GUP (or other external) pins */
>>> +		if (total_mapcount(page) + PageSwapCache(page) !=
>>> +				page_count(page)) {
>>>    			result = SCAN_PAGE_COUNT;
>>>    			goto out_unmap;
>>>    		}
>>>

thanks,
Kirill A. Shutemov April 13, 2020, 9:42 a.m. UTC | #10
On Fri, Apr 10, 2020 at 01:59:22PM -0700, John Hubbard wrote:
> I think I understood what you were saying. The problem is that it was ignoring
> a couple of points, especially in an RDMA situation: 1) the page can be
> pinned by various drivers, on behalf of other processes, even if the original
> process is being torn down, and 2) it doesn't really matter which process pins
> a page--the end result is that it's pinned.

Well, no. It is critical that nobody gets new pins after this point on
behalf of *this* process, as we are about to change what is mapped at this
virtual address range. We must avoid a situation where khugepaged screws
legitimate GUP users and what the process sees differs from what GUP sees.

Pins taken on behalf of other processes after this point are not relevant
to us.

I will keep the comment as is for now. As you can see, I'm struggling to
communicate my point. Any better wording is welcome.

Patch

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 57ff287caf6b..1e7e6543ebca 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -581,11 +581,18 @@  static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 		}
 
 		/*
-		 * cannot use mapcount: can't collapse if there's a gup pin.
-		 * The page must only be referenced by the scanned process
-		 * and page swap cache.
+		 * Check if the page has any GUP (or other external) pins.
+		 *
+		 * The page table that maps the page has been already unlinked
+		 * from the page table tree and this process cannot get
+		 * additinal pin on the page.
+		 *
+		 * New pins can come later if the page is shared across fork,
+		 * but not for the this process. It is fine. The other process
+		 * cannot write to the page, only trigger CoW.
 		 */
-		if (page_count(page) != 1 + PageSwapCache(page)) {
+		if (total_mapcount(page) + PageSwapCache(page) !=
+				page_count(page)) {
 			unlock_page(page);
 			result = SCAN_PAGE_COUNT;
 			goto out;
@@ -672,7 +679,6 @@  static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
 		} else {
 			src_page = pte_page(pteval);
 			copy_user_highpage(page, src_page, address, vma);
-			VM_BUG_ON_PAGE(page_mapcount(src_page) != 1, src_page);
 			release_pte_page(src_page);
 			/*
 			 * ptl mostly unnecessary, but preempt has to
@@ -1206,12 +1212,9 @@  static int khugepaged_scan_pmd(struct mm_struct *mm,
 			goto out_unmap;
 		}
 
-		/*
-		 * cannot use mapcount: can't collapse if there's a gup pin.
-		 * The page must only be referenced by the scanned process
-		 * and page swap cache.
-		 */
-		if (page_count(page) != 1 + PageSwapCache(page)) {
+		/* Check if the page has any GUP (or other external) pins */
+		if (total_mapcount(page) + PageSwapCache(page) !=
+				page_count(page)) {
 			result = SCAN_PAGE_COUNT;
 			goto out_unmap;
 		}