diff mbox series

[2/2] mm/hugetlb: refactor subpage recording

Message ID 20210125205744.10203-3-joao.m.martins@oracle.com (mailing list archive)
State New, archived
Headers show
Series mm/hugetlb: follow_hugetlb_page() improvements | expand

Commit Message

Joao Martins Jan. 25, 2021, 8:57 p.m. UTC
For a given hugepage backing a VA, there's a rather inefficient
loop which is solely responsible for storing subpages in the passed
pages/vmas array. For each subpage we check whether it's within
range or size of @pages and keep incrementing @pfn_offset and a couple
of other variables per subpage iteration.

Simplify this logic and minimize ops per iteration to just
store the output page/vma. Instead of incrementing the number of @refs
iteratively, precalculate @refs and keep only a tight loop for
storing pinned subpages/vmas.

Pinning performance consequently improves considerably, bringing us
close to {pin,get}_user_pages_fast:

 - 16G with 1G huge page size
 gup_test -f /mnt/huge/file -m 16384 -r 10 -L -S -n 512 -w

 PIN_LONGTERM_BENCHMARK: ~11k us -> ~4400 us
 PIN_FAST_BENCHMARK: ~3700 us

Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
---
 mm/hugetlb.c | 49 ++++++++++++++++++++++++++++---------------------
 1 file changed, 28 insertions(+), 21 deletions(-)
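
As a rough illustration of the @refs precalculation described above, here is a small
standalone C sketch; the variable names mirror follow_hugetlb_page(), but the values
below are made up for the example:

/* Standalone sketch of the @refs bound; values are illustrative only. */
#include <stdio.h>

static unsigned long min3(unsigned long a, unsigned long b, unsigned long c)
{
	unsigned long m = a < b ? a : b;

	return m < c ? m : c;
}

int main(void)
{
	unsigned long pages_per_huge_page = 262144; /* 1G huge page, 4K base pages */
	unsigned long pfn_offset = 100;      /* subpage where the walk resumes */
	unsigned long vma_pages_left = 4096; /* (vma->vm_end - vaddr) >> PAGE_SHIFT */
	unsigned long remainder = 512;       /* pages still requested by the caller */

	/* One refcount update plus one tight recording loop covers this many subpages. */
	unsigned long refs = min3(pages_per_huge_page - pfn_offset,
				  vma_pages_left, remainder);

	printf("refs = %lu\n", refs); /* prints 512: bounded by the caller's request */
	return 0;
}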

Comments

Mike Kravetz Jan. 26, 2021, 6:08 p.m. UTC | #1
On 1/25/21 12:57 PM, Joao Martins wrote:
> For a given hugepage backing a VA, there's a rather ineficient
> loop which is solely responsible for storing subpages in the passed
> pages/vmas array. For each subpage we check whether it's within
> range or size of @pages and keep incrementing @pfn_offset and a couple
> other variables per subpage iteration.
> 
> Simplify this logic and minimize ops per iteration to just
> store the output page/vma. Instead of incrementing number of @refs
> iteratively, we do it through a precalculation of @refs and having
> only a tight loop for storing pinned subpages/vmas.
> 
> pinning consequently improves considerably, bringing us close to
> {pin,get}_user_pages_fast:
> 
>  - 16G with 1G huge page size
>  gup_test -f /mnt/huge/file -m 16384 -r 10 -L -S -n 512 -w
> 
>  PIN_LONGTERM_BENCHMARK: ~11k us -> ~4400 us
>  PIN_FAST_BENCHMARK: ~3700 us
> 
> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
> ---
>  mm/hugetlb.c | 49 ++++++++++++++++++++++++++++---------------------
>  1 file changed, 28 insertions(+), 21 deletions(-)
> 
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 016addc8e413..1f7a95bc7c87 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -4789,6 +4789,20 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
>  	goto out;
>  }
>  
> +static void record_subpages_vmas(struct page *page, struct vm_area_struct *vma,
> +				 int refs, struct page **pages,
> +				 struct vm_area_struct **vmas)
> +{
> +	int nr;
> +
> +	for (nr = 0; nr < refs; nr++) {
> +		if (likely(pages))
> +			pages[nr] = page++;
> +		if (vmas)
> +			vmas[nr] = vma;
> +	}
> +}
> +
>  long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
>  			 struct page **pages, struct vm_area_struct **vmas,
>  			 unsigned long *position, unsigned long *nr_pages,
> @@ -4918,28 +4932,16 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
>  			continue;
>  		}
>  
> -		refs = 0;
> +		refs = min3(pages_per_huge_page(h) - pfn_offset,
> +			    (vma->vm_end - vaddr) >> PAGE_SHIFT, remainder);
>  
> -same_page:
> -		if (pages)
> -			pages[i] = mem_map_offset(page, pfn_offset);
> +		if (pages || vmas)
> +			record_subpages_vmas(mem_map_offset(page, pfn_offset),

The assumption made here is that mem_map is contiguous for the range of
pages in the hugetlb page.  I do not believe you can make this assumption
for (gigantic) hugetlb pages which are > MAX_ORDER_NR_PAGES.  For example,

/*
 * Gigantic pages are so large that we do not guarantee that page++ pointer
 * arithmetic will work across the entire page.  We need something more
 * specialized.
 */
static void __copy_gigantic_page(struct page *dst, struct page *src,
                                int nr_pages)
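
For context, the helper the old code used handles exactly this discontiguity; it
looked roughly like this at the time (paraphrased from mm/internal.h, illustrative
only):

static inline struct page *mem_map_offset(struct page *base, int offset)
{
	/*
	 * Past MAX_ORDER_NR_PAGES the memmap may not be virtually
	 * contiguous, so go through the pfn instead of doing plain
	 * pointer arithmetic.
	 */
	if (unlikely(offset >= MAX_ORDER_NR_PAGES))
		return nth_page(base, offset);
	return base + offset;
}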
Joao Martins Jan. 26, 2021, 7:21 p.m. UTC | #2
On 1/26/21 6:08 PM, Mike Kravetz wrote:
> On 1/25/21 12:57 PM, Joao Martins wrote:
>> For a given hugepage backing a VA, there's a rather ineficient
>> loop which is solely responsible for storing subpages in the passed
>> pages/vmas array. For each subpage we check whether it's within
>> range or size of @pages and keep incrementing @pfn_offset and a couple
>> other variables per subpage iteration.
>>
>> Simplify this logic and minimize ops per iteration to just
>> store the output page/vma. Instead of incrementing number of @refs
>> iteratively, we do it through a precalculation of @refs and having
>> only a tight loop for storing pinned subpages/vmas.
>>
>> pinning consequently improves considerably, bringing us close to
>> {pin,get}_user_pages_fast:
>>
>>  - 16G with 1G huge page size
>>  gup_test -f /mnt/huge/file -m 16384 -r 10 -L -S -n 512 -w
>>
>>  PIN_LONGTERM_BENCHMARK: ~11k us -> ~4400 us
>>  PIN_FAST_BENCHMARK: ~3700 us
>>
>> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
>> ---
>>  mm/hugetlb.c | 49 ++++++++++++++++++++++++++++---------------------
>>  1 file changed, 28 insertions(+), 21 deletions(-)
>>
>> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
>> index 016addc8e413..1f7a95bc7c87 100644
>> --- a/mm/hugetlb.c
>> +++ b/mm/hugetlb.c
>> @@ -4789,6 +4789,20 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
>>  	goto out;
>>  }
>>  
>> +static void record_subpages_vmas(struct page *page, struct vm_area_struct *vma,
>> +				 int refs, struct page **pages,
>> +				 struct vm_area_struct **vmas)
>> +{
>> +	int nr;
>> +
>> +	for (nr = 0; nr < refs; nr++) {
>> +		if (likely(pages))
>> +			pages[nr] = page++;
>> +		if (vmas)
>> +			vmas[nr] = vma;
>> +	}
>> +}
>> +
>>  long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
>>  			 struct page **pages, struct vm_area_struct **vmas,
>>  			 unsigned long *position, unsigned long *nr_pages,
>> @@ -4918,28 +4932,16 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
>>  			continue;
>>  		}
>>  
>> -		refs = 0;
>> +		refs = min3(pages_per_huge_page(h) - pfn_offset,
>> +			    (vma->vm_end - vaddr) >> PAGE_SHIFT, remainder);
>>  
>> -same_page:
>> -		if (pages)
>> -			pages[i] = mem_map_offset(page, pfn_offset);
>> +		if (pages || vmas)
>> +			record_subpages_vmas(mem_map_offset(page, pfn_offset),
> 
> The assumption made here is that mem_map is contiguous for the range of
> pages in the hugetlb page.  I do not believe you can make this assumption
> for (gigantic) hugetlb pages which are > MAX_ORDER_NR_PAGES.  For example,
> 

That would mean get_user_pages_fast() and put_user_pages_fast() are broken for anything
handling PUDs or above? See record_subpages() in gup_huge_pud() or even gup_huge_pgd().
It's using the same page++.
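
For reference, the gup-fast helper being referred to looked roughly like this at the
time (paraphrased from mm/gup.c, illustrative only):

static int record_subpages(struct page *page, unsigned long addr,
			   unsigned long end, struct page **pages)
{
	int nr;

	/* The same plain page++ arithmetic across the compound page. */
	for (nr = 0; addr != end; addr += PAGE_SIZE)
		pages[nr++] = page++;

	return nr;
}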

This adjustment below probably is what you're trying to suggest.

Also, nth_page() is slightly more expensive and so the numbers above change from ~4.4k
usecs to ~7.8k usecs.

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 1f7a95bc7c87..cf66f8c2f92a 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4789,15 +4789,16 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
        goto out;
 }

-static void record_subpages_vmas(struct page *page, struct vm_area_struct *vma,
+static void record_subpages_vmas(struct page *page, unsigned long pfn_offset,
+                                struct vm_area_struct *vma,
                                 int refs, struct page **pages,
                                 struct vm_area_struct **vmas)
 {
-       int nr;
+       unsigned long nr;

        for (nr = 0; nr < refs; nr++) {
                if (likely(pages))
-                       pages[nr] = page++;
+                       pages[nr] = mem_map_offset(page, pfn_offset + nr);
                if (vmas)
                        vmas[nr] = vma;
        }
@@ -4936,8 +4937,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
                            (vma->vm_end - vaddr) >> PAGE_SHIFT, remainder);

                if (pages || vmas)
-                       record_subpages_vmas(mem_map_offset(page, pfn_offset),
-                                            vma, refs,
+                       record_subpages_vmas(page, pfn_offset, vma, refs,
                                             likely(pages) ? pages + i : NULL,
                                             vmas ? vmas + i : NULL);
Joao Martins Jan. 26, 2021, 7:24 p.m. UTC | #3
On 1/26/21 7:21 PM, Joao Martins wrote:
> On 1/26/21 6:08 PM, Mike Kravetz wrote:
>> On 1/25/21 12:57 PM, Joao Martins wrote:
>>> For a given hugepage backing a VA, there's a rather ineficient
>>> loop which is solely responsible for storing subpages in the passed
>>> pages/vmas array. For each subpage we check whether it's within
>>> range or size of @pages and keep incrementing @pfn_offset and a couple
>>> other variables per subpage iteration.
>>>
>>> Simplify this logic and minimize ops per iteration to just
>>> store the output page/vma. Instead of incrementing number of @refs
>>> iteratively, we do it through a precalculation of @refs and having
>>> only a tight loop for storing pinned subpages/vmas.
>>>
>>> pinning consequently improves considerably, bringing us close to
>>> {pin,get}_user_pages_fast:
>>>
>>>  - 16G with 1G huge page size
>>>  gup_test -f /mnt/huge/file -m 16384 -r 10 -L -S -n 512 -w
>>>
>>>  PIN_LONGTERM_BENCHMARK: ~11k us -> ~4400 us
>>>  PIN_FAST_BENCHMARK: ~3700 us
>>>
>>> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
>>> ---
>>>  mm/hugetlb.c | 49 ++++++++++++++++++++++++++++---------------------
>>>  1 file changed, 28 insertions(+), 21 deletions(-)
>>>
>>> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
>>> index 016addc8e413..1f7a95bc7c87 100644
>>> --- a/mm/hugetlb.c
>>> +++ b/mm/hugetlb.c
>>> @@ -4789,6 +4789,20 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
>>>  	goto out;
>>>  }
>>>  
>>> +static void record_subpages_vmas(struct page *page, struct vm_area_struct *vma,
>>> +				 int refs, struct page **pages,
>>> +				 struct vm_area_struct **vmas)
>>> +{
>>> +	int nr;
>>> +
>>> +	for (nr = 0; nr < refs; nr++) {
>>> +		if (likely(pages))
>>> +			pages[nr] = page++;
>>> +		if (vmas)
>>> +			vmas[nr] = vma;
>>> +	}
>>> +}
>>> +
>>>  long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
>>>  			 struct page **pages, struct vm_area_struct **vmas,
>>>  			 unsigned long *position, unsigned long *nr_pages,
>>> @@ -4918,28 +4932,16 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
>>>  			continue;
>>>  		}
>>>  
>>> -		refs = 0;
>>> +		refs = min3(pages_per_huge_page(h) - pfn_offset,
>>> +			    (vma->vm_end - vaddr) >> PAGE_SHIFT, remainder);
>>>  
>>> -same_page:
>>> -		if (pages)
>>> -			pages[i] = mem_map_offset(page, pfn_offset);
>>> +		if (pages || vmas)
>>> +			record_subpages_vmas(mem_map_offset(page, pfn_offset),
>>
>> The assumption made here is that mem_map is contiguous for the range of
>> pages in the hugetlb page.  I do not believe you can make this assumption
>> for (gigantic) hugetlb pages which are > MAX_ORDER_NR_PAGES.  For example,
>>
> 
> That would mean get_user_pages_fast() and put_user_pages_fast() are broken for anything
> handling PUDs or above? See record_subpages() in gup_huge_pud() or even gup_huge_pgd().
> It's using the same page++.
> 

Err ... I meant pin_user_pages_fast(), sorry about that.

> This adjustment below probably is what you're trying to suggest.
> 
> Also, nth_page() is slightly more expensive and so the numbers above change from ~4.4k
> usecs to ~7.8k usecs.
> 
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 1f7a95bc7c87..cf66f8c2f92a 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -4789,15 +4789,16 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
>         goto out;
>  }
> 
> -static void record_subpages_vmas(struct page *page, struct vm_area_struct *vma,
> +static void record_subpages_vmas(struct page *page, unsigned long pfn_offset,
> +                                struct vm_area_struct *vma,
>                                  int refs, struct page **pages,
>                                  struct vm_area_struct **vmas)
>  {
> -       int nr;
> +       unsigned long nr;
> 
>         for (nr = 0; nr < refs; nr++) {
>                 if (likely(pages))
> -                       pages[nr] = page++;
> +                       pages[nr] = mem_map_offset(page, pfn_offset + nr);
>                 if (vmas)
>                         vmas[nr] = vma;
>         }
> @@ -4936,8 +4937,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct
> *vma,
>                             (vma->vm_end - vaddr) >> PAGE_SHIFT, remainder);
> 
>                 if (pages || vmas)
> -                       record_subpages_vmas(mem_map_offset(page, pfn_offset),
> -                                            vma, refs,
> +                       record_subpages_vmas(page, pfn_offset, vma, refs,
>                                              likely(pages) ? pages + i : NULL,
>                                              vmas ? vmas + i : NULL);
>
Mike Kravetz Jan. 26, 2021, 9:21 p.m. UTC | #4
On 1/26/21 11:21 AM, Joao Martins wrote:
> On 1/26/21 6:08 PM, Mike Kravetz wrote:
>> On 1/25/21 12:57 PM, Joao Martins wrote:
>>> 
>>> +static void record_subpages_vmas(struct page *page, struct vm_area_struct *vma,
>>> +				 int refs, struct page **pages,
>>> +				 struct vm_area_struct **vmas)
>>> +{
>>> +	int nr;
>>> +
>>> +	for (nr = 0; nr < refs; nr++) {
>>> +		if (likely(pages))
>>> +			pages[nr] = page++;
>>> +		if (vmas)
>>> +			vmas[nr] = vma;
>>> +	}
>>> +}
>>> +
>>>  long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
>>>  			 struct page **pages, struct vm_area_struct **vmas,
>>>  			 unsigned long *position, unsigned long *nr_pages,
>>> @@ -4918,28 +4932,16 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
>>>  			continue;
>>>  		}
>>>  
>>> -		refs = 0;
>>> +		refs = min3(pages_per_huge_page(h) - pfn_offset,
>>> +			    (vma->vm_end - vaddr) >> PAGE_SHIFT, remainder);
>>>  
>>> -same_page:
>>> -		if (pages)
>>> -			pages[i] = mem_map_offset(page, pfn_offset);
>>> +		if (pages || vmas)
>>> +			record_subpages_vmas(mem_map_offset(page, pfn_offset),
>>
>> The assumption made here is that mem_map is contiguous for the range of
>> pages in the hugetlb page.  I do not believe you can make this assumption
>> for (gigantic) hugetlb pages which are > MAX_ORDER_NR_PAGES.  For example,
>>

Thinking about this a bit more ...

mem_map can be accessed contiguously if we have a virtual memmap.  Correct?
I suspect virtual memmap may be the most common configuration today.  However,
it seems we do need to handle other configurations.

> That would mean get_user_pages_fast() and put_user_pages_fast() are broken for anything
> handling PUDs or above? See record_subpages() in gup_huge_pud() or even gup_huge_pgd().
> It's using the same page++.

Yes, I believe those would also have the issue.
Cc: John and Jason as they have spent a significant amount of time in gup
code recently.  There may be something that makes that code safe?

> This adjustment below probably is what you're trying to suggest.
> 
> Also, nth_page() is slightly more expensive and so the numbers above change from ~4.4k
> usecs to ~7.8k usecs.

If my thoughts about virtual memmap are correct, then could we simply have
a !vmemmap version of mem_map_offset (or similar routine) to avoid overhead?
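
A minimal sketch of what such a helper could look like (the name subpage_offset() is
hypothetical, not taken from this thread or the kernel tree):

/* Hypothetical sketch only; subpage_offset() is a made-up name. */
#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
/* struct pages may not be virtually contiguous past MAX_ORDER_NR_PAGES. */
static inline struct page *subpage_offset(struct page *page, unsigned long n)
{
	return nth_page(page, n);
}
#else
/* FLATMEM or a virtual memmap: the whole compound page is linear in the memmap. */
static inline struct page *subpage_offset(struct page *page, unsigned long n)
{
	return page + n;
}
#endif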
Joao Martins Jan. 26, 2021, 11:20 p.m. UTC | #5
On 1/26/21 9:21 PM, Mike Kravetz wrote:
> On 1/26/21 11:21 AM, Joao Martins wrote:
>> On 1/26/21 6:08 PM, Mike Kravetz wrote:
>>> On 1/25/21 12:57 PM, Joao Martins wrote:
>>>>
>>>> +static void record_subpages_vmas(struct page *page, struct vm_area_struct *vma,
>>>> +				 int refs, struct page **pages,
>>>> +				 struct vm_area_struct **vmas)
>>>> +{
>>>> +	int nr;
>>>> +
>>>> +	for (nr = 0; nr < refs; nr++) {
>>>> +		if (likely(pages))
>>>> +			pages[nr] = page++;
>>>> +		if (vmas)
>>>> +			vmas[nr] = vma;
>>>> +	}
>>>> +}
>>>> +
>>>>  long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
>>>>  			 struct page **pages, struct vm_area_struct **vmas,
>>>>  			 unsigned long *position, unsigned long *nr_pages,
>>>> @@ -4918,28 +4932,16 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
>>>>  			continue;
>>>>  		}
>>>>  
>>>> -		refs = 0;
>>>> +		refs = min3(pages_per_huge_page(h) - pfn_offset,
>>>> +			    (vma->vm_end - vaddr) >> PAGE_SHIFT, remainder);
>>>>  
>>>> -same_page:
>>>> -		if (pages)
>>>> -			pages[i] = mem_map_offset(page, pfn_offset);
>>>> +		if (pages || vmas)
>>>> +			record_subpages_vmas(mem_map_offset(page, pfn_offset),
>>>
>>> The assumption made here is that mem_map is contiguous for the range of
>>> pages in the hugetlb page.  I do not believe you can make this assumption
>>> for (gigantic) hugetlb pages which are > MAX_ORDER_NR_PAGES.  For example,
>>>
> 
> Thinking about this a bit more ...
> 
> mem_map can be accessed contiguously if we have a virtual memmap.  Correct?

Right.

> I suspect virtual memmap may be the most common configuration today.  However,
> it seems we do need to handle other configurations.
> 

At the moment mem_map_offset(page, n) in turn does this for n >= MAX_ORDER_NR_PAGES:

	pfn_to_page(page_to_pfn(page) + n)


For CONFIG_SPARSE_VMEMMAP or FLATMEM this will resolve into something like:

	vmemmap + ((page - vmemmap) + n)

It isn't really different from incrementing @page.

I can only think of CONFIG_SPARSEMEM and CONFIG_DISCONTIGMEM as the offending
cases, which respectively look into section info or pgdat.

[CONFIG_DISCONTIGMEM isn't auto-selected by any arch at the moment.]
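
To illustrate the difference, pfn_to_page() roughly expands as follows depending on
the memory model (paraphrased from include/asm-generic/memory_model.h of that era;
the _vmemmap/_classic_sparsemem suffixes are added here only to show both variants
side by side):

/* CONFIG_SPARSEMEM_VMEMMAP: one virtually contiguous array of struct pages. */
#define __pfn_to_page_vmemmap(pfn)	(vmemmap + (pfn))

/* Classic CONFIG_SPARSEMEM: each section has its own mem_map, so struct pages
 * in different sections need not be virtually contiguous.
 */
#define __pfn_to_page_classic_sparsemem(pfn)			\
({	unsigned long __pfn = (pfn);				\
	struct mem_section *__sec = __pfn_to_section(__pfn);	\
	__section_mem_map_addr(__sec) + __pfn;			\
})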

>> That would mean get_user_pages_fast() and pin_user_pages_fast() are broken for anything
>> handling PUDs or above? See record_subpages() in gup_huge_pud() or even gup_huge_pgd().
>> It's using the same page++.
> 
> Yes, I believe those would also have the issue.
> Cc: John and Jason as they have spent a significant amount of time in gup
> code recently.  There may be something that makes that code safe?
> 
Maybe -- Looking back, gup-fast has always relied on that page pointer arithmetic, even
before its refactors around __record_subpages() and what not.

>> This adjustment below probably is what you're trying to suggest.
>>
>> Also, nth_page() is slightly more expensive and so the numbers above change from ~4.4k
>> usecs to ~7.8k usecs.
> 
> If my thoughts about virtual memmap are correct, then could we simply have
> a !vmemmap version of mem_map_offset (or similar routine) to avoid overhead?
> 
In that case, we could ifdef out on SPARSEMEM || DISCONTIGMEM for mem_map_offset() either
internally or within the helper I added for follow_hugetlb_page().
Jason Gunthorpe Jan. 27, 2021, 12:07 a.m. UTC | #6
On Tue, Jan 26, 2021 at 01:21:46PM -0800, Mike Kravetz wrote:
> On 1/26/21 11:21 AM, Joao Martins wrote:
> > On 1/26/21 6:08 PM, Mike Kravetz wrote:
> >> On 1/25/21 12:57 PM, Joao Martins wrote:
> >>> 
> >>> +static void record_subpages_vmas(struct page *page, struct vm_area_struct *vma,
> >>> +				 int refs, struct page **pages,
> >>> +				 struct vm_area_struct **vmas)
> >>> +{
> >>> +	int nr;
> >>> +
> >>> +	for (nr = 0; nr < refs; nr++) {
> >>> +		if (likely(pages))
> >>> +			pages[nr] = page++;
> >>> +		if (vmas)
> >>> +			vmas[nr] = vma;
> >>> +	}
> >>> +}
> >>> +
> >>>  long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
> >>>  			 struct page **pages, struct vm_area_struct **vmas,
> >>>  			 unsigned long *position, unsigned long *nr_pages,
> >>> @@ -4918,28 +4932,16 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
> >>>  			continue;
> >>>  		}
> >>>  
> >>> -		refs = 0;
> >>> +		refs = min3(pages_per_huge_page(h) - pfn_offset,
> >>> +			    (vma->vm_end - vaddr) >> PAGE_SHIFT, remainder);
> >>>  
> >>> -same_page:
> >>> -		if (pages)
> >>> -			pages[i] = mem_map_offset(page, pfn_offset);
> >>> +		if (pages || vmas)
> >>> +			record_subpages_vmas(mem_map_offset(page, pfn_offset),
> >>
> >> The assumption made here is that mem_map is contiguous for the range of
> >> pages in the hugetlb page.  I do not believe you can make this assumption
> >> for (gigantic) hugetlb pages which are > MAX_ORDER_NR_PAGES.  For example,
> >>
> 
> Thinking about this a bit more ...
> 
> mem_map can be accessed contiguously if we have a virtual memmap.  Correct?
> I suspect virtual memmap may be the most common configuration today.  However,
> it seems we do need to handle other configurations.
> 
> > That would mean get_user_pages_fast() and put_user_pages_fast() are broken for anything
> > handling PUDs or above? See record_subpages() in gup_huge_pud() or even gup_huge_pgd().
> > It's using the same page++.
> 
> Yes, I believe those would also have the issue.
> Cc: John and Jason as they have spent a significant amount of time in gup
> code recently.  There may be something that makes that code safe?

I'm looking at Matt's folio patches and see:

+static inline struct folio *next_folio(struct folio *folio)
+{
+       return folio + folio_nr_pages(folio);
+}

And checking page_trans_huge_mapcount():

	for (i = 0; i < thp_nr_pages(page); i++) {
		mapcount = atomic_read(&page[i]._mapcount) + 1;

And we have the same logic in hmm_vma_walk_pud():

	if (pud_huge(pud) && pud_devmap(pud)) {
		pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
		for (i = 0; i < npages; ++i, ++pfn)
			hmm_pfns[i] = pfn | cpu_flags;

So, if page[n] does not access the tail pages of a compound we have
many more people who are surprised by this than just GUP.

Where are these special rules for hugetlb compound tails documented?
Why does it need to be like this? 

Isn't it saner to forbid a compound and its tails from being
non-linear in the page array? That limits when compounds can be
created, but seems more likely to happen than a full mm audit to find
all the places that assume linearity.

Jason
Mike Kravetz Jan. 27, 2021, 1:58 a.m. UTC | #7
On 1/26/21 4:07 PM, Jason Gunthorpe wrote:
> On Tue, Jan 26, 2021 at 01:21:46PM -0800, Mike Kravetz wrote:
>> On 1/26/21 11:21 AM, Joao Martins wrote:
>>> On 1/26/21 6:08 PM, Mike Kravetz wrote:
>>>> On 1/25/21 12:57 PM, Joao Martins wrote:
>>>>>
>>>>> +static void record_subpages_vmas(struct page *page, struct vm_area_struct *vma,
>>>>> +				 int refs, struct page **pages,
>>>>> +				 struct vm_area_struct **vmas)
>>>>> +{
>>>>> +	int nr;
>>>>> +
>>>>> +	for (nr = 0; nr < refs; nr++) {
>>>>> +		if (likely(pages))
>>>>> +			pages[nr] = page++;
>>>>> +		if (vmas)
>>>>> +			vmas[nr] = vma;
>>>>> +	}
>>>>> +}
>>>>> +
>>>>>  long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
>>>>>  			 struct page **pages, struct vm_area_struct **vmas,
>>>>>  			 unsigned long *position, unsigned long *nr_pages,
>>>>> @@ -4918,28 +4932,16 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
>>>>>  			continue;
>>>>>  		}
>>>>>  
>>>>> -		refs = 0;
>>>>> +		refs = min3(pages_per_huge_page(h) - pfn_offset,
>>>>> +			    (vma->vm_end - vaddr) >> PAGE_SHIFT, remainder);
>>>>>  
>>>>> -same_page:
>>>>> -		if (pages)
>>>>> -			pages[i] = mem_map_offset(page, pfn_offset);
>>>>> +		if (pages || vmas)
>>>>> +			record_subpages_vmas(mem_map_offset(page, pfn_offset),
>>>>
>>>> The assumption made here is that mem_map is contiguous for the range of
>>>> pages in the hugetlb page.  I do not believe you can make this assumption
>>>> for (gigantic) hugetlb pages which are > MAX_ORDER_NR_PAGES.  For example,
>>>>
>>
>> Thinking about this a bit more ...
>>
>> mem_map can be accessed contiguously if we have a virtual memmap.  Correct?
>> I suspect virtual memmap may be the most common configuration today.  However,
>> it seems we do need to handle other configurations.
>>
>>> That would mean get_user_pages_fast() and put_user_pages_fast() are broken for anything
>>> handling PUDs or above? See record_subpages() in gup_huge_pud() or even gup_huge_pgd().
>>> It's using the same page++.
>>
>> Yes, I believe those would also have the issue.
>> Cc: John and Jason as they have spent a significant amount of time in gup
>> code recently.  There may be something that makes that code safe?
> 
> I'm looking at Matt's folio patches and see:
> 
> +static inline struct folio *next_folio(struct folio *folio)
> +{
> +       return folio + folio_nr_pages(folio);
> +}
> 
> And checking page_trans_huge_mapcount():
> 
> 	for (i = 0; i < thp_nr_pages(page); i++) {
> 		mapcount = atomic_read(&page[i]._mapcount) + 1;
> 
> And we have the same logic in hmm_vma_walk_pud():
> 
> 	if (pud_huge(pud) && pud_devmap(pud)) {
> 		pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
> 		for (i = 0; i < npages; ++i, ++pfn)
> 			hmm_pfns[i] = pfn | cpu_flags;
> 
> So, if page[n] does not access the tail pages of a compound we have
> many more people who are surprised by this than just GUP.
> 
> Where are these special rules for hugetlb compound tails documented?
> Why does it need to be like this? 

The commit where this was first addressed/pointed out is 69d177c2fc70
"hugetlbfs: handle pages higher order than MAX_ORDER" from back in 2008.
I only know about this because I stumbled upon it a few times in the
hugetlb code.  Although, it does not appear to be hugetlb specific.

As pointed out by Joao, you can also see the differences in pfn_to_page
for CONFIG_SPARSE_VMEMMAP and CONFIG_SPARSEMEM.  The only time we might
have issues is with CONFIG_SPARSEMEM.  I would bet CONFIG_SPARSE_VMEMMAP
is far more common.

Cc: Dave as he has sparsemem history.
Jason Gunthorpe Jan. 27, 2021, 2:10 a.m. UTC | #8
On Tue, Jan 26, 2021 at 05:58:53PM -0800, Mike Kravetz wrote:

> As pointed out by Joao, you can also see the differences in pfn_to_page
> for CONFIG_SPARSE_VMEMMAP and CONFIG_SPARSEMEM.  The only time we might
> have issues is with CONFIG_SPARSEMEM.  I would bet CONFIG_SPARSE_VMEMMAP
> is far more common.

I think it is fine to have a different pfn_to_page, it should just be
illegal to combine pages into a compound if their tail pages are not
linear in the map.

Matt's folio work might present an option to audit the whole mm for
this pattern and provide some folio_next_tail_page() accessor that
does the fast thing - but I question the value of such a project for a
2008 era PPC platform with 16GB pages (seriously?) that may be using
VMEMMAP today anyhow??

Maybe others know of more modern use cases

Jason
Matthew Wilcox (Oracle) Jan. 27, 2021, 2:24 a.m. UTC | #9
On Tue, Jan 26, 2021 at 08:07:30PM -0400, Jason Gunthorpe wrote:
> I'm looking at Matt's folio patches and see:
> 
> +static inline struct folio *next_folio(struct folio *folio)
> +{
> +       return folio + folio_nr_pages(folio);
> +}

This is a replacement for places that would do 'page++'.  eg it's
used by the bio iterator where we already checked that the phys addr
and the struct page are contiguous.

> And checking page_trans_huge_mapcount():
> 
> 	for (i = 0; i < thp_nr_pages(page); i++) {
> 		mapcount = atomic_read(&page[i]._mapcount) + 1;

I think we are guaranteed this for transparent huge pages.  At least
for now.  Zi Yan may have some thoughts for his work on 1GB transhuge
pages ...

> And we have the same logic in hmm_vma_walk_pud():
> 
> 	if (pud_huge(pud) && pud_devmap(pud)) {
> 		pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
> 		for (i = 0; i < npages; ++i, ++pfn)
> 			hmm_pfns[i] = pfn | cpu_flags;
> 
> So, if page[n] does not access the tail pages of a compound we have
> many more people who are surprised by this than just GUP.
> 
> Where are these special rules for hugetlb compound tails documented?
> Why does it need to be like this? 
> 
> Isn't it saner to forbid a compound and its tails from being
> non-linear in the page array? That limits when compounds can be
> created, but seems more likely to happen than a full mm audit to find
> all the places that assume linearity.
> 
> Jason
Zi Yan Jan. 27, 2021, 2:50 a.m. UTC | #10
On 26 Jan 2021, at 21:24, Matthew Wilcox wrote:

> On Tue, Jan 26, 2021 at 08:07:30PM -0400, Jason Gunthorpe wrote:
>> I'm looking at Matt's folio patches and see:
>>
>> +static inline struct folio *next_folio(struct folio *folio)
>> +{
>> +       return folio + folio_nr_pages(folio);
>> +}
>
> This is a replacement for places that would do 'page++'.  eg it's
> used by the bio iterator where we already checked that the phys addr
> and the struct page are contiguous.
>
>> And checking page_trans_huge_mapcount():
>>
>> 	for (i = 0; i < thp_nr_pages(page); i++) {
>> 		mapcount = atomic_read(&page[i]._mapcount) + 1;
>
> I think we are guaranteed this for transparent huge pages.  At least
> for now.  Zi Yan may have some thoughts for his work on 1GB transhuge
> pages ...

It should work for 1GB THP too. My implementation allocates 1GB pages
from cma_alloc(), which calls alloc_contig_range(). At least for now
subpages from a 1GB THP are physically contiguous.

It will be a concern if we use other ways (like migrating in-use pages)
of forming 1GB THPs. Thanks for pointing this out.

>
>> And we have the same logic in hmm_vma_walk_pud():
>>
>> 	if (pud_huge(pud) && pud_devmap(pud)) {
>> 		pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
>> 		for (i = 0; i < npages; ++i, ++pfn)
>> 			hmm_pfns[i] = pfn | cpu_flags;
>>
>> So, if page[n] does not access the tail pages of a compound we have
>> many more people who are surprised by this than just GUP.
>>
>> Where are these special rules for hugetlb compound tails documented?
>> Why does it need to be like this?
>>
>> Isn't it saner to forbid a compound and its tails from being
>> non-linear in the page array? That limits when compounds can be
>> created, but seems more likely to happen than a full mm audit to find
>> all the places that assume linearity.
>>
>> Jason


—
Best Regards,
Yan Zi
Mike Kravetz Feb. 13, 2021, 9:12 p.m. UTC | #11
On 1/26/21 6:10 PM, Jason Gunthorpe wrote:
> On Tue, Jan 26, 2021 at 05:58:53PM -0800, Mike Kravetz wrote:
> 
>> As pointed out by Joao, you can also see the differences in pfn_to_page
>> for CONFIG_SPARSE_VMEMMAP and CONFIG_SPARSEMEM.  The only time we might
>> have issues is with CONFIG_SPARSEMEM.  I would bet CONFIG_SPARSE_VMEMMAP
>> is far more common.
> 
> I think it is fine to have a different pfn_to_page, it should just be
> illegal to combine pages into a compound if their tail pages are not
> linear in the map.
> 
> Matt's folio work might present an option to audit the whole mm for
> this pattern and provide some folio_next_tail_page() accessor that
> does the fast thing - but I question the value of such a project for a
> 2008 era PPC platform with 16GB pages (seriously?) that may be using
> VMEMMAP today anyhow??
> 
> Maybe others know of more modern use cases
> 
> Jason

When discussing v2 of this patch, Zi confirmed that this issue exists today.
See,
https://lore.kernel.org/linux-mm/3d2acb33-57e2-060d-616f-cce872c77307@oracle.com

I will fix up that unexpected discovery in the hugetlb code.  But, not sure
what approach we want to take elsewhere, such as the GUP code.
diff mbox series

Patch

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 016addc8e413..1f7a95bc7c87 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4789,6 +4789,20 @@  int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
 	goto out;
 }
 
+static void record_subpages_vmas(struct page *page, struct vm_area_struct *vma,
+				 int refs, struct page **pages,
+				 struct vm_area_struct **vmas)
+{
+	int nr;
+
+	for (nr = 0; nr < refs; nr++) {
+		if (likely(pages))
+			pages[nr] = page++;
+		if (vmas)
+			vmas[nr] = vma;
+	}
+}
+
 long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			 struct page **pages, struct vm_area_struct **vmas,
 			 unsigned long *position, unsigned long *nr_pages,
@@ -4918,28 +4932,16 @@  long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			continue;
 		}
 
-		refs = 0;
+		refs = min3(pages_per_huge_page(h) - pfn_offset,
+			    (vma->vm_end - vaddr) >> PAGE_SHIFT, remainder);
 
-same_page:
-		if (pages)
-			pages[i] = mem_map_offset(page, pfn_offset);
+		if (pages || vmas)
+			record_subpages_vmas(mem_map_offset(page, pfn_offset),
+					     vma, refs,
+					     likely(pages) ? pages + i : NULL,
+					     vmas ? vmas + i : NULL);
 
-		if (vmas)
-			vmas[i] = vma;
-
-		vaddr += PAGE_SIZE;
-		++pfn_offset;
-		--remainder;
-		++i;
-		refs++;
-		if (vaddr < vma->vm_end && remainder &&
-				pfn_offset < pages_per_huge_page(h)) {
-			/*
-			 * We use pfn_offset to avoid touching the pageframes
-			 * of this compound page.
-			 */
-			goto same_page;
-		} else if (pages) {
+		if (pages) {
 			/*
 			 * try_grab_compound_head() should always succeed here,
 			 * because: a) we hold the ptl lock, and b) we've just
@@ -4950,7 +4952,7 @@  long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			 * any way. So this page must be available at this
 			 * point, unless the page refcount overflowed:
 			 */
-			if (WARN_ON_ONCE(!try_grab_compound_head(pages[i-1],
+			if (WARN_ON_ONCE(!try_grab_compound_head(pages[i],
 								 refs,
 								 flags))) {
 				spin_unlock(ptl);
@@ -4959,6 +4961,11 @@  long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 				break;
 			}
 		}
+
+		vaddr += (refs << PAGE_SHIFT);
+		remainder -= refs;
+		i += refs;
+
 		spin_unlock(ptl);
 	}
 	*nr_pages = remainder;