
[v4,08/14] mm/gup: grab head page refcount once for group of subpages

Message ID 20210827145819.16471-9-joao.m.martins@oracle.com (mailing list archive)
State New
Series mm, sparse-vmemmap: Introduce compound devmaps for device-dax

Commit Message

Joao Martins Aug. 27, 2021, 2:58 p.m. UTC
Use try_grab_compound_head() for device-dax GUP when configured with a
compound devmap.

Rather than incrementing the refcount for each page, do one atomic
addition for all the pages to be pinned.
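
In other words, for each huge mapping, __gup_device_huge() ends up doing roughly
the following (a simplified sketch of the change, not the literal diff):

	head = compound_head(page);
	refs = record_subpages(page, addr, next, pages + *nr);
	/* one atomic addition of @refs on the head page, instead of
	 * @refs individual try_grab_page() calls on each base page */
	if (unlikely(!try_grab_compound_head(head, refs, flags)))
		undo_dev_pagemap(nr, nr_start, flags, pages);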

Performance measured by gup_benchmark improves considerably for
get_user_pages_fast() and pin_user_pages_fast() with NVDIMMs:

 $ gup_test -f /dev/dax1.0 -m 16384 -r 10 -S [-u,-a] -n 512 -w
(get_user_pages_fast 2M pages) ~59 ms -> ~6.1 ms
(pin_user_pages_fast 2M pages) ~87 ms -> ~6.2 ms
[altmap]
(get_user_pages_fast 2M pages) ~494 ms -> ~9 ms
(pin_user_pages_fast 2M pages) ~494 ms -> ~10 ms

 $ gup_test -f /dev/dax1.0 -m 129022 -r 10 -S [-u,-a] -n 512 -w
(get_user_pages_fast 2M pages) ~492 ms -> ~49 ms
(pin_user_pages_fast 2M pages) ~493 ms -> ~50 ms
[altmap with -m 127004]
(get_user_pages_fast 2M pages) ~3.91 sec -> ~70 ms
(pin_user_pages_fast 2M pages) ~3.97 sec -> ~74 ms

Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
---
 mm/gup.c | 51 +++++++++++++++++++++++++++++++--------------------
 1 file changed, 31 insertions(+), 20 deletions(-)

Comments

Jason Gunthorpe Aug. 27, 2021, 4:25 p.m. UTC | #1
On Fri, Aug 27, 2021 at 03:58:13PM +0100, Joao Martins wrote:

>  #if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
>  static int __gup_device_huge(unsigned long pfn, unsigned long addr,
>  			     unsigned long end, unsigned int flags,
>  			     struct page **pages, int *nr)
>  {
> -	int nr_start = *nr;
> +	int refs, nr_start = *nr;
>  	struct dev_pagemap *pgmap = NULL;
>  	int ret = 1;
>  
>  	do {
> -		struct page *page = pfn_to_page(pfn);
> +		struct page *head, *page = pfn_to_page(pfn);
> +		unsigned long next = addr + PAGE_SIZE;
>  
>  		pgmap = get_dev_pagemap(pfn, pgmap);
>  		if (unlikely(!pgmap)) {
> @@ -2252,16 +2265,25 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr,
>  			ret = 0;
>  			break;
>  		}
> -		SetPageReferenced(page);
> -		pages[*nr] = page;
> -		if (unlikely(!try_grab_page(page, flags))) {
> -			undo_dev_pagemap(nr, nr_start, flags, pages);
> +
> +		head = compound_head(page);
> +		/* @end is assumed to be limited at most one compound page */
> +		if (PageHead(head))
> +			next = end;
> +		refs = record_subpages(page, addr, next, pages + *nr);
> +
> +		SetPageReferenced(head);
> +		if (unlikely(!try_grab_compound_head(head, refs, flags))) {
> +			if (PageHead(head))
> +				ClearPageReferenced(head);
> +			else
> +				undo_dev_pagemap(nr, nr_start, flags, pages);
>  			ret = 0;
>  			break;

Why is this special cased for devmap?

Shouldn't everything processing pud/pmds/etc use the same basic loop
that is similar in idea to the 'for_each_compound_head' scheme in
unpin_user_pages_dirty_lock()?

Doesn't that work for all the special page type cases here?
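
For reference, that scheme is roughly the following (paraphrasing
unpin_user_pages_dirty_lock(), not the verbatim code):

	for_each_compound_head(index, pages, npages, head, ntails) {
		if (make_dirty && !PageDirty(head))
			set_page_dirty_lock(head);
		/* one refcount update per compound head, not per page */
		put_compound_head(head, ntails, FOLL_PIN);
	}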

Jason
Joao Martins Aug. 27, 2021, 6:34 p.m. UTC | #2
On 8/27/21 5:25 PM, Jason Gunthorpe wrote:
> On Fri, Aug 27, 2021 at 03:58:13PM +0100, Joao Martins wrote:
> 
>>  #if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
>>  static int __gup_device_huge(unsigned long pfn, unsigned long addr,
>>  			     unsigned long end, unsigned int flags,
>>  			     struct page **pages, int *nr)
>>  {
>> -	int nr_start = *nr;
>> +	int refs, nr_start = *nr;
>>  	struct dev_pagemap *pgmap = NULL;
>>  	int ret = 1;
>>  
>>  	do {
>> -		struct page *page = pfn_to_page(pfn);
>> +		struct page *head, *page = pfn_to_page(pfn);
>> +		unsigned long next = addr + PAGE_SIZE;
>>  
>>  		pgmap = get_dev_pagemap(pfn, pgmap);
>>  		if (unlikely(!pgmap)) {
>> @@ -2252,16 +2265,25 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr,
>>  			ret = 0;
>>  			break;
>>  		}
>> -		SetPageReferenced(page);
>> -		pages[*nr] = page;
>> -		if (unlikely(!try_grab_page(page, flags))) {
>> -			undo_dev_pagemap(nr, nr_start, flags, pages);
>> +
>> +		head = compound_head(page);
>> +		/* @end is assumed to be limited at most one compound page */
>> +		if (PageHead(head))
>> +			next = end;
>> +		refs = record_subpages(page, addr, next, pages + *nr);
>> +
>> +		SetPageReferenced(head);
>> +		if (unlikely(!try_grab_compound_head(head, refs, flags))) {
>> +			if (PageHead(head))
>> +				ClearPageReferenced(head);
>> +			else
>> +				undo_dev_pagemap(nr, nr_start, flags, pages);
>>  			ret = 0;
>>  			break;
> 
> Why is this special cased for devmap?
> 
> Shouldn't everything processing pud/pmds/etc use the same basic loop
> that is similar in idea to the 'for_each_compound_head' scheme in
> unpin_user_pages_dirty_lock()?
> 
> Doesn't that work for all the special page type cases here?

We are iterating over PFNs to create an array of base pages (regardless of page table
type), rather than iterating over an array of pages to work on. Given that all these gup
functions already give you the boundary (end of pmd or end of pud, etc), we just need to
grab the ref to the pgmap and the head page and save the tails. But sadly we also need to
handle the base page case, which is why this outer loop exists. If it were just head
pages we wouldn't need the outer loop (and hence no for_each_compound_head, like the
hugetlb variant).

But maybe I am being dense and you just mean to replace the outer loop with
for_each_compound_range(). I am a little stuck on the fact that I still need to record
the tail pages when iterating over the (single) head page, so I feel that it wouldn't
bring that much improvement, unless I missed your point.

	Joao
Jason Gunthorpe Aug. 30, 2021, 1:07 p.m. UTC | #3
On Fri, Aug 27, 2021 at 07:34:54PM +0100, Joao Martins wrote:
> On 8/27/21 5:25 PM, Jason Gunthorpe wrote:
> > On Fri, Aug 27, 2021 at 03:58:13PM +0100, Joao Martins wrote:
> > 
> >>  #if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
> >>  static int __gup_device_huge(unsigned long pfn, unsigned long addr,
> >>  			     unsigned long end, unsigned int flags,
> >>  			     struct page **pages, int *nr)
> >>  {
> >> -	int nr_start = *nr;
> >> +	int refs, nr_start = *nr;
> >>  	struct dev_pagemap *pgmap = NULL;
> >>  	int ret = 1;
> >>  
> >>  	do {
> >> -		struct page *page = pfn_to_page(pfn);
> >> +		struct page *head, *page = pfn_to_page(pfn);
> >> +		unsigned long next = addr + PAGE_SIZE;
> >>  
> >>  		pgmap = get_dev_pagemap(pfn, pgmap);
> >>  		if (unlikely(!pgmap)) {
> >> @@ -2252,16 +2265,25 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr,
> >>  			ret = 0;
> >>  			break;
> >>  		}
> >> -		SetPageReferenced(page);
> >> -		pages[*nr] = page;
> >> -		if (unlikely(!try_grab_page(page, flags))) {
> >> -			undo_dev_pagemap(nr, nr_start, flags, pages);
> >> +
> >> +		head = compound_head(page);
> >> +		/* @end is assumed to be limited at most one compound page */
> >> +		if (PageHead(head))
> >> +			next = end;
> >> +		refs = record_subpages(page, addr, next, pages + *nr);
> >> +
> >> +		SetPageReferenced(head);
> >> +		if (unlikely(!try_grab_compound_head(head, refs, flags))) {
> >> +			if (PageHead(head))
> >> +				ClearPageReferenced(head);
> >> +			else
> >> +				undo_dev_pagemap(nr, nr_start, flags, pages);
> >>  			ret = 0;
> >>  			break;
> > 
> > Why is this special cased for devmap?
> > 
> > Shouldn't everything processing pud/pmds/etc use the same basic loop
> > that is similar in idea to the 'for_each_compound_head' scheme in
> > unpin_user_pages_dirty_lock()?
> > 
> > Doesn't that work for all the special page type cases here?
> 
> We are iterating over PFNs to create an array of base pages (regardless of page table
> type), rather than iterating over an array of pages to work on. 

That is part of it, yes, but the slow bit here is to minimally find
the head pages and do the atomics on them, much like the
unpin_user_pages_dirty_lock()

I would think this should be designed similar to how things work on
the unpin side.

Sweep the page tables to find a proper start/end - eg even if a
compound is spread across multiple pte/pmd/pud/etc we should find a
linear range of starting PFN (ie starting page*) and npages across as
much of the page tables as we can manage. This is the same as where
things end up in the unpin case where all the contiguous PFNs are
grouped together into a range.

Then 'assign' that range to the output array which requires walking
over each compound_head in the range and pinning it, then writing out
the tail pages to the output struct page array.

And this approach should apply universally no matter what is under the
pte's - ie huge pages, THPs and devmaps should all be treated the same
way. Currently each case is different, like above which is unique to
device_huge.

The more we can process groups of pages in bulks the faster the whole
thing will be.

Jason
Joao Martins Aug. 31, 2021, 12:34 p.m. UTC | #4
On 8/30/21 2:07 PM, Jason Gunthorpe wrote:
> On Fri, Aug 27, 2021 at 07:34:54PM +0100, Joao Martins wrote:
>> On 8/27/21 5:25 PM, Jason Gunthorpe wrote:
>>> On Fri, Aug 27, 2021 at 03:58:13PM +0100, Joao Martins wrote:
>>>
>>>>  #if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
>>>>  static int __gup_device_huge(unsigned long pfn, unsigned long addr,
>>>>  			     unsigned long end, unsigned int flags,
>>>>  			     struct page **pages, int *nr)
>>>>  {
>>>> -	int nr_start = *nr;
>>>> +	int refs, nr_start = *nr;
>>>>  	struct dev_pagemap *pgmap = NULL;
>>>>  	int ret = 1;
>>>>  
>>>>  	do {
>>>> -		struct page *page = pfn_to_page(pfn);
>>>> +		struct page *head, *page = pfn_to_page(pfn);
>>>> +		unsigned long next = addr + PAGE_SIZE;
>>>>  
>>>>  		pgmap = get_dev_pagemap(pfn, pgmap);
>>>>  		if (unlikely(!pgmap)) {
>>>> @@ -2252,16 +2265,25 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr,
>>>>  			ret = 0;
>>>>  			break;
>>>>  		}
>>>> -		SetPageReferenced(page);
>>>> -		pages[*nr] = page;
>>>> -		if (unlikely(!try_grab_page(page, flags))) {
>>>> -			undo_dev_pagemap(nr, nr_start, flags, pages);
>>>> +
>>>> +		head = compound_head(page);
>>>> +		/* @end is assumed to be limited at most one compound page */
>>>> +		if (PageHead(head))
>>>> +			next = end;
>>>> +		refs = record_subpages(page, addr, next, pages + *nr);
>>>> +
>>>> +		SetPageReferenced(head);
>>>> +		if (unlikely(!try_grab_compound_head(head, refs, flags))) {
>>>> +			if (PageHead(head))
>>>> +				ClearPageReferenced(head);
>>>> +			else
>>>> +				undo_dev_pagemap(nr, nr_start, flags, pages);
>>>>  			ret = 0;
>>>>  			break;
>>>
>>> Why is this special cased for devmap?
>>>
>>> Shouldn't everything processing pud/pmds/etc use the same basic loop
>>> that is similar in idea to the 'for_each_compound_head' scheme in
>>> unpin_user_pages_dirty_lock()?
>>>
>>> Doesn't that work for all the special page type cases here?
>>
>> We are iterating over PFNs to create an array of base pages (regardless of page table
>> type), rather than iterating over an array of pages to work on. 
> 
> That is part of it, yes, but the slow bit here is to minimally find
> the head pages and do the atomics on them, much like the
> unpin_user_pages_dirty_lock()
> 
> I would think this should be designed similar to how things work on
> the unpin side.
> 
I don't think it's the same thing. The bit you say 'minimally find the
head pages' carries a visible overhead in unpin_user_pages() as we are
checking whether each of the pages belongs to the same head page -- because you
can pass an arbitrary set of pages. This does have a cost which is not
in gup-fast right now AIUI. Whereas in our gup-fast 'handler' you
already know that you are processing a contiguous chunk of pages.
If anything, we are closer to unpin_user_page_range*()
than unpin_user_pages().

> Sweep the page tables to find a proper start/end - eg even if a
> compound is spread across multiple pte/pmd/pud/etc we should find a
> linear range of starting PFN (ie starting page*) and npages across as
> much of the page tables as we can manage. This is the same as where
> things end up in the unpin case where all the contiguous PFNs are
> grouped together into a range.
> 
> Then 'assign' that range to the output array which requires walking
> over each compound_head in the range and pinning it, then writing out
> the tail pages to the output struct page array.
> 
> And this approach should apply universally no matter what is under the
> pte's - ie huge pages, THPs and devmaps should all be treated the same
> way. Currently each case is different, like above which is unique to
> device_huge.
> 
Only devmap gup-fast is different IIUC.

Switching to similar iteration logic to unpin would look something like
this (still untested):

        for_each_compound_range(index, &page, npages, head, refs) {
                pgmap = get_dev_pagemap(pfn + *nr, pgmap);
                if (unlikely(!pgmap)) {
                        undo_dev_pagemap(nr, nr_start, flags, pages);
                        ret = 0;
                        break;
                }

                SetPageReferenced(head);
                if (unlikely(!try_grab_compound_head(head, refs, flags))) {
                        if (PageHead(head))
                                ClearPageReferenced(head);
                        else
                                undo_dev_pagemap(nr, nr_start, flags, pages);
                        ret = 0;
                        break;
                }

                record_subpages(page + *nr, addr,
                                addr + (refs << PAGE_SHIFT), pages + *nr);
                *(nr) += refs;
                addr += (refs << PAGE_SHIFT);
        }


But it looks to be a tidbit more complex and not really aligning with the
rest of gup-fast.

All in all, I am dealing with the fact that 1) devmap pmds/puds may not
be represented with compound pages and 2) we temporarily grab a dev_pagemap reference
prior to pinning the page. Those two items are what make this different from THPs/HugeTLB
(which do have the same logic). And that is what led me to *slightly* improve
gup_device_huge().
Jason Gunthorpe Aug. 31, 2021, 5:05 p.m. UTC | #5
On Tue, Aug 31, 2021 at 01:34:04PM +0100, Joao Martins wrote:
> On 8/30/21 2:07 PM, Jason Gunthorpe wrote:
> > On Fri, Aug 27, 2021 at 07:34:54PM +0100, Joao Martins wrote:
> >> On 8/27/21 5:25 PM, Jason Gunthorpe wrote:
> >>> On Fri, Aug 27, 2021 at 03:58:13PM +0100, Joao Martins wrote:
> >>>
> >>>>  #if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
> >>>>  static int __gup_device_huge(unsigned long pfn, unsigned long addr,
> >>>>  			     unsigned long end, unsigned int flags,
> >>>>  			     struct page **pages, int *nr)
> >>>>  {
> >>>> -	int nr_start = *nr;
> >>>> +	int refs, nr_start = *nr;
> >>>>  	struct dev_pagemap *pgmap = NULL;
> >>>>  	int ret = 1;
> >>>>  
> >>>>  	do {
> >>>> -		struct page *page = pfn_to_page(pfn);
> >>>> +		struct page *head, *page = pfn_to_page(pfn);
> >>>> +		unsigned long next = addr + PAGE_SIZE;
> >>>>  
> >>>>  		pgmap = get_dev_pagemap(pfn, pgmap);
> >>>>  		if (unlikely(!pgmap)) {
> >>>> @@ -2252,16 +2265,25 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr,
> >>>>  			ret = 0;
> >>>>  			break;
> >>>>  		}
> >>>> -		SetPageReferenced(page);
> >>>> -		pages[*nr] = page;
> >>>> -		if (unlikely(!try_grab_page(page, flags))) {
> >>>> -			undo_dev_pagemap(nr, nr_start, flags, pages);
> >>>> +
> >>>> +		head = compound_head(page);
> >>>> +		/* @end is assumed to be limited at most one compound page */
> >>>> +		if (PageHead(head))
> >>>> +			next = end;
> >>>> +		refs = record_subpages(page, addr, next, pages + *nr);
> >>>> +
> >>>> +		SetPageReferenced(head);
> >>>> +		if (unlikely(!try_grab_compound_head(head, refs, flags))) {
> >>>> +			if (PageHead(head))
> >>>> +				ClearPageReferenced(head);
> >>>> +			else
> >>>> +				undo_dev_pagemap(nr, nr_start, flags, pages);
> >>>>  			ret = 0;
> >>>>  			break;
> >>>
> >>> Why is this special cased for devmap?
> >>>
> >>> Shouldn't everything processing pud/pmds/etc use the same basic loop
> >>> that is similar in idea to the 'for_each_compound_head' scheme in
> >>> unpin_user_pages_dirty_lock()?
> >>>
> >>> Doesn't that work for all the special page type cases here?
> >>
> >> We are iterating over PFNs to create an array of base pages (regardless of page table
> >> type), rather than iterating over an array of pages to work on. 
> > 
> > That is part of it, yes, but the slow bit here is to minimally find
> > the head pages and do the atomics on them, much like the
> > unpin_user_pages_dirty_lock()
> > 
> > I would think this should be designed similar to how things work on
> > the unpin side.
> > 
> I don't think it's the same thing. The bit you say 'minimally find the
> head pages' carries a visible overhead in unpin_user_pages() as we are
> checking whether each of the pages belongs to the same head page -- because you
> can pass an arbitrary set of pages. This does have a cost which is not
> in gup-fast right now AIUI. Whereas in our gup-fast 'handler' you
> already know that you are processing a contiguous chunk of pages.
> If anything, we are closer to unpin_user_page_range*()
> than unpin_user_pages().

Yes, that is what I mean, it is very similar to the range case as we
don't even know that a single compound spans a pud/pmd. So you end up
doing the same loop to find the compound boundaries.

Under GUP slow we can also aggregate multiple page table entries, eg a
split huge page could be processed as a single 2M range operation even
if it is broken up into 4K PTEs.
> 
> Switching to similar iteration logic to unpin would look something like
> this (still untested):
> 
>         for_each_compound_range(index, &page, npages, head, refs) {
>                 pgmap = get_dev_pagemap(pfn + *nr, pgmap);

I recall talking to DanW about this and we agreed it was unnecessary
here to hold the pgmap and should be deleted.

Jason
Joao Martins Sept. 23, 2021, 4:51 p.m. UTC | #6
On 8/31/21 6:05 PM, Jason Gunthorpe wrote:
> On Tue, Aug 31, 2021 at 01:34:04PM +0100, Joao Martins wrote:
>> On 8/30/21 2:07 PM, Jason Gunthorpe wrote:
>>> On Fri, Aug 27, 2021 at 07:34:54PM +0100, Joao Martins wrote:
>>>> On 8/27/21 5:25 PM, Jason Gunthorpe wrote:
>>>>> On Fri, Aug 27, 2021 at 03:58:13PM +0100, Joao Martins wrote:
>>>>>
>>>>>>  #if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
>>>>>>  static int __gup_device_huge(unsigned long pfn, unsigned long addr,
>>>>>>  			     unsigned long end, unsigned int flags,
>>>>>>  			     struct page **pages, int *nr)
>>>>>>  {
>>>>>> -	int nr_start = *nr;
>>>>>> +	int refs, nr_start = *nr;
>>>>>>  	struct dev_pagemap *pgmap = NULL;
>>>>>>  	int ret = 1;
>>>>>>  
>>>>>>  	do {
>>>>>> -		struct page *page = pfn_to_page(pfn);
>>>>>> +		struct page *head, *page = pfn_to_page(pfn);
>>>>>> +		unsigned long next = addr + PAGE_SIZE;
>>>>>>  
>>>>>>  		pgmap = get_dev_pagemap(pfn, pgmap);
>>>>>>  		if (unlikely(!pgmap)) {
>>>>>> @@ -2252,16 +2265,25 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr,
>>>>>>  			ret = 0;
>>>>>>  			break;
>>>>>>  		}
>>>>>> -		SetPageReferenced(page);
>>>>>> -		pages[*nr] = page;
>>>>>> -		if (unlikely(!try_grab_page(page, flags))) {
>>>>>> -			undo_dev_pagemap(nr, nr_start, flags, pages);
>>>>>> +
>>>>>> +		head = compound_head(page);
>>>>>> +		/* @end is assumed to be limited at most one compound page */
>>>>>> +		if (PageHead(head))
>>>>>> +			next = end;
>>>>>> +		refs = record_subpages(page, addr, next, pages + *nr);
>>>>>> +
>>>>>> +		SetPageReferenced(head);
>>>>>> +		if (unlikely(!try_grab_compound_head(head, refs, flags))) {
>>>>>> +			if (PageHead(head))
>>>>>> +				ClearPageReferenced(head);
>>>>>> +			else
>>>>>> +				undo_dev_pagemap(nr, nr_start, flags, pages);
>>>>>>  			ret = 0;
>>>>>>  			break;
>>>>>
>>>>> Why is this special cased for devmap?
>>>>>
>>>>> Shouldn't everything processing pud/pmds/etc use the same basic loop
>>>>> that is similar in idea to the 'for_each_compound_head' scheme in
>>>>> unpin_user_pages_dirty_lock()?
>>>>>
>>>>> Doesn't that work for all the special page type cases here?
>>>>
>>>> We are iterating over PFNs to create an array of base pages (regardless of page table
>>>> type), rather than iterating over an array of pages to work on. 
>>>
>>> That is part of it, yes, but the slow bit here is to minimally find
>>> the head pages and do the atomics on them, much like the
>>> unpin_user_pages_dirty_lock()
>>>
>>> I would think this should be designed similar to how things work on
>>> the unpin side.
>>>
>> I don't think it's the same thing. The bit you say 'minimally find the
>> head pages' carries a visible overhead in unpin_user_pages() as we are
>> checking whether each of the pages belongs to the same head page -- because you
>> can pass an arbitrary set of pages. This does have a cost which is not
>> in gup-fast right now AIUI. Whereas in our gup-fast 'handler' you
>> already know that you are processing a contiguous chunk of pages.
>> If anything, we are closer to unpin_user_page_range*()
>> than unpin_user_pages().
> 
> Yes, that is what I mean, it is very similar to the range case as we
> don't even know that a single compound spans a pud/pmd. So you end up
> doing the same loop to find the compound boundaries.
> 
> Under GUP slow we can also aggregate multiple page table entries, eg a
> split huge page could be processed as a single 2M range operation even
> if it is broken up into 4K PTEs.

/me nods

FWIW, I have a follow-up patch pursuing a similar optimization (to fix
the gup-slow case) that I need to put in better shape -- I probably won't wait
until this series is done, contrary to what the cover letter says.

>> Switching to similar iteration logic to unpin would look something like
>> this (still untested):
>>
>>         for_each_compound_range(index, &page, npages, head, refs) {
>>                 pgmap = get_dev_pagemap(pfn + *nr, pgmap);
> 
> I recall talking to DanW about this and we agreed it was unnecessary
> here to hold the pgmap and should be deleted.

Yeap, I remember that conversation[0]. It was a long time ago, and I am
not sure what progress was made there since the last posting? Dan, any
thoughts there?

[0]
https://lore.kernel.org/linux-mm/161604050866.1463742.7759521510383551055.stgit@dwillia2-desk3.amr.corp.intel.com/

So ... if pgmap accounting was removed from gup-fast then this patch
would be a lot simpler and we could perhaps just fall back to the regular
hugepage case (THP, HugeTLB) like your suggestion at the top. See below
the scissors mark at the end for the ballpark of changes.

So far my options seem to be: 1) this patch which leverages the existing
iteration logic or 2) switching to for_each_compound_range() -- see my previous
reply 3) waiting for Dan to remove @pgmap accounting in gup-fast and use
something similar to below scissors mark.

What do you think would be the best course of action?

--->8---

++static int __gup_device_compound(unsigned long addr, unsigned long pfn,
++                               unsigned long mask)
++{
++      pfn += ((addr & ~mask) >> PAGE_SHIFT);
++
++      return PageCompound(pfn_to_page(pfn));
++}
++
  static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
                                 unsigned long end, unsigned int flags,
                                 struct page **pages, int *nr)
@@@ -2428,8 -2428,8 +2433,10 @@@ static int gup_huge_pmd(pmd_t orig, pmd
        if (pmd_devmap(orig)) {
                if (unlikely(flags & FOLL_LONGTERM))
                        return 0;
--              return __gup_device_huge_pmd(orig, pmdp, addr, end, flags,
--                                           pages, nr);
++
++              if (!__gup_device_compound(addr, pmd_pfn(orig), PMD_MASK))
++                      return __gup_device_huge_pmd(orig, pmdp, addr, end,
++                                                   flags, pages, nr);
        }

        page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
@@@ -2462,8 -2462,8 +2469,10 @@@ static int gup_huge_pud(pud_t orig, pud
        if (pud_devmap(orig)) {
                if (unlikely(flags & FOLL_LONGTERM))
                        return 0;
--              return __gup_device_huge_pud(orig, pudp, addr, end, flags,
--                                           pages, nr);
++
++              if (!__gup_device_compound(addr, pud_pfn(orig), PUD_MASK))
++                      return __gup_device_huge_pud(orig, pudp, addr, end,
++                                                   flags, pages, nr);
        }

        page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
Jason Gunthorpe Sept. 28, 2021, 6:01 p.m. UTC | #7
On Thu, Sep 23, 2021 at 05:51:04PM +0100, Joao Martins wrote:
> On 8/31/21 6:05 PM, Jason Gunthorpe wrote:

> >> Switching to similar iteration logic to unpin would look something like
> >> this (still untested):
> >>
> >>         for_each_compound_range(index, &page, npages, head, refs) {
> >>                 pgmap = get_dev_pagemap(pfn + *nr, pgmap);
> > 
> > I recall talking to DanW about this and we agreed it was unnecessary
> > here to hold the pgmap and should be deleted.
> 
> Yeap, I remember that conversation[0]. It was a long time ago, and I am
> not sure what progress was made there since the last posting? Dan, any
> thoughts there?
> 
> [0]
> https://lore.kernel.org/linux-mm/161604050866.1463742.7759521510383551055.stgit@dwillia2-desk3.amr.corp.intel.com/

I would really like to see that finished :\

> So ... if pgmap accounting was removed from gup-fast then this patch
> would be a lot simpler and we could perhaps just fall back to the regular
> hugepage case (THP, HugeTLB) like your suggestion at the top. See below
> the scissors mark at the end for the ballpark of changes.
> 
> So far my options seem to be: 1) this patch which leverages the existing
> iteration logic or 2) switching to for_each_compound_range() -- see my previous
> reply 3) waiting for Dan to remove @pgmap accounting in gup-fast and use
> something similar to below scissors mark.
> 
> What do you think would be the best course of action?

I still think the basic algorithm should be to accumulate physically
contiguous addresses when walking the page table and then flush them
back to struct pages once we can't accumulate any more.

That works for both the walkers and all the page types?
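
ie the flush side could look something like this (very rough and untested;
gup_flush_range() is a made up name just to illustrate the idea):

	static int gup_flush_range(unsigned long start_pfn, unsigned long npages,
				   unsigned int flags, struct page **pages, int *nr)
	{
		unsigned long i = 0;

		while (i < npages) {
			struct page *page = pfn_to_page(start_pfn + i);
			struct page *head = compound_head(page);
			/* how much of this compound falls inside the range */
			unsigned long refs = min_t(unsigned long, npages - i,
						   compound_nr(head) - (page - head));

			if (!try_grab_compound_head(head, refs, flags))
				return 0;	/* caller unwinds what was recorded */
			while (refs--) {
				pages[(*nr)++] = page++;
				i++;
			}
		}
		return 1;
	}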

If the get_dev_pagemap has to remain then it just means we have to
flush before changing pagemap pointers

Jason
Joao Martins Sept. 29, 2021, 11:50 a.m. UTC | #8
On 9/28/21 19:01, Jason Gunthorpe wrote:
> On Thu, Sep 23, 2021 at 05:51:04PM +0100, Joao Martins wrote:
>> So ... if pgmap accounting was removed from gup-fast then this patch
>> would be a lot simpler and we could perhaps just fall back to the regular
>> hugepage case (THP, HugeTLB) like your suggestion at the top. See below
>> the scissors mark at the end for the ballpark of changes.
>>
>> So far my options seem to be: 1) this patch which leverages the existing
>> iteration logic or 2) switching to for_each_compound_range() -- see my previous
>> reply 3) waiting for Dan to remove @pgmap accounting in gup-fast and use
>> something similar to below scissors mark.
>>
>> What do you think would be the best course of action?
> 
> I still think the basic algorithm should be to accumulate physically
> contiguous addresses when walking the page table and then flush them
> back to struct pages once we can't accumulate any more.
> 
> That works for both the walkers and all the page types?
> 

The logic already handles all page types -- I was trying to avoid the extra
complexity in the regular hugetlb/THP path by not merging the handling of the
oddball case that is devmap (or, fundamentally, the devmap
non-compound case in the future).

In the context of this patch, I think your suggestion above was that,
instead of changing __gup_device_huge(), we uplevel/merge it all into
gup_huge_{pud,pmd}() to cover the devmap case?

static int __gup_huge_range(orig_head, ...)
{
	...
	page = orig_head + ((addr & ~mask) >> PAGE_SHIFT);
	refs = record_subpages(page, addr, end, pages + *nr);

	head = try_grab_compound_head(orig_head, refs, flags);
	if (!head)
		return 0;

	if (unlikely(pud_val(orig) != pud_val(*pudp))) {
		put_compound_head(head, refs, flags);
		return 0;
	}

	SetPageReferenced(head);
	return 1;
}

static int gup_huge_pmd(...)
{
	...
	for_each_compound_range(index, page, npages, head, refs) {
		if (pmd_devmap(orig))
			pgmap = get_dev_pagemap(pmd_pfn(orig), pgmap);

		if (!__gup_huge_range(pmd_page(orig), refs)) {
			undo_dev_pagemap(...);
			return 0;
		}
	}

	put_dev_pagemap(pgmap);
}

But all this gup_huge_{pmd,pud}() rework is just for the trouble of
trying to merge the basepage-on-PMD/PUD case of devmap. It feels more
complex (and affects other page types) compared to leaving the devmap
oddity siloed like option 1. If the pgmap refcount wasn't there and
there were no users of basepages-on-PMD/PUD but just compound pages
on PMDs/PUDs ... then we would be talking about code removal rather
than added complexity. But I don't know how realistic that is for
other devmap users (besides device-dax).

> If the get_dev_pagemap has to remain then it just means we have to
> flush before changing pagemap pointers
Right -- I don't think we should need it as that discussion on the other
thread goes.

OTOH, using @pgmap might be useful to unblock gup-fast FOLL_LONGTERM
for certain devmap types[0] (like MEMORY_DEVICE_GENERIC [device-dax]
can support it but not MEMORY_DEVICE_FSDAX [fsdax]).

[0] https://lore.kernel.org/linux-mm/6a18179e-65f7-367d-89a9-d5162f10fef0@oracle.com/
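
E.g. something along these lines in __gup_device_huge() (just a sketch of the
idea; MEMORY_DEVICE_GENERIC is the pgmap type device-dax uses), together with
dropping the FOLL_LONGTERM bail-out that gup_huge_{pmd,pud}() do for devmap today:

		if (unlikely(flags & FOLL_LONGTERM) &&
		    pgmap->type != MEMORY_DEVICE_GENERIC) {
			undo_dev_pagemap(nr, nr_start, flags, pages);
			ret = 0;
			break;
		}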
Jason Gunthorpe Sept. 29, 2021, 7:34 p.m. UTC | #9
On Wed, Sep 29, 2021 at 12:50:15PM +0100, Joao Martins wrote:

> > If the get_dev_pagemap has to remain then it just means we have to
> > flush before changing pagemap pointers
> Right -- I don't think we should need it as that discussion on the other
> thread goes.
> 
> OTOH, using @pgmap might be useful to unblock gup-fast FOLL_LONGTERM
> for certain devmap types[0] (like MEMORY_DEVICE_GENERIC [device-dax]
> can support it but not MEMORY_DEVICE_FSDAX [fsdax]).

When looking at Logan's patches I think it is pretty clear to me that
page->pgmap must never be a dangling pointer if the caller has a
legitimate refcount on the page.

For instance the migrate and stuff all blindly calls
is_device_private_page() on the struct page expecting a valid
page->pgmap.

This also looks like it is happening, ie

void __put_page(struct page *page)
{
	if (is_zone_device_page(page)) {
		put_dev_pagemap(page->pgmap);

Is indeed putting the pgmap ref back when the page becomes ungettable.

This properly happens when the page refcount goes to zero and so it
should fully interlock with __page_cache_add_speculative():

	if (unlikely(!page_ref_add_unless(page, count, 0))) {

Thus, in gup.c, if we succeed at try_grab_compound_head() then
page->pgmap is a stable pointer with a valid refcount.

So, all the external pgmap stuff in gup.c is completely pointless.
try_grab_compound_head() provides us with an equivalent protection at
lower cost. Remember gup.c doesn't deref the pgmap at all.

Dan/Alistair/Felix do you see any hole in that argument??

So lets just delete it!

Jason
Alistair Popple Sept. 30, 2021, 3:01 a.m. UTC | #10
On Thursday, 30 September 2021 5:34:05 AM AEST Jason Gunthorpe wrote:
> On Wed, Sep 29, 2021 at 12:50:15PM +0100, Joao Martins wrote:
> 
> > > If the get_dev_pagemap has to remain then it just means we have to
> > > flush before changing pagemap pointers
> > Right -- I don't think we should need it as that discussion on the other
> > thread goes.
> > 
> > OTOH, using @pgmap might be useful to unblock gup-fast FOLL_LONGTERM
> > for certain devmap types[0] (like MEMORY_DEVICE_GENERIC [device-dax]
> > can support it but not MEMORY_DEVICE_FSDAX [fsdax]).
> 
> When looking at Logan's patches I think it is pretty clear to me that
> page->pgmap must never be a dangling pointer if the caller has a
> legitimate refcount on the page.
> 
> For instance the migrate and stuff all blindly calls
> is_device_private_page() on the struct page expecting a valid
> page->pgmap.
> 
> This also looks like it is happening, ie
> 
> void __put_page(struct page *page)
> {
> 	if (is_zone_device_page(page)) {
> 		put_dev_pagemap(page->pgmap);
> 
> Is indeed putting the pgmap ref back when the page becomes ungettable.
> 
> This properly happens when the page refcount goes to zero and so it
> should fully interlock with __page_cache_add_speculative():
> 
> 	if (unlikely(!page_ref_add_unless(page, count, 0))) {
> 
> Thus, in gup.c, if we succeed at try_grab_compound_head() then
> page->pgmap is a stable pointer with a valid refcount.
> 
> So, all the external pgmap stuff in gup.c is completely pointless.
> try_grab_compound_head() provides us with an equivalent protection at
> lower cost. Remember gup.c doesn't deref the pgmap at all.
> 
> Dan/Alistair/Felix do you see any hole in that argument??

As background note that device pages are currently considered free when
refcount == 1 but the pgmap reference is dropped when the refcount transitions
1->0. The final pgmap reference is typically dropped when a driver calls
memunmap_pages() and put_page() drops the last page reference:

void memunmap_pages(struct dev_pagemap *pgmap)
{
        unsigned long pfn;
        int i;

        dev_pagemap_kill(pgmap);
        for (i = 0; i < pgmap->nr_range; i++)
                for_each_device_pfn(pfn, pgmap, i)
                        put_page(pfn_to_page(pfn));
        dev_pagemap_cleanup(pgmap);

If there are still pgmap references dev_pagemap_cleanup(pgmap) will block until
the final reference is dropped. So I think your argument holds at least for
DEVICE_PRIVATE and DEVICE_GENERIC. DEVICE_FS_DAX defines its own pagemap
cleanup but I can't see why the same argument wouldn't hold there - if a page
has a valid refcount it must have a reference on the pagemap too.

 - Alistair

> So lets just delete it!
> 
> Jason
>
Joao Martins Sept. 30, 2021, 5:54 p.m. UTC | #11
On 9/30/21 04:01, Alistair Popple wrote:
> On Thursday, 30 September 2021 5:34:05 AM AEST Jason Gunthorpe wrote:
>> On Wed, Sep 29, 2021 at 12:50:15PM +0100, Joao Martins wrote:
>>
>>>> If the get_dev_pagemap has to remain then it just means we have to
>>>> flush before changing pagemap pointers
>>> Right -- I don't think we should need it as that discussion on the other
>>> thread goes.
>>>
>>> OTOH, using @pgmap might be useful to unblock gup-fast FOLL_LONGTERM
>>> for certain devmap types[0] (like MEMORY_DEVICE_GENERIC [device-dax]
>>> can support it but not MEMORY_DEVICE_FSDAX [fsdax]).
>>
>> When looking at Logan's patches I think it is pretty clear to me that
>> page->pgmap must never be a dangling pointer if the caller has a
>> legitimate refcount on the page.
>>
>> For instance the migrate and stuff all blindly calls
>> is_device_private_page() on the struct page expecting a valid
>> page->pgmap.
>>
>> This also looks like it is happening, ie
>>
>> void __put_page(struct page *page)
>> {
>> 	if (is_zone_device_page(page)) {
>> 		put_dev_pagemap(page->pgmap);
>>
>> Is indeed putting the pgmap ref back when the page becomes ungettable.
>>
>> This properly happens when the page refcount goes to zero and so it
>> should fully interlock with __page_cache_add_speculative():
>>
>> 	if (unlikely(!page_ref_add_unless(page, count, 0))) {
>>
>> Thus, in gup.c, if we succeed at try_grab_compound_head() then
>> page->pgmap is a stable pointer with a valid refcount.
>>
>> So, all the external pgmap stuff in gup.c is completely pointless.
>> try_grab_compound_head() provides us with an equivalent protection at
>> lower cost. Remember gup.c doesn't deref the pgmap at all.
>>
>> Dan/Alistair/Felix do you see any hole in that argument??
> 
> As background note that device pages are currently considered free when
> refcount == 1 but the pgmap reference is dropped when the refcount transitions
> 1->0. The final pgmap reference is typically dropped when a driver calls
> memunmap_pages() and put_page() drops the last page reference:
> 
> void memunmap_pages(struct dev_pagemap *pgmap)
> {
>         unsigned long pfn;
>         int i;
> 
>         dev_pagemap_kill(pgmap);
>         for (i = 0; i < pgmap->nr_range; i++)
>                 for_each_device_pfn(pfn, pgmap, i)
>                         put_page(pfn_to_page(pfn));
>         dev_pagemap_cleanup(pgmap);
> 
> If there are still pgmap references dev_pagemap_cleanup(pgmap) will block until
> the final reference is dropped. So I think your argument holds at least for
> DEVICE_PRIVATE and DEVICE_GENERIC. DEVICE_FS_DAX defines its own pagemap
> cleanup but I can't see why the same argument wouldn't hold there - if a page
> has a valid refcount it must have a reference on the pagemap too.

IIUC Dan's reasoning was that fsdax wasn't able to deal with surprise removal [1] so his
patches were to ensure fsdax (or the pmem block device) poisons/kills the pages as a way
to notify filesystem/dm that the page was to be kept unmapped:

[1]
https://lore.kernel.org/linux-mm/161604050314.1463742.14151665140035795571.stgit@dwillia2-desk3.amr.corp.intel.com/

But if fsdax doesn't wait for all the pgmap references[*] in its pagemap cleanup callback,
then what does the pgmap ref in __gup_device_huge() pair with / protect us against that is
specific to fsdax?

I am not sure I follow how the fsdax-specific issue ties in with this pgmap ref being
there above.

	Joao

[*] or at least fsdax_pagemap_ops doesn't suggest the contrary ... compared to
dev_pagemap_{kill,cleanup}
Jason Gunthorpe Sept. 30, 2021, 9:55 p.m. UTC | #12
On Thu, Sep 30, 2021 at 06:54:05PM +0100, Joao Martins wrote:
> On 9/30/21 04:01, Alistair Popple wrote:
> > On Thursday, 30 September 2021 5:34:05 AM AEST Jason Gunthorpe wrote:
> >> On Wed, Sep 29, 2021 at 12:50:15PM +0100, Joao Martins wrote:
> >>
> >>>> If the get_dev_pagemap has to remain then it just means we have to
> >>>> flush before changing pagemap pointers
> >>> Right -- I don't think we should need it as that discussion on the other
> >>> thread goes.
> >>>
> >>> OTOH, using @pgmap might be useful to unblock gup-fast FOLL_LONGTERM
> >>> for certain devmap types[0] (like MEMORY_DEVICE_GENERIC [device-dax]
> >>> can support it but not MEMORY_DEVICE_FSDAX [fsdax]).
> >>
> >> When looking at Logan's patches I think it is pretty clear to me that
> >> page->pgmap must never be a dangling pointer if the caller has a
> >> legitimate refcount on the page.
> >>
> >> For instance the migrate and stuff all blindly calls
> >> is_device_private_page() on the struct page expecting a valid
> >> page->pgmap.
> >>
> >> This also looks like it is happening, ie
> >>
> >> void __put_page(struct page *page)
> >> {
> >> 	if (is_zone_device_page(page)) {
> >> 		put_dev_pagemap(page->pgmap);
> >>
> >> Is indeed putting the pgmap ref back when the page becomes ungettable.
> >>
> >> This properly happens when the page refcount goes to zero and so it
> >> should fully interlock with __page_cache_add_speculative():
> >>
> >> 	if (unlikely(!page_ref_add_unless(page, count, 0))) {
> >>
> >> Thus, in gup.c, if we succeed at try_grab_compound_head() then
> >> page->pgmap is a stable pointer with a valid refcount.
> >>
> >> So, all the external pgmap stuff in gup.c is completely pointless.
> >> try_grab_compound_head() provides us with an equivalent protection at
> >> lower cost. Remember gup.c doesn't deref the pgmap at all.
> >>
> >> Dan/Alistair/Felix do you see any hole in that argument??
> > 
> > As background note that device pages are currently considered free when
> > refcount == 1 but the pgmap reference is dropped when the refcount transitions
> > 1->0. The final pgmap reference is typically dropped when a driver calls
> > memunmap_pages() and put_page() drops the last page reference:
> > 
> > void memunmap_pages(struct dev_pagemap *pgmap)
> > {
> >         unsigned long pfn;
> >         int i;
> > 
> >         dev_pagemap_kill(pgmap);
> >         for (i = 0; i < pgmap->nr_range; i++)
> >                 for_each_device_pfn(pfn, pgmap, i)
> >                         put_page(pfn_to_page(pfn));
> >         dev_pagemap_cleanup(pgmap);
> > 
> > If there are still pgmap references dev_pagemap_cleanup(pgmap) will block until
> > the final reference is dropped. So I think your argument holds at least for
> > DEVICE_PRIVATE and DEVICE_GENERIC. DEVICE_FS_DAX defines its own pagemap
> > cleanup but I can't see why the same argument wouldn't hold there - if a page
> > has a valid refcount it must have a reference on the pagemap too.
>
> IIUC Dan's reasoning was that fsdax wasn't able to deal with
> surprise removal [1] so his patches were to ensure fsdax (or the
> pmem block device) poisons/kills the pages as a way to notify
> filesystem/dm that the page was to be kept unmapped:

Sure, but that has nothing to do with GUP, that is between the
filesystem and fsdax
 
> But if fsdax doesn't wait for all the pgmap references[*] on its
> pagemap cleanup callback then what's the pgmap ref in
> __gup_device_huge() pairs/protects us up against that is specific to
> fsdax?

It does wait for refs

It sets the pgmap.ref to:

	pmem->pgmap.ref = &q->q_usage_counter;

And that ref is incr'd by the struct page lifetime - the unincr is in
__put_page() above

fsdax_pagemap_ops does pmem_pagemap_kill() which calls
blk_freeze_queue_start() which does percpu_ref_kill(). Then the
pmem_pagemap_cleanup() eventually does blk_mq_freeze_queue_wait()
which will sleep until the percpu ref reaches zero.

In other words fsdax cannot pass cleanup while a struct page exists
with a non-zero refcount, which answers Alistair's question about how
fsdax's cleanup works.

Jason
Jason Gunthorpe Oct. 8, 2021, 11:54 a.m. UTC | #13
On Fri, Aug 27, 2021 at 03:58:13PM +0100, Joao Martins wrote:
> @@ -2252,16 +2265,25 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr,
>  			ret = 0;
>  			break;
>  		}
> -		SetPageReferenced(page);
> -		pages[*nr] = page;
> -		if (unlikely(!try_grab_page(page, flags))) {
> -			undo_dev_pagemap(nr, nr_start, flags, pages);
> +
> +		head = compound_head(page);
> +		/* @end is assumed to be limited at most one compound page */
> +		if (PageHead(head))
> +			next = end;
> +		refs = record_subpages(page, addr, next, pages + *nr);
> +
> +		SetPageReferenced(head);
> +		if (unlikely(!try_grab_compound_head(head, refs, flags))) {

I was thinking about this some more, and this ordering doesn't seem
like a good idea. We shouldn't be looking at any part of the struct
page without holding the refcount, certainly not the compound_head()

The only optimization that might work here is to grab the head, then
compute the extent of tail pages and amalgamate them. Holding a ref on
the head also secures the tails.

Which also means most of what I was suggesting isn't going to work
anyhow.

Jason
Joao Martins Oct. 11, 2021, 3:53 p.m. UTC | #14
On 10/8/21 12:54, Jason Gunthorpe wrote:
> On Fri, Aug 27, 2021 at 03:58:13PM +0100, Joao Martins wrote:
>> @@ -2252,16 +2265,25 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr,
>>  			ret = 0;
>>  			break;
>>  		}
>> -		SetPageReferenced(page);
>> -		pages[*nr] = page;
>> -		if (unlikely(!try_grab_page(page, flags))) {
>> -			undo_dev_pagemap(nr, nr_start, flags, pages);
>> +
>> +		head = compound_head(page);
>> +		/* @end is assumed to be limited at most one compound page */
>> +		if (PageHead(head))
>> +			next = end;
>> +		refs = record_subpages(page, addr, next, pages + *nr);
>> +
>> +		SetPageReferenced(head);
>> +		if (unlikely(!try_grab_compound_head(head, refs, flags))) {
> 
> I was thinking about this some more, and this ordering doesn't seem
> like a good idea. We shouldn't be looking at any part of the struct
> page without holding the refcount, certainly not the compound_head()
> 
> The only optimization that might work here is to grab the head, then
> compute the extent of tail pages and amalgamate them. Holding a ref on
> the head also secures the tails.
> 

How about pmd_page(orig) / pud_page(orig) like what the rest of hugetlb/thp
checks do? i.e. we would pass pmd_page(orig)/pud_page(orig) to __gup_device_huge()
as an added @head argument. While keeping the same structure of counting tail pages
between @addr .. @end if we have a head page.

Albeit this hinges on whether it's OK to call PageHead() .. The PageHead policy is for
any page (PF_ANY) so there are no hidden calls to compound_head() when testing that page
flag. But in the end it accesses the struct page flags, which is, well, still struct page data.

We could also check pgmap for a non-zero geometry (which tells us that pmd_page(orig) does
represent a head page). And that would save us from looking at struct page data today, but
it would introduce problems later whenever we remove the pgmap ref grab in gup_device_huge().

So the only viable option might be to do the grab, count tails and fixup-ref like you suggest
above, and take the perf hit of one extra atomic op :(

It's interesting how THP (in gup_huge_pmd()) unilaterally computes tails assuming
pmd_page(orig) is the head page.
Jason Gunthorpe Oct. 13, 2021, 5:41 p.m. UTC | #15
On Mon, Oct 11, 2021 at 04:53:29PM +0100, Joao Martins wrote:
> On 10/8/21 12:54, Jason Gunthorpe wrote:

> > The only optimization that might work here is to grab the head, then
> > compute the extent of tail pages and amalgamate them. Holding a ref on
> > the head also secures the tails.
> 
> How about pmd_page(orig) / pud_page(orig) like what the rest of hugetlb/thp
> checks do? i.e. we would pass pmd_page(orig)/pud_page(orig) to __gup_device_huge()
> as an added @head argument. While keeping the same structure of counting tail pages
> between @addr .. @end if we have a head page.

The right logic is what everything else does:

	page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
	refs = record_subpages(page, addr, end, pages + *nr);
	head = try_grab_compound_head(pud_page(orig), refs, flags);

If you can use this, or not, depends entirely on answering the
question of why does  __gup_device_huge() exist at all.

This I don't fully know:

1) As discussed quite a few times now, the entire get_dev_pagemap
   stuff looks useless and should just be deleted. If you care about
   optimizing this I would pursue doing that as it will give the
   biggest single win.

2) It breaks up the PUD/PMD into tail pages and scans them all
   Why? Can devmap combine multiple compound_head's into the same PTE?
   Does devmap guarantee that the PUD/PMD points to the head page? (I
   assume no)

But I'm looking at this some more and I see try_get_compound_head() is
reading the compound_head with no locking, just READ_ONCE, so that
must be OK under GUP.

It still seems to me the same generic algorithm should work
everywhere, once we get rid of the get_dev_pagemap

  start_pfn = pud/pmd_pfn() + pud/pmd_page_offset(addr)
  end_pfn = start_pfn + (end - addr) // fixme
  if (THP)
     refs = end_pfn - start_pfn
  if (devmap)
     refs = 1

  do {
     page = pfn_to_page(start_pfn)
     head_page = try_grab_compound_head(page, refs, flags)
     if (pud/pmd_val() != orig)
        err

     npages = 1 << compound_order(head_page)
     npages = min(npages, end_pfn - start_pfn)
     for (i = 0, iter = page; i != npages) {
     	 *pages++ = iter;
         mem_map_next(iter, page, i)
     }

     if (devmap && npages > 2)
         grab_compound_head(head_page, npages - 1, flags)
     start_pfn += npages;
  } while (start_pfn != end_pfn)

Above needs to be cleaned up quite a bit, but is the general idea.

There is a further optimization we can put in where we can know that
'page' is still in a currently grab'd compound (eg last_page+1 = page,
not past compound_order) and defer the refcount work.

> It's interesting how THP (in gup_huge_pmd()) unilaterally computes
> tails assuming pmd_page(orig) is the head page.

I think this is an integral property of THP, probably not devmap/dax
though?

Jason
Joao Martins Oct. 13, 2021, 7:18 p.m. UTC | #16
On 10/13/21 18:41, Jason Gunthorpe wrote:
> On Mon, Oct 11, 2021 at 04:53:29PM +0100, Joao Martins wrote:
>> On 10/8/21 12:54, Jason Gunthorpe wrote:
> 
>>> The only optimization that might work here is to grab the head, then
>>> compute the extent of tail pages and amalgamate them. Holding a ref on
>>> the head also secures the tails.
>>
>> How about pmd_page(orig) / pud_page(orig) like what the rest of hugetlb/thp
>> checks do? i.e. we would pass pmd_page(orig)/pud_page(orig) to __gup_device_huge()
>> as an added @head argument. While keeping the same structure of counting tail pages
>> between @addr .. @end if we have a head page.
> 
> The right logic is what everything else does:
> 
> 	page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
> 	refs = record_subpages(page, addr, end, pages + *nr);
> 	head = try_grab_compound_head(pud_page(orig), refs, flags);
> 
> If you can use this, or not, depends entirely on answering the
> question of why does  __gup_device_huge() exist at all.
> 
So for device-dax it seems to be an untackled oversight[*], probably
inherited from when the fsdax devmap was introduced. What I don't know
is what the other devmap users do :(

[*] it has all the same properties as hugetlbfs AFAIU (after this series)

Certainly if any devmap PMD/PUD was represented in a single compound
page like THP/hugetlbfs, then this patch would be a matter of removing
the pgmap ref grab (and nuking the __gup_device_huge() function entirely,
as I suggested earlier).

> This I don't fully know:
> 
> 1) As discussed quite a few times now, the entire get_dev_pagemap
>    stuff looks useless and should just be deleted. If you care about
>    optimizing this I would pursue doing that as it will give the
>    biggest single win.
> 
I am not questioning the well-deserved improvement -- but from a pure
optimization perspective the get_dev_pagemap() cost is not
visible and quite negligible. It is done once and only once, and
subsequent calls to get_dev_pagemap() with a non-NULL pgmap don't alter
the refcount and just return the pgmap object. And the xarray storing
the ranges -> pgmap won't be that big ... perhaps 12 pgmaps on
a large >1T pmem system depending on your DIMM size.

The refcount update of the individual 4K pages is what introduces
a seriously prohibitive cost: I am seeing 10x the cost with DRAM-located
struct pages (pmem-located struct pages are even more ludicrous).

> 2) It breaks up the PUD/PMD into tail pages and scans them all
>    Why? Can devmap combine multiple compound_head's into the same PTE?


I am not aware of any other usage of compound pages for devmap struct pages
than this series. At least I haven't seen device-dax or fsdax using this.
Unless HMM does this stuff, or some sort of devmap page migration? P2PDMA
doesn't seem to be (yet?) caught by any of the GUP paths, at least before
Logan's series lands. Or am I misunderstanding things here?

>    Does devmap guarantee that the PUD/PMD points to the head page? (I
>    assume no)
> 
For device-dax yes.


> But I'm looking at this some more and I see try_get_compound_head() is
> reading the compound_head with no locking, just READ_ONCE, so that
> must be OK under GUP.
> 
I suppose one other way to get around the double atomic op would be to fail
the try_get_compound_head() by comparing the first tail page's compound_head()
after grabbing the head with @refs. That is, instead of comparing the grabbed
head page against the passed page argument.
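
ie something like this inside the grab path (hand-wavy and untested;
put_page_refs() being the existing helper in mm/gup.c):

	head = try_get_compound_head(page, refs);
	if (!head)
		return NULL;
	/* @refs was computed assuming one compound page spans the range;
	 * recheck via the first tail that this is actually the case */
	if (refs > 1 && unlikely(compound_head(head + 1) != head)) {
		put_page_refs(head, refs);
		return NULL;
	}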

> It still seems to me the same generic algorithm should work
> everywhere, once we get rid of the get_dev_pagemap
> 
>   start_pfn = pud/pmd_pfn() + pud/pmd_page_offset(addr)
>   end_pfn = start_pfn + (end - addr) // fixme
>   if (THP)
>      refs = end_pfn - start_pfn
>   if (devmap)
>      refs = 1
> 
>   do {
>      page = pfn_to_page(start_pfn)
>      head_page = try_grab_compound_head(page, refs, flags)
>      if (pud/pmd_val() != orig)
>         err
> 
>      npages = 1 << compound_order(head_page)
>      npages = min(npages, end_pfn - start_pfn)
>      for (i = 0, iter = page; i != npages) {
>      	 *pages++ = iter;
>          mem_map_next(iter, page, i)
>      }
> 
>      if (devmap && npages > 2)
>          grab_compound_head(head_page, npages - 1, flags)
>      start_pfn += npages;
>   } while (start_pfn != end_pfn)
> 
> Above needs to be cleaned up quite a bit, but is the general idea.
> 
> There is a further optimization we can put in where we can know that
> 'page' is still in a currently grab'd compound (eg last_page+1 = page,
> not past compound_order) and defer the refcount work.
> 
I was changing __gup_device_huge() into something similar to the above, but yeah,
it follows the algorithm inside your do { } while() (thanks!). I can
turn __gup_device_huge() into another helper (renamed to something like try_grab_pages())
and replace the callsites of gup_huge_{pud,pmd} with the THP/hugetlbfs
equivalent handling.

>> It's interesting how THP (in gup_huge_pmd()) unilaterally computes
>> tails assuming pmd_page(orig) is the head page.
> 
> I think this is an integral property of THP, probably not devmap/dax
> though?

I think the right answer is "depends on the devmap type". device-dax with
PMDs/PUDs (i.e. 2M pagesize pmem or 1G pagesize pmem) works with the same
rules as hugetlbfs. fsdax not so much (as you say above), but follow-up
changes could perhaps make it adhere to a similar scheme (not exactly sure
how to deal with holes). HMM I am not sure what the rules are there.
P2PDMA seems not applicable?
Jason Gunthorpe Oct. 13, 2021, 7:43 p.m. UTC | #17
On Wed, Oct 13, 2021 at 08:18:08PM +0100, Joao Martins wrote:
> On 10/13/21 18:41, Jason Gunthorpe wrote:
> > On Mon, Oct 11, 2021 at 04:53:29PM +0100, Joao Martins wrote:
> >> On 10/8/21 12:54, Jason Gunthorpe wrote:
> > 
> >>> The only optimization that might work here is to grab the head, then
> >>> compute the extent of tail pages and amalgamate them. Holding a ref on
> >>> the head also secures the tails.
> >>
> >> How about pmd_page(orig) / pud_page(orig) like what the rest of hugetlb/thp
> >> checks do? i.e. we would pass pmd_page(orig)/pud_page(orig) to __gup_device_huge()
> >> as an added @head argument. While keeping the same structure of counting tail pages
> >> between @addr .. @end if we have a head page.
> > 
> > The right logic is what everything else does:
> > 
> > 	page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
> > 	refs = record_subpages(page, addr, end, pages + *nr);
> > 	head = try_grab_compound_head(pud_page(orig), refs, flags);
> > 
> > If you can use this, or not, depends entirely on answering the
> > question of why does  __gup_device_huge() exist at all.
> > 
> So for device-dax it seems to be an untackled oversight[*], probably
> inherited from when fsdax devmap was introduced. What I don't know
> is the other devmap users :(

devmap generic infrastructure waits until all page refcounts go to
zero, and it should wait until any fast GUP is serialized as part of
the TLB shootdown - otherwise it is leaking access to the memory it
controls beyond its shutdown

So, I don't think the different devmap users can break this?

> > This I don't fully know:
> > 
> > 1) As discussed quite a few times now, the entire get_dev_pagemap
> >    stuff looks usless and should just be deleted. If you care about
> >    optimizing this I would persue doing that as it will give the
> >    biggest single win.
> 
> I am not questioning the well-deserved improvement -- but from a pure
> optimization perspective the get_dev_pagemap() cost is not
> visible and quite negligible.

You are doing large enough GUPs that the expensive xarray search
is small compared to the rest?

> > 2) It breaks up the PUD/PMD into tail pages and scans them all
> >    Why? Can devmap combine multiple compound_head's into the same PTE?
> 
> I am not aware of any other usage of compound pages for devmap struct pages
> than this series. At least I haven't seen device-dax or fsdax using
> this.

Let me ask this question differently, is this assertion OK?

--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -808,8 +808,13 @@ static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
        }
 
        entry = pmd_mkhuge(pfn_t_pmd(pfn, prot));
-       if (pfn_t_devmap(pfn))
+       if (pfn_t_devmap(pfn)) {
+               struct page *page = pfn_to_page(pfn);
+
+               WARN_ON(compound_head(page) != page);
+               WARN_ON(compound_order(page) != PMD_SHIFT - PAGE_SHIFT);
                entry = pmd_mkdevmap(entry);
+       }
        if (write) {
                entry = pmd_mkyoung(pmd_mkdirty(entry));
                entry = maybe_pmd_mkwrite(entry, vma);

(and same for insert_pfn_pud)

You said it is OK for device/dax/device.c?

And not for fs/dax.c?


> Unless HMM does this stuff, or some sort of devmap page migration? P2PDMA
> doesn't seem to be (yet?) caught by any of the GUP path at least before
> Logan's series lands. Or am I misunderstanding things here?

Of the places that call the insert_pfn_pmd/pud call chains I only see
device/dax/device.c and fs/dax.c as being linked to devmap. So other
devmap users don't use this stuff.

> I was changing __gup_device_huge() into something similar to the above, and
> yeah, it follows the same algorithm as your do { } while() (thanks!). I can
> turn __gup_device_huge() into another helper (renamed to something like
> try_grab_pages()) and replace the callsites of gup_huge_{pud,pmd} with the
> THP/hugetlbfs equivalent handling.

I suppose it should be some #define because the (pmd_val != orig) logic
is not sharable

But, yes, a general call that the walker should make at any level to
record a pfn -> npages range efficiently.

> I think the right answer is "depends on the devmap" type. device-dax with
> PMDs/PUDs (i.e. 2M pagesize PMEM or 1G pagesize PMEM) works with the same
> rules as hugetlbfs. fsdax not so much (as you say above), but it would take
> follow-up changes to perhaps adhere to a similar scheme (not exactly sure
> how to deal with holes). For HMM I am not sure what the rules are there.
> P2PDMA seems not applicable?

I would say, not "depends on the devmap", but what are the rules for
calling insert_pfn_pmd in the first place.

If users are allowed to create pmds that span many compound_head's
then we need logic as I showed. Otherwise we do not.

And I would document this relationship in the GUP side "This do/while
is required because insert_pfn_pmd/pud() is used with compound pages
smaller than the PUD/PMD size" so it isn't so confused with just
"devmap"

Jason
Joao Martins Oct. 14, 2021, 5:56 p.m. UTC | #18
On 10/13/21 20:43, Jason Gunthorpe wrote:
> On Wed, Oct 13, 2021 at 08:18:08PM +0100, Joao Martins wrote:
>> On 10/13/21 18:41, Jason Gunthorpe wrote:
>>> On Mon, Oct 11, 2021 at 04:53:29PM +0100, Joao Martins wrote:
>>>> On 10/8/21 12:54, Jason Gunthorpe wrote:
>>>
>>>>> The only optimization that might work here is to grab the head, then
>>>>> compute the extent of tail pages and amalgamate them. Holding a ref on
>>>>> the head also secures the tails.
>>>>
>>>> How about pmd_page(orig) / pud_page(orig) like what the rest of hugetlb/thp
>>>> checks do? i.e. we would pass pmd_page(orig)/pud_page(orig) to __gup_device_huge()
>>>> as an added @head argument. While keeping the same structure of counting tail pages
>>>> between @addr .. @end if we have a head page.
>>>
>>> The right logic is what everything else does:
>>>
>>> 	page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
>>> 	refs = record_subpages(page, addr, end, pages + *nr);
>>> 	head = try_grab_compound_head(pud_page(orig), refs, flags);
>>>
>>> If you can use this, or not, depends entirely on answering the
>>> question of why does  __gup_device_huge() exist at all.
>>>
>> So for device-dax it seems to be an untackled oversight[*], probably
>> inherited from when fsdax devmap was introduced. What I don't know
>> is the other devmap users :(
> 
> devmap generic infrastructure waits until all page refcounts go to
> zero, and it should wait until any fast GUP is serialized as part of
> the TLB shootdown - otherwise it is leaking access to the memory it
> controls beyond its shutdown
> 
> So, I don't think the different devmap users can break this?
> 
Maybe fsdax may not honor that after removing the pgmap refcount, without
the memory-failure stuff, as we talked about earlier in the thread.

But my point earlier about the "oversight" was that we describe
PMDs/PUDs with base pages, which is why we can't do this:

 	page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
 	refs = record_subpages(page, addr, end, pages + *nr);
 	head = try_grab_compound_head(pud_page(orig), refs, flags);

... For devmap.

>>> This I don't fully know:
>>>
>>> 1) As discussed quite a few times now, the entire get_dev_pagemap
>>>    stuff looks useless and should just be deleted. If you care about
>>>    optimizing this I would pursue doing that as it will give the
>>>    biggest single win.
>>
>> I am not questioning the well-deserved improvement -- but from a pure
>> optimization perspective the get_dev_pagemap() cost is not
>> visible and quite negligible.
> 
> You are doing large enough GUPs then that the expensive xarray search
> is small compared to the rest?
> 

The tests I showed are on 16G and 128G (I usually test with > 1TB).
But we are comparing 1 pgmap lookup against 512 refcount atomic updates for
a PMD-sized pin (2M).

It depends on what you consider small. For 4K-16K of pinned memory, the
pgmap lookup is probably more expensive.

A couple of months ago I remember measuring xarray lookups, and it was
around ~150ns per lookup with few entries, excluding the ref update. I can
be pedantic and give you a more concrete measurement for get_dev_pagemap(),
but quite honestly I don't think it will be close when we pin a hugepage.

But this is orthogonal to the existence of the pgmap ref; I was merely
commenting on the performance aspect.

>>> 2) It breaks up the PUD/PMD into tail pages and scans them all
>>>    Why? Can devmap combine multiple compound_head's into the same PTE?
>>
>> I am not aware of any other usage of compound pages for devmap struct pages
>> than this series. At least I haven't seen device-dax or fsdax using
>> this.
> 
> Let me ask this question differently, is this assertion OK?
> 
I understood the question. I was more trying to learn from you about
HMM/P2PDMA use-cases, as it sounded like compound pages exist elsewhere in
devmap :)

> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -808,8 +808,13 @@ static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
>         }
>  
>         entry = pmd_mkhuge(pfn_t_pmd(pfn, prot));
> -       if (pfn_t_devmap(pfn))
> +       if (pfn_t_devmap(pfn)) {
> +               struct page *page = pfn_to_page(pfn);
> +
> +               WARN_ON(compound_head(page) != page);
> +               WARN_ON(compound_order(page) != PMD_SHIFT - PAGE_SHIFT);
>                 entry = pmd_mkdevmap(entry);
> +       }
>         if (write) {
>                 entry = pmd_mkyoung(pmd_mkdirty(entry));
>                 entry = maybe_pmd_mkwrite(entry, vma);
> 
> (and same for insert_pfn_pud)
> 
> You said it is OK for device/dax/device.c?
> 
Yes, correct. (I also verified with a similar snippet to the above, to be
extra-pedantic.)

I would like to emphasize that it is only OK *after this series*.

Without this series there is neither a compound order nor a head, nor any
compound page whatsoever.

> And not for fs/dax.c?
> 
IIUC, correct. The fs/dax code is a little fuzzy on the sector-to-PFN
translation, but still:

There are no compound pages in fsdax (nor do I add them). So there is no
compound order (hence order 0) and there is no head, which also means that
compound_head(@page) returns @page (a tad misleading as a "head" definition,
since a subpage of a PMD would not return the PMD's first page).

> 
>> Unless HMM does this stuff, or some sort of devmap page migration? P2PDMA
>> doesn't seem to be (yet?) caught by any of the GUP path at least before
>> Logan's series lands. Or am I misunderstanding things here?
> 
> Of the places that call the insert_pfn_pmd/pud call chains I only see
> device/dax/device.c and fs/dax.c as being linked to devmap. So other
> devmap users don't use this stuff.
> 
>> I was changing __gup_device_huge() into something similar to the above, and
>> yeah, it follows the same algorithm as your do { } while() (thanks!). I can
>> turn __gup_device_huge() into another helper (renamed to something like
>> try_grab_pages()) and replace the callsites of gup_huge_{pud,pmd} with the
>> THP/hugetlbfs equivalent handling.
> 
> I suppose it should be some #define because the (pmd_val != orig) logic
> is not sharable
> 
I wasn't going to have that pmd_val() in there. Just this refcount part:

 	page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
 	refs = record_subpages(page, addr, end, pages + *nr);
 	head = try_grab_compound_head(pud_page(orig), refs, flags);

and __gup_device_huge() melded in.

> But, yes, a general call that the walker should make at any level to
> record a pfn -> npages range efficiently.
> 
>> I think the right answer is "depends on the devmap" type. device-dax with
>> PMDs/PUDs (i.e. 2M pagesize PMEM or 1G pagesize PMEM) works with the same
>> rules as hugetlbfs. fsdax not so much (as you say above), but it would take
>> follow-up changes to perhaps adhere to a similar scheme (not exactly sure
>> how to deal with holes). For HMM I am not sure what the rules are there.
>> P2PDMA seems not applicable?
> 
> I would say, not "depends on the devmap", but what are the rules for
> calling insert_pfn_pmd in the first place.
> 
I am going to read that as: HMM and P2PDMA so far aren't affected, as you
also said earlier above. Even after Logan's P2PDMA NVMe series it appears
to always be PTEs. So fsdax and device-dax are the only pmd_devmap() users
we should care about, and for pud_devmap() only device-dax.

> If users are allowed to create pmds that span many compound_head's
> then we need logic as I showed. Otherwise we do not.
> 
/me nods.

> And I would document this relationship in the GUP side "This do/while
> is required because insert_pfn_pmd/pud() is used with compound pages
> smaller than the PUD/PMD size" so it isn't so confused with just
> "devmap"

Also, it's not that PMDs span compound heads; it's that PMDs/PUDs use
just base pages. Compound pages/heads in devmap are only introduced by
this series, and for device-dax only.
Jason Gunthorpe Oct. 14, 2021, 6:06 p.m. UTC | #19
On Thu, Oct 14, 2021 at 06:56:51PM +0100, Joao Martins wrote:

> > And I would document this relationship in the GUP side "This do/while
> > is required because insert_pfn_pmd/pud() is used with compound pages
> > smaller than the PUD/PMD size" so it isn't so confused with just
> > "devmap"
> 
> Also, it's not that PMDs span compound heads; it's that PMDs/PUDs use
> just base pages. Compound pages/heads in devmap are only introduced by
> this series, and for device-dax only.

I think it all makes sense, I just want to clarify what I mean by
compound_head:

  compound_head(base_page) == base_page

Ie a PMD consisting only of order 0 pages would have N 'compound_heads'
within it even though nothing is a compound page.

Which is relevant because the GUP algorithms act on the compound_head
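
For illustration only, a throwaway helper (assuming @pfn is any pfn mapped
by such a PMD; not from the patch):

static void assert_base_page_is_its_own_head(unsigned long pfn)
{
	struct page *page = pfn_to_page(pfn);

	/* an order-0 (base) page is its own compound_head ... */
	if (!PageCompound(page))
		WARN_ON(compound_head(page) != page);

	/*
	 * ... so a PMD backed by order-0 pages has PMD_SIZE / PAGE_SIZE
	 * distinct "heads", each needing its own reference, while a real
	 * compound page has one head for the whole range.
	 */
}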

Jason
Jason Gunthorpe Oct. 18, 2021, 6:36 p.m. UTC | #20
On Thu, Sep 30, 2021 at 01:01:14PM +1000, Alistair Popple wrote:
> On Thursday, 30 September 2021 5:34:05 AM AEST Jason Gunthorpe wrote:
> > On Wed, Sep 29, 2021 at 12:50:15PM +0100, Joao Martins wrote:
> > 
> > > > If the get_dev_pagemap has to remain then it just means we have to
> > > > flush before changing pagemap pointers
> > > Right -- I don't think we should need it as that discussion on the other
> > > thread goes.
> > > 
> > > OTOH, using @pgmap might be useful to unblock gup-fast FOLL_LONGTERM
> > > for certain devmap types[0] (like MEMORY_DEVICE_GENERIC [device-dax]
> > > can support it but not MEMORY_DEVICE_FSDAX [fsdax]).
> > 
> > When looking at Logan's patches I think it is pretty clear to me that
> > page->pgmap must never be a dangling pointer if the caller has a
> > legitimate refcount on the page.
> > 
> > For instance the migrate and stuff all blindly calls
> > is_device_private_page() on the struct page expecting a valid
> > page->pgmap.
> > 
> > This also looks like it is happening, ie
> > 
> > void __put_page(struct page *page)
> > {
> > 	if (is_zone_device_page(page)) {
> > 		put_dev_pagemap(page->pgmap);
> > 
> > Is indeed putting the pgmap ref back when the page becomes ungettable.
> > 
> > This properly happens when the page refcount goes to zero and so it
> > should fully interlock with __page_cache_add_speculative():
> > 
> > 	if (unlikely(!page_ref_add_unless(page, count, 0))) {
> > 
> > Thus, in gup.c, if we succeed at try_grab_compound_head() then
> > page->pgmap is a stable pointer with a valid refcount.
> > 
> > So, all the external pgmap stuff in gup.c is completely pointless.
> > try_grab_compound_head() provides us with an equivalent protection at
> > lower cost. Remember gup.c doesn't deref the pgmap at all.
> > 
> > Dan/Alistair/Felix do you see any hole in that argument??
> 
> As background note that device pages are currently considered free when
> refcount == 1 but the pgmap reference is dropped when the refcount transitions
> 1->0. The final pgmap reference is typically dropped when a driver calls
> memunmap_pages() and put_page() drops the last page reference:
> 
> void memunmap_pages(struct dev_pagemap *pgmap)
> {
>         unsigned long pfn;
>         int i;
> 
>         dev_pagemap_kill(pgmap);
>         for (i = 0; i < pgmap->nr_range; i++)
>                 for_each_device_pfn(pfn, pgmap, i)
>                         put_page(pfn_to_page(pfn));
>         dev_pagemap_cleanup(pgmap);
> 
> If there are still pgmap references dev_pagemap_cleanup(pgmap) will block until
> the final reference is dropped. So I think your argument holds at least for
> DEVICE_PRIVATE and DEVICE_GENERIC. DEVICE_FS_DAX defines its own pagemap
> cleanup but I can't see why the same argument wouldn't hold there - if a page
> has a valid refcount it must have a reference on the pagemap too.

To close this circle - the issue is a use-after-free on the struct page
entry while it has a zero ref.

memunmap_pages() does wait for the refcount to go to zero, but it then
goes on to free the memory under the struct pages.

However there are possibly still untracked references to this memory
in the page tables.

This is the bug Dan has been working on - to shoot down page table
mappings before getting to memunmap_pages()

Getting the page map ref will make the use-after-free never crash,
just be a silent security problem. :(

Jason
Jason Gunthorpe Oct. 18, 2021, 6:37 p.m. UTC | #21
On Wed, Sep 29, 2021 at 12:50:15PM +0100, Joao Martins wrote:
> On 9/28/21 19:01, Jason Gunthorpe wrote:
> > On Thu, Sep 23, 2021 at 05:51:04PM +0100, Joao Martins wrote:
> >> So ... if pgmap accounting was removed from gup-fast then this patch
> >> would be a lot simpler and we could perhaps just fallback to the regular
> >> hugepage case (THP, HugeTLB) like your suggestion at the top. See at the
> >> end below scissors mark as the ballpark of changes.
> >>
> >> So far my options seem to be: 1) this patch which leverages the existing
> >> iteration logic or 2) switching to for_each_compound_range() -- see my previous
> >> reply 3) waiting for Dan to remove @pgmap accounting in gup-fast and use
> >> something similar to below scissors mark.
> >>
> >> What do you think would be the best course of action?
> > 
> > I still think the basic algorithm should be to accumulate physicaly
> > contiguous addresses when walking the page table and then flush them
> > back to struct pages once we can't accumulate any more.
> > 
> > That works for both the walkers and all the page types?
> > 
> 
> The logic already handles all page types -- I was trying to avoid the extra
> complexity in regular hugetlb/THP path by not merging the handling of the
> oddball case that is devmap (or fundamentally devmap
> non-compound case in the future).

FYI, this untested thing is what I came up with when I tried to make
something like this:

/*
 * A large page entry such as PUD/PMD can point to a struct page. In cases like
 * THP this struct page will be a compound page of the same order as the page
 * table level. However, in cases like DAX or more generally pgmap ZONE_DEVICE,
 * the PUD/PMD may point at the first pfn in a string of pages.
 *
 * This helper iterates over all head pages or all the non-compound base pages.
 */
struct pt_entry_iter_state
{
	struct page *head;
	unsigned long compound_nr;
	unsigned long pfn;
	unsigned long end_pfn;
};

static inline struct page *__pt_start_iter(struct pt_entry_iter_state *state,
					   struct page *page, unsigned long pfn,
					   unsigned int entry_size)
{
	state->head = compound_head(page);
	state->compound_nr = compound_nr(page);
	state->pfn = pfn & (~(state->compound_nr - 1));
	state->end_pfn = pfn + entry_size / PAGE_SIZE;
	return state->head;
}

static inline struct page *__pt_next_page(struct pt_entry_iter_state *state)
{
	state->pfn += state->compound_nr;
	if (state->end_pfn <= state->pfn)
		return NULL;
	state->head = pfn_to_page(state->pfn);
	state->compound_nr = compound_nr(state->head);
	return state->head;
}

#define for_each_page_in_pt_entry(state, page, pfn, entry_size)                \
	for (page = __pt_start_iter(state, page, pfn, entry_size); page;       \
	     page = __pt_next_page(state))

static void remove_pages_from_page_table(struct vm_area_struct *vma,
					 struct page *page, unsigned long pfn,
					 unsigned int entry_size, bool is_dirty,
					 bool is_young)
{
	struct pt_entry_iter_state state;

	for_each_page_in_pt_entry(&state, page, pfn, entry_size)
		remove_page_from_page_table(vma, page, is_dirty, is_young);
}


Jason
diff mbox series

Patch

diff --git a/mm/gup.c b/mm/gup.c
index 7a406d79bd2e..0741d2c0ba5e 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -2234,17 +2234,30 @@  static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
 }
 #endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */
 
+
+static int record_subpages(struct page *page, unsigned long addr,
+			   unsigned long end, struct page **pages)
+{
+	int nr;
+
+	for (nr = 0; addr != end; addr += PAGE_SIZE)
+		pages[nr++] = page++;
+
+	return nr;
+}
+
 #if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
 static int __gup_device_huge(unsigned long pfn, unsigned long addr,
 			     unsigned long end, unsigned int flags,
 			     struct page **pages, int *nr)
 {
-	int nr_start = *nr;
+	int refs, nr_start = *nr;
 	struct dev_pagemap *pgmap = NULL;
 	int ret = 1;
 
 	do {
-		struct page *page = pfn_to_page(pfn);
+		struct page *head, *page = pfn_to_page(pfn);
+		unsigned long next = addr + PAGE_SIZE;
 
 		pgmap = get_dev_pagemap(pfn, pgmap);
 		if (unlikely(!pgmap)) {
@@ -2252,16 +2265,25 @@  static int __gup_device_huge(unsigned long pfn, unsigned long addr,
 			ret = 0;
 			break;
 		}
-		SetPageReferenced(page);
-		pages[*nr] = page;
-		if (unlikely(!try_grab_page(page, flags))) {
-			undo_dev_pagemap(nr, nr_start, flags, pages);
+
+		head = compound_head(page);
+		/* @end is assumed to be limited at most one compound page */
+		if (PageHead(head))
+			next = end;
+		refs = record_subpages(page, addr, next, pages + *nr);
+
+		SetPageReferenced(head);
+		if (unlikely(!try_grab_compound_head(head, refs, flags))) {
+			if (PageHead(head))
+				ClearPageReferenced(head);
+			else
+				undo_dev_pagemap(nr, nr_start, flags, pages);
 			ret = 0;
 			break;
 		}
-		(*nr)++;
-		pfn++;
-	} while (addr += PAGE_SIZE, addr != end);
+		*nr += refs;
+		pfn += refs;
+	} while (addr += (refs << PAGE_SHIFT), addr != end);
 
 	put_dev_pagemap(pgmap);
 	return ret;
@@ -2320,17 +2342,6 @@  static int __gup_device_huge_pud(pud_t pud, pud_t *pudp, unsigned long addr,
 }
 #endif
 
-static int record_subpages(struct page *page, unsigned long addr,
-			   unsigned long end, struct page **pages)
-{
-	int nr;
-
-	for (nr = 0; addr != end; addr += PAGE_SIZE)
-		pages[nr++] = page++;
-
-	return nr;
-}
-
 #ifdef CONFIG_ARCH_HAS_HUGEPD
 static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
 				      unsigned long sz)