
[v2,12/18] mm/gup: track FOLL_PIN pages

Message ID 20191103211813.213227-13-jhubbard@nvidia.com (mailing list archive)
State Superseded
Series mm/gup: track dma-pinned pages: FOLL_PIN, FOLL_LONGTERM

Commit Message

John Hubbard Nov. 3, 2019, 9:18 p.m. UTC
Add tracking of pages that were pinned via FOLL_PIN.

As mentioned in the FOLL_PIN documentation, callers who effectively set
FOLL_PIN are required to ultimately free such pages via put_user_page().
The effect is similar to FOLL_GET, and may be thought of as "FOLL_GET
for DIO and/or RDMA use".
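
For illustration only (this is not part of the patch), the intended
calling pattern looks roughly like the hypothetical helper below,
assuming pin_user_pages() takes the same arguments as get_user_pages():

static long example_pin_for_dma(unsigned long start)
{
	struct page *pages[16];
	long i, nr;

	/* The pin_user_pages*() wrappers set FOLL_PIN internally. */
	nr = pin_user_pages(start, ARRAY_SIZE(pages), FOLL_WRITE, pages, NULL);
	if (nr <= 0)
		return nr;

	/* ... hand the pages to DIO / RDMA hardware and do the I/O ... */

	for (i = 0; i < nr; i++)
		put_user_page(pages[i]);	/* NOT put_page() */

	return 0;
}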

Pages that have been pinned via FOLL_PIN are identifiable via a
new function call:

   bool page_dma_pinned(struct page *page);

What to do in response to encountering such a page is left to later
patchsets. There is discussion about this in [1].
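
As a purely hypothetical sketch of what a caller might eventually do
(the real policy is deliberately left to those later patchsets):

	if (page_dma_pinned(page)) {
		/*
		 * Likely pinned for DMA; false positives are possible once
		 * the refcount reaches GUP_PIN_COUNTING_BIAS by other means.
		 * A filesystem might, for example, skip or defer writeback.
		 */
	} else {
		/* Definitely not dma-pinned: proceed as before. */
	}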

This also changes a BUG_ON() to a WARN_ON(), in follow_page_mask().

This also includes a couple of trivial, non-functional fixes to
try_get_compound_head(). That function has been moved to the top of
the file.

This includes the following fix from Ira Weiny:

DAX requires detection of a page crossing to a ref count of 1.  Fix this
for GUP pages by introducing put_devmap_managed_user_page() which
accounts for GUP_PIN_COUNTING_BIAS now used by GUP.
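
To illustrate the arithmetic behind that fix (illustration only, not
part of the patch; GUP_PIN_COUNTING_BIAS is 1024 here):

	/*
	 * For a page whose refcount is N before pinning:
	 *
	 *	after a FOLL_PIN pin:   N + 1024
	 *	after put_user_page():  back to N
	 *
	 * A DAX page is treated as free once its refcount reaches 1, so the
	 * FOLL_PIN put path must subtract the whole bias and then inspect
	 * the result, which is what put_devmap_managed_user_page() does.
	 */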

[1] https://lwn.net/Articles/784574/ "Some slow progress on
get_user_pages()"

Suggested-by: Jan Kara <jack@suse.cz>
Suggested-by: Jérôme Glisse <jglisse@redhat.com>
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
Signed-off-by: John Hubbard <jhubbard@nvidia.com>
---
 include/linux/mm.h       |  80 +++++++++++----
 include/linux/mmzone.h   |   2 +
 include/linux/page_ref.h |  10 ++
 mm/gup.c                 | 213 +++++++++++++++++++++++++++++++--------
 mm/huge_memory.c         |  32 +++++-
 mm/hugetlb.c             |  28 ++++-
 mm/memremap.c            |   4 +-
 mm/vmstat.c              |   2 +
 8 files changed, 300 insertions(+), 71 deletions(-)

Comments

Jerome Glisse Nov. 4, 2019, 6:52 p.m. UTC | #1
On Sun, Nov 03, 2019 at 01:18:07PM -0800, John Hubbard wrote:
> Add tracking of pages that were pinned via FOLL_PIN.
> 
> As mentioned in the FOLL_PIN documentation, callers who effectively set
> FOLL_PIN are required to ultimately free such pages via put_user_page().
> The effect is similar to FOLL_GET, and may be thought of as "FOLL_GET
> for DIO and/or RDMA use".
> 
> Pages that have been pinned via FOLL_PIN are identifiable via a
> new function call:
> 
>    bool page_dma_pinned(struct page *page);
> 
> What to do in response to encountering such a page, is left to later
> patchsets. There is discussion about this in [1].
> 
> This also changes a BUG_ON(), to a WARN_ON(), in follow_page_mask().
> 
> This also has a couple of trivial, non-functional change fixes to
> try_get_compound_head(). That function got moved to the top of the
> file.

Maybe split that as a separate trivial patch.

> 
> This includes the following fix from Ira Weiny:
> 
> DAX requires detection of a page crossing to a ref count of 1.  Fix this
> for GUP pages by introducing put_devmap_managed_user_page() which
> accounts for GUP_PIN_COUNTING_BIAS now used by GUP.

Please do the put_devmap_managed_page() changes in a separate
patch; it would be a lot easier to follow. Also, on that front,
see the comments below.

> 
> [1] https://lwn.net/Articles/784574/ "Some slow progress on
> get_user_pages()"
> 
> Suggested-by: Jan Kara <jack@suse.cz>
> Suggested-by: Jérôme Glisse <jglisse@redhat.com>
> Signed-off-by: Ira Weiny <ira.weiny@intel.com>
> Signed-off-by: John Hubbard <jhubbard@nvidia.com>
> ---
>  include/linux/mm.h       |  80 +++++++++++----
>  include/linux/mmzone.h   |   2 +
>  include/linux/page_ref.h |  10 ++
>  mm/gup.c                 | 213 +++++++++++++++++++++++++++++++--------
>  mm/huge_memory.c         |  32 +++++-
>  mm/hugetlb.c             |  28 ++++-
>  mm/memremap.c            |   4 +-
>  mm/vmstat.c              |   2 +
>  8 files changed, 300 insertions(+), 71 deletions(-)
> 
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index cdfb6fedb271..03b3600843b7 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -972,9 +972,10 @@ static inline bool is_zone_device_page(const struct page *page)
>  #endif
>  
>  #ifdef CONFIG_DEV_PAGEMAP_OPS
> -void __put_devmap_managed_page(struct page *page);
> +void __put_devmap_managed_page(struct page *page, int count);
>  DECLARE_STATIC_KEY_FALSE(devmap_managed_key);
> -static inline bool put_devmap_managed_page(struct page *page)
> +
> +static inline bool page_is_devmap_managed(struct page *page)
>  {
>  	if (!static_branch_unlikely(&devmap_managed_key))
>  		return false;
> @@ -983,7 +984,6 @@ static inline bool put_devmap_managed_page(struct page *page)
>  	switch (page->pgmap->type) {
>  	case MEMORY_DEVICE_PRIVATE:
>  	case MEMORY_DEVICE_FS_DAX:
> -		__put_devmap_managed_page(page);
>  		return true;
>  	default:
>  		break;
> @@ -991,6 +991,19 @@ static inline bool put_devmap_managed_page(struct page *page)
>  	return false;
>  }
>  
> +static inline bool put_devmap_managed_page(struct page *page)
> +{
> +	bool is_devmap = page_is_devmap_managed(page);
> +
> +	if (is_devmap) {
> +		int count = page_ref_dec_return(page);
> +
> +		__put_devmap_managed_page(page, count);
> +	}
> +
> +	return is_devmap;
> +}

I think __put_devmap_managed_page() should be renamed
to free_devmap_managed_page(), and that the count != 1
case should move into this inline function, i.e.:

static inline bool put_devmap_managed_page(struct page *page)
{
	bool is_devmap = page_is_devmap_managed(page);

	if (is_devmap) {
		int count = page_ref_dec_return(page);

		/*
		 * If refcount is 1 then page is freed and refcount is stable as nobody
		 * holds a reference on the page.
		 */
		if (count == 1)
			free_devmap_managed_page(page, count);
		else if (!count)
			__put_page(page);
	}

	return is_devmap;
}


> +
>  #else /* CONFIG_DEV_PAGEMAP_OPS */
>  static inline bool put_devmap_managed_page(struct page *page)
>  {
> @@ -1038,6 +1051,8 @@ static inline __must_check bool try_get_page(struct page *page)
>  	return true;
>  }
>  
> +__must_check bool user_page_ref_inc(struct page *page);
> +

What about having it as an inline here, as it is pretty small?


>  static inline void put_page(struct page *page)
>  {
>  	page = compound_head(page);
> @@ -1055,31 +1070,56 @@ static inline void put_page(struct page *page)
>  		__put_page(page);
>  }
>  
> -/**
> - * put_user_page() - release a gup-pinned page
> - * @page:            pointer to page to be released
> +/*
> + * GUP_PIN_COUNTING_BIAS, and the associated functions that use it, overload
> + * the page's refcount so that two separate items are tracked: the original page
> + * reference count, and also a new count of how many get_user_pages() calls were
> + * made against the page. ("gup-pinned" is another term for the latter).
> + *
> + * With this scheme, get_user_pages() becomes special: such pages are marked
> + * as distinct from normal pages. As such, the new put_user_page() call (and
> + * its variants) must be used in order to release gup-pinned pages.
> + *
> + * Choice of value:
>   *
> - * Pages that were pinned via get_user_pages*() must be released via
> - * either put_user_page(), or one of the put_user_pages*() routines
> - * below. This is so that eventually, pages that are pinned via
> - * get_user_pages*() can be separately tracked and uniquely handled. In
> - * particular, interactions with RDMA and filesystems need special
> - * handling.
> + * By making GUP_PIN_COUNTING_BIAS a power of two, debugging of page reference
> + * counts with respect to get_user_pages() and put_user_page() becomes simpler,
> + * due to the fact that adding an even power of two to the page refcount has
> + * the effect of using only the upper N bits, for the code that counts up using
> + * the bias value. This means that the lower bits are left for the exclusive
> + * use of the original code that increments and decrements by one (or at least,
> + * by much smaller values than the bias value).
>   *
> - * put_user_page() and put_page() are not interchangeable, despite this early
> - * implementation that makes them look the same. put_user_page() calls must
> - * be perfectly matched up with get_user_page() calls.
> + * Of course, once the lower bits overflow into the upper bits (and this is
> + * OK, because subtraction recovers the original values), then visual inspection
> + * no longer suffices to directly view the separate counts. However, for normal
> + * applications that don't have huge page reference counts, this won't be an
> + * issue.
> + *
> + * Locking: the lockless algorithm described in page_cache_get_speculative()
> + * and page_cache_gup_pin_speculative() provides safe operation for
> + * get_user_pages and page_mkclean and other calls that race to set up page
> + * table entries.
>   */
> -static inline void put_user_page(struct page *page)
> -{
> -	put_page(page);
> -}
> +#define GUP_PIN_COUNTING_BIAS (1UL << 10)
>  
> +void put_user_page(struct page *page);
>  void put_user_pages_dirty_lock(struct page **pages, unsigned long npages,
>  			       bool make_dirty);
> -
>  void put_user_pages(struct page **pages, unsigned long npages);
>  
> +/**
> + * page_dma_pinned() - report if a page is pinned by a call to pin_user_pages*()
> + * or pin_longterm_pages*()
> + * @page:	pointer to page to be queried.
> + * @Return:	True, if it is likely that the page has been "dma-pinned".
> + *		False, if the page is definitely not dma-pinned.
> + */

Maybe add a small comment about wrap around :)

> +static inline bool page_dma_pinned(struct page *page)
> +{
> +	return (page_ref_count(compound_head(page))) >= GUP_PIN_COUNTING_BIAS;
> +}
> +
>  #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
>  #define SECTION_IN_PAGE_FLAGS
>  #endif

[...]

> diff --git a/mm/gup.c b/mm/gup.c
> index 1aea48427879..c9727e65fad3 100644
> --- a/mm/gup.c
> +++ b/mm/gup.c

[...]

> @@ -1930,12 +2028,20 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr,
>  
>  		pgmap = get_dev_pagemap(pfn, pgmap);
>  		if (unlikely(!pgmap)) {
> -			undo_dev_pagemap(nr, nr_start, pages);
> +			undo_dev_pagemap(nr, nr_start, flags, pages);
>  			return 0;
>  		}
>  		SetPageReferenced(page);
>  		pages[*nr] = page;
> -		get_page(page);
> +
> +		if (flags & FOLL_PIN) {
> +			if (unlikely(!user_page_ref_inc(page))) {
> +				undo_dev_pagemap(nr, nr_start, flags, pages);
> +				return 0;
> +			}

Maybe add a comment about a case that should never happen, i.e.
user_page_ref_inc() failing after the second iteration of the
loop, as it would be broken (and a bug) to call undo_dev_pagemap()
after the first iteration of that loop.

Also, I believe that this should never happen: if the first
iteration succeeds, then __page_cache_add_speculative() will
succeed for all the iterations.

Note that the pgmap case above follows that too, i.e. the call to
get_dev_pagemap() can only fail on the first iteration of the loop;
I assume you can never have a huge device page that spans different
pgmaps, i.e. different devices (which is a reasonable assumption).
So maybe this code needs fixing, i.e.:

		pgmap = get_dev_pagemap(pfn, pgmap);
		if (unlikely(!pgmap))
			return 0;


> +		} else
> +			get_page(page);
> +
>  		(*nr)++;
>  		pfn++;
>  	} while (addr += PAGE_SIZE, addr != end);

[...]

> @@ -2409,7 +2540,7 @@ static int internal_get_user_pages_fast(unsigned long start, int nr_pages,
>  	unsigned long addr, len, end;
>  	int nr = 0, ret = 0;
>  
> -	if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM)))
> +	if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM | FOLL_PIN)))

Maybe add a comment to explain, something like:

/*
 * The only flags allowed here are: FOLL_WRITE, FOLL_LONGTERM, FOLL_PIN
 *
 * Note that get_user_pages_fast() imply FOLL_GET flag by default but
 * callers can over-ride this default to pin case by setting FOLL_PIN.
 */

>  		return -EINVAL;
>  
>  	start = untagged_addr(start) & PAGE_MASK;
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 13cc93785006..66bf4c8b88f1 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c

[...]

> @@ -968,7 +973,12 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
>  	if (!*pgmap)
>  		return ERR_PTR(-EFAULT);
>  	page = pfn_to_page(pfn);
> -	get_page(page);
> +
> +	if (flags & FOLL_GET)
> +		get_page(page);
> +	else if (flags & FOLL_PIN)
> +		if (unlikely(!user_page_ref_inc(page)))
> +			page = ERR_PTR(-ENOMEM);

While I agree that user_page_ref_inc() (i.e. page_cache_add_speculative())
should never fail here, as we are holding the pmd lock and thus no one
can unmap the pmd and free the page it points to, I believe you should
return -EFAULT like for the pgmap case, and not -ENOMEM, as the pgmap
lookup should not fail either for the same reason. Thus it would be
better to have a consistent error. Maybe also add a comment explaining
that it should not fail here.
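
Something along these lines, perhaps (untested, just to illustrate the
suggestion):

	if (flags & FOLL_GET)
		get_page(page);
	else if (flags & FOLL_PIN) {
		/*
		 * This should not fail: we hold the pmd lock, so nobody can
		 * unmap the pmd and free the page under us. Use the same
		 * error as a failed pgmap lookup, for consistency.
		 */
		if (unlikely(!user_page_ref_inc(page)))
			page = ERR_PTR(-EFAULT);
	}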

>  
>  	return page;
>  }

[...]

> @@ -1100,7 +1115,7 @@ struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
>  	 * device mapped pages can only be returned if the
>  	 * caller will manage the page reference count.
>  	 */
> -	if (!(flags & FOLL_GET))
> +	if (!(flags & (FOLL_GET | FOLL_PIN)))
>  		return ERR_PTR(-EEXIST);

Maybe add a comment that FOLL_GET or FOLL_PIN must be set.

>  	pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT;
> @@ -1108,7 +1123,12 @@ struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
>  	if (!*pgmap)
>  		return ERR_PTR(-EFAULT);
>  	page = pfn_to_page(pfn);
> -	get_page(page);
> +
> +	if (flags & FOLL_GET)
> +		get_page(page);
> +	else if (flags & FOLL_PIN)
> +		if (unlikely(!user_page_ref_inc(page)))
> +			page = ERR_PTR(-ENOMEM);

Same as for follow_devmap_pmd(); see above.

>  
>  	return page;
>  }
> @@ -1522,8 +1542,12 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
>  skip_mlock:
>  	page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
>  	VM_BUG_ON_PAGE(!PageCompound(page) && !is_zone_device_page(page), page);
> +
>  	if (flags & FOLL_GET)
>  		get_page(page);
> +	else if (flags & FOLL_PIN)
> +		if (unlikely(!user_page_ref_inc(page)))
> +			page = NULL;

This should not fail either, as we are holding the pmd lock; maybe add
a comment. Dunno if we want a WARN() or something to catch this
degenerate case, or dump the page.
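
E.g. something like (sketch only, untested):

	if (flags & FOLL_GET)
		get_page(page);
	else if (flags & FOLL_PIN) {
		/*
		 * This cannot really fail: the pmd lock is held, so the page
		 * cannot go away underneath us. Warn and dump the page if it
		 * ever does, since that would indicate a refcounting bug.
		 */
		if (WARN_ON_ONCE(!user_page_ref_inc(page))) {
			dump_page(page, "user_page_ref_inc() failed");
			page = NULL;
		}
	}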

>  
>  out:
>  	return page;
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index b45a95363a84..da335b1cd798 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -4462,7 +4462,17 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
>  same_page:
>  		if (pages) {
>  			pages[i] = mem_map_offset(page, pfn_offset);
> -			get_page(pages[i]);
> +
> +			if (flags & FOLL_GET)
> +				get_page(pages[i]);
> +			else if (flags & FOLL_PIN)
> +				if (unlikely(!user_page_ref_inc(pages[i]))) {
> +					spin_unlock(ptl);
> +					remainder = 0;
> +					err = -ENOMEM;
> +					WARN_ON_ONCE(1);
> +					break;
> +				}
>  		}

user_page_ref_inc() should not fail here either, because we hold the
ptl, so the WARN_ON_ONCE() is right, but maybe add a comment.

>  
>  		if (vmas)

[...]

> @@ -5034,8 +5050,14 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
>  	pte = huge_ptep_get((pte_t *)pmd);
>  	if (pte_present(pte)) {
>  		page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT);
> +
>  		if (flags & FOLL_GET)
>  			get_page(page);
> +		else if (flags & FOLL_PIN)
> +			if (unlikely(!user_page_ref_inc(page))) {
> +				page = NULL;
> +				goto out;
> +			}

This should not fail either (again, we are holding the pmd lock); dunno
if we want a WARN() or something to catch this degenerate case.

>  	} else {
>  		if (is_hugetlb_entry_migration(pte)) {
>  			spin_unlock(ptl);

[...]
John Hubbard Nov. 4, 2019, 10:49 p.m. UTC | #2
On 11/4/19 10:52 AM, Jerome Glisse wrote:
> On Sun, Nov 03, 2019 at 01:18:07PM -0800, John Hubbard wrote:
>> Add tracking of pages that were pinned via FOLL_PIN.
>>
>> As mentioned in the FOLL_PIN documentation, callers who effectively set
>> FOLL_PIN are required to ultimately free such pages via put_user_page().
>> The effect is similar to FOLL_GET, and may be thought of as "FOLL_GET
>> for DIO and/or RDMA use".
>>
>> Pages that have been pinned via FOLL_PIN are identifiable via a
>> new function call:
>>
>>    bool page_dma_pinned(struct page *page);
>>
>> What to do in response to encountering such a page, is left to later
>> patchsets. There is discussion about this in [1].
>>
>> This also changes a BUG_ON(), to a WARN_ON(), in follow_page_mask().
>>
>> This also has a couple of trivial, non-functional change fixes to
>> try_get_compound_head(). That function got moved to the top of the
>> file.
> 
> Maybe split that as a separate trivial patch.


Will do.


> 
>>
>> This includes the following fix from Ira Weiny:
>>
>> DAX requires detection of a page crossing to a ref count of 1.  Fix this
>> for GUP pages by introducing put_devmap_managed_user_page() which
>> accounts for GUP_PIN_COUNTING_BIAS now used by GUP.
> 
> Please do the put_devmap_managed_page() changes in a separate
> patch, it would be a lot easier to follow, also on that front
> see comments below.


Oh! OK. It makes sense when you say it out loud. :)


...
>> +static inline bool put_devmap_managed_page(struct page *page)
>> +{
>> +	bool is_devmap = page_is_devmap_managed(page);
>> +
>> +	if (is_devmap) {
>> +		int count = page_ref_dec_return(page);
>> +
>> +		__put_devmap_managed_page(page, count);
>> +	}
>> +
>> +	return is_devmap;
>> +}
> 
> I think the __put_devmap_managed_page() should be rename
> to free_devmap_managed_page() and that the count != 1
> case move to this inline function ie:
> 
> static inline bool put_devmap_managed_page(struct page *page)
> {
> 	bool is_devmap = page_is_devmap_managed(page);
> 
> 	if (is_devmap) {
> 		int count = page_ref_dec_return(page);
> 
> 		/*
> 		 * If refcount is 1 then page is freed and refcount is stable as nobody
> 		 * holds a reference on the page.
> 		 */
> 		if (count == 1)
> 			free_devmap_managed_page(page, count);
> 		else if (!count)
> 			__put_page(page);
> 	}
> 
> 	return is_devmap;
> }
> 

Thanks, that does look cleaner and easier to read.

> 
>> +
>>  #else /* CONFIG_DEV_PAGEMAP_OPS */
>>  static inline bool put_devmap_managed_page(struct page *page)
>>  {
>> @@ -1038,6 +1051,8 @@ static inline __must_check bool try_get_page(struct page *page)
>>  	return true;
>>  }
>>  
>> +__must_check bool user_page_ref_inc(struct page *page);
>> +
> 
> What about having it as an inline here as it is pretty small.


You mean move it to a static inline function in mm.h? It's worse than it 
looks, though: *everything* that it calls is also a static function, local
to gup.c. So I'd have to expose both try_get_compound_head() and
__update_proc_vmstat(). And that also means calling mod_node_page_state() from
mm.h, and it goes south right about there. :)


...  
>> +/**
>> + * page_dma_pinned() - report if a page is pinned by a call to pin_user_pages*()
>> + * or pin_longterm_pages*()
>> + * @page:	pointer to page to be queried.
>> + * @Return:	True, if it is likely that the page has been "dma-pinned".
>> + *		False, if the page is definitely not dma-pinned.
>> + */
> 
> Maybe add a small comment about wrap around :)


I don't *think* the count can wrap around, due to the checks in user_page_ref_inc().

But it's true that the documentation is a little light here... What did you have 
in mind?


> [...]
> 
>> @@ -1930,12 +2028,20 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr,
>>  
>>  		pgmap = get_dev_pagemap(pfn, pgmap);
>>  		if (unlikely(!pgmap)) {
>> -			undo_dev_pagemap(nr, nr_start, pages);
>> +			undo_dev_pagemap(nr, nr_start, flags, pages);
>>  			return 0;
>>  		}
>>  		SetPageReferenced(page);
>>  		pages[*nr] = page;
>> -		get_page(page);
>> +
>> +		if (flags & FOLL_PIN) {
>> +			if (unlikely(!user_page_ref_inc(page))) {
>> +				undo_dev_pagemap(nr, nr_start, flags, pages);
>> +				return 0;
>> +			}
> 
> Maybe add a comment about a case that should never happens ie
> user_page_ref_inc() fails after the second iteration of the
> loop as it would be broken and a bug to call undo_dev_pagemap()
> after the first iteration of that loop.
> 
> Also i believe that this should never happens as if first
> iteration succeed than __page_cache_add_speculative() will
> succeed for all the iterations.
> 
> Note that the pgmap case above follows that too ie the call to
> get_dev_pagemap() can only fail on first iteration of the loop,
> well i assume you can never have a huge device page that span
> different pgmap ie different devices (which is a reasonable
> assumption). So maybe this code needs fixing ie :
> 
> 		pgmap = get_dev_pagemap(pfn, pgmap);
> 		if (unlikely(!pgmap))
> 			return 0;
> 
> 

OK, yes that does make sense. And I think a comment is adequate,
no need to check for bugs during every tail page iteration. So how 
about this, as a preliminary patch:

diff --git a/mm/gup.c b/mm/gup.c
index 8f236a335ae9..a4a81e125832 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1892,17 +1892,18 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
 static int __gup_device_huge(unsigned long pfn, unsigned long addr,
                unsigned long end, struct page **pages, int *nr)
 {
-       int nr_start = *nr;
-       struct dev_pagemap *pgmap = NULL;
+       /*
+        * Huge pages should never cross dev_pagemap boundaries. Therefore, use
+        * this same pgmap for the entire huge page.
+        */
+       struct dev_pagemap *pgmap = get_dev_pagemap(pfn, NULL);
+
+       if (unlikely(!pgmap))
+               return 0;
 
        do {
                struct page *page = pfn_to_page(pfn);
 
-               pgmap = get_dev_pagemap(pfn, pgmap);
-               if (unlikely(!pgmap)) {
-                       undo_dev_pagemap(nr, nr_start, pages);
-                       return 0;
-               }
                SetPageReferenced(page);
                pages[*nr] = page;
                get_page(page);




>> +		} else
>> +			get_page(page);
>> +
>>  		(*nr)++;
>>  		pfn++;
>>  	} while (addr += PAGE_SIZE, addr != end);
> 
> [...]
> 
>> @@ -2409,7 +2540,7 @@ static int internal_get_user_pages_fast(unsigned long start, int nr_pages,
>>  	unsigned long addr, len, end;
>>  	int nr = 0, ret = 0;
>>  
>> -	if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM)))
>> +	if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM | FOLL_PIN)))
> 
> Maybe add a comments to explain, something like:
> 
> /*
>  * The only flags allowed here are: FOLL_WRITE, FOLL_LONGTERM, FOLL_PIN
>  *
>  * Note that get_user_pages_fast() imply FOLL_GET flag by default but
>  * callers can over-ride this default to pin case by setting FOLL_PIN.
>  */

Good idea. Here's the draft now:

/*
 * The only flags allowed here are: FOLL_WRITE, FOLL_LONGTERM, FOLL_PIN.
 *
 * Note that get_user_pages_fast() implies FOLL_GET flag by default, but
 * callers can override this default by setting FOLL_PIN instead of
 * FOLL_GET.
 */
if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM | FOLL_PIN)))
        return -EINVAL;

> 
>>  		return -EINVAL;
>>  
>>  	start = untagged_addr(start) & PAGE_MASK;
>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>> index 13cc93785006..66bf4c8b88f1 100644
>> --- a/mm/huge_memory.c
>> +++ b/mm/huge_memory.c
> 
> [...]
> 
>> @@ -968,7 +973,12 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
>>  	if (!*pgmap)
>>  		return ERR_PTR(-EFAULT);
>>  	page = pfn_to_page(pfn);
>> -	get_page(page);
>> +
>> +	if (flags & FOLL_GET)
>> +		get_page(page);
>> +	else if (flags & FOLL_PIN)
>> +		if (unlikely(!user_page_ref_inc(page)))
>> +			page = ERR_PTR(-ENOMEM);
> 
> While i agree that user_page_ref_inc() (ie page_cache_add_speculative())
> should never fails here as we are holding the pmd lock and thus no one
> can unmap the pmd and free the page it points to. I believe you should
> return -EFAULT like for the pgmap and not -ENOMEM as the pgmap should
> not fail either for the same reason. Thus it would be better to have
> consistent error. Maybe also add a comments explaining that it should
> not fail here.
> 

OK. I'll take a pass through and fix up the remaining points about these
sorts of cases below, as well, in v3. Those all make sense.

>>  
>>  	return page;
>>  }
> 
> [...]
> 
>> @@ -1100,7 +1115,7 @@ struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
>>  	 * device mapped pages can only be returned if the
>>  	 * caller will manage the page reference count.
>>  	 */
>> -	if (!(flags & FOLL_GET))
>> +	if (!(flags & (FOLL_GET | FOLL_PIN)))
>>  		return ERR_PTR(-EEXIST);
> 
> Maybe add a comment that FOLL_GET or FOLL_PIN must be set.
> 
>>  	pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT;
>> @@ -1108,7 +1123,12 @@ struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
>>  	if (!*pgmap)
>>  		return ERR_PTR(-EFAULT);
>>  	page = pfn_to_page(pfn);
>> -	get_page(page);
>> +
>> +	if (flags & FOLL_GET)
>> +		get_page(page);
>> +	else if (flags & FOLL_PIN)
>> +		if (unlikely(!user_page_ref_inc(page)))
>> +			page = ERR_PTR(-ENOMEM);
> 
> Same as for follow_devmap_pmd() see above.
> 
>>  
>>  	return page;
>>  }
>> @@ -1522,8 +1542,12 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
>>  skip_mlock:
>>  	page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
>>  	VM_BUG_ON_PAGE(!PageCompound(page) && !is_zone_device_page(page), page);
>> +
>>  	if (flags & FOLL_GET)
>>  		get_page(page);
>> +	else if (flags & FOLL_PIN)
>> +		if (unlikely(!user_page_ref_inc(page)))
>> +			page = NULL;
> 
> This should not fail either as we are holding the pmd lock maybe add
> a comment. Dunno if we want a WARN() or something to catch this
> degenerate case, or dump the page.
> 
>>  
>>  out:
>>  	return page;
>> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
>> index b45a95363a84..da335b1cd798 100644
>> --- a/mm/hugetlb.c
>> +++ b/mm/hugetlb.c
>> @@ -4462,7 +4462,17 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
>>  same_page:
>>  		if (pages) {
>>  			pages[i] = mem_map_offset(page, pfn_offset);
>> -			get_page(pages[i]);
>> +
>> +			if (flags & FOLL_GET)
>> +				get_page(pages[i]);
>> +			else if (flags & FOLL_PIN)
>> +				if (unlikely(!user_page_ref_inc(pages[i]))) {
>> +					spin_unlock(ptl);
>> +					remainder = 0;
>> +					err = -ENOMEM;
>> +					WARN_ON_ONCE(1);
>> +					break;
>> +				}
>>  		}
> 
> user_page_ref_inc() should not fail here either because we hold the
> ptl, so the WAR_ON_ONCE() is right but maybe add a comment.
> 
>>  
>>  		if (vmas)
> 
> [...]
> 
>> @@ -5034,8 +5050,14 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
>>  	pte = huge_ptep_get((pte_t *)pmd);
>>  	if (pte_present(pte)) {
>>  		page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT);
>> +
>>  		if (flags & FOLL_GET)
>>  			get_page(page);
>> +		else if (flags & FOLL_PIN)
>> +			if (unlikely(!user_page_ref_inc(page))) {
>> +				page = NULL;
>> +				goto out;
>> +			}
> 
> This should not fail either (again holding pmd lock), dunno if we want
> a warn or something to catch this degenerate case.
> 
>>  	} else {
>>  		if (is_hugetlb_entry_migration(pte)) {
>>  			spin_unlock(ptl);
> 
> [...]
> 
> 

Those are all good points, working on them now.

thanks,
Jerome Glisse Nov. 4, 2019, 11:49 p.m. UTC | #3
On Mon, Nov 04, 2019 at 02:49:18PM -0800, John Hubbard wrote:
> On 11/4/19 10:52 AM, Jerome Glisse wrote:
> > On Sun, Nov 03, 2019 at 01:18:07PM -0800, John Hubbard wrote:
> >> Add tracking of pages that were pinned via FOLL_PIN.
> >>
> >> As mentioned in the FOLL_PIN documentation, callers who effectively set
> >> FOLL_PIN are required to ultimately free such pages via put_user_page().
> >> The effect is similar to FOLL_GET, and may be thought of as "FOLL_GET
> >> for DIO and/or RDMA use".
> >>
> >> Pages that have been pinned via FOLL_PIN are identifiable via a
> >> new function call:
> >>
> >>    bool page_dma_pinned(struct page *page);
> >>
> >> What to do in response to encountering such a page, is left to later
> >> patchsets. There is discussion about this in [1].
> >>
> >> This also changes a BUG_ON(), to a WARN_ON(), in follow_page_mask().
> >>
> >> This also has a couple of trivial, non-functional change fixes to
> >> try_get_compound_head(). That function got moved to the top of the
> >> file.
> > 
> > Maybe split that as a separate trivial patch.
> 
> 
> Will do.
> 
> 
> > 
> >>
> >> This includes the following fix from Ira Weiny:
> >>
> >> DAX requires detection of a page crossing to a ref count of 1.  Fix this
> >> for GUP pages by introducing put_devmap_managed_user_page() which
> >> accounts for GUP_PIN_COUNTING_BIAS now used by GUP.
> > 
> > Please do the put_devmap_managed_page() changes in a separate
> > patch, it would be a lot easier to follow, also on that front
> > see comments below.
> 
> 
> Oh! OK. It makes sense when you say it out loud. :)
> 
> 
> ...
> >> +static inline bool put_devmap_managed_page(struct page *page)
> >> +{
> >> +	bool is_devmap = page_is_devmap_managed(page);
> >> +
> >> +	if (is_devmap) {
> >> +		int count = page_ref_dec_return(page);
> >> +
> >> +		__put_devmap_managed_page(page, count);
> >> +	}
> >> +
> >> +	return is_devmap;
> >> +}
> > 
> > I think the __put_devmap_managed_page() should be rename
> > to free_devmap_managed_page() and that the count != 1
> > case move to this inline function ie:
> > 
> > static inline bool put_devmap_managed_page(struct page *page)
> > {
> > 	bool is_devmap = page_is_devmap_managed(page);
> > 
> > 	if (is_devmap) {
> > 		int count = page_ref_dec_return(page);
> > 
> > 		/*
> > 		 * If refcount is 1 then page is freed and refcount is stable as nobody
> > 		 * holds a reference on the page.
> > 		 */
> > 		if (count == 1)
> > 			free_devmap_managed_page(page, count);
> > 		else if (!count)
> > 			__put_page(page);
> > 	}
> > 
> > 	return is_devmap;
> > }
> > 
> 
> Thanks, that does look cleaner and easier to read.
> 
> > 
> >> +
> >>  #else /* CONFIG_DEV_PAGEMAP_OPS */
> >>  static inline bool put_devmap_managed_page(struct page *page)
> >>  {
> >> @@ -1038,6 +1051,8 @@ static inline __must_check bool try_get_page(struct page *page)
> >>  	return true;
> >>  }
> >>  
> >> +__must_check bool user_page_ref_inc(struct page *page);
> >> +
> > 
> > What about having it as an inline here as it is pretty small.
> 
> 
> You mean move it to a static inline function in mm.h? It's worse than it 
> looks, though: *everything* that it calls is also a static function, local
> to gup.c. So I'd have to expose both try_get_compound_head() and
> __update_proc_vmstat(). And that also means calling mod_node_page_state() from
> mm.h, and it goes south right about there. :)

Ok fair enough

> ...  
> >> +/**
> >> + * page_dma_pinned() - report if a page is pinned by a call to pin_user_pages*()
> >> + * or pin_longterm_pages*()
> >> + * @page:	pointer to page to be queried.
> >> + * @Return:	True, if it is likely that the page has been "dma-pinned".
> >> + *		False, if the page is definitely not dma-pinned.
> >> + */
> > 
> > Maybe add a small comment about wrap around :)
> 
> 
> I don't *think* the count can wrap around, due to the checks in user_page_ref_inc().
> 
> But it's true that the documentation is a little light here...What did you have 
> in mind?

About the false positive cases (and how unlikely they are), and that
wrap-around is properly handled. Maybe just a pointer to the documentation,
so that people know they can go look there for details. I know my
brain tends to forget where to look for things, so I like to be constantly
reminded: hey, the doc is Documentation/foobar :)

> > [...]
> > 
> >> @@ -1930,12 +2028,20 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr,
> >>  
> >>  		pgmap = get_dev_pagemap(pfn, pgmap);
> >>  		if (unlikely(!pgmap)) {
> >> -			undo_dev_pagemap(nr, nr_start, pages);
> >> +			undo_dev_pagemap(nr, nr_start, flags, pages);
> >>  			return 0;
> >>  		}
> >>  		SetPageReferenced(page);
> >>  		pages[*nr] = page;
> >> -		get_page(page);
> >> +
> >> +		if (flags & FOLL_PIN) {
> >> +			if (unlikely(!user_page_ref_inc(page))) {
> >> +				undo_dev_pagemap(nr, nr_start, flags, pages);
> >> +				return 0;
> >> +			}
> > 
> > Maybe add a comment about a case that should never happens ie
> > user_page_ref_inc() fails after the second iteration of the
> > loop as it would be broken and a bug to call undo_dev_pagemap()
> > after the first iteration of that loop.
> > 
> > Also i believe that this should never happens as if first
> > iteration succeed than __page_cache_add_speculative() will
> > succeed for all the iterations.
> > 
> > Note that the pgmap case above follows that too ie the call to
> > get_dev_pagemap() can only fail on first iteration of the loop,
> > well i assume you can never have a huge device page that span
> > different pgmap ie different devices (which is a reasonable
> > assumption). So maybe this code needs fixing ie :
> > 
> > 		pgmap = get_dev_pagemap(pfn, pgmap);
> > 		if (unlikely(!pgmap))
> > 			return 0;
> > 
> > 
> 
> OK, yes that does make sense. And I think a comment is adequate,
> no need to check for bugs during every tail page iteration. So how 
> about this, as a preliminary patch:

Actually, I thought about it and I think that there is a pgmap
per section, and thus maybe one device can have multiple pgmaps,
and that would be an issue for pages bigger than the section size
(i.e. bigger than 128MB, iirc). I will go double-check that, but
maybe Dan can chime in.

In any case, my comment above is correct for the page ref
increment: if the first one succeeds then the others will too,
or otherwise it means someone is doing too many put_page()/
put_user_page() calls, which is _bad_ :)

> 
> diff --git a/mm/gup.c b/mm/gup.c
> index 8f236a335ae9..a4a81e125832 100644
> --- a/mm/gup.c
> +++ b/mm/gup.c
> @@ -1892,17 +1892,18 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
>  static int __gup_device_huge(unsigned long pfn, unsigned long addr,
>                 unsigned long end, struct page **pages, int *nr)
>  {
> -       int nr_start = *nr;
> -       struct dev_pagemap *pgmap = NULL;
> +       /*
> +        * Huge pages should never cross dev_pagemap boundaries. Therefore, use
> +        * this same pgmap for the entire huge page.
> +        */
> +       struct dev_pagemap *pgmap = get_dev_pagemap(pfn, NULL);
> +
> +       if (unlikely(!pgmap))
> +               return 0;
>  
>         do {
>                 struct page *page = pfn_to_page(pfn);
>  
> -               pgmap = get_dev_pagemap(pfn, pgmap);
> -               if (unlikely(!pgmap)) {
> -                       undo_dev_pagemap(nr, nr_start, pages);
> -                       return 0;
> -               }
>                 SetPageReferenced(page);
>                 pages[*nr] = page;
>                 get_page(page);
> 
> 
> 
> 
> >> +		} else
> >> +			get_page(page);
> >> +
> >>  		(*nr)++;
> >>  		pfn++;
> >>  	} while (addr += PAGE_SIZE, addr != end);
> > 
> > [...]
> > 
> >> @@ -2409,7 +2540,7 @@ static int internal_get_user_pages_fast(unsigned long start, int nr_pages,
> >>  	unsigned long addr, len, end;
> >>  	int nr = 0, ret = 0;
> >>  
> >> -	if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM)))
> >> +	if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM | FOLL_PIN)))
> > 
> > Maybe add a comments to explain, something like:
> > 
> > /*
> >  * The only flags allowed here are: FOLL_WRITE, FOLL_LONGTERM, FOLL_PIN
> >  *
> >  * Note that get_user_pages_fast() imply FOLL_GET flag by default but
> >  * callers can over-ride this default to pin case by setting FOLL_PIN.
> >  */
> 
> Good idea. Here's the draft now:
> 
> /*
>  * The only flags allowed here are: FOLL_WRITE, FOLL_LONGTERM, FOLL_PIN.
>  *
>  * Note that get_user_pages_fast() implies FOLL_GET flag by default, but
>  * callers can override this default by setting FOLL_PIN instead of
>  * FOLL_GET.
>  */
> if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM | FOLL_PIN)))
>         return -EINVAL;

Looks good to me.

...

Cheers,
Jérôme
John Hubbard Nov. 5, 2019, 12:18 a.m. UTC | #4
Hi Dan, there is a question for you further down:


On 11/4/19 3:49 PM, Jerome Glisse wrote:
> On Mon, Nov 04, 2019 at 02:49:18PM -0800, John Hubbard wrote:
...
>>> Maybe add a small comment about wrap around :)
>>
>>
>> I don't *think* the count can wrap around, due to the checks in user_page_ref_inc().
>>
>> But it's true that the documentation is a little light here...What did you have 
>> in mind?
> 
> About false positive case (and how unlikely they are) and that wrap
> around is properly handle. Maybe just a pointer to the documentation
> so that people know they can go look there for details. I know my
> brain tend to forget where to look for things so i like to be constantly
> reminded hey the doc is Documentations/foobar :)
> 

I see. OK, here's a version with a thoroughly overhauled comment header:

/**
 * page_dma_pinned() - report if a page is pinned for DMA.
 *
 * This function checks if a page has been pinned via a call to
 * pin_user_pages*() or pin_longterm_pages*().
 *
 * The return value is partially fuzzy: false is not fuzzy, because it means
 * "definitely not pinned for DMA", but true means "probably pinned for DMA, but
 * possibly a false positive due to having at least GUP_PIN_COUNTING_BIAS worth
 * of normal page references".
 *
 * False positives are OK, because: a) it's unlikely for a page to get that many
 * refcounts, and b) all the callers of this routine are expected to be able to
 * deal gracefully with a false positive.
 *
 * For more information, please see Documentation/vm/pin_user_pages.rst.
 *
 * @page:	pointer to page to be queried.
 * @Return:	True, if it is likely that the page has been "dma-pinned".
 *		False, if the page is definitely not dma-pinned.
 */
static inline bool page_dma_pinned(struct page *page)
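{
	/* Body copied unchanged from the patch; only the comment header is new. */
	return (page_ref_count(compound_head(page))) >= GUP_PIN_COUNTING_BIAS;
}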


>>> [...]
>>>
>>>> @@ -1930,12 +2028,20 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr,
>>>>  
>>>>  		pgmap = get_dev_pagemap(pfn, pgmap);
>>>>  		if (unlikely(!pgmap)) {
>>>> -			undo_dev_pagemap(nr, nr_start, pages);
>>>> +			undo_dev_pagemap(nr, nr_start, flags, pages);
>>>>  			return 0;
>>>>  		}
>>>>  		SetPageReferenced(page);
>>>>  		pages[*nr] = page;
>>>> -		get_page(page);
>>>> +
>>>> +		if (flags & FOLL_PIN) {
>>>> +			if (unlikely(!user_page_ref_inc(page))) {
>>>> +				undo_dev_pagemap(nr, nr_start, flags, pages);
>>>> +				return 0;
>>>> +			}
>>>
>>> Maybe add a comment about a case that should never happens ie
>>> user_page_ref_inc() fails after the second iteration of the
>>> loop as it would be broken and a bug to call undo_dev_pagemap()
>>> after the first iteration of that loop.
>>>
>>> Also i believe that this should never happens as if first
>>> iteration succeed than __page_cache_add_speculative() will
>>> succeed for all the iterations.
>>>
>>> Note that the pgmap case above follows that too ie the call to
>>> get_dev_pagemap() can only fail on first iteration of the loop,
>>> well i assume you can never have a huge device page that span
>>> different pgmap ie different devices (which is a reasonable
>>> assumption). So maybe this code needs fixing ie :
>>>
>>> 		pgmap = get_dev_pagemap(pfn, pgmap);
>>> 		if (unlikely(!pgmap))
>>> 			return 0;
>>>
>>>
>>
>> OK, yes that does make sense. And I think a comment is adequate,
>> no need to check for bugs during every tail page iteration. So how 
>> about this, as a preliminary patch:
> 
> Actualy i thought about it and i think that there is pgmap
> per section and thus maybe one device can have multiple pgmap
> and that would be an issue for page bigger than section size
> (ie bigger than 128MB iirc). I will go double check that, but
> maybe Dan can chime in.
> 
> In any case my comment above is correct for the page ref
> increment, if the first one succeed than others will too
> or otherwise it means someone is doing too many put_page()/
> put_user_page() which is _bad_ :)
> 

I'll wait to hear from Dan before doing anything rash. :)


thanks,

John Hubbard
NVIDIA

Patch

diff --git a/include/linux/mm.h b/include/linux/mm.h
index cdfb6fedb271..03b3600843b7 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -972,9 +972,10 @@  static inline bool is_zone_device_page(const struct page *page)
 #endif
 
 #ifdef CONFIG_DEV_PAGEMAP_OPS
-void __put_devmap_managed_page(struct page *page);
+void __put_devmap_managed_page(struct page *page, int count);
 DECLARE_STATIC_KEY_FALSE(devmap_managed_key);
-static inline bool put_devmap_managed_page(struct page *page)
+
+static inline bool page_is_devmap_managed(struct page *page)
 {
 	if (!static_branch_unlikely(&devmap_managed_key))
 		return false;
@@ -983,7 +984,6 @@  static inline bool put_devmap_managed_page(struct page *page)
 	switch (page->pgmap->type) {
 	case MEMORY_DEVICE_PRIVATE:
 	case MEMORY_DEVICE_FS_DAX:
-		__put_devmap_managed_page(page);
 		return true;
 	default:
 		break;
@@ -991,6 +991,19 @@  static inline bool put_devmap_managed_page(struct page *page)
 	return false;
 }
 
+static inline bool put_devmap_managed_page(struct page *page)
+{
+	bool is_devmap = page_is_devmap_managed(page);
+
+	if (is_devmap) {
+		int count = page_ref_dec_return(page);
+
+		__put_devmap_managed_page(page, count);
+	}
+
+	return is_devmap;
+}
+
 #else /* CONFIG_DEV_PAGEMAP_OPS */
 static inline bool put_devmap_managed_page(struct page *page)
 {
@@ -1038,6 +1051,8 @@  static inline __must_check bool try_get_page(struct page *page)
 	return true;
 }
 
+__must_check bool user_page_ref_inc(struct page *page);
+
 static inline void put_page(struct page *page)
 {
 	page = compound_head(page);
@@ -1055,31 +1070,56 @@  static inline void put_page(struct page *page)
 		__put_page(page);
 }
 
-/**
- * put_user_page() - release a gup-pinned page
- * @page:            pointer to page to be released
+/*
+ * GUP_PIN_COUNTING_BIAS, and the associated functions that use it, overload
+ * the page's refcount so that two separate items are tracked: the original page
+ * reference count, and also a new count of how many get_user_pages() calls were
+ * made against the page. ("gup-pinned" is another term for the latter).
+ *
+ * With this scheme, get_user_pages() becomes special: such pages are marked
+ * as distinct from normal pages. As such, the new put_user_page() call (and
+ * its variants) must be used in order to release gup-pinned pages.
+ *
+ * Choice of value:
  *
- * Pages that were pinned via get_user_pages*() must be released via
- * either put_user_page(), or one of the put_user_pages*() routines
- * below. This is so that eventually, pages that are pinned via
- * get_user_pages*() can be separately tracked and uniquely handled. In
- * particular, interactions with RDMA and filesystems need special
- * handling.
+ * By making GUP_PIN_COUNTING_BIAS a power of two, debugging of page reference
+ * counts with respect to get_user_pages() and put_user_page() becomes simpler,
+ * due to the fact that adding an even power of two to the page refcount has
+ * the effect of using only the upper N bits, for the code that counts up using
+ * the bias value. This means that the lower bits are left for the exclusive
+ * use of the original code that increments and decrements by one (or at least,
+ * by much smaller values than the bias value).
  *
- * put_user_page() and put_page() are not interchangeable, despite this early
- * implementation that makes them look the same. put_user_page() calls must
- * be perfectly matched up with get_user_page() calls.
+ * Of course, once the lower bits overflow into the upper bits (and this is
+ * OK, because subtraction recovers the original values), then visual inspection
+ * no longer suffices to directly view the separate counts. However, for normal
+ * applications that don't have huge page reference counts, this won't be an
+ * issue.
+ *
+ * Locking: the lockless algorithm described in page_cache_get_speculative()
+ * and page_cache_gup_pin_speculative() provides safe operation for
+ * get_user_pages and page_mkclean and other calls that race to set up page
+ * table entries.
  */
-static inline void put_user_page(struct page *page)
-{
-	put_page(page);
-}
+#define GUP_PIN_COUNTING_BIAS (1UL << 10)
 
+void put_user_page(struct page *page);
 void put_user_pages_dirty_lock(struct page **pages, unsigned long npages,
 			       bool make_dirty);
-
 void put_user_pages(struct page **pages, unsigned long npages);
 
+/**
+ * page_dma_pinned() - report if a page is pinned by a call to pin_user_pages*()
+ * or pin_longterm_pages*()
+ * @page:	pointer to page to be queried.
+ * @Return:	True, if it is likely that the page has been "dma-pinned".
+ *		False, if the page is definitely not dma-pinned.
+ */
+static inline bool page_dma_pinned(struct page *page)
+{
+	return (page_ref_count(compound_head(page))) >= GUP_PIN_COUNTING_BIAS;
+}
+
 #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
 #define SECTION_IN_PAGE_FLAGS
 #endif
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index bda20282746b..0485cba38d23 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -244,6 +244,8 @@  enum node_stat_item {
 	NR_DIRTIED,		/* page dirtyings since bootup */
 	NR_WRITTEN,		/* page writings since bootup */
 	NR_KERNEL_MISC_RECLAIMABLE,	/* reclaimable non-slab kernel pages */
+	NR_FOLL_PIN_REQUESTED,	/* via: pin_user_page(), gup flag: FOLL_PIN */
+	NR_FOLL_PIN_RETURNED,	/* pages returned via put_user_page() */
 	NR_VM_NODE_STAT_ITEMS
 };
 
diff --git a/include/linux/page_ref.h b/include/linux/page_ref.h
index 14d14beb1f7f..b9cbe553d1e7 100644
--- a/include/linux/page_ref.h
+++ b/include/linux/page_ref.h
@@ -102,6 +102,16 @@  static inline void page_ref_sub(struct page *page, int nr)
 		__page_ref_mod(page, -nr);
 }
 
+static inline int page_ref_sub_return(struct page *page, int nr)
+{
+	int ret = atomic_sub_return(nr, &page->_refcount);
+
+	if (page_ref_tracepoint_active(__tracepoint_page_ref_mod))
+		__page_ref_mod(page, -nr);
+
+	return ret;
+}
+
 static inline void page_ref_inc(struct page *page)
 {
 	atomic_inc(&page->_refcount);
diff --git a/mm/gup.c b/mm/gup.c
index 1aea48427879..c9727e65fad3 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -29,6 +29,102 @@  struct follow_page_context {
 	unsigned int page_mask;
 };
 
+/*
+ * Return the compound head page with ref appropriately incremented,
+ * or NULL if that failed.
+ */
+static inline struct page *try_get_compound_head(struct page *page, int refs)
+{
+	struct page *head = compound_head(page);
+
+	if (WARN_ON_ONCE(page_ref_count(head) < 0))
+		return NULL;
+	if (unlikely(!page_cache_add_speculative(head, refs)))
+		return NULL;
+	return head;
+}
+
+#ifdef CONFIG_DEBUG_VM
+static inline void __update_proc_vmstat(struct page *page,
+					enum node_stat_item item, int count)
+{
+	mod_node_page_state(page_pgdat(page), item, count);
+}
+#else
+static inline void __update_proc_vmstat(struct page *page,
+					enum node_stat_item item, int count)
+{
+}
+#endif
+
+/**
+ * user_page_ref_inc() - mark a page as being used by get_user_pages(FOLL_PIN).
+ *
+ * @page:	pointer to page to be marked
+ * @Return:	true for success, false for failure
+ */
+__must_check bool user_page_ref_inc(struct page *page)
+{
+	page = try_get_compound_head(page, GUP_PIN_COUNTING_BIAS);
+	if (!page)
+		return false;
+
+	__update_proc_vmstat(page, NR_FOLL_PIN_REQUESTED, 1);
+	return true;
+}
+
+#ifdef CONFIG_DEV_PAGEMAP_OPS
+static bool __put_devmap_managed_user_page(struct page *page)
+{
+	bool is_devmap = page_is_devmap_managed(page);
+
+	if (is_devmap) {
+		int count = page_ref_sub_return(page, GUP_PIN_COUNTING_BIAS);
+
+		__update_proc_vmstat(page, NR_FOLL_PIN_RETURNED, 1);
+		__put_devmap_managed_page(page, count);
+	}
+
+	return is_devmap;
+}
+#else
+static bool __put_devmap_managed_user_page(struct page *page)
+{
+	return false;
+}
+#endif /* CONFIG_DEV_PAGEMAP_OPS */
+
+/**
+ * put_user_page() - release a gup-pinned page
+ * @page:            pointer to page to be released
+ *
+ * Pages that were pinned via get_user_pages*() must be released via
+ * either put_user_page(), or one of the put_user_pages*() routines
+ * below. This is so that eventually, pages that are pinned via
+ * get_user_pages*() can be separately tracked and uniquely handled. In
+ * particular, interactions with RDMA and filesystems need special
+ * handling.
+ */
+void put_user_page(struct page *page)
+{
+	page = compound_head(page);
+
+	/*
+	 * For devmap managed pages we need to catch refcount transition from
+	 * GUP_PIN_COUNTING_BIAS to 1, when refcount reach one it means the
+	 * page is free and we need to inform the device driver through
+	 * callback. See include/linux/memremap.h and HMM for details.
+	 */
+	if (__put_devmap_managed_user_page(page))
+		return;
+
+	if (page_ref_sub_and_test(page, GUP_PIN_COUNTING_BIAS))
+		__put_page(page);
+
+	__update_proc_vmstat(page, NR_FOLL_PIN_RETURNED, 1);
+}
+EXPORT_SYMBOL(put_user_page);
+
 /**
  * put_user_pages_dirty_lock() - release and optionally dirty gup-pinned pages
  * @pages:  array of pages to be maybe marked dirty, and definitely released.
@@ -215,10 +311,11 @@  static struct page *follow_page_pte(struct vm_area_struct *vma,
 	}
 
 	page = vm_normal_page(vma, address, pte);
-	if (!page && pte_devmap(pte) && (flags & FOLL_GET)) {
+	if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) {
 		/*
-		 * Only return device mapping pages in the FOLL_GET case since
-		 * they are only valid while holding the pgmap reference.
+		 * Only return device mapping pages in the FOLL_GET or FOLL_PIN
+		 * case since they are only valid while holding the pgmap
+		 * reference.
 		 */
 		*pgmap = get_dev_pagemap(pte_pfn(pte), *pgmap);
 		if (*pgmap)
@@ -261,6 +358,11 @@  static struct page *follow_page_pte(struct vm_area_struct *vma,
 			page = ERR_PTR(-ENOMEM);
 			goto out;
 		}
+	} else if (flags & FOLL_PIN) {
+		if (unlikely(!user_page_ref_inc(page))) {
+			page = ERR_PTR(-ENOMEM);
+			goto out;
+		}
 	}
 	if (flags & FOLL_TOUCH) {
 		if ((flags & FOLL_WRITE) &&
@@ -522,8 +624,8 @@  static struct page *follow_page_mask(struct vm_area_struct *vma,
 	/* make this handle hugepd */
 	page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
 	if (!IS_ERR(page)) {
-		BUG_ON(flags & FOLL_GET);
-		return page;
+		WARN_ON_ONCE(flags & (FOLL_GET | FOLL_PIN));
+		return NULL;
 	}
 
 	pgd = pgd_offset(mm, address);
@@ -1812,30 +1914,20 @@  static inline pte_t gup_get_pte(pte_t *ptep)
 #endif /* CONFIG_GUP_GET_PTE_LOW_HIGH */
 
 static void __maybe_unused undo_dev_pagemap(int *nr, int nr_start,
+					    unsigned int flags,
 					    struct page **pages)
 {
 	while ((*nr) - nr_start) {
 		struct page *page = pages[--(*nr)];
 
 		ClearPageReferenced(page);
-		put_page(page);
+		if (flags & FOLL_PIN)
+			put_user_page(page);
+		else
+			put_page(page);
 	}
 }
 
-/*
- * Return the compund head page with ref appropriately incremented,
- * or NULL if that failed.
- */
-static inline struct page *try_get_compound_head(struct page *page, int refs)
-{
-	struct page *head = compound_head(page);
-	if (WARN_ON_ONCE(page_ref_count(head) < 0))
-		return NULL;
-	if (unlikely(!page_cache_add_speculative(head, refs)))
-		return NULL;
-	return head;
-}
-
 #ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
 static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
 			 unsigned int flags, struct page **pages, int *nr)
@@ -1865,7 +1957,7 @@  static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
 
 			pgmap = get_dev_pagemap(pte_pfn(pte), pgmap);
 			if (unlikely(!pgmap)) {
-				undo_dev_pagemap(nr, nr_start, pages);
+				undo_dev_pagemap(nr, nr_start, flags, pages);
 				goto pte_unmap;
 			}
 		} else if (pte_special(pte))
@@ -1874,9 +1966,15 @@  static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
 		VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
 		page = pte_page(pte);
 
-		head = try_get_compound_head(page, 1);
-		if (!head)
-			goto pte_unmap;
+		if (flags & FOLL_PIN) {
+			head = page;
+			if (unlikely(!user_page_ref_inc(head)))
+				goto pte_unmap;
+		} else {
+			head = try_get_compound_head(page, 1);
+			if (!head)
+				goto pte_unmap;
+		}
 
 		if (unlikely(pte_val(pte) != pte_val(*ptep))) {
 			put_page(head);
@@ -1930,12 +2028,20 @@  static int __gup_device_huge(unsigned long pfn, unsigned long addr,
 
 		pgmap = get_dev_pagemap(pfn, pgmap);
 		if (unlikely(!pgmap)) {
-			undo_dev_pagemap(nr, nr_start, pages);
+			undo_dev_pagemap(nr, nr_start, flags, pages);
 			return 0;
 		}
 		SetPageReferenced(page);
 		pages[*nr] = page;
-		get_page(page);
+
+		if (flags & FOLL_PIN) {
+			if (unlikely(!user_page_ref_inc(page))) {
+				undo_dev_pagemap(nr, nr_start, flags, pages);
+				return 0;
+			}
+		} else
+			get_page(page);
+
 		(*nr)++;
 		pfn++;
 	} while (addr += PAGE_SIZE, addr != end);
@@ -1957,7 +2063,7 @@  static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
 		return 0;
 
 	if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
-		undo_dev_pagemap(nr, nr_start, pages);
+		undo_dev_pagemap(nr, nr_start, flags, pages);
 		return 0;
 	}
 	return 1;
@@ -1975,7 +2081,7 @@  static int __gup_device_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
 		return 0;
 
 	if (unlikely(pud_val(orig) != pud_val(*pudp))) {
-		undo_dev_pagemap(nr, nr_start, pages);
+		undo_dev_pagemap(nr, nr_start, flags, pages);
 		return 0;
 	}
 	return 1;
@@ -2059,9 +2165,16 @@  static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
 	page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
 	refs = __record_subpages(page, addr, end, pages, *nr);
 
-	head = try_get_compound_head(head, refs);
-	if (!head)
-		return 0;
+	if (flags & FOLL_PIN) {
+		head = page;
+		if (unlikely(!user_page_ref_inc(head)))
+			return 0;
+		head = page;
+	} else {
+		head = try_get_compound_head(head, refs);
+		if (!head)
+			return 0;
+	}
 
 	if (unlikely(pte_val(pte) != pte_val(*ptep))) {
 		put_compound_head(head, refs);
@@ -2118,9 +2231,15 @@  static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
 	page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
 	refs = __record_subpages(page, addr, end, pages, *nr);
 
-	head = try_get_compound_head(pmd_page(orig), refs);
-	if (!head)
-		return 0;
+	if (flags & FOLL_PIN) {
+		head = page;
+		if (unlikely(!user_page_ref_inc(head)))
+			return 0;
+	} else {
+		head = try_get_compound_head(pmd_page(orig), refs);
+		if (!head)
+			return 0;
+	}
 
 	if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
 		put_compound_head(head, refs);
@@ -2151,9 +2270,15 @@  static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
 	page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
 	refs = __record_subpages(page, addr, end, pages, *nr);
 
-	head = try_get_compound_head(pud_page(orig), refs);
-	if (!head)
-		return 0;
+	if (flags & FOLL_PIN) {
+		head = page;
+		if (unlikely(!user_page_ref_inc(head)))
+			return 0;
+	} else {
+		head = try_get_compound_head(pud_page(orig), refs);
+		if (!head)
+			return 0;
+	}
 
 	if (unlikely(pud_val(orig) != pud_val(*pudp))) {
 		put_compound_head(head, refs);
@@ -2179,9 +2304,15 @@  static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
 	page = pgd_page(orig) + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT);
 	refs = __record_subpages(page, addr, end, pages, *nr);
 
-	head = try_get_compound_head(pgd_page(orig), refs);
-	if (!head)
-		return 0;
+	if (flags & FOLL_PIN) {
+		head = page;
+		if (unlikely(!user_page_ref_inc(head)))
+			return 0;
+	} else {
+		head = try_get_compound_head(pgd_page(orig), refs);
+		if (!head)
+			return 0;
+	}
 
 	if (unlikely(pgd_val(orig) != pgd_val(*pgdp))) {
 		put_compound_head(head, refs);
@@ -2409,7 +2540,7 @@  static int internal_get_user_pages_fast(unsigned long start, int nr_pages,
 	unsigned long addr, len, end;
 	int nr = 0, ret = 0;
 
-	if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM)))
+	if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM | FOLL_PIN)))
 		return -EINVAL;
 
 	start = untagged_addr(start) & PAGE_MASK;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 13cc93785006..66bf4c8b88f1 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -945,6 +945,11 @@  struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
 	 */
 	WARN_ONCE(flags & FOLL_COW, "mm: In follow_devmap_pmd with FOLL_COW set");
 
+	/* FOLL_GET and FOLL_PIN are mutually exclusive. */
+	if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
+			 (FOLL_PIN | FOLL_GET)))
+		return NULL;
+
 	if (flags & FOLL_WRITE && !pmd_write(*pmd))
 		return NULL;
 
@@ -960,7 +965,7 @@  struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
 	 * device mapped pages can only be returned if the
 	 * caller will manage the page reference count.
 	 */
-	if (!(flags & FOLL_GET))
+	if (!(flags & (FOLL_GET | FOLL_PIN)))
 		return ERR_PTR(-EEXIST);
 
 	pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT;
@@ -968,7 +973,12 @@  struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
 	if (!*pgmap)
 		return ERR_PTR(-EFAULT);
 	page = pfn_to_page(pfn);
-	get_page(page);
+
+	if (flags & FOLL_GET)
+		get_page(page);
+	else if (flags & FOLL_PIN)
+		if (unlikely(!user_page_ref_inc(page)))
+			page = ERR_PTR(-ENOMEM);
 
 	return page;
 }
@@ -1088,6 +1098,11 @@  struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
 	if (flags & FOLL_WRITE && !pud_write(*pud))
 		return NULL;
 
+	/* FOLL_GET and FOLL_PIN are mutually exclusive. */
+	if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
+			 (FOLL_PIN | FOLL_GET)))
+		return NULL;
+
 	if (pud_present(*pud) && pud_devmap(*pud))
 		/* pass */;
 	else
@@ -1100,7 +1115,7 @@  struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
 	 * device mapped pages can only be returned if the
 	 * caller will manage the page reference count.
 	 */
-	if (!(flags & FOLL_GET))
+	if (!(flags & (FOLL_GET | FOLL_PIN)))
 		return ERR_PTR(-EEXIST);
 
 	pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT;
@@ -1108,7 +1123,12 @@  struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
 	if (!*pgmap)
 		return ERR_PTR(-EFAULT);
 	page = pfn_to_page(pfn);
-	get_page(page);
+
+	if (flags & FOLL_GET)
+		get_page(page);
+	else if (flags & FOLL_PIN)
+		if (unlikely(!user_page_ref_inc(page)))
+			page = ERR_PTR(-ENOMEM);
 
 	return page;
 }
@@ -1522,8 +1542,12 @@  struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
 skip_mlock:
 	page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
 	VM_BUG_ON_PAGE(!PageCompound(page) && !is_zone_device_page(page), page);
+
 	if (flags & FOLL_GET)
 		get_page(page);
+	else if (flags & FOLL_PIN)
+		if (unlikely(!user_page_ref_inc(page)))
+			page = NULL;
 
 out:
 	return page;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index b45a95363a84..da335b1cd798 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4462,7 +4462,17 @@  long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 same_page:
 		if (pages) {
 			pages[i] = mem_map_offset(page, pfn_offset);
-			get_page(pages[i]);
+
+			if (flags & FOLL_GET)
+				get_page(pages[i]);
+			else if (flags & FOLL_PIN)
+				if (unlikely(!user_page_ref_inc(pages[i]))) {
+					spin_unlock(ptl);
+					remainder = 0;
+					err = -ENOMEM;
+					WARN_ON_ONCE(1);
+					break;
+				}
 		}
 
 		if (vmas)
@@ -5022,6 +5032,12 @@  follow_huge_pmd(struct mm_struct *mm, unsigned long address,
 	struct page *page = NULL;
 	spinlock_t *ptl;
 	pte_t pte;
+
+	/* FOLL_GET and FOLL_PIN are mutually exclusive. */
+	if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
+			 (FOLL_PIN | FOLL_GET)))
+		return NULL;
+
 retry:
 	ptl = pmd_lockptr(mm, pmd);
 	spin_lock(ptl);
@@ -5034,8 +5050,14 @@  follow_huge_pmd(struct mm_struct *mm, unsigned long address,
 	pte = huge_ptep_get((pte_t *)pmd);
 	if (pte_present(pte)) {
 		page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT);
+
 		if (flags & FOLL_GET)
 			get_page(page);
+		else if (flags & FOLL_PIN)
+			if (unlikely(!user_page_ref_inc(page))) {
+				page = NULL;
+				goto out;
+			}
 	} else {
 		if (is_hugetlb_entry_migration(pte)) {
 			spin_unlock(ptl);
@@ -5056,7 +5078,7 @@  struct page * __weak
 follow_huge_pud(struct mm_struct *mm, unsigned long address,
 		pud_t *pud, int flags)
 {
-	if (flags & FOLL_GET)
+	if (flags & (FOLL_GET | FOLL_PIN))
 		return NULL;
 
 	return pte_page(*(pte_t *)pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT);
@@ -5065,7 +5087,7 @@  follow_huge_pud(struct mm_struct *mm, unsigned long address,
 struct page * __weak
 follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int flags)
 {
-	if (flags & FOLL_GET)
+	if (flags & (FOLL_GET | FOLL_PIN))
 		return NULL;
 
 	return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> PAGE_SHIFT);
diff --git a/mm/memremap.c b/mm/memremap.c
index 03ccbdfeb697..3b1c69df1d2a 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -410,10 +410,8 @@  struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
 EXPORT_SYMBOL_GPL(get_dev_pagemap);
 
 #ifdef CONFIG_DEV_PAGEMAP_OPS
-void __put_devmap_managed_page(struct page *page)
+void __put_devmap_managed_page(struct page *page, int count)
 {
-	int count = page_ref_dec_return(page);
-
 	/*
 	 * If refcount is 1 then page is freed and refcount is stable as nobody
 	 * holds a reference on the page.
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 6afc892a148a..65c027d9b637 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1167,6 +1167,8 @@  const char * const vmstat_text[] = {
 	"nr_dirtied",
 	"nr_written",
 	"nr_kernel_misc_reclaimable",
+	"nr_foll_pin_requested",
+	"nr_foll_pin_returned",
 
 	/* enum writeback_stat_item counters */
 	"nr_dirty_threshold",