diff mbox series

[RFC,2/2] mm/gup: introduce vaddr_pin_pages_remote()

Message ID 20190812015044.26176-3-jhubbard@nvidia.com (mailing list archive)
State New, archived
Headers show
Series mm/gup: introduce vaddr_pin_pages_remote(), FOLL_PIN | expand

Commit Message

john.hubbard@gmail.com Aug. 12, 2019, 1:50 a.m. UTC
From: John Hubbard <jhubbard@nvidia.com>

This is the "vaddr_pin_pages" corresponding variant to
get_user_pages_remote(), but with FOLL_PIN semantics: the implementation
sets FOLL_PIN. That, in turn, means that the pages must ultimately be
released by put_user_page*()--typically, via vaddr_unpin_pages*().

Note that the put_user_page*() requirement won't be truly
required until all of the call sites have been converted, and
the tracking of pages is actually activated.

Also introduce vaddr_unpin_pages(), in order to have a simpler
call for the error handling cases.

Use both of these new calls in the Infiniband drive, replacing
get_user_pages_remote() and put_user_pages().

Signed-off-by: John Hubbard <jhubbard@nvidia.com>
---
 drivers/infiniband/core/umem_odp.c | 15 +++++----
 include/linux/mm.h                 |  7 +++++
 mm/gup.c                           | 50 ++++++++++++++++++++++++++++++
 3 files changed, 66 insertions(+), 6 deletions(-)

Comments

Ira Weiny Aug. 12, 2019, 10:03 p.m. UTC | #1
On Sun, Aug 11, 2019 at 06:50:44PM -0700, john.hubbard@gmail.com wrote:
> From: John Hubbard <jhubbard@nvidia.com>
> 
> This is the "vaddr_pin_pages" corresponding variant to
> get_user_pages_remote(), but with FOLL_PIN semantics: the implementation
> sets FOLL_PIN. That, in turn, means that the pages must ultimately be
> released by put_user_page*()--typically, via vaddr_unpin_pages*().
> 
> Note that the put_user_page*() requirement won't be truly
> required until all of the call sites have been converted, and
> the tracking of pages is actually activated.
> 
> Also introduce vaddr_unpin_pages(), in order to have a simpler
> call for the error handling cases.
> 
> Use both of these new calls in the Infiniband drive, replacing
> get_user_pages_remote() and put_user_pages().
> 
> Signed-off-by: John Hubbard <jhubbard@nvidia.com>
> ---
>  drivers/infiniband/core/umem_odp.c | 15 +++++----
>  include/linux/mm.h                 |  7 +++++
>  mm/gup.c                           | 50 ++++++++++++++++++++++++++++++
>  3 files changed, 66 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c
> index 53085896d718..fdff034a8a30 100644
> --- a/drivers/infiniband/core/umem_odp.c
> +++ b/drivers/infiniband/core/umem_odp.c
> @@ -534,7 +534,7 @@ static int ib_umem_odp_map_dma_single_page(
>  	}
>  
>  out:
> -	put_user_page(page);
> +	vaddr_unpin_pages(&page, 1, &umem_odp->umem.vaddr_pin);
>  
>  	if (remove_existing_mapping) {
>  		ib_umem_notifier_start_account(umem_odp);
> @@ -635,9 +635,10 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt,
>  		 * complex (and doesn't gain us much performance in most use
>  		 * cases).
>  		 */
> -		npages = get_user_pages_remote(owning_process, owning_mm,
> +		npages = vaddr_pin_pages_remote(owning_process, owning_mm,
>  				user_virt, gup_num_pages,
> -				flags, local_page_list, NULL, NULL);
> +				flags, local_page_list, NULL, NULL,
> +				&umem_odp->umem.vaddr_pin);
>  		up_read(&owning_mm->mmap_sem);
>  
>  		if (npages < 0) {
> @@ -657,7 +658,8 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt,
>  					ret = -EFAULT;
>  					break;
>  				}
> -				put_user_page(local_page_list[j]);
> +				vaddr_unpin_pages(&local_page_list[j], 1,
> +						  &umem_odp->umem.vaddr_pin);
>  				continue;
>  			}
>  
> @@ -684,8 +686,9 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt,
>  			 * ib_umem_odp_map_dma_single_page().
>  			 */
>  			if (npages - (j + 1) > 0)
> -				put_user_pages(&local_page_list[j+1],
> -					       npages - (j + 1));
> +				vaddr_unpin_pages(&local_page_list[j+1],
> +						  npages - (j + 1),
> +						  &umem_odp->umem.vaddr_pin);
>  			break;
>  		}
>  	}
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 61b616cd9243..2bd76ad8787e 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -1606,6 +1606,13 @@ int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
>  long vaddr_pin_pages(unsigned long addr, unsigned long nr_pages,
>  		     unsigned int gup_flags, struct page **pages,
>  		     struct vaddr_pin *vaddr_pin);
> +long vaddr_pin_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
> +			    unsigned long start, unsigned long nr_pages,
> +			    unsigned int gup_flags, struct page **pages,
> +			    struct vm_area_struct **vmas, int *locked,
> +			    struct vaddr_pin *vaddr_pin);
> +void vaddr_unpin_pages(struct page **pages, unsigned long nr_pages,
> +		       struct vaddr_pin *vaddr_pin);
>  void vaddr_unpin_pages_dirty_lock(struct page **pages, unsigned long nr_pages,
>  				  struct vaddr_pin *vaddr_pin, bool make_dirty);
>  bool mapping_inode_has_layout(struct vaddr_pin *vaddr_pin, struct page *page);
> diff --git a/mm/gup.c b/mm/gup.c
> index 85f09958fbdc..bb95adfaf9b6 100644
> --- a/mm/gup.c
> +++ b/mm/gup.c
> @@ -2518,6 +2518,38 @@ long vaddr_pin_pages(unsigned long addr, unsigned long nr_pages,
>  }
>  EXPORT_SYMBOL(vaddr_pin_pages);
>  
> +/**
> + * vaddr_pin_pages pin pages by virtual address and return the pages to the

vaddr_pin_pages_remote

Fixed in my tree.

> + * user.
> + *
> + * @tsk:	the task_struct to use for page fault accounting, or
> + *		NULL if faults are not to be recorded.
> + * @mm:		mm_struct of target mm
> + * @addr:	start address
> + * @nr_pages:	number of pages to pin
> + * @gup_flags:	flags to use for the pin
> + * @pages:	array of pages returned
> + * @vaddr_pin:	initialized meta information this pin is to be associated
> + * with.
> + *
> + * This is the "vaddr_pin_pages" corresponding variant to
> + * get_user_pages_remote(), but with FOLL_PIN semantics: the implementation sets
> + * FOLL_PIN. That, in turn, means that the pages must ultimately be released
> + * by put_user_page().
> + */
> +long vaddr_pin_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
> +			    unsigned long start, unsigned long nr_pages,
> +			    unsigned int gup_flags, struct page **pages,
> +			    struct vm_area_struct **vmas, int *locked,
> +			    struct vaddr_pin *vaddr_pin)
> +{
> +	gup_flags |= FOLL_TOUCH | FOLL_REMOTE | FOLL_PIN;
> +
> +	return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas,
> +				       locked, gup_flags, vaddr_pin);
> +}
> +EXPORT_SYMBOL(vaddr_pin_pages_remote);
> +
>  /**
>   * vaddr_unpin_pages_dirty_lock - counterpart to vaddr_pin_pages
>   *
> @@ -2536,3 +2568,21 @@ void vaddr_unpin_pages_dirty_lock(struct page **pages, unsigned long nr_pages,
>  	__put_user_pages_dirty_lock(vaddr_pin, pages, nr_pages, make_dirty);
>  }
>  EXPORT_SYMBOL(vaddr_unpin_pages_dirty_lock);
> +
> +/**
> + * vaddr_unpin_pages - simple, non-dirtying counterpart to vaddr_pin_pages
> + *
> + * @pages: array of pages returned
> + * @nr_pages: number of pages in pages
> + * @vaddr_pin: same information passed to vaddr_pin_pages
> + *
> + * Like vaddr_unpin_pages_dirty_lock, but for non-dirty pages. Useful in putting
> + * back pages in an error case: they were never made dirty.
> + */
> +void vaddr_unpin_pages(struct page **pages, unsigned long nr_pages,
> +		       struct vaddr_pin *vaddr_pin)
> +{
> +	__put_user_pages_dirty_lock(vaddr_pin, pages, nr_pages, false);
> +}
> +EXPORT_SYMBOL(vaddr_unpin_pages);

Rather than have another wrapping call why don't we just do this?  Would it be
so bad to just have to specify false for make_dirty?


diff --git a/mm/gup.c b/mm/gup.c
index e77b250c1307..ca660a5e8206 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -2540,7 +2540,7 @@ long vaddr_pin_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
 EXPORT_SYMBOL(vaddr_pin_pages_remote);
 
 /**
- * vaddr_unpin_pages_dirty_lock - counterpart to vaddr_pin_pages
+ * vaddr_unpin_pages - counterpart to vaddr_pin_pages
  *
  * @pages: array of pages returned
  * @nr_pages: number of pages in pages
@@ -2551,26 +2551,9 @@ EXPORT_SYMBOL(vaddr_pin_pages_remote);
  * in vaddr_pin_pages should be passed back into this call for proper
  * tracking.
  */
-void vaddr_unpin_pages_dirty_lock(struct page **pages, unsigned long nr_pages,
-                                 struct vaddr_pin *vaddr_pin, bool make_dirty)
+void vaddr_unpin_pages(struct page **pages, unsigned long nr_pages,
+                      struct vaddr_pin *vaddr_pin, bool make_dirty)
 {
        __put_user_pages_dirty_lock(vaddr_pin, pages, nr_pages, make_dirty);
 }
 EXPORT_SYMBOL(vaddr_unpin_pages_dirty_lock);
-
-/**
- * vaddr_unpin_pages - simple, non-dirtying counterpart to vaddr_pin_pages
- *
- * @pages: array of pages returned
- * @nr_pages: number of pages in pages
- * @vaddr_pin: same information passed to vaddr_pin_pages
- *
- * Like vaddr_unpin_pages_dirty_lock, but for non-dirty pages. Useful in putting
- * back pages in an error case: they were never made dirty.
- */
-void vaddr_unpin_pages(struct page **pages, unsigned long nr_pages,
-                      struct vaddr_pin *vaddr_pin)
-{
-       __put_user_pages_dirty_lock(vaddr_pin, pages, nr_pages, false);
-}
-EXPORT_SYMBOL(vaddr_unpin_pages);
John Hubbard Aug. 12, 2019, 10:21 p.m. UTC | #2
On 8/12/19 3:03 PM, Ira Weiny wrote:
> On Sun, Aug 11, 2019 at 06:50:44PM -0700, john.hubbard@gmail.com wrote:
>> From: John Hubbard <jhubbard@nvidia.com>
...
>> +/**
>> + * vaddr_pin_pages pin pages by virtual address and return the pages to the
> 
> vaddr_pin_pages_remote
> 
> Fixed in my tree.


thanks. :)


> 
>> + * user.
>> + *
>> + * @tsk:	the task_struct to use for page fault accounting, or
>> + *		NULL if faults are not to be recorded.
>> + * @mm:		mm_struct of target mm
>> + * @addr:	start address
>> + * @nr_pages:	number of pages to pin
>> + * @gup_flags:	flags to use for the pin
>> + * @pages:	array of pages returned
>> + * @vaddr_pin:	initialized meta information this pin is to be associated
>> + * with.
>> + *
>> + * This is the "vaddr_pin_pages" corresponding variant to
>> + * get_user_pages_remote(), but with FOLL_PIN semantics: the implementation sets
>> + * FOLL_PIN. That, in turn, means that the pages must ultimately be released
>> + * by put_user_page().
>> + */
>> +long vaddr_pin_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
>> +			    unsigned long start, unsigned long nr_pages,
>> +			    unsigned int gup_flags, struct page **pages,
>> +			    struct vm_area_struct **vmas, int *locked,
>> +			    struct vaddr_pin *vaddr_pin)
>> +{
>> +	gup_flags |= FOLL_TOUCH | FOLL_REMOTE | FOLL_PIN;
>> +
>> +	return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas,
>> +				       locked, gup_flags, vaddr_pin);
>> +}
>> +EXPORT_SYMBOL(vaddr_pin_pages_remote);
>> +
>>  /**
>>   * vaddr_unpin_pages_dirty_lock - counterpart to vaddr_pin_pages
>>   *
>> @@ -2536,3 +2568,21 @@ void vaddr_unpin_pages_dirty_lock(struct page **pages, unsigned long nr_pages,
>>  	__put_user_pages_dirty_lock(vaddr_pin, pages, nr_pages, make_dirty);
>>  }
>>  EXPORT_SYMBOL(vaddr_unpin_pages_dirty_lock);
>> +
>> +/**
>> + * vaddr_unpin_pages - simple, non-dirtying counterpart to vaddr_pin_pages
>> + *
>> + * @pages: array of pages returned
>> + * @nr_pages: number of pages in pages
>> + * @vaddr_pin: same information passed to vaddr_pin_pages
>> + *
>> + * Like vaddr_unpin_pages_dirty_lock, but for non-dirty pages. Useful in putting
>> + * back pages in an error case: they were never made dirty.
>> + */
>> +void vaddr_unpin_pages(struct page **pages, unsigned long nr_pages,
>> +		       struct vaddr_pin *vaddr_pin)
>> +{
>> +	__put_user_pages_dirty_lock(vaddr_pin, pages, nr_pages, false);
>> +}
>> +EXPORT_SYMBOL(vaddr_unpin_pages);
> 
> Rather than have another wrapping call why don't we just do this?  Would it be
> so bad to just have to specify false for make_dirty?

Sure, passing in false for make_dirty is fine, and in fact, there may even be
error cases I've forgotten about that *want* to dirty the page. 

I thought about these variants, and realized that we don't generally need to 
say "lock" anymore, because we're going to forcibly use set_page_dirty_lock 
(rather than set_page_dirty) in this part of the code. And a shorter name 
is nice. Since you've dropped both "_dirty" and "_lock" from the function 
name, it's still nice and short even though we pass in make_dirty as an arg.

So that's a long-winded, "the API below looks good to me". :)

> 
> 
> diff --git a/mm/gup.c b/mm/gup.c
> index e77b250c1307..ca660a5e8206 100644
> --- a/mm/gup.c
> +++ b/mm/gup.c
> @@ -2540,7 +2540,7 @@ long vaddr_pin_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
>  EXPORT_SYMBOL(vaddr_pin_pages_remote);
>  
>  /**
> - * vaddr_unpin_pages_dirty_lock - counterpart to vaddr_pin_pages
> + * vaddr_unpin_pages - counterpart to vaddr_pin_pages
>   *
>   * @pages: array of pages returned
>   * @nr_pages: number of pages in pages
> @@ -2551,26 +2551,9 @@ EXPORT_SYMBOL(vaddr_pin_pages_remote);
>   * in vaddr_pin_pages should be passed back into this call for proper
>   * tracking.
>   */
> -void vaddr_unpin_pages_dirty_lock(struct page **pages, unsigned long nr_pages,
> -                                 struct vaddr_pin *vaddr_pin, bool make_dirty)
> +void vaddr_unpin_pages(struct page **pages, unsigned long nr_pages,
> +                      struct vaddr_pin *vaddr_pin, bool make_dirty)
>  {
>         __put_user_pages_dirty_lock(vaddr_pin, pages, nr_pages, make_dirty);
>  }
>  EXPORT_SYMBOL(vaddr_unpin_pages_dirty_lock);
> -
> -/**
> - * vaddr_unpin_pages - simple, non-dirtying counterpart to vaddr_pin_pages
> - *
> - * @pages: array of pages returned
> - * @nr_pages: number of pages in pages
> - * @vaddr_pin: same information passed to vaddr_pin_pages
> - *
> - * Like vaddr_unpin_pages_dirty_lock, but for non-dirty pages. Useful in putting
> - * back pages in an error case: they were never made dirty.
> - */
> -void vaddr_unpin_pages(struct page **pages, unsigned long nr_pages,
> -                      struct vaddr_pin *vaddr_pin)
> -{
> -       __put_user_pages_dirty_lock(vaddr_pin, pages, nr_pages, false);
> -}
> -EXPORT_SYMBOL(vaddr_unpin_pages);
> 

thanks,
Ira Weiny Aug. 12, 2019, 11:49 p.m. UTC | #3
On Sun, Aug 11, 2019 at 06:50:44PM -0700, john.hubbard@gmail.com wrote:
> From: John Hubbard <jhubbard@nvidia.com>
> 
> This is the "vaddr_pin_pages" corresponding variant to
> get_user_pages_remote(), but with FOLL_PIN semantics: the implementation
> sets FOLL_PIN. That, in turn, means that the pages must ultimately be
> released by put_user_page*()--typically, via vaddr_unpin_pages*().
> 
> Note that the put_user_page*() requirement won't be truly
> required until all of the call sites have been converted, and
> the tracking of pages is actually activated.
> 
> Also introduce vaddr_unpin_pages(), in order to have a simpler
> call for the error handling cases.
> 
> Use both of these new calls in the Infiniband drive, replacing
> get_user_pages_remote() and put_user_pages().
> 
> Signed-off-by: John Hubbard <jhubbard@nvidia.com>
> ---
>  drivers/infiniband/core/umem_odp.c | 15 +++++----
>  include/linux/mm.h                 |  7 +++++
>  mm/gup.c                           | 50 ++++++++++++++++++++++++++++++
>  3 files changed, 66 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c
> index 53085896d718..fdff034a8a30 100644
> --- a/drivers/infiniband/core/umem_odp.c
> +++ b/drivers/infiniband/core/umem_odp.c
> @@ -534,7 +534,7 @@ static int ib_umem_odp_map_dma_single_page(
>  	}
>  
>  out:
> -	put_user_page(page);
> +	vaddr_unpin_pages(&page, 1, &umem_odp->umem.vaddr_pin);
>  
>  	if (remove_existing_mapping) {
>  		ib_umem_notifier_start_account(umem_odp);
> @@ -635,9 +635,10 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt,
>  		 * complex (and doesn't gain us much performance in most use
>  		 * cases).
>  		 */
> -		npages = get_user_pages_remote(owning_process, owning_mm,
> +		npages = vaddr_pin_pages_remote(owning_process, owning_mm,
>  				user_virt, gup_num_pages,
> -				flags, local_page_list, NULL, NULL);
> +				flags, local_page_list, NULL, NULL,
> +				&umem_odp->umem.vaddr_pin);

Thinking about this part of the patch... is this pin really necessary?  This
code is not doing a long term pin.  The page just needs a reference while we
map it into the devices page tables.  Once that is done we should get notifiers
if anything changes and we can adjust.  right?

Ira

>  		up_read(&owning_mm->mmap_sem);
>  
>  		if (npages < 0) {
> @@ -657,7 +658,8 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt,
>  					ret = -EFAULT;
>  					break;
>  				}
> -				put_user_page(local_page_list[j]);
> +				vaddr_unpin_pages(&local_page_list[j], 1,
> +						  &umem_odp->umem.vaddr_pin);
>  				continue;
>  			}
>  
> @@ -684,8 +686,9 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt,
>  			 * ib_umem_odp_map_dma_single_page().
>  			 */
>  			if (npages - (j + 1) > 0)
> -				put_user_pages(&local_page_list[j+1],
> -					       npages - (j + 1));
> +				vaddr_unpin_pages(&local_page_list[j+1],
> +						  npages - (j + 1),
> +						  &umem_odp->umem.vaddr_pin);
>  			break;
>  		}
>  	}
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 61b616cd9243..2bd76ad8787e 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -1606,6 +1606,13 @@ int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
>  long vaddr_pin_pages(unsigned long addr, unsigned long nr_pages,
>  		     unsigned int gup_flags, struct page **pages,
>  		     struct vaddr_pin *vaddr_pin);
> +long vaddr_pin_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
> +			    unsigned long start, unsigned long nr_pages,
> +			    unsigned int gup_flags, struct page **pages,
> +			    struct vm_area_struct **vmas, int *locked,
> +			    struct vaddr_pin *vaddr_pin);
> +void vaddr_unpin_pages(struct page **pages, unsigned long nr_pages,
> +		       struct vaddr_pin *vaddr_pin);
>  void vaddr_unpin_pages_dirty_lock(struct page **pages, unsigned long nr_pages,
>  				  struct vaddr_pin *vaddr_pin, bool make_dirty);
>  bool mapping_inode_has_layout(struct vaddr_pin *vaddr_pin, struct page *page);
> diff --git a/mm/gup.c b/mm/gup.c
> index 85f09958fbdc..bb95adfaf9b6 100644
> --- a/mm/gup.c
> +++ b/mm/gup.c
> @@ -2518,6 +2518,38 @@ long vaddr_pin_pages(unsigned long addr, unsigned long nr_pages,
>  }
>  EXPORT_SYMBOL(vaddr_pin_pages);
>  
> +/**
> + * vaddr_pin_pages pin pages by virtual address and return the pages to the
> + * user.
> + *
> + * @tsk:	the task_struct to use for page fault accounting, or
> + *		NULL if faults are not to be recorded.
> + * @mm:		mm_struct of target mm
> + * @addr:	start address
> + * @nr_pages:	number of pages to pin
> + * @gup_flags:	flags to use for the pin
> + * @pages:	array of pages returned
> + * @vaddr_pin:	initialized meta information this pin is to be associated
> + * with.
> + *
> + * This is the "vaddr_pin_pages" corresponding variant to
> + * get_user_pages_remote(), but with FOLL_PIN semantics: the implementation sets
> + * FOLL_PIN. That, in turn, means that the pages must ultimately be released
> + * by put_user_page().
> + */
> +long vaddr_pin_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
> +			    unsigned long start, unsigned long nr_pages,
> +			    unsigned int gup_flags, struct page **pages,
> +			    struct vm_area_struct **vmas, int *locked,
> +			    struct vaddr_pin *vaddr_pin)
> +{
> +	gup_flags |= FOLL_TOUCH | FOLL_REMOTE | FOLL_PIN;
> +
> +	return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas,
> +				       locked, gup_flags, vaddr_pin);
> +}
> +EXPORT_SYMBOL(vaddr_pin_pages_remote);
> +
>  /**
>   * vaddr_unpin_pages_dirty_lock - counterpart to vaddr_pin_pages
>   *
> @@ -2536,3 +2568,21 @@ void vaddr_unpin_pages_dirty_lock(struct page **pages, unsigned long nr_pages,
>  	__put_user_pages_dirty_lock(vaddr_pin, pages, nr_pages, make_dirty);
>  }
>  EXPORT_SYMBOL(vaddr_unpin_pages_dirty_lock);
> +
> +/**
> + * vaddr_unpin_pages - simple, non-dirtying counterpart to vaddr_pin_pages
> + *
> + * @pages: array of pages returned
> + * @nr_pages: number of pages in pages
> + * @vaddr_pin: same information passed to vaddr_pin_pages
> + *
> + * Like vaddr_unpin_pages_dirty_lock, but for non-dirty pages. Useful in putting
> + * back pages in an error case: they were never made dirty.
> + */
> +void vaddr_unpin_pages(struct page **pages, unsigned long nr_pages,
> +		       struct vaddr_pin *vaddr_pin)
> +{
> +	__put_user_pages_dirty_lock(vaddr_pin, pages, nr_pages, false);
> +}
> +EXPORT_SYMBOL(vaddr_unpin_pages);
> +
> -- 
> 2.22.0
>
John Hubbard Aug. 13, 2019, 12:07 a.m. UTC | #4
On 8/12/19 4:49 PM, Ira Weiny wrote:
> On Sun, Aug 11, 2019 at 06:50:44PM -0700, john.hubbard@gmail.com wrote:
>> From: John Hubbard <jhubbard@nvidia.com>
...
>> diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c
>> index 53085896d718..fdff034a8a30 100644
>> --- a/drivers/infiniband/core/umem_odp.c
>> +++ b/drivers/infiniband/core/umem_odp.c
>> @@ -534,7 +534,7 @@ static int ib_umem_odp_map_dma_single_page(
>>   	}
>>   
>>   out:
>> -	put_user_page(page);
>> +	vaddr_unpin_pages(&page, 1, &umem_odp->umem.vaddr_pin);
>>   
>>   	if (remove_existing_mapping) {
>>   		ib_umem_notifier_start_account(umem_odp);
>> @@ -635,9 +635,10 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt,
>>   		 * complex (and doesn't gain us much performance in most use
>>   		 * cases).
>>   		 */
>> -		npages = get_user_pages_remote(owning_process, owning_mm,
>> +		npages = vaddr_pin_pages_remote(owning_process, owning_mm,
>>   				user_virt, gup_num_pages,
>> -				flags, local_page_list, NULL, NULL);
>> +				flags, local_page_list, NULL, NULL,
>> +				&umem_odp->umem.vaddr_pin);
> 
> Thinking about this part of the patch... is this pin really necessary?  This
> code is not doing a long term pin.  The page just needs a reference while we
> map it into the devices page tables.  Once that is done we should get notifiers
> if anything changes and we can adjust.  right?
> 

OK, now it's a little interesting: the FOLL_PIN is necessary, but maybe not
FOLL_LONGTERM. Illustrating once again that it's actually necessary to allow
these flags to vary independently.

And that leads to another API refinement idea: let's set FOLL_PIN within the
vaddr_pin_pages*() wrappers, and set FOLL_LONGTER in the *callers* of those
wrappers, yes?

thanks,
Ira Weiny Aug. 13, 2019, 9:08 p.m. UTC | #5
On Mon, Aug 12, 2019 at 05:07:32PM -0700, John Hubbard wrote:
> On 8/12/19 4:49 PM, Ira Weiny wrote:
> > On Sun, Aug 11, 2019 at 06:50:44PM -0700, john.hubbard@gmail.com wrote:
> > > From: John Hubbard <jhubbard@nvidia.com>
> ...
> > > diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c
> > > index 53085896d718..fdff034a8a30 100644
> > > --- a/drivers/infiniband/core/umem_odp.c
> > > +++ b/drivers/infiniband/core/umem_odp.c
> > > @@ -534,7 +534,7 @@ static int ib_umem_odp_map_dma_single_page(
> > >   	}
> > >   out:
> > > -	put_user_page(page);
> > > +	vaddr_unpin_pages(&page, 1, &umem_odp->umem.vaddr_pin);
> > >   	if (remove_existing_mapping) {
> > >   		ib_umem_notifier_start_account(umem_odp);
> > > @@ -635,9 +635,10 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt,
> > >   		 * complex (and doesn't gain us much performance in most use
> > >   		 * cases).
> > >   		 */
> > > -		npages = get_user_pages_remote(owning_process, owning_mm,
> > > +		npages = vaddr_pin_pages_remote(owning_process, owning_mm,
> > >   				user_virt, gup_num_pages,
> > > -				flags, local_page_list, NULL, NULL);
> > > +				flags, local_page_list, NULL, NULL,
> > > +				&umem_odp->umem.vaddr_pin);
> > 
> > Thinking about this part of the patch... is this pin really necessary?  This
> > code is not doing a long term pin.  The page just needs a reference while we
> > map it into the devices page tables.  Once that is done we should get notifiers
> > if anything changes and we can adjust.  right?
> > 
> 
> OK, now it's a little interesting: the FOLL_PIN is necessary, but maybe not
> FOLL_LONGTERM. Illustrating once again that it's actually necessary to allow
> these flags to vary independently.

Why is PIN necessary?  I think we do want all drivers to use the new
user_uaddr_vaddr_pin_user_pages() call...  :-P  But in this case I think a
simple "get" reference is enough to reference the page while we are using it.
If it changes after the "put/unpin" we get a fault which should handle the
change right?

The other issue I have with FOLL_PIN is what does it mean to call "...pin...()"
without FOLL_PIN?

This is another confusion of get_user_pages()...  you can actually call it
without FOLL_GET...  :-/  And you just don't get pages back.  I've never really
dug into how (or if) you "put" them later...

> 
> And that leads to another API refinement idea: let's set FOLL_PIN within the
> vaddr_pin_pages*() wrappers, and set FOLL_LONGTER in the *callers* of those
> wrappers, yes?

I've thought about this before and I think any default flags should simply
define what we want follow_pages to do.

Also, the addition of vaddr_pin information creates an implicit flag which if
not there disallows any file pages from being pinned.  It becomes our new
"longterm" flag.  FOLL_PIN _could_ be what we should use "internally".  But we
could also just use this implicit vaddr_pin flag and not add a new flag.

Finally, I struggle with converting everyone to a new call.  It is more
overhead to use vaddr_pin in the call above because now the GUP code is going
to associate a file pin object with that file when in ODP we don't need that
because the pages can move around.

This overhead may be fine, not sure in this case, but I don't see everyone
wanting it.

Ira
John Hubbard Aug. 14, 2019, 12:51 a.m. UTC | #6
On 8/13/19 2:08 PM, Ira Weiny wrote:
> On Mon, Aug 12, 2019 at 05:07:32PM -0700, John Hubbard wrote:
>> On 8/12/19 4:49 PM, Ira Weiny wrote:
>>> On Sun, Aug 11, 2019 at 06:50:44PM -0700, john.hubbard@gmail.com wrote:
>>>> From: John Hubbard <jhubbard@nvidia.com>
>> ...
>>> Thinking about this part of the patch... is this pin really necessary?  This
>>> code is not doing a long term pin.  The page just needs a reference while we
>>> map it into the devices page tables.  Once that is done we should get notifiers
>>> if anything changes and we can adjust.  right?
>>>
>>
>> OK, now it's a little interesting: the FOLL_PIN is necessary, but maybe not
>> FOLL_LONGTERM. Illustrating once again that it's actually necessary to allow
>> these flags to vary independently.
> 
> Why is PIN necessary?  I think we do want all drivers to use the new
> user_uaddr_vaddr_pin_user_pages() call...  :-P  But in this case I think a
> simple "get" reference is enough to reference the page while we are using it.
> If it changes after the "put/unpin" we get a fault which should handle the
> change right?
> 

FOLL_PIN is necessary because the caller is clearly in the use case that 
requires it--however briefly they might be there. As Jan described it,

"Anything that gets page reference and then touches page data (e.g. direct IO)
needs the new kind of tracking so that filesystem knows someone is messing with
the page data." [1]

> The other issue I have with FOLL_PIN is what does it mean to call "...pin...()"
> without FOLL_PIN?
> 
> This is another confusion of get_user_pages()...  you can actually call it
> without FOLL_GET...  :-/  And you just don't get pages back.  I've never really
> dug into how (or if) you "put" them later...
>

Yes, you are talking to someone who has been suffering through that. The
problem here is that gup() has evolved into a do-everything tool. I think we're
getting closer to teasing it apart into more specific interfaces that do
more limited things.

Anyway, I want FOLL_PIN as a way to help clarify this...

 
>>
>> And that leads to another API refinement idea: let's set FOLL_PIN within the
>> vaddr_pin_pages*() wrappers, and set FOLL_LONGTER in the *callers* of those
>> wrappers, yes?
> 
> I've thought about this before and I think any default flags should simply
> define what we want follow_pages to do.

Hmm, so don't forget that we need to know what gup_fast() should do, too.

Anyway, I'm not worried about which combination of wrapper calls set which flags,
I'm open to suggestion there. But it does still seem to me that we should have
independent FOLL_LONGTERM and FOLL_PIN flags, once the API churn settles. 


> 
> Also, the addition of vaddr_pin information creates an implicit flag which if
> not there disallows any file pages from being pinned.  It becomes our new
> "longterm" flag.  FOLL_PIN _could_ be what we should use "internally".  But we
> could also just use this implicit vaddr_pin flag and not add a new flag.

I'd like to have FOLL_PIN internally, in order to solve the problems that
you just raised, above! Namely, that it's too hard to figure out all the
cases that gup() is handling.

With FOLL_PIN, we know that we should be taking the new pin refcount, and
releasing it via the (currently named) put_user_page*(), ultimately.

> 
> Finally, I struggle with converting everyone to a new call.  It is more
> overhead to use vaddr_pin in the call above because now the GUP code is going
> to associate a file pin object with that file when in ODP we don't need that
> because the pages can move around.

What if the pages in ODP are file-backed? 

> 
> This overhead may be fine, not sure in this case, but I don't see everyone
> wanting it.

I think most callers don't have much choice, otherwise they'll be broken with
file-backed memory.

thanks,
John Hubbard Aug. 14, 2019, 12:56 a.m. UTC | #7
On 8/13/19 5:51 PM, John Hubbard wrote:
> On 8/13/19 2:08 PM, Ira Weiny wrote:
>> On Mon, Aug 12, 2019 at 05:07:32PM -0700, John Hubbard wrote:
>>> On 8/12/19 4:49 PM, Ira Weiny wrote:
>>>> On Sun, Aug 11, 2019 at 06:50:44PM -0700, john.hubbard@gmail.com wrote:
>>>>> From: John Hubbard <jhubbard@nvidia.com>
>>> ...
>> Finally, I struggle with converting everyone to a new call.  It is more
>> overhead to use vaddr_pin in the call above because now the GUP code is going
>> to associate a file pin object with that file when in ODP we don't need that
>> because the pages can move around.
> 
> What if the pages in ODP are file-backed? 
> 

oops, strike that, you're right: in that case, even the file system case is covered.
Don't mind me. :)

>>
>> This overhead may be fine, not sure in this case, but I don't see everyone
>> wanting it.

So now I see why you said that, but I will note that ODP hardware is rare,
and will likely remain rare: replayable page faults require really special 
hardware, and after all this time, we still only have CPUs, GPUs, and the
Mellanox cards that do it.

That leaves a lot of other hardware to take care of.

thanks,
Ira Weiny Aug. 14, 2019, 11:50 p.m. UTC | #8
On Tue, Aug 13, 2019 at 05:56:31PM -0700, John Hubbard wrote:
> On 8/13/19 5:51 PM, John Hubbard wrote:
> > On 8/13/19 2:08 PM, Ira Weiny wrote:
> >> On Mon, Aug 12, 2019 at 05:07:32PM -0700, John Hubbard wrote:
> >>> On 8/12/19 4:49 PM, Ira Weiny wrote:
> >>>> On Sun, Aug 11, 2019 at 06:50:44PM -0700, john.hubbard@gmail.com wrote:
> >>>>> From: John Hubbard <jhubbard@nvidia.com>
> >>> ...
> >> Finally, I struggle with converting everyone to a new call.  It is more
> >> overhead to use vaddr_pin in the call above because now the GUP code is going
> >> to associate a file pin object with that file when in ODP we don't need that
> >> because the pages can move around.
> > 
> > What if the pages in ODP are file-backed? 
> > 
> 
> oops, strike that, you're right: in that case, even the file system case is covered.
> Don't mind me. :)

Ok so are we agreed we will drop the patch to the ODP code?  I'm going to keep
the FOLL_PIN flag and addition in the vaddr_pin_pages.

Ira

> 
> >>
> >> This overhead may be fine, not sure in this case, but I don't see everyone
> >> wanting it.
> 
> So now I see why you said that, but I will note that ODP hardware is rare,
> and will likely remain rare: replayable page faults require really special 
> hardware, and after all this time, we still only have CPUs, GPUs, and the
> Mellanox cards that do it.
> 
> That leaves a lot of other hardware to take care of.
> 
> thanks,
> -- 
> John Hubbard
> NVIDIA
>
John Hubbard Aug. 15, 2019, 12:02 a.m. UTC | #9
On 8/14/19 4:50 PM, Ira Weiny wrote:
> On Tue, Aug 13, 2019 at 05:56:31PM -0700, John Hubbard wrote:
>> On 8/13/19 5:51 PM, John Hubbard wrote:
>>> On 8/13/19 2:08 PM, Ira Weiny wrote:
>>>> On Mon, Aug 12, 2019 at 05:07:32PM -0700, John Hubbard wrote:
>>>>> On 8/12/19 4:49 PM, Ira Weiny wrote:
>>>>>> On Sun, Aug 11, 2019 at 06:50:44PM -0700, john.hubbard@gmail.com wrote:
>>>>>>> From: John Hubbard <jhubbard@nvidia.com>
>>>>> ...
>>>> Finally, I struggle with converting everyone to a new call.  It is more
>>>> overhead to use vaddr_pin in the call above because now the GUP code is going
>>>> to associate a file pin object with that file when in ODP we don't need that
>>>> because the pages can move around.
>>>
>>> What if the pages in ODP are file-backed?
>>>
>>
>> oops, strike that, you're right: in that case, even the file system case is covered.
>> Don't mind me. :)
> 
> Ok so are we agreed we will drop the patch to the ODP code?  I'm going to keep
> the FOLL_PIN flag and addition in the vaddr_pin_pages.
> 

Yes. I hope I'm not overlooking anything, but it all seems to make sense to
let ODP just rely on the MMU notifiers.

thanks,
John Hubbard Aug. 15, 2019, 3:01 a.m. UTC | #10
On 8/14/19 5:02 PM, John Hubbard wrote:
> On 8/14/19 4:50 PM, Ira Weiny wrote:
>> On Tue, Aug 13, 2019 at 05:56:31PM -0700, John Hubbard wrote:
>>> On 8/13/19 5:51 PM, John Hubbard wrote:
>>>> On 8/13/19 2:08 PM, Ira Weiny wrote:
>>>>> On Mon, Aug 12, 2019 at 05:07:32PM -0700, John Hubbard wrote:
>>>>>> On 8/12/19 4:49 PM, Ira Weiny wrote:
>>>>>>> On Sun, Aug 11, 2019 at 06:50:44PM -0700, john.hubbard@gmail.com wrote:
>>>>>>>> From: John Hubbard <jhubbard@nvidia.com>
>>>>>> ...
>>>>> Finally, I struggle with converting everyone to a new call.  It is more
>>>>> overhead to use vaddr_pin in the call above because now the GUP code is going
>>>>> to associate a file pin object with that file when in ODP we don't need that
>>>>> because the pages can move around.
>>>>
>>>> What if the pages in ODP are file-backed?
>>>>
>>>
>>> oops, strike that, you're right: in that case, even the file system case is covered.
>>> Don't mind me. :)
>>
>> Ok so are we agreed we will drop the patch to the ODP code?  I'm going to keep
>> the FOLL_PIN flag and addition in the vaddr_pin_pages.
>>
> 
> Yes. I hope I'm not overlooking anything, but it all seems to make sense to
> let ODP just rely on the MMU notifiers.
> 

Hold on, I *was* forgetting something: this was a two part thing, and you're
conflating the two points, but they need to remain separate and distinct. There
were:

1. FOLL_PIN is necessary because the caller is clearly in the use case that
requires it--however briefly they might be there. As Jan described it,

"Anything that gets page reference and then touches page data (e.g. direct IO)
needs the new kind of tracking so that filesystem knows someone is messing with
the page data." [1]

2. Releasing the pin: for ODP, we can use MMU notifiers instead of requiring a
lease.

This second point does not invalidate the first point. Therefore, I still see the
need for the call within ODP, to something that sets FOLL_PIN. And that means
either vaddr_pin_[user?]_pages_remote, or some other wrapper of your choice. :)

I guess shows that the API might need to be refined. We're trying to solve
two closely related issues, but they're not identical.

thanks,
Jan Kara Aug. 15, 2019, 1:26 p.m. UTC | #11
On Wed 14-08-19 20:01:07, John Hubbard wrote:
> On 8/14/19 5:02 PM, John Hubbard wrote:
> > On 8/14/19 4:50 PM, Ira Weiny wrote:
> > > On Tue, Aug 13, 2019 at 05:56:31PM -0700, John Hubbard wrote:
> > > > On 8/13/19 5:51 PM, John Hubbard wrote:
> > > > > On 8/13/19 2:08 PM, Ira Weiny wrote:
> > > > > > On Mon, Aug 12, 2019 at 05:07:32PM -0700, John Hubbard wrote:
> > > > > > > On 8/12/19 4:49 PM, Ira Weiny wrote:
> > > > > > > > On Sun, Aug 11, 2019 at 06:50:44PM -0700, john.hubbard@gmail.com wrote:
> > > > > > > > > From: John Hubbard <jhubbard@nvidia.com>
> > > > > > > ...
> > > > > > Finally, I struggle with converting everyone to a new call.  It is more
> > > > > > overhead to use vaddr_pin in the call above because now the GUP code is going
> > > > > > to associate a file pin object with that file when in ODP we don't need that
> > > > > > because the pages can move around.
> > > > > 
> > > > > What if the pages in ODP are file-backed?
> > > > > 
> > > > 
> > > > oops, strike that, you're right: in that case, even the file system case is covered.
> > > > Don't mind me. :)
> > > 
> > > Ok so are we agreed we will drop the patch to the ODP code?  I'm going to keep
> > > the FOLL_PIN flag and addition in the vaddr_pin_pages.
> > > 
> > 
> > Yes. I hope I'm not overlooking anything, but it all seems to make sense to
> > let ODP just rely on the MMU notifiers.
> > 
> 
> Hold on, I *was* forgetting something: this was a two part thing, and
> you're conflating the two points, but they need to remain separate and
> distinct. There were:
> 
> 1. FOLL_PIN is necessary because the caller is clearly in the use case that
> requires it--however briefly they might be there. As Jan described it,
> 
> "Anything that gets page reference and then touches page data (e.g.
> direct IO) needs the new kind of tracking so that filesystem knows
> someone is messing with the page data." [1]

So when the GUP user uses MMU notifiers to stop writing to pages whenever
they are writeprotected with page_mkclean(), they don't really need page
pin - their access is then fully equivalent to any other mmap userspace
access and filesystem knows how to deal with those. I forgot out this case
when I wrote the above sentence.

So to sum up there are three cases:
1) DIO case - GUP references to pages serving as DIO buffers are needed for
   relatively short time, no special synchronization with page_mkclean() or
   munmap() => needs FOLL_PIN
2) RDMA case - GUP references to pages serving as DMA buffers needed for a
   long time, no special synchronization with page_mkclean() or munmap()
   => needs FOLL_PIN | FOLL_LONGTERM
   This case has also a special case when the pages are actually DAX. Then
   the caller additionally needs file lease and additional file_pin
   structure is used for tracking this usage.
3) ODP case - GUP references to pages serving as DMA buffers, MMU notifiers
   used to synchronize with page_mkclean() and munmap() => normal page
   references are fine.

								Honza
Jan Kara Aug. 15, 2019, 1:35 p.m. UTC | #12
On Thu 15-08-19 15:26:22, Jan Kara wrote:
> On Wed 14-08-19 20:01:07, John Hubbard wrote:
> > On 8/14/19 5:02 PM, John Hubbard wrote:
> > > On 8/14/19 4:50 PM, Ira Weiny wrote:
> > > > On Tue, Aug 13, 2019 at 05:56:31PM -0700, John Hubbard wrote:
> > > > > On 8/13/19 5:51 PM, John Hubbard wrote:
> > > > > > On 8/13/19 2:08 PM, Ira Weiny wrote:
> > > > > > > On Mon, Aug 12, 2019 at 05:07:32PM -0700, John Hubbard wrote:
> > > > > > > > On 8/12/19 4:49 PM, Ira Weiny wrote:
> > > > > > > > > On Sun, Aug 11, 2019 at 06:50:44PM -0700, john.hubbard@gmail.com wrote:
> > > > > > > > > > From: John Hubbard <jhubbard@nvidia.com>
> > > > > > > > ...
> > > > > > > Finally, I struggle with converting everyone to a new call.  It is more
> > > > > > > overhead to use vaddr_pin in the call above because now the GUP code is going
> > > > > > > to associate a file pin object with that file when in ODP we don't need that
> > > > > > > because the pages can move around.
> > > > > > 
> > > > > > What if the pages in ODP are file-backed?
> > > > > > 
> > > > > 
> > > > > oops, strike that, you're right: in that case, even the file system case is covered.
> > > > > Don't mind me. :)
> > > > 
> > > > Ok so are we agreed we will drop the patch to the ODP code?  I'm going to keep
> > > > the FOLL_PIN flag and addition in the vaddr_pin_pages.
> > > > 
> > > 
> > > Yes. I hope I'm not overlooking anything, but it all seems to make sense to
> > > let ODP just rely on the MMU notifiers.
> > > 
> > 
> > Hold on, I *was* forgetting something: this was a two part thing, and
> > you're conflating the two points, but they need to remain separate and
> > distinct. There were:
> > 
> > 1. FOLL_PIN is necessary because the caller is clearly in the use case that
> > requires it--however briefly they might be there. As Jan described it,
> > 
> > "Anything that gets page reference and then touches page data (e.g.
> > direct IO) needs the new kind of tracking so that filesystem knows
> > someone is messing with the page data." [1]
> 
> So when the GUP user uses MMU notifiers to stop writing to pages whenever
> they are writeprotected with page_mkclean(), they don't really need page
> pin - their access is then fully equivalent to any other mmap userspace
> access and filesystem knows how to deal with those. I forgot out this case
> when I wrote the above sentence.
> 
> So to sum up there are three cases:
> 1) DIO case - GUP references to pages serving as DIO buffers are needed for
>    relatively short time, no special synchronization with page_mkclean() or
>    munmap() => needs FOLL_PIN
> 2) RDMA case - GUP references to pages serving as DMA buffers needed for a
>    long time, no special synchronization with page_mkclean() or munmap()
>    => needs FOLL_PIN | FOLL_LONGTERM
>    This case has also a special case when the pages are actually DAX. Then
>    the caller additionally needs file lease and additional file_pin
>    structure is used for tracking this usage.
> 3) ODP case - GUP references to pages serving as DMA buffers, MMU notifiers
>    used to synchronize with page_mkclean() and munmap() => normal page
>    references are fine.

I want to add that I'd like to convert users in cases 1) and 2) from using
GUP to using differently named function. Users in case 3) can stay as they
are for now although ultimately I'd like to denote such use cases in a
special way as well...

								Honza
Jason Gunthorpe Aug. 15, 2019, 2:51 p.m. UTC | #13
On Thu, Aug 15, 2019 at 03:35:10PM +0200, Jan Kara wrote:

> > 3) ODP case - GUP references to pages serving as DMA buffers, MMU notifiers
> >    used to synchronize with page_mkclean() and munmap() => normal page
> >    references are fine.
> 
> I want to add that I'd like to convert users in cases 1) and 2) from using
> GUP to using differently named function. Users in case 3) can stay as they
> are for now although ultimately I'd like to denote such use cases in a
> special way as well...

3) users also want a special function and path, right now it is called
hmm_range_fault() but perhaps it would be good to harmonize it more
with the GUP infrastructure?

I'm not quite sure what the best plan for that is yet.

Jason
Ira Weiny Aug. 15, 2019, 5:32 p.m. UTC | #14
On Thu, Aug 15, 2019 at 03:35:10PM +0200, Jan Kara wrote:
> On Thu 15-08-19 15:26:22, Jan Kara wrote:
> > On Wed 14-08-19 20:01:07, John Hubbard wrote:
> > > On 8/14/19 5:02 PM, John Hubbard wrote:
> > > 
> > > Hold on, I *was* forgetting something: this was a two part thing, and
> > > you're conflating the two points, but they need to remain separate and
> > > distinct. There were:
> > > 
> > > 1. FOLL_PIN is necessary because the caller is clearly in the use case that
> > > requires it--however briefly they might be there. As Jan described it,
> > > 
> > > "Anything that gets page reference and then touches page data (e.g.
> > > direct IO) needs the new kind of tracking so that filesystem knows
> > > someone is messing with the page data." [1]
> > 
> > So when the GUP user uses MMU notifiers to stop writing to pages whenever
> > they are writeprotected with page_mkclean(), they don't really need page
> > pin - their access is then fully equivalent to any other mmap userspace
> > access and filesystem knows how to deal with those. I forgot out this case
> > when I wrote the above sentence.
> > 
> > So to sum up there are three cases:
> > 1) DIO case - GUP references to pages serving as DIO buffers are needed for
> >    relatively short time, no special synchronization with page_mkclean() or
> >    munmap() => needs FOLL_PIN
> > 2) RDMA case - GUP references to pages serving as DMA buffers needed for a
> >    long time, no special synchronization with page_mkclean() or munmap()
> >    => needs FOLL_PIN | FOLL_LONGTERM
> >    This case has also a special case when the pages are actually DAX. Then
> >    the caller additionally needs file lease and additional file_pin
> >    structure is used for tracking this usage.
> > 3) ODP case - GUP references to pages serving as DMA buffers, MMU notifiers
> >    used to synchronize with page_mkclean() and munmap() => normal page
> >    references are fine.
> 
> I want to add that I'd like to convert users in cases 1) and 2) from using
> GUP to using differently named function. Users in case 3) can stay as they
> are for now although ultimately I'd like to denote such use cases in a
> special way as well...
> 

Ok just to make this clear I threw up my current tree with your patches here:

https://github.com/weiny2/linux-kernel/commits/mmotm-rdmafsdax-b0-v4

I'm talking about dropping the final patch:
05fd2d3afa6b rdma/umem_odp: Use vaddr_pin_pages_remote() in ODP

The other 2 can stay.  I split out the *_remote() call.  We don't have a user
but I'll keep it around for a bit.

This tree is still WIP as I work through all the comments.  So I've not changed
names or variable types etc...  Just wanted to settle this.

Ira
John Hubbard Aug. 15, 2019, 5:41 p.m. UTC | #15
On 8/15/19 10:32 AM, Ira Weiny wrote:
> On Thu, Aug 15, 2019 at 03:35:10PM +0200, Jan Kara wrote:
>> On Thu 15-08-19 15:26:22, Jan Kara wrote:
>>> On Wed 14-08-19 20:01:07, John Hubbard wrote:
>>>> On 8/14/19 5:02 PM, John Hubbard wrote:
>>>>
>>>> Hold on, I *was* forgetting something: this was a two part thing, and
>>>> you're conflating the two points, but they need to remain separate and
>>>> distinct. There were:
>>>>
>>>> 1. FOLL_PIN is necessary because the caller is clearly in the use case that
>>>> requires it--however briefly they might be there. As Jan described it,
>>>>
>>>> "Anything that gets page reference and then touches page data (e.g.
>>>> direct IO) needs the new kind of tracking so that filesystem knows
>>>> someone is messing with the page data." [1]
>>>
>>> So when the GUP user uses MMU notifiers to stop writing to pages whenever
>>> they are writeprotected with page_mkclean(), they don't really need page
>>> pin - their access is then fully equivalent to any other mmap userspace
>>> access and filesystem knows how to deal with those. I forgot out this case
>>> when I wrote the above sentence.
>>>
>>> So to sum up there are three cases:
>>> 1) DIO case - GUP references to pages serving as DIO buffers are needed for
>>>    relatively short time, no special synchronization with page_mkclean() or
>>>    munmap() => needs FOLL_PIN
>>> 2) RDMA case - GUP references to pages serving as DMA buffers needed for a
>>>    long time, no special synchronization with page_mkclean() or munmap()
>>>    => needs FOLL_PIN | FOLL_LONGTERM
>>>    This case has also a special case when the pages are actually DAX. Then
>>>    the caller additionally needs file lease and additional file_pin
>>>    structure is used for tracking this usage.
>>> 3) ODP case - GUP references to pages serving as DMA buffers, MMU notifiers
>>>    used to synchronize with page_mkclean() and munmap() => normal page
>>>    references are fine.
>>

Thanks Jan, once again, for clarifying all of this!

>> I want to add that I'd like to convert users in cases 1) and 2) from using
>> GUP to using differently named function. Users in case 3) can stay as they
>> are for now although ultimately I'd like to denote such use cases in a
>> special way as well...
>>
> 
> Ok just to make this clear I threw up my current tree with your patches here:
> 
> https://github.com/weiny2/linux-kernel/commits/mmotm-rdmafsdax-b0-v4
> 
> I'm talking about dropping the final patch:
> 05fd2d3afa6b rdma/umem_odp: Use vaddr_pin_pages_remote() in ODP
> 
> The other 2 can stay.  I split out the *_remote() call.  We don't have a user
> but I'll keep it around for a bit.
> 
> This tree is still WIP as I work through all the comments.  So I've not changed
> names or variable types etc...  Just wanted to settle this.
> 

Right. And now that ODP is not a user, I'll take a quick look through my other
call site conversions and see if I can find an easy one, to include here as
the first user of vaddr_pin_pages_remote(). I'll send it your way if that
works out.


thanks,
John Hubbard Aug. 16, 2019, 2:14 a.m. UTC | #16
On 8/15/19 10:41 AM, John Hubbard wrote:
> On 8/15/19 10:32 AM, Ira Weiny wrote:
>> On Thu, Aug 15, 2019 at 03:35:10PM +0200, Jan Kara wrote:
>>> On Thu 15-08-19 15:26:22, Jan Kara wrote:
>>>> On Wed 14-08-19 20:01:07, John Hubbard wrote:
>>>>> On 8/14/19 5:02 PM, John Hubbard wrote:
...
>> Ok just to make this clear I threw up my current tree with your patches here:
>>
>> https://github.com/weiny2/linux-kernel/commits/mmotm-rdmafsdax-b0-v4
>>
>> I'm talking about dropping the final patch:
>> 05fd2d3afa6b rdma/umem_odp: Use vaddr_pin_pages_remote() in ODP
>>
>> The other 2 can stay.  I split out the *_remote() call.  We don't have a user
>> but I'll keep it around for a bit.
>>
>> This tree is still WIP as I work through all the comments.  So I've not changed
>> names or variable types etc...  Just wanted to settle this.
>>
> 
> Right. And now that ODP is not a user, I'll take a quick look through my other
> call site conversions and see if I can find an easy one, to include here as
> the first user of vaddr_pin_pages_remote(). I'll send it your way if that
> works out.
> 

OK, there was only process_vm_access.c, plus (sort of) Bharath's sgi-gru
patch, maybe eventually [1].  But looking at process_vm_access.c, I think 
it is one of the patches that is no longer applicable, and I can just
drop it entirely...I'd welcome a second opinion on that...

So we might be all out of potential users for vaddr_pin_pages_remote()!

For quick reference, it looks like this:
 
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
index 357aa7bef6c0..4d29d54ec93f 100644
--- a/mm/process_vm_access.c
+++ b/mm/process_vm_access.c
@@ -96,7 +96,7 @@ static int process_vm_rw_single_vec(unsigned long addr,
                flags |= FOLL_WRITE;
 
        while (!rc && nr_pages && iov_iter_count(iter)) {
-               int pages = min(nr_pages, max_pages_per_loop);
+               int pinned_pages = min(nr_pages, max_pages_per_loop);
                int locked = 1;
                size_t bytes;
 
@@ -106,14 +106,15 @@ static int process_vm_rw_single_vec(unsigned long addr,
                 * current/current->mm
                 */
                down_read(&mm->mmap_sem);
-               pages = get_user_pages_remote(task, mm, pa, pages, flags,
-                                             process_pages, NULL, &locked);
+               pinned_pages = get_user_pages_remote(task, mm, pa, pinned_pages,
+                                                    flags, process_pages, NULL,
+                                                    &locked);
                if (locked)
                        up_read(&mm->mmap_sem);
-               if (pages <= 0)
+               if (pinned_pages <= 0)
                        return -EFAULT;
 
-               bytes = pages * PAGE_SIZE - start_offset;
+               bytes = pinned_pages * PAGE_SIZE - start_offset;
                if (bytes > len)
                        bytes = len;
 
@@ -122,10 +123,9 @@ static int process_vm_rw_single_vec(unsigned long addr,
                                         vm_write);
                len -= bytes;
                start_offset = 0;
-               nr_pages -= pages;
-               pa += pages * PAGE_SIZE;
-               while (pages)
-                       put_page(process_pages[--pages]);
+               nr_pages -= pinned_pages;
+               pa += pinned_pages * PAGE_SIZE;
+               put_user_pages(process_pages, pinned_pages);
        }
 
        return rc;


[1] https://lore.kernel.org/r/1565379497-29266-2-git-send-email-linux.bhar@gmail.com

thanks,
Vlastimil Babka Aug. 16, 2019, 8:47 a.m. UTC | #17
On 8/15/19 3:35 PM, Jan Kara wrote:
>> 
>> So when the GUP user uses MMU notifiers to stop writing to pages whenever
>> they are writeprotected with page_mkclean(), they don't really need page
>> pin - their access is then fully equivalent to any other mmap userspace
>> access and filesystem knows how to deal with those. I forgot out this case
>> when I wrote the above sentence.
>> 
>> So to sum up there are three cases:
>> 1) DIO case - GUP references to pages serving as DIO buffers are needed for
>>    relatively short time, no special synchronization with page_mkclean() or
>>    munmap() => needs FOLL_PIN
>> 2) RDMA case - GUP references to pages serving as DMA buffers needed for a
>>    long time, no special synchronization with page_mkclean() or munmap()
>>    => needs FOLL_PIN | FOLL_LONGTERM
>>    This case has also a special case when the pages are actually DAX. Then
>>    the caller additionally needs file lease and additional file_pin
>>    structure is used for tracking this usage.
>> 3) ODP case - GUP references to pages serving as DMA buffers, MMU notifiers
>>    used to synchronize with page_mkclean() and munmap() => normal page
>>    references are fine.

IMHO the munlock lesson told us about another one, that's in the end equivalent
to 3)

4) pinning for struct page manipulation only => normal page references are fine

> I want to add that I'd like to convert users in cases 1) and 2) from using
> GUP to using differently named function. Users in case 3) can stay as they
> are for now although ultimately I'd like to denote such use cases in a
> special way as well...

So after 1/2/3 is renamed/specially denoted, only 4) keeps the current interface?

> 								Honza
>
Jan Kara Aug. 16, 2019, 3:41 p.m. UTC | #18
On Thu 15-08-19 19:14:08, John Hubbard wrote:
> On 8/15/19 10:41 AM, John Hubbard wrote:
> > On 8/15/19 10:32 AM, Ira Weiny wrote:
> >> On Thu, Aug 15, 2019 at 03:35:10PM +0200, Jan Kara wrote:
> >>> On Thu 15-08-19 15:26:22, Jan Kara wrote:
> >>>> On Wed 14-08-19 20:01:07, John Hubbard wrote:
> >>>>> On 8/14/19 5:02 PM, John Hubbard wrote:
> ...
> >> Ok just to make this clear I threw up my current tree with your patches here:
> >>
> >> https://github.com/weiny2/linux-kernel/commits/mmotm-rdmafsdax-b0-v4
> >>
> >> I'm talking about dropping the final patch:
> >> 05fd2d3afa6b rdma/umem_odp: Use vaddr_pin_pages_remote() in ODP
> >>
> >> The other 2 can stay.  I split out the *_remote() call.  We don't have a user
> >> but I'll keep it around for a bit.
> >>
> >> This tree is still WIP as I work through all the comments.  So I've not changed
> >> names or variable types etc...  Just wanted to settle this.
> >>
> > 
> > Right. And now that ODP is not a user, I'll take a quick look through my other
> > call site conversions and see if I can find an easy one, to include here as
> > the first user of vaddr_pin_pages_remote(). I'll send it your way if that
> > works out.
> > 
> 
> OK, there was only process_vm_access.c, plus (sort of) Bharath's sgi-gru
> patch, maybe eventually [1].  But looking at process_vm_access.c, I think 
> it is one of the patches that is no longer applicable, and I can just
> drop it entirely...I'd welcome a second opinion on that...

I don't think you can drop the patch. process_vm_rw_pages() clearly touches
page contents and does not synchronize with page_mkclean(). So it is case
1) and needs FOLL_PIN semantics.

								Honza
Jan Kara Aug. 16, 2019, 3:44 p.m. UTC | #19
On Fri 16-08-19 10:47:21, Vlastimil Babka wrote:
> On 8/15/19 3:35 PM, Jan Kara wrote:
> >> 
> >> So when the GUP user uses MMU notifiers to stop writing to pages whenever
> >> they are writeprotected with page_mkclean(), they don't really need page
> >> pin - their access is then fully equivalent to any other mmap userspace
> >> access and filesystem knows how to deal with those. I forgot out this case
> >> when I wrote the above sentence.
> >> 
> >> So to sum up there are three cases:
> >> 1) DIO case - GUP references to pages serving as DIO buffers are needed for
> >>    relatively short time, no special synchronization with page_mkclean() or
> >>    munmap() => needs FOLL_PIN
> >> 2) RDMA case - GUP references to pages serving as DMA buffers needed for a
> >>    long time, no special synchronization with page_mkclean() or munmap()
> >>    => needs FOLL_PIN | FOLL_LONGTERM
> >>    This case has also a special case when the pages are actually DAX. Then
> >>    the caller additionally needs file lease and additional file_pin
> >>    structure is used for tracking this usage.
> >> 3) ODP case - GUP references to pages serving as DMA buffers, MMU notifiers
> >>    used to synchronize with page_mkclean() and munmap() => normal page
> >>    references are fine.
> 
> IMHO the munlock lesson told us about another one, that's in the end equivalent
> to 3)
> 
> 4) pinning for struct page manipulation only => normal page references
> are fine

Right, it's good to have this for clarity.

> > I want to add that I'd like to convert users in cases 1) and 2) from using
> > GUP to using differently named function. Users in case 3) can stay as they
> > are for now although ultimately I'd like to denote such use cases in a
> > special way as well...
> 
> So after 1/2/3 is renamed/specially denoted, only 4) keeps the current
> interface?

Well, munlock() code doesn't even use GUP, just follow_page(). I'd wait to
see what's left after handling cases 1), 2), and 3) to decide about the
interface for the remainder.

								Honza
Jerome Glisse Aug. 16, 2019, 3:52 p.m. UTC | #20
On Fri, Aug 16, 2019 at 05:44:04PM +0200, Jan Kara wrote:
> On Fri 16-08-19 10:47:21, Vlastimil Babka wrote:
> > On 8/15/19 3:35 PM, Jan Kara wrote:
> > >> 
> > >> So when the GUP user uses MMU notifiers to stop writing to pages whenever
> > >> they are writeprotected with page_mkclean(), they don't really need page
> > >> pin - their access is then fully equivalent to any other mmap userspace
> > >> access and filesystem knows how to deal with those. I forgot out this case
> > >> when I wrote the above sentence.
> > >> 
> > >> So to sum up there are three cases:
> > >> 1) DIO case - GUP references to pages serving as DIO buffers are needed for
> > >>    relatively short time, no special synchronization with page_mkclean() or
> > >>    munmap() => needs FOLL_PIN
> > >> 2) RDMA case - GUP references to pages serving as DMA buffers needed for a
> > >>    long time, no special synchronization with page_mkclean() or munmap()
> > >>    => needs FOLL_PIN | FOLL_LONGTERM
> > >>    This case has also a special case when the pages are actually DAX. Then
> > >>    the caller additionally needs file lease and additional file_pin
> > >>    structure is used for tracking this usage.
> > >> 3) ODP case - GUP references to pages serving as DMA buffers, MMU notifiers
> > >>    used to synchronize with page_mkclean() and munmap() => normal page
> > >>    references are fine.
> > 
> > IMHO the munlock lesson told us about another one, that's in the end equivalent
> > to 3)
> > 
> > 4) pinning for struct page manipulation only => normal page references
> > are fine
> 
> Right, it's good to have this for clarity.
> 
> > > I want to add that I'd like to convert users in cases 1) and 2) from using
> > > GUP to using differently named function. Users in case 3) can stay as they
> > > are for now although ultimately I'd like to denote such use cases in a
> > > special way as well...
> > 
> > So after 1/2/3 is renamed/specially denoted, only 4) keeps the current
> > interface?
> 
> Well, munlock() code doesn't even use GUP, just follow_page(). I'd wait to
> see what's left after handling cases 1), 2), and 3) to decide about the
> interface for the remainder.
> 

For 3 we do not need to take a reference at all :) So just forget about 3
it does not exist. For 3 the reference is the reference the CPU page table
has on the page and that's it. GUP is no longer involve in ODP or anything
like that.

Cheers,
Jérôme
Jan Kara Aug. 16, 2019, 4:13 p.m. UTC | #21
On Fri 16-08-19 11:52:20, Jerome Glisse wrote:
> On Fri, Aug 16, 2019 at 05:44:04PM +0200, Jan Kara wrote:
> > On Fri 16-08-19 10:47:21, Vlastimil Babka wrote:
> > > On 8/15/19 3:35 PM, Jan Kara wrote:
> > > >> 
> > > >> So when the GUP user uses MMU notifiers to stop writing to pages whenever
> > > >> they are writeprotected with page_mkclean(), they don't really need page
> > > >> pin - their access is then fully equivalent to any other mmap userspace
> > > >> access and filesystem knows how to deal with those. I forgot out this case
> > > >> when I wrote the above sentence.
> > > >> 
> > > >> So to sum up there are three cases:
> > > >> 1) DIO case - GUP references to pages serving as DIO buffers are needed for
> > > >>    relatively short time, no special synchronization with page_mkclean() or
> > > >>    munmap() => needs FOLL_PIN
> > > >> 2) RDMA case - GUP references to pages serving as DMA buffers needed for a
> > > >>    long time, no special synchronization with page_mkclean() or munmap()
> > > >>    => needs FOLL_PIN | FOLL_LONGTERM
> > > >>    This case has also a special case when the pages are actually DAX. Then
> > > >>    the caller additionally needs file lease and additional file_pin
> > > >>    structure is used for tracking this usage.
> > > >> 3) ODP case - GUP references to pages serving as DMA buffers, MMU notifiers
> > > >>    used to synchronize with page_mkclean() and munmap() => normal page
> > > >>    references are fine.
> > > 
> > > IMHO the munlock lesson told us about another one, that's in the end equivalent
> > > to 3)
> > > 
> > > 4) pinning for struct page manipulation only => normal page references
> > > are fine
> > 
> > Right, it's good to have this for clarity.
> > 
> > > > I want to add that I'd like to convert users in cases 1) and 2) from using
> > > > GUP to using differently named function. Users in case 3) can stay as they
> > > > are for now although ultimately I'd like to denote such use cases in a
> > > > special way as well...
> > > 
> > > So after 1/2/3 is renamed/specially denoted, only 4) keeps the current
> > > interface?
> > 
> > Well, munlock() code doesn't even use GUP, just follow_page(). I'd wait to
> > see what's left after handling cases 1), 2), and 3) to decide about the
> > interface for the remainder.
> > 
> 
> For 3 we do not need to take a reference at all :) So just forget about 3
> it does not exist. For 3 the reference is the reference the CPU page table
> has on the page and that's it. GUP is no longer involve in ODP or anything
> like that.

Yes, I understand. But the fact is that GUP calls are currently still there
e.g. in ODP code. If you can make the code work without taking a page
reference at all, I'm only happy :)

								Honza
Jason Gunthorpe Aug. 16, 2019, 4:31 p.m. UTC | #22
On Fri, Aug 16, 2019 at 06:13:55PM +0200, Jan Kara wrote:

> > For 3 we do not need to take a reference at all :) So just forget about 3
> > it does not exist. For 3 the reference is the reference the CPU page table
> > has on the page and that's it. GUP is no longer involve in ODP or anything
> > like that.
> 
> Yes, I understand. But the fact is that GUP calls are currently still there
> e.g. in ODP code. If you can make the code work without taking a page
> reference at all, I'm only happy :)

We are working on it :)

Jason
Jerome Glisse Aug. 16, 2019, 4:54 p.m. UTC | #23
On Fri, Aug 16, 2019 at 06:13:55PM +0200, Jan Kara wrote:
> On Fri 16-08-19 11:52:20, Jerome Glisse wrote:
> > On Fri, Aug 16, 2019 at 05:44:04PM +0200, Jan Kara wrote:
> > > On Fri 16-08-19 10:47:21, Vlastimil Babka wrote:
> > > > On 8/15/19 3:35 PM, Jan Kara wrote:
> > > > >> 
> > > > >> So when the GUP user uses MMU notifiers to stop writing to pages whenever
> > > > >> they are writeprotected with page_mkclean(), they don't really need page
> > > > >> pin - their access is then fully equivalent to any other mmap userspace
> > > > >> access and filesystem knows how to deal with those. I forgot out this case
> > > > >> when I wrote the above sentence.
> > > > >> 
> > > > >> So to sum up there are three cases:
> > > > >> 1) DIO case - GUP references to pages serving as DIO buffers are needed for
> > > > >>    relatively short time, no special synchronization with page_mkclean() or
> > > > >>    munmap() => needs FOLL_PIN
> > > > >> 2) RDMA case - GUP references to pages serving as DMA buffers needed for a
> > > > >>    long time, no special synchronization with page_mkclean() or munmap()
> > > > >>    => needs FOLL_PIN | FOLL_LONGTERM
> > > > >>    This case has also a special case when the pages are actually DAX. Then
> > > > >>    the caller additionally needs file lease and additional file_pin
> > > > >>    structure is used for tracking this usage.
> > > > >> 3) ODP case - GUP references to pages serving as DMA buffers, MMU notifiers
> > > > >>    used to synchronize with page_mkclean() and munmap() => normal page
> > > > >>    references are fine.
> > > > 
> > > > IMHO the munlock lesson told us about another one, that's in the end equivalent
> > > > to 3)
> > > > 
> > > > 4) pinning for struct page manipulation only => normal page references
> > > > are fine
> > > 
> > > Right, it's good to have this for clarity.
> > > 
> > > > > I want to add that I'd like to convert users in cases 1) and 2) from using
> > > > > GUP to using differently named function. Users in case 3) can stay as they
> > > > > are for now although ultimately I'd like to denote such use cases in a
> > > > > special way as well...
> > > > 
> > > > So after 1/2/3 is renamed/specially denoted, only 4) keeps the current
> > > > interface?
> > > 
> > > Well, munlock() code doesn't even use GUP, just follow_page(). I'd wait to
> > > see what's left after handling cases 1), 2), and 3) to decide about the
> > > interface for the remainder.
> > > 
> > 
> > For 3 we do not need to take a reference at all :) So just forget about 3
> > it does not exist. For 3 the reference is the reference the CPU page table
> > has on the page and that's it. GUP is no longer involve in ODP or anything
> > like that.
> 
> Yes, I understand. But the fact is that GUP calls are currently still there
> e.g. in ODP code. If you can make the code work without taking a page
> reference at all, I'm only happy :)

Already in rdma next AFAIK so in 5.4 it will be gone :) i have been
removing all GUP users that do not need reference. Intel i915 driver
is a left over i will work some more with them to get rid of it too.

Cheers,
Jérôme
Jason Gunthorpe Aug. 16, 2019, 5:04 p.m. UTC | #24
On Fri, Aug 16, 2019 at 12:54:45PM -0400, Jerome Glisse wrote:

> > Yes, I understand. But the fact is that GUP calls are currently still there
> > e.g. in ODP code. If you can make the code work without taking a page
> > reference at all, I'm only happy :)
> 
> Already in rdma next AFAIK so in 5.4 it will be gone :)

Unfortunately no.. only a lot of patches supporting this change will
be in 5.4. The notifiers are still a problem, and I need to figure out
if the edge cases in hmm_range_fault are OK for ODP or not. :(

This is taking a long time in part because ODP itself has all sorts of
problems that make it hard to tell if the other changes are safe or
not..

Lots of effort is being spent to get there though.

Jason
Ira Weiny Aug. 16, 2019, 6:33 p.m. UTC | #25
On Fri, Aug 16, 2019 at 05:41:08PM +0200, Jan Kara wrote:
> On Thu 15-08-19 19:14:08, John Hubbard wrote:
> > On 8/15/19 10:41 AM, John Hubbard wrote:
> > > On 8/15/19 10:32 AM, Ira Weiny wrote:
> > >> On Thu, Aug 15, 2019 at 03:35:10PM +0200, Jan Kara wrote:
> > >>> On Thu 15-08-19 15:26:22, Jan Kara wrote:
> > >>>> On Wed 14-08-19 20:01:07, John Hubbard wrote:
> > >>>>> On 8/14/19 5:02 PM, John Hubbard wrote:
> > ...
> > >> Ok just to make this clear I threw up my current tree with your patches here:
> > >>
> > >> https://github.com/weiny2/linux-kernel/commits/mmotm-rdmafsdax-b0-v4
> > >>
> > >> I'm talking about dropping the final patch:
> > >> 05fd2d3afa6b rdma/umem_odp: Use vaddr_pin_pages_remote() in ODP
> > >>
> > >> The other 2 can stay.  I split out the *_remote() call.  We don't have a user
> > >> but I'll keep it around for a bit.
> > >>
> > >> This tree is still WIP as I work through all the comments.  So I've not changed
> > >> names or variable types etc...  Just wanted to settle this.
> > >>
> > > 
> > > Right. And now that ODP is not a user, I'll take a quick look through my other
> > > call site conversions and see if I can find an easy one, to include here as
> > > the first user of vaddr_pin_pages_remote(). I'll send it your way if that
> > > works out.
> > > 
> > 
> > OK, there was only process_vm_access.c, plus (sort of) Bharath's sgi-gru
> > patch, maybe eventually [1].  But looking at process_vm_access.c, I think 
> > it is one of the patches that is no longer applicable, and I can just
> > drop it entirely...I'd welcome a second opinion on that...
> 
> I don't think you can drop the patch. process_vm_rw_pages() clearly touches
> page contents and does not synchronize with page_mkclean(). So it is case
> 1) and needs FOLL_PIN semantics.

John could you send a formal patch using vaddr_pin* and I'll add it to the
tree?

Ira

> 
> 								Honza
> -- 
> Jan Kara <jack@suse.com>
> SUSE Labs, CR
>
John Hubbard Aug. 16, 2019, 6:50 p.m. UTC | #26
On 8/16/19 11:33 AM, Ira Weiny wrote:
> On Fri, Aug 16, 2019 at 05:41:08PM +0200, Jan Kara wrote:
>> On Thu 15-08-19 19:14:08, John Hubbard wrote:
>>> On 8/15/19 10:41 AM, John Hubbard wrote:
>>>> On 8/15/19 10:32 AM, Ira Weiny wrote:
>>>>> On Thu, Aug 15, 2019 at 03:35:10PM +0200, Jan Kara wrote:
>>>>>> On Thu 15-08-19 15:26:22, Jan Kara wrote:
>>>>>>> On Wed 14-08-19 20:01:07, John Hubbard wrote:
>>>>>>>> On 8/14/19 5:02 PM, John Hubbard wrote:
>>> ...
>>>
>>> OK, there was only process_vm_access.c, plus (sort of) Bharath's sgi-gru
>>> patch, maybe eventually [1].  But looking at process_vm_access.c, I think
>>> it is one of the patches that is no longer applicable, and I can just
>>> drop it entirely...I'd welcome a second opinion on that...
>>
>> I don't think you can drop the patch. process_vm_rw_pages() clearly touches
>> page contents and does not synchronize with page_mkclean(). So it is case
>> 1) and needs FOLL_PIN semantics.
> 
> John could you send a formal patch using vaddr_pin* and I'll add it to the
> tree?
> 

Yes...hints about which struct file to use here are very welcome, btw. This part
of mm is fairly new to me.

thanks,
Ira Weiny Aug. 16, 2019, 9:59 p.m. UTC | #27
On Fri, Aug 16, 2019 at 11:50:09AM -0700, John Hubbard wrote:
> On 8/16/19 11:33 AM, Ira Weiny wrote:
> > On Fri, Aug 16, 2019 at 05:41:08PM +0200, Jan Kara wrote:
> > > On Thu 15-08-19 19:14:08, John Hubbard wrote:
> > > > On 8/15/19 10:41 AM, John Hubbard wrote:
> > > > > On 8/15/19 10:32 AM, Ira Weiny wrote:
> > > > > > On Thu, Aug 15, 2019 at 03:35:10PM +0200, Jan Kara wrote:
> > > > > > > On Thu 15-08-19 15:26:22, Jan Kara wrote:
> > > > > > > > On Wed 14-08-19 20:01:07, John Hubbard wrote:
> > > > > > > > > On 8/14/19 5:02 PM, John Hubbard wrote:
> > > > ...
> > > > 
> > > > OK, there was only process_vm_access.c, plus (sort of) Bharath's sgi-gru
> > > > patch, maybe eventually [1].  But looking at process_vm_access.c, I think
> > > > it is one of the patches that is no longer applicable, and I can just
> > > > drop it entirely...I'd welcome a second opinion on that...
> > > 
> > > I don't think you can drop the patch. process_vm_rw_pages() clearly touches
> > > page contents and does not synchronize with page_mkclean(). So it is case
> > > 1) and needs FOLL_PIN semantics.
> > 
> > John could you send a formal patch using vaddr_pin* and I'll add it to the
> > tree?
> > 
> 
> Yes...hints about which struct file to use here are very welcome, btw. This part
> of mm is fairly new to me.

I'm still working out the final semantics of vaddr_pin*.  But right now you
don't need a vaddr_pin if you don't specify FOLL_LONGTERM.

Since case 1, this case, does not need FOLL_LONGTERM I think it is safe to
simply pass NULL here.

OTOH we could just track this against the mm_struct.  But I don't think we need
to because this pin should be transient.

And this is why I keep leaning toward _not_ putting these flags in the
vaddr_pin*() calls.  I know this is what I did but I think I'm wrong.  It should
be the caller specifying what they want and the vaddr_pin*() calls check that
what they are asking for is correct.

Ira

> 
> thanks,
> -- 
> John Hubbard
> NVIDIA
John Hubbard Aug. 16, 2019, 10:36 p.m. UTC | #28
On 8/16/19 2:59 PM, Ira Weiny wrote:
> On Fri, Aug 16, 2019 at 11:50:09AM -0700, John Hubbard wrote:
...
>>> John could you send a formal patch using vaddr_pin* and I'll add it to the
>>> tree?
>>>
>>
>> Yes...hints about which struct file to use here are very welcome, btw. This part
>> of mm is fairly new to me.
> 
> I'm still working out the final semantics of vaddr_pin*.  But right now you
> don't need a vaddr_pin if you don't specify FOLL_LONGTERM.
> 

ah OK.

> Since case 1, this case, does not need FOLL_LONGTERM I think it is safe to
> simply pass NULL here.
> 
> OTOH we could just track this against the mm_struct.  But I don't think we need
> to because this pin should be transient.
> 

Thanks for looking at that, I'm definitely in learning mode here.

> And this is why I keep leaning toward _not_ putting these flags in the
> vaddr_pin*() calls.  I know this is what I did but I think I'm wrong.  It should
> be the caller specifying what they want and the vaddr_pin*() calls check that
> what they are asking for is correct.
> 

Yes. I think we're nearly done finding the right balance of wrapper calls and
FOLL_* flags. I've seen Jan and others asking that the call sites do *not*
set the flags, but we also know that FOLL_PIN and FOLL_LONGTERM need to vary
independently.

That means either:

a) another trivial wrapper calls, on top of vaddr_pin_*(), for each supported 
combination of FOLL_PIN and FOLL_LONGTERM, or

b) just setting FOLL_PIN and FOLL_LONGTERM at each callsite.

I think either way is easy to grep for, so it's hard to get too excited
(fortunately) about which one to pick. Let's start simple with (b) and it's 
easy to convert later if someone wants that.

Meanwhile, we do need to pull the flag setting out of vaddr_pin_pages().

So I will post these small patches for your mmotm-rdmafsdax-b0-v4 branch,
shortly:

1) Add FOLL_PIN 

   --also I guess it's time to add comments documenting FOLL_PIN and
FOLL_LONGTERM use, stealing Jan's and others' wording for the 4 cases,
from earlier. :)

2) Add vaddr_pin_user_pages_remote(), which will not set FOLL_PIN or FOLL_LONGTERM
itself. And add the caller, which will.

thanks,
diff mbox series

Patch

diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c
index 53085896d718..fdff034a8a30 100644
--- a/drivers/infiniband/core/umem_odp.c
+++ b/drivers/infiniband/core/umem_odp.c
@@ -534,7 +534,7 @@  static int ib_umem_odp_map_dma_single_page(
 	}
 
 out:
-	put_user_page(page);
+	vaddr_unpin_pages(&page, 1, &umem_odp->umem.vaddr_pin);
 
 	if (remove_existing_mapping) {
 		ib_umem_notifier_start_account(umem_odp);
@@ -635,9 +635,10 @@  int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt,
 		 * complex (and doesn't gain us much performance in most use
 		 * cases).
 		 */
-		npages = get_user_pages_remote(owning_process, owning_mm,
+		npages = vaddr_pin_pages_remote(owning_process, owning_mm,
 				user_virt, gup_num_pages,
-				flags, local_page_list, NULL, NULL);
+				flags, local_page_list, NULL, NULL,
+				&umem_odp->umem.vaddr_pin);
 		up_read(&owning_mm->mmap_sem);
 
 		if (npages < 0) {
@@ -657,7 +658,8 @@  int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt,
 					ret = -EFAULT;
 					break;
 				}
-				put_user_page(local_page_list[j]);
+				vaddr_unpin_pages(&local_page_list[j], 1,
+						  &umem_odp->umem.vaddr_pin);
 				continue;
 			}
 
@@ -684,8 +686,9 @@  int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt,
 			 * ib_umem_odp_map_dma_single_page().
 			 */
 			if (npages - (j + 1) > 0)
-				put_user_pages(&local_page_list[j+1],
-					       npages - (j + 1));
+				vaddr_unpin_pages(&local_page_list[j+1],
+						  npages - (j + 1),
+						  &umem_odp->umem.vaddr_pin);
 			break;
 		}
 	}
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 61b616cd9243..2bd76ad8787e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1606,6 +1606,13 @@  int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
 long vaddr_pin_pages(unsigned long addr, unsigned long nr_pages,
 		     unsigned int gup_flags, struct page **pages,
 		     struct vaddr_pin *vaddr_pin);
+long vaddr_pin_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
+			    unsigned long start, unsigned long nr_pages,
+			    unsigned int gup_flags, struct page **pages,
+			    struct vm_area_struct **vmas, int *locked,
+			    struct vaddr_pin *vaddr_pin);
+void vaddr_unpin_pages(struct page **pages, unsigned long nr_pages,
+		       struct vaddr_pin *vaddr_pin);
 void vaddr_unpin_pages_dirty_lock(struct page **pages, unsigned long nr_pages,
 				  struct vaddr_pin *vaddr_pin, bool make_dirty);
 bool mapping_inode_has_layout(struct vaddr_pin *vaddr_pin, struct page *page);
diff --git a/mm/gup.c b/mm/gup.c
index 85f09958fbdc..bb95adfaf9b6 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -2518,6 +2518,38 @@  long vaddr_pin_pages(unsigned long addr, unsigned long nr_pages,
 }
 EXPORT_SYMBOL(vaddr_pin_pages);
 
+/**
+ * vaddr_pin_pages pin pages by virtual address and return the pages to the
+ * user.
+ *
+ * @tsk:	the task_struct to use for page fault accounting, or
+ *		NULL if faults are not to be recorded.
+ * @mm:		mm_struct of target mm
+ * @addr:	start address
+ * @nr_pages:	number of pages to pin
+ * @gup_flags:	flags to use for the pin
+ * @pages:	array of pages returned
+ * @vaddr_pin:	initialized meta information this pin is to be associated
+ * with.
+ *
+ * This is the "vaddr_pin_pages" corresponding variant to
+ * get_user_pages_remote(), but with FOLL_PIN semantics: the implementation sets
+ * FOLL_PIN. That, in turn, means that the pages must ultimately be released
+ * by put_user_page().
+ */
+long vaddr_pin_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
+			    unsigned long start, unsigned long nr_pages,
+			    unsigned int gup_flags, struct page **pages,
+			    struct vm_area_struct **vmas, int *locked,
+			    struct vaddr_pin *vaddr_pin)
+{
+	gup_flags |= FOLL_TOUCH | FOLL_REMOTE | FOLL_PIN;
+
+	return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas,
+				       locked, gup_flags, vaddr_pin);
+}
+EXPORT_SYMBOL(vaddr_pin_pages_remote);
+
 /**
  * vaddr_unpin_pages_dirty_lock - counterpart to vaddr_pin_pages
  *
@@ -2536,3 +2568,21 @@  void vaddr_unpin_pages_dirty_lock(struct page **pages, unsigned long nr_pages,
 	__put_user_pages_dirty_lock(vaddr_pin, pages, nr_pages, make_dirty);
 }
 EXPORT_SYMBOL(vaddr_unpin_pages_dirty_lock);
+
+/**
+ * vaddr_unpin_pages - simple, non-dirtying counterpart to vaddr_pin_pages
+ *
+ * @pages: array of pages returned
+ * @nr_pages: number of pages in pages
+ * @vaddr_pin: same information passed to vaddr_pin_pages
+ *
+ * Like vaddr_unpin_pages_dirty_lock, but for non-dirty pages. Useful in putting
+ * back pages in an error case: they were never made dirty.
+ */
+void vaddr_unpin_pages(struct page **pages, unsigned long nr_pages,
+		       struct vaddr_pin *vaddr_pin)
+{
+	__put_user_pages_dirty_lock(vaddr_pin, pages, nr_pages, false);
+}
+EXPORT_SYMBOL(vaddr_unpin_pages);
+