Message ID | 20190812015044.26176-3-jhubbard@nvidia.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | mm/gup: introduce vaddr_pin_pages_remote(), FOLL_PIN | expand |
On Sun, Aug 11, 2019 at 06:50:44PM -0700, john.hubbard@gmail.com wrote: > From: John Hubbard <jhubbard@nvidia.com> > > This is the "vaddr_pin_pages" corresponding variant to > get_user_pages_remote(), but with FOLL_PIN semantics: the implementation > sets FOLL_PIN. That, in turn, means that the pages must ultimately be > released by put_user_page*()--typically, via vaddr_unpin_pages*(). > > Note that the put_user_page*() requirement won't be truly > required until all of the call sites have been converted, and > the tracking of pages is actually activated. > > Also introduce vaddr_unpin_pages(), in order to have a simpler > call for the error handling cases. > > Use both of these new calls in the Infiniband drive, replacing > get_user_pages_remote() and put_user_pages(). > > Signed-off-by: John Hubbard <jhubbard@nvidia.com> > --- > drivers/infiniband/core/umem_odp.c | 15 +++++---- > include/linux/mm.h | 7 +++++ > mm/gup.c | 50 ++++++++++++++++++++++++++++++ > 3 files changed, 66 insertions(+), 6 deletions(-) > > diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c > index 53085896d718..fdff034a8a30 100644 > --- a/drivers/infiniband/core/umem_odp.c > +++ b/drivers/infiniband/core/umem_odp.c > @@ -534,7 +534,7 @@ static int ib_umem_odp_map_dma_single_page( > } > > out: > - put_user_page(page); > + vaddr_unpin_pages(&page, 1, &umem_odp->umem.vaddr_pin); > > if (remove_existing_mapping) { > ib_umem_notifier_start_account(umem_odp); > @@ -635,9 +635,10 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt, > * complex (and doesn't gain us much performance in most use > * cases). > */ > - npages = get_user_pages_remote(owning_process, owning_mm, > + npages = vaddr_pin_pages_remote(owning_process, owning_mm, > user_virt, gup_num_pages, > - flags, local_page_list, NULL, NULL); > + flags, local_page_list, NULL, NULL, > + &umem_odp->umem.vaddr_pin); > up_read(&owning_mm->mmap_sem); > > if (npages < 0) { > @@ -657,7 +658,8 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt, > ret = -EFAULT; > break; > } > - put_user_page(local_page_list[j]); > + vaddr_unpin_pages(&local_page_list[j], 1, > + &umem_odp->umem.vaddr_pin); > continue; > } > > @@ -684,8 +686,9 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt, > * ib_umem_odp_map_dma_single_page(). > */ > if (npages - (j + 1) > 0) > - put_user_pages(&local_page_list[j+1], > - npages - (j + 1)); > + vaddr_unpin_pages(&local_page_list[j+1], > + npages - (j + 1), > + &umem_odp->umem.vaddr_pin); > break; > } > } > diff --git a/include/linux/mm.h b/include/linux/mm.h > index 61b616cd9243..2bd76ad8787e 100644 > --- a/include/linux/mm.h > +++ b/include/linux/mm.h > @@ -1606,6 +1606,13 @@ int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc, > long vaddr_pin_pages(unsigned long addr, unsigned long nr_pages, > unsigned int gup_flags, struct page **pages, > struct vaddr_pin *vaddr_pin); > +long vaddr_pin_pages_remote(struct task_struct *tsk, struct mm_struct *mm, > + unsigned long start, unsigned long nr_pages, > + unsigned int gup_flags, struct page **pages, > + struct vm_area_struct **vmas, int *locked, > + struct vaddr_pin *vaddr_pin); > +void vaddr_unpin_pages(struct page **pages, unsigned long nr_pages, > + struct vaddr_pin *vaddr_pin); > void vaddr_unpin_pages_dirty_lock(struct page **pages, unsigned long nr_pages, > struct vaddr_pin *vaddr_pin, bool make_dirty); > bool mapping_inode_has_layout(struct vaddr_pin *vaddr_pin, struct page *page); > diff --git a/mm/gup.c b/mm/gup.c > index 85f09958fbdc..bb95adfaf9b6 100644 > --- a/mm/gup.c > +++ b/mm/gup.c > @@ -2518,6 +2518,38 @@ long vaddr_pin_pages(unsigned long addr, unsigned long nr_pages, > } > EXPORT_SYMBOL(vaddr_pin_pages); > > +/** > + * vaddr_pin_pages pin pages by virtual address and return the pages to the vaddr_pin_pages_remote Fixed in my tree. > + * user. > + * > + * @tsk: the task_struct to use for page fault accounting, or > + * NULL if faults are not to be recorded. > + * @mm: mm_struct of target mm > + * @addr: start address > + * @nr_pages: number of pages to pin > + * @gup_flags: flags to use for the pin > + * @pages: array of pages returned > + * @vaddr_pin: initialized meta information this pin is to be associated > + * with. > + * > + * This is the "vaddr_pin_pages" corresponding variant to > + * get_user_pages_remote(), but with FOLL_PIN semantics: the implementation sets > + * FOLL_PIN. That, in turn, means that the pages must ultimately be released > + * by put_user_page(). > + */ > +long vaddr_pin_pages_remote(struct task_struct *tsk, struct mm_struct *mm, > + unsigned long start, unsigned long nr_pages, > + unsigned int gup_flags, struct page **pages, > + struct vm_area_struct **vmas, int *locked, > + struct vaddr_pin *vaddr_pin) > +{ > + gup_flags |= FOLL_TOUCH | FOLL_REMOTE | FOLL_PIN; > + > + return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas, > + locked, gup_flags, vaddr_pin); > +} > +EXPORT_SYMBOL(vaddr_pin_pages_remote); > + > /** > * vaddr_unpin_pages_dirty_lock - counterpart to vaddr_pin_pages > * > @@ -2536,3 +2568,21 @@ void vaddr_unpin_pages_dirty_lock(struct page **pages, unsigned long nr_pages, > __put_user_pages_dirty_lock(vaddr_pin, pages, nr_pages, make_dirty); > } > EXPORT_SYMBOL(vaddr_unpin_pages_dirty_lock); > + > +/** > + * vaddr_unpin_pages - simple, non-dirtying counterpart to vaddr_pin_pages > + * > + * @pages: array of pages returned > + * @nr_pages: number of pages in pages > + * @vaddr_pin: same information passed to vaddr_pin_pages > + * > + * Like vaddr_unpin_pages_dirty_lock, but for non-dirty pages. Useful in putting > + * back pages in an error case: they were never made dirty. > + */ > +void vaddr_unpin_pages(struct page **pages, unsigned long nr_pages, > + struct vaddr_pin *vaddr_pin) > +{ > + __put_user_pages_dirty_lock(vaddr_pin, pages, nr_pages, false); > +} > +EXPORT_SYMBOL(vaddr_unpin_pages); Rather than have another wrapping call why don't we just do this? Would it be so bad to just have to specify false for make_dirty? diff --git a/mm/gup.c b/mm/gup.c index e77b250c1307..ca660a5e8206 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2540,7 +2540,7 @@ long vaddr_pin_pages_remote(struct task_struct *tsk, struct mm_struct *mm, EXPORT_SYMBOL(vaddr_pin_pages_remote); /** - * vaddr_unpin_pages_dirty_lock - counterpart to vaddr_pin_pages + * vaddr_unpin_pages - counterpart to vaddr_pin_pages * * @pages: array of pages returned * @nr_pages: number of pages in pages @@ -2551,26 +2551,9 @@ EXPORT_SYMBOL(vaddr_pin_pages_remote); * in vaddr_pin_pages should be passed back into this call for proper * tracking. */ -void vaddr_unpin_pages_dirty_lock(struct page **pages, unsigned long nr_pages, - struct vaddr_pin *vaddr_pin, bool make_dirty) +void vaddr_unpin_pages(struct page **pages, unsigned long nr_pages, + struct vaddr_pin *vaddr_pin, bool make_dirty) { __put_user_pages_dirty_lock(vaddr_pin, pages, nr_pages, make_dirty); } EXPORT_SYMBOL(vaddr_unpin_pages_dirty_lock); - -/** - * vaddr_unpin_pages - simple, non-dirtying counterpart to vaddr_pin_pages - * - * @pages: array of pages returned - * @nr_pages: number of pages in pages - * @vaddr_pin: same information passed to vaddr_pin_pages - * - * Like vaddr_unpin_pages_dirty_lock, but for non-dirty pages. Useful in putting - * back pages in an error case: they were never made dirty. - */ -void vaddr_unpin_pages(struct page **pages, unsigned long nr_pages, - struct vaddr_pin *vaddr_pin) -{ - __put_user_pages_dirty_lock(vaddr_pin, pages, nr_pages, false); -} -EXPORT_SYMBOL(vaddr_unpin_pages);
On 8/12/19 3:03 PM, Ira Weiny wrote: > On Sun, Aug 11, 2019 at 06:50:44PM -0700, john.hubbard@gmail.com wrote: >> From: John Hubbard <jhubbard@nvidia.com> ... >> +/** >> + * vaddr_pin_pages pin pages by virtual address and return the pages to the > > vaddr_pin_pages_remote > > Fixed in my tree. thanks. :) > >> + * user. >> + * >> + * @tsk: the task_struct to use for page fault accounting, or >> + * NULL if faults are not to be recorded. >> + * @mm: mm_struct of target mm >> + * @addr: start address >> + * @nr_pages: number of pages to pin >> + * @gup_flags: flags to use for the pin >> + * @pages: array of pages returned >> + * @vaddr_pin: initialized meta information this pin is to be associated >> + * with. >> + * >> + * This is the "vaddr_pin_pages" corresponding variant to >> + * get_user_pages_remote(), but with FOLL_PIN semantics: the implementation sets >> + * FOLL_PIN. That, in turn, means that the pages must ultimately be released >> + * by put_user_page(). >> + */ >> +long vaddr_pin_pages_remote(struct task_struct *tsk, struct mm_struct *mm, >> + unsigned long start, unsigned long nr_pages, >> + unsigned int gup_flags, struct page **pages, >> + struct vm_area_struct **vmas, int *locked, >> + struct vaddr_pin *vaddr_pin) >> +{ >> + gup_flags |= FOLL_TOUCH | FOLL_REMOTE | FOLL_PIN; >> + >> + return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas, >> + locked, gup_flags, vaddr_pin); >> +} >> +EXPORT_SYMBOL(vaddr_pin_pages_remote); >> + >> /** >> * vaddr_unpin_pages_dirty_lock - counterpart to vaddr_pin_pages >> * >> @@ -2536,3 +2568,21 @@ void vaddr_unpin_pages_dirty_lock(struct page **pages, unsigned long nr_pages, >> __put_user_pages_dirty_lock(vaddr_pin, pages, nr_pages, make_dirty); >> } >> EXPORT_SYMBOL(vaddr_unpin_pages_dirty_lock); >> + >> +/** >> + * vaddr_unpin_pages - simple, non-dirtying counterpart to vaddr_pin_pages >> + * >> + * @pages: array of pages returned >> + * @nr_pages: number of pages in pages >> + * @vaddr_pin: same information passed to vaddr_pin_pages >> + * >> + * Like vaddr_unpin_pages_dirty_lock, but for non-dirty pages. Useful in putting >> + * back pages in an error case: they were never made dirty. >> + */ >> +void vaddr_unpin_pages(struct page **pages, unsigned long nr_pages, >> + struct vaddr_pin *vaddr_pin) >> +{ >> + __put_user_pages_dirty_lock(vaddr_pin, pages, nr_pages, false); >> +} >> +EXPORT_SYMBOL(vaddr_unpin_pages); > > Rather than have another wrapping call why don't we just do this? Would it be > so bad to just have to specify false for make_dirty? Sure, passing in false for make_dirty is fine, and in fact, there may even be error cases I've forgotten about that *want* to dirty the page. I thought about these variants, and realized that we don't generally need to say "lock" anymore, because we're going to forcibly use set_page_dirty_lock (rather than set_page_dirty) in this part of the code. And a shorter name is nice. Since you've dropped both "_dirty" and "_lock" from the function name, it's still nice and short even though we pass in make_dirty as an arg. So that's a long-winded, "the API below looks good to me". :) > > > diff --git a/mm/gup.c b/mm/gup.c > index e77b250c1307..ca660a5e8206 100644 > --- a/mm/gup.c > +++ b/mm/gup.c > @@ -2540,7 +2540,7 @@ long vaddr_pin_pages_remote(struct task_struct *tsk, struct mm_struct *mm, > EXPORT_SYMBOL(vaddr_pin_pages_remote); > > /** > - * vaddr_unpin_pages_dirty_lock - counterpart to vaddr_pin_pages > + * vaddr_unpin_pages - counterpart to vaddr_pin_pages > * > * @pages: array of pages returned > * @nr_pages: number of pages in pages > @@ -2551,26 +2551,9 @@ EXPORT_SYMBOL(vaddr_pin_pages_remote); > * in vaddr_pin_pages should be passed back into this call for proper > * tracking. > */ > -void vaddr_unpin_pages_dirty_lock(struct page **pages, unsigned long nr_pages, > - struct vaddr_pin *vaddr_pin, bool make_dirty) > +void vaddr_unpin_pages(struct page **pages, unsigned long nr_pages, > + struct vaddr_pin *vaddr_pin, bool make_dirty) > { > __put_user_pages_dirty_lock(vaddr_pin, pages, nr_pages, make_dirty); > } > EXPORT_SYMBOL(vaddr_unpin_pages_dirty_lock); > - > -/** > - * vaddr_unpin_pages - simple, non-dirtying counterpart to vaddr_pin_pages > - * > - * @pages: array of pages returned > - * @nr_pages: number of pages in pages > - * @vaddr_pin: same information passed to vaddr_pin_pages > - * > - * Like vaddr_unpin_pages_dirty_lock, but for non-dirty pages. Useful in putting > - * back pages in an error case: they were never made dirty. > - */ > -void vaddr_unpin_pages(struct page **pages, unsigned long nr_pages, > - struct vaddr_pin *vaddr_pin) > -{ > - __put_user_pages_dirty_lock(vaddr_pin, pages, nr_pages, false); > -} > -EXPORT_SYMBOL(vaddr_unpin_pages); > thanks,
On Sun, Aug 11, 2019 at 06:50:44PM -0700, john.hubbard@gmail.com wrote: > From: John Hubbard <jhubbard@nvidia.com> > > This is the "vaddr_pin_pages" corresponding variant to > get_user_pages_remote(), but with FOLL_PIN semantics: the implementation > sets FOLL_PIN. That, in turn, means that the pages must ultimately be > released by put_user_page*()--typically, via vaddr_unpin_pages*(). > > Note that the put_user_page*() requirement won't be truly > required until all of the call sites have been converted, and > the tracking of pages is actually activated. > > Also introduce vaddr_unpin_pages(), in order to have a simpler > call for the error handling cases. > > Use both of these new calls in the Infiniband drive, replacing > get_user_pages_remote() and put_user_pages(). > > Signed-off-by: John Hubbard <jhubbard@nvidia.com> > --- > drivers/infiniband/core/umem_odp.c | 15 +++++---- > include/linux/mm.h | 7 +++++ > mm/gup.c | 50 ++++++++++++++++++++++++++++++ > 3 files changed, 66 insertions(+), 6 deletions(-) > > diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c > index 53085896d718..fdff034a8a30 100644 > --- a/drivers/infiniband/core/umem_odp.c > +++ b/drivers/infiniband/core/umem_odp.c > @@ -534,7 +534,7 @@ static int ib_umem_odp_map_dma_single_page( > } > > out: > - put_user_page(page); > + vaddr_unpin_pages(&page, 1, &umem_odp->umem.vaddr_pin); > > if (remove_existing_mapping) { > ib_umem_notifier_start_account(umem_odp); > @@ -635,9 +635,10 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt, > * complex (and doesn't gain us much performance in most use > * cases). > */ > - npages = get_user_pages_remote(owning_process, owning_mm, > + npages = vaddr_pin_pages_remote(owning_process, owning_mm, > user_virt, gup_num_pages, > - flags, local_page_list, NULL, NULL); > + flags, local_page_list, NULL, NULL, > + &umem_odp->umem.vaddr_pin); Thinking about this part of the patch... is this pin really necessary? This code is not doing a long term pin. The page just needs a reference while we map it into the devices page tables. Once that is done we should get notifiers if anything changes and we can adjust. right? Ira > up_read(&owning_mm->mmap_sem); > > if (npages < 0) { > @@ -657,7 +658,8 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt, > ret = -EFAULT; > break; > } > - put_user_page(local_page_list[j]); > + vaddr_unpin_pages(&local_page_list[j], 1, > + &umem_odp->umem.vaddr_pin); > continue; > } > > @@ -684,8 +686,9 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt, > * ib_umem_odp_map_dma_single_page(). > */ > if (npages - (j + 1) > 0) > - put_user_pages(&local_page_list[j+1], > - npages - (j + 1)); > + vaddr_unpin_pages(&local_page_list[j+1], > + npages - (j + 1), > + &umem_odp->umem.vaddr_pin); > break; > } > } > diff --git a/include/linux/mm.h b/include/linux/mm.h > index 61b616cd9243..2bd76ad8787e 100644 > --- a/include/linux/mm.h > +++ b/include/linux/mm.h > @@ -1606,6 +1606,13 @@ int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc, > long vaddr_pin_pages(unsigned long addr, unsigned long nr_pages, > unsigned int gup_flags, struct page **pages, > struct vaddr_pin *vaddr_pin); > +long vaddr_pin_pages_remote(struct task_struct *tsk, struct mm_struct *mm, > + unsigned long start, unsigned long nr_pages, > + unsigned int gup_flags, struct page **pages, > + struct vm_area_struct **vmas, int *locked, > + struct vaddr_pin *vaddr_pin); > +void vaddr_unpin_pages(struct page **pages, unsigned long nr_pages, > + struct vaddr_pin *vaddr_pin); > void vaddr_unpin_pages_dirty_lock(struct page **pages, unsigned long nr_pages, > struct vaddr_pin *vaddr_pin, bool make_dirty); > bool mapping_inode_has_layout(struct vaddr_pin *vaddr_pin, struct page *page); > diff --git a/mm/gup.c b/mm/gup.c > index 85f09958fbdc..bb95adfaf9b6 100644 > --- a/mm/gup.c > +++ b/mm/gup.c > @@ -2518,6 +2518,38 @@ long vaddr_pin_pages(unsigned long addr, unsigned long nr_pages, > } > EXPORT_SYMBOL(vaddr_pin_pages); > > +/** > + * vaddr_pin_pages pin pages by virtual address and return the pages to the > + * user. > + * > + * @tsk: the task_struct to use for page fault accounting, or > + * NULL if faults are not to be recorded. > + * @mm: mm_struct of target mm > + * @addr: start address > + * @nr_pages: number of pages to pin > + * @gup_flags: flags to use for the pin > + * @pages: array of pages returned > + * @vaddr_pin: initialized meta information this pin is to be associated > + * with. > + * > + * This is the "vaddr_pin_pages" corresponding variant to > + * get_user_pages_remote(), but with FOLL_PIN semantics: the implementation sets > + * FOLL_PIN. That, in turn, means that the pages must ultimately be released > + * by put_user_page(). > + */ > +long vaddr_pin_pages_remote(struct task_struct *tsk, struct mm_struct *mm, > + unsigned long start, unsigned long nr_pages, > + unsigned int gup_flags, struct page **pages, > + struct vm_area_struct **vmas, int *locked, > + struct vaddr_pin *vaddr_pin) > +{ > + gup_flags |= FOLL_TOUCH | FOLL_REMOTE | FOLL_PIN; > + > + return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas, > + locked, gup_flags, vaddr_pin); > +} > +EXPORT_SYMBOL(vaddr_pin_pages_remote); > + > /** > * vaddr_unpin_pages_dirty_lock - counterpart to vaddr_pin_pages > * > @@ -2536,3 +2568,21 @@ void vaddr_unpin_pages_dirty_lock(struct page **pages, unsigned long nr_pages, > __put_user_pages_dirty_lock(vaddr_pin, pages, nr_pages, make_dirty); > } > EXPORT_SYMBOL(vaddr_unpin_pages_dirty_lock); > + > +/** > + * vaddr_unpin_pages - simple, non-dirtying counterpart to vaddr_pin_pages > + * > + * @pages: array of pages returned > + * @nr_pages: number of pages in pages > + * @vaddr_pin: same information passed to vaddr_pin_pages > + * > + * Like vaddr_unpin_pages_dirty_lock, but for non-dirty pages. Useful in putting > + * back pages in an error case: they were never made dirty. > + */ > +void vaddr_unpin_pages(struct page **pages, unsigned long nr_pages, > + struct vaddr_pin *vaddr_pin) > +{ > + __put_user_pages_dirty_lock(vaddr_pin, pages, nr_pages, false); > +} > +EXPORT_SYMBOL(vaddr_unpin_pages); > + > -- > 2.22.0 >
On 8/12/19 4:49 PM, Ira Weiny wrote: > On Sun, Aug 11, 2019 at 06:50:44PM -0700, john.hubbard@gmail.com wrote: >> From: John Hubbard <jhubbard@nvidia.com> ... >> diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c >> index 53085896d718..fdff034a8a30 100644 >> --- a/drivers/infiniband/core/umem_odp.c >> +++ b/drivers/infiniband/core/umem_odp.c >> @@ -534,7 +534,7 @@ static int ib_umem_odp_map_dma_single_page( >> } >> >> out: >> - put_user_page(page); >> + vaddr_unpin_pages(&page, 1, &umem_odp->umem.vaddr_pin); >> >> if (remove_existing_mapping) { >> ib_umem_notifier_start_account(umem_odp); >> @@ -635,9 +635,10 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt, >> * complex (and doesn't gain us much performance in most use >> * cases). >> */ >> - npages = get_user_pages_remote(owning_process, owning_mm, >> + npages = vaddr_pin_pages_remote(owning_process, owning_mm, >> user_virt, gup_num_pages, >> - flags, local_page_list, NULL, NULL); >> + flags, local_page_list, NULL, NULL, >> + &umem_odp->umem.vaddr_pin); > > Thinking about this part of the patch... is this pin really necessary? This > code is not doing a long term pin. The page just needs a reference while we > map it into the devices page tables. Once that is done we should get notifiers > if anything changes and we can adjust. right? > OK, now it's a little interesting: the FOLL_PIN is necessary, but maybe not FOLL_LONGTERM. Illustrating once again that it's actually necessary to allow these flags to vary independently. And that leads to another API refinement idea: let's set FOLL_PIN within the vaddr_pin_pages*() wrappers, and set FOLL_LONGTER in the *callers* of those wrappers, yes? thanks,
On Mon, Aug 12, 2019 at 05:07:32PM -0700, John Hubbard wrote: > On 8/12/19 4:49 PM, Ira Weiny wrote: > > On Sun, Aug 11, 2019 at 06:50:44PM -0700, john.hubbard@gmail.com wrote: > > > From: John Hubbard <jhubbard@nvidia.com> > ... > > > diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c > > > index 53085896d718..fdff034a8a30 100644 > > > --- a/drivers/infiniband/core/umem_odp.c > > > +++ b/drivers/infiniband/core/umem_odp.c > > > @@ -534,7 +534,7 @@ static int ib_umem_odp_map_dma_single_page( > > > } > > > out: > > > - put_user_page(page); > > > + vaddr_unpin_pages(&page, 1, &umem_odp->umem.vaddr_pin); > > > if (remove_existing_mapping) { > > > ib_umem_notifier_start_account(umem_odp); > > > @@ -635,9 +635,10 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt, > > > * complex (and doesn't gain us much performance in most use > > > * cases). > > > */ > > > - npages = get_user_pages_remote(owning_process, owning_mm, > > > + npages = vaddr_pin_pages_remote(owning_process, owning_mm, > > > user_virt, gup_num_pages, > > > - flags, local_page_list, NULL, NULL); > > > + flags, local_page_list, NULL, NULL, > > > + &umem_odp->umem.vaddr_pin); > > > > Thinking about this part of the patch... is this pin really necessary? This > > code is not doing a long term pin. The page just needs a reference while we > > map it into the devices page tables. Once that is done we should get notifiers > > if anything changes and we can adjust. right? > > > > OK, now it's a little interesting: the FOLL_PIN is necessary, but maybe not > FOLL_LONGTERM. Illustrating once again that it's actually necessary to allow > these flags to vary independently. Why is PIN necessary? I think we do want all drivers to use the new user_uaddr_vaddr_pin_user_pages() call... :-P But in this case I think a simple "get" reference is enough to reference the page while we are using it. If it changes after the "put/unpin" we get a fault which should handle the change right? The other issue I have with FOLL_PIN is what does it mean to call "...pin...()" without FOLL_PIN? This is another confusion of get_user_pages()... you can actually call it without FOLL_GET... :-/ And you just don't get pages back. I've never really dug into how (or if) you "put" them later... > > And that leads to another API refinement idea: let's set FOLL_PIN within the > vaddr_pin_pages*() wrappers, and set FOLL_LONGTER in the *callers* of those > wrappers, yes? I've thought about this before and I think any default flags should simply define what we want follow_pages to do. Also, the addition of vaddr_pin information creates an implicit flag which if not there disallows any file pages from being pinned. It becomes our new "longterm" flag. FOLL_PIN _could_ be what we should use "internally". But we could also just use this implicit vaddr_pin flag and not add a new flag. Finally, I struggle with converting everyone to a new call. It is more overhead to use vaddr_pin in the call above because now the GUP code is going to associate a file pin object with that file when in ODP we don't need that because the pages can move around. This overhead may be fine, not sure in this case, but I don't see everyone wanting it. Ira
On 8/13/19 2:08 PM, Ira Weiny wrote: > On Mon, Aug 12, 2019 at 05:07:32PM -0700, John Hubbard wrote: >> On 8/12/19 4:49 PM, Ira Weiny wrote: >>> On Sun, Aug 11, 2019 at 06:50:44PM -0700, john.hubbard@gmail.com wrote: >>>> From: John Hubbard <jhubbard@nvidia.com> >> ... >>> Thinking about this part of the patch... is this pin really necessary? This >>> code is not doing a long term pin. The page just needs a reference while we >>> map it into the devices page tables. Once that is done we should get notifiers >>> if anything changes and we can adjust. right? >>> >> >> OK, now it's a little interesting: the FOLL_PIN is necessary, but maybe not >> FOLL_LONGTERM. Illustrating once again that it's actually necessary to allow >> these flags to vary independently. > > Why is PIN necessary? I think we do want all drivers to use the new > user_uaddr_vaddr_pin_user_pages() call... :-P But in this case I think a > simple "get" reference is enough to reference the page while we are using it. > If it changes after the "put/unpin" we get a fault which should handle the > change right? > FOLL_PIN is necessary because the caller is clearly in the use case that requires it--however briefly they might be there. As Jan described it, "Anything that gets page reference and then touches page data (e.g. direct IO) needs the new kind of tracking so that filesystem knows someone is messing with the page data." [1] > The other issue I have with FOLL_PIN is what does it mean to call "...pin...()" > without FOLL_PIN? > > This is another confusion of get_user_pages()... you can actually call it > without FOLL_GET... :-/ And you just don't get pages back. I've never really > dug into how (or if) you "put" them later... > Yes, you are talking to someone who has been suffering through that. The problem here is that gup() has evolved into a do-everything tool. I think we're getting closer to teasing it apart into more specific interfaces that do more limited things. Anyway, I want FOLL_PIN as a way to help clarify this... >> >> And that leads to another API refinement idea: let's set FOLL_PIN within the >> vaddr_pin_pages*() wrappers, and set FOLL_LONGTER in the *callers* of those >> wrappers, yes? > > I've thought about this before and I think any default flags should simply > define what we want follow_pages to do. Hmm, so don't forget that we need to know what gup_fast() should do, too. Anyway, I'm not worried about which combination of wrapper calls set which flags, I'm open to suggestion there. But it does still seem to me that we should have independent FOLL_LONGTERM and FOLL_PIN flags, once the API churn settles. > > Also, the addition of vaddr_pin information creates an implicit flag which if > not there disallows any file pages from being pinned. It becomes our new > "longterm" flag. FOLL_PIN _could_ be what we should use "internally". But we > could also just use this implicit vaddr_pin flag and not add a new flag. I'd like to have FOLL_PIN internally, in order to solve the problems that you just raised, above! Namely, that it's too hard to figure out all the cases that gup() is handling. With FOLL_PIN, we know that we should be taking the new pin refcount, and releasing it via the (currently named) put_user_page*(), ultimately. > > Finally, I struggle with converting everyone to a new call. It is more > overhead to use vaddr_pin in the call above because now the GUP code is going > to associate a file pin object with that file when in ODP we don't need that > because the pages can move around. What if the pages in ODP are file-backed? > > This overhead may be fine, not sure in this case, but I don't see everyone > wanting it. I think most callers don't have much choice, otherwise they'll be broken with file-backed memory. thanks,
On 8/13/19 5:51 PM, John Hubbard wrote: > On 8/13/19 2:08 PM, Ira Weiny wrote: >> On Mon, Aug 12, 2019 at 05:07:32PM -0700, John Hubbard wrote: >>> On 8/12/19 4:49 PM, Ira Weiny wrote: >>>> On Sun, Aug 11, 2019 at 06:50:44PM -0700, john.hubbard@gmail.com wrote: >>>>> From: John Hubbard <jhubbard@nvidia.com> >>> ... >> Finally, I struggle with converting everyone to a new call. It is more >> overhead to use vaddr_pin in the call above because now the GUP code is going >> to associate a file pin object with that file when in ODP we don't need that >> because the pages can move around. > > What if the pages in ODP are file-backed? > oops, strike that, you're right: in that case, even the file system case is covered. Don't mind me. :) >> >> This overhead may be fine, not sure in this case, but I don't see everyone >> wanting it. So now I see why you said that, but I will note that ODP hardware is rare, and will likely remain rare: replayable page faults require really special hardware, and after all this time, we still only have CPUs, GPUs, and the Mellanox cards that do it. That leaves a lot of other hardware to take care of. thanks,
On Tue, Aug 13, 2019 at 05:56:31PM -0700, John Hubbard wrote: > On 8/13/19 5:51 PM, John Hubbard wrote: > > On 8/13/19 2:08 PM, Ira Weiny wrote: > >> On Mon, Aug 12, 2019 at 05:07:32PM -0700, John Hubbard wrote: > >>> On 8/12/19 4:49 PM, Ira Weiny wrote: > >>>> On Sun, Aug 11, 2019 at 06:50:44PM -0700, john.hubbard@gmail.com wrote: > >>>>> From: John Hubbard <jhubbard@nvidia.com> > >>> ... > >> Finally, I struggle with converting everyone to a new call. It is more > >> overhead to use vaddr_pin in the call above because now the GUP code is going > >> to associate a file pin object with that file when in ODP we don't need that > >> because the pages can move around. > > > > What if the pages in ODP are file-backed? > > > > oops, strike that, you're right: in that case, even the file system case is covered. > Don't mind me. :) Ok so are we agreed we will drop the patch to the ODP code? I'm going to keep the FOLL_PIN flag and addition in the vaddr_pin_pages. Ira > > >> > >> This overhead may be fine, not sure in this case, but I don't see everyone > >> wanting it. > > So now I see why you said that, but I will note that ODP hardware is rare, > and will likely remain rare: replayable page faults require really special > hardware, and after all this time, we still only have CPUs, GPUs, and the > Mellanox cards that do it. > > That leaves a lot of other hardware to take care of. > > thanks, > -- > John Hubbard > NVIDIA >
On 8/14/19 4:50 PM, Ira Weiny wrote: > On Tue, Aug 13, 2019 at 05:56:31PM -0700, John Hubbard wrote: >> On 8/13/19 5:51 PM, John Hubbard wrote: >>> On 8/13/19 2:08 PM, Ira Weiny wrote: >>>> On Mon, Aug 12, 2019 at 05:07:32PM -0700, John Hubbard wrote: >>>>> On 8/12/19 4:49 PM, Ira Weiny wrote: >>>>>> On Sun, Aug 11, 2019 at 06:50:44PM -0700, john.hubbard@gmail.com wrote: >>>>>>> From: John Hubbard <jhubbard@nvidia.com> >>>>> ... >>>> Finally, I struggle with converting everyone to a new call. It is more >>>> overhead to use vaddr_pin in the call above because now the GUP code is going >>>> to associate a file pin object with that file when in ODP we don't need that >>>> because the pages can move around. >>> >>> What if the pages in ODP are file-backed? >>> >> >> oops, strike that, you're right: in that case, even the file system case is covered. >> Don't mind me. :) > > Ok so are we agreed we will drop the patch to the ODP code? I'm going to keep > the FOLL_PIN flag and addition in the vaddr_pin_pages. > Yes. I hope I'm not overlooking anything, but it all seems to make sense to let ODP just rely on the MMU notifiers. thanks,
On 8/14/19 5:02 PM, John Hubbard wrote: > On 8/14/19 4:50 PM, Ira Weiny wrote: >> On Tue, Aug 13, 2019 at 05:56:31PM -0700, John Hubbard wrote: >>> On 8/13/19 5:51 PM, John Hubbard wrote: >>>> On 8/13/19 2:08 PM, Ira Weiny wrote: >>>>> On Mon, Aug 12, 2019 at 05:07:32PM -0700, John Hubbard wrote: >>>>>> On 8/12/19 4:49 PM, Ira Weiny wrote: >>>>>>> On Sun, Aug 11, 2019 at 06:50:44PM -0700, john.hubbard@gmail.com wrote: >>>>>>>> From: John Hubbard <jhubbard@nvidia.com> >>>>>> ... >>>>> Finally, I struggle with converting everyone to a new call. It is more >>>>> overhead to use vaddr_pin in the call above because now the GUP code is going >>>>> to associate a file pin object with that file when in ODP we don't need that >>>>> because the pages can move around. >>>> >>>> What if the pages in ODP are file-backed? >>>> >>> >>> oops, strike that, you're right: in that case, even the file system case is covered. >>> Don't mind me. :) >> >> Ok so are we agreed we will drop the patch to the ODP code? I'm going to keep >> the FOLL_PIN flag and addition in the vaddr_pin_pages. >> > > Yes. I hope I'm not overlooking anything, but it all seems to make sense to > let ODP just rely on the MMU notifiers. > Hold on, I *was* forgetting something: this was a two part thing, and you're conflating the two points, but they need to remain separate and distinct. There were: 1. FOLL_PIN is necessary because the caller is clearly in the use case that requires it--however briefly they might be there. As Jan described it, "Anything that gets page reference and then touches page data (e.g. direct IO) needs the new kind of tracking so that filesystem knows someone is messing with the page data." [1] 2. Releasing the pin: for ODP, we can use MMU notifiers instead of requiring a lease. This second point does not invalidate the first point. Therefore, I still see the need for the call within ODP, to something that sets FOLL_PIN. And that means either vaddr_pin_[user?]_pages_remote, or some other wrapper of your choice. :) I guess shows that the API might need to be refined. We're trying to solve two closely related issues, but they're not identical. thanks,
On Wed 14-08-19 20:01:07, John Hubbard wrote: > On 8/14/19 5:02 PM, John Hubbard wrote: > > On 8/14/19 4:50 PM, Ira Weiny wrote: > > > On Tue, Aug 13, 2019 at 05:56:31PM -0700, John Hubbard wrote: > > > > On 8/13/19 5:51 PM, John Hubbard wrote: > > > > > On 8/13/19 2:08 PM, Ira Weiny wrote: > > > > > > On Mon, Aug 12, 2019 at 05:07:32PM -0700, John Hubbard wrote: > > > > > > > On 8/12/19 4:49 PM, Ira Weiny wrote: > > > > > > > > On Sun, Aug 11, 2019 at 06:50:44PM -0700, john.hubbard@gmail.com wrote: > > > > > > > > > From: John Hubbard <jhubbard@nvidia.com> > > > > > > > ... > > > > > > Finally, I struggle with converting everyone to a new call. It is more > > > > > > overhead to use vaddr_pin in the call above because now the GUP code is going > > > > > > to associate a file pin object with that file when in ODP we don't need that > > > > > > because the pages can move around. > > > > > > > > > > What if the pages in ODP are file-backed? > > > > > > > > > > > > > oops, strike that, you're right: in that case, even the file system case is covered. > > > > Don't mind me. :) > > > > > > Ok so are we agreed we will drop the patch to the ODP code? I'm going to keep > > > the FOLL_PIN flag and addition in the vaddr_pin_pages. > > > > > > > Yes. I hope I'm not overlooking anything, but it all seems to make sense to > > let ODP just rely on the MMU notifiers. > > > > Hold on, I *was* forgetting something: this was a two part thing, and > you're conflating the two points, but they need to remain separate and > distinct. There were: > > 1. FOLL_PIN is necessary because the caller is clearly in the use case that > requires it--however briefly they might be there. As Jan described it, > > "Anything that gets page reference and then touches page data (e.g. > direct IO) needs the new kind of tracking so that filesystem knows > someone is messing with the page data." [1] So when the GUP user uses MMU notifiers to stop writing to pages whenever they are writeprotected with page_mkclean(), they don't really need page pin - their access is then fully equivalent to any other mmap userspace access and filesystem knows how to deal with those. I forgot out this case when I wrote the above sentence. So to sum up there are three cases: 1) DIO case - GUP references to pages serving as DIO buffers are needed for relatively short time, no special synchronization with page_mkclean() or munmap() => needs FOLL_PIN 2) RDMA case - GUP references to pages serving as DMA buffers needed for a long time, no special synchronization with page_mkclean() or munmap() => needs FOLL_PIN | FOLL_LONGTERM This case has also a special case when the pages are actually DAX. Then the caller additionally needs file lease and additional file_pin structure is used for tracking this usage. 3) ODP case - GUP references to pages serving as DMA buffers, MMU notifiers used to synchronize with page_mkclean() and munmap() => normal page references are fine. Honza
On Thu 15-08-19 15:26:22, Jan Kara wrote: > On Wed 14-08-19 20:01:07, John Hubbard wrote: > > On 8/14/19 5:02 PM, John Hubbard wrote: > > > On 8/14/19 4:50 PM, Ira Weiny wrote: > > > > On Tue, Aug 13, 2019 at 05:56:31PM -0700, John Hubbard wrote: > > > > > On 8/13/19 5:51 PM, John Hubbard wrote: > > > > > > On 8/13/19 2:08 PM, Ira Weiny wrote: > > > > > > > On Mon, Aug 12, 2019 at 05:07:32PM -0700, John Hubbard wrote: > > > > > > > > On 8/12/19 4:49 PM, Ira Weiny wrote: > > > > > > > > > On Sun, Aug 11, 2019 at 06:50:44PM -0700, john.hubbard@gmail.com wrote: > > > > > > > > > > From: John Hubbard <jhubbard@nvidia.com> > > > > > > > > ... > > > > > > > Finally, I struggle with converting everyone to a new call. It is more > > > > > > > overhead to use vaddr_pin in the call above because now the GUP code is going > > > > > > > to associate a file pin object with that file when in ODP we don't need that > > > > > > > because the pages can move around. > > > > > > > > > > > > What if the pages in ODP are file-backed? > > > > > > > > > > > > > > > > oops, strike that, you're right: in that case, even the file system case is covered. > > > > > Don't mind me. :) > > > > > > > > Ok so are we agreed we will drop the patch to the ODP code? I'm going to keep > > > > the FOLL_PIN flag and addition in the vaddr_pin_pages. > > > > > > > > > > Yes. I hope I'm not overlooking anything, but it all seems to make sense to > > > let ODP just rely on the MMU notifiers. > > > > > > > Hold on, I *was* forgetting something: this was a two part thing, and > > you're conflating the two points, but they need to remain separate and > > distinct. There were: > > > > 1. FOLL_PIN is necessary because the caller is clearly in the use case that > > requires it--however briefly they might be there. As Jan described it, > > > > "Anything that gets page reference and then touches page data (e.g. > > direct IO) needs the new kind of tracking so that filesystem knows > > someone is messing with the page data." [1] > > So when the GUP user uses MMU notifiers to stop writing to pages whenever > they are writeprotected with page_mkclean(), they don't really need page > pin - their access is then fully equivalent to any other mmap userspace > access and filesystem knows how to deal with those. I forgot out this case > when I wrote the above sentence. > > So to sum up there are three cases: > 1) DIO case - GUP references to pages serving as DIO buffers are needed for > relatively short time, no special synchronization with page_mkclean() or > munmap() => needs FOLL_PIN > 2) RDMA case - GUP references to pages serving as DMA buffers needed for a > long time, no special synchronization with page_mkclean() or munmap() > => needs FOLL_PIN | FOLL_LONGTERM > This case has also a special case when the pages are actually DAX. Then > the caller additionally needs file lease and additional file_pin > structure is used for tracking this usage. > 3) ODP case - GUP references to pages serving as DMA buffers, MMU notifiers > used to synchronize with page_mkclean() and munmap() => normal page > references are fine. I want to add that I'd like to convert users in cases 1) and 2) from using GUP to using differently named function. Users in case 3) can stay as they are for now although ultimately I'd like to denote such use cases in a special way as well... Honza
On Thu, Aug 15, 2019 at 03:35:10PM +0200, Jan Kara wrote: > > 3) ODP case - GUP references to pages serving as DMA buffers, MMU notifiers > > used to synchronize with page_mkclean() and munmap() => normal page > > references are fine. > > I want to add that I'd like to convert users in cases 1) and 2) from using > GUP to using differently named function. Users in case 3) can stay as they > are for now although ultimately I'd like to denote such use cases in a > special way as well... 3) users also want a special function and path, right now it is called hmm_range_fault() but perhaps it would be good to harmonize it more with the GUP infrastructure? I'm not quite sure what the best plan for that is yet. Jason
On Thu, Aug 15, 2019 at 03:35:10PM +0200, Jan Kara wrote: > On Thu 15-08-19 15:26:22, Jan Kara wrote: > > On Wed 14-08-19 20:01:07, John Hubbard wrote: > > > On 8/14/19 5:02 PM, John Hubbard wrote: > > > > > > Hold on, I *was* forgetting something: this was a two part thing, and > > > you're conflating the two points, but they need to remain separate and > > > distinct. There were: > > > > > > 1. FOLL_PIN is necessary because the caller is clearly in the use case that > > > requires it--however briefly they might be there. As Jan described it, > > > > > > "Anything that gets page reference and then touches page data (e.g. > > > direct IO) needs the new kind of tracking so that filesystem knows > > > someone is messing with the page data." [1] > > > > So when the GUP user uses MMU notifiers to stop writing to pages whenever > > they are writeprotected with page_mkclean(), they don't really need page > > pin - their access is then fully equivalent to any other mmap userspace > > access and filesystem knows how to deal with those. I forgot out this case > > when I wrote the above sentence. > > > > So to sum up there are three cases: > > 1) DIO case - GUP references to pages serving as DIO buffers are needed for > > relatively short time, no special synchronization with page_mkclean() or > > munmap() => needs FOLL_PIN > > 2) RDMA case - GUP references to pages serving as DMA buffers needed for a > > long time, no special synchronization with page_mkclean() or munmap() > > => needs FOLL_PIN | FOLL_LONGTERM > > This case has also a special case when the pages are actually DAX. Then > > the caller additionally needs file lease and additional file_pin > > structure is used for tracking this usage. > > 3) ODP case - GUP references to pages serving as DMA buffers, MMU notifiers > > used to synchronize with page_mkclean() and munmap() => normal page > > references are fine. > > I want to add that I'd like to convert users in cases 1) and 2) from using > GUP to using differently named function. Users in case 3) can stay as they > are for now although ultimately I'd like to denote such use cases in a > special way as well... > Ok just to make this clear I threw up my current tree with your patches here: https://github.com/weiny2/linux-kernel/commits/mmotm-rdmafsdax-b0-v4 I'm talking about dropping the final patch: 05fd2d3afa6b rdma/umem_odp: Use vaddr_pin_pages_remote() in ODP The other 2 can stay. I split out the *_remote() call. We don't have a user but I'll keep it around for a bit. This tree is still WIP as I work through all the comments. So I've not changed names or variable types etc... Just wanted to settle this. Ira
On 8/15/19 10:32 AM, Ira Weiny wrote: > On Thu, Aug 15, 2019 at 03:35:10PM +0200, Jan Kara wrote: >> On Thu 15-08-19 15:26:22, Jan Kara wrote: >>> On Wed 14-08-19 20:01:07, John Hubbard wrote: >>>> On 8/14/19 5:02 PM, John Hubbard wrote: >>>> >>>> Hold on, I *was* forgetting something: this was a two part thing, and >>>> you're conflating the two points, but they need to remain separate and >>>> distinct. There were: >>>> >>>> 1. FOLL_PIN is necessary because the caller is clearly in the use case that >>>> requires it--however briefly they might be there. As Jan described it, >>>> >>>> "Anything that gets page reference and then touches page data (e.g. >>>> direct IO) needs the new kind of tracking so that filesystem knows >>>> someone is messing with the page data." [1] >>> >>> So when the GUP user uses MMU notifiers to stop writing to pages whenever >>> they are writeprotected with page_mkclean(), they don't really need page >>> pin - their access is then fully equivalent to any other mmap userspace >>> access and filesystem knows how to deal with those. I forgot out this case >>> when I wrote the above sentence. >>> >>> So to sum up there are three cases: >>> 1) DIO case - GUP references to pages serving as DIO buffers are needed for >>> relatively short time, no special synchronization with page_mkclean() or >>> munmap() => needs FOLL_PIN >>> 2) RDMA case - GUP references to pages serving as DMA buffers needed for a >>> long time, no special synchronization with page_mkclean() or munmap() >>> => needs FOLL_PIN | FOLL_LONGTERM >>> This case has also a special case when the pages are actually DAX. Then >>> the caller additionally needs file lease and additional file_pin >>> structure is used for tracking this usage. >>> 3) ODP case - GUP references to pages serving as DMA buffers, MMU notifiers >>> used to synchronize with page_mkclean() and munmap() => normal page >>> references are fine. >> Thanks Jan, once again, for clarifying all of this! >> I want to add that I'd like to convert users in cases 1) and 2) from using >> GUP to using differently named function. Users in case 3) can stay as they >> are for now although ultimately I'd like to denote such use cases in a >> special way as well... >> > > Ok just to make this clear I threw up my current tree with your patches here: > > https://github.com/weiny2/linux-kernel/commits/mmotm-rdmafsdax-b0-v4 > > I'm talking about dropping the final patch: > 05fd2d3afa6b rdma/umem_odp: Use vaddr_pin_pages_remote() in ODP > > The other 2 can stay. I split out the *_remote() call. We don't have a user > but I'll keep it around for a bit. > > This tree is still WIP as I work through all the comments. So I've not changed > names or variable types etc... Just wanted to settle this. > Right. And now that ODP is not a user, I'll take a quick look through my other call site conversions and see if I can find an easy one, to include here as the first user of vaddr_pin_pages_remote(). I'll send it your way if that works out. thanks,
On 8/15/19 10:41 AM, John Hubbard wrote: > On 8/15/19 10:32 AM, Ira Weiny wrote: >> On Thu, Aug 15, 2019 at 03:35:10PM +0200, Jan Kara wrote: >>> On Thu 15-08-19 15:26:22, Jan Kara wrote: >>>> On Wed 14-08-19 20:01:07, John Hubbard wrote: >>>>> On 8/14/19 5:02 PM, John Hubbard wrote: ... >> Ok just to make this clear I threw up my current tree with your patches here: >> >> https://github.com/weiny2/linux-kernel/commits/mmotm-rdmafsdax-b0-v4 >> >> I'm talking about dropping the final patch: >> 05fd2d3afa6b rdma/umem_odp: Use vaddr_pin_pages_remote() in ODP >> >> The other 2 can stay. I split out the *_remote() call. We don't have a user >> but I'll keep it around for a bit. >> >> This tree is still WIP as I work through all the comments. So I've not changed >> names or variable types etc... Just wanted to settle this. >> > > Right. And now that ODP is not a user, I'll take a quick look through my other > call site conversions and see if I can find an easy one, to include here as > the first user of vaddr_pin_pages_remote(). I'll send it your way if that > works out. > OK, there was only process_vm_access.c, plus (sort of) Bharath's sgi-gru patch, maybe eventually [1]. But looking at process_vm_access.c, I think it is one of the patches that is no longer applicable, and I can just drop it entirely...I'd welcome a second opinion on that... So we might be all out of potential users for vaddr_pin_pages_remote()! For quick reference, it looks like this: diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index 357aa7bef6c0..4d29d54ec93f 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -96,7 +96,7 @@ static int process_vm_rw_single_vec(unsigned long addr, flags |= FOLL_WRITE; while (!rc && nr_pages && iov_iter_count(iter)) { - int pages = min(nr_pages, max_pages_per_loop); + int pinned_pages = min(nr_pages, max_pages_per_loop); int locked = 1; size_t bytes; @@ -106,14 +106,15 @@ static int process_vm_rw_single_vec(unsigned long addr, * current/current->mm */ down_read(&mm->mmap_sem); - pages = get_user_pages_remote(task, mm, pa, pages, flags, - process_pages, NULL, &locked); + pinned_pages = get_user_pages_remote(task, mm, pa, pinned_pages, + flags, process_pages, NULL, + &locked); if (locked) up_read(&mm->mmap_sem); - if (pages <= 0) + if (pinned_pages <= 0) return -EFAULT; - bytes = pages * PAGE_SIZE - start_offset; + bytes = pinned_pages * PAGE_SIZE - start_offset; if (bytes > len) bytes = len; @@ -122,10 +123,9 @@ static int process_vm_rw_single_vec(unsigned long addr, vm_write); len -= bytes; start_offset = 0; - nr_pages -= pages; - pa += pages * PAGE_SIZE; - while (pages) - put_page(process_pages[--pages]); + nr_pages -= pinned_pages; + pa += pinned_pages * PAGE_SIZE; + put_user_pages(process_pages, pinned_pages); } return rc; [1] https://lore.kernel.org/r/1565379497-29266-2-git-send-email-linux.bhar@gmail.com thanks,
On 8/15/19 3:35 PM, Jan Kara wrote: >> >> So when the GUP user uses MMU notifiers to stop writing to pages whenever >> they are writeprotected with page_mkclean(), they don't really need page >> pin - their access is then fully equivalent to any other mmap userspace >> access and filesystem knows how to deal with those. I forgot out this case >> when I wrote the above sentence. >> >> So to sum up there are three cases: >> 1) DIO case - GUP references to pages serving as DIO buffers are needed for >> relatively short time, no special synchronization with page_mkclean() or >> munmap() => needs FOLL_PIN >> 2) RDMA case - GUP references to pages serving as DMA buffers needed for a >> long time, no special synchronization with page_mkclean() or munmap() >> => needs FOLL_PIN | FOLL_LONGTERM >> This case has also a special case when the pages are actually DAX. Then >> the caller additionally needs file lease and additional file_pin >> structure is used for tracking this usage. >> 3) ODP case - GUP references to pages serving as DMA buffers, MMU notifiers >> used to synchronize with page_mkclean() and munmap() => normal page >> references are fine. IMHO the munlock lesson told us about another one, that's in the end equivalent to 3) 4) pinning for struct page manipulation only => normal page references are fine > I want to add that I'd like to convert users in cases 1) and 2) from using > GUP to using differently named function. Users in case 3) can stay as they > are for now although ultimately I'd like to denote such use cases in a > special way as well... So after 1/2/3 is renamed/specially denoted, only 4) keeps the current interface? > Honza >
On Thu 15-08-19 19:14:08, John Hubbard wrote: > On 8/15/19 10:41 AM, John Hubbard wrote: > > On 8/15/19 10:32 AM, Ira Weiny wrote: > >> On Thu, Aug 15, 2019 at 03:35:10PM +0200, Jan Kara wrote: > >>> On Thu 15-08-19 15:26:22, Jan Kara wrote: > >>>> On Wed 14-08-19 20:01:07, John Hubbard wrote: > >>>>> On 8/14/19 5:02 PM, John Hubbard wrote: > ... > >> Ok just to make this clear I threw up my current tree with your patches here: > >> > >> https://github.com/weiny2/linux-kernel/commits/mmotm-rdmafsdax-b0-v4 > >> > >> I'm talking about dropping the final patch: > >> 05fd2d3afa6b rdma/umem_odp: Use vaddr_pin_pages_remote() in ODP > >> > >> The other 2 can stay. I split out the *_remote() call. We don't have a user > >> but I'll keep it around for a bit. > >> > >> This tree is still WIP as I work through all the comments. So I've not changed > >> names or variable types etc... Just wanted to settle this. > >> > > > > Right. And now that ODP is not a user, I'll take a quick look through my other > > call site conversions and see if I can find an easy one, to include here as > > the first user of vaddr_pin_pages_remote(). I'll send it your way if that > > works out. > > > > OK, there was only process_vm_access.c, plus (sort of) Bharath's sgi-gru > patch, maybe eventually [1]. But looking at process_vm_access.c, I think > it is one of the patches that is no longer applicable, and I can just > drop it entirely...I'd welcome a second opinion on that... I don't think you can drop the patch. process_vm_rw_pages() clearly touches page contents and does not synchronize with page_mkclean(). So it is case 1) and needs FOLL_PIN semantics. Honza
On Fri 16-08-19 10:47:21, Vlastimil Babka wrote: > On 8/15/19 3:35 PM, Jan Kara wrote: > >> > >> So when the GUP user uses MMU notifiers to stop writing to pages whenever > >> they are writeprotected with page_mkclean(), they don't really need page > >> pin - their access is then fully equivalent to any other mmap userspace > >> access and filesystem knows how to deal with those. I forgot out this case > >> when I wrote the above sentence. > >> > >> So to sum up there are three cases: > >> 1) DIO case - GUP references to pages serving as DIO buffers are needed for > >> relatively short time, no special synchronization with page_mkclean() or > >> munmap() => needs FOLL_PIN > >> 2) RDMA case - GUP references to pages serving as DMA buffers needed for a > >> long time, no special synchronization with page_mkclean() or munmap() > >> => needs FOLL_PIN | FOLL_LONGTERM > >> This case has also a special case when the pages are actually DAX. Then > >> the caller additionally needs file lease and additional file_pin > >> structure is used for tracking this usage. > >> 3) ODP case - GUP references to pages serving as DMA buffers, MMU notifiers > >> used to synchronize with page_mkclean() and munmap() => normal page > >> references are fine. > > IMHO the munlock lesson told us about another one, that's in the end equivalent > to 3) > > 4) pinning for struct page manipulation only => normal page references > are fine Right, it's good to have this for clarity. > > I want to add that I'd like to convert users in cases 1) and 2) from using > > GUP to using differently named function. Users in case 3) can stay as they > > are for now although ultimately I'd like to denote such use cases in a > > special way as well... > > So after 1/2/3 is renamed/specially denoted, only 4) keeps the current > interface? Well, munlock() code doesn't even use GUP, just follow_page(). I'd wait to see what's left after handling cases 1), 2), and 3) to decide about the interface for the remainder. Honza
On Fri, Aug 16, 2019 at 05:44:04PM +0200, Jan Kara wrote: > On Fri 16-08-19 10:47:21, Vlastimil Babka wrote: > > On 8/15/19 3:35 PM, Jan Kara wrote: > > >> > > >> So when the GUP user uses MMU notifiers to stop writing to pages whenever > > >> they are writeprotected with page_mkclean(), they don't really need page > > >> pin - their access is then fully equivalent to any other mmap userspace > > >> access and filesystem knows how to deal with those. I forgot out this case > > >> when I wrote the above sentence. > > >> > > >> So to sum up there are three cases: > > >> 1) DIO case - GUP references to pages serving as DIO buffers are needed for > > >> relatively short time, no special synchronization with page_mkclean() or > > >> munmap() => needs FOLL_PIN > > >> 2) RDMA case - GUP references to pages serving as DMA buffers needed for a > > >> long time, no special synchronization with page_mkclean() or munmap() > > >> => needs FOLL_PIN | FOLL_LONGTERM > > >> This case has also a special case when the pages are actually DAX. Then > > >> the caller additionally needs file lease and additional file_pin > > >> structure is used for tracking this usage. > > >> 3) ODP case - GUP references to pages serving as DMA buffers, MMU notifiers > > >> used to synchronize with page_mkclean() and munmap() => normal page > > >> references are fine. > > > > IMHO the munlock lesson told us about another one, that's in the end equivalent > > to 3) > > > > 4) pinning for struct page manipulation only => normal page references > > are fine > > Right, it's good to have this for clarity. > > > > I want to add that I'd like to convert users in cases 1) and 2) from using > > > GUP to using differently named function. Users in case 3) can stay as they > > > are for now although ultimately I'd like to denote such use cases in a > > > special way as well... > > > > So after 1/2/3 is renamed/specially denoted, only 4) keeps the current > > interface? > > Well, munlock() code doesn't even use GUP, just follow_page(). I'd wait to > see what's left after handling cases 1), 2), and 3) to decide about the > interface for the remainder. > For 3 we do not need to take a reference at all :) So just forget about 3 it does not exist. For 3 the reference is the reference the CPU page table has on the page and that's it. GUP is no longer involve in ODP or anything like that. Cheers, Jérôme
On Fri 16-08-19 11:52:20, Jerome Glisse wrote: > On Fri, Aug 16, 2019 at 05:44:04PM +0200, Jan Kara wrote: > > On Fri 16-08-19 10:47:21, Vlastimil Babka wrote: > > > On 8/15/19 3:35 PM, Jan Kara wrote: > > > >> > > > >> So when the GUP user uses MMU notifiers to stop writing to pages whenever > > > >> they are writeprotected with page_mkclean(), they don't really need page > > > >> pin - their access is then fully equivalent to any other mmap userspace > > > >> access and filesystem knows how to deal with those. I forgot out this case > > > >> when I wrote the above sentence. > > > >> > > > >> So to sum up there are three cases: > > > >> 1) DIO case - GUP references to pages serving as DIO buffers are needed for > > > >> relatively short time, no special synchronization with page_mkclean() or > > > >> munmap() => needs FOLL_PIN > > > >> 2) RDMA case - GUP references to pages serving as DMA buffers needed for a > > > >> long time, no special synchronization with page_mkclean() or munmap() > > > >> => needs FOLL_PIN | FOLL_LONGTERM > > > >> This case has also a special case when the pages are actually DAX. Then > > > >> the caller additionally needs file lease and additional file_pin > > > >> structure is used for tracking this usage. > > > >> 3) ODP case - GUP references to pages serving as DMA buffers, MMU notifiers > > > >> used to synchronize with page_mkclean() and munmap() => normal page > > > >> references are fine. > > > > > > IMHO the munlock lesson told us about another one, that's in the end equivalent > > > to 3) > > > > > > 4) pinning for struct page manipulation only => normal page references > > > are fine > > > > Right, it's good to have this for clarity. > > > > > > I want to add that I'd like to convert users in cases 1) and 2) from using > > > > GUP to using differently named function. Users in case 3) can stay as they > > > > are for now although ultimately I'd like to denote such use cases in a > > > > special way as well... > > > > > > So after 1/2/3 is renamed/specially denoted, only 4) keeps the current > > > interface? > > > > Well, munlock() code doesn't even use GUP, just follow_page(). I'd wait to > > see what's left after handling cases 1), 2), and 3) to decide about the > > interface for the remainder. > > > > For 3 we do not need to take a reference at all :) So just forget about 3 > it does not exist. For 3 the reference is the reference the CPU page table > has on the page and that's it. GUP is no longer involve in ODP or anything > like that. Yes, I understand. But the fact is that GUP calls are currently still there e.g. in ODP code. If you can make the code work without taking a page reference at all, I'm only happy :) Honza
On Fri, Aug 16, 2019 at 06:13:55PM +0200, Jan Kara wrote: > > For 3 we do not need to take a reference at all :) So just forget about 3 > > it does not exist. For 3 the reference is the reference the CPU page table > > has on the page and that's it. GUP is no longer involve in ODP or anything > > like that. > > Yes, I understand. But the fact is that GUP calls are currently still there > e.g. in ODP code. If you can make the code work without taking a page > reference at all, I'm only happy :) We are working on it :) Jason
On Fri, Aug 16, 2019 at 06:13:55PM +0200, Jan Kara wrote: > On Fri 16-08-19 11:52:20, Jerome Glisse wrote: > > On Fri, Aug 16, 2019 at 05:44:04PM +0200, Jan Kara wrote: > > > On Fri 16-08-19 10:47:21, Vlastimil Babka wrote: > > > > On 8/15/19 3:35 PM, Jan Kara wrote: > > > > >> > > > > >> So when the GUP user uses MMU notifiers to stop writing to pages whenever > > > > >> they are writeprotected with page_mkclean(), they don't really need page > > > > >> pin - their access is then fully equivalent to any other mmap userspace > > > > >> access and filesystem knows how to deal with those. I forgot out this case > > > > >> when I wrote the above sentence. > > > > >> > > > > >> So to sum up there are three cases: > > > > >> 1) DIO case - GUP references to pages serving as DIO buffers are needed for > > > > >> relatively short time, no special synchronization with page_mkclean() or > > > > >> munmap() => needs FOLL_PIN > > > > >> 2) RDMA case - GUP references to pages serving as DMA buffers needed for a > > > > >> long time, no special synchronization with page_mkclean() or munmap() > > > > >> => needs FOLL_PIN | FOLL_LONGTERM > > > > >> This case has also a special case when the pages are actually DAX. Then > > > > >> the caller additionally needs file lease and additional file_pin > > > > >> structure is used for tracking this usage. > > > > >> 3) ODP case - GUP references to pages serving as DMA buffers, MMU notifiers > > > > >> used to synchronize with page_mkclean() and munmap() => normal page > > > > >> references are fine. > > > > > > > > IMHO the munlock lesson told us about another one, that's in the end equivalent > > > > to 3) > > > > > > > > 4) pinning for struct page manipulation only => normal page references > > > > are fine > > > > > > Right, it's good to have this for clarity. > > > > > > > > I want to add that I'd like to convert users in cases 1) and 2) from using > > > > > GUP to using differently named function. Users in case 3) can stay as they > > > > > are for now although ultimately I'd like to denote such use cases in a > > > > > special way as well... > > > > > > > > So after 1/2/3 is renamed/specially denoted, only 4) keeps the current > > > > interface? > > > > > > Well, munlock() code doesn't even use GUP, just follow_page(). I'd wait to > > > see what's left after handling cases 1), 2), and 3) to decide about the > > > interface for the remainder. > > > > > > > For 3 we do not need to take a reference at all :) So just forget about 3 > > it does not exist. For 3 the reference is the reference the CPU page table > > has on the page and that's it. GUP is no longer involve in ODP or anything > > like that. > > Yes, I understand. But the fact is that GUP calls are currently still there > e.g. in ODP code. If you can make the code work without taking a page > reference at all, I'm only happy :) Already in rdma next AFAIK so in 5.4 it will be gone :) i have been removing all GUP users that do not need reference. Intel i915 driver is a left over i will work some more with them to get rid of it too. Cheers, Jérôme
On Fri, Aug 16, 2019 at 12:54:45PM -0400, Jerome Glisse wrote: > > Yes, I understand. But the fact is that GUP calls are currently still there > > e.g. in ODP code. If you can make the code work without taking a page > > reference at all, I'm only happy :) > > Already in rdma next AFAIK so in 5.4 it will be gone :) Unfortunately no.. only a lot of patches supporting this change will be in 5.4. The notifiers are still a problem, and I need to figure out if the edge cases in hmm_range_fault are OK for ODP or not. :( This is taking a long time in part because ODP itself has all sorts of problems that make it hard to tell if the other changes are safe or not.. Lots of effort is being spent to get there though. Jason
On Fri, Aug 16, 2019 at 05:41:08PM +0200, Jan Kara wrote: > On Thu 15-08-19 19:14:08, John Hubbard wrote: > > On 8/15/19 10:41 AM, John Hubbard wrote: > > > On 8/15/19 10:32 AM, Ira Weiny wrote: > > >> On Thu, Aug 15, 2019 at 03:35:10PM +0200, Jan Kara wrote: > > >>> On Thu 15-08-19 15:26:22, Jan Kara wrote: > > >>>> On Wed 14-08-19 20:01:07, John Hubbard wrote: > > >>>>> On 8/14/19 5:02 PM, John Hubbard wrote: > > ... > > >> Ok just to make this clear I threw up my current tree with your patches here: > > >> > > >> https://github.com/weiny2/linux-kernel/commits/mmotm-rdmafsdax-b0-v4 > > >> > > >> I'm talking about dropping the final patch: > > >> 05fd2d3afa6b rdma/umem_odp: Use vaddr_pin_pages_remote() in ODP > > >> > > >> The other 2 can stay. I split out the *_remote() call. We don't have a user > > >> but I'll keep it around for a bit. > > >> > > >> This tree is still WIP as I work through all the comments. So I've not changed > > >> names or variable types etc... Just wanted to settle this. > > >> > > > > > > Right. And now that ODP is not a user, I'll take a quick look through my other > > > call site conversions and see if I can find an easy one, to include here as > > > the first user of vaddr_pin_pages_remote(). I'll send it your way if that > > > works out. > > > > > > > OK, there was only process_vm_access.c, plus (sort of) Bharath's sgi-gru > > patch, maybe eventually [1]. But looking at process_vm_access.c, I think > > it is one of the patches that is no longer applicable, and I can just > > drop it entirely...I'd welcome a second opinion on that... > > I don't think you can drop the patch. process_vm_rw_pages() clearly touches > page contents and does not synchronize with page_mkclean(). So it is case > 1) and needs FOLL_PIN semantics. John could you send a formal patch using vaddr_pin* and I'll add it to the tree? Ira > > Honza > -- > Jan Kara <jack@suse.com> > SUSE Labs, CR >
On 8/16/19 11:33 AM, Ira Weiny wrote: > On Fri, Aug 16, 2019 at 05:41:08PM +0200, Jan Kara wrote: >> On Thu 15-08-19 19:14:08, John Hubbard wrote: >>> On 8/15/19 10:41 AM, John Hubbard wrote: >>>> On 8/15/19 10:32 AM, Ira Weiny wrote: >>>>> On Thu, Aug 15, 2019 at 03:35:10PM +0200, Jan Kara wrote: >>>>>> On Thu 15-08-19 15:26:22, Jan Kara wrote: >>>>>>> On Wed 14-08-19 20:01:07, John Hubbard wrote: >>>>>>>> On 8/14/19 5:02 PM, John Hubbard wrote: >>> ... >>> >>> OK, there was only process_vm_access.c, plus (sort of) Bharath's sgi-gru >>> patch, maybe eventually [1]. But looking at process_vm_access.c, I think >>> it is one of the patches that is no longer applicable, and I can just >>> drop it entirely...I'd welcome a second opinion on that... >> >> I don't think you can drop the patch. process_vm_rw_pages() clearly touches >> page contents and does not synchronize with page_mkclean(). So it is case >> 1) and needs FOLL_PIN semantics. > > John could you send a formal patch using vaddr_pin* and I'll add it to the > tree? > Yes...hints about which struct file to use here are very welcome, btw. This part of mm is fairly new to me. thanks,
On Fri, Aug 16, 2019 at 11:50:09AM -0700, John Hubbard wrote: > On 8/16/19 11:33 AM, Ira Weiny wrote: > > On Fri, Aug 16, 2019 at 05:41:08PM +0200, Jan Kara wrote: > > > On Thu 15-08-19 19:14:08, John Hubbard wrote: > > > > On 8/15/19 10:41 AM, John Hubbard wrote: > > > > > On 8/15/19 10:32 AM, Ira Weiny wrote: > > > > > > On Thu, Aug 15, 2019 at 03:35:10PM +0200, Jan Kara wrote: > > > > > > > On Thu 15-08-19 15:26:22, Jan Kara wrote: > > > > > > > > On Wed 14-08-19 20:01:07, John Hubbard wrote: > > > > > > > > > On 8/14/19 5:02 PM, John Hubbard wrote: > > > > ... > > > > > > > > OK, there was only process_vm_access.c, plus (sort of) Bharath's sgi-gru > > > > patch, maybe eventually [1]. But looking at process_vm_access.c, I think > > > > it is one of the patches that is no longer applicable, and I can just > > > > drop it entirely...I'd welcome a second opinion on that... > > > > > > I don't think you can drop the patch. process_vm_rw_pages() clearly touches > > > page contents and does not synchronize with page_mkclean(). So it is case > > > 1) and needs FOLL_PIN semantics. > > > > John could you send a formal patch using vaddr_pin* and I'll add it to the > > tree? > > > > Yes...hints about which struct file to use here are very welcome, btw. This part > of mm is fairly new to me. I'm still working out the final semantics of vaddr_pin*. But right now you don't need a vaddr_pin if you don't specify FOLL_LONGTERM. Since case 1, this case, does not need FOLL_LONGTERM I think it is safe to simply pass NULL here. OTOH we could just track this against the mm_struct. But I don't think we need to because this pin should be transient. And this is why I keep leaning toward _not_ putting these flags in the vaddr_pin*() calls. I know this is what I did but I think I'm wrong. It should be the caller specifying what they want and the vaddr_pin*() calls check that what they are asking for is correct. Ira > > thanks, > -- > John Hubbard > NVIDIA
On 8/16/19 2:59 PM, Ira Weiny wrote: > On Fri, Aug 16, 2019 at 11:50:09AM -0700, John Hubbard wrote: ... >>> John could you send a formal patch using vaddr_pin* and I'll add it to the >>> tree? >>> >> >> Yes...hints about which struct file to use here are very welcome, btw. This part >> of mm is fairly new to me. > > I'm still working out the final semantics of vaddr_pin*. But right now you > don't need a vaddr_pin if you don't specify FOLL_LONGTERM. > ah OK. > Since case 1, this case, does not need FOLL_LONGTERM I think it is safe to > simply pass NULL here. > > OTOH we could just track this against the mm_struct. But I don't think we need > to because this pin should be transient. > Thanks for looking at that, I'm definitely in learning mode here. > And this is why I keep leaning toward _not_ putting these flags in the > vaddr_pin*() calls. I know this is what I did but I think I'm wrong. It should > be the caller specifying what they want and the vaddr_pin*() calls check that > what they are asking for is correct. > Yes. I think we're nearly done finding the right balance of wrapper calls and FOLL_* flags. I've seen Jan and others asking that the call sites do *not* set the flags, but we also know that FOLL_PIN and FOLL_LONGTERM need to vary independently. That means either: a) another trivial wrapper calls, on top of vaddr_pin_*(), for each supported combination of FOLL_PIN and FOLL_LONGTERM, or b) just setting FOLL_PIN and FOLL_LONGTERM at each callsite. I think either way is easy to grep for, so it's hard to get too excited (fortunately) about which one to pick. Let's start simple with (b) and it's easy to convert later if someone wants that. Meanwhile, we do need to pull the flag setting out of vaddr_pin_pages(). So I will post these small patches for your mmotm-rdmafsdax-b0-v4 branch, shortly: 1) Add FOLL_PIN --also I guess it's time to add comments documenting FOLL_PIN and FOLL_LONGTERM use, stealing Jan's and others' wording for the 4 cases, from earlier. :) 2) Add vaddr_pin_user_pages_remote(), which will not set FOLL_PIN or FOLL_LONGTERM itself. And add the caller, which will. thanks,
diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index 53085896d718..fdff034a8a30 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -534,7 +534,7 @@ static int ib_umem_odp_map_dma_single_page( } out: - put_user_page(page); + vaddr_unpin_pages(&page, 1, &umem_odp->umem.vaddr_pin); if (remove_existing_mapping) { ib_umem_notifier_start_account(umem_odp); @@ -635,9 +635,10 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt, * complex (and doesn't gain us much performance in most use * cases). */ - npages = get_user_pages_remote(owning_process, owning_mm, + npages = vaddr_pin_pages_remote(owning_process, owning_mm, user_virt, gup_num_pages, - flags, local_page_list, NULL, NULL); + flags, local_page_list, NULL, NULL, + &umem_odp->umem.vaddr_pin); up_read(&owning_mm->mmap_sem); if (npages < 0) { @@ -657,7 +658,8 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt, ret = -EFAULT; break; } - put_user_page(local_page_list[j]); + vaddr_unpin_pages(&local_page_list[j], 1, + &umem_odp->umem.vaddr_pin); continue; } @@ -684,8 +686,9 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt, * ib_umem_odp_map_dma_single_page(). */ if (npages - (j + 1) > 0) - put_user_pages(&local_page_list[j+1], - npages - (j + 1)); + vaddr_unpin_pages(&local_page_list[j+1], + npages - (j + 1), + &umem_odp->umem.vaddr_pin); break; } } diff --git a/include/linux/mm.h b/include/linux/mm.h index 61b616cd9243..2bd76ad8787e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1606,6 +1606,13 @@ int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc, long vaddr_pin_pages(unsigned long addr, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, struct vaddr_pin *vaddr_pin); +long vaddr_pin_pages_remote(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + unsigned int gup_flags, struct page **pages, + struct vm_area_struct **vmas, int *locked, + struct vaddr_pin *vaddr_pin); +void vaddr_unpin_pages(struct page **pages, unsigned long nr_pages, + struct vaddr_pin *vaddr_pin); void vaddr_unpin_pages_dirty_lock(struct page **pages, unsigned long nr_pages, struct vaddr_pin *vaddr_pin, bool make_dirty); bool mapping_inode_has_layout(struct vaddr_pin *vaddr_pin, struct page *page); diff --git a/mm/gup.c b/mm/gup.c index 85f09958fbdc..bb95adfaf9b6 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2518,6 +2518,38 @@ long vaddr_pin_pages(unsigned long addr, unsigned long nr_pages, } EXPORT_SYMBOL(vaddr_pin_pages); +/** + * vaddr_pin_pages pin pages by virtual address and return the pages to the + * user. + * + * @tsk: the task_struct to use for page fault accounting, or + * NULL if faults are not to be recorded. + * @mm: mm_struct of target mm + * @addr: start address + * @nr_pages: number of pages to pin + * @gup_flags: flags to use for the pin + * @pages: array of pages returned + * @vaddr_pin: initialized meta information this pin is to be associated + * with. + * + * This is the "vaddr_pin_pages" corresponding variant to + * get_user_pages_remote(), but with FOLL_PIN semantics: the implementation sets + * FOLL_PIN. That, in turn, means that the pages must ultimately be released + * by put_user_page(). + */ +long vaddr_pin_pages_remote(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + unsigned int gup_flags, struct page **pages, + struct vm_area_struct **vmas, int *locked, + struct vaddr_pin *vaddr_pin) +{ + gup_flags |= FOLL_TOUCH | FOLL_REMOTE | FOLL_PIN; + + return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas, + locked, gup_flags, vaddr_pin); +} +EXPORT_SYMBOL(vaddr_pin_pages_remote); + /** * vaddr_unpin_pages_dirty_lock - counterpart to vaddr_pin_pages * @@ -2536,3 +2568,21 @@ void vaddr_unpin_pages_dirty_lock(struct page **pages, unsigned long nr_pages, __put_user_pages_dirty_lock(vaddr_pin, pages, nr_pages, make_dirty); } EXPORT_SYMBOL(vaddr_unpin_pages_dirty_lock); + +/** + * vaddr_unpin_pages - simple, non-dirtying counterpart to vaddr_pin_pages + * + * @pages: array of pages returned + * @nr_pages: number of pages in pages + * @vaddr_pin: same information passed to vaddr_pin_pages + * + * Like vaddr_unpin_pages_dirty_lock, but for non-dirty pages. Useful in putting + * back pages in an error case: they were never made dirty. + */ +void vaddr_unpin_pages(struct page **pages, unsigned long nr_pages, + struct vaddr_pin *vaddr_pin) +{ + __put_user_pages_dirty_lock(vaddr_pin, pages, nr_pages, false); +} +EXPORT_SYMBOL(vaddr_unpin_pages); +