
[6/6] drivers/IB,core: reduce scope of mmap_sem

Message ID: 20190115181300.27547-7-dave@stgolabs.net (mailing list archive)
State: Superseded
Delegated to: Jason Gunthorpe
Series: mm: make pinned_vm atomic and simplify users

Commit Message

Davidlohr Bueso Jan. 15, 2019, 6:13 p.m. UTC
ib_umem_get() uses gup_longterm() and relies on the lock to
stabilize the vma_list, so we cannot really get rid of mmap_sem
altogether, but now that the counter is atomic, we can get rid of
the complexity that mmap_sem brings solely for pinned_vm accounting.

Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
---
 drivers/infiniband/core/umem.c | 41 ++---------------------------------------
 1 file changed, 2 insertions(+), 39 deletions(-)

Comments

Ira Weiny Jan. 15, 2019, 8:30 p.m. UTC | #1
On Tue, Jan 15, 2019 at 10:13:00AM -0800, Davidlohr Bueso wrote:
> ib_umem_get() uses gup_longterm() and relies on the lock to
> stabilize the vma_list, so we cannot really get rid of mmap_sem
> altogether, but now that the counter is atomic, we can get rid of
> the complexity that mmap_sem brings solely for pinned_vm accounting.
> 
> Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
>

Reviewed-by: Ira Weiny <ira.weiny@intel.com>

> ---
>  drivers/infiniband/core/umem.c | 41 ++---------------------------------------
>  1 file changed, 2 insertions(+), 39 deletions(-)
> 
> diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
> index bf556215aa7e..baa2412bf6fb 100644
> --- a/drivers/infiniband/core/umem.c
> +++ b/drivers/infiniband/core/umem.c
> @@ -160,15 +160,12 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
>  
>  	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
>  
> -	down_write(&mm->mmap_sem);
> -	new_pinned = atomic_long_read(&mm->pinned_vm) + npages;
> +	new_pinned = atomic_long_add_return(npages, &mm->pinned_vm);
>  	if (new_pinned > lock_limit && !capable(CAP_IPC_LOCK)) {
> -		up_write(&mm->mmap_sem);
> +		atomic_long_sub(npages, &mm->pinned_vm);
>  		ret = -ENOMEM;
>  		goto out;
>  	}
> -	atomic_long_set(&mm->pinned_vm, new_pinned);
> -	up_write(&mm->mmap_sem);
>  
>  	cur_base = addr & PAGE_MASK;
>  
> @@ -228,9 +225,7 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
>  umem_release:
>  	__ib_umem_release(context->device, umem, 0);
>  vma:
> -	down_write(&mm->mmap_sem);
>  	atomic_long_sub(ib_umem_num_pages(umem), &mm->pinned_vm);
> -	up_write(&mm->mmap_sem);
>  out:
>  	if (vma_list)
>  		free_page((unsigned long) vma_list);
> @@ -253,25 +248,12 @@ static void __ib_umem_release_tail(struct ib_umem *umem)
>  		kfree(umem);
>  }
>  
> -static void ib_umem_release_defer(struct work_struct *work)
> -{
> -	struct ib_umem *umem = container_of(work, struct ib_umem, work);
> -
> -	down_write(&umem->owning_mm->mmap_sem);
> -	atomic_long_sub(ib_umem_num_pages(umem), &umem->owning_mm->pinned_vm);
> -	up_write(&umem->owning_mm->mmap_sem);
> -
> -	__ib_umem_release_tail(umem);
> -}
> -
>  /**
>   * ib_umem_release - release memory pinned with ib_umem_get
>   * @umem: umem struct to release
>   */
>  void ib_umem_release(struct ib_umem *umem)
>  {
> -	struct ib_ucontext *context = umem->context;
> -
>  	if (umem->is_odp) {
>  		ib_umem_odp_release(to_ib_umem_odp(umem));
>  		__ib_umem_release_tail(umem);
> @@ -280,26 +262,7 @@ void ib_umem_release(struct ib_umem *umem)
>  
>  	__ib_umem_release(umem->context->device, umem, 1);
>  
> -	/*
> -	 * We may be called with the mm's mmap_sem already held.  This
> -	 * can happen when a userspace munmap() is the call that drops
> -	 * the last reference to our file and calls our release
> -	 * method.  If there are memory regions to destroy, we'll end
> -	 * up here and not be able to take the mmap_sem.  In that case
> -	 * we defer the vm_locked accounting a workqueue.
> -	 */
> -	if (context->closing) {
> -		if (!down_write_trylock(&umem->owning_mm->mmap_sem)) {
> -			INIT_WORK(&umem->work, ib_umem_release_defer);
> -			queue_work(ib_wq, &umem->work);
> -			return;
> -		}
> -	} else {
> -		down_write(&umem->owning_mm->mmap_sem);
> -	}
>  	atomic_long_sub(ib_umem_num_pages(umem), &umem->owning_mm->pinned_vm);
> -	up_write(&umem->owning_mm->mmap_sem);
> -
>  	__ib_umem_release_tail(umem);
>  }
>  EXPORT_SYMBOL(ib_umem_release);
> -- 
> 2.16.4
>
Jason Gunthorpe Jan. 15, 2019, 8:53 p.m. UTC | #2
On Tue, Jan 15, 2019 at 10:13:00AM -0800, Davidlohr Bueso wrote:
> ib_umem_get() uses gup_longterm() and relies on the lock to
> stabilize the vma_list, so we cannot really get rid of mmap_sem
> altogether, but now that the counter is atomic, we can get rid of
> the complexity that mmap_sem brings solely for pinned_vm accounting.
> 
> Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
>  drivers/infiniband/core/umem.c | 41 ++---------------------------------------
>  1 file changed, 2 insertions(+), 39 deletions(-)
> 
> diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
> index bf556215aa7e..baa2412bf6fb 100644
> +++ b/drivers/infiniband/core/umem.c
> @@ -160,15 +160,12 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
>  
>  	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
>  
> -	down_write(&mm->mmap_sem);
> -	new_pinned = atomic_long_read(&mm->pinned_vm) + npages;
> +	new_pinned = atomic_long_add_return(npages, &mm->pinned_vm);
>  	if (new_pinned > lock_limit && !capable(CAP_IPC_LOCK)) {

I thought a patch had been made for this to use check_overflow...

npages is controlled by userspace, so can we protect pinned_vm from
overflow in some way that still allows it to be atomic?

Jason
Matthew Wilcox Jan. 15, 2019, 9:12 p.m. UTC | #3
On Tue, Jan 15, 2019 at 08:53:16PM +0000, Jason Gunthorpe wrote:
> > -	new_pinned = atomic_long_read(&mm->pinned_vm) + npages;
> > +	new_pinned = atomic_long_add_return(npages, &mm->pinned_vm);
> >  	if (new_pinned > lock_limit && !capable(CAP_IPC_LOCK)) {
> 
> I thought a patch had been made for this to use check_overflow...

That got removed again by patch 1 ...
Jason Gunthorpe Jan. 15, 2019, 9:17 p.m. UTC | #4
On Tue, Jan 15, 2019 at 01:12:07PM -0800, Matthew Wilcox wrote:
> On Tue, Jan 15, 2019 at 08:53:16PM +0000, Jason Gunthorpe wrote:
> > > -	new_pinned = atomic_long_read(&mm->pinned_vm) + npages;
> > > +	new_pinned = atomic_long_add_return(npages, &mm->pinned_vm);
> > >  	if (new_pinned > lock_limit && !capable(CAP_IPC_LOCK)) {
> > 
> > I thought a patch had been made for this to use check_overflow...
> 
> That got removed again by patch 1 ...

Well, that sure needs a lot more explanation. :(

Jason
Davidlohr Bueso Jan. 16, 2019, 4 p.m. UTC | #5
On Tue, 15 Jan 2019, Jason Gunthorpe wrote:

>On Tue, Jan 15, 2019 at 01:12:07PM -0800, Matthew Wilcox wrote:
>> On Tue, Jan 15, 2019 at 08:53:16PM +0000, Jason Gunthorpe wrote:
>> > > -	new_pinned = atomic_long_read(&mm->pinned_vm) + npages;
>> > > +	new_pinned = atomic_long_add_return(npages, &mm->pinned_vm);
>> > >  	if (new_pinned > lock_limit && !capable(CAP_IPC_LOCK)) {
>> >
>> > I thought a patch had been made for this to use check_overflow...
>>
>> That got removed again by patch 1 ...
>
>Well, that sure needs a lot more explanation. :(

What if we just make the counter atomic64?

Thanks,
Davidlohr
Jason Gunthorpe Jan. 16, 2019, 5:02 p.m. UTC | #6
On Wed, Jan 16, 2019 at 08:00:26AM -0800, Davidlohr Bueso wrote:
> On Tue, 15 Jan 2019, Jason Gunthorpe wrote:
> 
> > On Tue, Jan 15, 2019 at 01:12:07PM -0800, Matthew Wilcox wrote:
> > > On Tue, Jan 15, 2019 at 08:53:16PM +0000, Jason Gunthorpe wrote:
> > > > > -	new_pinned = atomic_long_read(&mm->pinned_vm) + npages;
> > > > > +	new_pinned = atomic_long_add_return(npages, &mm->pinned_vm);
> > > > >  	if (new_pinned > lock_limit && !capable(CAP_IPC_LOCK)) {
> > > >
> > > > I thought a patch had been made for this to use check_overflow...
> > > 
> > > That got removed again by patch 1 ...
> > 
> > Well, that sure needs a lot more explanation. :(
> 
> What if we just make the counter atomic64?

That could work.

Jason
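
To make Davidlohr's suggestion concrete, a minimal sketch (not part of the
posted series) of the charge path with a 64-bit counter; the helper name
pinned_vm_charge64() is hypothetical, and it assumes mm->pinned_vm were
reworked as atomic64_t rather than the atomic_long_t used in patch 1:

#include <linux/atomic.h>
#include <linux/capability.h>
#include <linux/mm_types.h>

/*
 * Sketch only: with pinned_vm as atomic64_t, the simple add-then-check
 * from the patch stays safe even on 32-bit kernels, because additions
 * each bounded by a 32-bit page count cannot realistically wrap a
 * 64-bit counter.
 */
static int pinned_vm_charge64(struct mm_struct *mm, unsigned long npages,
			      unsigned long lock_limit)
{
	s64 new_pinned = atomic64_add_return(npages, &mm->pinned_vm);

	if (new_pinned > lock_limit && !capable(CAP_IPC_LOCK)) {
		atomic64_sub(npages, &mm->pinned_vm);
		return -ENOMEM;
	}
	return 0;
}

ib_umem_get() could call such a helper in place of the open-coded
atomic_long_add_return() shown in the hunk above.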
Matthew Wilcox Jan. 16, 2019, 5:06 p.m. UTC | #7
On Wed, Jan 16, 2019 at 05:02:59PM +0000, Jason Gunthorpe wrote:
> On Wed, Jan 16, 2019 at 08:00:26AM -0800, Davidlohr Bueso wrote:
> > On Tue, 15 Jan 2019, Jason Gunthorpe wrote:
> > 
> > > On Tue, Jan 15, 2019 at 01:12:07PM -0800, Matthew Wilcox wrote:
> > > > On Tue, Jan 15, 2019 at 08:53:16PM +0000, Jason Gunthorpe wrote:
> > > > > > -	new_pinned = atomic_long_read(&mm->pinned_vm) + npages;
> > > > > > +	new_pinned = atomic_long_add_return(npages, &mm->pinned_vm);
> > > > > >  	if (new_pinned > lock_limit && !capable(CAP_IPC_LOCK)) {
> > > > >
> > > > > I thought a patch had been made for this to use check_overflow...
> > > > 
> > > > That got removed again by patch 1 ...
> > > 
> > > Well, that sure needs a lot more explanation. :(
> > 
> > What if we just make the counter atomic64?
> 
> That could work.

atomic_long, perhaps?  No need to use 64-bits on 32-bit architectures.
Jason Gunthorpe Jan. 16, 2019, 5:29 p.m. UTC | #8
On Wed, Jan 16, 2019 at 09:06:12AM -0800, Matthew Wilcox wrote:
> On Wed, Jan 16, 2019 at 05:02:59PM +0000, Jason Gunthorpe wrote:
> > On Wed, Jan 16, 2019 at 08:00:26AM -0800, Davidlohr Bueso wrote:
> > > On Tue, 15 Jan 2019, Jason Gunthorpe wrote:
> > > 
> > > > On Tue, Jan 15, 2019 at 01:12:07PM -0800, Matthew Wilcox wrote:
> > > > > On Tue, Jan 15, 2019 at 08:53:16PM +0000, Jason Gunthorpe wrote:
> > > > > > > -	new_pinned = atomic_long_read(&mm->pinned_vm) + npages;
> > > > > > > +	new_pinned = atomic_long_add_return(npages, &mm->pinned_vm);
> > > > > > >  	if (new_pinned > lock_limit && !capable(CAP_IPC_LOCK)) {
> > > > > >
> > > > > > I thought a patch had been made for this to use check_overflow...
> > > > > 
> > > > > That got removed again by patch 1 ...
> > > > 
> > > > Well, that sure needs a lot more explanation. :(
> > > 
> > > What if we just make the counter atomic64?
> > 
> > That could work.
> 
> atomic_long, perhaps?  No need to use 64-bits on 32-bit architectures.

Well, there is: the point is to protect against user-triggered overflow.

Say I'm on 32-bit and try to mlock 2G from 100 threads in parallel; I
can't allow the atomic_inc to wrap.

A 64-bit value works OK because I can't create enough threads to push
a 64-bit counter into wrapping when each individual add is at most a
32-bit quantity.

If you want to use a 32-bit counter, then I think the algo needs to use
a compare-and-swap loop with check_overflow.

Jason
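
To make the 32-bit suggestion concrete, a minimal sketch (again not code
from this series) of a lockless charge that pairs a compare-and-swap loop
with check_add_overflow(); the helper name pinned_vm_try_charge() is
hypothetical, and it assumes pinned_vm stays the atomic_long_t from patch 1:

#include <linux/atomic.h>
#include <linux/capability.h>
#include <linux/mm_types.h>
#include <linux/overflow.h>

/*
 * Sketch only: charge npages against mm->pinned_vm without mmap_sem.
 * check_add_overflow() rejects a sum that would wrap the counter (the
 * 32-bit concern above), and the cmpxchg loop only publishes a value
 * that was validated, so concurrent callers cannot race past the limit.
 */
static int pinned_vm_try_charge(struct mm_struct *mm, unsigned long npages,
				unsigned long lock_limit)
{
	unsigned long old, cur, new_pinned;

	old = atomic_long_read(&mm->pinned_vm);
	for (;;) {
		if (check_add_overflow(old, npages, &new_pinned))
			return -ENOMEM;	/* would wrap the counter */
		if (new_pinned > lock_limit && !capable(CAP_IPC_LOCK))
			return -ENOMEM;	/* over RLIMIT_MEMLOCK */
		cur = atomic_long_cmpxchg(&mm->pinned_vm, old, new_pinned);
		if (cur == old)
			return 0;	/* charge published */
		old = cur;		/* lost the race, retry */
	}
}

ib_umem_get() could then call this helper instead of the bare
atomic_long_add_return() and keep the existing atomic_long_sub() on its
error and release paths.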

Patch

diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index bf556215aa7e..baa2412bf6fb 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -160,15 +160,12 @@  struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
 
 	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
 
-	down_write(&mm->mmap_sem);
-	new_pinned = atomic_long_read(&mm->pinned_vm) + npages;
+	new_pinned = atomic_long_add_return(npages, &mm->pinned_vm);
 	if (new_pinned > lock_limit && !capable(CAP_IPC_LOCK)) {
-		up_write(&mm->mmap_sem);
+		atomic_long_sub(npages, &mm->pinned_vm);
 		ret = -ENOMEM;
 		goto out;
 	}
-	atomic_long_set(&mm->pinned_vm, new_pinned);
-	up_write(&mm->mmap_sem);
 
 	cur_base = addr & PAGE_MASK;
 
@@ -228,9 +225,7 @@  struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
 umem_release:
 	__ib_umem_release(context->device, umem, 0);
 vma:
-	down_write(&mm->mmap_sem);
 	atomic_long_sub(ib_umem_num_pages(umem), &mm->pinned_vm);
-	up_write(&mm->mmap_sem);
 out:
 	if (vma_list)
 		free_page((unsigned long) vma_list);
@@ -253,25 +248,12 @@  static void __ib_umem_release_tail(struct ib_umem *umem)
 		kfree(umem);
 }
 
-static void ib_umem_release_defer(struct work_struct *work)
-{
-	struct ib_umem *umem = container_of(work, struct ib_umem, work);
-
-	down_write(&umem->owning_mm->mmap_sem);
-	atomic_long_sub(ib_umem_num_pages(umem), &umem->owning_mm->pinned_vm);
-	up_write(&umem->owning_mm->mmap_sem);
-
-	__ib_umem_release_tail(umem);
-}
-
 /**
  * ib_umem_release - release memory pinned with ib_umem_get
  * @umem: umem struct to release
  */
 void ib_umem_release(struct ib_umem *umem)
 {
-	struct ib_ucontext *context = umem->context;
-
 	if (umem->is_odp) {
 		ib_umem_odp_release(to_ib_umem_odp(umem));
 		__ib_umem_release_tail(umem);
@@ -280,26 +262,7 @@  void ib_umem_release(struct ib_umem *umem)
 
 	__ib_umem_release(umem->context->device, umem, 1);
 
-	/*
-	 * We may be called with the mm's mmap_sem already held.  This
-	 * can happen when a userspace munmap() is the call that drops
-	 * the last reference to our file and calls our release
-	 * method.  If there are memory regions to destroy, we'll end
-	 * up here and not be able to take the mmap_sem.  In that case
-	 * we defer the vm_locked accounting a workqueue.
-	 */
-	if (context->closing) {
-		if (!down_write_trylock(&umem->owning_mm->mmap_sem)) {
-			INIT_WORK(&umem->work, ib_umem_release_defer);
-			queue_work(ib_wq, &umem->work);
-			return;
-		}
-	} else {
-		down_write(&umem->owning_mm->mmap_sem);
-	}
 	atomic_long_sub(ib_umem_num_pages(umem), &umem->owning_mm->pinned_vm);
-	up_write(&umem->owning_mm->mmap_sem);
-
 	__ib_umem_release_tail(umem);
 }
 EXPORT_SYMBOL(ib_umem_release);